// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <net/tcp.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

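/*
 * Number of payload bytes that can ride in the transmit header buffer
 * behind a Send header: the difference between the size of a Read
 * Request packet and a Send header, rounded down to a multiple of 8
 * by the 0xF8 mask.
 */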
#define MAX_HDR_INLINE					\
	(((uint32_t)(sizeof(struct siw_rreq_pkt) -	\
		     sizeof(struct iwarp_send))) & 0xF8)

static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);

	if (paddr)
		return ib_virt_dma_to_page(paddr);

	return NULL;
}

static struct page *siw_get_page(struct siw_mem *mem, struct siw_sge *sge,
				 unsigned long offset, int *pbl_idx)
{
	if (!mem->is_pbl)
		return siw_get_upage(mem->umem, sge->laddr + offset);
	else
		return siw_get_pblpage(mem, sge->laddr + offset, pbl_idx);
}

/*
 * Copy short payload at provided destination payload address.
 * Returns the number of bytes copied, 0 if there is no payload,
 * MAX_HDR_INLINE + 1 if the payload does not fit inline, or a
 * negative error code on failure.
 */
static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	struct siw_sge *sge = &wqe->sqe.sge[0];
	u32 bytes = sge->length;

	if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
		return MAX_HDR_INLINE + 1;

	if (!bytes)
		return 0;

	if (tx_flags(wqe) & SIW_WQE_INLINE) {
		memcpy(paddr, &wqe->sqe.sge[1], bytes);
	} else {
		struct siw_mem *mem = wqe->mem[0];

		if (!mem->mem_obj) {
			/* Kernel client using kva */
			memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes);
		} else if (c_tx->in_syscall) {
			if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
					   bytes))
				return -EFAULT;
		} else {
			unsigned int off = sge->laddr & ~PAGE_MASK;
			struct page *p;
			char *buffer;
			int pbl_idx = 0;

			p = siw_get_page(mem, sge, 0, &pbl_idx);
			if (unlikely(!p))
				return -EFAULT;

			buffer = kmap_local_page(p);

			if (likely(PAGE_SIZE - off >= bytes)) {
				memcpy(paddr, buffer + off, bytes);
			} else {
				unsigned long part = bytes - (PAGE_SIZE - off);

				memcpy(paddr, buffer + off, part);
				kunmap_local(buffer);

				p = siw_get_page(mem, sge, part, &pbl_idx);
				if (unlikely(!p))
					return -EFAULT;

				buffer = kmap_local_page(p);
				memcpy(paddr + part, buffer, bytes - part);
			}
			kunmap_local(buffer);
		}
	}
	return (int)bytes;
}

#define PKT_FRAGMENTED 1
#define PKT_COMPLETE 0

/*
 * siw_qp_prepare_tx()
 *
 * Prepare tx state for sending out one fpdu. Builds complete pkt
 * if no user data or only immediate data are present.
 *
 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
 */
static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	char *crc = NULL;
	int data = 0;

	switch (tx_type(wqe)) {
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rreq.rsvd = 0;
		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
		c_tx->pkt.rreq.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
		c_tx->pkt.rreq.ddp_mo = 0;
		c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
		c_tx->pkt.rreq.sink_to =
			cpu_to_be64(wqe->sqe.sge[0].laddr);
		c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
		c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
		crc = (char *)&c_tx->pkt.rreq_pkt.crc;
		break;

	case SIW_OP_SEND:
		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
			       sizeof(struct iwarp_ctrl));
		else
			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
			       sizeof(struct iwarp_ctrl));

		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
		c_tx->pkt.send.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		c_tx->pkt.send.ddp_mo = 0;

		c_tx->pkt.send_inv.inval_stag = 0;

		c_tx->ctrl_len = sizeof(struct iwarp_send);

		crc = (char *)&c_tx->pkt.send_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_SEND_REMOTE_INV:
		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
			       sizeof(struct iwarp_ctrl));
		else
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
			       sizeof(struct iwarp_ctrl));

		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
		c_tx->pkt.send.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		c_tx->pkt.send.ddp_mo = 0;

		c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);

		c_tx->ctrl_len = sizeof(struct iwarp_send_inv);

		crc = (char *)&c_tx->pkt.send_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_WRITE:
		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);

		crc = (char *)&c_tx->pkt.write_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_READ_RESPONSE:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
		       sizeof(struct iwarp_ctrl));

		/* NBO */
		c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);

		crc = (char *)&c_tx->pkt.write_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	default:
		siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
		return -EOPNOTSUPP;
	}
	if (unlikely(data < 0))
		return data;

	c_tx->ctrl_sent = 0;

	if (data <= MAX_HDR_INLINE) {
		if (data) {
			wqe->processed = data;

			c_tx->pkt.ctrl.mpa_len =
				htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);

			/*
			 * Add pad, if needed: -(int)data & 0x3 is the
			 * distance to the next 4-byte boundary, e.g.
			 * 5 payload bytes need 3 pad bytes.
			 */
			data += -(int)data & 0x3;
			/* advance CRC location after payload */
			crc += data;
			c_tx->ctrl_len += data;

			if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
				c_tx->pkt.c_untagged.ddp_mo = 0;
			else
				c_tx->pkt.c_tagged.ddp_to =
					cpu_to_be64(wqe->sqe.raddr);
		}

		*(u32 *)crc = 0;
		/*
		 * Do complete CRC if enabled and short packet
		 */
		if (c_tx->mpa_crc_hd &&
		    crypto_shash_digest(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
					c_tx->ctrl_len, (u8 *)crc) != 0)
			return -EINVAL;
		c_tx->ctrl_len += MPA_CRC_SIZE;

		return PKT_COMPLETE;
	}
	c_tx->ctrl_len += MPA_CRC_SIZE;
	c_tx->sge_idx = 0;
	c_tx->sge_off = 0;
	c_tx->pbl_idx = 0;

	/*
	 * Allow direct sending out of user buffer if WR is non-signalled
	 * and payload is over threshold.
	 * Per RDMA verbs, the application should not change the send buffer
	 * until the work is completed. In iWarp, work completion is only
	 * local delivery to TCP. TCP may reuse the buffer for
	 * retransmission. Changing unsent data also breaks the CRC,
	 * if applied.
	 */
	if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
	    !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
		c_tx->use_sendpage = 1;
	else
		c_tx->use_sendpage = 0;

	return PKT_FRAGMENTED;
}

/*
 * Send out one complete control-type FPDU, or the header of an FPDU
 * carrying data. Used for fixed-sized packets like Read.Requests or
 * zero-length SENDs, WRITEs, READ.Responses, or header only.
 */
static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
		       int flags)
{
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov = { .iov_base =
				    (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
			    .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };

	int rv = kernel_sendmsg(s, &msg, &iov, 1, iov.iov_len);

	if (rv >= 0) {
		c_tx->ctrl_sent += rv;

		if (c_tx->ctrl_sent == c_tx->ctrl_len)
			rv = 0;
		else
			rv = -EAGAIN;
	}
	return rv;
}

/*
 * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES.
 *
 * Using sendpage to push page by page appears to be less efficient
 * than using sendmsg, even if data are copied.
 *
 * A general performance limitation might be the extra four bytes
 * trailer checksum segment to be pushed after user data.
 */
static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
			     size_t size)
{
	struct bio_vec bvec;
	struct msghdr msg = {
		.msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES),
	};
	struct sock *sk = s->sk;
	int i = 0, rv = 0, sent = 0;

	while (size) {
		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);

		if (size + offset <= PAGE_SIZE)
			msg.msg_flags &= ~MSG_MORE;

		tcp_rate_check_app_limited(sk);
		bvec_set_page(&bvec, page[i], bytes, offset);
		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);

try_page_again:
		lock_sock(sk);
		rv = tcp_sendmsg_locked(sk, &msg, size);
		release_sock(sk);

		if (rv > 0) {
			size -= rv;
			sent += rv;
			if (rv != bytes) {
				offset += rv;
				bytes -= rv;
				goto try_page_again;
			}
			offset = 0;
		} else {
			if (rv == -EAGAIN || rv == 0)
				break;
			return rv;
		}
		i++;
	}
	return sent;
}

/*
 * siw_0copy_tx()
 *
 * Pushes list of pages to TCP socket. If the pages stem from
 * multiple SGEs, all referenced pages of each SGE are pushed in
 * one shot.
 */
static int siw_0copy_tx(struct socket *s, struct page **page,
			struct siw_sge *sge, unsigned int offset,
			unsigned int size)
{
	int i = 0, sent = 0, rv;
	int sge_bytes = min(sge->length - offset, size);

	offset = (sge->laddr + offset) & ~PAGE_MASK;

	while (sent != size) {
		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
		if (rv >= 0) {
			sent += rv;
			if (size == sent || sge_bytes > rv)
				break;

			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
			sge++;
			sge_bytes = min(sge->length, size - sent);
			offset = sge->laddr & ~PAGE_MASK;
		} else {
			sent = rv;
			break;
		}
	}
	return sent;
}

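/*
 * Largest possible FPDU trailer: a 4-byte pad buffer plus the 4-byte
 * MPA CRC. The trailer actually sent is (pad + CRC) with pad in 0..3,
 * transmitted starting at &trailer.pad[4 - pad].
 */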
#define MAX_TRAILER (MPA_CRC_SIZE + 4)

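/*
 * Undo the kmap_local_page() mappings set up while building the iov:
 * bit i of @kmap_mask is set iff iov[i].iov_base points into a local
 * kmap of a user page.
 */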
static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len)
{
	int i;

	/*
	 * Work backwards through the array to honor the kmap_local_page()
	 * ordering requirements.
	 */
	for (i = (len - 1); i >= 0; i--) {
		if (kmap_mask & BIT(i)) {
			unsigned long addr = (unsigned long)iov[i].iov_base;

			kunmap_local((void *)(addr & PAGE_MASK));
		}
	}
}

/*
 * siw_tx_hdt() tries to push a complete packet to TCP where all
 * packet fragments are referenced by the elements of one iovec.
 * For the data portion, each involved page must be referenced by
 * one extra element. Each SGE's data may be non-aligned to page
 * boundaries. Two more elements reference the iWARP header and
 * trailer:
 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + 2 * (SIW_MAX_SGE - 1) + HDR + TRL
 */
#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
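/*
 * Example (assuming 4 KiB pages and SIW_MAX_SGE == 6):
 * 0xffff / 4096 + 1 = 16 page fragments for the data portion, plus
 * 2 * 5 extra entries for unaligned SGE boundaries, plus header and
 * trailer: 16 + 10 + 2 = 28 iovec/page slots.
 */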

/*
 * Write out iov referencing hdr, data and trailer of current FPDU.
 * Update transmit state dependent on write return status
 */
static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
	struct kvec iov[MAX_ARRAY];
	struct page *page_array[MAX_ARRAY];
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };

	int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
	unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
		     sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
		     pbl_idx = c_tx->pbl_idx;
	unsigned long kmap_mask = 0L;

	if (c_tx->state == SIW_SEND_HDR) {
		if (c_tx->use_sendpage) {
			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
			if (rv)
				goto done;

			c_tx->state = SIW_SEND_DATA;
		} else {
			iov[0].iov_base =
				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
			iov[0].iov_len = hdr_len =
				c_tx->ctrl_len - c_tx->ctrl_sent;
			seg = 1;
		}
	}

	wqe->processed += data_len;

	while (data_len) { /* walk the list of SGE's */
		unsigned int sge_len = min(sge->length - sge_off, data_len);
		unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
		struct siw_mem *mem;

		if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
			mem = wqe->mem[sge_idx];
			is_kva = mem->mem_obj == NULL ? 1 : 0;
		} else {
			is_kva = 1;
		}
		if (is_kva && !c_tx->use_sendpage) {
			/*
			 * tx from kernel virtual address: either inline data
			 * or memory region with assigned kernel buffer
			 */
			iov[seg].iov_base =
				ib_virt_dma_to_ptr(sge->laddr + sge_off);
			iov[seg].iov_len = sge_len;

			if (do_crc)
				crypto_shash_update(c_tx->mpa_crc_hd,
						    iov[seg].iov_base,
						    sge_len);
			sge_off += sge_len;
			data_len -= sge_len;
			seg++;
			goto sge_done;
		}

		while (sge_len) {
			size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
			void *kaddr;

			if (!is_kva) {
				struct page *p;

				p = siw_get_page(mem, sge, sge_off, &pbl_idx);
				if (unlikely(!p)) {
					siw_unmap_pages(iov, kmap_mask, seg);
					wqe->processed -= c_tx->bytes_unsent;
					rv = -EFAULT;
					goto done_crc;
				}
				page_array[seg] = p;

				if (!c_tx->use_sendpage) {
					void *kaddr = kmap_local_page(p);

					/* Remember for later kunmap() */
					kmap_mask |= BIT(seg);
					iov[seg].iov_base = kaddr + fp_off;
					iov[seg].iov_len = plen;

					if (do_crc)
						crypto_shash_update(
							c_tx->mpa_crc_hd,
							iov[seg].iov_base,
							plen);
				} else if (do_crc) {
					kaddr = kmap_local_page(p);
					crypto_shash_update(c_tx->mpa_crc_hd,
							    kaddr + fp_off,
							    plen);
					kunmap_local(kaddr);
				}
			} else {
				/*
				 * Cast to an uintptr_t to preserve all 64 bits
				 * in sge->laddr.
				 */
				u64 va = sge->laddr + sge_off;

				page_array[seg] = ib_virt_dma_to_page(va);
				if (do_crc)
					crypto_shash_update(
						c_tx->mpa_crc_hd,
						ib_virt_dma_to_ptr(va),
						plen);
			}

			sge_len -= plen;
			sge_off += plen;
			data_len -= plen;
			fp_off = 0;

			if (++seg >= (int)MAX_ARRAY) {
				siw_dbg_qp(tx_qp(c_tx), "too many fragments\n");
				siw_unmap_pages(iov, kmap_mask, seg - 1);
				wqe->processed -= c_tx->bytes_unsent;
				rv = -EMSGSIZE;
				goto done_crc;
			}
		}
sge_done:
		/* Update SGE variables at end of SGE */
		if (sge_off == sge->length &&
		    (data_len != 0 || wqe->processed < wqe->bytes)) {
			sge_idx++;
			sge++;
			sge_off = 0;
		}
	}
	/* trailer */
	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
	} else {
		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
	}

	if (c_tx->pad) {
		*(u32 *)c_tx->trailer.pad = 0;
		if (do_crc)
			crypto_shash_update(c_tx->mpa_crc_hd,
				(u8 *)&c_tx->trailer.crc - c_tx->pad,
				c_tx->pad);
	}
	if (!c_tx->mpa_crc_hd)
		c_tx->trailer.crc = 0;
	else if (do_crc)
		crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);

	data_len = c_tx->bytes_unsent;

	if (c_tx->use_sendpage) {
		rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
				  c_tx->sge_off, data_len);
		if (rv == data_len) {
			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
			if (rv > 0)
				rv += data_len;
			else
				rv = data_len;
		}
	} else {
		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
				    hdr_len + data_len + trl_len);
		siw_unmap_pages(iov, kmap_mask, seg);
	}
	if (rv < (int)hdr_len) {
		/* Not even complete hdr pushed or negative rv */
		wqe->processed -= data_len;
		if (rv >= 0) {
			c_tx->ctrl_sent += rv;
			rv = -EAGAIN;
		}
		goto done_crc;
	}
	rv -= hdr_len;

	if (rv >= (int)data_len) {
		/* all user data pushed to TCP or no data to push */
		if (data_len > 0 && wqe->processed < wqe->bytes) {
			/* Save the current state for next tx */
			c_tx->sge_idx = sge_idx;
			c_tx->sge_off = sge_off;
			c_tx->pbl_idx = pbl_idx;
		}
		rv -= data_len;

		if (rv == trl_len) /* all pushed */
			rv = 0;
		else {
			c_tx->state = SIW_SEND_TRAILER;
			c_tx->ctrl_len = MAX_TRAILER;
			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
			c_tx->bytes_unsent = 0;
			rv = -EAGAIN;
		}

	} else if (data_len > 0) {
		/* Maybe some user data pushed to TCP */
		c_tx->state = SIW_SEND_DATA;
		wqe->processed -= data_len - rv;

		if (rv) {
			/*
			 * Some bytes out. Recompute tx state based
			 * on old state and bytes pushed
			 */
			unsigned int sge_unsent;

			c_tx->bytes_unsent -= rv;
			sge = &wqe->sqe.sge[c_tx->sge_idx];
			sge_unsent = sge->length - c_tx->sge_off;

			while (sge_unsent <= rv) {
				rv -= sge_unsent;
				c_tx->sge_idx++;
				c_tx->sge_off = 0;
				sge++;
				sge_unsent = sge->length;
			}
			c_tx->sge_off += rv;
		}
		rv = -EAGAIN;
	}
done_crc:
	c_tx->do_crc = 0;
done:
	return rv;
}

static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
			      struct socket *s)
{
	struct tcp_sock *tp = tcp_sk(s->sk);

	if (tp->gso_segs) {
		if (c_tx->gso_seg_limit == 0)
			c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
		else
			c_tx->tcp_seglen =
				tp->mss_cache *
				min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
	} else {
		c_tx->tcp_seglen = tp->mss_cache;
	}
	/* Loopback may give odd numbers; round down to a multiple of 8 */
	c_tx->tcp_seglen &= 0xfffffff8;
}

/*
 * siw_prepare_fpdu()
 *
 * Prepares transmit context to send out one FPDU if FPDU will contain
 * user data and user data are not immediate data.
 * Computes maximum FPDU length to fill up TCP MSS if possible.
 *
 * @qp: QP from which to transmit
 * @wqe: Current WQE causing transmission
 *
 * TODO: Take into account real available sendspace on socket
 *       to avoid header misalignment due to send pausing within
 *       fpdu transmission
 */
static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
{
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	int data_len;

	c_tx->ctrl_len =
		iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
	c_tx->ctrl_sent = 0;

	/*
	 * Update target buffer offset if any
	 */
	if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
		/* Untagged message */
		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
	else /* Tagged message */
		c_tx->pkt.c_tagged.ddp_to =
			cpu_to_be64(wqe->sqe.raddr + wqe->processed);

	data_len = wqe->bytes - wqe->processed;
	if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
		/* Trim DDP payload to fit into current TCP segment */
		data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
		c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
		c_tx->pad = 0;
	} else {
		c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
		c_tx->pad = -data_len & 0x3;
	}
	c_tx->bytes_unsent = data_len;

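	/*
	 * Note: the MPA length field is assumed here to cover the ULPDU
	 * only, i.e. the DDP/RDMAP header behind the 2-byte MPA length
	 * field plus the payload; pad and CRC are not counted.
	 */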
	c_tx->pkt.ctrl.mpa_len =
		htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);

	/*
	 * Init MPA CRC computation
	 */
	if (c_tx->mpa_crc_hd) {
		crypto_shash_init(c_tx->mpa_crc_hd);
		crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
				    c_tx->ctrl_len);
		c_tx->do_crc = 1;
	}
}

/*
 * siw_check_sgl_tx()
 *
 * Check permissions for a list of SGE's (SGL).
 * A successful check will have all memory referenced
 * for transmission resolved and assigned to the WQE.
 *
 * @pd: Protection Domain SGL should belong to
 * @wqe: WQE to be checked
 * @perms: requested access permissions
 *
 * Returns the total number of bytes referenced by the SGL on success,
 * or a negative error code if an SGE fails the access check.
 */

static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
			    enum ib_access_flags perms)
{
	struct siw_sge *sge = &wqe->sqe.sge[0];
	int i, len, num_sge = wqe->sqe.num_sge;

	if (unlikely(num_sge > SIW_MAX_SGE))
		return -EINVAL;

	for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
		/*
		 * rdma verbs: do not check stag for a zero length sge
		 */
		if (sge->length) {
			int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
					       sge->length);

			if (unlikely(rv != E_ACCESS_OK))
				return rv;
		}
		len += sge->length;
	}
	return len;
}

/*
 * siw_qp_sq_proc_tx()
 *
 * Process one WQE which needs transmission on the wire.
 */
static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
{
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	struct socket *s = qp->attrs.sk;
	int rv = 0, burst_len = qp->tx_ctx.burst;
	enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;

	if (unlikely(wqe->wr_status == SIW_WR_IDLE))
		return 0;

	if (!burst_len)
		burst_len = SQ_USER_MAXBURST;

	if (wqe->wr_status == SIW_WR_QUEUED) {
		if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
			if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
				wqe->sqe.num_sge = 1;

			if (tx_type(wqe) != SIW_OP_READ &&
			    tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
				/*
				 * Reference memory to be tx'd w/o checking
				 * access for LOCAL_READ permission, since
				 * not defined in RDMA core.
				 */
				rv = siw_check_sgl_tx(qp->pd, wqe, 0);
				if (rv < 0) {
					if (tx_type(wqe) ==
					    SIW_OP_READ_RESPONSE)
						ecode = siw_rdmap_error(-rv);
					rv = -EINVAL;
					goto tx_error;
				}
				wqe->bytes = rv;
			} else {
				wqe->bytes = 0;
			}
		} else {
			wqe->bytes = wqe->sqe.sge[0].length;
			if (!rdma_is_kernel_res(&qp->base_qp.res)) {
				if (wqe->bytes > SIW_MAX_INLINE) {
					rv = -EINVAL;
					goto tx_error;
				}
				wqe->sqe.sge[0].laddr =
					(u64)(uintptr_t)&wqe->sqe.sge[1];
			}
		}
		wqe->wr_status = SIW_WR_INPROGRESS;
		wqe->processed = 0;

		siw_update_tcpseg(c_tx, s);

		rv = siw_qp_prepare_tx(c_tx);
		if (rv == PKT_FRAGMENTED) {
			c_tx->state = SIW_SEND_HDR;
			siw_prepare_fpdu(qp, wqe);
		} else if (rv == PKT_COMPLETE) {
			c_tx->state = SIW_SEND_SHORT_FPDU;
		} else {
			goto tx_error;
		}
	}

next_segment:
	siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
		   tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
		   wqe->sqe.id);

	if (--burst_len == 0) {
		rv = -EINPROGRESS;
		goto tx_done;
	}
	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
		enum siw_opcode tx_type = tx_type(wqe);
		unsigned int msg_flags;

		if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
			/*
			 * End current TCP segment, if SQ runs empty,
			 * or siw_tcp_nagle is not set, or we bail out
			 * soon due to no burst credit left.
			 */
			msg_flags = MSG_DONTWAIT;
		else
			msg_flags = MSG_DONTWAIT | MSG_MORE;

		rv = siw_tx_ctrl(c_tx, s, msg_flags);

		if (!rv && tx_type != SIW_OP_READ &&
		    tx_type != SIW_OP_READ_LOCAL_INV)
			wqe->processed = wqe->bytes;

		goto tx_done;

	} else {
		rv = siw_tx_hdt(c_tx, s);
	}
	if (!rv) {
		/*
		 * One segment sent. Processing completed if last
		 * segment; do next segment otherwise.
		 */
		if (unlikely(c_tx->tx_suspend)) {
			/*
			 * Verbs, 6.4.: Try stopping sending after a full
			 * DDP segment if the connection goes down
			 * (== peer halfclose)
			 */
			rv = -ECONNABORTED;
			goto tx_done;
		}
		if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
			siw_dbg_qp(qp, "WQE completed\n");
			goto tx_done;
		}
		c_tx->state = SIW_SEND_HDR;

		siw_update_tcpseg(c_tx, s);

		siw_prepare_fpdu(qp, wqe);
		goto next_segment;
	}
tx_done:
	qp->tx_ctx.burst = burst_len;
	return rv;

tx_error:
	if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
	else
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_CATASTROPHIC,
				   RDMAP_ECODE_UNSPECIFIED, 1);
	return rv;
}

static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
{
	struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem;
	int rv = 0;

	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);

	if (unlikely(!base_mr)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
		return -EINVAL;
	}

	mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
	if (unlikely(!mem)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(mem->pd != pd)) {
		pr_warn("siw: fastreg: PD mismatch\n");
		rv = -EINVAL;
		goto out;
	}
	if (unlikely(mem->stag_valid)) {
		pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
		rv = -EINVAL;
		goto out;
	}
	/* Refresh STag since user may have changed key part */
	mem->stag = sqe->rkey;
	mem->perms = sqe->access;

	siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
	mem->va = base_mr->iova;
	mem->stag_valid = 1;
out:
	siw_mem_put(mem);
	return rv;
}

static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
{
	int rv;

	switch (tx_type(wqe)) {
	case SIW_OP_REG_MR:
		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
		break;

	case SIW_OP_INVAL_STAG:
		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
		break;

	default:
		rv = -EINVAL;
	}
	return rv;
}

/*
 * siw_qp_sq_process()
 *
 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
 * MPA FPDUs, each containing a DDP segment.
 *
 * SQ processing may occur in user context as a result of posting
 * new WQE's or from siw_tx_thread context. Processing in
 * user context is limited to non-kernel verbs users.
 *
 * SQ processing may get paused anytime, possibly in the middle of a WR
 * or FPDU, if insufficient send space is available. SQ processing
 * gets resumed from siw_tx_thread, if send space becomes available again.
 *
 * Must be called with the QP state read-locked.
 *
 * Note:
 * An outbound RREQ can be satisfied by the corresponding RRESP
 * _before_ it gets assigned to the ORQ. This happens regularly
 * in RDMA READ via loopback case. Since both outbound RREQ and
 * inbound RRESP can be handled by the same CPU, locking the ORQ
 * is dead-lock prone and thus not an option. With that, the
 * RREQ gets assigned to the ORQ _before_ being sent - see
 * siw_activate_tx() - and pulled back in case of send failure.
 */
int siw_qp_sq_process(struct siw_qp *qp)
{
	struct siw_wqe *wqe = tx_wqe(qp);
	enum siw_opcode tx_type;
	unsigned long flags;
	int rv = 0;

	siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));

next_wqe:
	/*
	 * Stop QP processing if SQ state changed
	 */
	if (unlikely(qp->tx_ctx.tx_suspend)) {
		siw_dbg_qp(qp, "tx suspended\n");
		goto done;
	}
	tx_type = tx_type(wqe);

	if (tx_type <= SIW_OP_READ_RESPONSE)
		rv = siw_qp_sq_proc_tx(qp, wqe);
	else
		rv = siw_qp_sq_proc_local(qp, wqe);

	if (!rv) {
		/*
		 * WQE processing done
		 */
		switch (tx_type) {
		case SIW_OP_SEND:
		case SIW_OP_SEND_REMOTE_INV:
		case SIW_OP_WRITE:
			siw_wqe_put_mem(wqe, tx_type);
			fallthrough;

		case SIW_OP_INVAL_STAG:
		case SIW_OP_REG_MR:
			if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
				siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
						 SIW_WC_SUCCESS);
			break;

		case SIW_OP_READ:
		case SIW_OP_READ_LOCAL_INV:
			/*
			 * already enqueued to ORQ queue
			 */
			break;

		case SIW_OP_READ_RESPONSE:
			siw_wqe_put_mem(wqe, tx_type);
			break;

		default:
			WARN(1, "undefined WQE type %d\n", tx_type);
			rv = -EINVAL;
			goto done;
		}

		spin_lock_irqsave(&qp->sq_lock, flags);
		wqe->wr_status = SIW_WR_IDLE;
		rv = siw_activate_tx(qp);
		spin_unlock_irqrestore(&qp->sq_lock, flags);

		if (rv <= 0)
			goto done;

		goto next_wqe;

	} else if (rv == -EAGAIN) {
		siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
			   qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
			   qp->tx_ctx.bytes_unsent);
		rv = 0;
		goto done;
	} else if (rv == -EINPROGRESS) {
		rv = siw_sq_start(qp);
		goto done;
	} else {
		/*
		 * WQE processing failed.
		 * Verbs 8.3.2:
		 * o It turns any WQE into a signalled WQE.
		 * o Local catastrophic error must be surfaced
		 * o QP must be moved into Terminate state: done by code
		 *   doing socket state change processing
		 *
		 * o TODO: Termination message must be sent.
		 * o TODO: Implement more precise work completion errors,
		 *         see enum ib_wc_status in ib_verbs.h
		 */
		siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
			   tx_type(wqe), rv);

		spin_lock_irqsave(&qp->sq_lock, flags);
		/*
		 * RREQ may have already been completed by inbound RRESP!
		 */
		if ((tx_type == SIW_OP_READ ||
		     tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) {
			/* Cleanup pending entry in ORQ */
			qp->orq_put--;
			qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
		}
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		/*
		 * immediately suspends further TX processing
		 */
		if (!qp->tx_ctx.tx_suspend)
			siw_qp_cm_drop(qp, 0);

		switch (tx_type) {
		case SIW_OP_SEND:
		case SIW_OP_SEND_REMOTE_INV:
		case SIW_OP_SEND_WITH_IMM:
		case SIW_OP_WRITE:
		case SIW_OP_READ:
		case SIW_OP_READ_LOCAL_INV:
			siw_wqe_put_mem(wqe, tx_type);
			fallthrough;

		case SIW_OP_INVAL_STAG:
		case SIW_OP_REG_MR:
			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
					 SIW_WC_LOC_QP_OP_ERR);

			siw_qp_event(qp, IB_EVENT_QP_FATAL);

			break;

		case SIW_OP_READ_RESPONSE:
			siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);

			siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);

			siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);

			break;

		default:
			WARN(1, "undefined WQE type %d\n", tx_type);
			rv = -EINVAL;
		}
		wqe->wr_status = SIW_WR_IDLE;
	}
done:
	return rv;
}

static void siw_sq_resume(struct siw_qp *qp)
{
	if (down_read_trylock(&qp->state_lock)) {
		if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
			   !qp->tx_ctx.tx_suspend)) {
			int rv = siw_qp_sq_process(qp);

			up_read(&qp->state_lock);

			if (unlikely(rv < 0)) {
				siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);

				if (!qp->tx_ctx.tx_suspend)
					siw_qp_cm_drop(qp, 0);
			}
		} else {
			up_read(&qp->state_lock);
		}
	} else {
		siw_dbg_qp(qp, "Resume SQ while QP locked\n");
	}
	siw_qp_put(qp);
}

struct tx_task_t {
	struct llist_head active;
	wait_queue_head_t waiting;
};

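/*
 * Per-CPU TX context: a lock-free list of QPs with pending SQ work plus
 * a waitqueue to kick that CPU's TX thread (see siw_sq_start() and
 * siw_run_sq()).
 */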
static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);

int siw_create_tx_threads(void)
{
	int cpu, assigned = 0;

	for_each_online_cpu(cpu) {
		struct tx_task_t *tx_task;

		/* Skip HT cores */
		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
			continue;

		tx_task = &per_cpu(siw_tx_task_g, cpu);
		init_llist_head(&tx_task->active);
		init_waitqueue_head(&tx_task->waiting);

		siw_tx_thread[cpu] =
			kthread_run_on_cpu(siw_run_sq,
					   (unsigned long *)(long)cpu,
					   cpu, "siw_tx/%u");
		if (IS_ERR(siw_tx_thread[cpu])) {
			siw_tx_thread[cpu] = NULL;
			continue;
		}
		assigned++;
	}
	return assigned;
}

void siw_stop_tx_threads(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (siw_tx_thread[cpu]) {
			kthread_stop(siw_tx_thread[cpu]);
			wake_up(&per_cpu(siw_tx_task_g, cpu).waiting);
			siw_tx_thread[cpu] = NULL;
		}
	}
}

int siw_run_sq(void *data)
{
	const int nr_cpu = (unsigned int)(long)data;
	struct llist_node *active;
	struct siw_qp *qp;
	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);

	while (1) {
		struct llist_node *fifo_list = NULL;

		wait_event_interruptible(tx_task->waiting,
					 !llist_empty(&tx_task->active) ||
					 kthread_should_stop());

		if (kthread_should_stop())
			break;

		active = llist_del_all(&tx_task->active);
		/*
		 * llist_del_all returns a list with newest entry first.
		 * Re-order list for fairness among QP's.
		 */
		fifo_list = llist_reverse_order(active);
		while (fifo_list) {
			qp = container_of(fifo_list, struct siw_qp, tx_list);
			fifo_list = llist_next(fifo_list);
			qp->tx_list.next = NULL;

			siw_sq_resume(qp);
		}
	}
	active = llist_del_all(&tx_task->active);
	if (active) {
		llist_for_each_entry(qp, active, tx_list) {
			qp->tx_list.next = NULL;
			siw_sq_resume(qp);
		}
	}
	return 0;
}

int siw_sq_start(struct siw_qp *qp)
{
	if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
		return 0;

	if (unlikely(!cpu_online(qp->tx_cpu))) {
		siw_put_tx_cpu(qp->tx_cpu);
		qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
		if (qp->tx_cpu < 0) {
			pr_warn("siw: no tx cpu available\n");

			return -EIO;
		}
	}
	siw_qp_get(qp);

	llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);

	wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);

	return 0;
}