1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4/* Fredy Neeser */
5/* Greg Joyce <greg@opengridcomputing.com> */
6/* Copyright (c) 2008-2019, IBM Corporation */
7/* Copyright (c) 2017, Open Grid Computing, Inc. */
8
9#include <linux/errno.h>
10#include <linux/types.h>
11#include <linux/net.h>
12#include <linux/inetdevice.h>
13#include <net/addrconf.h>
14#include <linux/workqueue.h>
15#include <net/sock.h>
16#include <net/tcp.h>
17#include <linux/inet.h>
18#include <linux/tcp.h>
19#include <trace/events/sock.h>
20
21#include <rdma/iw_cm.h>
22#include <rdma/ib_verbs.h>
23#include <rdma/ib_user_verbs.h>
24
25#include "siw.h"
26#include "siw_cm.h"
27
28/*
29 * Set to any combination of
30 * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
31 */
32static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
33static const bool relaxed_ird_negotiation = true;
34
35static void siw_cm_llp_state_change(struct sock *s);
36static void siw_cm_llp_data_ready(struct sock *s);
37static void siw_cm_llp_write_space(struct sock *s);
38static void siw_cm_llp_error_report(struct sock *s);
39static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
40 int status);
41
42static void siw_sk_assign_cm_upcalls(struct sock *sk)
43{
44 struct siw_cep *cep = sk_to_cep(sk);
45
46 write_lock_bh(&sk->sk_callback_lock);
47 cep->sk_state_change = sk->sk_state_change;
48 cep->sk_data_ready = sk->sk_data_ready;
49 cep->sk_write_space = sk->sk_write_space;
50 cep->sk_error_report = sk->sk_error_report;
51
52 sk->sk_state_change = siw_cm_llp_state_change;
53 sk->sk_data_ready = siw_cm_llp_data_ready;
54 sk->sk_write_space = siw_cm_llp_write_space;
55 sk->sk_error_report = siw_cm_llp_error_report;
56 write_unlock_bh(&sk->sk_callback_lock);
57}
58
59static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
60{
61 sk->sk_state_change = cep->sk_state_change;
62 sk->sk_data_ready = cep->sk_data_ready;
63 sk->sk_write_space = cep->sk_write_space;
64 sk->sk_error_report = cep->sk_error_report;
65 sk->sk_user_data = NULL;
66}
67
68static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
69{
70 struct socket *s = cep->sock;
71 struct sock *sk = s->sk;
72
73 write_lock_bh(&sk->sk_callback_lock);
74
75 qp->attrs.sk = s;
76 sk->sk_data_ready = siw_qp_llp_data_ready;
77 sk->sk_write_space = siw_qp_llp_write_space;
78
79 write_unlock_bh(&sk->sk_callback_lock);
80}
81
82static void siw_socket_disassoc(struct socket *s)
83{
84 struct sock *sk = s->sk;
85 struct siw_cep *cep;
86
87 if (sk) {
88 write_lock_bh(&sk->sk_callback_lock);
89 cep = sk_to_cep(sk);
90 if (cep) {
91 siw_sk_restore_upcalls(sk, cep);
92 siw_cep_put(cep);
93 } else {
94 pr_warn("siw: cannot restore sk callbacks: no ep\n");
95 }
96 write_unlock_bh(&sk->sk_callback_lock);
97 } else {
98 pr_warn("siw: cannot restore sk callbacks: no sk\n");
99 }
100}
101
102static void siw_rtr_data_ready(struct sock *sk)
103{
104 struct siw_cep *cep;
105 struct siw_qp *qp = NULL;
106 read_descriptor_t rd_desc;
107
108 trace_sk_data_ready(sk);
109
110 read_lock(&sk->sk_callback_lock);
111
112 cep = sk_to_cep(sk);
113 if (!cep) {
114 WARN(1, "No connection endpoint\n");
115 goto out;
116 }
117 qp = sk_to_qp(sk);
118
119 memset(&rd_desc, 0, sizeof(rd_desc));
120 rd_desc.arg.data = qp;
121 rd_desc.count = 1;
122
123 tcp_read_sock(sk, desc: &rd_desc, recv_actor: siw_tcp_rx_data);
124 /*
125 * Check if first frame was successfully processed.
126 * Signal connection full establishment if yes.
127 * Failed data processing would have already scheduled
128 * connection drop.
129 */
130 if (!qp->rx_stream.rx_suspend)
131 siw_cm_upcall(cep, reason: IW_CM_EVENT_ESTABLISHED, status: 0);
132out:
133 read_unlock(&sk->sk_callback_lock);
134 if (qp)
135 siw_qp_socket_assoc(cep, qp);
136}
137
138static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
139{
140 struct sock *sk = cep->sock->sk;
141
142 write_lock_bh(&sk->sk_callback_lock);
143 sk->sk_data_ready = siw_rtr_data_ready;
144 sk->sk_write_space = siw_qp_llp_write_space;
145 write_unlock_bh(&sk->sk_callback_lock);
146}
147
148static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
149{
150 cep->sock = s;
151 siw_cep_get(cep);
152 s->sk->sk_user_data = cep;
153
154 siw_sk_assign_cm_upcalls(sk: s->sk);
155}
156
157static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
158{
159 struct siw_cep *cep = kzalloc(size: sizeof(*cep), GFP_KERNEL);
160 unsigned long flags;
161
162 if (!cep)
163 return NULL;
164
165 INIT_LIST_HEAD(list: &cep->listenq);
166 INIT_LIST_HEAD(list: &cep->devq);
167 INIT_LIST_HEAD(list: &cep->work_freelist);
168
169 kref_init(kref: &cep->ref);
170 cep->state = SIW_EPSTATE_IDLE;
171 init_waitqueue_head(&cep->waitq);
172 spin_lock_init(&cep->lock);
173 cep->sdev = sdev;
174 cep->enhanced_rdma_conn_est = false;
175
176 spin_lock_irqsave(&sdev->lock, flags);
177 list_add_tail(new: &cep->devq, head: &sdev->cep_list);
178 spin_unlock_irqrestore(lock: &sdev->lock, flags);
179
180 siw_dbg_cep(cep, "new endpoint\n");
181 return cep;
182}
183
184static void siw_cm_free_work(struct siw_cep *cep)
185{
186 struct list_head *w, *tmp;
187 struct siw_cm_work *work;
188
189 list_for_each_safe(w, tmp, &cep->work_freelist) {
190 work = list_entry(w, struct siw_cm_work, list);
191 list_del(entry: &work->list);
192 kfree(objp: work);
193 }
194}
195
196static void siw_cancel_mpatimer(struct siw_cep *cep)
197{
198 spin_lock_bh(lock: &cep->lock);
199 if (cep->mpa_timer) {
200 if (cancel_delayed_work(dwork: &cep->mpa_timer->work)) {
201 siw_cep_put(cep);
202 kfree(objp: cep->mpa_timer); /* not needed again */
203 }
204 cep->mpa_timer = NULL;
205 }
206 spin_unlock_bh(lock: &cep->lock);
207}
208
209static void siw_put_work(struct siw_cm_work *work)
210{
211 INIT_LIST_HEAD(list: &work->list);
212 spin_lock_bh(lock: &work->cep->lock);
213 list_add(new: &work->list, head: &work->cep->work_freelist);
214 spin_unlock_bh(lock: &work->cep->lock);
215}
216
217static void siw_cep_set_inuse(struct siw_cep *cep)
218{
219 unsigned long flags;
220retry:
221 spin_lock_irqsave(&cep->lock, flags);
222
223 if (cep->in_use) {
224 spin_unlock_irqrestore(lock: &cep->lock, flags);
225 wait_event_interruptible(cep->waitq, !cep->in_use);
226 if (signal_pending(current))
227 flush_signals(current);
228 goto retry;
229 } else {
230 cep->in_use = 1;
231 spin_unlock_irqrestore(lock: &cep->lock, flags);
232 }
233}
234
235static void siw_cep_set_free(struct siw_cep *cep)
236{
237 unsigned long flags;
238
239 spin_lock_irqsave(&cep->lock, flags);
240 cep->in_use = 0;
241 spin_unlock_irqrestore(lock: &cep->lock, flags);
242
243 wake_up(&cep->waitq);
244}
245
246static void __siw_cep_dealloc(struct kref *ref)
247{
248 struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
249 struct siw_device *sdev = cep->sdev;
250 unsigned long flags;
251
252 WARN_ON(cep->listen_cep);
253
254 /* kfree(NULL) is safe */
255 kfree(objp: cep->mpa.pdata);
256 spin_lock_bh(lock: &cep->lock);
257 if (!list_empty(head: &cep->work_freelist))
258 siw_cm_free_work(cep);
259 spin_unlock_bh(lock: &cep->lock);
260
261 spin_lock_irqsave(&sdev->lock, flags);
262 list_del(entry: &cep->devq);
263 spin_unlock_irqrestore(lock: &sdev->lock, flags);
264
265 siw_dbg_cep(cep, "free endpoint\n");
266 kfree(objp: cep);
267}
268
269static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
270{
271 struct siw_cm_work *work = NULL;
272
273 spin_lock_bh(lock: &cep->lock);
274 if (!list_empty(head: &cep->work_freelist)) {
275 work = list_entry(cep->work_freelist.next, struct siw_cm_work,
276 list);
277 list_del_init(entry: &work->list);
278 }
279 spin_unlock_bh(lock: &cep->lock);
280 return work;
281}
282
283static int siw_cm_alloc_work(struct siw_cep *cep, int num)
284{
285 struct siw_cm_work *work;
286
287 while (num--) {
288 work = kmalloc(size: sizeof(*work), GFP_KERNEL);
289 if (!work) {
290 if (!(list_empty(head: &cep->work_freelist)))
291 siw_cm_free_work(cep);
292 return -ENOMEM;
293 }
294 work->cep = cep;
295 INIT_LIST_HEAD(list: &work->list);
296 list_add(new: &work->list, head: &cep->work_freelist);
297 }
298 return 0;
299}
300
301/*
302 * siw_cm_upcall()
303 *
304 * Upcall to IWCM to inform about async connection events
305 */
306static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
307 int status)
308{
309 struct iw_cm_event event;
310 struct iw_cm_id *id;
311
312 memset(&event, 0, sizeof(event));
313 event.status = status;
314 event.event = reason;
315
316 if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
317 event.provider_data = cep;
318 id = cep->listen_cep->cm_id;
319 } else {
320 id = cep->cm_id;
321 }
322 /* Signal IRD and ORD */
323 if (reason == IW_CM_EVENT_ESTABLISHED ||
324 reason == IW_CM_EVENT_CONNECT_REPLY) {
325 /* Signal negotiated IRD/ORD values we will use */
326 event.ird = cep->ird;
327 event.ord = cep->ord;
328 } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
329 event.ird = cep->ord;
330 event.ord = cep->ird;
331 }
332 /* Signal private data and address information */
333 if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
334 reason == IW_CM_EVENT_CONNECT_REPLY) {
335 u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
336
337 if (pd_len) {
338 /*
339 * hand over MPA private data
340 */
341 event.private_data_len = pd_len;
342 event.private_data = cep->mpa.pdata;
343
344 /* Hide MPA V2 IRD/ORD control */
345 if (cep->enhanced_rdma_conn_est) {
346 event.private_data_len -=
347 sizeof(struct mpa_v2_data);
348 event.private_data +=
349 sizeof(struct mpa_v2_data);
350 }
351 }
352 getname_local(s: cep->sock, a: &event.local_addr);
353 getname_peer(s: cep->sock, a: &event.remote_addr);
354 }
355 siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
356 cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
357
358 return id->event_handler(id, &event);
359}
360
361static void siw_free_cm_id(struct siw_cep *cep)
362{
363 if (!cep->cm_id)
364 return;
365
366 cep->cm_id->rem_ref(cep->cm_id);
367 cep->cm_id = NULL;
368}
369
370static void siw_destroy_cep_sock(struct siw_cep *cep)
371{
372 if (cep->sock) {
373 siw_socket_disassoc(s: cep->sock);
374 sock_release(sock: cep->sock);
375 cep->sock = NULL;
376 }
377}
378
379/*
380 * siw_qp_cm_drop()
381 *
382 * Drops established LLP connection if present and not already
383 * scheduled for dropping. Called from user context, SQ workqueue
384 * or receive IRQ. Caller signals if socket can be immediately
385 * closed (basically, if not in IRQ).
386 */
387void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
388{
389 struct siw_cep *cep = qp->cep;
390
391 qp->rx_stream.rx_suspend = 1;
392 qp->tx_ctx.tx_suspend = 1;
393
394 if (!qp->cep)
395 return;
396
397 if (schedule) {
398 siw_cm_queue_work(cep, type: SIW_CM_WORK_CLOSE_LLP);
399 } else {
400 siw_cep_set_inuse(cep);
401
402 if (cep->state == SIW_EPSTATE_CLOSED) {
403 siw_dbg_cep(cep, "already closed\n");
404 goto out;
405 }
406 siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
407
408 siw_send_terminate(qp);
409
410 if (cep->cm_id) {
411 switch (cep->state) {
412 case SIW_EPSTATE_AWAIT_MPAREP:
413 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY,
414 status: -EINVAL);
415 break;
416
417 case SIW_EPSTATE_RDMA_MODE:
418 siw_cm_upcall(cep, reason: IW_CM_EVENT_CLOSE, status: 0);
419 break;
420
421 case SIW_EPSTATE_IDLE:
422 case SIW_EPSTATE_LISTENING:
423 case SIW_EPSTATE_CONNECTING:
424 case SIW_EPSTATE_AWAIT_MPAREQ:
425 case SIW_EPSTATE_RECVD_MPAREQ:
426 case SIW_EPSTATE_CLOSED:
427 default:
428 break;
429 }
430 siw_free_cm_id(cep);
431 siw_cep_put(cep);
432 }
433 cep->state = SIW_EPSTATE_CLOSED;
434
435 siw_destroy_cep_sock(cep);
436 if (cep->qp) {
437 cep->qp = NULL;
438 siw_qp_put(qp);
439 }
440out:
441 siw_cep_set_free(cep);
442 }
443}
444
445void siw_cep_put(struct siw_cep *cep)
446{
447 WARN_ON(kref_read(&cep->ref) < 1);
448 kref_put(kref: &cep->ref, release: __siw_cep_dealloc);
449}
450
/*
 * Release exclusive use of @cep, then drop one reference on it.
 */
static void siw_cep_set_free_and_put(struct siw_cep *cep)
{
	siw_cep_set_free(cep);

	siw_cep_put(cep);
}
456
457void siw_cep_get(struct siw_cep *cep)
458{
459 kref_get(kref: &cep->ref);
460}
461
462/*
463 * Expects params->pd_len in host byte order
464 */
465static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
466{
467 struct socket *s = cep->sock;
468 struct mpa_rr *rr = &cep->mpa.hdr;
469 struct kvec iov[3];
470 struct msghdr msg;
471 int rv;
472 int iovec_num = 0;
473 int mpa_len;
474
475 memset(&msg, 0, sizeof(msg));
476
477 iov[iovec_num].iov_base = rr;
478 iov[iovec_num].iov_len = sizeof(*rr);
479 mpa_len = sizeof(*rr);
480
481 if (cep->enhanced_rdma_conn_est) {
482 iovec_num++;
483 iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
484 iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
485 mpa_len += sizeof(cep->mpa.v2_ctrl);
486 }
487 if (pd_len) {
488 iovec_num++;
489 iov[iovec_num].iov_base = (char *)pdata;
490 iov[iovec_num].iov_len = pd_len;
491 mpa_len += pd_len;
492 }
493 if (cep->enhanced_rdma_conn_est)
494 pd_len += sizeof(cep->mpa.v2_ctrl);
495
496 rr->params.pd_len = cpu_to_be16(pd_len);
497
498 rv = kernel_sendmsg(sock: s, msg: &msg, vec: iov, num: iovec_num + 1, len: mpa_len);
499
500 return rv < 0 ? rv : 0;
501}
502
503/*
504 * Receive MPA Request/Reply header.
505 *
506 * Returns 0 if complete MPA Request/Reply header including
507 * eventual private data was received. Returns -EAGAIN if
508 * header was partially received or negative error code otherwise.
509 *
510 * Context: May be called in process context only
511 */
512static int siw_recv_mpa_rr(struct siw_cep *cep)
513{
514 struct mpa_rr *hdr = &cep->mpa.hdr;
515 struct socket *s = cep->sock;
516 u16 pd_len;
517 int rcvd, to_rcv;
518
519 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
520 rcvd = ksock_recv(sock: s, buf: (char *)hdr + cep->mpa.bytes_rcvd,
521 size: sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
522 flags: 0);
523 if (rcvd <= 0)
524 return -ECONNABORTED;
525
526 cep->mpa.bytes_rcvd += rcvd;
527
528 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
529 return -EAGAIN;
530
531 if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
532 return -EPROTO;
533 }
534 pd_len = be16_to_cpu(hdr->params.pd_len);
535
536 /*
537 * At least the MPA Request/Reply header (frame not including
538 * private data) has been received.
539 * Receive (or continue receiving) any private data.
540 */
541 to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
542
543 if (!to_rcv) {
544 /*
545 * We must have hdr->params.pd_len == 0 and thus received a
546 * complete MPA Request/Reply frame.
547 * Check against peer protocol violation.
548 */
549 u32 word;
550
551 rcvd = ksock_recv(sock: s, buf: (char *)&word, size: sizeof(word), MSG_DONTWAIT);
552 if (rcvd == -EAGAIN)
553 return 0;
554
555 if (rcvd == 0) {
556 siw_dbg_cep(cep, "peer EOF\n");
557 return -EPIPE;
558 }
559 if (rcvd < 0) {
560 siw_dbg_cep(cep, "error: %d\n", rcvd);
561 return rcvd;
562 }
563 siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
564
565 return -EPROTO;
566 }
567
568 /*
569 * At this point, we must have hdr->params.pd_len != 0.
570 * A private data buffer gets allocated if hdr->params.pd_len != 0.
571 */
572 if (!cep->mpa.pdata) {
573 cep->mpa.pdata = kmalloc(size: pd_len + 4, GFP_KERNEL);
574 if (!cep->mpa.pdata)
575 return -ENOMEM;
576 }
577 rcvd = ksock_recv(
578 sock: s, buf: cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
579 size: to_rcv + 4, MSG_DONTWAIT);
580
581 if (rcvd < 0)
582 return rcvd;
583
584 if (rcvd > to_rcv)
585 return -EPROTO;
586
587 cep->mpa.bytes_rcvd += rcvd;
588
589 if (to_rcv == rcvd) {
590 siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
591 return 0;
592 }
593 return -EAGAIN;
594}
595
596/*
597 * siw_proc_mpareq()
598 *
599 * Read MPA Request from socket and signal new connection to IWCM
600 * if success. Caller must hold lock on corresponding listening CEP.
601 */
602static int siw_proc_mpareq(struct siw_cep *cep)
603{
604 struct mpa_rr *req;
605 int version, rv;
606 u16 pd_len;
607
608 rv = siw_recv_mpa_rr(cep);
609 if (rv)
610 return rv;
611
612 req = &cep->mpa.hdr;
613
614 version = __mpa_rr_revision(mpa_rr_bits: req->params.bits);
615 pd_len = be16_to_cpu(req->params.pd_len);
616
617 if (version > MPA_REVISION_2)
618 /* allow for 0, 1, and 2 only */
619 return -EPROTO;
620
621 if (memcmp(p: req->key, MPA_KEY_REQ, size: 16))
622 return -EPROTO;
623
624 /* Prepare for sending MPA reply */
625 memcpy(req->key, MPA_KEY_REP, 16);
626
627 if (version == MPA_REVISION_2 &&
628 (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
629 /*
630 * MPA version 2 must signal IRD/ORD values and P2P mode
631 * in private data if header flag MPA_RR_FLAG_ENHANCED
632 * is set.
633 */
634 if (pd_len < sizeof(struct mpa_v2_data))
635 goto reject_conn;
636
637 cep->enhanced_rdma_conn_est = true;
638 }
639
640 /* MPA Markers: currently not supported. Marker TX to be added. */
641 if (req->params.bits & MPA_RR_FLAG_MARKERS)
642 goto reject_conn;
643
644 if (req->params.bits & MPA_RR_FLAG_CRC) {
645 /*
646 * RFC 5044, page 27: CRC MUST be used if peer requests it.
647 * siw specific: 'mpa_crc_strict' parameter to reject
648 * connection with CRC if local CRC off enforced by
649 * 'mpa_crc_strict' module parameter.
650 */
651 if (!mpa_crc_required && mpa_crc_strict)
652 goto reject_conn;
653
654 /* Enable CRC if requested by module parameter */
655 if (mpa_crc_required)
656 req->params.bits |= MPA_RR_FLAG_CRC;
657 }
658 if (cep->enhanced_rdma_conn_est) {
659 struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
660
661 /*
662 * Peer requested ORD becomes requested local IRD,
663 * peer requested IRD becomes requested local ORD.
664 * IRD and ORD get limited by global maximum values.
665 */
666 cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
667 cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
668 cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
669 cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
670
671 /* May get overwritten by locally negotiated values */
672 cep->mpa.v2_ctrl.ird = htons(cep->ird);
673 cep->mpa.v2_ctrl.ord = htons(cep->ord);
674
675 /*
676 * Support for peer sent zero length Write or Read to
677 * let local side enter RTS. Writes are preferred.
678 * Sends would require pre-posting a Receive and are
679 * not supported.
680 * Propose zero length Write if none of Read and Write
681 * is indicated.
682 */
683 if (v2->ird & MPA_V2_PEER_TO_PEER) {
684 cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
685
686 if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
687 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
688 else if (v2->ord & MPA_V2_RDMA_READ_RTR)
689 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
690 else
691 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
692 }
693 }
694
695 cep->state = SIW_EPSTATE_RECVD_MPAREQ;
696
697 /* Keep reference until IWCM accepts/rejects */
698 siw_cep_get(cep);
699 rv = siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REQUEST, status: 0);
700 if (rv)
701 siw_cep_put(cep);
702
703 return rv;
704
705reject_conn:
706 siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
707 req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
708 mpa_crc_required, mpa_crc_strict,
709 req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
710
711 req->params.bits &= ~MPA_RR_FLAG_MARKERS;
712 req->params.bits |= MPA_RR_FLAG_REJECT;
713
714 if (!mpa_crc_required && mpa_crc_strict)
715 req->params.bits &= ~MPA_RR_FLAG_CRC;
716
717 if (pd_len)
718 kfree(objp: cep->mpa.pdata);
719
720 cep->mpa.pdata = NULL;
721
722 siw_send_mpareqrep(cep, NULL, pd_len: 0);
723
724 return -EOPNOTSUPP;
725}
726
727static int siw_proc_mpareply(struct siw_cep *cep)
728{
729 struct siw_qp_attrs qp_attrs;
730 enum siw_qp_attr_mask qp_attr_mask;
731 struct siw_qp *qp = cep->qp;
732 struct mpa_rr *rep;
733 int rv;
734 u16 rep_ord;
735 u16 rep_ird;
736 bool ird_insufficient = false;
737 enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
738
739 rv = siw_recv_mpa_rr(cep);
740 if (rv)
741 goto out_err;
742
743 siw_cancel_mpatimer(cep);
744
745 rep = &cep->mpa.hdr;
746
747 if (__mpa_rr_revision(mpa_rr_bits: rep->params.bits) > MPA_REVISION_2) {
748 /* allow for 0, 1, and 2 only */
749 rv = -EPROTO;
750 goto out_err;
751 }
752 if (memcmp(p: rep->key, MPA_KEY_REP, size: 16)) {
753 siw_init_terminate(qp, layer: TERM_ERROR_LAYER_LLP, etype: LLP_ETYPE_MPA,
754 ecode: LLP_ECODE_INVALID_REQ_RESP, in_tx: 0);
755 siw_send_terminate(qp);
756 rv = -EPROTO;
757 goto out_err;
758 }
759 if (rep->params.bits & MPA_RR_FLAG_REJECT) {
760 siw_dbg_cep(cep, "got mpa reject\n");
761 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY, status: -ECONNRESET);
762
763 return -ECONNRESET;
764 }
765 if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
766 siw_dbg_cep(cep, "peer allows GSO on TX\n");
767 qp->tx_ctx.gso_seg_limit = 0;
768 }
769 if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
770 (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
771 (mpa_crc_strict && !mpa_crc_required &&
772 (rep->params.bits & MPA_RR_FLAG_CRC))) {
773 siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
774 rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
775 mpa_crc_required, mpa_crc_strict,
776 rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
777
778 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY, status: -ECONNREFUSED);
779
780 return -EINVAL;
781 }
782 if (cep->enhanced_rdma_conn_est) {
783 struct mpa_v2_data *v2;
784
785 if (__mpa_rr_revision(mpa_rr_bits: rep->params.bits) < MPA_REVISION_2 ||
786 !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
787 /*
788 * Protocol failure: The responder MUST reply with
789 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
790 */
791 siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
792 __mpa_rr_revision(rep->params.bits),
793 rep->params.bits & MPA_RR_FLAG_ENHANCED ?
794 1 :
795 0);
796
797 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY,
798 status: -ECONNRESET);
799 return -EINVAL;
800 }
801 v2 = (struct mpa_v2_data *)cep->mpa.pdata;
802 rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
803 rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
804
805 if (cep->ird < rep_ord &&
806 (relaxed_ird_negotiation == false ||
807 rep_ord > cep->sdev->attrs.max_ird)) {
808 siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
809 cep->ird, rep_ord,
810 cep->sdev->attrs.max_ord);
811 ird_insufficient = true;
812 }
813 if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
814 siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
815 rep_ird);
816 ird_insufficient = true;
817 }
818 /*
819 * Always report negotiated peer values to user,
820 * even if IRD/ORD negotiation failed
821 */
822 cep->ird = rep_ord;
823 cep->ord = rep_ird;
824
825 if (ird_insufficient) {
826 /*
827 * If the initiator IRD is insuffient for the
828 * responder ORD, send a TERM.
829 */
830 siw_init_terminate(qp, layer: TERM_ERROR_LAYER_LLP,
831 etype: LLP_ETYPE_MPA,
832 ecode: LLP_ECODE_INSUFFICIENT_IRD, in_tx: 0);
833 siw_send_terminate(qp);
834 rv = -ENOMEM;
835 goto out_err;
836 }
837 if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
838 mpa_p2p_mode =
839 cep->mpa.v2_ctrl_req.ord &
840 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
841
842 /*
843 * Check if we requested P2P mode, and if peer agrees
844 */
845 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
846 if ((mpa_p2p_mode & v2->ord) == 0) {
847 /*
848 * We requested RTR mode(s), but the peer
849 * did not pick any mode we support.
850 */
851 siw_dbg_cep(cep,
852 "rtr mode: req %2x, got %2x\n",
853 mpa_p2p_mode,
854 v2->ord & (MPA_V2_RDMA_WRITE_RTR |
855 MPA_V2_RDMA_READ_RTR));
856
857 siw_init_terminate(qp, layer: TERM_ERROR_LAYER_LLP,
858 etype: LLP_ETYPE_MPA,
859 ecode: LLP_ECODE_NO_MATCHING_RTR,
860 in_tx: 0);
861 siw_send_terminate(qp);
862 rv = -EPROTO;
863 goto out_err;
864 }
865 mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
866 MPA_V2_RDMA_READ_RTR);
867 }
868 }
869 memset(&qp_attrs, 0, sizeof(qp_attrs));
870
871 if (rep->params.bits & MPA_RR_FLAG_CRC)
872 qp_attrs.flags = SIW_MPA_CRC;
873
874 qp_attrs.irq_size = cep->ird;
875 qp_attrs.orq_size = cep->ord;
876 qp_attrs.sk = cep->sock;
877 qp_attrs.state = SIW_QP_STATE_RTS;
878
879 qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
880 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
881
882 /* Move socket RX/TX under QP control */
883 down_write(sem: &qp->state_lock);
884 if (qp->attrs.state > SIW_QP_STATE_RTR) {
885 rv = -EINVAL;
886 up_write(sem: &qp->state_lock);
887 goto out_err;
888 }
889 rv = siw_qp_modify(qp, attr: &qp_attrs, mask: qp_attr_mask);
890
891 siw_qp_socket_assoc(cep, qp);
892
893 up_write(sem: &qp->state_lock);
894
895 /* Send extra RDMA frame to trigger peer RTS if negotiated */
896 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
897 rv = siw_qp_mpa_rts(qp, ctrl: mpa_p2p_mode);
898 if (rv)
899 goto out_err;
900 }
901 if (!rv) {
902 rv = siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY, status: 0);
903 if (!rv)
904 cep->state = SIW_EPSTATE_RDMA_MODE;
905
906 return 0;
907 }
908
909out_err:
910 if (rv != -EAGAIN)
911 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY, status: -EINVAL);
912
913 return rv;
914}
915
916/*
917 * siw_accept_newconn - accept an incoming pending connection
918 *
919 */
920static void siw_accept_newconn(struct siw_cep *cep)
921{
922 struct socket *s = cep->sock;
923 struct socket *new_s = NULL;
924 struct siw_cep *new_cep = NULL;
925 int rv = 0; /* debug only. should disappear */
926
927 if (cep->state != SIW_EPSTATE_LISTENING)
928 goto error;
929
930 new_cep = siw_cep_alloc(sdev: cep->sdev);
931 if (!new_cep)
932 goto error;
933
934 /*
935 * 4: Allocate a sufficient number of work elements
936 * to allow concurrent handling of local + peer close
937 * events, MPA header processing + MPA timeout.
938 */
939 if (siw_cm_alloc_work(cep: new_cep, num: 4) != 0)
940 goto error;
941
942 /*
943 * Copy saved socket callbacks from listening CEP
944 * and assign new socket with new CEP
945 */
946 new_cep->sk_state_change = cep->sk_state_change;
947 new_cep->sk_data_ready = cep->sk_data_ready;
948 new_cep->sk_write_space = cep->sk_write_space;
949 new_cep->sk_error_report = cep->sk_error_report;
950
951 rv = kernel_accept(sock: s, newsock: &new_s, O_NONBLOCK);
952 if (rv != 0) {
953 /*
954 * Connection already aborted by peer..?
955 */
956 siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
957 goto error;
958 }
959 new_cep->sock = new_s;
960 siw_cep_get(cep: new_cep);
961 new_s->sk->sk_user_data = new_cep;
962
963 if (siw_tcp_nagle == false)
964 tcp_sock_set_nodelay(sk: new_s->sk);
965 new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
966
967 rv = siw_cm_queue_work(cep: new_cep, type: SIW_CM_WORK_MPATIMEOUT);
968 if (rv)
969 goto error;
970 /*
971 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
972 */
973 new_cep->listen_cep = cep;
974 siw_cep_get(cep);
975
976 if (atomic_read(v: &new_s->sk->sk_rmem_alloc)) {
977 /*
978 * MPA REQ already queued
979 */
980 siw_dbg_cep(cep, "immediate mpa request\n");
981
982 siw_cep_set_inuse(cep: new_cep);
983 rv = siw_proc_mpareq(cep: new_cep);
984 if (rv != -EAGAIN) {
985 siw_cep_put(cep);
986 new_cep->listen_cep = NULL;
987 if (rv) {
988 siw_cancel_mpatimer(cep: new_cep);
989 siw_cep_set_free(cep: new_cep);
990 goto error;
991 }
992 }
993 siw_cep_set_free(cep: new_cep);
994 }
995 return;
996
997error:
998 if (new_cep)
999 siw_cep_put(cep: new_cep);
1000
1001 if (new_s) {
1002 siw_socket_disassoc(s: new_s);
1003 sock_release(sock: new_s);
1004 new_cep->sock = NULL;
1005 }
1006 siw_dbg_cep(cep, "error %d\n", rv);
1007}
1008
1009static void siw_cm_work_handler(struct work_struct *w)
1010{
1011 struct siw_cm_work *work;
1012 struct siw_cep *cep;
1013 int release_cep = 0, rv = 0;
1014
1015 work = container_of(w, struct siw_cm_work, work.work);
1016 cep = work->cep;
1017
1018 siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
1019 cep->qp ? qp_id(cep->qp) : UINT_MAX,
1020 work->type, cep->state);
1021
1022 siw_cep_set_inuse(cep);
1023
1024 switch (work->type) {
1025 case SIW_CM_WORK_ACCEPT:
1026 siw_accept_newconn(cep);
1027 break;
1028
1029 case SIW_CM_WORK_READ_MPAHDR:
1030 if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1031 if (cep->listen_cep) {
1032 siw_cep_set_inuse(cep: cep->listen_cep);
1033
1034 if (cep->listen_cep->state ==
1035 SIW_EPSTATE_LISTENING)
1036 rv = siw_proc_mpareq(cep);
1037 else
1038 rv = -EFAULT;
1039
1040 siw_cep_set_free(cep: cep->listen_cep);
1041
1042 if (rv != -EAGAIN) {
1043 siw_cep_put(cep: cep->listen_cep);
1044 cep->listen_cep = NULL;
1045 if (rv)
1046 siw_cep_put(cep);
1047 }
1048 }
1049 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1050 rv = siw_proc_mpareply(cep);
1051 } else {
1052 /*
1053 * CEP already moved out of MPA handshake.
1054 * any connection management already done.
1055 * silently ignore the mpa packet.
1056 */
1057 if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1058 cep->sock->sk->sk_data_ready(cep->sock->sk);
1059 siw_dbg_cep(cep, "already in RDMA mode");
1060 } else {
1061 siw_dbg_cep(cep, "out of state: %d\n",
1062 cep->state);
1063 }
1064 }
1065 if (rv && rv != -EAGAIN)
1066 release_cep = 1;
1067 break;
1068
1069 case SIW_CM_WORK_CLOSE_LLP:
1070 /*
1071 * QP scheduled LLP close
1072 */
1073 if (cep->qp)
1074 siw_send_terminate(qp: cep->qp);
1075
1076 if (cep->cm_id)
1077 siw_cm_upcall(cep, reason: IW_CM_EVENT_CLOSE, status: 0);
1078
1079 release_cep = 1;
1080 break;
1081
1082 case SIW_CM_WORK_PEER_CLOSE:
1083 if (cep->cm_id) {
1084 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1085 /*
1086 * MPA reply not received, but connection drop
1087 */
1088 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY,
1089 status: -ECONNRESET);
1090 } else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1091 /*
1092 * NOTE: IW_CM_EVENT_DISCONNECT is given just
1093 * to transition IWCM into CLOSING.
1094 */
1095 siw_cm_upcall(cep, reason: IW_CM_EVENT_DISCONNECT, status: 0);
1096 siw_cm_upcall(cep, reason: IW_CM_EVENT_CLOSE, status: 0);
1097 }
1098 /*
1099 * for other states there is no connection
1100 * known to the IWCM.
1101 */
1102 } else {
1103 if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
1104 /*
1105 * Wait for the ulp/CM to call accept/reject
1106 */
1107 siw_dbg_cep(cep,
1108 "mpa req recvd, wait for ULP\n");
1109 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1110 /*
1111 * Socket close before MPA request received.
1112 */
1113 if (cep->listen_cep) {
1114 siw_dbg_cep(cep,
1115 "no mpareq: drop listener\n");
1116 siw_cep_put(cep: cep->listen_cep);
1117 cep->listen_cep = NULL;
1118 }
1119 }
1120 }
1121 release_cep = 1;
1122 break;
1123
1124 case SIW_CM_WORK_MPATIMEOUT:
1125 cep->mpa_timer = NULL;
1126
1127 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1128 /*
1129 * MPA request timed out:
1130 * Hide any partially received private data and signal
1131 * timeout
1132 */
1133 cep->mpa.hdr.params.pd_len = 0;
1134
1135 if (cep->cm_id)
1136 siw_cm_upcall(cep, reason: IW_CM_EVENT_CONNECT_REPLY,
1137 status: -ETIMEDOUT);
1138 release_cep = 1;
1139
1140 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1141 /*
1142 * No MPA request received after peer TCP stream setup.
1143 */
1144 if (cep->listen_cep) {
1145 siw_cep_put(cep: cep->listen_cep);
1146 cep->listen_cep = NULL;
1147 }
1148 release_cep = 1;
1149 }
1150 break;
1151
1152 default:
1153 WARN(1, "Undefined CM work type: %d\n", work->type);
1154 }
1155 if (release_cep) {
1156 siw_dbg_cep(cep,
1157 "release: timer=%s, QP[%u]\n",
1158 cep->mpa_timer ? "y" : "n",
1159 cep->qp ? qp_id(cep->qp) : UINT_MAX);
1160
1161 siw_cancel_mpatimer(cep);
1162
1163 cep->state = SIW_EPSTATE_CLOSED;
1164
1165 if (cep->qp) {
1166 struct siw_qp *qp = cep->qp;
1167 /*
1168 * Serialize a potential race with application
1169 * closing the QP and calling siw_qp_cm_drop()
1170 */
1171 siw_qp_get(qp);
1172 siw_cep_set_free(cep);
1173
1174 siw_qp_llp_close(qp);
1175 siw_qp_put(qp);
1176
1177 siw_cep_set_inuse(cep);
1178 cep->qp = NULL;
1179 siw_qp_put(qp);
1180 }
1181 if (cep->sock) {
1182 siw_socket_disassoc(s: cep->sock);
1183 sock_release(sock: cep->sock);
1184 cep->sock = NULL;
1185 }
1186 if (cep->cm_id) {
1187 siw_free_cm_id(cep);
1188 siw_cep_put(cep);
1189 }
1190 }
1191 siw_cep_set_free(cep);
1192 siw_put_work(work);
1193 siw_cep_put(cep);
1194}
1195
1196static struct workqueue_struct *siw_cm_wq;
1197
1198int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
1199{
1200 struct siw_cm_work *work = siw_get_work(cep);
1201 unsigned long delay = 0;
1202
1203 if (!work) {
1204 siw_dbg_cep(cep, "failed with no work available\n");
1205 return -ENOMEM;
1206 }
1207 work->type = type;
1208 work->cep = cep;
1209
1210 siw_cep_get(cep);
1211
1212 INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
1213
1214 if (type == SIW_CM_WORK_MPATIMEOUT) {
1215 cep->mpa_timer = work;
1216
1217 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
1218 delay = MPAREQ_TIMEOUT;
1219 else
1220 delay = MPAREP_TIMEOUT;
1221 }
1222 siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
1223 cep->qp ? qp_id(cep->qp) : -1, type, delay);
1224
1225 queue_delayed_work(wq: siw_cm_wq, dwork: &work->work, delay);
1226
1227 return 0;
1228}
1229
1230static void siw_cm_llp_data_ready(struct sock *sk)
1231{
1232 struct siw_cep *cep;
1233
1234 trace_sk_data_ready(sk);
1235
1236 read_lock(&sk->sk_callback_lock);
1237
1238 cep = sk_to_cep(sk);
1239 if (!cep)
1240 goto out;
1241
1242 siw_dbg_cep(cep, "cep state: %d, socket state %d\n",
1243 cep->state, sk->sk_state);
1244
1245 if (sk->sk_state != TCP_ESTABLISHED)
1246 goto out;
1247
1248 switch (cep->state) {
1249 case SIW_EPSTATE_RDMA_MODE:
1250 case SIW_EPSTATE_LISTENING:
1251 break;
1252
1253 case SIW_EPSTATE_AWAIT_MPAREQ:
1254 case SIW_EPSTATE_AWAIT_MPAREP:
1255 siw_cm_queue_work(cep, type: SIW_CM_WORK_READ_MPAHDR);
1256 break;
1257
1258 default:
1259 siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
1260 break;
1261 }
1262out:
1263 read_unlock(&sk->sk_callback_lock);
1264}
1265
1266static void siw_cm_llp_write_space(struct sock *sk)
1267{
1268 struct siw_cep *cep = sk_to_cep(sk);
1269
1270 if (cep)
1271 siw_dbg_cep(cep, "state: %d\n", cep->state);
1272}
1273
1274static void siw_cm_llp_error_report(struct sock *sk)
1275{
1276 struct siw_cep *cep = sk_to_cep(sk);
1277
1278 if (cep) {
1279 siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
1280 sk->sk_err, sk->sk_state, cep->state);
1281 cep->sk_error_report(sk);
1282 }
1283}
1284
1285static void siw_cm_llp_state_change(struct sock *sk)
1286{
1287 struct siw_cep *cep;
1288 void (*orig_state_change)(struct sock *s);
1289
1290 read_lock(&sk->sk_callback_lock);
1291
1292 cep = sk_to_cep(sk);
1293 if (!cep) {
1294 /* endpoint already disassociated */
1295 read_unlock(&sk->sk_callback_lock);
1296 return;
1297 }
1298 orig_state_change = cep->sk_state_change;
1299
1300 siw_dbg_cep(cep, "state: %d\n", cep->state);
1301
1302 switch (sk->sk_state) {
1303 case TCP_ESTABLISHED:
1304 /*
1305 * handle accepting socket as special case where only
1306 * new connection is possible
1307 */
1308 siw_cm_queue_work(cep, type: SIW_CM_WORK_ACCEPT);
1309 break;
1310
1311 case TCP_CLOSE:
1312 case TCP_CLOSE_WAIT:
1313 if (cep->qp)
1314 cep->qp->tx_ctx.tx_suspend = 1;
1315 siw_cm_queue_work(cep, type: SIW_CM_WORK_PEER_CLOSE);
1316 break;
1317
1318 default:
1319 siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
1320 }
1321 read_unlock(&sk->sk_callback_lock);
1322 orig_state_change(sk);
1323}
1324
1325static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
1326 struct sockaddr *raddr, bool afonly)
1327{
1328 int rv, flags = 0;
1329 size_t size = laddr->sa_family == AF_INET ?
1330 sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
1331
1332 /*
1333 * Make address available again asap.
1334 */
1335 sock_set_reuseaddr(sk: s->sk);
1336
1337 if (afonly) {
1338 rv = ip6_sock_set_v6only(sk: s->sk);
1339 if (rv)
1340 return rv;
1341 }
1342
1343 rv = s->ops->bind(s, laddr, size);
1344 if (rv < 0)
1345 return rv;
1346
1347 rv = s->ops->connect(s, raddr, size, flags);
1348
1349 return rv < 0 ? rv : 0;
1350}
1351
1352int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1353{
1354 struct siw_device *sdev = to_siw_dev(base_dev: id->device);
1355 struct siw_qp *qp;
1356 struct siw_cep *cep = NULL;
1357 struct socket *s = NULL;
1358 struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
1359 *raddr = (struct sockaddr *)&id->remote_addr;
1360 bool p2p_mode = peer_to_peer, v4 = true;
1361 u16 pd_len = params->private_data_len;
1362 int version = mpa_version, rv;
1363
1364 if (pd_len > MPA_MAX_PRIVDATA)
1365 return -EINVAL;
1366
1367 if (params->ird > sdev->attrs.max_ird ||
1368 params->ord > sdev->attrs.max_ord)
1369 return -ENOMEM;
1370
1371 if (laddr->sa_family == AF_INET6)
1372 v4 = false;
1373 else if (laddr->sa_family != AF_INET)
1374 return -EAFNOSUPPORT;
1375
1376 /*
1377 * Respect any iwarp port mapping: Use mapped remote address
1378 * if valid. Local address must not be mapped, since siw
1379 * uses kernel TCP stack.
1380 */
1381 if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
1382 to_sockaddr_in6(id->remote_addr).sin6_port != 0)
1383 raddr = (struct sockaddr *)&id->m_remote_addr;
1384
1385 qp = siw_qp_id2obj(sdev, id: params->qpn);
1386 if (!qp) {
1387 WARN(1, "[QP %u] does not exist\n", params->qpn);
1388 rv = -EINVAL;
1389 goto error;
1390 }
1391 siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
1392 raddr);
1393
1394 rv = sock_create(family: v4 ? AF_INET : AF_INET6, type: SOCK_STREAM, IPPROTO_TCP, res: &s);
1395 if (rv < 0)
1396 goto error;
1397
1398 /*
1399 * NOTE: For simplification, connect() is called in blocking
1400 * mode. Might be reconsidered for async connection setup at
1401 * TCP level.
1402 */
1403 rv = kernel_bindconnect(s, laddr, raddr, afonly: id->afonly);
1404 if (rv != 0) {
1405 siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
1406 goto error;
1407 }
1408 if (siw_tcp_nagle == false)
1409 tcp_sock_set_nodelay(sk: s->sk);
1410 cep = siw_cep_alloc(sdev);
1411 if (!cep) {
1412 rv = -ENOMEM;
1413 goto error;
1414 }
1415 siw_cep_set_inuse(cep);
1416
1417 /* Associate QP with CEP */
1418 siw_cep_get(cep);
1419 qp->cep = cep;
1420
1421 /* siw_qp_get(qp) already done by QP lookup */
1422 cep->qp = qp;
1423
1424 id->add_ref(id);
1425 cep->cm_id = id;
1426
1427 /*
1428 * 4: Allocate a sufficient number of work elements
1429 * to allow concurrent handling of local + peer close
1430 * events, MPA header processing + MPA timeout.
1431 */
1432 rv = siw_cm_alloc_work(cep, num: 4);
1433 if (rv != 0) {
1434 rv = -ENOMEM;
1435 goto error;
1436 }
1437 cep->ird = params->ird;
1438 cep->ord = params->ord;
1439
1440 if (p2p_mode && cep->ord == 0)
1441 cep->ord = 1;
1442
1443 cep->state = SIW_EPSTATE_CONNECTING;
1444
1445 /*
1446 * Associate CEP with socket
1447 */
1448 siw_cep_socket_assoc(cep, s);
1449
1450 cep->state = SIW_EPSTATE_AWAIT_MPAREP;
1451
1452 /*
1453 * Set MPA Request bits: CRC if required, no MPA Markers,
1454 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
1455 */
1456 cep->mpa.hdr.params.bits = 0;
1457 if (version > MPA_REVISION_2) {
1458 pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
1459 version = MPA_REVISION_2;
1460 /* Adjust also module parameter */
1461 mpa_version = MPA_REVISION_2;
1462 }
1463 __mpa_rr_set_revision(bits: &cep->mpa.hdr.params.bits, rev: version);
1464
1465 if (try_gso)
1466 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
1467
1468 if (mpa_crc_required)
1469 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
1470
1471 /*
1472 * If MPA version == 2:
1473 * o Include ORD and IRD.
1474 * o Indicate peer-to-peer mode, if required by module
1475 * parameter 'peer_to_peer'.
1476 */
1477 if (version == MPA_REVISION_2) {
1478 cep->enhanced_rdma_conn_est = true;
1479 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
1480
1481 cep->mpa.v2_ctrl.ird = htons(cep->ird);
1482 cep->mpa.v2_ctrl.ord = htons(cep->ord);
1483
1484 if (p2p_mode) {
1485 cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
1486 cep->mpa.v2_ctrl.ord |= rtr_type;
1487 }
1488 /* Remember own P2P mode requested */
1489 cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
1490 cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
1491 }
1492 memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
1493
1494 rv = siw_send_mpareqrep(cep, pdata: params->private_data, pd_len);
1495 /*
1496 * Reset private data.
1497 */
1498 cep->mpa.hdr.params.pd_len = 0;
1499
1500 if (rv >= 0) {
1501 rv = siw_cm_queue_work(cep, type: SIW_CM_WORK_MPATIMEOUT);
1502 if (!rv) {
1503 siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
1504 siw_cep_set_free(cep);
1505 return 0;
1506 }
1507 }
1508error:
1509 siw_dbg(id->device, "failed: %d\n", rv);
1510
1511 if (cep) {
1512 siw_socket_disassoc(s);
1513 sock_release(sock: s);
1514 cep->sock = NULL;
1515
1516 cep->qp = NULL;
1517
1518 cep->cm_id = NULL;
1519 id->rem_ref(id);
1520
1521 qp->cep = NULL;
1522 siw_cep_put(cep);
1523
1524 cep->state = SIW_EPSTATE_CLOSED;
1525
1526 siw_cep_set_free_and_put(cep);
1527
1528 } else if (s) {
1529 sock_release(sock: s);
1530 }
1531 if (qp)
1532 siw_qp_put(qp);
1533
1534 return rv;
1535}
1536
1537/*
1538 * siw_accept - Let SoftiWARP accept an RDMA connection request
1539 *
1540 * @id: New connection management id to be used for accepted
1541 * connection request
1542 * @params: Connection parameters provided by ULP for accepting connection
1543 *
1544 * Transition QP to RTS state, associate new CM id @id with accepted CEP
1545 * and get prepared for TCP input by installing socket callbacks.
1546 * Then send MPA Reply and generate the "connection established" event.
1547 * Socket callbacks must be installed before sending MPA Reply, because
1548 * the latter may cause a first RDMA message to arrive from the RDMA Initiator
1549 * side very quickly, at which time the socket callbacks must be ready.
1550 */
1551int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1552{
1553 struct siw_device *sdev = to_siw_dev(base_dev: id->device);
1554 struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1555 struct siw_qp *qp;
1556 struct siw_qp_attrs qp_attrs;
1557 int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA;
1558 bool wait_for_peer_rts = false;
1559
1560 siw_cep_set_inuse(cep);
1561 siw_cep_put(cep);
1562
1563 /* Free lingering inbound private data */
1564 if (cep->mpa.hdr.params.pd_len) {
1565 cep->mpa.hdr.params.pd_len = 0;
1566 kfree(objp: cep->mpa.pdata);
1567 cep->mpa.pdata = NULL;
1568 }
1569 siw_cancel_mpatimer(cep);
1570
1571 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1572 siw_dbg_cep(cep, "out of state\n");
1573 rv = -ECONNRESET;
1574 goto free_cep;
1575 }
1576 qp = siw_qp_id2obj(sdev, id: params->qpn);
1577 if (!qp) {
1578 WARN(1, "[QP %d] does not exist\n", params->qpn);
1579 goto free_cep;
1580 }
1581 down_write(sem: &qp->state_lock);
1582 if (qp->attrs.state > SIW_QP_STATE_RTR)
1583 goto error_unlock;
1584 siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
1585
1586 if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
1587 siw_dbg_cep(cep, "peer allows GSO on TX\n");
1588 qp->tx_ctx.gso_seg_limit = 0;
1589 }
1590 if (params->ord > sdev->attrs.max_ord ||
1591 params->ird > sdev->attrs.max_ird) {
1592 siw_dbg_cep(
1593 cep,
1594 "[QP %u]: ord %d (max %d), ird %d (max %d)\n",
1595 qp_id(qp), params->ord, sdev->attrs.max_ord,
1596 params->ird, sdev->attrs.max_ird);
1597 goto error_unlock;
1598 }
1599 if (cep->enhanced_rdma_conn_est)
1600 max_priv_data -= sizeof(struct mpa_v2_data);
1601
1602 if (params->private_data_len > max_priv_data) {
1603 siw_dbg_cep(
1604 cep,
1605 "[QP %u]: private data length: %d (max %d)\n",
1606 qp_id(qp), params->private_data_len, max_priv_data);
1607 goto error_unlock;
1608 }
1609 if (cep->enhanced_rdma_conn_est) {
1610 if (params->ord > cep->ord) {
1611 if (relaxed_ird_negotiation) {
1612 params->ord = cep->ord;
1613 } else {
1614 cep->ird = params->ird;
1615 cep->ord = params->ord;
1616 goto error_unlock;
1617 }
1618 }
1619 if (params->ird < cep->ird) {
1620 if (relaxed_ird_negotiation &&
1621 cep->ird <= sdev->attrs.max_ird)
1622 params->ird = cep->ird;
1623 else {
1624 rv = -ENOMEM;
1625 goto error_unlock;
1626 }
1627 }
1628 if (cep->mpa.v2_ctrl.ord &
1629 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
1630 wait_for_peer_rts = true;
1631 /*
1632 * Signal back negotiated IRD and ORD values
1633 */
1634 cep->mpa.v2_ctrl.ord =
1635 htons(params->ord & MPA_IRD_ORD_MASK) |
1636 (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
1637 cep->mpa.v2_ctrl.ird =
1638 htons(params->ird & MPA_IRD_ORD_MASK) |
1639 (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
1640 }
1641 cep->ird = params->ird;
1642 cep->ord = params->ord;
1643
1644 cep->cm_id = id;
1645 id->add_ref(id);
1646
1647 memset(&qp_attrs, 0, sizeof(qp_attrs));
1648 qp_attrs.orq_size = cep->ord;
1649 qp_attrs.irq_size = cep->ird;
1650 qp_attrs.sk = cep->sock;
1651 if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
1652 qp_attrs.flags = SIW_MPA_CRC;
1653 qp_attrs.state = SIW_QP_STATE_RTS;
1654
1655 siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
1656
1657 /* Associate QP with CEP */
1658 siw_cep_get(cep);
1659 qp->cep = cep;
1660
1661 /* siw_qp_get(qp) already done by QP lookup */
1662 cep->qp = qp;
1663
1664 cep->state = SIW_EPSTATE_RDMA_MODE;
1665
1666 /* Move socket RX/TX under QP control */
1667 rv = siw_qp_modify(qp, attr: &qp_attrs,
1668 mask: SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
1669 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
1670 SIW_QP_ATTR_MPA);
1671 up_write(sem: &qp->state_lock);
1672 if (rv)
1673 goto error;
1674
1675 siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
1676 qp_id(qp), params->private_data_len);
1677
1678 rv = siw_send_mpareqrep(cep, pdata: params->private_data,
1679 pd_len: params->private_data_len);
1680 if (rv != 0)
1681 goto error;
1682
1683 if (wait_for_peer_rts) {
1684 siw_sk_assign_rtr_upcalls(cep);
1685 } else {
1686 siw_qp_socket_assoc(cep, qp);
1687 rv = siw_cm_upcall(cep, reason: IW_CM_EVENT_ESTABLISHED, status: 0);
1688 if (rv)
1689 goto error;
1690 }
1691 siw_cep_set_free(cep);
1692
1693 return 0;
1694
1695error_unlock:
1696 up_write(sem: &qp->state_lock);
1697error:
1698 siw_destroy_cep_sock(cep);
1699
1700 cep->state = SIW_EPSTATE_CLOSED;
1701
1702 siw_free_cm_id(cep);
1703 if (qp->cep) {
1704 siw_cep_put(cep);
1705 qp->cep = NULL;
1706 }
1707 cep->qp = NULL;
1708 siw_qp_put(qp);
1709free_cep:
1710 siw_cep_set_free_and_put(cep);
1711 return rv;
1712}
1713
1714/*
1715 * siw_reject()
1716 *
1717 * Local connection reject case. Send private data back to peer,
1718 * close connection and dereference connection id.
1719 */
1720int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
1721{
1722 struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1723
1724 siw_cep_set_inuse(cep);
1725 siw_cep_put(cep);
1726
1727 siw_cancel_mpatimer(cep);
1728
1729 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1730 siw_dbg_cep(cep, "out of state\n");
1731
1732 siw_cep_set_free_and_put(cep); /* put last reference */
1733
1734 return -ECONNRESET;
1735 }
1736 siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
1737 pd_len);
1738
1739 if (__mpa_rr_revision(mpa_rr_bits: cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
1740 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
1741 siw_send_mpareqrep(cep, pdata, pd_len);
1742 }
1743 siw_destroy_cep_sock(cep);
1744
1745 cep->state = SIW_EPSTATE_CLOSED;
1746
1747 siw_cep_set_free_and_put(cep);
1748
1749 return 0;
1750}
1751
1752/*
1753 * siw_create_listen - Create resources for a listener's IWCM ID @id
1754 *
1755 * Starts listen on the socket address id->local_addr.
1756 *
1757 */
1758int siw_create_listen(struct iw_cm_id *id, int backlog)
1759{
1760 struct socket *s;
1761 struct siw_cep *cep = NULL;
1762 struct siw_device *sdev = to_siw_dev(base_dev: id->device);
1763 int addr_family = id->local_addr.ss_family;
1764 int rv = 0;
1765
1766 if (addr_family != AF_INET && addr_family != AF_INET6)
1767 return -EAFNOSUPPORT;
1768
1769 rv = sock_create(family: addr_family, type: SOCK_STREAM, IPPROTO_TCP, res: &s);
1770 if (rv < 0)
1771 return rv;
1772
1773 /*
1774 * Allow binding local port when still in TIME_WAIT from last close.
1775 */
1776 sock_set_reuseaddr(sk: s->sk);
1777
1778 if (addr_family == AF_INET) {
1779 struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
1780
1781 /* For wildcard addr, limit binding to current device only */
1782 if (ipv4_is_zeronet(addr: laddr->sin_addr.s_addr))
1783 s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
1784
1785 rv = s->ops->bind(s, (struct sockaddr *)laddr,
1786 sizeof(struct sockaddr_in));
1787 } else {
1788 struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
1789
1790 if (id->afonly) {
1791 rv = ip6_sock_set_v6only(sk: s->sk);
1792 if (rv) {
1793 siw_dbg(id->device,
1794 "ip6_sock_set_v6only erro: %d\n", rv);
1795 goto error;
1796 }
1797 }
1798
1799 /* For wildcard addr, limit binding to current device only */
1800 if (ipv6_addr_any(a: &laddr->sin6_addr))
1801 s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
1802
1803 rv = s->ops->bind(s, (struct sockaddr *)laddr,
1804 sizeof(struct sockaddr_in6));
1805 }
1806 if (rv) {
1807 siw_dbg(id->device, "socket bind error: %d\n", rv);
1808 goto error;
1809 }
1810 cep = siw_cep_alloc(sdev);
1811 if (!cep) {
1812 rv = -ENOMEM;
1813 goto error;
1814 }
1815 siw_cep_socket_assoc(cep, s);
1816
1817 rv = siw_cm_alloc_work(cep, num: backlog);
1818 if (rv) {
1819 siw_dbg(id->device,
1820 "alloc_work error %d, backlog %d\n",
1821 rv, backlog);
1822 goto error;
1823 }
1824 rv = s->ops->listen(s, backlog);
1825 if (rv) {
1826 siw_dbg(id->device, "listen error %d\n", rv);
1827 goto error;
1828 }
1829 cep->cm_id = id;
1830 id->add_ref(id);
1831
1832 /*
1833 * In case of a wildcard rdma_listen on a multi-homed device,
1834 * a listener's IWCM id is associated with more than one listening CEP.
1835 *
1836 * We currently use id->provider_data in three different ways:
1837 *
1838 * o For a listener's IWCM id, id->provider_data points to
1839 * the list_head of the list of listening CEPs.
1840 * Uses: siw_create_listen(), siw_destroy_listen()
1841 *
1842 * o For each accepted passive-side IWCM id, id->provider_data
1843 * points to the CEP itself. This is a consequence of
1844 * - siw_cm_upcall() setting event.provider_data = cep and
1845 * - the IWCM's cm_conn_req_handler() setting provider_data of the
1846 * new passive-side IWCM id equal to event.provider_data
1847 * Uses: siw_accept(), siw_reject()
1848 *
1849 * o For an active-side IWCM id, id->provider_data is not used at all.
1850 *
1851 */
1852 if (!id->provider_data) {
1853 id->provider_data =
1854 kmalloc(size: sizeof(struct list_head), GFP_KERNEL);
1855 if (!id->provider_data) {
1856 rv = -ENOMEM;
1857 goto error;
1858 }
1859 INIT_LIST_HEAD(list: (struct list_head *)id->provider_data);
1860 }
1861 list_add_tail(new: &cep->listenq, head: (struct list_head *)id->provider_data);
1862 cep->state = SIW_EPSTATE_LISTENING;
1863
1864 siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
1865
1866 return 0;
1867
1868error:
1869 siw_dbg(id->device, "failed: %d\n", rv);
1870
1871 if (cep) {
1872 siw_cep_set_inuse(cep);
1873
1874 siw_free_cm_id(cep);
1875 cep->sock = NULL;
1876 siw_socket_disassoc(s);
1877 cep->state = SIW_EPSTATE_CLOSED;
1878
1879 siw_cep_set_free_and_put(cep);
1880 }
1881 sock_release(sock: s);
1882
1883 return rv;
1884}
1885
1886static void siw_drop_listeners(struct iw_cm_id *id)
1887{
1888 struct list_head *p, *tmp;
1889
1890 /*
1891 * In case of a wildcard rdma_listen on a multi-homed device,
1892 * a listener's IWCM id is associated with more than one listening CEP.
1893 */
1894 list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
1895 struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
1896
1897 list_del(entry: p);
1898
1899 siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
1900
1901 siw_cep_set_inuse(cep);
1902
1903 siw_free_cm_id(cep);
1904 if (cep->sock) {
1905 siw_socket_disassoc(s: cep->sock);
1906 sock_release(sock: cep->sock);
1907 cep->sock = NULL;
1908 }
1909 cep->state = SIW_EPSTATE_CLOSED;
1910 siw_cep_set_free_and_put(cep);
1911 }
1912}
1913
1914int siw_destroy_listen(struct iw_cm_id *id)
1915{
1916 if (!id->provider_data) {
1917 siw_dbg(id->device, "no cep(s)\n");
1918 return 0;
1919 }
1920 siw_drop_listeners(id);
1921 kfree(objp: id->provider_data);
1922 id->provider_data = NULL;
1923
1924 return 0;
1925}
1926
1927int siw_cm_init(void)
1928{
1929 /*
1930 * create_single_workqueue for strict ordering
1931 */
1932 siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
1933 if (!siw_cm_wq)
1934 return -ENOMEM;
1935
1936 return 0;
1937}
1938
1939void siw_cm_exit(void)
1940{
1941 if (siw_cm_wq)
1942 destroy_workqueue(wq: siw_cm_wq);
1943}
1944

/* Source: linux/drivers/infiniband/sw/siw/siw_cm.c */