1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Shared Memory Communications over RDMA (SMC-R) and RoCE |
4 | * |
5 | * Manage RMBE |
6 | * copy new RMBE data into user space |
7 | * |
8 | * Copyright IBM Corp. 2016 |
9 | * |
10 | * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> |
11 | */ |
12 | |
13 | #include <linux/net.h> |
14 | #include <linux/rcupdate.h> |
15 | #include <linux/sched/signal.h> |
16 | #include <linux/splice.h> |
17 | |
18 | #include <net/sock.h> |
19 | #include <trace/events/sock.h> |
20 | |
21 | #include "smc.h" |
22 | #include "smc_core.h" |
23 | #include "smc_cdc.h" |
24 | #include "smc_tx.h" /* smc_tx_consumer_update() */ |
25 | #include "smc_rx.h" |
26 | #include "smc_stats.h" |
27 | #include "smc_tracepoint.h" |
28 | |
29 | /* callback implementation to wakeup consumers blocked with smc_rx_wait(). |
30 | * indirectly called by smc_cdc_msg_recv_action(). |
31 | */ |
32 | static void smc_rx_wake_up(struct sock *sk) |
33 | { |
34 | struct socket_wq *wq; |
35 | |
36 | trace_sk_data_ready(sk); |
37 | |
38 | /* derived from sock_def_readable() */ |
39 | /* called already in smc_listen_work() */ |
40 | rcu_read_lock(); |
41 | wq = rcu_dereference(sk->sk_wq); |
42 | if (skwq_has_sleeper(wq)) |
43 | wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | |
44 | EPOLLRDNORM | EPOLLRDBAND); |
45 | sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_IN); |
46 | if ((sk->sk_shutdown == SHUTDOWN_MASK) || |
47 | (sk->sk_state == SMC_CLOSED)) |
48 | sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_HUP); |
49 | rcu_read_unlock(); |
50 | } |
51 | |
52 | /* Update consumer cursor |
53 | * @conn connection to update |
54 | * @cons consumer cursor |
55 | * @len number of Bytes consumed |
56 | * Returns: |
57 | * 1 if we should end our receive, 0 otherwise |
58 | */ |
59 | static int smc_rx_update_consumer(struct smc_sock *smc, |
60 | union smc_host_cursor cons, size_t len) |
61 | { |
62 | struct smc_connection *conn = &smc->conn; |
63 | struct sock *sk = &smc->sk; |
64 | bool force = false; |
65 | int diff, rc = 0; |
66 | |
67 | smc_curs_add(size: conn->rmb_desc->len, curs: &cons, value: len); |
68 | |
69 | /* did we process urgent data? */ |
70 | if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { |
71 | diff = smc_curs_comp(size: conn->rmb_desc->len, old: &cons, |
72 | new: &conn->urg_curs); |
73 | if (sock_flag(sk, flag: SOCK_URGINLINE)) { |
74 | if (diff == 0) { |
75 | force = true; |
76 | rc = 1; |
77 | conn->urg_state = SMC_URG_READ; |
78 | } |
79 | } else { |
80 | if (diff == 1) { |
81 | /* skip urgent byte */ |
82 | force = true; |
83 | smc_curs_add(size: conn->rmb_desc->len, curs: &cons, value: 1); |
84 | conn->urg_rx_skip_pend = false; |
85 | } else if (diff < -1) |
86 | /* we read past urgent byte */ |
87 | conn->urg_state = SMC_URG_READ; |
88 | } |
89 | } |
90 | |
91 | smc_curs_copy(tgt: &conn->local_tx_ctrl.cons, src: &cons, conn); |
92 | |
93 | /* send consumer cursor update if required */ |
94 | /* similar to advertising new TCP rcv_wnd if required */ |
95 | smc_tx_consumer_update(conn, force); |
96 | |
97 | return rc; |
98 | } |
99 | |
100 | static void smc_rx_update_cons(struct smc_sock *smc, size_t len) |
101 | { |
102 | struct smc_connection *conn = &smc->conn; |
103 | union smc_host_cursor cons; |
104 | |
105 | smc_curs_copy(tgt: &cons, src: &conn->local_tx_ctrl.cons, conn); |
106 | smc_rx_update_consumer(smc, cons, len); |
107 | } |
108 | |
109 | struct smc_spd_priv { |
110 | struct smc_sock *smc; |
111 | size_t len; |
112 | }; |
113 | |
114 | static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, |
115 | struct pipe_buffer *buf) |
116 | { |
117 | struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; |
118 | struct smc_sock *smc = priv->smc; |
119 | struct smc_connection *conn; |
120 | struct sock *sk = &smc->sk; |
121 | |
122 | if (sk->sk_state == SMC_CLOSED || |
123 | sk->sk_state == SMC_PEERFINCLOSEWAIT || |
124 | sk->sk_state == SMC_APPFINCLOSEWAIT) |
125 | goto out; |
126 | conn = &smc->conn; |
127 | lock_sock(sk); |
128 | smc_rx_update_cons(smc, len: priv->len); |
129 | release_sock(sk); |
130 | if (atomic_sub_and_test(i: priv->len, v: &conn->splice_pending)) |
131 | smc_rx_wake_up(sk); |
132 | out: |
133 | kfree(objp: priv); |
134 | put_page(page: buf->page); |
135 | sock_put(sk); |
136 | } |
137 | |
138 | static const struct pipe_buf_operations smc_pipe_ops = { |
139 | .release = smc_rx_pipe_buf_release, |
140 | .get = generic_pipe_buf_get |
141 | }; |
142 | |
143 | static void smc_rx_spd_release(struct splice_pipe_desc *spd, |
144 | unsigned int i) |
145 | { |
146 | put_page(page: spd->pages[i]); |
147 | } |
148 | |
149 | static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, |
150 | struct smc_sock *smc) |
151 | { |
152 | struct smc_link_group *lgr = smc->conn.lgr; |
153 | int offset = offset_in_page(src); |
154 | struct partial_page *partial; |
155 | struct splice_pipe_desc spd; |
156 | struct smc_spd_priv **priv; |
157 | struct page **pages; |
158 | int bytes, nr_pages; |
159 | int i; |
160 | |
161 | nr_pages = !lgr->is_smcd && smc->conn.rmb_desc->is_vm ? |
162 | PAGE_ALIGN(len + offset) / PAGE_SIZE : 1; |
163 | |
164 | pages = kcalloc(n: nr_pages, size: sizeof(*pages), GFP_KERNEL); |
165 | if (!pages) |
166 | goto out; |
167 | partial = kcalloc(n: nr_pages, size: sizeof(*partial), GFP_KERNEL); |
168 | if (!partial) |
169 | goto out_page; |
170 | priv = kcalloc(n: nr_pages, size: sizeof(*priv), GFP_KERNEL); |
171 | if (!priv) |
172 | goto out_part; |
173 | for (i = 0; i < nr_pages; i++) { |
174 | priv[i] = kzalloc(size: sizeof(**priv), GFP_KERNEL); |
175 | if (!priv[i]) |
176 | goto out_priv; |
177 | } |
178 | |
179 | if (lgr->is_smcd || |
180 | (!lgr->is_smcd && !smc->conn.rmb_desc->is_vm)) { |
181 | /* smcd or smcr that uses physically contiguous RMBs */ |
182 | priv[0]->len = len; |
183 | priv[0]->smc = smc; |
184 | partial[0].offset = src - (char *)smc->conn.rmb_desc->cpu_addr; |
185 | partial[0].len = len; |
186 | partial[0].private = (unsigned long)priv[0]; |
187 | pages[0] = smc->conn.rmb_desc->pages; |
188 | } else { |
189 | int size, left = len; |
190 | void *buf = src; |
191 | /* smcr that uses virtually contiguous RMBs*/ |
192 | for (i = 0; i < nr_pages; i++) { |
193 | size = min_t(int, PAGE_SIZE - offset, left); |
194 | priv[i]->len = size; |
195 | priv[i]->smc = smc; |
196 | pages[i] = vmalloc_to_page(addr: buf); |
197 | partial[i].offset = offset; |
198 | partial[i].len = size; |
199 | partial[i].private = (unsigned long)priv[i]; |
200 | buf += size / sizeof(*buf); |
201 | left -= size; |
202 | offset = 0; |
203 | } |
204 | } |
205 | spd.nr_pages_max = nr_pages; |
206 | spd.nr_pages = nr_pages; |
207 | spd.pages = pages; |
208 | spd.partial = partial; |
209 | spd.ops = &smc_pipe_ops; |
210 | spd.spd_release = smc_rx_spd_release; |
211 | |
212 | bytes = splice_to_pipe(pipe, &spd); |
213 | if (bytes > 0) { |
214 | sock_hold(sk: &smc->sk); |
215 | if (!lgr->is_smcd && smc->conn.rmb_desc->is_vm) { |
216 | for (i = 0; i < PAGE_ALIGN(bytes + offset) / PAGE_SIZE; i++) |
217 | get_page(page: pages[i]); |
218 | } else { |
219 | get_page(page: smc->conn.rmb_desc->pages); |
220 | } |
221 | atomic_add(i: bytes, v: &smc->conn.splice_pending); |
222 | } |
223 | kfree(objp: priv); |
224 | kfree(objp: partial); |
225 | kfree(objp: pages); |
226 | |
227 | return bytes; |
228 | |
229 | out_priv: |
230 | for (i = (i - 1); i >= 0; i--) |
231 | kfree(objp: priv[i]); |
232 | kfree(objp: priv); |
233 | out_part: |
234 | kfree(objp: partial); |
235 | out_page: |
236 | kfree(objp: pages); |
237 | out: |
238 | return -ENOMEM; |
239 | } |
240 | |
241 | static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) |
242 | { |
243 | return atomic_read(v: &conn->bytes_to_rcv) && |
244 | !atomic_read(v: &conn->splice_pending); |
245 | } |
246 | |
247 | /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted |
248 | * @smc smc socket |
249 | * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout |
250 | * @fcrit add'l criterion to evaluate as function pointer |
251 | * Returns: |
252 | * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. |
253 | * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). |
254 | */ |
255 | int smc_rx_wait(struct smc_sock *smc, long *timeo, |
256 | int (*fcrit)(struct smc_connection *conn)) |
257 | { |
258 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
259 | struct smc_connection *conn = &smc->conn; |
260 | struct smc_cdc_conn_state_flags *cflags = |
261 | &conn->local_tx_ctrl.conn_state_flags; |
262 | struct sock *sk = &smc->sk; |
263 | int rc; |
264 | |
265 | if (fcrit(conn)) |
266 | return 1; |
267 | sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); |
268 | add_wait_queue(wq_head: sk_sleep(sk), wq_entry: &wait); |
269 | rc = sk_wait_event(sk, timeo, |
270 | READ_ONCE(sk->sk_err) || |
271 | cflags->peer_conn_abort || |
272 | READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || |
273 | conn->killed || |
274 | fcrit(conn), |
275 | &wait); |
276 | remove_wait_queue(wq_head: sk_sleep(sk), wq_entry: &wait); |
277 | sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); |
278 | return rc; |
279 | } |
280 | |
281 | static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, |
282 | int flags) |
283 | { |
284 | struct smc_connection *conn = &smc->conn; |
285 | union smc_host_cursor cons; |
286 | struct sock *sk = &smc->sk; |
287 | int rc = 0; |
288 | |
289 | if (sock_flag(sk, flag: SOCK_URGINLINE) || |
290 | !(conn->urg_state == SMC_URG_VALID) || |
291 | conn->urg_state == SMC_URG_READ) |
292 | return -EINVAL; |
293 | |
294 | SMC_STAT_INC(smc, urg_data_cnt); |
295 | if (conn->urg_state == SMC_URG_VALID) { |
296 | if (!(flags & MSG_PEEK)) |
297 | smc->conn.urg_state = SMC_URG_READ; |
298 | msg->msg_flags |= MSG_OOB; |
299 | if (len > 0) { |
300 | if (!(flags & MSG_TRUNC)) |
301 | rc = memcpy_to_msg(msg, data: &conn->urg_rx_byte, len: 1); |
302 | len = 1; |
303 | smc_curs_copy(tgt: &cons, src: &conn->local_tx_ctrl.cons, conn); |
304 | if (smc_curs_diff(size: conn->rmb_desc->len, old: &cons, |
305 | new: &conn->urg_curs) > 1) |
306 | conn->urg_rx_skip_pend = true; |
307 | /* Urgent Byte was already accounted for, but trigger |
308 | * skipping the urgent byte in non-inline case |
309 | */ |
310 | if (!(flags & MSG_PEEK)) |
311 | smc_rx_update_consumer(smc, cons, len: 0); |
312 | } else { |
313 | msg->msg_flags |= MSG_TRUNC; |
314 | } |
315 | |
316 | return rc ? -EFAULT : len; |
317 | } |
318 | |
319 | if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN) |
320 | return 0; |
321 | |
322 | return -EAGAIN; |
323 | } |
324 | |
325 | static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) |
326 | { |
327 | struct smc_connection *conn = &smc->conn; |
328 | |
329 | if (smc_rx_data_available(conn)) |
330 | return true; |
331 | else if (conn->urg_state == SMC_URG_VALID) |
332 | /* we received a single urgent Byte - skip */ |
333 | smc_rx_update_cons(smc, len: 0); |
334 | return false; |
335 | } |
336 | |
337 | /* smc_rx_recvmsg - receive data from RMBE |
338 | * @msg: copy data to receive buffer |
339 | * @pipe: copy data to pipe if set - indicates splice() call |
340 | * |
341 | * rcvbuf consumer: main API called by socket layer. |
342 | * Called under sk lock. |
343 | */ |
344 | int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, |
345 | struct pipe_inode_info *pipe, size_t len, int flags) |
346 | { |
347 | size_t copylen, read_done = 0, read_remaining = len; |
348 | size_t chunk_len, chunk_off, chunk_len_sum; |
349 | struct smc_connection *conn = &smc->conn; |
350 | int (*func)(struct smc_connection *conn); |
351 | union smc_host_cursor cons; |
352 | int readable, chunk; |
353 | char *rcvbuf_base; |
354 | struct sock *sk; |
355 | int splbytes; |
356 | long timeo; |
357 | int target; /* Read at least these many bytes */ |
358 | int rc; |
359 | |
360 | if (unlikely(flags & MSG_ERRQUEUE)) |
361 | return -EINVAL; /* future work for sk.sk_family == AF_SMC */ |
362 | |
363 | sk = &smc->sk; |
364 | if (sk->sk_state == SMC_LISTEN) |
365 | return -ENOTCONN; |
366 | if (flags & MSG_OOB) |
367 | return smc_rx_recv_urg(smc, msg, len, flags); |
368 | timeo = sock_rcvtimeo(sk, noblock: flags & MSG_DONTWAIT); |
369 | target = sock_rcvlowat(sk, waitall: flags & MSG_WAITALL, len); |
370 | |
371 | readable = atomic_read(v: &conn->bytes_to_rcv); |
372 | if (readable >= conn->rmb_desc->len) |
373 | SMC_STAT_RMB_RX_FULL(smc, !conn->lnk); |
374 | |
375 | if (len < readable) |
376 | SMC_STAT_RMB_RX_SIZE_SMALL(smc, !conn->lnk); |
377 | /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ |
378 | rcvbuf_base = conn->rx_off + conn->rmb_desc->cpu_addr; |
379 | |
380 | do { /* while (read_remaining) */ |
381 | if (read_done >= target || (pipe && read_done)) |
382 | break; |
383 | |
384 | if (conn->killed) |
385 | break; |
386 | |
387 | if (smc_rx_recvmsg_data_available(smc)) |
388 | goto copy; |
389 | |
390 | if (sk->sk_shutdown & RCV_SHUTDOWN) { |
391 | /* smc_cdc_msg_recv_action() could have run after |
392 | * above smc_rx_recvmsg_data_available() |
393 | */ |
394 | if (smc_rx_recvmsg_data_available(smc)) |
395 | goto copy; |
396 | break; |
397 | } |
398 | |
399 | if (read_done) { |
400 | if (sk->sk_err || |
401 | sk->sk_state == SMC_CLOSED || |
402 | !timeo || |
403 | signal_pending(current)) |
404 | break; |
405 | } else { |
406 | if (sk->sk_err) { |
407 | read_done = sock_error(sk); |
408 | break; |
409 | } |
410 | if (sk->sk_state == SMC_CLOSED) { |
411 | if (!sock_flag(sk, flag: SOCK_DONE)) { |
412 | /* This occurs when user tries to read |
413 | * from never connected socket. |
414 | */ |
415 | read_done = -ENOTCONN; |
416 | break; |
417 | } |
418 | break; |
419 | } |
420 | if (!timeo) |
421 | return -EAGAIN; |
422 | if (signal_pending(current)) { |
423 | read_done = sock_intr_errno(timeo); |
424 | break; |
425 | } |
426 | } |
427 | |
428 | if (!smc_rx_data_available(conn)) { |
429 | smc_rx_wait(smc, timeo: &timeo, fcrit: smc_rx_data_available); |
430 | continue; |
431 | } |
432 | |
433 | copy: |
434 | /* initialize variables for 1st iteration of subsequent loop */ |
435 | /* could be just 1 byte, even after waiting on data above */ |
436 | readable = atomic_read(v: &conn->bytes_to_rcv); |
437 | splbytes = atomic_read(v: &conn->splice_pending); |
438 | if (!readable || (msg && splbytes)) { |
439 | if (splbytes) |
440 | func = smc_rx_data_available_and_no_splice_pend; |
441 | else |
442 | func = smc_rx_data_available; |
443 | smc_rx_wait(smc, timeo: &timeo, fcrit: func); |
444 | continue; |
445 | } |
446 | |
447 | smc_curs_copy(tgt: &cons, src: &conn->local_tx_ctrl.cons, conn); |
448 | /* subsequent splice() calls pick up where previous left */ |
449 | if (splbytes) |
450 | smc_curs_add(size: conn->rmb_desc->len, curs: &cons, value: splbytes); |
451 | if (conn->urg_state == SMC_URG_VALID && |
452 | sock_flag(sk: &smc->sk, flag: SOCK_URGINLINE) && |
453 | readable > 1) |
454 | readable--; /* always stop at urgent Byte */ |
455 | /* not more than what user space asked for */ |
456 | copylen = min_t(size_t, read_remaining, readable); |
457 | /* determine chunks where to read from rcvbuf */ |
458 | /* either unwrapped case, or 1st chunk of wrapped case */ |
459 | chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - |
460 | cons.count); |
461 | chunk_len_sum = chunk_len; |
462 | chunk_off = cons.count; |
463 | smc_rmb_sync_sg_for_cpu(conn); |
464 | for (chunk = 0; chunk < 2; chunk++) { |
465 | if (!(flags & MSG_TRUNC)) { |
466 | if (msg) { |
467 | rc = memcpy_to_msg(msg, data: rcvbuf_base + |
468 | chunk_off, |
469 | len: chunk_len); |
470 | } else { |
471 | rc = smc_rx_splice(pipe, src: rcvbuf_base + |
472 | chunk_off, len: chunk_len, |
473 | smc); |
474 | } |
475 | if (rc < 0) { |
476 | if (!read_done) |
477 | read_done = -EFAULT; |
478 | goto out; |
479 | } |
480 | } |
481 | read_remaining -= chunk_len; |
482 | read_done += chunk_len; |
483 | |
484 | if (chunk_len_sum == copylen) |
485 | break; /* either on 1st or 2nd iteration */ |
486 | /* prepare next (== 2nd) iteration */ |
487 | chunk_len = copylen - chunk_len; /* remainder */ |
488 | chunk_len_sum += chunk_len; |
489 | chunk_off = 0; /* modulo offset in recv ring buffer */ |
490 | } |
491 | |
492 | /* update cursors */ |
493 | if (!(flags & MSG_PEEK)) { |
494 | /* increased in recv tasklet smc_cdc_msg_rcv() */ |
495 | smp_mb__before_atomic(); |
496 | atomic_sub(i: copylen, v: &conn->bytes_to_rcv); |
497 | /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ |
498 | smp_mb__after_atomic(); |
499 | if (msg && smc_rx_update_consumer(smc, cons, len: copylen)) |
500 | goto out; |
501 | } |
502 | |
503 | trace_smc_rx_recvmsg(smc, len: copylen); |
504 | } while (read_remaining); |
505 | out: |
506 | return read_done; |
507 | } |
508 | |
509 | /* Initialize receive properties on connection establishment. NB: not __init! */ |
510 | void smc_rx_init(struct smc_sock *smc) |
511 | { |
512 | smc->sk.sk_data_ready = smc_rx_wake_up; |
513 | atomic_set(v: &smc->conn.splice_pending, i: 0); |
514 | smc->conn.urg_state = SMC_URG_READ; |
515 | } |
516 | |