1 | /* |
2 | * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. |
3 | * |
4 | * This software is available to you under a choice of one of two |
5 | * licenses. You may choose to be licensed under the terms of the GNU |
6 | * General Public License (GPL) Version 2, available from the file |
7 | * COPYING in the main directory of this source tree, or the |
8 | * OpenIB.org BSD license below: |
9 | * |
10 | * Redistribution and use in source and binary forms, with or |
11 | * without modification, are permitted provided that the following |
12 | * conditions are met: |
13 | * |
14 | * - Redistributions of source code must retain the above |
15 | * copyright notice, this list of conditions and the following |
16 | * disclaimer. |
17 | * |
18 | * - Redistributions in binary form must reproduce the above |
19 | * copyright notice, this list of conditions and the following |
20 | * disclaimer in the documentation and/or other materials |
21 | * provided with the distribution. |
22 | * |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
30 | * SOFTWARE. |
31 | * |
32 | */ |
33 | #include <linux/kernel.h> |
34 | #include <linux/gfp.h> |
35 | #include <linux/in.h> |
36 | #include <net/tcp.h> |
37 | #include <trace/events/sock.h> |
38 | |
39 | #include "rds.h" |
40 | #include "tcp.h" |
41 | |
42 | void rds_tcp_keepalive(struct socket *sock) |
43 | { |
44 | /* values below based on xs_udp_default_timeout */ |
45 | int keepidle = 5; /* send a probe 'keepidle' secs after last data */ |
46 | int keepcnt = 5; /* number of unack'ed probes before declaring dead */ |
47 | |
48 | sock_set_keepalive(sk: sock->sk); |
49 | tcp_sock_set_keepcnt(sk: sock->sk, val: keepcnt); |
50 | tcp_sock_set_keepidle(sk: sock->sk, val: keepidle); |
51 | /* KEEPINTVL is the interval between successive probes. We follow |
52 | * the model in xs_tcp_finish_connecting() and re-use keepidle. |
53 | */ |
54 | tcp_sock_set_keepintvl(sk: sock->sk, val: keepidle); |
55 | } |
56 | |
57 | /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the |
58 | * client's ipaddr < server's ipaddr. Otherwise, close the accepted |
59 | * socket and force a reconneect from smaller -> larger ip addr. The reason |
60 | * we special case cp_index 0 is to allow the rds probe ping itself to itself |
61 | * get through efficiently. |
62 | * Since reconnects are only initiated from the node with the numerically |
63 | * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side |
64 | * by moving them to CONNECTING in this function. |
65 | */ |
66 | static |
67 | struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) |
68 | { |
69 | int i; |
70 | int npaths = max_t(int, 1, conn->c_npaths); |
71 | |
72 | /* for mprds, all paths MUST be initiated by the peer |
73 | * with the smaller address. |
74 | */ |
75 | if (rds_addr_cmp(a1: &conn->c_faddr, a2: &conn->c_laddr) >= 0) { |
76 | /* Make sure we initiate at least one path if this |
77 | * has not already been done; rds_start_mprds() will |
78 | * take care of additional paths, if necessary. |
79 | */ |
80 | if (npaths == 1) |
81 | rds_conn_path_connect_if_down(cp: &conn->c_path[0]); |
82 | return NULL; |
83 | } |
84 | |
85 | for (i = 0; i < npaths; i++) { |
86 | struct rds_conn_path *cp = &conn->c_path[i]; |
87 | |
88 | if (rds_conn_path_transition(cp, old: RDS_CONN_DOWN, |
89 | new: RDS_CONN_CONNECTING) || |
90 | rds_conn_path_transition(cp, old: RDS_CONN_ERROR, |
91 | new: RDS_CONN_CONNECTING)) { |
92 | return cp->cp_transport_data; |
93 | } |
94 | } |
95 | return NULL; |
96 | } |
97 | |
98 | int rds_tcp_accept_one(struct socket *sock) |
99 | { |
100 | struct socket *new_sock = NULL; |
101 | struct rds_connection *conn; |
102 | int ret; |
103 | struct inet_sock *inet; |
104 | struct rds_tcp_connection *rs_tcp = NULL; |
105 | int conn_state; |
106 | struct rds_conn_path *cp; |
107 | struct in6_addr *my_addr, *peer_addr; |
108 | #if !IS_ENABLED(CONFIG_IPV6) |
109 | struct in6_addr saddr, daddr; |
110 | #endif |
111 | int dev_if = 0; |
112 | |
113 | if (!sock) /* module unload or netns delete in progress */ |
114 | return -ENETUNREACH; |
115 | |
116 | ret = sock_create_lite(family: sock->sk->sk_family, |
117 | type: sock->sk->sk_type, proto: sock->sk->sk_protocol, |
118 | res: &new_sock); |
119 | if (ret) |
120 | goto out; |
121 | |
122 | ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true); |
123 | if (ret < 0) |
124 | goto out; |
125 | |
126 | /* sock_create_lite() does not get a hold on the owner module so we |
127 | * need to do it here. Note that sock_release() uses sock->ops to |
128 | * determine if it needs to decrement the reference count. So set |
129 | * sock->ops after calling accept() in case that fails. And there's |
130 | * no need to do try_module_get() as the listener should have a hold |
131 | * already. |
132 | */ |
133 | new_sock->ops = sock->ops; |
134 | __module_get(module: new_sock->ops->owner); |
135 | |
136 | rds_tcp_keepalive(sock: new_sock); |
137 | if (!rds_tcp_tune(sock: new_sock)) { |
138 | ret = -EINVAL; |
139 | goto out; |
140 | } |
141 | |
142 | inet = inet_sk(new_sock->sk); |
143 | |
144 | #if IS_ENABLED(CONFIG_IPV6) |
145 | my_addr = &new_sock->sk->sk_v6_rcv_saddr; |
146 | peer_addr = &new_sock->sk->sk_v6_daddr; |
147 | #else |
148 | ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr); |
149 | ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr); |
150 | my_addr = &saddr; |
151 | peer_addr = &daddr; |
152 | #endif |
153 | rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n" , |
154 | sock->sk->sk_family, |
155 | my_addr, ntohs(inet->inet_sport), |
156 | peer_addr, ntohs(inet->inet_dport)); |
157 | |
158 | #if IS_ENABLED(CONFIG_IPV6) |
159 | /* sk_bound_dev_if is not set if the peer address is not link local |
160 | * address. In this case, it happens that mcast_oif is set. So |
161 | * just use it. |
162 | */ |
163 | if ((ipv6_addr_type(addr: my_addr) & IPV6_ADDR_LINKLOCAL) && |
164 | !(ipv6_addr_type(addr: peer_addr) & IPV6_ADDR_LINKLOCAL)) { |
165 | struct ipv6_pinfo *inet6; |
166 | |
167 | inet6 = inet6_sk(sk: new_sock->sk); |
168 | dev_if = inet6->mcast_oif; |
169 | } else { |
170 | dev_if = new_sock->sk->sk_bound_dev_if; |
171 | } |
172 | #endif |
173 | |
174 | if (!rds_tcp_laddr_check(net: sock_net(sk: sock->sk), addr: peer_addr, scope_id: dev_if)) { |
175 | /* local address connection is only allowed via loopback */ |
176 | ret = -EOPNOTSUPP; |
177 | goto out; |
178 | } |
179 | |
180 | conn = rds_conn_create(net: sock_net(sk: sock->sk), |
181 | laddr: my_addr, faddr: peer_addr, |
182 | trans: &rds_tcp_transport, tos: 0, GFP_KERNEL, dev_if); |
183 | |
184 | if (IS_ERR(ptr: conn)) { |
185 | ret = PTR_ERR(ptr: conn); |
186 | goto out; |
187 | } |
188 | /* An incoming SYN request came in, and TCP just accepted it. |
189 | * |
190 | * If the client reboots, this conn will need to be cleaned up. |
191 | * rds_tcp_state_change() will do that cleanup |
192 | */ |
193 | rs_tcp = rds_tcp_accept_one_path(conn); |
194 | if (!rs_tcp) |
195 | goto rst_nsk; |
196 | mutex_lock(&rs_tcp->t_conn_path_lock); |
197 | cp = rs_tcp->t_cpath; |
198 | conn_state = rds_conn_path_state(cp); |
199 | WARN_ON(conn_state == RDS_CONN_UP); |
200 | if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) |
201 | goto rst_nsk; |
202 | if (rs_tcp->t_sock) { |
203 | /* Duelling SYN has been handled in rds_tcp_accept_one() */ |
204 | rds_tcp_reset_callbacks(sock: new_sock, cp); |
205 | /* rds_connect_path_complete() marks RDS_CONN_UP */ |
206 | rds_connect_path_complete(conn: cp, curr: RDS_CONN_RESETTING); |
207 | } else { |
208 | rds_tcp_set_callbacks(sock: new_sock, cp); |
209 | rds_connect_path_complete(conn: cp, curr: RDS_CONN_CONNECTING); |
210 | } |
211 | new_sock = NULL; |
212 | ret = 0; |
213 | if (conn->c_npaths == 0) |
214 | rds_send_ping(conn: cp->cp_conn, cp_index: cp->cp_index); |
215 | goto out; |
216 | rst_nsk: |
217 | /* reset the newly returned accept sock and bail. |
218 | * It is safe to set linger on new_sock because the RDS connection |
219 | * has not been brought up on new_sock, so no RDS-level data could |
220 | * be pending on it. By setting linger, we achieve the side-effect |
221 | * of avoiding TIME_WAIT state on new_sock. |
222 | */ |
223 | sock_no_linger(sk: new_sock->sk); |
224 | kernel_sock_shutdown(sock: new_sock, how: SHUT_RDWR); |
225 | ret = 0; |
226 | out: |
227 | if (rs_tcp) |
228 | mutex_unlock(lock: &rs_tcp->t_conn_path_lock); |
229 | if (new_sock) |
230 | sock_release(sock: new_sock); |
231 | return ret; |
232 | } |
233 | |
234 | void rds_tcp_listen_data_ready(struct sock *sk) |
235 | { |
236 | void (*ready)(struct sock *sk); |
237 | |
238 | trace_sk_data_ready(sk); |
239 | rdsdebug("listen data ready sk %p\n" , sk); |
240 | |
241 | read_lock_bh(&sk->sk_callback_lock); |
242 | ready = sk->sk_user_data; |
243 | if (!ready) { /* check for teardown race */ |
244 | ready = sk->sk_data_ready; |
245 | goto out; |
246 | } |
247 | |
248 | /* |
249 | * ->sk_data_ready is also called for a newly established child socket |
250 | * before it has been accepted and the accepter has set up their |
251 | * data_ready.. we only want to queue listen work for our listening |
252 | * socket |
253 | * |
254 | * (*ready)() may be null if we are racing with netns delete, and |
255 | * the listen socket is being torn down. |
256 | */ |
257 | if (sk->sk_state == TCP_LISTEN) |
258 | rds_tcp_accept_work(sk); |
259 | else |
260 | ready = rds_tcp_listen_sock_def_readable(net: sock_net(sk)); |
261 | |
262 | out: |
263 | read_unlock_bh(&sk->sk_callback_lock); |
264 | if (ready) |
265 | ready(sk); |
266 | } |
267 | |
268 | struct socket *rds_tcp_listen_init(struct net *net, bool isv6) |
269 | { |
270 | struct socket *sock = NULL; |
271 | struct sockaddr_storage ss; |
272 | struct sockaddr_in6 *sin6; |
273 | struct sockaddr_in *sin; |
274 | int addr_len; |
275 | int ret; |
276 | |
277 | ret = sock_create_kern(net, family: isv6 ? PF_INET6 : PF_INET, type: SOCK_STREAM, |
278 | IPPROTO_TCP, res: &sock); |
279 | if (ret < 0) { |
280 | rdsdebug("could not create %s listener socket: %d\n" , |
281 | isv6 ? "IPv6" : "IPv4" , ret); |
282 | goto out; |
283 | } |
284 | |
285 | sock->sk->sk_reuse = SK_CAN_REUSE; |
286 | tcp_sock_set_nodelay(sk: sock->sk); |
287 | |
288 | write_lock_bh(&sock->sk->sk_callback_lock); |
289 | sock->sk->sk_user_data = sock->sk->sk_data_ready; |
290 | sock->sk->sk_data_ready = rds_tcp_listen_data_ready; |
291 | write_unlock_bh(&sock->sk->sk_callback_lock); |
292 | |
293 | if (isv6) { |
294 | sin6 = (struct sockaddr_in6 *)&ss; |
295 | sin6->sin6_family = PF_INET6; |
296 | sin6->sin6_addr = in6addr_any; |
297 | sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); |
298 | sin6->sin6_scope_id = 0; |
299 | sin6->sin6_flowinfo = 0; |
300 | addr_len = sizeof(*sin6); |
301 | } else { |
302 | sin = (struct sockaddr_in *)&ss; |
303 | sin->sin_family = PF_INET; |
304 | sin->sin_addr.s_addr = INADDR_ANY; |
305 | sin->sin_port = (__force u16)htons(RDS_TCP_PORT); |
306 | addr_len = sizeof(*sin); |
307 | } |
308 | |
309 | ret = kernel_bind(sock, addr: (struct sockaddr *)&ss, addrlen: addr_len); |
310 | if (ret < 0) { |
311 | rdsdebug("could not bind %s listener socket: %d\n" , |
312 | isv6 ? "IPv6" : "IPv4" , ret); |
313 | goto out; |
314 | } |
315 | |
316 | ret = sock->ops->listen(sock, 64); |
317 | if (ret < 0) |
318 | goto out; |
319 | |
320 | return sock; |
321 | out: |
322 | if (sock) |
323 | sock_release(sock); |
324 | return NULL; |
325 | } |
326 | |
327 | void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor) |
328 | { |
329 | struct sock *sk; |
330 | |
331 | if (!sock) |
332 | return; |
333 | |
334 | sk = sock->sk; |
335 | |
336 | /* serialize with and prevent further callbacks */ |
337 | lock_sock(sk); |
338 | write_lock_bh(&sk->sk_callback_lock); |
339 | if (sk->sk_user_data) { |
340 | sk->sk_data_ready = sk->sk_user_data; |
341 | sk->sk_user_data = NULL; |
342 | } |
343 | write_unlock_bh(&sk->sk_callback_lock); |
344 | release_sock(sk); |
345 | |
346 | /* wait for accepts to stop and close the socket */ |
347 | flush_workqueue(rds_wq); |
348 | flush_work(work: acceptor); |
349 | sock_release(sock); |
350 | } |
351 | |