tcp_listen.c source code [linux/net/rds/tcp_listen.c]

1	/*
2	* Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
3	*
4	* This software is available to you under a choice of one of two
5	* licenses. You may choose to be licensed under the terms of the GNU
6	* General Public License (GPL) Version 2, available from the file
7	* COPYING in the main directory of this source tree, or the
8	* OpenIB.org BSD license below:
9	*
10	* Redistribution and use in source and binary forms, with or
11	* without modification, are permitted provided that the following
12	* conditions are met:
13	*
14	* - Redistributions of source code must retain the above
15	* copyright notice, this list of conditions and the following
16	* disclaimer.
17	*
18	* - Redistributions in binary form must reproduce the above
19	* copyright notice, this list of conditions and the following
20	* disclaimer in the documentation and/or other materials
21	* provided with the distribution.
22	*
23	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26	* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27	* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28	* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30	* SOFTWARE.
31	*
32	*/
33	#include <linux/kernel.h>
34	#include <linux/gfp.h>
35	#include <linux/in.h>
36	#include <net/tcp.h>
37	#include <trace/events/sock.h>
38
39	#include "rds.h"
40	#include "tcp.h"
41
42	void rds_tcp_keepalive(struct socket *sock)
43	{
44	/ values below based on xs_udp_default_timeout /
45	int keepidle = `5`; / send a probe 'keepidle' secs after last data /
46	int keepcnt = `5`; / number of unack'ed probes before declaring dead /
47
48	sock_set_keepalive(sk: sock->sk);
49	tcp_sock_set_keepcnt(sk: sock->sk, val: keepcnt);
50	tcp_sock_set_keepidle(sk: sock->sk, val: keepidle);
51	/ KEEPINTVL is the interval between successive probes. We follow*
52	* the model in xs_tcp_finish_connecting() and re-use keepidle.
53	*/
54	tcp_sock_set_keepintvl(sk: sock->sk, val: keepidle);
55	}
56
57	/ rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the*
58	* client's ipaddr < server's ipaddr. Otherwise, close the accepted
59	* socket and force a reconneect from smaller -> larger ip addr. The reason
60	* we special case cp_index 0 is to allow the rds probe ping itself to itself
61	* get through efficiently.
62	* Since reconnects are only initiated from the node with the numerically
63	* smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
64	* by moving them to CONNECTING in this function.
65	*/
66	static
67	struct rds_tcp_connection rds_tcp_accept_one_path(struct* rds_connection *conn)
68	{
69	int i;
70	int npaths = max_t(int, `1`, conn->c_npaths);
71
72	/ for mprds, all paths MUST be initiated by the peer*
73	* with the smaller address.
74	*/
75	if (rds_addr_cmp(a1: &conn->c_faddr, a2: &conn->c_laddr) >= `0`) {
76	/ Make sure we initiate at least one path if this*
77	* has not already been done; rds_start_mprds() will
78	* take care of additional paths, if necessary.
79	*/
80	if (npaths == `1`)
81	rds_conn_path_connect_if_down(cp: &conn->c_path[`0`]);
82	return NULL;
83	}
84
85	for (i = `0`; i < npaths; i++) {
86	struct rds_conn_path *cp = &conn->c_path[i];
87
88	if (rds_conn_path_transition(cp, old: RDS_CONN_DOWN,
89	new: RDS_CONN_CONNECTING) \|\|
90	rds_conn_path_transition(cp, old: RDS_CONN_ERROR,
91	new: RDS_CONN_CONNECTING)) {
92	return cp->cp_transport_data;
93	}
94	}
95	return NULL;
96	}
97
98	int rds_tcp_accept_one(struct socket *sock)
99	{
100	struct socket *new_sock = NULL;
101	struct rds_connection *conn;
102	int ret;
103	struct inet_sock *inet;
104	struct rds_tcp_connection *rs_tcp = NULL;
105	int conn_state;
106	struct rds_conn_path *cp;
107	struct in6_addr my_addr, peer_addr;
108	#if !IS_ENABLED(CONFIG_IPV6)
109	struct in6_addr saddr, daddr;
110	#endif
111	int dev_if = `0`;
112
113	if (!sock) / module unload or netns delete in progress /
114	return -ENETUNREACH;
115
116	ret = sock_create_lite(family: sock->sk->sk_family,
117	type: sock->sk->sk_type, proto: sock->sk->sk_protocol,
118	res: &new_sock);
119	if (ret)
120	goto out;
121
122	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
123	if (ret < `0`)
124	goto out;
125
126	/ sock_create_lite() does not get a hold on the owner module so we*
127	* need to do it here. Note that sock_release() uses sock->ops to
128	* determine if it needs to decrement the reference count. So set
129	* sock->ops after calling accept() in case that fails. And there's
130	* no need to do try_module_get() as the listener should have a hold
131	* already.
132	*/
133	new_sock->ops = sock->ops;
134	__module_get(module: new_sock->ops->owner);
135
136	rds_tcp_keepalive(sock: new_sock);
137	if (!rds_tcp_tune(sock: new_sock)) {
138	ret = -EINVAL;
139	goto out;
140	}
141
142	inet = inet_sk(new_sock->sk);
143
144	#if IS_ENABLED(CONFIG_IPV6)
145	my_addr = &new_sock->sk->sk_v6_rcv_saddr;
146	peer_addr = &new_sock->sk->sk_v6_daddr;
147	#else
148	ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
149	ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
150	my_addr = &saddr;
151	peer_addr = &daddr;
152	#endif
153	rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
154	sock->sk->sk_family,
155	my_addr, ntohs(inet->inet_sport),
156	peer_addr, ntohs(inet->inet_dport));
157
158	#if IS_ENABLED(CONFIG_IPV6)
159	/ sk_bound_dev_if is not set if the peer address is not link local*
160	* address. In this case, it happens that mcast_oif is set. So
161	* just use it.
162	*/
163	if ((ipv6_addr_type(addr: my_addr) & IPV6_ADDR_LINKLOCAL) &&
164	!(ipv6_addr_type(addr: peer_addr) & IPV6_ADDR_LINKLOCAL)) {
165	struct ipv6_pinfo *inet6;
166
167	inet6 = inet6_sk(sk: new_sock->sk);
168	dev_if = inet6->mcast_oif;
169	} else {
170	dev_if = new_sock->sk->sk_bound_dev_if;
171	}
172	#endif
173
174	if (!rds_tcp_laddr_check(net: sock_net(sk: sock->sk), addr: peer_addr, scope_id: dev_if)) {
175	/ local address connection is only allowed via loopback /
176	ret = -EOPNOTSUPP;
177	goto out;
178	}
179
180	conn = rds_conn_create(net: sock_net(sk: sock->sk),
181	laddr: my_addr, faddr: peer_addr,
182	trans: &rds_tcp_transport, tos: `0`, GFP_KERNEL, dev_if);
183
184	if (IS_ERR(ptr: conn)) {
185	ret = PTR_ERR(ptr: conn);
186	goto out;
187	}
188	/ An incoming SYN request came in, and TCP just accepted it.*
189	*
190	* If the client reboots, this conn will need to be cleaned up.
191	* rds_tcp_state_change() will do that cleanup
192	*/
193	rs_tcp = rds_tcp_accept_one_path(conn);
194	if (!rs_tcp)
195	goto rst_nsk;
196	mutex_lock(&rs_tcp->t_conn_path_lock);
197	cp = rs_tcp->t_cpath;
198	conn_state = rds_conn_path_state(cp);
199	WARN_ON(conn_state == RDS_CONN_UP);
200	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
201	goto rst_nsk;
202	if (rs_tcp->t_sock) {
203	/ Duelling SYN has been handled in rds_tcp_accept_one() /
204	rds_tcp_reset_callbacks(sock: new_sock, cp);
205	/ rds_connect_path_complete() marks RDS_CONN_UP /
206	rds_connect_path_complete(conn: cp, curr: RDS_CONN_RESETTING);
207	} else {
208	rds_tcp_set_callbacks(sock: new_sock, cp);
209	rds_connect_path_complete(conn: cp, curr: RDS_CONN_CONNECTING);
210	}
211	new_sock = NULL;
212	ret = `0`;
213	if (conn->c_npaths == `0`)
214	rds_send_ping(conn: cp->cp_conn, cp_index: cp->cp_index);
215	goto out;
216	rst_nsk:
217	/ reset the newly returned accept sock and bail.*
218	* It is safe to set linger on new_sock because the RDS connection
219	* has not been brought up on new_sock, so no RDS-level data could
220	* be pending on it. By setting linger, we achieve the side-effect
221	* of avoiding TIME_WAIT state on new_sock.
222	*/
223	sock_no_linger(sk: new_sock->sk);
224	kernel_sock_shutdown(sock: new_sock, how: SHUT_RDWR);
225	ret = `0`;
226	out:
227	if (rs_tcp)
228	mutex_unlock(lock: &rs_tcp->t_conn_path_lock);
229	if (new_sock)
230	sock_release(sock: new_sock);
231	return ret;
232	}
233
234	void rds_tcp_listen_data_ready(struct sock *sk)
235	{
236	void (ready)(struct* sock *sk);
237
238	trace_sk_data_ready(sk);
239	rdsdebug("listen data ready sk %p\n", sk);
240
241	read_lock_bh(&sk->sk_callback_lock);
242	ready = sk->sk_user_data;
243	if (!ready) { / check for teardown race /
244	ready = sk->sk_data_ready;
245	goto out;
246	}
247
248	/*
249	* ->sk_data_ready is also called for a newly established child socket
250	* before it has been accepted and the accepter has set up their
251	* data_ready.. we only want to queue listen work for our listening
252	* socket
253	*
254	* (*ready)() may be null if we are racing with netns delete, and
255	* the listen socket is being torn down.
256	*/
257	if (sk->sk_state == TCP_LISTEN)
258	rds_tcp_accept_work(sk);
259	else
260	ready = rds_tcp_listen_sock_def_readable(net: sock_net(sk));
261
262	out:
263	read_unlock_bh(&sk->sk_callback_lock);
264	if (ready)
265	ready(sk);
266	}
267
268	struct socket rds_tcp_listen_init(struct* net *net, bool isv6)
269	{
270	struct socket *sock = NULL;
271	struct sockaddr_storage ss;
272	struct sockaddr_in6 *sin6;
273	struct sockaddr_in *sin;
274	int addr_len;
275	int ret;
276
277	ret = sock_create_kern(net, family: isv6 ? PF_INET6 : PF_INET, type: SOCK_STREAM,
278	IPPROTO_TCP, res: &sock);
279	if (ret < `0`) {
280	rdsdebug("could not create %s listener socket: %d\n",
281	isv6 ? "IPv6" : "IPv4", ret);
282	goto out;
283	}
284
285	sock->sk->sk_reuse = SK_CAN_REUSE;
286	tcp_sock_set_nodelay(sk: sock->sk);
287
288	write_lock_bh(&sock->sk->sk_callback_lock);
289	sock->sk->sk_user_data = sock->sk->sk_data_ready;
290	sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
291	write_unlock_bh(&sock->sk->sk_callback_lock);
292
293	if (isv6) {
294	sin6 = (struct sockaddr_in6 *)&ss;
295	sin6->sin6_family = PF_INET6;
296	sin6->sin6_addr = in6addr_any;
297	sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
298	sin6->sin6_scope_id = `0`;
299	sin6->sin6_flowinfo = `0`;
300	addr_len = sizeof(*sin6);
301	} else {
302	sin = (struct sockaddr_in *)&ss;
303	sin->sin_family = PF_INET;
304	sin->sin_addr.s_addr = INADDR_ANY;
305	sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
306	addr_len = sizeof(*sin);
307	}
308
309	ret = kernel_bind(sock, addr: (struct sockaddr *)&ss, addrlen: addr_len);
310	if (ret < `0`) {
311	rdsdebug("could not bind %s listener socket: %d\n",
312	isv6 ? "IPv6" : "IPv4", ret);
313	goto out;
314	}
315
316	ret = sock->ops->listen(sock, `64`);
317	if (ret < `0`)
318	goto out;
319
320	return sock;
321	out:
322	if (sock)
323	sock_release(sock);
324	return NULL;
325	}
326
327	void rds_tcp_listen_stop(struct socket sock, struct* work_struct *acceptor)
328	{
329	struct sock *sk;
330
331	if (!sock)
332	return;
333
334	sk = sock->sk;
335
336	/ serialize with and prevent further callbacks /
337	lock_sock(sk);
338	write_lock_bh(&sk->sk_callback_lock);
339	if (sk->sk_user_data) {
340	sk->sk_data_ready = sk->sk_user_data;
341	sk->sk_user_data = NULL;
342	}
343	write_unlock_bh(&sk->sk_callback_lock);
344	release_sock(sk);
345
346	/ wait for accepts to stop and close the socket /
347	flush_workqueue(rds_wq);
348	flush_work(work: acceptor);
349	sock_release(sock);
350	}
351

source code of linux/net/rds/tcp_listen.c