1 | /* |
2 | * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. |
3 | * |
4 | * This software is available to you under a choice of one of two |
5 | * licenses. You may choose to be licensed under the terms of the GNU |
6 | * General Public License (GPL) Version 2, available from the file |
7 | * COPYING in the main directory of this source tree, or the |
8 | * OpenIB.org BSD license below: |
9 | * |
10 | * Redistribution and use in source and binary forms, with or |
11 | * without modification, are permitted provided that the following |
12 | * conditions are met: |
13 | * |
14 | * - Redistributions of source code must retain the above |
15 | * copyright notice, this list of conditions and the following |
16 | * disclaimer. |
17 | * |
18 | * - Redistributions in binary form must reproduce the above |
19 | * copyright notice, this list of conditions and the following |
20 | * disclaimer in the documentation and/or other materials |
21 | * provided with the distribution. |
22 | * |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
30 | * SOFTWARE. |
31 | * |
32 | */ |
33 | #include <linux/kernel.h> |
34 | #include <linux/list.h> |
35 | #include <linux/slab.h> |
36 | #include <linux/export.h> |
37 | #include <net/ipv6.h> |
38 | #include <net/inet6_hashtables.h> |
39 | #include <net/addrconf.h> |
40 | |
41 | #include "rds.h" |
42 | #include "loop.h" |
43 | |
44 | #define RDS_CONNECTION_HASH_BITS 12 |
45 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) |
46 | #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) |
47 | |
48 | /* converting this to RCU is a chore for another day.. */ |
49 | static DEFINE_SPINLOCK(rds_conn_lock); |
50 | static unsigned long rds_conn_count; |
51 | static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; |
52 | static struct kmem_cache *rds_conn_slab; |
53 | |
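/* Hash a (local, peer) address pair into one of the buckets above.
 * Only the low 32 bits of the local address feed the hash; IPv4 peers
 * use v4-mapped addresses, so that word holds the whole IPv4 address.
 * The once-initialized random secrets keep the bucket distribution
 * unpredictable to remote peers.
 */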
54 | static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, |
55 | const struct in6_addr *faddr) |
56 | { |
57 | static u32 rds6_hash_secret __read_mostly; |
58 | static u32 rds_hash_secret __read_mostly; |
59 | |
60 | u32 lhash, fhash, hash; |
61 | |
62 | net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); |
63 | net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); |
64 | |
65 | lhash = (__force u32)laddr->s6_addr32[3]; |
66 | #if IS_ENABLED(CONFIG_IPV6) |
	fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
68 | #else |
69 | fhash = (__force u32)faddr->s6_addr32[3]; |
70 | #endif |
	hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
72 | |
73 | return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; |
74 | } |
75 | |
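/* Set an RDS_INFO_CONNECTION_FLAG_* bit in @var when @test is true;
 * used by the info visitors below to report path state to userspace.
 */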
76 | #define rds_conn_info_set(var, test, suffix) do { \ |
77 | if (test) \ |
78 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ |
79 | } while (0) |
80 | |
81 | /* rcu read lock must be held or the connection spinlock */ |
82 | static struct rds_connection *rds_conn_lookup(struct net *net, |
83 | struct hlist_head *head, |
84 | const struct in6_addr *laddr, |
85 | const struct in6_addr *faddr, |
86 | struct rds_transport *trans, |
87 | u8 tos, int dev_if) |
88 | { |
89 | struct rds_connection *conn, *ret = NULL; |
90 | |
91 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
		if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
		    ipv6_addr_equal(&conn->c_laddr, laddr) &&
94 | conn->c_trans == trans && |
95 | conn->c_tos == tos && |
96 | net == rds_conn_net(conn) && |
97 | conn->c_dev_if == dev_if) { |
98 | ret = conn; |
99 | break; |
100 | } |
101 | } |
	rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
		 laddr, faddr);
104 | return ret; |
105 | } |
106 | |
107 | /* |
108 | * This is called by transports as they're bringing down a connection. |
109 | * It clears partial message state so that the transport can start sending |
110 | * and receiving over this connection again in the future. It is up to |
111 | * the transport to have serialized this call with its send and recv. |
112 | */ |
113 | static void rds_conn_path_reset(struct rds_conn_path *cp) |
114 | { |
115 | struct rds_connection *conn = cp->cp_conn; |
116 | |
	rdsdebug("connection %pI6c to %pI6c reset\n",
118 | &conn->c_laddr, &conn->c_faddr); |
119 | |
120 | rds_stats_inc(s_conn_reset); |
	rds_send_path_reset(cp);
122 | cp->cp_flags = 0; |
123 | |
124 | /* Do not clear next_rx_seq here, else we cannot distinguish |
125 | * retransmitted packets from new packets, and will hand all |
126 | * of them to the application. That is not consistent with the |
127 | * reliability guarantees of RDS. */ |
128 | } |
129 | |
130 | static void __rds_conn_path_init(struct rds_connection *conn, |
131 | struct rds_conn_path *cp, bool is_outgoing) |
132 | { |
133 | spin_lock_init(&cp->cp_lock); |
134 | cp->cp_next_tx_seq = 1; |
135 | init_waitqueue_head(&cp->cp_waitq); |
	INIT_LIST_HEAD(&cp->cp_send_queue);
	INIT_LIST_HEAD(&cp->cp_retrans);
138 | |
139 | cp->cp_conn = conn; |
	atomic_set(&cp->cp_state, RDS_CONN_DOWN);
141 | cp->cp_send_gen = 0; |
142 | cp->cp_reconnect_jiffies = 0; |
143 | cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; |
144 | INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); |
145 | INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); |
146 | INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); |
147 | INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); |
148 | mutex_init(&cp->cp_cm_lock); |
149 | cp->cp_flags = 0; |
150 | } |
151 | |
152 | /* |
 * There is only ever one 'conn' for a given pair of addresses in the
154 | * system at a time. They contain messages to be retransmitted and so |
155 | * span the lifetime of the actual underlying transport connections. |
156 | * |
157 | * For now they are not garbage collected once they're created. They |
158 | * are torn down as the module is removed, if ever. |
159 | */ |
160 | static struct rds_connection *__rds_conn_create(struct net *net, |
161 | const struct in6_addr *laddr, |
162 | const struct in6_addr *faddr, |
163 | struct rds_transport *trans, |
164 | gfp_t gfp, u8 tos, |
165 | int is_outgoing, |
166 | int dev_if) |
167 | { |
168 | struct rds_connection *conn, *parent = NULL; |
169 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); |
170 | struct rds_transport *loop_trans; |
171 | unsigned long flags; |
172 | int ret, i; |
173 | int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); |
174 | |
175 | rcu_read_lock(); |
176 | conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); |
177 | if (conn && |
178 | conn->c_loopback && |
179 | conn->c_trans != &rds_loop_transport && |
	    ipv6_addr_equal(laddr, faddr) &&
181 | !is_outgoing) { |
182 | /* This is a looped back IB connection, and we're |
183 | * called by the code handling the incoming connect. |
184 | * We need a second connection object into which we |
185 | * can stick the other QP. */ |
186 | parent = conn; |
187 | conn = parent->c_passive; |
188 | } |
189 | rcu_read_unlock(); |
190 | if (conn) |
191 | goto out; |
192 | |
	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
	if (!conn) {
		conn = ERR_PTR(-ENOMEM);
		goto out;
	}
	conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp);
	if (!conn->c_path) {
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(-ENOMEM);
		goto out;
	}
204 | |
	INIT_HLIST_NODE(&conn->c_hash_node);
206 | conn->c_laddr = *laddr; |
	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
208 | conn->c_faddr = *faddr; |
209 | conn->c_dev_if = dev_if; |
210 | conn->c_tos = tos; |
211 | |
212 | #if IS_ENABLED(CONFIG_IPV6) |
213 | /* If the local address is link local, set c_bound_if to be the |
214 | * index used for this connection. Otherwise, set it to 0 as |
215 | * the socket is not bound to an interface. c_bound_if is used |
	 * to look up a socket when a packet is received.
217 | */ |
	if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
219 | conn->c_bound_if = dev_if; |
220 | else |
221 | #endif |
222 | conn->c_bound_if = 0; |
223 | |
224 | rds_conn_net_set(conn, net); |
225 | |
226 | ret = rds_cong_get_maps(conn); |
227 | if (ret) { |
		kfree(conn->c_path);
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(ret);
231 | goto out; |
232 | } |
233 | |
234 | /* |
235 | * This is where a connection becomes loopback. If *any* RDS sockets |
236 | * can bind to the destination address then we'd rather the messages |
237 | * flow through loopback rather than either transport. |
238 | */ |
	loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
240 | if (loop_trans) { |
		rds_trans_put(loop_trans);
242 | conn->c_loopback = 1; |
243 | if (trans->t_prefer_loopback) { |
244 | if (likely(is_outgoing)) { |
245 | /* "outgoing" connection to local address. |
246 | * Protocol says it wants the connection |
247 | * handled by the loopback transport. |
248 | * This is what TCP does. |
249 | */ |
250 | trans = &rds_loop_transport; |
251 | } else { |
252 | /* No transport currently in use |
253 | * should end up here, but if it |
254 | * does, reset/destroy the connection. |
255 | */ |
				kfree(conn->c_path);
				kmem_cache_free(rds_conn_slab, conn);
				conn = ERR_PTR(-EOPNOTSUPP);
259 | goto out; |
260 | } |
261 | } |
262 | } |
263 | |
264 | conn->c_trans = trans; |
265 | |
266 | init_waitqueue_head(&conn->c_hs_waitq); |
267 | for (i = 0; i < npaths; i++) { |
		__rds_conn_path_init(conn, &conn->c_path[i],
				     is_outgoing);
270 | conn->c_path[i].cp_index = i; |
271 | } |
272 | rcu_read_lock(); |
273 | if (rds_destroy_pending(conn)) |
274 | ret = -ENETDOWN; |
275 | else |
276 | ret = trans->conn_alloc(conn, GFP_ATOMIC); |
277 | if (ret) { |
278 | rcu_read_unlock(); |
		kfree(conn->c_path);
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(ret);
282 | goto out; |
283 | } |
284 | |
	rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
		 conn, laddr, faddr,
		 strnlen(trans->t_name, sizeof(trans->t_name)) ?
		 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
289 | |
290 | /* |
291 | * Since we ran without holding the conn lock, someone could |
292 | * have created the same conn (either normal or passive) in the |
293 | * interim. We check while holding the lock. If we won, we complete |
294 | * init and return our conn. If we lost, we rollback and return the |
295 | * other one. |
296 | */ |
297 | spin_lock_irqsave(&rds_conn_lock, flags); |
298 | if (parent) { |
299 | /* Creating passive conn */ |
300 | if (parent->c_passive) { |
301 | trans->conn_free(conn->c_path[0].cp_transport_data); |
			kfree(conn->c_path);
			kmem_cache_free(rds_conn_slab, conn);
304 | conn = parent->c_passive; |
305 | } else { |
306 | parent->c_passive = conn; |
307 | rds_cong_add_conn(conn); |
308 | rds_conn_count++; |
309 | } |
310 | } else { |
311 | /* Creating normal conn */ |
312 | struct rds_connection *found; |
313 | |
314 | found = rds_conn_lookup(net, head, laddr, faddr, trans, |
315 | tos, dev_if); |
316 | if (found) { |
317 | struct rds_conn_path *cp; |
318 | int i; |
319 | |
320 | for (i = 0; i < npaths; i++) { |
321 | cp = &conn->c_path[i]; |
				/* The ->conn_alloc invocation may have
				 * allocated resources for all paths, so all
				 * of them may have to be freed here.
				 */
326 | if (cp->cp_transport_data) |
327 | trans->conn_free(cp->cp_transport_data); |
328 | } |
			kfree(conn->c_path);
			kmem_cache_free(rds_conn_slab, conn);
331 | conn = found; |
332 | } else { |
333 | conn->c_my_gen_num = rds_gen_num; |
334 | conn->c_peer_gen_num = 0; |
			hlist_add_head_rcu(&conn->c_hash_node, head);
336 | rds_cong_add_conn(conn); |
337 | rds_conn_count++; |
338 | } |
339 | } |
	spin_unlock_irqrestore(&rds_conn_lock, flags);
341 | rcu_read_unlock(); |
342 | |
343 | out: |
344 | return conn; |
345 | } |
346 | |
347 | struct rds_connection *rds_conn_create(struct net *net, |
348 | const struct in6_addr *laddr, |
349 | const struct in6_addr *faddr, |
350 | struct rds_transport *trans, u8 tos, |
351 | gfp_t gfp, int dev_if) |
352 | { |
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
354 | } |
355 | EXPORT_SYMBOL_GPL(rds_conn_create); |
356 | |
357 | struct rds_connection *rds_conn_create_outgoing(struct net *net, |
358 | const struct in6_addr *laddr, |
359 | const struct in6_addr *faddr, |
360 | struct rds_transport *trans, |
361 | u8 tos, gfp_t gfp, int dev_if) |
362 | { |
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
364 | } |
365 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); |
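
/* A minimal caller sketch, loosely modelled on the send path; the
 * variable names are illustrative, not taken from any one caller:
 *
 *	conn = rds_conn_create_outgoing(sock_net(sk), &laddr, &faddr,
 *					trans, tos, GFP_KERNEL, dev_if);
 *	if (IS_ERR(conn))
 *		return PTR_ERR(conn);
 *
 * Repeated calls with the same (laddr, faddr, transport, tos, dev_if)
 * tuple return the same rds_connection, so callers need not cache it.
 */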
366 | |
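/* Drive a connection path down and, if the connection is still in the
 * hash (i.e. not being destroyed), queue a reconnect. The transitions
 * performed here are:
 *
 *	UP / ERROR -> DISCONNECTING -> DOWN -> reconnect
 *
 * A path that is already DOWN skips straight to the reconnect check.
 */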
367 | void rds_conn_shutdown(struct rds_conn_path *cp) |
368 | { |
369 | struct rds_connection *conn = cp->cp_conn; |
370 | |
371 | /* shut it down unless it's down already */ |
	if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
373 | /* |
374 | * Quiesce the connection mgmt handlers before we start tearing |
375 | * things down. We don't hold the mutex for the entire |
376 | * duration of the shutdown operation, else we may be |
377 | * deadlocking with the CM handler. Instead, the CM event |
378 | * handler is supposed to check for state DISCONNECTING |
379 | */ |
380 | mutex_lock(&cp->cp_cm_lock); |
		if (!rds_conn_path_transition(cp, RDS_CONN_UP,
					      RDS_CONN_DISCONNECTING) &&
		    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
					      RDS_CONN_DISCONNECTING)) {
			rds_conn_path_error(cp,
					    "shutdown called in state %d\n",
387 | atomic_read(&cp->cp_state)); |
			mutex_unlock(&cp->cp_cm_lock);
389 | return; |
390 | } |
		mutex_unlock(&cp->cp_cm_lock);
392 | |
393 | wait_event(cp->cp_waitq, |
394 | !test_bit(RDS_IN_XMIT, &cp->cp_flags)); |
395 | wait_event(cp->cp_waitq, |
396 | !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); |
397 | |
398 | conn->c_trans->conn_path_shutdown(cp); |
399 | rds_conn_path_reset(cp); |
400 | |
		if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
					      RDS_CONN_DOWN) &&
		    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
					      RDS_CONN_DOWN)) {
			/* This can happen - e.g. when we're in the middle of tearing
406 | * down the connection, and someone unloads the rds module. |
407 | * Quite reproducible with loopback connections. |
408 | * Mostly harmless. |
409 | * |
410 | * Note that this also happens with rds-tcp because |
411 | * we could have triggered rds_conn_path_drop in irq |
412 | * mode from rds_tcp_state change on the receipt of |
413 | * a FIN, thus we need to recheck for RDS_CONN_ERROR |
414 | * here. |
415 | */ |
			rds_conn_path_error(cp, "%s: failed to transition "
					    "to state DOWN, current state "
					    "is %d\n", __func__,
419 | atomic_read(&cp->cp_state)); |
420 | return; |
421 | } |
422 | } |
423 | |
424 | /* Then reconnect if it's still live. |
425 | * The passive side of an IB loopback connection is never added |
426 | * to the conn hash, so we never trigger a reconnect on this |
427 | * conn - the reconnect is always triggered by the active peer. */ |
	cancel_delayed_work_sync(&cp->cp_conn_w);
429 | rcu_read_lock(); |
	if (!hlist_unhashed(&conn->c_hash_node)) {
431 | rcu_read_unlock(); |
432 | rds_queue_reconnect(cp); |
433 | } else { |
434 | rcu_read_unlock(); |
435 | } |
436 | } |
437 | |
438 | /* destroy a single rds_conn_path. rds_conn_destroy() iterates over |
439 | * all paths using rds_conn_path_destroy() |
440 | */ |
441 | static void rds_conn_path_destroy(struct rds_conn_path *cp) |
442 | { |
443 | struct rds_message *rm, *rtmp; |
444 | |
445 | if (!cp->cp_transport_data) |
446 | return; |
447 | |
448 | /* make sure lingering queued work won't try to ref the conn */ |
	cancel_delayed_work_sync(&cp->cp_send_w);
	cancel_delayed_work_sync(&cp->cp_recv_w);
451 | |
	rds_conn_path_drop(cp, true);
	flush_work(&cp->cp_down_w);
454 | |
455 | /* tear down queued messages */ |
456 | list_for_each_entry_safe(rm, rtmp, |
457 | &cp->cp_send_queue, |
458 | m_conn_item) { |
		list_del_init(&rm->m_conn_item);
460 | BUG_ON(!list_empty(&rm->m_sock_item)); |
461 | rds_message_put(rm); |
462 | } |
463 | if (cp->cp_xmit_rm) |
		rds_message_put(cp->cp_xmit_rm);
465 | |
466 | WARN_ON(delayed_work_pending(&cp->cp_send_w)); |
467 | WARN_ON(delayed_work_pending(&cp->cp_recv_w)); |
468 | WARN_ON(delayed_work_pending(&cp->cp_conn_w)); |
469 | WARN_ON(work_pending(&cp->cp_down_w)); |
470 | |
471 | cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); |
472 | } |
473 | |
474 | /* |
475 | * Stop and free a connection. |
476 | * |
477 | * This can only be used in very limited circumstances. It assumes that once |
478 | * the conn has been shutdown that no one else is referencing the connection. |
479 | * We can only ensure this in the rmmod path in the current code. |
480 | */ |
481 | void rds_conn_destroy(struct rds_connection *conn) |
482 | { |
483 | unsigned long flags; |
484 | int i; |
485 | struct rds_conn_path *cp; |
486 | int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); |
487 | |
	rdsdebug("freeing conn %p for %pI6c -> %pI6c\n",
		 conn, &conn->c_laddr, &conn->c_faddr);
491 | |
492 | /* Ensure conn will not be scheduled for reconnect */ |
	spin_lock_irq(&rds_conn_lock);
	hlist_del_init_rcu(&conn->c_hash_node);
	spin_unlock_irq(&rds_conn_lock);
496 | synchronize_rcu(); |
497 | |
498 | /* shut the connection down */ |
499 | for (i = 0; i < npaths; i++) { |
500 | cp = &conn->c_path[i]; |
501 | rds_conn_path_destroy(cp); |
502 | BUG_ON(!list_empty(&cp->cp_retrans)); |
503 | } |
504 | |
505 | /* |
506 | * The congestion maps aren't freed up here. They're |
507 | * freed by rds_cong_exit() after all the connections |
508 | * have been freed. |
509 | */ |
510 | rds_cong_remove_conn(conn); |
511 | |
	kfree(conn->c_path);
	kmem_cache_free(rds_conn_slab, conn);
514 | |
515 | spin_lock_irqsave(&rds_conn_lock, flags); |
516 | rds_conn_count--; |
	spin_unlock_irqrestore(&rds_conn_lock, flags);
518 | } |
519 | EXPORT_SYMBOL_GPL(rds_conn_destroy); |
520 | |
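/* Copy one queued message's info entry to userspace, dispatching to
 * the IPv6 or IPv4 copier depending on which listing was requested.
 */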
521 | static void __rds_inc_msg_cp(struct rds_incoming *inc, |
522 | struct rds_info_iterator *iter, |
523 | void *saddr, void *daddr, int flip, bool isv6) |
524 | { |
525 | #if IS_ENABLED(CONFIG_IPV6) |
526 | if (isv6) |
527 | rds6_inc_info_copy(inc, iter, saddr, daddr, flip); |
528 | else |
529 | #endif |
		rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
				  *(__be32 *)daddr, flip);
532 | } |
533 | |
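/* Walk every connection in the hash and, for each path, copy info for
 * either its send queue or its retransmit queue. Entries beyond the
 * userspace buffer are counted but not copied, so the caller can size
 * the buffer from lens->nr and retry.
 */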
534 | static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, |
535 | struct rds_info_iterator *iter, |
536 | struct rds_info_lengths *lens, |
537 | int want_send, bool isv6) |
538 | { |
539 | struct hlist_head *head; |
540 | struct list_head *list; |
541 | struct rds_connection *conn; |
542 | struct rds_message *rm; |
543 | unsigned int total = 0; |
544 | unsigned long flags; |
545 | size_t i; |
546 | int j; |
547 | |
548 | if (isv6) |
549 | len /= sizeof(struct rds6_info_message); |
550 | else |
551 | len /= sizeof(struct rds_info_message); |
552 | |
553 | rcu_read_lock(); |
554 | |
555 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
556 | i++, head++) { |
557 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
558 | struct rds_conn_path *cp; |
559 | int npaths; |
560 | |
561 | if (!isv6 && conn->c_isv6) |
562 | continue; |
563 | |
564 | npaths = (conn->c_trans->t_mp_capable ? |
565 | RDS_MPATH_WORKERS : 1); |
566 | |
567 | for (j = 0; j < npaths; j++) { |
568 | cp = &conn->c_path[j]; |
569 | if (want_send) |
570 | list = &cp->cp_send_queue; |
571 | else |
572 | list = &cp->cp_retrans; |
573 | |
574 | spin_lock_irqsave(&cp->cp_lock, flags); |
575 | |
576 | /* XXX too lazy to maintain counts.. */ |
577 | list_for_each_entry(rm, list, m_conn_item) { |
578 | total++; |
579 | if (total <= len) |
						__rds_inc_msg_cp(&rm->m_inc,
								 iter,
								 &conn->c_laddr,
								 &conn->c_faddr,
								 0, isv6);
585 | } |
586 | |
				spin_unlock_irqrestore(&cp->cp_lock, flags);
588 | } |
589 | } |
590 | } |
591 | rcu_read_unlock(); |
592 | |
593 | lens->nr = total; |
594 | if (isv6) |
595 | lens->each = sizeof(struct rds6_info_message); |
596 | else |
597 | lens->each = sizeof(struct rds_info_message); |
598 | } |
599 | |
600 | static void rds_conn_message_info(struct socket *sock, unsigned int len, |
601 | struct rds_info_iterator *iter, |
602 | struct rds_info_lengths *lens, |
603 | int want_send) |
604 | { |
	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
606 | } |
607 | |
608 | #if IS_ENABLED(CONFIG_IPV6) |
609 | static void rds6_conn_message_info(struct socket *sock, unsigned int len, |
610 | struct rds_info_iterator *iter, |
611 | struct rds_info_lengths *lens, |
612 | int want_send) |
613 | { |
	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
615 | } |
616 | #endif |
617 | |
618 | static void rds_conn_message_info_send(struct socket *sock, unsigned int len, |
619 | struct rds_info_iterator *iter, |
620 | struct rds_info_lengths *lens) |
621 | { |
	rds_conn_message_info(sock, len, iter, lens, 1);
623 | } |
624 | |
625 | #if IS_ENABLED(CONFIG_IPV6) |
626 | static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, |
627 | struct rds_info_iterator *iter, |
628 | struct rds_info_lengths *lens) |
629 | { |
	rds6_conn_message_info(sock, len, iter, lens, 1);
631 | } |
632 | #endif |
633 | |
634 | static void rds_conn_message_info_retrans(struct socket *sock, |
635 | unsigned int len, |
636 | struct rds_info_iterator *iter, |
637 | struct rds_info_lengths *lens) |
638 | { |
	rds_conn_message_info(sock, len, iter, lens, 0);
640 | } |
641 | |
642 | #if IS_ENABLED(CONFIG_IPV6) |
643 | static void rds6_conn_message_info_retrans(struct socket *sock, |
644 | unsigned int len, |
645 | struct rds_info_iterator *iter, |
646 | struct rds_info_lengths *lens) |
647 | { |
	rds6_conn_message_info(sock, len, iter, lens, 0);
649 | } |
650 | #endif |
651 | |
652 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, |
653 | struct rds_info_iterator *iter, |
654 | struct rds_info_lengths *lens, |
655 | int (*visitor)(struct rds_connection *, void *), |
656 | u64 *buffer, |
657 | size_t item_len) |
658 | { |
659 | struct hlist_head *head; |
660 | struct rds_connection *conn; |
661 | size_t i; |
662 | |
663 | rcu_read_lock(); |
664 | |
665 | lens->nr = 0; |
666 | lens->each = item_len; |
667 | |
668 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
669 | i++, head++) { |
670 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
671 | |
672 | /* XXX no c_lock usage.. */ |
673 | if (!visitor(conn, buffer)) |
674 | continue; |
675 | |
676 | /* We copy as much as we can fit in the buffer, |
677 | * but we count all items so that the caller |
678 | * can resize the buffer. */ |
679 | if (len >= item_len) { |
				rds_info_copy(iter, buffer, item_len);
681 | len -= item_len; |
682 | } |
683 | lens->nr++; |
684 | } |
685 | } |
686 | rcu_read_unlock(); |
687 | } |
688 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); |
689 | |
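/* Like rds_for_each_conn_info(), except that the visitor is handed a
 * single rds_conn_path rather than the connection. Only path 0 is
 * visited; see the XXX comment below about MPRDS reporting.
 */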
690 | static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, |
691 | struct rds_info_iterator *iter, |
692 | struct rds_info_lengths *lens, |
693 | int (*visitor)(struct rds_conn_path *, void *), |
694 | u64 *buffer, |
695 | size_t item_len) |
696 | { |
697 | struct hlist_head *head; |
698 | struct rds_connection *conn; |
699 | size_t i; |
700 | |
701 | rcu_read_lock(); |
702 | |
703 | lens->nr = 0; |
704 | lens->each = item_len; |
705 | |
706 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
707 | i++, head++) { |
708 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
709 | struct rds_conn_path *cp; |
710 | |
711 | /* XXX We only copy the information from the first |
			 * path for now. The problem is that if there is
			 * more than one underlying path, we cannot report
			 * information for all of them using the existing
715 | * API. For example, there is only one next_tx_seq, |
716 | * which path's next_tx_seq should we report? It is |
717 | * a bug in the design of MPRDS. |
718 | */ |
719 | cp = conn->c_path; |
720 | |
721 | /* XXX no cp_lock usage.. */ |
722 | if (!visitor(cp, buffer)) |
723 | continue; |
724 | |
725 | /* We copy as much as we can fit in the buffer, |
726 | * but we count all items so that the caller |
727 | * can resize the buffer. |
728 | */ |
729 | if (len >= item_len) { |
				rds_info_copy(iter, buffer, item_len);
731 | len -= item_len; |
732 | } |
733 | lens->nr++; |
734 | } |
735 | } |
736 | rcu_read_unlock(); |
737 | } |
738 | |
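/* Fill in one rds_info_connection entry from a connection path.
 * Returns 1 if the entry was filled, 0 to skip it; IPv6 connections
 * are skipped since they are reported via the RDS6 listing instead.
 */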
739 | static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) |
740 | { |
741 | struct rds_info_connection *cinfo = buffer; |
742 | struct rds_connection *conn = cp->cp_conn; |
743 | |
744 | if (conn->c_isv6) |
745 | return 0; |
746 | |
747 | cinfo->next_tx_seq = cp->cp_next_tx_seq; |
748 | cinfo->next_rx_seq = cp->cp_next_rx_seq; |
749 | cinfo->laddr = conn->c_laddr.s6_addr32[3]; |
750 | cinfo->faddr = conn->c_faddr.s6_addr32[3]; |
751 | cinfo->tos = conn->c_tos; |
	strncpy(cinfo->transport, conn->c_trans->t_name,
		sizeof(cinfo->transport));
754 | cinfo->flags = 0; |
755 | |
756 | rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), |
757 | SENDING); |
758 | /* XXX Future: return the state rather than these funky bits */ |
759 | rds_conn_info_set(cinfo->flags, |
760 | atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, |
761 | CONNECTING); |
762 | rds_conn_info_set(cinfo->flags, |
763 | atomic_read(&cp->cp_state) == RDS_CONN_UP, |
764 | CONNECTED); |
765 | return 1; |
766 | } |
767 | |
768 | #if IS_ENABLED(CONFIG_IPV6) |
769 | static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) |
770 | { |
771 | struct rds6_info_connection *cinfo6 = buffer; |
772 | struct rds_connection *conn = cp->cp_conn; |
773 | |
774 | cinfo6->next_tx_seq = cp->cp_next_tx_seq; |
775 | cinfo6->next_rx_seq = cp->cp_next_rx_seq; |
776 | cinfo6->laddr = conn->c_laddr; |
777 | cinfo6->faddr = conn->c_faddr; |
	strncpy(cinfo6->transport, conn->c_trans->t_name,
		sizeof(cinfo6->transport));
780 | cinfo6->flags = 0; |
781 | |
782 | rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), |
783 | SENDING); |
784 | /* XXX Future: return the state rather than these funky bits */ |
785 | rds_conn_info_set(cinfo6->flags, |
786 | atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, |
787 | CONNECTING); |
788 | rds_conn_info_set(cinfo6->flags, |
789 | atomic_read(&cp->cp_state) == RDS_CONN_UP, |
790 | CONNECTED); |
791 | /* Just return 1 as there is no error case. This is a helper function |
792 | * for rds_walk_conn_path_info() and it wants a return value. |
793 | */ |
794 | return 1; |
795 | } |
796 | #endif |
797 | |
798 | static void rds_conn_info(struct socket *sock, unsigned int len, |
799 | struct rds_info_iterator *iter, |
800 | struct rds_info_lengths *lens) |
801 | { |
802 | u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; |
803 | |
	rds_walk_conn_path_info(sock, len, iter, lens,
				rds_conn_info_visitor,
				buffer,
				sizeof(struct rds_info_connection));
808 | } |
809 | |
810 | #if IS_ENABLED(CONFIG_IPV6) |
811 | static void rds6_conn_info(struct socket *sock, unsigned int len, |
812 | struct rds_info_iterator *iter, |
813 | struct rds_info_lengths *lens) |
814 | { |
815 | u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; |
816 | |
	rds_walk_conn_path_info(sock, len, iter, lens,
				rds6_conn_info_visitor,
				buffer,
				sizeof(struct rds6_info_connection));
821 | } |
822 | #endif |
823 | |
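/* Module init: register the loopback pernet ops, create the connection
 * slab and hook up the rds-info handlers. A slab allocation failure
 * unwinds the pernet registration before returning.
 */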
824 | int rds_conn_init(void) |
825 | { |
826 | int ret; |
827 | |
828 | ret = rds_loop_net_init(); /* register pernet callback */ |
829 | if (ret) |
830 | return ret; |
831 | |
	rds_conn_slab = kmem_cache_create("rds_connection",
					  sizeof(struct rds_connection),
					  0, 0, NULL);
835 | if (!rds_conn_slab) { |
836 | rds_loop_net_exit(); |
837 | return -ENOMEM; |
838 | } |
839 | |
	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
	rds_info_register_func(RDS_INFO_SEND_MESSAGES,
			       rds_conn_message_info_send);
	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
			       rds_conn_message_info_retrans);
845 | #if IS_ENABLED(CONFIG_IPV6) |
	rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
	rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
			       rds6_conn_message_info_send);
	rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
			       rds6_conn_message_info_retrans);
851 | #endif |
852 | return 0; |
853 | } |
854 | |
855 | void rds_conn_exit(void) |
856 | { |
857 | rds_loop_net_exit(); /* unregister pernet callback */ |
858 | rds_loop_exit(); |
859 | |
860 | WARN_ON(!hlist_empty(rds_conn_hash)); |
861 | |
	kmem_cache_destroy(rds_conn_slab);
863 | |
	rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
	rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
				 rds_conn_message_info_send);
	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
				 rds_conn_message_info_retrans);
869 | #if IS_ENABLED(CONFIG_IPV6) |
	rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
	rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
				 rds6_conn_message_info_send);
	rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
				 rds6_conn_message_info_retrans);
875 | #endif |
876 | } |
877 | |
878 | /* |
879 | * Force a disconnect |
880 | */ |
881 | void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) |
882 | { |
	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
884 | |
885 | rcu_read_lock(); |
	if (!destroy && rds_destroy_pending(cp->cp_conn)) {
887 | rcu_read_unlock(); |
888 | return; |
889 | } |
	queue_work(rds_wq, &cp->cp_down_w);
891 | rcu_read_unlock(); |
892 | } |
893 | EXPORT_SYMBOL_GPL(rds_conn_path_drop); |
894 | |
895 | void rds_conn_drop(struct rds_connection *conn) |
896 | { |
897 | WARN_ON(conn->c_trans->t_mp_capable); |
898 | rds_conn_path_drop(&conn->c_path[0], false); |
899 | } |
900 | EXPORT_SYMBOL_GPL(rds_conn_drop); |
901 | |
902 | /* |
903 | * If the connection is down, trigger a connect. We may have scheduled a |
904 | * delayed reconnect however - in this case we should not interfere. |
905 | */ |
906 | void rds_conn_path_connect_if_down(struct rds_conn_path *cp) |
907 | { |
908 | rcu_read_lock(); |
	if (rds_destroy_pending(cp->cp_conn)) {
910 | rcu_read_unlock(); |
911 | return; |
912 | } |
913 | if (rds_conn_path_state(cp) == RDS_CONN_DOWN && |
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
916 | rcu_read_unlock(); |
917 | } |
918 | EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); |
919 | |
/* Check connectivity of all paths. Note the do/while: path 0 is
 * always checked, even while c_npaths is still 0 before the MPRDS
 * handshake has established the real path count.
 */
922 | void rds_check_all_paths(struct rds_connection *conn) |
923 | { |
924 | int i = 0; |
925 | |
926 | do { |
927 | rds_conn_path_connect_if_down(&conn->c_path[i]); |
928 | } while (++i < conn->c_npaths); |
929 | } |
930 | |
931 | void rds_conn_connect_if_down(struct rds_connection *conn) |
932 | { |
933 | WARN_ON(conn->c_trans->t_mp_capable); |
934 | rds_conn_path_connect_if_down(&conn->c_path[0]); |
935 | } |
936 | EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); |
937 | |
938 | void |
939 | __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) |
940 | { |
941 | va_list ap; |
942 | |
943 | va_start(ap, fmt); |
	vprintk(fmt, ap);
945 | va_end(ap); |
946 | |
947 | rds_conn_path_drop(cp, false); |
948 | } |
949 | |