1 | /* |
2 | * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. |
3 | * |
4 | * This software is available to you under a choice of one of two |
5 | * licenses. You may choose to be licensed under the terms of the GNU |
6 | * General Public License (GPL) Version 2, available from the file |
7 | * COPYING in the main directory of this source tree, or the |
8 | * OpenIB.org BSD license below: |
9 | * |
10 | * Redistribution and use in source and binary forms, with or |
11 | * without modification, are permitted provided that the following |
12 | * conditions are met: |
13 | * |
14 | * - Redistributions of source code must retain the above |
15 | * copyright notice, this list of conditions and the following |
16 | * disclaimer. |
17 | * |
18 | * - Redistributions in binary form must reproduce the above |
19 | * copyright notice, this list of conditions and the following |
20 | * disclaimer in the documentation and/or other materials |
21 | * provided with the distribution. |
22 | * |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
30 | * SOFTWARE. |
31 | * |
32 | */ |
33 | #include <linux/kernel.h> |
34 | #include <linux/list.h> |
35 | #include <linux/slab.h> |
36 | #include <linux/export.h> |
37 | #include <net/ipv6.h> |
38 | #include <net/inet6_hashtables.h> |
39 | #include <net/addrconf.h> |
40 | |
41 | #include "rds.h" |
42 | #include "loop.h" |
43 | |
44 | #define RDS_CONNECTION_HASH_BITS 12 |
45 | #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) |
46 | #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) |
47 | |
48 | /* converting this to RCU is a chore for another day.. */ |
49 | static DEFINE_SPINLOCK(rds_conn_lock); |
50 | static unsigned long rds_conn_count; |
51 | static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; |
52 | static struct kmem_cache *rds_conn_slab; |
53 | |
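/* Hash a (local, peer) address pair into one of the buckets above.
 * Only the low 32 bits of the local address feed the hash; IPv4 peers
 * use v4-mapped addresses, so that word holds the whole IPv4 address.
 * The once-initialized random secrets keep the bucket distribution
 * unpredictable to remote peers.
 */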
54 | static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, |
55 | const struct in6_addr *faddr) |
56 | { |
57 | static u32 rds6_hash_secret __read_mostly; |
58 | static u32 rds_hash_secret __read_mostly; |
59 | |
60 | u32 lhash, fhash, hash; |
61 | |
62 | net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); |
63 | net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); |
64 | |
65 | lhash = (__force u32)laddr->s6_addr32[3]; |
66 | #if IS_ENABLED(CONFIG_IPV6) |
	fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
68 | #else |
69 | fhash = (__force u32)faddr->s6_addr32[3]; |
70 | #endif |
	hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
72 | |
73 | return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; |
74 | } |
75 | |
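/* Set an RDS_INFO_CONNECTION_FLAG_* bit in @var when @test is true;
 * used by the info visitors below to report path state to userspace.
 */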
76 | #define rds_conn_info_set(var, test, suffix) do { \ |
77 | if (test) \ |
78 | var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ |
79 | } while (0) |
80 | |
81 | /* rcu read lock must be held or the connection spinlock */ |
82 | static struct rds_connection *rds_conn_lookup(struct net *net, |
83 | struct hlist_head *head, |
84 | const struct in6_addr *laddr, |
85 | const struct in6_addr *faddr, |
86 | struct rds_transport *trans, |
87 | u8 tos, int dev_if) |
88 | { |
89 | struct rds_connection *conn, *ret = NULL; |
90 | |
91 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
		if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
		    ipv6_addr_equal(&conn->c_laddr, laddr) &&
94 | conn->c_trans == trans && |
95 | conn->c_tos == tos && |
96 | net == rds_conn_net(conn) && |
97 | conn->c_dev_if == dev_if) { |
98 | ret = conn; |
99 | break; |
100 | } |
101 | } |
	rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
		 laddr, faddr);
104 | return ret; |
105 | } |
106 | |
107 | /* |
108 | * This is called by transports as they're bringing down a connection. |
109 | * It clears partial message state so that the transport can start sending |
110 | * and receiving over this connection again in the future. It is up to |
111 | * the transport to have serialized this call with its send and recv. |
112 | */ |
113 | static void rds_conn_path_reset(struct rds_conn_path *cp) |
114 | { |
115 | struct rds_connection *conn = cp->cp_conn; |
116 | |
	rdsdebug("connection %pI6c to %pI6c reset\n",
118 | &conn->c_laddr, &conn->c_faddr); |
119 | |
120 | rds_stats_inc(s_conn_reset); |
	rds_send_path_reset(cp);
122 | cp->cp_flags = 0; |
123 | |
124 | /* Do not clear next_rx_seq here, else we cannot distinguish |
125 | * retransmitted packets from new packets, and will hand all |
126 | * of them to the application. That is not consistent with the |
127 | * reliability guarantees of RDS. */ |
128 | } |
129 | |
130 | static void __rds_conn_path_init(struct rds_connection *conn, |
131 | struct rds_conn_path *cp, bool is_outgoing) |
132 | { |
133 | spin_lock_init(&cp->cp_lock); |
134 | cp->cp_next_tx_seq = 1; |
135 | init_waitqueue_head(&cp->cp_waitq); |
	INIT_LIST_HEAD(&cp->cp_send_queue);
	INIT_LIST_HEAD(&cp->cp_retrans);
138 | |
139 | cp->cp_conn = conn; |
	atomic_set(&cp->cp_state, RDS_CONN_DOWN);
141 | cp->cp_send_gen = 0; |
142 | cp->cp_reconnect_jiffies = 0; |
143 | cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; |
144 | INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); |
145 | INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); |
146 | INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); |
147 | INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); |
148 | mutex_init(&cp->cp_cm_lock); |
149 | cp->cp_flags = 0; |
150 | } |
151 | |
152 | /* |
 * There is only ever one 'conn' for a given pair of addresses in the
154 | * system at a time. They contain messages to be retransmitted and so |
155 | * span the lifetime of the actual underlying transport connections. |
156 | * |
157 | * For now they are not garbage collected once they're created. They |
158 | * are torn down as the module is removed, if ever. |
159 | */ |
160 | static struct rds_connection *__rds_conn_create(struct net *net, |
161 | const struct in6_addr *laddr, |
162 | const struct in6_addr *faddr, |
163 | struct rds_transport *trans, |
164 | gfp_t gfp, u8 tos, |
165 | int is_outgoing, |
166 | int dev_if) |
167 | { |
168 | struct rds_connection *conn, *parent = NULL; |
169 | struct hlist_head *head = rds_conn_bucket(laddr, faddr); |
170 | struct rds_transport *loop_trans; |
171 | unsigned long flags; |
172 | int ret, i; |
173 | int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); |
174 | |
175 | rcu_read_lock(); |
176 | conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); |
177 | if (conn && |
178 | conn->c_loopback && |
179 | conn->c_trans != &rds_loop_transport && |
	    ipv6_addr_equal(laddr, faddr) &&
181 | !is_outgoing) { |
182 | /* This is a looped back IB connection, and we're |
183 | * called by the code handling the incoming connect. |
184 | * We need a second connection object into which we |
185 | * can stick the other QP. */ |
186 | parent = conn; |
187 | conn = parent->c_passive; |
188 | } |
189 | rcu_read_unlock(); |
190 | if (conn) |
191 | goto out; |
192 | |
	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
	if (!conn) {
		conn = ERR_PTR(-ENOMEM);
		goto out;
	}
	conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp);
	if (!conn->c_path) {
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(-ENOMEM);
		goto out;
	}
204 | |
	INIT_HLIST_NODE(&conn->c_hash_node);
206 | conn->c_laddr = *laddr; |
	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
208 | conn->c_faddr = *faddr; |
209 | conn->c_dev_if = dev_if; |
210 | conn->c_tos = tos; |
211 | |
212 | #if IS_ENABLED(CONFIG_IPV6) |
213 | /* If the local address is link local, set c_bound_if to be the |
214 | * index used for this connection. Otherwise, set it to 0 as |
215 | * the socket is not bound to an interface. c_bound_if is used |
	 * to look up a socket when a packet is received.
217 | */ |
	if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
219 | conn->c_bound_if = dev_if; |
220 | else |
221 | #endif |
222 | conn->c_bound_if = 0; |
223 | |
224 | rds_conn_net_set(conn, net); |
225 | |
226 | ret = rds_cong_get_maps(conn); |
227 | if (ret) { |
		kfree(conn->c_path);
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(ret);
231 | goto out; |
232 | } |
233 | |
234 | /* |
235 | * This is where a connection becomes loopback. If *any* RDS sockets |
236 | * can bind to the destination address then we'd rather the messages |
237 | * flow through loopback rather than either transport. |
238 | */ |
	loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
240 | if (loop_trans) { |
		rds_trans_put(loop_trans);
242 | conn->c_loopback = 1; |
243 | if (trans->t_prefer_loopback) { |
244 | if (likely(is_outgoing)) { |
245 | /* "outgoing" connection to local address. |
246 | * Protocol says it wants the connection |
247 | * handled by the loopback transport. |
248 | * This is what TCP does. |
249 | */ |
250 | trans = &rds_loop_transport; |
251 | } else { |
252 | /* No transport currently in use |
253 | * should end up here, but if it |
254 | * does, reset/destroy the connection. |
255 | */ |
				kfree(conn->c_path);
				kmem_cache_free(rds_conn_slab, conn);
				conn = ERR_PTR(-EOPNOTSUPP);
259 | goto out; |
260 | } |
261 | } |
262 | } |
263 | |
264 | conn->c_trans = trans; |
265 | |
266 | init_waitqueue_head(&conn->c_hs_waitq); |
267 | for (i = 0; i < npaths; i++) { |
		__rds_conn_path_init(conn, &conn->c_path[i],
				     is_outgoing);
270 | conn->c_path[i].cp_index = i; |
271 | } |
272 | rcu_read_lock(); |
273 | if (rds_destroy_pending(conn)) |
274 | ret = -ENETDOWN; |
275 | else |
276 | ret = trans->conn_alloc(conn, GFP_ATOMIC); |
277 | if (ret) { |
278 | rcu_read_unlock(); |
		kfree(conn->c_path);
		kmem_cache_free(rds_conn_slab, conn);
		conn = ERR_PTR(ret);
282 | goto out; |
283 | } |
284 | |
	rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
		 conn, laddr, faddr,
		 strnlen(trans->t_name, sizeof(trans->t_name)) ?
		 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
289 | |
290 | /* |
291 | * Since we ran without holding the conn lock, someone could |
292 | * have created the same conn (either normal or passive) in the |
293 | * interim. We check while holding the lock. If we won, we complete |
294 | * init and return our conn. If we lost, we rollback and return the |
295 | * other one. |
296 | */ |
297 | spin_lock_irqsave(&rds_conn_lock, flags); |
298 | if (parent) { |
299 | /* Creating passive conn */ |
300 | if (parent->c_passive) { |
301 | trans->conn_free(conn->c_path[0].cp_transport_data); |
			kfree(conn->c_path);
			kmem_cache_free(rds_conn_slab, conn);
304 | conn = parent->c_passive; |
305 | } else { |
306 | parent->c_passive = conn; |
307 | rds_cong_add_conn(conn); |
308 | rds_conn_count++; |
309 | } |
310 | } else { |
311 | /* Creating normal conn */ |
312 | struct rds_connection *found; |
313 | |
314 | found = rds_conn_lookup(net, head, laddr, faddr, trans, |
315 | tos, dev_if); |
316 | if (found) { |
317 | struct rds_conn_path *cp; |
318 | int i; |
319 | |
320 | for (i = 0; i < npaths; i++) { |
321 | cp = &conn->c_path[i]; |
				/* The ->conn_alloc invocation may have
				 * allocated resources for all paths, so all
				 * of them may have to be freed here.
				 */
326 | if (cp->cp_transport_data) |
327 | trans->conn_free(cp->cp_transport_data); |
328 | } |
			kfree(conn->c_path);
			kmem_cache_free(rds_conn_slab, conn);
331 | conn = found; |
332 | } else { |
333 | conn->c_my_gen_num = rds_gen_num; |
334 | conn->c_peer_gen_num = 0; |
			hlist_add_head_rcu(&conn->c_hash_node, head);
336 | rds_cong_add_conn(conn); |
337 | rds_conn_count++; |
338 | } |
339 | } |
	spin_unlock_irqrestore(&rds_conn_lock, flags);
341 | rcu_read_unlock(); |
342 | |
343 | out: |
344 | return conn; |
345 | } |
346 | |
347 | struct rds_connection *rds_conn_create(struct net *net, |
348 | const struct in6_addr *laddr, |
349 | const struct in6_addr *faddr, |
350 | struct rds_transport *trans, u8 tos, |
351 | gfp_t gfp, int dev_if) |
352 | { |
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
354 | } |
355 | EXPORT_SYMBOL_GPL(rds_conn_create); |
356 | |
357 | struct rds_connection *rds_conn_create_outgoing(struct net *net, |
358 | const struct in6_addr *laddr, |
359 | const struct in6_addr *faddr, |
360 | struct rds_transport *trans, |
361 | u8 tos, gfp_t gfp, int dev_if) |
362 | { |
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
364 | } |
365 | EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); |
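
/* A minimal caller sketch, loosely modelled on the send path; the
 * variable names are illustrative, not taken from any one caller:
 *
 *	conn = rds_conn_create_outgoing(sock_net(sk), &laddr, &faddr,
 *					trans, tos, GFP_KERNEL, dev_if);
 *	if (IS_ERR(conn))
 *		return PTR_ERR(conn);
 *
 * Repeated calls with the same (laddr, faddr, transport, tos, dev_if)
 * tuple return the same rds_connection, so callers need not cache it.
 */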
366 | |
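/* Drive a connection path down and, if the connection is still in the
 * hash (i.e. not being destroyed), queue a reconnect. The transitions
 * performed here are:
 *
 *	UP / ERROR -> DISCONNECTING -> DOWN -> reconnect
 *
 * A path that is already DOWN skips straight to the reconnect check.
 */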
367 | void rds_conn_shutdown(struct rds_conn_path *cp) |
368 | { |
369 | struct rds_connection *conn = cp->cp_conn; |
370 | |
371 | /* shut it down unless it's down already */ |
	if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
373 | /* |
374 | * Quiesce the connection mgmt handlers before we start tearing |
375 | * things down. We don't hold the mutex for the entire |
376 | * duration of the shutdown operation, else we may be |
377 | * deadlocking with the CM handler. Instead, the CM event |
378 | * handler is supposed to check for state DISCONNECTING |
379 | */ |
380 | mutex_lock(&cp->cp_cm_lock); |
		if (!rds_conn_path_transition(cp, RDS_CONN_UP,
					      RDS_CONN_DISCONNECTING) &&
		    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
					      RDS_CONN_DISCONNECTING)) {
			rds_conn_path_error(cp,
					    "shutdown called in state %d\n",
387 | atomic_read(&cp->cp_state)); |
			mutex_unlock(&cp->cp_cm_lock);
389 | return; |
390 | } |
		mutex_unlock(&cp->cp_cm_lock);
392 | |
393 | wait_event(cp->cp_waitq, |
394 | !test_bit(RDS_IN_XMIT, &cp->cp_flags)); |
395 | wait_event(cp->cp_waitq, |
396 | !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); |
397 | |
398 | conn->c_trans->conn_path_shutdown(cp); |
399 | rds_conn_path_reset(cp); |
400 | |
		if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING,
					      RDS_CONN_DOWN) &&
		    !rds_conn_path_transition(cp, RDS_CONN_ERROR,
					      RDS_CONN_DOWN)) {
			/* This can happen - e.g. when we're in the middle of tearing
406 | * down the connection, and someone unloads the rds module. |
407 | * Quite reproducible with loopback connections. |
408 | * Mostly harmless. |
409 | * |
410 | * Note that this also happens with rds-tcp because |
411 | * we could have triggered rds_conn_path_drop in irq |
412 | * mode from rds_tcp_state change on the receipt of |
413 | * a FIN, thus we need to recheck for RDS_CONN_ERROR |
414 | * here. |
415 | */ |
			rds_conn_path_error(cp, "%s: failed to transition "
					    "to state DOWN, current state "
					    "is %d\n", __func__,
419 | atomic_read(&cp->cp_state)); |
420 | return; |
421 | } |
422 | } |
423 | |
424 | /* Then reconnect if it's still live. |
425 | * The passive side of an IB loopback connection is never added |
426 | * to the conn hash, so we never trigger a reconnect on this |
427 | * conn - the reconnect is always triggered by the active peer. */ |
	cancel_delayed_work_sync(&cp->cp_conn_w);
429 | rcu_read_lock(); |
	if (!hlist_unhashed(&conn->c_hash_node)) {
431 | rcu_read_unlock(); |
432 | rds_queue_reconnect(cp); |
433 | } else { |
434 | rcu_read_unlock(); |
435 | } |
436 | } |
437 | |
438 | /* destroy a single rds_conn_path. rds_conn_destroy() iterates over |
439 | * all paths using rds_conn_path_destroy() |
440 | */ |
441 | static void rds_conn_path_destroy(struct rds_conn_path *cp) |
442 | { |
443 | struct rds_message *rm, *rtmp; |
444 | |
445 | if (!cp->cp_transport_data) |
446 | return; |
447 | |
448 | /* make sure lingering queued work won't try to ref the conn */ |
	cancel_delayed_work_sync(&cp->cp_send_w);
	cancel_delayed_work_sync(&cp->cp_recv_w);
451 | |
	rds_conn_path_drop(cp, true);
	flush_work(&cp->cp_down_w);
454 | |
455 | /* tear down queued messages */ |
456 | list_for_each_entry_safe(rm, rtmp, |
457 | &cp->cp_send_queue, |
458 | m_conn_item) { |
		list_del_init(&rm->m_conn_item);
460 | BUG_ON(!list_empty(&rm->m_sock_item)); |
461 | rds_message_put(rm); |
462 | } |
463 | if (cp->cp_xmit_rm) |
		rds_message_put(cp->cp_xmit_rm);
465 | |
466 | WARN_ON(delayed_work_pending(&cp->cp_send_w)); |
467 | WARN_ON(delayed_work_pending(&cp->cp_recv_w)); |
468 | WARN_ON(delayed_work_pending(&cp->cp_conn_w)); |
469 | WARN_ON(work_pending(&cp->cp_down_w)); |
470 | |
471 | cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); |
472 | } |
473 | |
474 | /* |
475 | * Stop and free a connection. |
476 | * |
477 | * This can only be used in very limited circumstances. It assumes that once |
478 | * the conn has been shutdown that no one else is referencing the connection. |
479 | * We can only ensure this in the rmmod path in the current code. |
480 | */ |
481 | void rds_conn_destroy(struct rds_connection *conn) |
482 | { |
483 | unsigned long flags; |
484 | int i; |
485 | struct rds_conn_path *cp; |
486 | int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); |
487 | |
	rdsdebug("freeing conn %p for %pI6c -> %pI6c\n",
		 conn, &conn->c_laddr, &conn->c_faddr);
491 | |
492 | /* Ensure conn will not be scheduled for reconnect */ |
	spin_lock_irq(&rds_conn_lock);
	hlist_del_init_rcu(&conn->c_hash_node);
	spin_unlock_irq(&rds_conn_lock);
496 | synchronize_rcu(); |
497 | |
498 | /* shut the connection down */ |
499 | for (i = 0; i < npaths; i++) { |
500 | cp = &conn->c_path[i]; |
501 | rds_conn_path_destroy(cp); |
502 | BUG_ON(!list_empty(&cp->cp_retrans)); |
503 | } |
504 | |
505 | /* |
506 | * The congestion maps aren't freed up here. They're |
507 | * freed by rds_cong_exit() after all the connections |
508 | * have been freed. |
509 | */ |
510 | rds_cong_remove_conn(conn); |
511 | |
	kfree(conn->c_path);
	kmem_cache_free(rds_conn_slab, conn);
514 | |
515 | spin_lock_irqsave(&rds_conn_lock, flags); |
516 | rds_conn_count--; |
	spin_unlock_irqrestore(&rds_conn_lock, flags);
518 | } |
519 | EXPORT_SYMBOL_GPL(rds_conn_destroy); |
520 | |
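/* Copy one queued message's info entry to userspace, dispatching to
 * the IPv6 or IPv4 copier depending on which listing was requested.
 */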
521 | static void __rds_inc_msg_cp(struct rds_incoming *inc, |
522 | struct rds_info_iterator *iter, |
523 | void *saddr, void *daddr, int flip, bool isv6) |
524 | { |
525 | #if IS_ENABLED(CONFIG_IPV6) |
526 | if (isv6) |
527 | rds6_inc_info_copy(inc, iter, saddr, daddr, flip); |
528 | else |
529 | #endif |
		rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
				  *(__be32 *)daddr, flip);
532 | } |
533 | |
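/* Walk every connection in the hash and, for each path, copy info for
 * either its send queue or its retransmit queue. Entries beyond the
 * userspace buffer are counted but not copied, so the caller can size
 * the buffer from lens->nr and retry.
 */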
534 | static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, |
535 | struct rds_info_iterator *iter, |
536 | struct rds_info_lengths *lens, |
537 | int want_send, bool isv6) |
538 | { |
539 | struct hlist_head *head; |
540 | struct list_head *list; |
541 | struct rds_connection *conn; |
542 | struct rds_message *rm; |
543 | unsigned int total = 0; |
544 | unsigned long flags; |
545 | size_t i; |
546 | int j; |
547 | |
548 | if (isv6) |
549 | len /= sizeof(struct rds6_info_message); |
550 | else |
551 | len /= sizeof(struct rds_info_message); |
552 | |
553 | rcu_read_lock(); |
554 | |
555 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
556 | i++, head++) { |
557 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
558 | struct rds_conn_path *cp; |
559 | int npaths; |
560 | |
561 | if (!isv6 && conn->c_isv6) |
562 | continue; |
563 | |
564 | npaths = (conn->c_trans->t_mp_capable ? |
565 | RDS_MPATH_WORKERS : 1); |
566 | |
567 | for (j = 0; j < npaths; j++) { |
568 | cp = &conn->c_path[j]; |
569 | if (want_send) |
570 | list = &cp->cp_send_queue; |
571 | else |
572 | list = &cp->cp_retrans; |
573 | |
574 | spin_lock_irqsave(&cp->cp_lock, flags); |
575 | |
576 | /* XXX too lazy to maintain counts.. */ |
577 | list_for_each_entry(rm, list, m_conn_item) { |
578 | total++; |
579 | if (total <= len) |
						__rds_inc_msg_cp(&rm->m_inc,
								 iter,
								 &conn->c_laddr,
								 &conn->c_faddr,
								 0, isv6);
585 | } |
586 | |
				spin_unlock_irqrestore(&cp->cp_lock, flags);
588 | } |
589 | } |
590 | } |
591 | rcu_read_unlock(); |
592 | |
593 | lens->nr = total; |
594 | if (isv6) |
595 | lens->each = sizeof(struct rds6_info_message); |
596 | else |
597 | lens->each = sizeof(struct rds_info_message); |
598 | } |
599 | |
600 | static void rds_conn_message_info(struct socket *sock, unsigned int len, |
601 | struct rds_info_iterator *iter, |
602 | struct rds_info_lengths *lens, |
603 | int want_send) |
604 | { |
	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
606 | } |
607 | |
608 | #if IS_ENABLED(CONFIG_IPV6) |
609 | static void rds6_conn_message_info(struct socket *sock, unsigned int len, |
610 | struct rds_info_iterator *iter, |
611 | struct rds_info_lengths *lens, |
612 | int want_send) |
613 | { |
	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
615 | } |
616 | #endif |
617 | |
618 | static void rds_conn_message_info_send(struct socket *sock, unsigned int len, |
619 | struct rds_info_iterator *iter, |
620 | struct rds_info_lengths *lens) |
621 | { |
	rds_conn_message_info(sock, len, iter, lens, 1);
623 | } |
624 | |
625 | #if IS_ENABLED(CONFIG_IPV6) |
626 | static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, |
627 | struct rds_info_iterator *iter, |
628 | struct rds_info_lengths *lens) |
629 | { |
	rds6_conn_message_info(sock, len, iter, lens, 1);
631 | } |
632 | #endif |
633 | |
634 | static void rds_conn_message_info_retrans(struct socket *sock, |
635 | unsigned int len, |
636 | struct rds_info_iterator *iter, |
637 | struct rds_info_lengths *lens) |
638 | { |
	rds_conn_message_info(sock, len, iter, lens, 0);
640 | } |
641 | |
642 | #if IS_ENABLED(CONFIG_IPV6) |
643 | static void rds6_conn_message_info_retrans(struct socket *sock, |
644 | unsigned int len, |
645 | struct rds_info_iterator *iter, |
646 | struct rds_info_lengths *lens) |
647 | { |
	rds6_conn_message_info(sock, len, iter, lens, 0);
649 | } |
650 | #endif |
651 | |
652 | void rds_for_each_conn_info(struct socket *sock, unsigned int len, |
653 | struct rds_info_iterator *iter, |
654 | struct rds_info_lengths *lens, |
655 | int (*visitor)(struct rds_connection *, void *), |
656 | u64 *buffer, |
657 | size_t item_len) |
658 | { |
659 | struct hlist_head *head; |
660 | struct rds_connection *conn; |
661 | size_t i; |
662 | |
663 | rcu_read_lock(); |
664 | |
665 | lens->nr = 0; |
666 | lens->each = item_len; |
667 | |
668 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
669 | i++, head++) { |
670 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
671 | |
672 | /* XXX no c_lock usage.. */ |
673 | if (!visitor(conn, buffer)) |
674 | continue; |
675 | |
676 | /* We copy as much as we can fit in the buffer, |
677 | * but we count all items so that the caller |
678 | * can resize the buffer. */ |
679 | if (len >= item_len) { |
				rds_info_copy(iter, buffer, item_len);
681 | len -= item_len; |
682 | } |
683 | lens->nr++; |
684 | } |
685 | } |
686 | rcu_read_unlock(); |
687 | } |
688 | EXPORT_SYMBOL_GPL(rds_for_each_conn_info); |
689 | |
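/* Like rds_for_each_conn_info(), except that the visitor is handed a
 * single rds_conn_path rather than the connection. Only path 0 is
 * visited; see the XXX comment below about MPRDS reporting.
 */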
690 | static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, |
691 | struct rds_info_iterator *iter, |
692 | struct rds_info_lengths *lens, |
693 | int (*visitor)(struct rds_conn_path *, void *), |
694 | u64 *buffer, |
695 | size_t item_len) |
696 | { |
697 | struct hlist_head *head; |
698 | struct rds_connection *conn; |
699 | size_t i; |
700 | |
701 | rcu_read_lock(); |
702 | |
703 | lens->nr = 0; |
704 | lens->each = item_len; |
705 | |
706 | for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); |
707 | i++, head++) { |
708 | hlist_for_each_entry_rcu(conn, head, c_hash_node) { |
709 | struct rds_conn_path *cp; |
710 | |
711 | /* XXX We only copy the information from the first |
			 * path for now. The problem is that if there is
			 * more than one underlying path, we cannot report
			 * information for all of them using the existing
715 | * API. For example, there is only one next_tx_seq, |
716 | * which path's next_tx_seq should we report? It is |
717 | * a bug in the design of MPRDS. |
718 | */ |
719 | cp = conn->c_path; |
720 | |
721 | /* XXX no cp_lock usage.. */ |
722 | if (!visitor(cp, buffer)) |
723 | continue; |
724 | |
725 | /* We copy as much as we can fit in the buffer, |
726 | * but we count all items so that the caller |
727 | * can resize the buffer. |
728 | */ |
729 | if (len >= item_len) { |
				rds_info_copy(iter, buffer, item_len);
731 | len -= item_len; |
732 | } |
733 | lens->nr++; |
734 | } |
735 | } |
736 | rcu_read_unlock(); |
737 | } |
738 | |
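/* Fill in one rds_info_connection entry from a connection path.
 * Returns 1 if the entry was filled, 0 to skip it; IPv6 connections
 * are skipped since they are reported via the RDS6 listing instead.
 */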
739 | static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) |
740 | { |
741 | struct rds_info_connection *cinfo = buffer; |
742 | struct rds_connection *conn = cp->cp_conn; |
743 | |
744 | if (conn->c_isv6) |
745 | return 0; |
746 | |
747 | cinfo->next_tx_seq = cp->cp_next_tx_seq; |
748 | cinfo->next_rx_seq = cp->cp_next_rx_seq; |
749 | cinfo->laddr = conn->c_laddr.s6_addr32[3]; |
750 | cinfo->faddr = conn->c_faddr.s6_addr32[3]; |
751 | cinfo->tos = conn->c_tos; |
	strncpy(cinfo->transport, conn->c_trans->t_name,
		sizeof(cinfo->transport));
754 | cinfo->flags = 0; |
755 | |
756 | rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), |
757 | SENDING); |
758 | /* XXX Future: return the state rather than these funky bits */ |
759 | rds_conn_info_set(cinfo->flags, |
760 | atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, |
761 | CONNECTING); |
762 | rds_conn_info_set(cinfo->flags, |
763 | atomic_read(&cp->cp_state) == RDS_CONN_UP, |
764 | CONNECTED); |
765 | return 1; |
766 | } |
767 | |
768 | #if IS_ENABLED(CONFIG_IPV6) |
769 | static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) |
770 | { |
771 | struct rds6_info_connection *cinfo6 = buffer; |
772 | struct rds_connection *conn = cp->cp_conn; |
773 | |
774 | cinfo6->next_tx_seq = cp->cp_next_tx_seq; |
775 | cinfo6->next_rx_seq = cp->cp_next_rx_seq; |
776 | cinfo6->laddr = conn->c_laddr; |
777 | cinfo6->faddr = conn->c_faddr; |
	strncpy(cinfo6->transport, conn->c_trans->t_name,
		sizeof(cinfo6->transport));
780 | cinfo6->flags = 0; |
781 | |
782 | rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), |
783 | SENDING); |
784 | /* XXX Future: return the state rather than these funky bits */ |
785 | rds_conn_info_set(cinfo6->flags, |
786 | atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, |
787 | CONNECTING); |
788 | rds_conn_info_set(cinfo6->flags, |
789 | atomic_read(&cp->cp_state) == RDS_CONN_UP, |
790 | CONNECTED); |
791 | /* Just return 1 as there is no error case. This is a helper function |
792 | * for rds_walk_conn_path_info() and it wants a return value. |
793 | */ |
794 | return 1; |
795 | } |
796 | #endif |
797 | |
798 | static void rds_conn_info(struct socket *sock, unsigned int len, |
799 | struct rds_info_iterator *iter, |
800 | struct rds_info_lengths *lens) |
801 | { |
802 | u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; |
803 | |
	rds_walk_conn_path_info(sock, len, iter, lens,
				rds_conn_info_visitor,
				buffer,
				sizeof(struct rds_info_connection));
808 | } |
809 | |
810 | #if IS_ENABLED(CONFIG_IPV6) |
811 | static void rds6_conn_info(struct socket *sock, unsigned int len, |
812 | struct rds_info_iterator *iter, |
813 | struct rds_info_lengths *lens) |
814 | { |
815 | u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; |
816 | |
	rds_walk_conn_path_info(sock, len, iter, lens,
				rds6_conn_info_visitor,
				buffer,
				sizeof(struct rds6_info_connection));
821 | } |
822 | #endif |
823 | |
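/* Module init: register the loopback pernet ops, create the connection
 * slab and hook up the rds-info handlers. A slab allocation failure
 * unwinds the pernet registration before returning.
 */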
824 | int rds_conn_init(void) |
825 | { |
826 | int ret; |
827 | |
828 | ret = rds_loop_net_init(); /* register pernet callback */ |
829 | if (ret) |
830 | return ret; |
831 | |
	rds_conn_slab = kmem_cache_create("rds_connection",
					  sizeof(struct rds_connection),
					  0, 0, NULL);
835 | if (!rds_conn_slab) { |
836 | rds_loop_net_exit(); |
837 | return -ENOMEM; |
838 | } |
839 | |
	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
	rds_info_register_func(RDS_INFO_SEND_MESSAGES,
			       rds_conn_message_info_send);
	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
			       rds_conn_message_info_retrans);
845 | #if IS_ENABLED(CONFIG_IPV6) |
	rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
	rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
			       rds6_conn_message_info_send);
	rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
			       rds6_conn_message_info_retrans);
851 | #endif |
852 | return 0; |
853 | } |
854 | |
855 | void rds_conn_exit(void) |
856 | { |
857 | rds_loop_net_exit(); /* unregister pernet callback */ |
858 | rds_loop_exit(); |
859 | |
860 | WARN_ON(!hlist_empty(rds_conn_hash)); |
861 | |
	kmem_cache_destroy(rds_conn_slab);
863 | |
	rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
	rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
				 rds_conn_message_info_send);
	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
				 rds_conn_message_info_retrans);
869 | #if IS_ENABLED(CONFIG_IPV6) |
	rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
	rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
				 rds6_conn_message_info_send);
	rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
				 rds6_conn_message_info_retrans);
875 | #endif |
876 | } |
877 | |
878 | /* |
879 | * Force a disconnect |
880 | */ |
881 | void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) |
882 | { |
	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
884 | |
885 | rcu_read_lock(); |
	if (!destroy && rds_destroy_pending(cp->cp_conn)) {
887 | rcu_read_unlock(); |
888 | return; |
889 | } |
	queue_work(rds_wq, &cp->cp_down_w);
891 | rcu_read_unlock(); |
892 | } |
893 | EXPORT_SYMBOL_GPL(rds_conn_path_drop); |
894 | |
895 | void rds_conn_drop(struct rds_connection *conn) |
896 | { |
897 | WARN_ON(conn->c_trans->t_mp_capable); |
898 | rds_conn_path_drop(&conn->c_path[0], false); |
899 | } |
900 | EXPORT_SYMBOL_GPL(rds_conn_drop); |
901 | |
902 | /* |
903 | * If the connection is down, trigger a connect. We may have scheduled a |
904 | * delayed reconnect however - in this case we should not interfere. |
905 | */ |
906 | void rds_conn_path_connect_if_down(struct rds_conn_path *cp) |
907 | { |
908 | rcu_read_lock(); |
	if (rds_destroy_pending(cp->cp_conn)) {
910 | rcu_read_unlock(); |
911 | return; |
912 | } |
913 | if (rds_conn_path_state(cp) == RDS_CONN_DOWN && |
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
916 | rcu_read_unlock(); |
917 | } |
918 | EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); |
919 | |
/* Check connectivity of all paths. Note the do/while: path 0 is
 * always checked, even while c_npaths is still 0 before the MPRDS
 * handshake has established the real path count.
 */
922 | void rds_check_all_paths(struct rds_connection *conn) |
923 | { |
924 | int i = 0; |
925 | |
926 | do { |
927 | rds_conn_path_connect_if_down(&conn->c_path[i]); |
928 | } while (++i < conn->c_npaths); |
929 | } |
930 | |
931 | void rds_conn_connect_if_down(struct rds_connection *conn) |
932 | { |
933 | WARN_ON(conn->c_trans->t_mp_capable); |
934 | rds_conn_path_connect_if_down(&conn->c_path[0]); |
935 | } |
936 | EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); |
937 | |
938 | void |
939 | __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) |
940 | { |
941 | va_list ap; |
942 | |
943 | va_start(ap, fmt); |
	vprintk(fmt, ap);
945 | va_end(ap); |
946 | |
947 | rds_conn_path_drop(cp, false); |
948 | } |
949 | |