1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /****************************************************************************** |
3 | ******************************************************************************* |
4 | ** |
5 | ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. |
6 | ** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved. |
7 | ** |
8 | ** |
9 | ******************************************************************************* |
10 | ******************************************************************************/ |
11 | |
12 | /* |
13 | * midcomms.c |
14 | * |
 * This is the appallingly named "mid-level" comms layer. It takes care of
 * delivering "reliable" application-layer communication on top of the
 * lowcomms transport layer in use.
18 | * |
19 | * How it works: |
20 | * |
 * Each node keeps track of all sent DLM messages in send_queue with a sequence
 * number. The receiver will send a DLM_ACK message back for every DLM message
 * received at the other side. If a reconnect happens in lowcomms we will send
 * all unacknowledged dlm messages again. The receiving side might drop any
 * already received message by comparing sequence numbers.
26 | * |
27 | * How version detection works: |
28 | * |
 * Due to the fact that dlm has pre-configured node addresses on every side,
 * it is in its nature that every side connects at start to transmit
 * dlm messages, which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS
 * and their replies are the first messages which are exchanged. Due to
 * backwards compatibility these messages are not covered by the midcomms
 * re-transmission layer. These messages have their own re-transmission
 * handling in the dlm application layer. The version field of every node will
 * be set on these RCOM messages as soon as they arrive and the node isn't yet
 * part of the nodes hash. There also exists logic to detect a version mismatch
 * if something weird is going on or the first message isn't an expected one.
39 | * |
40 | * Termination: |
41 | * |
 * The midcomms layer does a 4 way handshake for termination on DLM protocol
 * like TCP supports it with half-closed socket support. SCTP doesn't support
 * half-closed sockets, so we do it on DLM layer. Also socket shutdown() can be
 * interrupted by e.g. a tcp reset itself. Additionally there exists the
 * othercon paradigm in lowcomms which cannot easily be changed without
 * breaking backwards compatibility. A node cannot send anything to another
 * node once a DLM_FIN message was sent. There exists additional logic to print
 * a warning if DLM wants to do it. There exists a state handling like RFC 793
 * but reduced to termination only. The event "member removal event" describes
 * that the cluster manager removed the node from internal lists; at this point
 * DLM does not send any message to the other node. There exist two cases:
53 | * |
54 | * 1. The cluster member was removed and we received a FIN |
55 | * OR |
56 | * 2. We received a FIN but the member was not removed yet |
57 | * |
58 | * One of these cases will do the CLOSE_WAIT to LAST_ACK change. |
59 | * |
60 | * |
61 | * +---------+ |
62 | * | CLOSED | |
63 | * +---------+ |
64 | * | add member/receive RCOM version |
65 | * | detection msg |
66 | * V |
67 | * +---------+ |
68 | * | ESTAB | |
69 | * +---------+ |
70 | * CLOSE | | rcv FIN |
71 | * ------- | | ------- |
72 | * +---------+ snd FIN / \ snd ACK +---------+ |
73 | * | FIN |<----------------- ------------------>| CLOSE | |
74 | * | WAIT-1 |------------------ | WAIT | |
75 | * +---------+ rcv FIN \ +---------+ |
76 | * | rcv ACK of FIN ------- | CLOSE | member |
77 | * | -------------- snd ACK | ------- | removal |
78 | * V x V snd FIN V event |
79 | * +---------+ +---------+ +---------+ |
80 | * |FINWAIT-2| | CLOSING | | LAST-ACK| |
81 | * +---------+ +---------+ +---------+ |
82 | * | rcv ACK of FIN | rcv ACK of FIN | |
83 | * | rcv FIN -------------- | -------------- | |
84 | * | ------- x V x V |
85 | * \ snd ACK +---------+ +---------+ |
86 | * ------------------------>| CLOSED | | CLOSED | |
87 | * +---------+ +---------+ |
88 | * |
 * NOTE: any state can be interrupted by midcomms_close() and the state will be
 * switched to CLOSED in case of fencing. There also exists some timeout
 * handling when we receive the version detection RCOM messages, which is
 * based on observation.
93 | * |
94 | * Future improvements: |
95 | * |
96 | * There exists some known issues/improvements of the dlm handling. Some |
97 | * of them should be done in a next major dlm version bump which makes |
98 | * it incompatible with previous versions. |
99 | * |
100 | * Unaligned memory access: |
101 | * |
 * There exist cases where the dlm message buffer length is not aligned
 * to 8 bytes. However it seems nobody has detected any problem with it.
 * This can be fixed in the next major version bump of dlm.
105 | * |
106 | * Version detection: |
107 | * |
108 | * The version detection and how it's done is related to backwards |
109 | * compatibility. There exists better ways to make a better handling. |
110 | * However this should be changed in the next major version bump of dlm. |
111 | * |
112 | * Tail Size checking: |
113 | * |
 * There exists a message tail payload in e.g. DLM_MSG, however we don't
 * check it against the message length yet with regard to the receive buffer
 * length. That needs to be validated.
117 | * |
118 | * Fencing bad nodes: |
119 | * |
120 | * At timeout places or weird sequence number behaviours we should send |
121 | * a fencing request to the cluster manager. |
122 | */ |
123 | |
/* Debug switch to enable a 5 second sleep while waiting for a termination.
 * This can be useful to test fencing while termination is running.
 * This requires a setup with only gfs2 as dlm user, so that the
 * last umount will terminate the connection.
 *
 * However it became useful to test: while the 5 seconds block in umount,
 * just press the reset button. With a lot of dropping, the termination
 * process can take several seconds.
 */
133 | #define DLM_DEBUG_FENCE_TERMINATION 0 |
134 | |
135 | #include <trace/events/dlm.h> |
136 | #include <net/tcp.h> |
137 | |
138 | #include "dlm_internal.h" |
139 | #include "lowcomms.h" |
140 | #include "config.h" |
141 | #include "memory.h" |
142 | #include "lock.h" |
143 | #include "util.h" |
144 | #include "midcomms.h" |
145 | |
/* init value for sequence numbers for testing purpose only e.g. overflows */
#define DLM_SEQ_INIT 0
/* 5 seconds wait to sync ending of dlm */
#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000)
/* value of midcomms_node->version until a protocol version was detected */
#define DLM_VERSION_NOT_SET 0
/* NOTE(review): presumably the delivered-message threshold used when acking
 * from the transmit path; its user is not part of this section - confirm.
 */
#define DLM_SEND_ACK_BACK_MSG_THRESHOLD 32
/* threshold passed to dlm_send_ack_threshold() on the receive path: an ack
 * is sent back once more than this many messages were delivered.
 */
#define DLM_RECV_ACK_BACK_MSG_THRESHOLD (DLM_SEND_ACK_BACK_MSG_THRESHOLD * 8)
153 | |
/* Per remote node state of the midcomms layer. Nodes live in node_hash and
 * are looked up by nodeid.
 */
struct midcomms_node {
	/* id of the remote node, key for the node_hash lookup */
	int nodeid;
	/* detected protocol version, DLM_VERSION_NOT_SET until the first
	 * message of that node was received
	 */
	uint32_t version;
	/* sequence number handed out to the next queued outgoing message */
	atomic_t seq_send;
	/* sequence number expected in the next incoming message */
	atomic_t seq_next;
	/* These queues are unbound because we cannot drop any message in dlm.
	 * We could send a fence signal for a specific node to the cluster
	 * manager if queues hits some maximum value, however this handling
	 * not supported yet.
	 */
	struct list_head send_queue;
	/* taken for every modification of send_queue, readers walk it
	 * under rcu_read_lock()
	 */
	spinlock_t send_queue_lock;
	/* number of entries in send_queue */
	atomic_t send_queue_cnt;
	/* bit numbers used in flags below */
	/* NOTE(review): no user of the CLOSE bit is visible in this section */
#define DLM_NODE_FLAG_CLOSE	1
	/* set by dlm_send_fin() when our DLM_FIN gets queued */
#define DLM_NODE_FLAG_STOP_TX	2
	/* set on the DLM_FIN receive paths that move to LAST_ACK or CLOSING */
#define DLM_NODE_FLAG_STOP_RX	3
	/* messages handed to dlm_receive_buffer() since the counter was
	 * last reset by a threshold triggered ack
	 */
	atomic_t ulp_delivered;
	unsigned long flags;
	/* woken up on node reset and when the version gets detected */
	wait_queue_head_t shutdown_wait;

	/* dlm tcp termination state */
#define DLM_CLOSED	1
#define DLM_ESTABLISHED	2
#define DLM_FIN_WAIT1	3
#define DLM_FIN_WAIT2	4
#define DLM_CLOSE_WAIT	5
#define DLM_LAST_ACK	6
#define DLM_CLOSING	7
	int state;
	/* held around the state transitions in this file */
	spinlock_t state_lock;

	/* counts how many lockspaces are using this node
	 * this refcount is necessary to determine if the
	 * node wants to disconnect.
	 */
	int users;

	/* not protected by srcu, node_hash lifetime */
	void *debugfs;

	/* linkage into a node_hash bucket */
	struct hlist_node hlist;
	/* NOTE(review): presumably for deferred freeing of the node, the
	 * user is not part of this section - confirm
	 */
	struct rcu_head rcu;
};
197 | |
/* Handle of one outgoing midcomms message. It is queued on the send_queue
 * of its node until dlm_receive_ack() or a flush removes it.
 */
struct dlm_mhandle {
	/* start of the inner dlm packet, right behind the dlm_opts header */
	const union dlm_packet *inner_p;
	/* node this message is sent to */
	struct midcomms_node *node;
	/* DLM_OPTS header at the beginning of the message buffer */
	struct dlm_opts *opts;
	/* lowcomms message, reference dropped in dlm_mhandle_release() */
	struct dlm_msg *msg;
	/* uncommitted handles are skipped when resending unacked messages */
	bool committed;
	/* sequence number taken from node->seq_send */
	uint32_t seq;

	/* optional callback, invoked by dlm_receive_ack() for an
	 * acknowledged message
	 */
	void (*ack_rcv)(struct midcomms_node *node);

	/* get_mhandle/commit srcu idx exchange */
	int idx;

	/* linkage into node->send_queue */
	struct list_head list;
	/* used by call_rcu() in dlm_mhandle_delete() */
	struct rcu_head rcu;
};
214 | |
/* all known midcomms nodes, the bucket is selected by nodeid_hash() */
static struct hlist_head node_hash[CONN_HASH_SIZE];
/* taken when a new node is added to node_hash */
static DEFINE_SPINLOCK(nodes_lock);
/* read side protection for node_hash lookups */
DEFINE_STATIC_SRCU(nodes_srcu);

/* This mutex prevents midcomms_close() from running concurrently with
 * stop() or remove(). I experienced invalid memory accesses when
 * DLM_DEBUG_FENCE_TERMINATION was enabled and machines were reset;
 * it ended in some double deletion in the nodes data structure.
 */
static DEFINE_MUTEX(close_lock);
226 | |
227 | struct kmem_cache *dlm_midcomms_cache_create(void) |
228 | { |
229 | return kmem_cache_create(name: "dlm_mhandle" , size: sizeof(struct dlm_mhandle), |
230 | align: 0, flags: 0, NULL); |
231 | } |
232 | |
233 | static inline const char *dlm_state_str(int state) |
234 | { |
235 | switch (state) { |
236 | case DLM_CLOSED: |
237 | return "CLOSED" ; |
238 | case DLM_ESTABLISHED: |
239 | return "ESTABLISHED" ; |
240 | case DLM_FIN_WAIT1: |
241 | return "FIN_WAIT1" ; |
242 | case DLM_FIN_WAIT2: |
243 | return "FIN_WAIT2" ; |
244 | case DLM_CLOSE_WAIT: |
245 | return "CLOSE_WAIT" ; |
246 | case DLM_LAST_ACK: |
247 | return "LAST_ACK" ; |
248 | case DLM_CLOSING: |
249 | return "CLOSING" ; |
250 | default: |
251 | return "UNKNOWN" ; |
252 | } |
253 | } |
254 | |
255 | const char *dlm_midcomms_state(struct midcomms_node *node) |
256 | { |
257 | return dlm_state_str(state: node->state); |
258 | } |
259 | |
260 | unsigned long dlm_midcomms_flags(struct midcomms_node *node) |
261 | { |
262 | return node->flags; |
263 | } |
264 | |
265 | int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) |
266 | { |
267 | return atomic_read(v: &node->send_queue_cnt); |
268 | } |
269 | |
270 | uint32_t dlm_midcomms_version(struct midcomms_node *node) |
271 | { |
272 | return node->version; |
273 | } |
274 | |
275 | static struct midcomms_node *__find_node(int nodeid, int r) |
276 | { |
277 | struct midcomms_node *node; |
278 | |
279 | hlist_for_each_entry_rcu(node, &node_hash[r], hlist) { |
280 | if (node->nodeid == nodeid) |
281 | return node; |
282 | } |
283 | |
284 | return NULL; |
285 | } |
286 | |
287 | static void dlm_mhandle_release(struct rcu_head *rcu) |
288 | { |
289 | struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); |
290 | |
291 | dlm_lowcomms_put_msg(msg: mh->msg); |
292 | dlm_free_mhandle(mhandle: mh); |
293 | } |
294 | |
295 | static void dlm_mhandle_delete(struct midcomms_node *node, |
296 | struct dlm_mhandle *mh) |
297 | { |
298 | list_del_rcu(entry: &mh->list); |
299 | atomic_dec(v: &node->send_queue_cnt); |
300 | call_rcu(head: &mh->rcu, func: dlm_mhandle_release); |
301 | } |
302 | |
303 | static void dlm_send_queue_flush(struct midcomms_node *node) |
304 | { |
305 | struct dlm_mhandle *mh; |
306 | |
307 | pr_debug("flush midcomms send queue of node %d\n" , node->nodeid); |
308 | |
309 | rcu_read_lock(); |
310 | spin_lock_bh(lock: &node->send_queue_lock); |
311 | list_for_each_entry_rcu(mh, &node->send_queue, list) { |
312 | dlm_mhandle_delete(node, mh); |
313 | } |
314 | spin_unlock_bh(lock: &node->send_queue_lock); |
315 | rcu_read_unlock(); |
316 | } |
317 | |
318 | static void midcomms_node_reset(struct midcomms_node *node) |
319 | { |
320 | pr_debug("reset node %d\n" , node->nodeid); |
321 | |
322 | atomic_set(v: &node->seq_next, DLM_SEQ_INIT); |
323 | atomic_set(v: &node->seq_send, DLM_SEQ_INIT); |
324 | atomic_set(v: &node->ulp_delivered, i: 0); |
325 | node->version = DLM_VERSION_NOT_SET; |
326 | node->flags = 0; |
327 | |
328 | dlm_send_queue_flush(node); |
329 | node->state = DLM_CLOSED; |
330 | wake_up(&node->shutdown_wait); |
331 | } |
332 | |
/* Look up the node with id @nodeid in the bucket given by nodeid_hash().
 * Returns NULL if no such node is known.
 */
static struct midcomms_node *nodeid2node(int nodeid)
{
	return __find_node(nodeid, nodeid_hash(nodeid));
}
337 | |
338 | int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) |
339 | { |
340 | int ret, idx, r = nodeid_hash(nodeid); |
341 | struct midcomms_node *node; |
342 | |
343 | ret = dlm_lowcomms_addr(nodeid, addr, len); |
344 | if (ret) |
345 | return ret; |
346 | |
347 | idx = srcu_read_lock(ssp: &nodes_srcu); |
348 | node = __find_node(nodeid, r); |
349 | if (node) { |
350 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
351 | return 0; |
352 | } |
353 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
354 | |
355 | node = kmalloc(size: sizeof(*node), GFP_NOFS); |
356 | if (!node) |
357 | return -ENOMEM; |
358 | |
359 | node->nodeid = nodeid; |
360 | spin_lock_init(&node->state_lock); |
361 | spin_lock_init(&node->send_queue_lock); |
362 | atomic_set(v: &node->send_queue_cnt, i: 0); |
363 | INIT_LIST_HEAD(list: &node->send_queue); |
364 | init_waitqueue_head(&node->shutdown_wait); |
365 | node->users = 0; |
366 | midcomms_node_reset(node); |
367 | |
368 | spin_lock(lock: &nodes_lock); |
369 | hlist_add_head_rcu(n: &node->hlist, h: &node_hash[r]); |
370 | spin_unlock(lock: &nodes_lock); |
371 | |
372 | node->debugfs = dlm_create_debug_comms_file(nodeid, data: node); |
373 | return 0; |
374 | } |
375 | |
376 | static int dlm_send_ack(int nodeid, uint32_t seq) |
377 | { |
378 | int mb_len = sizeof(struct dlm_header); |
379 | struct dlm_header *; |
380 | struct dlm_msg *msg; |
381 | char *ppc; |
382 | |
383 | msg = dlm_lowcomms_new_msg(nodeid, len: mb_len, GFP_ATOMIC, ppc: &ppc, |
384 | NULL, NULL); |
385 | if (!msg) |
386 | return -ENOMEM; |
387 | |
388 | m_header = (struct dlm_header *)ppc; |
389 | |
390 | m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); |
391 | m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); |
392 | m_header->h_length = cpu_to_le16(mb_len); |
393 | m_header->h_cmd = DLM_ACK; |
394 | m_header->u.h_seq = cpu_to_le32(seq); |
395 | |
396 | dlm_lowcomms_commit_msg(msg); |
397 | dlm_lowcomms_put_msg(msg); |
398 | |
399 | return 0; |
400 | } |
401 | |
402 | static void dlm_send_ack_threshold(struct midcomms_node *node, |
403 | uint32_t threshold) |
404 | { |
405 | uint32_t oval, nval; |
406 | bool send_ack; |
407 | |
408 | /* let only send one user trigger threshold to send ack back */ |
409 | do { |
410 | oval = atomic_read(v: &node->ulp_delivered); |
411 | send_ack = (oval > threshold); |
412 | /* abort if threshold is not reached */ |
413 | if (!send_ack) |
414 | break; |
415 | |
416 | nval = 0; |
417 | /* try to reset ulp_delivered counter */ |
418 | } while (atomic_cmpxchg(v: &node->ulp_delivered, old: oval, new: nval) != oval); |
419 | |
420 | if (send_ack) |
421 | dlm_send_ack(nodeid: node->nodeid, seq: atomic_read(v: &node->seq_next)); |
422 | } |
423 | |
424 | static int dlm_send_fin(struct midcomms_node *node, |
425 | void (*ack_rcv)(struct midcomms_node *node)) |
426 | { |
427 | int mb_len = sizeof(struct dlm_header); |
428 | struct dlm_header *; |
429 | struct dlm_mhandle *mh; |
430 | char *ppc; |
431 | |
432 | mh = dlm_midcomms_get_mhandle(nodeid: node->nodeid, len: mb_len, GFP_ATOMIC, ppc: &ppc); |
433 | if (!mh) |
434 | return -ENOMEM; |
435 | |
436 | set_bit(DLM_NODE_FLAG_STOP_TX, addr: &node->flags); |
437 | mh->ack_rcv = ack_rcv; |
438 | |
439 | m_header = (struct dlm_header *)ppc; |
440 | |
441 | m_header->h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); |
442 | m_header->h_nodeid = cpu_to_le32(dlm_our_nodeid()); |
443 | m_header->h_length = cpu_to_le16(mb_len); |
444 | m_header->h_cmd = DLM_FIN; |
445 | |
446 | pr_debug("sending fin msg to node %d\n" , node->nodeid); |
447 | dlm_midcomms_commit_mhandle(mh, NULL, namelen: 0); |
448 | |
449 | return 0; |
450 | } |
451 | |
452 | static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) |
453 | { |
454 | struct dlm_mhandle *mh; |
455 | |
456 | rcu_read_lock(); |
457 | list_for_each_entry_rcu(mh, &node->send_queue, list) { |
458 | if (before(seq1: mh->seq, seq2: seq)) { |
459 | if (mh->ack_rcv) |
460 | mh->ack_rcv(node); |
461 | } else { |
462 | /* send queue should be ordered */ |
463 | break; |
464 | } |
465 | } |
466 | |
467 | spin_lock_bh(lock: &node->send_queue_lock); |
468 | list_for_each_entry_rcu(mh, &node->send_queue, list) { |
469 | if (before(seq1: mh->seq, seq2: seq)) { |
470 | dlm_mhandle_delete(node, mh); |
471 | } else { |
472 | /* send queue should be ordered */ |
473 | break; |
474 | } |
475 | } |
476 | spin_unlock_bh(lock: &node->send_queue_lock); |
477 | rcu_read_unlock(); |
478 | } |
479 | |
480 | static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) |
481 | { |
482 | spin_lock(lock: &node->state_lock); |
483 | pr_debug("receive passive fin ack from node %d with state %s\n" , |
484 | node->nodeid, dlm_state_str(node->state)); |
485 | |
486 | switch (node->state) { |
487 | case DLM_LAST_ACK: |
488 | /* DLM_CLOSED */ |
489 | midcomms_node_reset(node); |
490 | break; |
491 | case DLM_CLOSED: |
492 | /* not valid but somehow we got what we want */ |
493 | wake_up(&node->shutdown_wait); |
494 | break; |
495 | default: |
496 | spin_unlock(lock: &node->state_lock); |
497 | log_print("%s: unexpected state: %d" , |
498 | __func__, node->state); |
499 | WARN_ON_ONCE(1); |
500 | return; |
501 | } |
502 | spin_unlock(lock: &node->state_lock); |
503 | } |
504 | |
505 | static void dlm_receive_buffer_3_2_trace(uint32_t seq, |
506 | const union dlm_packet *p) |
507 | { |
508 | switch (p->header.h_cmd) { |
509 | case DLM_MSG: |
510 | trace_dlm_recv_message(dst: dlm_our_nodeid(), h_seq: seq, ms: &p->message); |
511 | break; |
512 | case DLM_RCOM: |
513 | trace_dlm_recv_rcom(dst: dlm_our_nodeid(), h_seq: seq, rc: &p->rcom); |
514 | break; |
515 | default: |
516 | break; |
517 | } |
518 | } |
519 | |
520 | static void dlm_midcomms_receive_buffer(const union dlm_packet *p, |
521 | struct midcomms_node *node, |
522 | uint32_t seq) |
523 | { |
524 | bool is_expected_seq; |
525 | uint32_t oval, nval; |
526 | |
527 | do { |
528 | oval = atomic_read(v: &node->seq_next); |
529 | is_expected_seq = (oval == seq); |
530 | if (!is_expected_seq) |
531 | break; |
532 | |
533 | nval = oval + 1; |
534 | } while (atomic_cmpxchg(v: &node->seq_next, old: oval, new: nval) != oval); |
535 | |
536 | if (is_expected_seq) { |
537 | switch (p->header.h_cmd) { |
538 | case DLM_FIN: |
539 | spin_lock(lock: &node->state_lock); |
540 | pr_debug("receive fin msg from node %d with state %s\n" , |
541 | node->nodeid, dlm_state_str(node->state)); |
542 | |
543 | switch (node->state) { |
544 | case DLM_ESTABLISHED: |
545 | dlm_send_ack(nodeid: node->nodeid, seq: nval); |
546 | |
547 | /* passive shutdown DLM_LAST_ACK case 1 |
548 | * additional we check if the node is used by |
549 | * cluster manager events at all. |
550 | */ |
551 | if (node->users == 0) { |
552 | node->state = DLM_LAST_ACK; |
553 | pr_debug("switch node %d to state %s case 1\n" , |
554 | node->nodeid, dlm_state_str(node->state)); |
555 | set_bit(DLM_NODE_FLAG_STOP_RX, addr: &node->flags); |
556 | dlm_send_fin(node, ack_rcv: dlm_pas_fin_ack_rcv); |
557 | } else { |
558 | node->state = DLM_CLOSE_WAIT; |
559 | pr_debug("switch node %d to state %s\n" , |
560 | node->nodeid, dlm_state_str(node->state)); |
561 | } |
562 | break; |
563 | case DLM_FIN_WAIT1: |
564 | dlm_send_ack(nodeid: node->nodeid, seq: nval); |
565 | node->state = DLM_CLOSING; |
566 | set_bit(DLM_NODE_FLAG_STOP_RX, addr: &node->flags); |
567 | pr_debug("switch node %d to state %s\n" , |
568 | node->nodeid, dlm_state_str(node->state)); |
569 | break; |
570 | case DLM_FIN_WAIT2: |
571 | dlm_send_ack(nodeid: node->nodeid, seq: nval); |
572 | midcomms_node_reset(node); |
573 | pr_debug("switch node %d to state %s\n" , |
574 | node->nodeid, dlm_state_str(node->state)); |
575 | break; |
576 | case DLM_LAST_ACK: |
577 | /* probably remove_member caught it, do nothing */ |
578 | break; |
579 | default: |
580 | spin_unlock(lock: &node->state_lock); |
581 | log_print("%s: unexpected state: %d" , |
582 | __func__, node->state); |
583 | WARN_ON_ONCE(1); |
584 | return; |
585 | } |
586 | spin_unlock(lock: &node->state_lock); |
587 | break; |
588 | default: |
589 | WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); |
590 | dlm_receive_buffer_3_2_trace(seq, p); |
591 | dlm_receive_buffer(p, nodeid: node->nodeid); |
592 | atomic_inc(v: &node->ulp_delivered); |
593 | /* unlikely case to send ack back when we don't transmit */ |
594 | dlm_send_ack_threshold(node, DLM_RECV_ACK_BACK_MSG_THRESHOLD); |
595 | break; |
596 | } |
597 | } else { |
598 | /* retry to ack message which we already have by sending back |
599 | * current node->seq_next number as ack. |
600 | */ |
601 | if (seq < oval) |
602 | dlm_send_ack(nodeid: node->nodeid, seq: oval); |
603 | |
604 | log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d" , |
605 | seq, oval, node->nodeid); |
606 | } |
607 | } |
608 | |
609 | static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, |
610 | int nodeid) |
611 | { |
612 | int len = msglen; |
613 | |
614 | /* we only trust outer header msglen because |
615 | * it's checked against receive buffer length. |
616 | */ |
617 | if (len < sizeof(struct dlm_opts)) |
618 | return -1; |
619 | len -= sizeof(struct dlm_opts); |
620 | |
621 | if (len < le16_to_cpu(p->opts.o_optlen)) |
622 | return -1; |
623 | len -= le16_to_cpu(p->opts.o_optlen); |
624 | |
625 | switch (p->opts.o_nextcmd) { |
626 | case DLM_FIN: |
627 | if (len < sizeof(struct dlm_header)) { |
628 | log_print("fin too small: %d, will skip this message from node %d" , |
629 | len, nodeid); |
630 | return -1; |
631 | } |
632 | |
633 | break; |
634 | case DLM_MSG: |
635 | if (len < sizeof(struct dlm_message)) { |
636 | log_print("msg too small: %d, will skip this message from node %d" , |
637 | msglen, nodeid); |
638 | return -1; |
639 | } |
640 | |
641 | break; |
642 | case DLM_RCOM: |
643 | if (len < sizeof(struct dlm_rcom)) { |
644 | log_print("rcom msg too small: %d, will skip this message from node %d" , |
645 | len, nodeid); |
646 | return -1; |
647 | } |
648 | |
649 | break; |
650 | default: |
651 | log_print("unsupported o_nextcmd received: %u, will skip this message from node %d" , |
652 | p->opts.o_nextcmd, nodeid); |
653 | return -1; |
654 | } |
655 | |
656 | return 0; |
657 | } |
658 | |
659 | static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) |
660 | { |
661 | uint16_t msglen = le16_to_cpu(p->header.h_length); |
662 | struct midcomms_node *node; |
663 | uint32_t seq; |
664 | int ret, idx; |
665 | |
666 | idx = srcu_read_lock(ssp: &nodes_srcu); |
667 | node = nodeid2node(nodeid); |
668 | if (WARN_ON_ONCE(!node)) |
669 | goto out; |
670 | |
671 | switch (node->version) { |
672 | case DLM_VERSION_NOT_SET: |
673 | node->version = DLM_VERSION_3_2; |
674 | wake_up(&node->shutdown_wait); |
675 | log_print("version 0x%08x for node %d detected" , DLM_VERSION_3_2, |
676 | node->nodeid); |
677 | |
678 | spin_lock(lock: &node->state_lock); |
679 | switch (node->state) { |
680 | case DLM_CLOSED: |
681 | node->state = DLM_ESTABLISHED; |
682 | pr_debug("switch node %d to state %s\n" , |
683 | node->nodeid, dlm_state_str(node->state)); |
684 | break; |
685 | default: |
686 | break; |
687 | } |
688 | spin_unlock(lock: &node->state_lock); |
689 | |
690 | break; |
691 | case DLM_VERSION_3_2: |
692 | break; |
693 | default: |
694 | log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x" , |
695 | DLM_VERSION_3_2, node->nodeid, node->version); |
696 | goto out; |
697 | } |
698 | |
699 | switch (p->header.h_cmd) { |
700 | case DLM_RCOM: |
701 | /* these rcom message we use to determine version. |
702 | * they have their own retransmission handling and |
703 | * are the first messages of dlm. |
704 | * |
705 | * length already checked. |
706 | */ |
707 | switch (p->rcom.rc_type) { |
708 | case cpu_to_le32(DLM_RCOM_NAMES): |
709 | fallthrough; |
710 | case cpu_to_le32(DLM_RCOM_NAMES_REPLY): |
711 | fallthrough; |
712 | case cpu_to_le32(DLM_RCOM_STATUS): |
713 | fallthrough; |
714 | case cpu_to_le32(DLM_RCOM_STATUS_REPLY): |
715 | break; |
716 | default: |
717 | log_print("unsupported rcom type received: %u, will skip this message from node %d" , |
718 | le32_to_cpu(p->rcom.rc_type), nodeid); |
719 | goto out; |
720 | } |
721 | |
722 | WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); |
723 | dlm_receive_buffer(p, nodeid); |
724 | break; |
725 | case DLM_OPTS: |
726 | seq = le32_to_cpu(p->header.u.h_seq); |
727 | |
728 | ret = dlm_opts_check_msglen(p, msglen, nodeid); |
729 | if (ret < 0) { |
730 | log_print("opts msg too small: %u, will skip this message from node %d" , |
731 | msglen, nodeid); |
732 | goto out; |
733 | } |
734 | |
735 | p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + |
736 | le16_to_cpu(p->opts.o_optlen)); |
737 | |
738 | /* recheck inner msglen just if it's not garbage */ |
739 | msglen = le16_to_cpu(p->header.h_length); |
740 | switch (p->header.h_cmd) { |
741 | case DLM_RCOM: |
742 | if (msglen < sizeof(struct dlm_rcom)) { |
743 | log_print("inner rcom msg too small: %u, will skip this message from node %d" , |
744 | msglen, nodeid); |
745 | goto out; |
746 | } |
747 | |
748 | break; |
749 | case DLM_MSG: |
750 | if (msglen < sizeof(struct dlm_message)) { |
751 | log_print("inner msg too small: %u, will skip this message from node %d" , |
752 | msglen, nodeid); |
753 | goto out; |
754 | } |
755 | |
756 | break; |
757 | case DLM_FIN: |
758 | if (msglen < sizeof(struct dlm_header)) { |
759 | log_print("inner fin too small: %u, will skip this message from node %d" , |
760 | msglen, nodeid); |
761 | goto out; |
762 | } |
763 | |
764 | break; |
765 | default: |
766 | log_print("unsupported inner h_cmd received: %u, will skip this message from node %d" , |
767 | msglen, nodeid); |
768 | goto out; |
769 | } |
770 | |
771 | dlm_midcomms_receive_buffer(p, node, seq); |
772 | break; |
773 | case DLM_ACK: |
774 | seq = le32_to_cpu(p->header.u.h_seq); |
775 | dlm_receive_ack(node, seq); |
776 | break; |
777 | default: |
778 | log_print("unsupported h_cmd received: %u, will skip this message from node %d" , |
779 | p->header.h_cmd, nodeid); |
780 | break; |
781 | } |
782 | |
783 | out: |
784 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
785 | } |
786 | |
787 | static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) |
788 | { |
789 | uint16_t msglen = le16_to_cpu(p->header.h_length); |
790 | struct midcomms_node *node; |
791 | int idx; |
792 | |
793 | idx = srcu_read_lock(ssp: &nodes_srcu); |
794 | node = nodeid2node(nodeid); |
795 | if (WARN_ON_ONCE(!node)) { |
796 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
797 | return; |
798 | } |
799 | |
800 | switch (node->version) { |
801 | case DLM_VERSION_NOT_SET: |
802 | node->version = DLM_VERSION_3_1; |
803 | wake_up(&node->shutdown_wait); |
804 | log_print("version 0x%08x for node %d detected" , DLM_VERSION_3_1, |
805 | node->nodeid); |
806 | break; |
807 | case DLM_VERSION_3_1: |
808 | break; |
809 | default: |
810 | log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x" , |
811 | DLM_VERSION_3_1, node->nodeid, node->version); |
812 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
813 | return; |
814 | } |
815 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
816 | |
817 | switch (p->header.h_cmd) { |
818 | case DLM_RCOM: |
819 | /* length already checked */ |
820 | break; |
821 | case DLM_MSG: |
822 | if (msglen < sizeof(struct dlm_message)) { |
823 | log_print("msg too small: %u, will skip this message from node %d" , |
824 | msglen, nodeid); |
825 | return; |
826 | } |
827 | |
828 | break; |
829 | default: |
830 | log_print("unsupported h_cmd received: %u, will skip this message from node %d" , |
831 | p->header.h_cmd, nodeid); |
832 | return; |
833 | } |
834 | |
835 | dlm_receive_buffer(p, nodeid); |
836 | } |
837 | |
838 | int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len) |
839 | { |
840 | const unsigned char *ptr = buf; |
841 | const struct dlm_header *hd; |
842 | uint16_t msglen; |
843 | int ret = 0; |
844 | |
845 | while (len >= sizeof(struct dlm_header)) { |
846 | hd = (struct dlm_header *)ptr; |
847 | |
848 | /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or |
849 | * less than dlm_header size. |
850 | * |
851 | * Some messages does not have a 8 byte length boundary yet |
852 | * which can occur in a unaligned memory access of some dlm |
853 | * messages. However this problem need to be fixed at the |
854 | * sending side, for now it seems nobody run into architecture |
855 | * related issues yet but it slows down some processing. |
856 | * Fixing this issue should be scheduled in future by doing |
857 | * the next major version bump. |
858 | */ |
859 | msglen = le16_to_cpu(hd->h_length); |
860 | if (msglen > DLM_MAX_SOCKET_BUFSIZE || |
861 | msglen < sizeof(struct dlm_header)) { |
862 | log_print("received invalid length header: %u from node %d, will abort message parsing" , |
863 | msglen, nodeid); |
864 | return -EBADMSG; |
865 | } |
866 | |
867 | /* caller will take care that leftover |
868 | * will be parsed next call with more data |
869 | */ |
870 | if (msglen > len) |
871 | break; |
872 | |
873 | ret += msglen; |
874 | len -= msglen; |
875 | ptr += msglen; |
876 | } |
877 | |
878 | return ret; |
879 | } |
880 | |
881 | /* |
882 | * Called from the low-level comms layer to process a buffer of |
883 | * commands. |
884 | */ |
885 | int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) |
886 | { |
887 | const unsigned char *ptr = buf; |
888 | const struct dlm_header *hd; |
889 | uint16_t msglen; |
890 | int ret = 0; |
891 | |
892 | while (len >= sizeof(struct dlm_header)) { |
893 | hd = (struct dlm_header *)ptr; |
894 | |
895 | msglen = le16_to_cpu(hd->h_length); |
896 | if (msglen > len) |
897 | break; |
898 | |
899 | switch (hd->h_version) { |
900 | case cpu_to_le32(DLM_VERSION_3_1): |
901 | dlm_midcomms_receive_buffer_3_1(p: (const union dlm_packet *)ptr, nodeid); |
902 | break; |
903 | case cpu_to_le32(DLM_VERSION_3_2): |
904 | dlm_midcomms_receive_buffer_3_2(p: (const union dlm_packet *)ptr, nodeid); |
905 | break; |
906 | default: |
907 | log_print("received invalid version header: %u from node %d, will skip this message" , |
908 | le32_to_cpu(hd->h_version), nodeid); |
909 | break; |
910 | } |
911 | |
912 | ret += msglen; |
913 | len -= msglen; |
914 | ptr += msglen; |
915 | } |
916 | |
917 | return ret; |
918 | } |
919 | |
920 | void dlm_midcomms_unack_msg_resend(int nodeid) |
921 | { |
922 | struct midcomms_node *node; |
923 | struct dlm_mhandle *mh; |
924 | int idx, ret; |
925 | |
926 | idx = srcu_read_lock(ssp: &nodes_srcu); |
927 | node = nodeid2node(nodeid); |
928 | if (WARN_ON_ONCE(!node)) { |
929 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
930 | return; |
931 | } |
932 | |
933 | /* old protocol, we don't support to retransmit on failure */ |
934 | switch (node->version) { |
935 | case DLM_VERSION_3_2: |
936 | break; |
937 | default: |
938 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
939 | return; |
940 | } |
941 | |
942 | rcu_read_lock(); |
943 | list_for_each_entry_rcu(mh, &node->send_queue, list) { |
944 | if (!mh->committed) |
945 | continue; |
946 | |
947 | ret = dlm_lowcomms_resend_msg(msg: mh->msg); |
948 | if (!ret) |
949 | log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d" , |
950 | mh->seq, node->nodeid); |
951 | } |
952 | rcu_read_unlock(); |
953 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
954 | } |
955 | |
956 | static void (struct dlm_opts *opts, uint16_t inner_len, |
957 | uint32_t seq) |
958 | { |
959 | opts->o_header.h_cmd = DLM_OPTS; |
960 | opts->o_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR); |
961 | opts->o_header.h_nodeid = cpu_to_le32(dlm_our_nodeid()); |
962 | opts->o_header.h_length = cpu_to_le16(DLM_MIDCOMMS_OPT_LEN + inner_len); |
963 | opts->o_header.u.h_seq = cpu_to_le32(seq); |
964 | } |
965 | |
966 | static void midcomms_new_msg_cb(void *data) |
967 | { |
968 | struct dlm_mhandle *mh = data; |
969 | |
970 | atomic_inc(v: &mh->node->send_queue_cnt); |
971 | |
972 | spin_lock_bh(lock: &mh->node->send_queue_lock); |
973 | list_add_tail_rcu(new: &mh->list, head: &mh->node->send_queue); |
974 | spin_unlock_bh(lock: &mh->node->send_queue_lock); |
975 | |
976 | mh->seq = atomic_fetch_inc(v: &mh->node->seq_send); |
977 | } |
978 | |
979 | static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, |
980 | int len, gfp_t allocation, char **ppc) |
981 | { |
982 | struct dlm_opts *opts; |
983 | struct dlm_msg *msg; |
984 | |
985 | msg = dlm_lowcomms_new_msg(nodeid, len: len + DLM_MIDCOMMS_OPT_LEN, |
986 | allocation, ppc, cb: midcomms_new_msg_cb, data: mh); |
987 | if (!msg) |
988 | return NULL; |
989 | |
990 | opts = (struct dlm_opts *)*ppc; |
991 | mh->opts = opts; |
992 | |
993 | /* add possible options here */ |
994 | dlm_fill_opts_header(opts, inner_len: len, seq: mh->seq); |
995 | |
996 | *ppc += sizeof(*opts); |
997 | mh->inner_p = (const union dlm_packet *)*ppc; |
998 | return msg; |
999 | } |
1000 | |
1001 | /* avoid false positive for nodes_srcu, unlock happens in |
1002 | * dlm_midcomms_commit_mhandle which is a must call if success |
1003 | */ |
1004 | #ifndef __CHECKER__ |
1005 | struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, |
1006 | gfp_t allocation, char **ppc) |
1007 | { |
1008 | struct midcomms_node *node; |
1009 | struct dlm_mhandle *mh; |
1010 | struct dlm_msg *msg; |
1011 | int idx; |
1012 | |
1013 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1014 | node = nodeid2node(nodeid); |
1015 | if (WARN_ON_ONCE(!node)) |
1016 | goto err; |
1017 | |
1018 | /* this is a bug, however we going on and hope it will be resolved */ |
1019 | WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); |
1020 | |
1021 | mh = dlm_allocate_mhandle(allocation); |
1022 | if (!mh) |
1023 | goto err; |
1024 | |
1025 | mh->committed = false; |
1026 | mh->ack_rcv = NULL; |
1027 | mh->idx = idx; |
1028 | mh->node = node; |
1029 | |
1030 | switch (node->version) { |
1031 | case DLM_VERSION_3_1: |
1032 | msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, |
1033 | NULL, NULL); |
1034 | if (!msg) { |
1035 | dlm_free_mhandle(mhandle: mh); |
1036 | goto err; |
1037 | } |
1038 | |
1039 | break; |
1040 | case DLM_VERSION_3_2: |
1041 | /* send ack back if necessary */ |
1042 | dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); |
1043 | |
1044 | msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, |
1045 | ppc); |
1046 | if (!msg) { |
1047 | dlm_free_mhandle(mhandle: mh); |
1048 | goto err; |
1049 | } |
1050 | break; |
1051 | default: |
1052 | dlm_free_mhandle(mhandle: mh); |
1053 | WARN_ON_ONCE(1); |
1054 | goto err; |
1055 | } |
1056 | |
1057 | mh->msg = msg; |
1058 | |
1059 | /* keep in mind that is a must to call |
1060 | * dlm_midcomms_commit_msg() which releases |
1061 | * nodes_srcu using mh->idx which is assumed |
1062 | * here that the application will call it. |
1063 | */ |
1064 | return mh; |
1065 | |
1066 | err: |
1067 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1068 | return NULL; |
1069 | } |
1070 | #endif |
1071 | |
1072 | static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh, |
1073 | const void *name, int namelen) |
1074 | { |
1075 | switch (mh->inner_p->header.h_cmd) { |
1076 | case DLM_MSG: |
1077 | trace_dlm_send_message(dst: mh->node->nodeid, h_seq: mh->seq, |
1078 | ms: &mh->inner_p->message, |
1079 | name, namelen); |
1080 | break; |
1081 | case DLM_RCOM: |
1082 | trace_dlm_send_rcom(dst: mh->node->nodeid, h_seq: mh->seq, |
1083 | rc: &mh->inner_p->rcom); |
1084 | break; |
1085 | default: |
1086 | /* nothing to trace */ |
1087 | break; |
1088 | } |
1089 | } |
1090 | |
1091 | static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh, |
1092 | const void *name, int namelen) |
1093 | { |
1094 | /* nexthdr chain for fast lookup */ |
1095 | mh->opts->o_nextcmd = mh->inner_p->header.h_cmd; |
1096 | mh->committed = true; |
1097 | dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen); |
1098 | dlm_lowcomms_commit_msg(msg: mh->msg); |
1099 | } |
1100 | |
1101 | /* avoid false positive for nodes_srcu, lock was happen in |
1102 | * dlm_midcomms_get_mhandle |
1103 | */ |
1104 | #ifndef __CHECKER__ |
1105 | void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, |
1106 | const void *name, int namelen) |
1107 | { |
1108 | |
1109 | switch (mh->node->version) { |
1110 | case DLM_VERSION_3_1: |
1111 | srcu_read_unlock(ssp: &nodes_srcu, idx: mh->idx); |
1112 | |
1113 | dlm_lowcomms_commit_msg(msg: mh->msg); |
1114 | dlm_lowcomms_put_msg(msg: mh->msg); |
1115 | /* mh is not part of rcu list in this case */ |
1116 | dlm_free_mhandle(mhandle: mh); |
1117 | break; |
1118 | case DLM_VERSION_3_2: |
1119 | /* held rcu read lock here, because we sending the |
1120 | * dlm message out, when we do that we could receive |
1121 | * an ack back which releases the mhandle and we |
1122 | * get a use after free. |
1123 | */ |
1124 | rcu_read_lock(); |
1125 | dlm_midcomms_commit_msg_3_2(mh, name, namelen); |
1126 | srcu_read_unlock(ssp: &nodes_srcu, idx: mh->idx); |
1127 | rcu_read_unlock(); |
1128 | break; |
1129 | default: |
1130 | srcu_read_unlock(ssp: &nodes_srcu, idx: mh->idx); |
1131 | WARN_ON_ONCE(1); |
1132 | break; |
1133 | } |
1134 | } |
1135 | #endif |
1136 | |
/* Start the comms stack; the result of dlm_lowcomms_start() is returned. */
int dlm_midcomms_start(void)
{
	int rc = dlm_lowcomms_start();

	return rc;
}
1141 | |
/* Stop the comms stack, this only forwards to dlm_lowcomms_stop(). */
void dlm_midcomms_stop(void)
{
	dlm_lowcomms_stop();
}
1146 | |
1147 | void dlm_midcomms_init(void) |
1148 | { |
1149 | int i; |
1150 | |
1151 | for (i = 0; i < CONN_HASH_SIZE; i++) |
1152 | INIT_HLIST_HEAD(&node_hash[i]); |
1153 | |
1154 | dlm_lowcomms_init(); |
1155 | } |
1156 | |
1157 | static void midcomms_node_release(struct rcu_head *rcu) |
1158 | { |
1159 | struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); |
1160 | |
1161 | WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); |
1162 | dlm_send_queue_flush(node); |
1163 | kfree(objp: node); |
1164 | } |
1165 | |
1166 | void dlm_midcomms_exit(void) |
1167 | { |
1168 | struct midcomms_node *node; |
1169 | int i, idx; |
1170 | |
1171 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1172 | for (i = 0; i < CONN_HASH_SIZE; i++) { |
1173 | hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { |
1174 | dlm_delete_debug_comms_file(ctx: node->debugfs); |
1175 | |
1176 | spin_lock(lock: &nodes_lock); |
1177 | hlist_del_rcu(n: &node->hlist); |
1178 | spin_unlock(lock: &nodes_lock); |
1179 | |
1180 | call_srcu(ssp: &nodes_srcu, head: &node->rcu, func: midcomms_node_release); |
1181 | } |
1182 | } |
1183 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1184 | |
1185 | dlm_lowcomms_exit(); |
1186 | } |
1187 | |
1188 | static void dlm_act_fin_ack_rcv(struct midcomms_node *node) |
1189 | { |
1190 | spin_lock(lock: &node->state_lock); |
1191 | pr_debug("receive active fin ack from node %d with state %s\n" , |
1192 | node->nodeid, dlm_state_str(node->state)); |
1193 | |
1194 | switch (node->state) { |
1195 | case DLM_FIN_WAIT1: |
1196 | node->state = DLM_FIN_WAIT2; |
1197 | pr_debug("switch node %d to state %s\n" , |
1198 | node->nodeid, dlm_state_str(node->state)); |
1199 | break; |
1200 | case DLM_CLOSING: |
1201 | midcomms_node_reset(node); |
1202 | pr_debug("switch node %d to state %s\n" , |
1203 | node->nodeid, dlm_state_str(node->state)); |
1204 | break; |
1205 | case DLM_CLOSED: |
1206 | /* not valid but somehow we got what we want */ |
1207 | wake_up(&node->shutdown_wait); |
1208 | break; |
1209 | default: |
1210 | spin_unlock(lock: &node->state_lock); |
1211 | log_print("%s: unexpected state: %d" , |
1212 | __func__, node->state); |
1213 | WARN_ON_ONCE(1); |
1214 | return; |
1215 | } |
1216 | spin_unlock(lock: &node->state_lock); |
1217 | } |
1218 | |
1219 | void dlm_midcomms_add_member(int nodeid) |
1220 | { |
1221 | struct midcomms_node *node; |
1222 | int idx; |
1223 | |
1224 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1225 | node = nodeid2node(nodeid); |
1226 | if (WARN_ON_ONCE(!node)) { |
1227 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1228 | return; |
1229 | } |
1230 | |
1231 | spin_lock(lock: &node->state_lock); |
1232 | if (!node->users) { |
1233 | pr_debug("receive add member from node %d with state %s\n" , |
1234 | node->nodeid, dlm_state_str(node->state)); |
1235 | switch (node->state) { |
1236 | case DLM_ESTABLISHED: |
1237 | break; |
1238 | case DLM_CLOSED: |
1239 | node->state = DLM_ESTABLISHED; |
1240 | pr_debug("switch node %d to state %s\n" , |
1241 | node->nodeid, dlm_state_str(node->state)); |
1242 | break; |
1243 | default: |
1244 | /* some invalid state passive shutdown |
1245 | * was failed, we try to reset and |
1246 | * hope it will go on. |
1247 | */ |
1248 | log_print("reset node %d because shutdown stuck" , |
1249 | node->nodeid); |
1250 | |
1251 | midcomms_node_reset(node); |
1252 | node->state = DLM_ESTABLISHED; |
1253 | break; |
1254 | } |
1255 | } |
1256 | |
1257 | node->users++; |
1258 | pr_debug("node %d users inc count %d\n" , nodeid, node->users); |
1259 | spin_unlock(lock: &node->state_lock); |
1260 | |
1261 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1262 | } |
1263 | |
1264 | void dlm_midcomms_remove_member(int nodeid) |
1265 | { |
1266 | struct midcomms_node *node; |
1267 | int idx; |
1268 | |
1269 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1270 | node = nodeid2node(nodeid); |
1271 | /* in case of dlm_midcomms_close() removes node */ |
1272 | if (!node) { |
1273 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1274 | return; |
1275 | } |
1276 | |
1277 | spin_lock(lock: &node->state_lock); |
1278 | /* case of dlm_midcomms_addr() created node but |
1279 | * was not added before because dlm_midcomms_close() |
1280 | * removed the node |
1281 | */ |
1282 | if (!node->users) { |
1283 | spin_unlock(lock: &node->state_lock); |
1284 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1285 | return; |
1286 | } |
1287 | |
1288 | node->users--; |
1289 | pr_debug("node %d users dec count %d\n" , nodeid, node->users); |
1290 | |
1291 | /* hitting users count to zero means the |
1292 | * other side is running dlm_midcomms_stop() |
1293 | * we meet us to have a clean disconnect. |
1294 | */ |
1295 | if (node->users == 0) { |
1296 | pr_debug("receive remove member from node %d with state %s\n" , |
1297 | node->nodeid, dlm_state_str(node->state)); |
1298 | switch (node->state) { |
1299 | case DLM_ESTABLISHED: |
1300 | break; |
1301 | case DLM_CLOSE_WAIT: |
1302 | /* passive shutdown DLM_LAST_ACK case 2 */ |
1303 | node->state = DLM_LAST_ACK; |
1304 | pr_debug("switch node %d to state %s case 2\n" , |
1305 | node->nodeid, dlm_state_str(node->state)); |
1306 | set_bit(DLM_NODE_FLAG_STOP_RX, addr: &node->flags); |
1307 | dlm_send_fin(node, ack_rcv: dlm_pas_fin_ack_rcv); |
1308 | break; |
1309 | case DLM_LAST_ACK: |
1310 | /* probably receive fin caught it, do nothing */ |
1311 | break; |
1312 | case DLM_CLOSED: |
1313 | /* already gone, do nothing */ |
1314 | break; |
1315 | default: |
1316 | log_print("%s: unexpected state: %d" , |
1317 | __func__, node->state); |
1318 | break; |
1319 | } |
1320 | } |
1321 | spin_unlock(lock: &node->state_lock); |
1322 | |
1323 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1324 | } |
1325 | |
1326 | void dlm_midcomms_version_wait(void) |
1327 | { |
1328 | struct midcomms_node *node; |
1329 | int i, idx, ret; |
1330 | |
1331 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1332 | for (i = 0; i < CONN_HASH_SIZE; i++) { |
1333 | hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { |
1334 | ret = wait_event_timeout(node->shutdown_wait, |
1335 | node->version != DLM_VERSION_NOT_SET || |
1336 | node->state == DLM_CLOSED || |
1337 | test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), |
1338 | DLM_SHUTDOWN_TIMEOUT); |
1339 | if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) |
1340 | pr_debug("version wait timed out for node %d with state %s\n" , |
1341 | node->nodeid, dlm_state_str(node->state)); |
1342 | } |
1343 | } |
1344 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1345 | } |
1346 | |
1347 | static void midcomms_shutdown(struct midcomms_node *node) |
1348 | { |
1349 | int ret; |
1350 | |
1351 | /* old protocol, we don't wait for pending operations */ |
1352 | switch (node->version) { |
1353 | case DLM_VERSION_3_2: |
1354 | break; |
1355 | default: |
1356 | return; |
1357 | } |
1358 | |
1359 | spin_lock(lock: &node->state_lock); |
1360 | pr_debug("receive active shutdown for node %d with state %s\n" , |
1361 | node->nodeid, dlm_state_str(node->state)); |
1362 | switch (node->state) { |
1363 | case DLM_ESTABLISHED: |
1364 | node->state = DLM_FIN_WAIT1; |
1365 | pr_debug("switch node %d to state %s case 2\n" , |
1366 | node->nodeid, dlm_state_str(node->state)); |
1367 | dlm_send_fin(node, ack_rcv: dlm_act_fin_ack_rcv); |
1368 | break; |
1369 | case DLM_CLOSED: |
1370 | /* we have what we want */ |
1371 | break; |
1372 | default: |
1373 | /* busy to enter DLM_FIN_WAIT1, wait until passive |
1374 | * done in shutdown_wait to enter DLM_CLOSED. |
1375 | */ |
1376 | break; |
1377 | } |
1378 | spin_unlock(lock: &node->state_lock); |
1379 | |
1380 | if (DLM_DEBUG_FENCE_TERMINATION) |
1381 | msleep(msecs: 5000); |
1382 | |
1383 | /* wait for other side dlm + fin */ |
1384 | ret = wait_event_timeout(node->shutdown_wait, |
1385 | node->state == DLM_CLOSED || |
1386 | test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), |
1387 | DLM_SHUTDOWN_TIMEOUT); |
1388 | if (!ret) |
1389 | pr_debug("active shutdown timed out for node %d with state %s\n" , |
1390 | node->nodeid, dlm_state_str(node->state)); |
1391 | else |
1392 | pr_debug("active shutdown done for node %d with state %s\n" , |
1393 | node->nodeid, dlm_state_str(node->state)); |
1394 | } |
1395 | |
1396 | void dlm_midcomms_shutdown(void) |
1397 | { |
1398 | struct midcomms_node *node; |
1399 | int i, idx; |
1400 | |
1401 | mutex_lock(&close_lock); |
1402 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1403 | for (i = 0; i < CONN_HASH_SIZE; i++) { |
1404 | hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { |
1405 | midcomms_shutdown(node); |
1406 | } |
1407 | } |
1408 | |
1409 | dlm_lowcomms_shutdown(); |
1410 | |
1411 | for (i = 0; i < CONN_HASH_SIZE; i++) { |
1412 | hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { |
1413 | midcomms_node_reset(node); |
1414 | } |
1415 | } |
1416 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1417 | mutex_unlock(lock: &close_lock); |
1418 | } |
1419 | |
1420 | int dlm_midcomms_close(int nodeid) |
1421 | { |
1422 | struct midcomms_node *node; |
1423 | int idx, ret; |
1424 | |
1425 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1426 | /* Abort pending close/remove operation */ |
1427 | node = nodeid2node(nodeid); |
1428 | if (node) { |
1429 | /* let shutdown waiters leave */ |
1430 | set_bit(DLM_NODE_FLAG_CLOSE, addr: &node->flags); |
1431 | wake_up(&node->shutdown_wait); |
1432 | } |
1433 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1434 | |
1435 | synchronize_srcu(ssp: &nodes_srcu); |
1436 | |
1437 | mutex_lock(&close_lock); |
1438 | idx = srcu_read_lock(ssp: &nodes_srcu); |
1439 | node = nodeid2node(nodeid); |
1440 | if (!node) { |
1441 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1442 | mutex_unlock(lock: &close_lock); |
1443 | return dlm_lowcomms_close(nodeid); |
1444 | } |
1445 | |
1446 | ret = dlm_lowcomms_close(nodeid); |
1447 | dlm_delete_debug_comms_file(ctx: node->debugfs); |
1448 | |
1449 | spin_lock(lock: &nodes_lock); |
1450 | hlist_del_rcu(n: &node->hlist); |
1451 | spin_unlock(lock: &nodes_lock); |
1452 | srcu_read_unlock(ssp: &nodes_srcu, idx); |
1453 | |
1454 | /* wait that all readers left until flush send queue */ |
1455 | synchronize_srcu(ssp: &nodes_srcu); |
1456 | |
1457 | /* drop all pending dlm messages, this is fine as |
1458 | * this function get called when the node is fenced |
1459 | */ |
1460 | dlm_send_queue_flush(node); |
1461 | |
1462 | call_srcu(ssp: &nodes_srcu, head: &node->rcu, func: midcomms_node_release); |
1463 | mutex_unlock(lock: &close_lock); |
1464 | |
1465 | return ret; |
1466 | } |
1467 | |
/* debug functionality to send raw dlm msg from user space
 *
 * Context handed to midcomms_new_rawmsg_cb() by dlm_midcomms_rawmsg_send().
 */
struct dlm_rawmsg_data {
	/* node whose seq_send supplies a sequence number if needed */
	struct midcomms_node *node;
	/* raw message, read by the callback as a struct dlm_header */
	void *buf;
};
1473 | |
1474 | static void midcomms_new_rawmsg_cb(void *data) |
1475 | { |
1476 | struct dlm_rawmsg_data *rd = data; |
1477 | struct dlm_header *h = rd->buf; |
1478 | |
1479 | switch (h->h_version) { |
1480 | case cpu_to_le32(DLM_VERSION_3_1): |
1481 | break; |
1482 | default: |
1483 | switch (h->h_cmd) { |
1484 | case DLM_OPTS: |
1485 | if (!h->u.h_seq) |
1486 | h->u.h_seq = cpu_to_le32(atomic_fetch_inc(&rd->node->seq_send)); |
1487 | break; |
1488 | default: |
1489 | break; |
1490 | } |
1491 | break; |
1492 | } |
1493 | } |
1494 | |
1495 | int dlm_midcomms_rawmsg_send(struct midcomms_node *node, void *buf, |
1496 | int buflen) |
1497 | { |
1498 | struct dlm_rawmsg_data rd; |
1499 | struct dlm_msg *msg; |
1500 | char *msgbuf; |
1501 | |
1502 | rd.node = node; |
1503 | rd.buf = buf; |
1504 | |
1505 | msg = dlm_lowcomms_new_msg(nodeid: node->nodeid, len: buflen, GFP_NOFS, |
1506 | ppc: &msgbuf, cb: midcomms_new_rawmsg_cb, data: &rd); |
1507 | if (!msg) |
1508 | return -ENOMEM; |
1509 | |
1510 | memcpy(msgbuf, buf, buflen); |
1511 | dlm_lowcomms_commit_msg(msg); |
1512 | return 0; |
1513 | } |
1514 | |
1515 | |