// SPDX-License-Identifier: GPL-2.0-only
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2011 Red Hat, Inc.  All rights reserved.
**
**
*******************************************************************************
******************************************************************************/

#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "dir.h"
#include "ast.h"
#include "recover.h"
#include "lowcomms.h"
#include "lock.h"
#include "requestqueue.h"
#include "recoverd.h"


/* If the start for which we're re-enabling locking (seq) has been superseded
   by a newer stop (ls_recover_seq), we need to leave locking disabled.

   We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
   locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
   enables locking and clears the requestqueue between a and b. */

static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
	int error = -EINTR;

	down_write(&ls->ls_recv_active);

	spin_lock(&ls->ls_recover_lock);
	if (ls->ls_recover_seq == seq) {
		set_bit(LSFL_RUNNING, &ls->ls_flags);
		/* unblocks processes waiting to enter the dlm */
		up_write(&ls->ls_in_recovery);
		clear_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
		error = 0;
	}
	spin_unlock(&ls->ls_recover_lock);

	up_write(&ls->ls_recv_active);
	return error;
}

static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
	unsigned long start;
	int error, neg = 0;

	log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);

	mutex_lock(&ls->ls_recoverd_active);

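	/* Stop delivering completion/blocking callbacks while recovery
	   runs; they are queued and delivered again on resume below. */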
	dlm_callback_suspend(ls);

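	/* Free rsbs sitting unused on the "toss" list so they don't
	   have to be considered during recovery. */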
	dlm_clear_toss(ls);

	/*
	 * This list of root rsb's will be the basis of most of the recovery
	 * routines.
	 */

	dlm_create_root_list(ls);

	/*
	 * Add or remove nodes from the lockspace's ls_nodes list.
	 *
	 * Because all membership changes must be reported to the lsops or
	 * midcomms layer, ls_recover() must not be aborted until this is
	 * done.
	 */

	error = dlm_recover_members(ls, rv, &neg);
	if (error) {
		log_rinfo(ls, "dlm_recover_members error %d", error);
		goto fail;
	}

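	/* Recompute the directory nodeid for each rsb now that the set of
	   member nodes (and thus the hash distribution) has changed. */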
	dlm_recover_dir_nodeid(ls);

	ls->ls_recover_dir_sent_res = 0;
	ls->ls_recover_dir_sent_msg = 0;
	ls->ls_recover_locks_in = 0;

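	/* Each recovery phase is a barrier: publish our status bit, then
	   wait until every member reports the same phase complete. */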
	dlm_set_recover_status(ls, DLM_RS_NODES);

	error = dlm_recover_members_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_members_wait error %d", error);
		goto fail;
	}

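	/* Time the directory and lock rebuild for the summary logged at
	   the end of recovery. */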
	start = jiffies;

	/*
	 * Rebuild our own share of the directory by collecting from all other
	 * nodes their master rsb names that hash to us.
	 */

	error = dlm_recover_directory(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_directory error %d", error);
		goto fail;
	}

	dlm_set_recover_status(ls, DLM_RS_DIR);

	error = dlm_recover_directory_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
		goto fail;
	}

	log_rinfo(ls, "dlm_recover_directory %u out %u messages",
		  ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);

	/*
	 * We may have outstanding operations that are waiting for a reply from
	 * a failed node.  Mark these to be resent after recovery.  Unlock and
	 * cancel ops can just be completed.
	 */

	dlm_recover_waiters_pre(ls);

	if (dlm_recovery_stopped(ls)) {
		error = -EINTR;
		goto fail;
	}

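	/* Rebuilding lock state is only needed if nodes actually left
	   (neg) or if there is no resource directory to consult. */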
	if (neg || dlm_no_directory(ls)) {
		/*
		 * Clear lkb's for departed nodes.
		 */

		dlm_recover_purge(ls);

		/*
		 * Get new master nodeid's for rsb's that were mastered on
		 * departed nodes.
		 */

		error = dlm_recover_masters(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_masters error %d", error);
			goto fail;
		}

		/*
		 * Send our locks on remastered rsb's to the new masters.
		 */

		error = dlm_recover_locks(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks error %d", error);
			goto fail;
		}

		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}

		log_rinfo(ls, "dlm_recover_locks %u in",
			  ls->ls_recover_locks_in);

		/*
		 * Finalize state in master rsb's now that all locks can be
		 * checked.  This includes conversion resolution and lvb
		 * settings.
		 */

		dlm_recover_rsbs(ls);
	} else {
		/*
		 * Other lockspace members may be going through the "neg" steps
		 * while also adding us to the lockspace, in which case they'll
		 * be doing the recover_locks (RS_LOCKS) barrier.
		 */
		dlm_set_recover_status(ls, DLM_RS_LOCKS);

		error = dlm_recover_locks_wait(ls, rv->seq);
		if (error) {
			log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
			goto fail;
		}
	}

	dlm_release_root_list(ls);

	/*
	 * Purge directory-related requests that are saved in requestqueue.
	 * All dir requests from before recovery are invalid now due to the dir
	 * rebuild and will be resent by the requesting nodes.
	 */

	dlm_purge_requestqueue(ls);

	dlm_set_recover_status(ls, DLM_RS_DONE);

	error = dlm_recover_done_wait(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "dlm_recover_done_wait error %d", error);
		goto fail;
	}

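	/* Departed members were kept on ls_nodes_gone for use by the
	   recovery routines above; free them now. */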
	dlm_clear_members_gone(ls);

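	/* Deliver the callbacks that were queued while suspended. */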
	dlm_callback_resume(ls);

	error = enable_locking(ls, rv->seq);
	if (error) {
		log_rinfo(ls, "enable_locking error %d", error);
		goto fail;
	}

	error = dlm_process_requestqueue(ls);
	if (error) {
		log_rinfo(ls, "dlm_process_requestqueue error %d", error);
		goto fail;
	}

	error = dlm_recover_waiters_post(ls);
	if (error) {
		log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
		goto fail;
	}

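	/* Locks blocked by locks that were purged above may now be
	   grantable; grant them. */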
	dlm_recover_grant(ls);

	log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
		  (unsigned long long)rv->seq, ls->ls_generation,
		  jiffies_to_msecs(jiffies - start));
	mutex_unlock(&ls->ls_recoverd_active);

	return 0;

 fail:
	dlm_release_root_list(ls);
	mutex_unlock(&ls->ls_recoverd_active);

	return error;
}

/* The dlm_ls_start() that created the rv we take here may already have been
   stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
   flag set. */

static void do_ls_recovery(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL;
	int error;

	spin_lock(&ls->ls_recover_lock);
	rv = ls->ls_recover_args;
	ls->ls_recover_args = NULL;
	if (rv && ls->ls_recover_seq == rv->seq)
		clear_bit(LSFL_RECOVER_STOP, &ls->ls_flags);
	spin_unlock(&ls->ls_recover_lock);

	if (rv) {
		error = ls_recover(ls, rv);
		switch (error) {
		case 0:
			ls->ls_recovery_result = 0;
			complete(&ls->ls_recovery_done);

			dlm_lsop_recover_done(ls);
			break;
		case -EINTR:
			/* If recovery was interrupted with -EINTR, wait for
			 * the next ls_recover() iteration, which will
			 * hopefully succeed.
			 */
			log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
				  __func__, (unsigned long long)rv->seq);
			break;
		default:
			log_rinfo(ls, "%s %llu error %d", __func__,
				  (unsigned long long)rv->seq, error);

			/* let new_lockspace() see the critical error */
			ls->ls_recovery_result = error;
			complete(&ls->ls_recovery_done);
			break;
		}

		kfree(rv->nodes);
		kfree(rv);
	}
}

static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_local(arg);
	if (!ls) {
		log_print("dlm_recoverd: no lockspace %p", arg);
		return -1;
	}

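	/* Start with locking blocked; the first recovery will unblock it
	   via enable_locking() once the lockspace is usable. */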
	down_write(&ls->ls_in_recovery);
	set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
	wake_up(&ls->ls_recover_lock_wait);

	while (1) {
		/*
		 * We call kthread_should_stop() after set_current_state().
		 * This is because it works correctly if kthread_stop() is
		 * called just before set_current_state().
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop()) {
			set_current_state(TASK_RUNNING);
			break;
		}
		if (!test_bit(LSFL_RECOVER_WORK, &ls->ls_flags) &&
		    !test_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
			if (kthread_should_stop())
				break;
			schedule();
		}
		set_current_state(TASK_RUNNING);

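		/* A stop request (dlm_ls_stop) re-blocks locking before
		   the next recovery pass picks up the new member list. */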
		if (test_and_clear_bit(LSFL_RECOVER_DOWN, &ls->ls_flags)) {
			down_write(&ls->ls_in_recovery);
			set_bit(LSFL_RECOVER_LOCK, &ls->ls_flags);
			wake_up(&ls->ls_recover_lock_wait);
		}

		if (test_and_clear_bit(LSFL_RECOVER_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	if (test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags))
		up_write(&ls->ls_in_recovery);

	dlm_put_lockspace(ls);
	return 0;
}

int dlm_recoverd_start(struct dlm_ls *ls)
{
	struct task_struct *p;
	int error = 0;

	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
	if (IS_ERR(p))
		error = PTR_ERR(p);
	else
		ls->ls_recoverd_task = p;
	return error;
}

void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}

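/* Park dlm_recoverd: wake anything sleeping in a recovery wait so it can
   notice the stop, then hold ls_recoverd_active to keep ls_recover() from
   running until dlm_recoverd_resume() releases it. */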
void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	wake_up(&ls->ls_wait_general);
	mutex_lock(&ls->ls_recoverd_active);
}

void dlm_recoverd_resume(struct dlm_ls *ls)
{
	mutex_unlock(&ls->ls_recoverd_active);
}
