1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* Handle fileserver selection and rotation. |
3 | * |
4 | * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. |
5 | * Written by David Howells (dhowells@redhat.com) |
6 | */ |
7 | |
8 | #include <linux/kernel.h> |
9 | #include <linux/slab.h> |
10 | #include <linux/fs.h> |
11 | #include <linux/sched.h> |
12 | #include <linux/delay.h> |
13 | #include <linux/sched/signal.h> |
14 | #include "internal.h" |
15 | #include "afs_fs.h" |
16 | |
17 | /* |
18 | * Begin iteration through a server list, starting with the vnode's last used |
19 | * server if possible, or the last recorded good server if not. |
20 | */ |
21 | static bool afs_start_fs_iteration(struct afs_operation *op, |
22 | struct afs_vnode *vnode) |
23 | { |
24 | struct afs_server *server; |
25 | void *cb_server; |
26 | int i; |
27 | |
28 | read_lock(&op->volume->servers_lock); |
29 | op->server_list = afs_get_serverlist( |
30 | rcu_dereference_protected(op->volume->servers, |
31 | lockdep_is_held(&op->volume->servers_lock))); |
32 | read_unlock(&op->volume->servers_lock); |
33 | |
34 | op->untried = (1UL << op->server_list->nr_servers) - 1; |
35 | op->index = READ_ONCE(op->server_list->preferred); |
36 | |
37 | cb_server = vnode->cb_server; |
38 | if (cb_server) { |
39 | /* See if the vnode's preferred record is still available */ |
40 | for (i = 0; i < op->server_list->nr_servers; i++) { |
41 | server = op->server_list->servers[i].server; |
42 | if (server == cb_server) { |
43 | op->index = i; |
44 | goto found_interest; |
45 | } |
46 | } |
47 | |
48 | /* If we have a lock outstanding on a server that's no longer |
49 | * serving this vnode, then we can't switch to another server |
50 | * and have to return an error. |
51 | */ |
52 | if (op->flags & AFS_OPERATION_CUR_ONLY) { |
53 | op->error = -ESTALE; |
54 | return false; |
55 | } |
56 | |
57 | /* Note that the callback promise is effectively broken */ |
58 | write_seqlock(sl: &vnode->cb_lock); |
59 | ASSERTCMP(cb_server, ==, vnode->cb_server); |
60 | vnode->cb_server = NULL; |
61 | if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, addr: &vnode->flags)) |
62 | vnode->cb_break++; |
63 | write_sequnlock(sl: &vnode->cb_lock); |
64 | } |
65 | |
66 | found_interest: |
67 | return true; |
68 | } |
69 | |
70 | /* |
71 | * Post volume busy note. |
72 | */ |
73 | static void afs_busy(struct afs_volume *volume, u32 abort_code) |
74 | { |
75 | const char *m; |
76 | |
77 | switch (abort_code) { |
78 | case VOFFLINE: m = "offline" ; break; |
79 | case VRESTARTING: m = "restarting" ; break; |
80 | case VSALVAGING: m = "being salvaged" ; break; |
81 | default: m = "busy" ; break; |
82 | } |
83 | |
84 | pr_notice("kAFS: Volume %llu '%s' is %s\n" , volume->vid, volume->name, m); |
85 | } |
86 | |
87 | /* |
88 | * Sleep and retry the operation to the same fileserver. |
89 | */ |
90 | static bool afs_sleep_and_retry(struct afs_operation *op) |
91 | { |
92 | if (!(op->flags & AFS_OPERATION_UNINTR)) { |
93 | msleep_interruptible(msecs: 1000); |
94 | if (signal_pending(current)) { |
95 | op->error = -ERESTARTSYS; |
96 | return false; |
97 | } |
98 | } else { |
99 | msleep(msecs: 1000); |
100 | } |
101 | |
102 | return true; |
103 | } |
104 | |
105 | /* |
106 | * Select the fileserver to use. May be called multiple times to rotate |
107 | * through the fileservers. |
108 | */ |
109 | bool afs_select_fileserver(struct afs_operation *op) |
110 | { |
111 | struct afs_addr_list *alist; |
112 | struct afs_server *server; |
113 | struct afs_vnode *vnode = op->file[0].vnode; |
114 | struct afs_error e; |
115 | u32 rtt; |
116 | int error = op->ac.error, i; |
117 | |
118 | _enter("%lx[%d],%lx[%d],%d,%d" , |
119 | op->untried, op->index, |
120 | op->ac.tried, op->ac.index, |
121 | error, op->ac.abort_code); |
122 | |
123 | if (op->flags & AFS_OPERATION_STOP) { |
124 | _leave(" = f [stopped]" ); |
125 | return false; |
126 | } |
127 | |
128 | op->nr_iterations++; |
129 | |
130 | /* Evaluate the result of the previous operation, if there was one. */ |
131 | switch (error) { |
132 | case SHRT_MAX: |
133 | goto start; |
134 | |
135 | case 0: |
136 | default: |
137 | /* Success or local failure. Stop. */ |
138 | op->error = error; |
139 | op->flags |= AFS_OPERATION_STOP; |
140 | _leave(" = f [okay/local %d]" , error); |
141 | return false; |
142 | |
143 | case -ECONNABORTED: |
144 | /* The far side rejected the operation on some grounds. This |
145 | * might involve the server being busy or the volume having been moved. |
146 | */ |
147 | switch (op->ac.abort_code) { |
148 | case VNOVOL: |
149 | /* This fileserver doesn't know about the volume. |
150 | * - May indicate that the VL is wrong - retry once and compare |
151 | * the results. |
152 | * - May indicate that the fileserver couldn't attach to the vol. |
153 | */ |
154 | if (op->flags & AFS_OPERATION_VNOVOL) { |
155 | op->error = -EREMOTEIO; |
156 | goto next_server; |
157 | } |
158 | |
159 | write_lock(&op->volume->servers_lock); |
160 | op->server_list->vnovol_mask |= 1 << op->index; |
161 | write_unlock(&op->volume->servers_lock); |
162 | |
163 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &op->volume->flags); |
164 | error = afs_check_volume_status(op->volume, op); |
165 | if (error < 0) |
166 | goto failed_set_error; |
167 | |
168 | if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { |
169 | op->error = -ENOMEDIUM; |
170 | goto failed; |
171 | } |
172 | |
173 | /* If the server list didn't change, then assume that |
174 | * it's the fileserver having trouble. |
175 | */ |
176 | if (rcu_access_pointer(op->volume->servers) == op->server_list) { |
177 | op->error = -EREMOTEIO; |
178 | goto next_server; |
179 | } |
180 | |
181 | /* Try again */ |
182 | op->flags |= AFS_OPERATION_VNOVOL; |
183 | _leave(" = t [vnovol]" ); |
184 | return true; |
185 | |
186 | case VSALVAGE: /* TODO: Should this return an error or iterate? */ |
187 | case VVOLEXISTS: |
188 | case VNOSERVICE: |
189 | case VONLINE: |
190 | case VDISKFULL: |
191 | case VOVERQUOTA: |
192 | op->error = afs_abort_to_error(op->ac.abort_code); |
193 | goto next_server; |
194 | |
195 | case VOFFLINE: |
196 | if (!test_and_set_bit(AFS_VOLUME_OFFLINE, addr: &op->volume->flags)) { |
197 | afs_busy(volume: op->volume, abort_code: op->ac.abort_code); |
198 | clear_bit(AFS_VOLUME_BUSY, addr: &op->volume->flags); |
199 | } |
200 | if (op->flags & AFS_OPERATION_NO_VSLEEP) { |
201 | op->error = -EADV; |
202 | goto failed; |
203 | } |
204 | if (op->flags & AFS_OPERATION_CUR_ONLY) { |
205 | op->error = -ESTALE; |
206 | goto failed; |
207 | } |
208 | goto busy; |
209 | |
210 | case VSALVAGING: |
211 | case VRESTARTING: |
212 | case VBUSY: |
213 | /* Retry after going round all the servers unless we |
214 | * have a file lock we need to maintain. |
215 | */ |
216 | if (op->flags & AFS_OPERATION_NO_VSLEEP) { |
217 | op->error = -EBUSY; |
218 | goto failed; |
219 | } |
220 | if (!test_and_set_bit(AFS_VOLUME_BUSY, addr: &op->volume->flags)) { |
221 | afs_busy(volume: op->volume, abort_code: op->ac.abort_code); |
222 | clear_bit(AFS_VOLUME_OFFLINE, addr: &op->volume->flags); |
223 | } |
224 | busy: |
225 | if (op->flags & AFS_OPERATION_CUR_ONLY) { |
226 | if (!afs_sleep_and_retry(op)) |
227 | goto failed; |
228 | |
229 | /* Retry with same server & address */ |
230 | _leave(" = t [vbusy]" ); |
231 | return true; |
232 | } |
233 | |
234 | op->flags |= AFS_OPERATION_VBUSY; |
235 | goto next_server; |
236 | |
237 | case VMOVED: |
238 | /* The volume migrated to another server. We consider |
239 | * consider all locks and callbacks broken and request |
240 | * an update from the VLDB. |
241 | * |
242 | * We also limit the number of VMOVED hops we will |
243 | * honour, just in case someone sets up a loop. |
244 | */ |
245 | if (op->flags & AFS_OPERATION_VMOVED) { |
246 | op->error = -EREMOTEIO; |
247 | goto failed; |
248 | } |
249 | op->flags |= AFS_OPERATION_VMOVED; |
250 | |
251 | set_bit(AFS_VOLUME_WAIT, addr: &op->volume->flags); |
252 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &op->volume->flags); |
253 | error = afs_check_volume_status(op->volume, op); |
254 | if (error < 0) |
255 | goto failed_set_error; |
256 | |
257 | /* If the server list didn't change, then the VLDB is |
258 | * out of sync with the fileservers. This is hopefully |
259 | * a temporary condition, however, so we don't want to |
260 | * permanently block access to the file. |
261 | * |
262 | * TODO: Try other fileservers if we can. |
263 | * |
264 | * TODO: Retry a few times with sleeps. |
265 | */ |
266 | if (rcu_access_pointer(op->volume->servers) == op->server_list) { |
267 | op->error = -ENOMEDIUM; |
268 | goto failed; |
269 | } |
270 | |
271 | goto restart_from_beginning; |
272 | |
273 | default: |
274 | clear_bit(AFS_VOLUME_OFFLINE, addr: &op->volume->flags); |
275 | clear_bit(AFS_VOLUME_BUSY, addr: &op->volume->flags); |
276 | op->error = afs_abort_to_error(op->ac.abort_code); |
277 | goto failed; |
278 | } |
279 | |
280 | case -ETIMEDOUT: |
281 | case -ETIME: |
282 | if (op->error != -EDESTADDRREQ) |
283 | goto iterate_address; |
284 | fallthrough; |
285 | case -ERFKILL: |
286 | case -EADDRNOTAVAIL: |
287 | case -ENETUNREACH: |
288 | case -EHOSTUNREACH: |
289 | case -EHOSTDOWN: |
290 | case -ECONNREFUSED: |
291 | _debug("no conn" ); |
292 | op->error = error; |
293 | goto iterate_address; |
294 | |
295 | case -ENETRESET: |
296 | pr_warn("kAFS: Peer reset %s (op=%x)\n" , |
297 | op->type ? op->type->name : "???" , op->debug_id); |
298 | fallthrough; |
299 | case -ECONNRESET: |
300 | _debug("call reset" ); |
301 | op->error = error; |
302 | goto failed; |
303 | } |
304 | |
305 | restart_from_beginning: |
306 | _debug("restart" ); |
307 | afs_end_cursor(&op->ac); |
308 | op->server = NULL; |
309 | afs_put_serverlist(op->net, op->server_list); |
310 | op->server_list = NULL; |
311 | start: |
312 | _debug("start" ); |
313 | /* See if we need to do an update of the volume record. Note that the |
314 | * volume may have moved or even have been deleted. |
315 | */ |
316 | error = afs_check_volume_status(op->volume, op); |
317 | if (error < 0) |
318 | goto failed_set_error; |
319 | |
320 | if (!afs_start_fs_iteration(op, vnode)) |
321 | goto failed; |
322 | |
323 | _debug("__ VOL %llx __" , op->volume->vid); |
324 | |
325 | pick_server: |
326 | _debug("pick [%lx]" , op->untried); |
327 | |
328 | error = afs_wait_for_fs_probes(op->server_list, op->untried); |
329 | if (error < 0) |
330 | goto failed_set_error; |
331 | |
332 | /* Pick the untried server with the lowest RTT. If we have outstanding |
333 | * callbacks, we stick with the server we're already using if we can. |
334 | */ |
335 | if (op->server) { |
336 | _debug("server %u" , op->index); |
337 | if (test_bit(op->index, &op->untried)) |
338 | goto selected_server; |
339 | op->server = NULL; |
340 | _debug("no server" ); |
341 | } |
342 | |
343 | op->index = -1; |
344 | rtt = U32_MAX; |
345 | for (i = 0; i < op->server_list->nr_servers; i++) { |
346 | struct afs_server *s = op->server_list->servers[i].server; |
347 | |
348 | if (!test_bit(i, &op->untried) || |
349 | !test_bit(AFS_SERVER_FL_RESPONDING, &s->flags)) |
350 | continue; |
351 | if (s->probe.rtt < rtt) { |
352 | op->index = i; |
353 | rtt = s->probe.rtt; |
354 | } |
355 | } |
356 | |
357 | if (op->index == -1) |
358 | goto no_more_servers; |
359 | |
360 | selected_server: |
361 | _debug("use %d" , op->index); |
362 | __clear_bit(op->index, &op->untried); |
363 | |
364 | /* We're starting on a different fileserver from the list. We need to |
365 | * check it, create a callback intercept, find its address list and |
366 | * probe its capabilities before we use it. |
367 | */ |
368 | ASSERTCMP(op->ac.alist, ==, NULL); |
369 | server = op->server_list->servers[op->index].server; |
370 | |
371 | if (!afs_check_server_record(op, server)) |
372 | goto failed; |
373 | |
374 | _debug("USING SERVER: %pU" , &server->uuid); |
375 | |
376 | op->flags |= AFS_OPERATION_RETRY_SERVER; |
377 | op->server = server; |
378 | if (vnode->cb_server != server) { |
379 | vnode->cb_server = server; |
380 | vnode->cb_s_break = server->cb_s_break; |
381 | vnode->cb_fs_s_break = atomic_read(v: &server->cell->fs_s_break); |
382 | vnode->cb_v_break = vnode->volume->cb_v_break; |
383 | clear_bit(AFS_VNODE_CB_PROMISED, addr: &vnode->flags); |
384 | } |
385 | |
386 | read_lock(&server->fs_lock); |
387 | alist = rcu_dereference_protected(server->addresses, |
388 | lockdep_is_held(&server->fs_lock)); |
389 | afs_get_addrlist(alist); |
390 | read_unlock(&server->fs_lock); |
391 | |
392 | retry_server: |
393 | memset(&op->ac, 0, sizeof(op->ac)); |
394 | |
395 | if (!op->ac.alist) |
396 | op->ac.alist = alist; |
397 | else |
398 | afs_put_addrlist(alist); |
399 | |
400 | op->ac.index = -1; |
401 | |
402 | iterate_address: |
403 | ASSERT(op->ac.alist); |
404 | /* Iterate over the current server's address list to try and find an |
405 | * address on which it will respond to us. |
406 | */ |
407 | if (!afs_iterate_addresses(&op->ac)) |
408 | goto out_of_addresses; |
409 | |
410 | _debug("address [%u] %u/%u %pISp" , |
411 | op->index, op->ac.index, op->ac.alist->nr_addrs, |
412 | &op->ac.alist->addrs[op->ac.index].transport); |
413 | |
414 | _leave(" = t" ); |
415 | return true; |
416 | |
417 | out_of_addresses: |
418 | /* We've now had a failure to respond on all of a server's addresses - |
419 | * immediately probe them again and consider retrying the server. |
420 | */ |
421 | afs_probe_fileserver(op->net, op->server); |
422 | if (op->flags & AFS_OPERATION_RETRY_SERVER) { |
423 | alist = op->ac.alist; |
424 | error = afs_wait_for_one_fs_probe( |
425 | op->server, !(op->flags & AFS_OPERATION_UNINTR)); |
426 | switch (error) { |
427 | case 0: |
428 | op->flags &= ~AFS_OPERATION_RETRY_SERVER; |
429 | goto retry_server; |
430 | case -ERESTARTSYS: |
431 | goto failed_set_error; |
432 | case -ETIME: |
433 | case -EDESTADDRREQ: |
434 | goto next_server; |
435 | } |
436 | } |
437 | |
438 | next_server: |
439 | _debug("next" ); |
440 | afs_end_cursor(&op->ac); |
441 | goto pick_server; |
442 | |
443 | no_more_servers: |
444 | /* That's all the servers poked to no good effect. Try again if some |
445 | * of them were busy. |
446 | */ |
447 | if (op->flags & AFS_OPERATION_VBUSY) |
448 | goto restart_from_beginning; |
449 | |
450 | e.error = -EDESTADDRREQ; |
451 | e.responded = false; |
452 | for (i = 0; i < op->server_list->nr_servers; i++) { |
453 | struct afs_server *s = op->server_list->servers[i].server; |
454 | |
455 | afs_prioritise_error(&e, READ_ONCE(s->probe.error), |
456 | s->probe.abort_code); |
457 | } |
458 | |
459 | error = e.error; |
460 | |
461 | failed_set_error: |
462 | op->error = error; |
463 | failed: |
464 | op->flags |= AFS_OPERATION_STOP; |
465 | afs_end_cursor(&op->ac); |
466 | _leave(" = f [failed %d]" , op->error); |
467 | return false; |
468 | } |
469 | |
470 | /* |
471 | * Dump cursor state in the case of the error being EDESTADDRREQ. |
472 | */ |
473 | void afs_dump_edestaddrreq(const struct afs_operation *op) |
474 | { |
475 | static int count; |
476 | int i; |
477 | |
478 | if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3) |
479 | return; |
480 | count++; |
481 | |
482 | rcu_read_lock(); |
483 | |
484 | pr_notice("EDESTADDR occurred\n" ); |
485 | pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n" , |
486 | op->file[0].cb_break_before, |
487 | op->file[1].cb_break_before, op->flags, op->error); |
488 | pr_notice("FC: ut=%lx ix=%d ni=%u\n" , |
489 | op->untried, op->index, op->nr_iterations); |
490 | |
491 | if (op->server_list) { |
492 | const struct afs_server_list *sl = op->server_list; |
493 | pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n" , |
494 | sl->nr_servers, sl->preferred, sl->vnovol_mask); |
495 | for (i = 0; i < sl->nr_servers; i++) { |
496 | const struct afs_server *s = sl->servers[i].server; |
497 | pr_notice("FC: server fl=%lx av=%u %pU\n" , |
498 | s->flags, s->addr_version, &s->uuid); |
499 | if (s->addresses) { |
500 | const struct afs_addr_list *a = |
501 | rcu_dereference(s->addresses); |
502 | pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n" , |
503 | a->version, |
504 | a->nr_ipv4, a->nr_addrs, a->max_addrs, |
505 | a->preferred); |
506 | pr_notice("FC: - R=%lx F=%lx\n" , |
507 | a->responded, a->failed); |
508 | if (a == op->ac.alist) |
509 | pr_notice("FC: - current\n" ); |
510 | } |
511 | } |
512 | } |
513 | |
514 | pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n" , |
515 | op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, |
516 | op->ac.responded, op->ac.nr_iterations); |
517 | rcu_read_unlock(); |
518 | } |
519 | |