1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* vnode and volume validity verification. |
3 | * |
4 | * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. |
5 | * Written by David Howells (dhowells@redhat.com) |
6 | */ |
7 | |
8 | #include <linux/kernel.h> |
9 | #include <linux/module.h> |
10 | #include <linux/sched.h> |
11 | #include "internal.h" |
12 | |
13 | /* |
14 | * Data validation is managed through a number of mechanisms from the server: |
15 | * |
16 | * (1) On first contact with a server (such as if it has just been rebooted), |
17 | * the server sends us a CB.InitCallBackState* request. |
18 | * |
19 | * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC |
20 | * calls, the server maintains a time-limited per-vnode promise that it |
21 | * will send us a CB.CallBack request if a third party alters the vnodes |
22 | * accessed. |
23 | * |
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
26 | * |
27 | * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC |
28 | * calls, each server maintains a time-limited per-volume promise that it |
29 | * will send us a CB.CallBack request if the RO volume is updated to a |
30 | * snapshot of the RW volume ("vos release"). This is an atomic event |
31 | * that cuts over all instances of the RO volume across multiple servers |
32 | * simultaneously. |
33 | * |
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
37 | * |
38 | * Note also that each server maintains an independent time limit on an |
39 | * independent callback. |
40 | * |
41 | * (4) Certain RPC calls include a volume information record "VolSync" in |
42 | * their reply. This contains a creation date for the volume that should |
43 | * remain unchanged for a RW volume (but will be changed if the volume is |
44 | * restored from backup) or will be bumped to the time of snapshotting |
45 | * when a RO volume is released. |
46 | * |
 * In order to track these events, the following are provided:
48 | * |
49 | * ->cb_v_break. A counter of events that might mean that the contents of |
50 | * a volume have been altered since we last checked a vnode. |
51 | * |
52 | * ->cb_v_check. A counter of the number of events that we've sent a |
53 | * query to the server for. Everything's up to date if this equals |
54 | * cb_v_break. |
55 | * |
56 | * ->cb_scrub. A counter of the number of regression events for which we |
57 | * have to completely wipe the cache. |
58 | * |
59 | * ->cb_ro_snapshot. A counter of the number of times that we've |
60 | * recognised that a RO volume has been updated. |
61 | * |
62 | * ->cb_break. A counter of events that might mean that the contents of a |
63 | * vnode have been altered. |
64 | * |
65 | * ->cb_expires_at. The time at which the callback promise expires or |
66 | * AFS_NO_CB_PROMISE if we have no promise. |
67 | * |
68 | * The way we manage things is: |
69 | * |
70 | * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on |
71 | * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the |
72 | * volume and volume's server record. |
73 | * |
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *      callback break on all the volumes that have been using that server
 *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
77 | * |
78 | * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the |
79 | * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also |
80 | * dispatch a work item to unmap all PTEs to the vnode's pagecache to |
81 | * force reentry to the filesystem for revalidation. |
82 | * |
83 | * (4) When entering the filesystem, we call afs_validate() to check the |
84 | * validity of a vnode. This first checks to see if ->cb_v_check and |
85 | * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock |
86 | * exclusively and perform an FS.FetchStatus on the vnode. |
87 | * |
88 | * After checking the volume, we check the vnode. If there's a mismatch |
89 | * between the volume counters and the vnode's mirrors of those counters, |
90 | * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. |
91 | * |
92 | * (5) When the reply from FS.FetchStatus arrives, the VolSync record is |
93 | * parsed: |
94 | * |
95 | * (A) If the Creation timestamp has changed on a RW volume or regressed |
96 | * on a RO volume, we try to increment ->cb_scrub; if it advances on a |
97 | * RO volume, we assume "vos release" happened and try to increment |
98 | * ->cb_ro_snapshot. |
99 | * |
100 | * (B) If the Update timestamp has regressed, we try to increment |
101 | * ->cb_scrub. |
102 | * |
103 | * Note that in both of these cases, we only do the increment if we can |
104 | * cmpxchg the value of the timestamp from the value we noted before the |
105 | * op. This tries to prevent parallel ops from fighting one another. |
106 | * |
107 | * volume->cb_v_check is then set to ->cb_v_break. |
108 | * |
109 | * (6) The AFSCallBack record included in the FS.FetchStatus reply is also |
110 | * parsed and used to set the promise in ->cb_expires_at for the vnode, |
111 | * the volume and the volume's server record. |
112 | * |
113 | * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for |
114 | * the vnode. |
115 | */ |
116 | |
117 | /* |
118 | * Check the validity of a vnode/inode and its parent volume. |
119 | */ |
120 | bool afs_check_validity(const struct afs_vnode *vnode) |
121 | { |
122 | const struct afs_volume *volume = vnode->volume; |
123 | time64_t deadline = ktime_get_real_seconds() + 10; |
124 | |
125 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) |
126 | return true; |
127 | |
128 | if (atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break) || |
129 | atomic64_read(v: &vnode->cb_expires_at) <= deadline || |
130 | volume->cb_expires_at <= deadline || |
131 | vnode->cb_ro_snapshot != atomic_read(v: &volume->cb_ro_snapshot) || |
132 | vnode->cb_scrub != atomic_read(v: &volume->cb_scrub) || |
133 | test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { |
134 | _debug("inval" ); |
135 | return false; |
136 | } |
137 | |
138 | return true; |
139 | } |
140 | |
141 | /* |
142 | * See if the server we've just talked to is currently excluded. |
143 | */ |
144 | static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) |
145 | { |
146 | const struct afs_server_entry *se; |
147 | const struct afs_server_list *slist; |
148 | bool is_excluded = true; |
149 | int i; |
150 | |
151 | rcu_read_lock(); |
152 | |
153 | slist = rcu_dereference(volume->servers); |
154 | for (i = 0; i < slist->nr_servers; i++) { |
155 | se = &slist->servers[i]; |
156 | if (op->server == se->server) { |
157 | is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); |
158 | break; |
159 | } |
160 | } |
161 | |
162 | rcu_read_unlock(); |
163 | return is_excluded; |
164 | } |
165 | |
166 | /* |
167 | * Update the volume's server list when the creation time changes and see if |
168 | * the server we've just talked to is currently excluded. |
169 | */ |
170 | static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) |
171 | { |
172 | int ret; |
173 | |
174 | if (__afs_is_server_excluded(op, volume)) |
175 | return 1; |
176 | |
177 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &volume->flags); |
178 | ret = afs_check_volume_status(op->volume, op); |
179 | if (ret < 0) |
180 | return ret; |
181 | |
182 | return __afs_is_server_excluded(op, volume); |
183 | } |
184 | |
185 | /* |
186 | * Handle a change to the volume creation time in the VolSync record. |
187 | */ |
188 | static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) |
189 | { |
190 | unsigned int snap; |
191 | time64_t cur = volume->creation_time; |
192 | time64_t old = op->pre_volsync.creation; |
193 | time64_t new = op->volsync.creation; |
194 | int ret; |
195 | |
196 | _enter("%llx,%llx,%llx->%llx" , volume->vid, cur, old, new); |
197 | |
198 | if (cur == TIME64_MIN) { |
199 | volume->creation_time = new; |
200 | return 0; |
201 | } |
202 | |
203 | if (new == cur) |
204 | return 0; |
205 | |
206 | /* Try to advance the creation timestamp from what we had before the |
207 | * operation to what we got back from the server. This should |
208 | * hopefully ensure that in a race between multiple operations only one |
209 | * of them will do this. |
210 | */ |
211 | if (cur != old) |
212 | return 0; |
213 | |
214 | /* If the creation time changes in an unexpected way, we need to scrub |
215 | * our caches. For a RW vol, this will only change if the volume is |
216 | * restored from a backup; for a RO/Backup vol, this will advance when |
217 | * the volume is updated to a new snapshot (eg. "vos release"). |
218 | */ |
219 | if (volume->type == AFSVL_RWVOL) |
220 | goto regressed; |
221 | if (volume->type == AFSVL_BACKVOL) { |
222 | if (new < old) |
223 | goto regressed; |
224 | goto advance; |
225 | } |
226 | |
227 | /* We have an RO volume, we need to query the VL server and look at the |
228 | * server flags to see if RW->RO replication is in progress. |
229 | */ |
230 | ret = afs_is_server_excluded(op, volume); |
231 | if (ret < 0) |
232 | return ret; |
233 | if (ret > 0) { |
234 | snap = atomic_read(v: &volume->cb_ro_snapshot); |
235 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: snap, reason: afs_cb_break_volume_excluded); |
236 | return ret; |
237 | } |
238 | |
239 | advance: |
240 | snap = atomic_inc_return(v: &volume->cb_ro_snapshot); |
241 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: snap, reason: afs_cb_break_for_vos_release); |
242 | volume->creation_time = new; |
243 | return 0; |
244 | |
245 | regressed: |
246 | atomic_inc(v: &volume->cb_scrub); |
247 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: 0, reason: afs_cb_break_for_creation_regress); |
248 | volume->creation_time = new; |
249 | return 0; |
250 | } |
251 | |
252 | /* |
253 | * Handle a change to the volume update time in the VolSync record. |
254 | */ |
255 | static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) |
256 | { |
257 | enum afs_cb_break_reason reason = afs_cb_break_no_break; |
258 | time64_t cur = volume->update_time; |
259 | time64_t old = op->pre_volsync.update; |
260 | time64_t new = op->volsync.update; |
261 | |
262 | _enter("%llx,%llx,%llx->%llx" , volume->vid, cur, old, new); |
263 | |
264 | if (cur == TIME64_MIN) { |
265 | volume->update_time = new; |
266 | return; |
267 | } |
268 | |
269 | if (new == cur) |
270 | return; |
271 | |
272 | /* If the volume update time changes in an unexpected way, we need to |
273 | * scrub our caches. For a RW vol, this will advance on every |
274 | * modification op; for a RO/Backup vol, this will advance when the |
275 | * volume is updated to a new snapshot (eg. "vos release"). |
276 | */ |
277 | if (new < old) |
278 | reason = afs_cb_break_for_update_regress; |
279 | |
280 | /* Try to advance the update timestamp from what we had before the |
281 | * operation to what we got back from the server. This should |
282 | * hopefully ensure that in a race between multiple operations only one |
283 | * of them will do this. |
284 | */ |
285 | if (cur == old) { |
286 | if (reason == afs_cb_break_for_update_regress) { |
287 | atomic_inc(v: &volume->cb_scrub); |
288 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: 0, reason); |
289 | } |
290 | volume->update_time = new; |
291 | } |
292 | } |
293 | |
294 | static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) |
295 | { |
296 | int ret = 0; |
297 | |
298 | if (likely(op->volsync.creation == volume->creation_time && |
299 | op->volsync.update == volume->update_time)) |
300 | return 0; |
301 | |
302 | mutex_lock(&volume->volsync_lock); |
303 | if (op->volsync.creation != volume->creation_time) { |
304 | ret = afs_update_volume_creation_time(op, volume); |
305 | if (ret < 0) |
306 | goto out; |
307 | } |
308 | if (op->volsync.update != volume->update_time) |
309 | afs_update_volume_update_time(op, volume); |
310 | out: |
311 | mutex_unlock(lock: &volume->volsync_lock); |
312 | return ret; |
313 | } |
314 | |
315 | /* |
316 | * Update the state of a volume, including recording the expiration time of the |
317 | * callback promise. Returns 1 to redo the operation from the start. |
318 | */ |
319 | int afs_update_volume_state(struct afs_operation *op) |
320 | { |
321 | struct afs_server_list *slist = op->server_list; |
322 | struct afs_server_entry *se = &slist->servers[op->server_index]; |
323 | struct afs_callback *cb = &op->file[0].scb.callback; |
324 | struct afs_volume *volume = op->volume; |
325 | unsigned int cb_v_break = atomic_read(v: &volume->cb_v_break); |
326 | unsigned int cb_v_check = atomic_read(v: &volume->cb_v_check); |
327 | int ret; |
328 | |
329 | _enter("%llx" , op->volume->vid); |
330 | |
331 | if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { |
332 | ret = afs_update_volume_times(op, volume); |
333 | if (ret != 0) { |
334 | _leave(" = %d" , ret); |
335 | return ret; |
336 | } |
337 | } |
338 | |
339 | if (op->cb_v_break == cb_v_break && |
340 | (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { |
341 | time64_t expires_at = cb->expires_at; |
342 | |
343 | if (!op->file[0].scb.have_cb) |
344 | expires_at = op->file[1].scb.callback.expires_at; |
345 | |
346 | se->cb_expires_at = expires_at; |
347 | volume->cb_expires_at = expires_at; |
348 | } |
349 | if (cb_v_check < op->cb_v_break) |
350 | atomic_cmpxchg(v: &volume->cb_v_check, old: cb_v_check, new: op->cb_v_break); |
351 | return 0; |
352 | } |
353 | |
354 | /* |
355 | * mark the data attached to an inode as obsolete due to a write on the server |
356 | * - might also want to ditch all the outstanding writes and dirty pages |
357 | */ |
358 | static void afs_zap_data(struct afs_vnode *vnode) |
359 | { |
360 | _enter("{%llx:%llu}" , vnode->fid.vid, vnode->fid.vnode); |
361 | |
362 | afs_invalidate_cache(vnode, flags: 0); |
363 | |
364 | /* nuke all the non-dirty pages that aren't locked, mapped or being |
365 | * written back in a regular file and completely discard the pages in a |
366 | * directory or symlink */ |
367 | if (S_ISREG(vnode->netfs.inode.i_mode)) |
368 | invalidate_remote_inode(inode: &vnode->netfs.inode); |
369 | else |
370 | invalidate_inode_pages2(mapping: vnode->netfs.inode.i_mapping); |
371 | } |
372 | |
373 | /* |
374 | * validate a vnode/inode |
375 | * - there are several things we need to check |
376 | * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, |
377 | * symlink) |
378 | * - parent dir metadata changed (security changes) |
379 | * - dentry data changed (write, truncate) |
380 | * - dentry metadata changed (security changes) |
381 | */ |
382 | int afs_validate(struct afs_vnode *vnode, struct key *key) |
383 | { |
384 | struct afs_volume *volume = vnode->volume; |
385 | unsigned int cb_ro_snapshot, cb_scrub; |
386 | time64_t deadline = ktime_get_real_seconds() + 10; |
387 | bool zap = false, locked_vol = false; |
388 | int ret; |
389 | |
390 | _enter("{v={%llx:%llu} fl=%lx},%x" , |
391 | vnode->fid.vid, vnode->fid.vnode, vnode->flags, |
392 | key_serial(key)); |
393 | |
394 | if (afs_check_validity(vnode)) |
395 | return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; |
396 | |
397 | ret = down_write_killable(sem: &vnode->validate_lock); |
398 | if (ret < 0) |
399 | goto error; |
400 | |
401 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { |
402 | ret = -ESTALE; |
403 | goto error_unlock; |
404 | } |
405 | |
406 | /* Validate a volume after the v_break has changed or the volume |
407 | * callback expired. We only want to do this once per volume per |
408 | * v_break change. The actual work will be done when parsing the |
409 | * status fetch reply. |
410 | */ |
411 | if (volume->cb_expires_at <= deadline || |
412 | atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break)) { |
413 | ret = mutex_lock_interruptible(&volume->cb_check_lock); |
414 | if (ret < 0) |
415 | goto error_unlock; |
416 | locked_vol = true; |
417 | } |
418 | |
419 | cb_ro_snapshot = atomic_read(v: &volume->cb_ro_snapshot); |
420 | cb_scrub = atomic_read(v: &volume->cb_scrub); |
421 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || |
422 | vnode->cb_scrub != cb_scrub) |
423 | unmap_mapping_pages(mapping: vnode->netfs.inode.i_mapping, start: 0, nr: 0, even_cows: false); |
424 | |
425 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || |
426 | vnode->cb_scrub != cb_scrub || |
427 | volume->cb_expires_at <= deadline || |
428 | atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break) || |
429 | atomic64_read(v: &vnode->cb_expires_at) <= deadline |
430 | ) { |
431 | ret = afs_fetch_status(vnode, key, false, NULL); |
432 | if (ret < 0) { |
433 | if (ret == -ENOENT) { |
434 | set_bit(AFS_VNODE_DELETED, addr: &vnode->flags); |
435 | ret = -ESTALE; |
436 | } |
437 | goto error_unlock; |
438 | } |
439 | |
440 | _debug("new promise [fl=%lx]" , vnode->flags); |
441 | } |
442 | |
443 | /* We can drop the volume lock now as. */ |
444 | if (locked_vol) { |
445 | mutex_unlock(lock: &volume->cb_check_lock); |
446 | locked_vol = false; |
447 | } |
448 | |
449 | cb_ro_snapshot = atomic_read(v: &volume->cb_ro_snapshot); |
450 | cb_scrub = atomic_read(v: &volume->cb_scrub); |
451 | _debug("vnode inval %x==%x %x==%x" , |
452 | vnode->cb_ro_snapshot, cb_ro_snapshot, |
453 | vnode->cb_scrub, cb_scrub); |
454 | if (vnode->cb_scrub != cb_scrub) |
455 | zap = true; |
456 | vnode->cb_ro_snapshot = cb_ro_snapshot; |
457 | vnode->cb_scrub = cb_scrub; |
458 | |
459 | /* if the vnode's data version number changed then its contents are |
460 | * different */ |
461 | zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, addr: &vnode->flags); |
462 | if (zap) |
463 | afs_zap_data(vnode); |
464 | up_write(sem: &vnode->validate_lock); |
465 | _leave(" = 0" ); |
466 | return 0; |
467 | |
468 | error_unlock: |
469 | if (locked_vol) |
470 | mutex_unlock(lock: &volume->cb_check_lock); |
471 | up_write(sem: &vnode->validate_lock); |
472 | error: |
473 | _leave(" = %d" , ret); |
474 | return ret; |
475 | } |
476 | |