1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* vnode and volume validity verification. |
3 | * |
4 | * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved. |
5 | * Written by David Howells (dhowells@redhat.com) |
6 | */ |
7 | |
8 | #include <linux/kernel.h> |
9 | #include <linux/module.h> |
10 | #include <linux/sched.h> |
11 | #include "internal.h" |
12 | |
13 | /* |
14 | * Data validation is managed through a number of mechanisms from the server: |
15 | * |
16 | * (1) On first contact with a server (such as if it has just been rebooted), |
17 | * the server sends us a CB.InitCallBackState* request. |
18 | * |
19 | * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC |
20 | * calls, the server maintains a time-limited per-vnode promise that it |
21 | * will send us a CB.CallBack request if a third party alters the vnodes |
22 | * accessed. |
23 | * |
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
26 | * |
27 | * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC |
28 | * calls, each server maintains a time-limited per-volume promise that it |
29 | * will send us a CB.CallBack request if the RO volume is updated to a |
30 | * snapshot of the RW volume ("vos release"). This is an atomic event |
31 | * that cuts over all instances of the RO volume across multiple servers |
32 | * simultaneously. |
33 | * |
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
37 | * |
38 | * Note also that each server maintains an independent time limit on an |
39 | * independent callback. |
40 | * |
41 | * (4) Certain RPC calls include a volume information record "VolSync" in |
42 | * their reply. This contains a creation date for the volume that should |
43 | * remain unchanged for a RW volume (but will be changed if the volume is |
44 | * restored from backup) or will be bumped to the time of snapshotting |
45 | * when a RO volume is released. |
46 | * |
 * In order to track these events, the following are provided:
48 | * |
49 | * ->cb_v_break. A counter of events that might mean that the contents of |
50 | * a volume have been altered since we last checked a vnode. |
51 | * |
52 | * ->cb_v_check. A counter of the number of events that we've sent a |
53 | * query to the server for. Everything's up to date if this equals |
54 | * cb_v_break. |
55 | * |
56 | * ->cb_scrub. A counter of the number of regression events for which we |
57 | * have to completely wipe the cache. |
58 | * |
59 | * ->cb_ro_snapshot. A counter of the number of times that we've |
60 | * recognised that a RO volume has been updated. |
61 | * |
62 | * ->cb_break. A counter of events that might mean that the contents of a |
63 | * vnode have been altered. |
64 | * |
65 | * ->cb_expires_at. The time at which the callback promise expires or |
66 | * AFS_NO_CB_PROMISE if we have no promise. |
67 | * |
68 | * The way we manage things is: |
69 | * |
70 | * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on |
71 | * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the |
72 | * volume and volume's server record. |
73 | * |
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *      callback break on all the volumes that have been using that server
 *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
77 | * |
78 | * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the |
79 | * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also |
80 | * dispatch a work item to unmap all PTEs to the vnode's pagecache to |
81 | * force reentry to the filesystem for revalidation. |
82 | * |
83 | * (4) When entering the filesystem, we call afs_validate() to check the |
84 | * validity of a vnode. This first checks to see if ->cb_v_check and |
85 | * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock |
86 | * exclusively and perform an FS.FetchStatus on the vnode. |
87 | * |
88 | * After checking the volume, we check the vnode. If there's a mismatch |
89 | * between the volume counters and the vnode's mirrors of those counters, |
90 | * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode. |
91 | * |
92 | * (5) When the reply from FS.FetchStatus arrives, the VolSync record is |
93 | * parsed: |
94 | * |
95 | * (A) If the Creation timestamp has changed on a RW volume or regressed |
96 | * on a RO volume, we try to increment ->cb_scrub; if it advances on a |
97 | * RO volume, we assume "vos release" happened and try to increment |
98 | * ->cb_ro_snapshot. |
99 | * |
100 | * (B) If the Update timestamp has regressed, we try to increment |
101 | * ->cb_scrub. |
102 | * |
103 | * Note that in both of these cases, we only do the increment if we can |
104 | * cmpxchg the value of the timestamp from the value we noted before the |
105 | * op. This tries to prevent parallel ops from fighting one another. |
106 | * |
107 | * volume->cb_v_check is then set to ->cb_v_break. |
108 | * |
109 | * (6) The AFSCallBack record included in the FS.FetchStatus reply is also |
110 | * parsed and used to set the promise in ->cb_expires_at for the vnode, |
111 | * the volume and the volume's server record. |
112 | * |
113 | * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for |
114 | * the vnode. |
115 | */ |
116 | |
117 | /* |
118 | * Check the validity of a vnode/inode and its parent volume. |
119 | */ |
120 | bool afs_check_validity(const struct afs_vnode *vnode) |
121 | { |
122 | const struct afs_volume *volume = vnode->volume; |
123 | time64_t deadline = ktime_get_real_seconds() + 10; |
124 | |
125 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) |
126 | return true; |
127 | |
128 | if (atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break) || |
129 | atomic64_read(v: &vnode->cb_expires_at) <= deadline || |
130 | volume->cb_expires_at <= deadline || |
131 | vnode->cb_ro_snapshot != atomic_read(v: &volume->cb_ro_snapshot) || |
132 | vnode->cb_scrub != atomic_read(v: &volume->cb_scrub) || |
133 | test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { |
134 | _debug("inval" ); |
135 | return false; |
136 | } |
137 | |
138 | return true; |
139 | } |
140 | |
141 | /* |
142 | * See if the server we've just talked to is currently excluded. |
143 | */ |
144 | static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) |
145 | { |
146 | const struct afs_server_entry *se; |
147 | const struct afs_server_list *slist; |
148 | bool is_excluded = true; |
149 | int i; |
150 | |
151 | rcu_read_lock(); |
152 | |
153 | slist = rcu_dereference(volume->servers); |
154 | for (i = 0; i < slist->nr_servers; i++) { |
155 | se = &slist->servers[i]; |
156 | if (op->server == se->server) { |
157 | is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags); |
158 | break; |
159 | } |
160 | } |
161 | |
162 | rcu_read_unlock(); |
163 | return is_excluded; |
164 | } |
165 | |
166 | /* |
167 | * Update the volume's server list when the creation time changes and see if |
168 | * the server we've just talked to is currently excluded. |
169 | */ |
170 | static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume) |
171 | { |
172 | int ret; |
173 | |
174 | if (__afs_is_server_excluded(op, volume)) |
175 | return 1; |
176 | |
177 | set_bit(AFS_VOLUME_NEEDS_UPDATE, addr: &volume->flags); |
178 | ret = afs_check_volume_status(op->volume, op); |
179 | if (ret < 0) |
180 | return ret; |
181 | |
182 | return __afs_is_server_excluded(op, volume); |
183 | } |
184 | |
185 | /* |
186 | * Handle a change to the volume creation time in the VolSync record. |
187 | */ |
188 | static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume) |
189 | { |
190 | unsigned int snap; |
191 | time64_t cur = volume->creation_time; |
192 | time64_t old = op->pre_volsync.creation; |
193 | time64_t new = op->volsync.creation; |
194 | int ret; |
195 | |
196 | _enter("%llx,%llx,%llx->%llx" , volume->vid, cur, old, new); |
197 | |
198 | if (cur == TIME64_MIN) { |
199 | volume->creation_time = new; |
200 | return 0; |
201 | } |
202 | |
203 | if (new == cur) |
204 | return 0; |
205 | |
206 | /* Try to advance the creation timestamp from what we had before the |
207 | * operation to what we got back from the server. This should |
208 | * hopefully ensure that in a race between multiple operations only one |
209 | * of them will do this. |
210 | */ |
211 | if (cur != old) |
212 | return 0; |
213 | |
214 | /* If the creation time changes in an unexpected way, we need to scrub |
215 | * our caches. For a RW vol, this will only change if the volume is |
216 | * restored from a backup; for a RO/Backup vol, this will advance when |
217 | * the volume is updated to a new snapshot (eg. "vos release"). |
218 | */ |
219 | if (volume->type == AFSVL_RWVOL) |
220 | goto regressed; |
221 | if (volume->type == AFSVL_BACKVOL) { |
222 | if (new < old) |
223 | goto regressed; |
224 | goto advance; |
225 | } |
226 | |
227 | /* We have an RO volume, we need to query the VL server and look at the |
228 | * server flags to see if RW->RO replication is in progress. |
229 | */ |
230 | ret = afs_is_server_excluded(op, volume); |
231 | if (ret < 0) |
232 | return ret; |
233 | if (ret > 0) { |
234 | snap = atomic_read(v: &volume->cb_ro_snapshot); |
235 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: snap, reason: afs_cb_break_volume_excluded); |
236 | return ret; |
237 | } |
238 | |
239 | advance: |
240 | snap = atomic_inc_return(v: &volume->cb_ro_snapshot); |
241 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: snap, reason: afs_cb_break_for_vos_release); |
242 | volume->creation_time = new; |
243 | return 0; |
244 | |
245 | regressed: |
246 | atomic_inc(v: &volume->cb_scrub); |
247 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: 0, reason: afs_cb_break_for_creation_regress); |
248 | volume->creation_time = new; |
249 | return 0; |
250 | } |
251 | |
252 | /* |
253 | * Handle a change to the volume update time in the VolSync record. |
254 | */ |
255 | static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume) |
256 | { |
257 | enum afs_cb_break_reason reason = afs_cb_break_no_break; |
258 | time64_t cur = volume->update_time; |
259 | time64_t old = op->pre_volsync.update; |
260 | time64_t new = op->volsync.update; |
261 | |
262 | _enter("%llx,%llx,%llx->%llx" , volume->vid, cur, old, new); |
263 | |
264 | if (cur == TIME64_MIN) { |
265 | volume->update_time = new; |
266 | return; |
267 | } |
268 | |
269 | if (new == cur) |
270 | return; |
271 | |
272 | /* If the volume update time changes in an unexpected way, we need to |
273 | * scrub our caches. For a RW vol, this will advance on every |
274 | * modification op; for a RO/Backup vol, this will advance when the |
275 | * volume is updated to a new snapshot (eg. "vos release"). |
276 | */ |
277 | if (new < old) |
278 | reason = afs_cb_break_for_update_regress; |
279 | |
280 | /* Try to advance the update timestamp from what we had before the |
281 | * operation to what we got back from the server. This should |
282 | * hopefully ensure that in a race between multiple operations only one |
283 | * of them will do this. |
284 | */ |
285 | if (cur == old) { |
286 | if (reason == afs_cb_break_for_update_regress) { |
287 | atomic_inc(v: &volume->cb_scrub); |
288 | trace_afs_cb_v_break(vid: volume->vid, cb_v_break: 0, reason); |
289 | } |
290 | volume->update_time = new; |
291 | } |
292 | } |
293 | |
294 | static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume) |
295 | { |
296 | int ret = 0; |
297 | |
298 | if (likely(op->volsync.creation == volume->creation_time && |
299 | op->volsync.update == volume->update_time)) |
300 | return 0; |
301 | |
302 | mutex_lock(&volume->volsync_lock); |
303 | if (op->volsync.creation != volume->creation_time) { |
304 | ret = afs_update_volume_creation_time(op, volume); |
305 | if (ret < 0) |
306 | goto out; |
307 | } |
308 | if (op->volsync.update != volume->update_time) |
309 | afs_update_volume_update_time(op, volume); |
310 | out: |
311 | mutex_unlock(lock: &volume->volsync_lock); |
312 | return ret; |
313 | } |
314 | |
315 | /* |
316 | * Update the state of a volume, including recording the expiration time of the |
317 | * callback promise. Returns 1 to redo the operation from the start. |
318 | */ |
319 | int afs_update_volume_state(struct afs_operation *op) |
320 | { |
321 | struct afs_server_list *slist = op->server_list; |
322 | struct afs_server_entry *se = &slist->servers[op->server_index]; |
323 | struct afs_callback *cb = &op->file[0].scb.callback; |
324 | struct afs_volume *volume = op->volume; |
325 | unsigned int cb_v_break = atomic_read(v: &volume->cb_v_break); |
326 | unsigned int cb_v_check = atomic_read(v: &volume->cb_v_check); |
327 | int ret; |
328 | |
329 | _enter("%llx" , op->volume->vid); |
330 | |
331 | if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) { |
332 | ret = afs_update_volume_times(op, volume); |
333 | if (ret != 0) { |
334 | _leave(" = %d" , ret); |
335 | return ret; |
336 | } |
337 | } |
338 | |
339 | if (op->cb_v_break == cb_v_break && |
340 | (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) { |
341 | time64_t expires_at = cb->expires_at; |
342 | |
343 | if (!op->file[0].scb.have_cb) |
344 | expires_at = op->file[1].scb.callback.expires_at; |
345 | |
346 | se->cb_expires_at = expires_at; |
347 | volume->cb_expires_at = expires_at; |
348 | } |
349 | if (cb_v_check < op->cb_v_break) |
350 | atomic_cmpxchg(v: &volume->cb_v_check, old: cb_v_check, new: op->cb_v_break); |
351 | return 0; |
352 | } |
353 | |
354 | /* |
355 | * mark the data attached to an inode as obsolete due to a write on the server |
356 | * - might also want to ditch all the outstanding writes and dirty pages |
357 | */ |
358 | static void afs_zap_data(struct afs_vnode *vnode) |
359 | { |
360 | _enter("{%llx:%llu}" , vnode->fid.vid, vnode->fid.vnode); |
361 | |
362 | afs_invalidate_cache(vnode, flags: 0); |
363 | |
364 | /* nuke all the non-dirty pages that aren't locked, mapped or being |
365 | * written back in a regular file and completely discard the pages in a |
366 | * directory or symlink */ |
367 | if (S_ISREG(vnode->netfs.inode.i_mode)) |
368 | invalidate_remote_inode(inode: &vnode->netfs.inode); |
369 | else |
370 | invalidate_inode_pages2(mapping: vnode->netfs.inode.i_mapping); |
371 | } |
372 | |
373 | /* |
374 | * validate a vnode/inode |
375 | * - there are several things we need to check |
376 | * - parent dir data changes (rm, rmdir, rename, mkdir, create, link, |
377 | * symlink) |
378 | * - parent dir metadata changed (security changes) |
379 | * - dentry data changed (write, truncate) |
380 | * - dentry metadata changed (security changes) |
381 | */ |
382 | int afs_validate(struct afs_vnode *vnode, struct key *key) |
383 | { |
384 | struct afs_volume *volume = vnode->volume; |
385 | unsigned int cb_ro_snapshot, cb_scrub; |
386 | time64_t deadline = ktime_get_real_seconds() + 10; |
387 | bool zap = false, locked_vol = false; |
388 | int ret; |
389 | |
390 | _enter("{v={%llx:%llu} fl=%lx},%x" , |
391 | vnode->fid.vid, vnode->fid.vnode, vnode->flags, |
392 | key_serial(key)); |
393 | |
394 | if (afs_check_validity(vnode)) |
395 | return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; |
396 | |
397 | ret = down_write_killable(sem: &vnode->validate_lock); |
398 | if (ret < 0) |
399 | goto error; |
400 | |
401 | if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { |
402 | ret = -ESTALE; |
403 | goto error_unlock; |
404 | } |
405 | |
406 | /* Validate a volume after the v_break has changed or the volume |
407 | * callback expired. We only want to do this once per volume per |
408 | * v_break change. The actual work will be done when parsing the |
409 | * status fetch reply. |
410 | */ |
411 | if (volume->cb_expires_at <= deadline || |
412 | atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break)) { |
413 | ret = mutex_lock_interruptible(&volume->cb_check_lock); |
414 | if (ret < 0) |
415 | goto error_unlock; |
416 | locked_vol = true; |
417 | } |
418 | |
419 | cb_ro_snapshot = atomic_read(v: &volume->cb_ro_snapshot); |
420 | cb_scrub = atomic_read(v: &volume->cb_scrub); |
421 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || |
422 | vnode->cb_scrub != cb_scrub) |
423 | unmap_mapping_pages(mapping: vnode->netfs.inode.i_mapping, start: 0, nr: 0, even_cows: false); |
424 | |
425 | if (vnode->cb_ro_snapshot != cb_ro_snapshot || |
426 | vnode->cb_scrub != cb_scrub || |
427 | volume->cb_expires_at <= deadline || |
428 | atomic_read(v: &volume->cb_v_check) != atomic_read(v: &volume->cb_v_break) || |
429 | atomic64_read(v: &vnode->cb_expires_at) <= deadline |
430 | ) { |
431 | ret = afs_fetch_status(vnode, key, false, NULL); |
432 | if (ret < 0) { |
433 | if (ret == -ENOENT) { |
434 | set_bit(AFS_VNODE_DELETED, addr: &vnode->flags); |
435 | ret = -ESTALE; |
436 | } |
437 | goto error_unlock; |
438 | } |
439 | |
440 | _debug("new promise [fl=%lx]" , vnode->flags); |
441 | } |
442 | |
443 | /* We can drop the volume lock now as. */ |
444 | if (locked_vol) { |
445 | mutex_unlock(lock: &volume->cb_check_lock); |
446 | locked_vol = false; |
447 | } |
448 | |
449 | cb_ro_snapshot = atomic_read(v: &volume->cb_ro_snapshot); |
450 | cb_scrub = atomic_read(v: &volume->cb_scrub); |
451 | _debug("vnode inval %x==%x %x==%x" , |
452 | vnode->cb_ro_snapshot, cb_ro_snapshot, |
453 | vnode->cb_scrub, cb_scrub); |
454 | if (vnode->cb_scrub != cb_scrub) |
455 | zap = true; |
456 | vnode->cb_ro_snapshot = cb_ro_snapshot; |
457 | vnode->cb_scrub = cb_scrub; |
458 | |
459 | /* if the vnode's data version number changed then its contents are |
460 | * different */ |
461 | zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, addr: &vnode->flags); |
462 | if (zap) |
463 | afs_zap_data(vnode); |
464 | up_write(sem: &vnode->validate_lock); |
465 | _leave(" = 0" ); |
466 | return 0; |
467 | |
468 | error_unlock: |
469 | if (locked_vol) |
470 | mutex_unlock(lock: &volume->cb_check_lock); |
471 | up_write(sem: &vnode->validate_lock); |
472 | error: |
473 | _leave(" = %d" , ret); |
474 | return ret; |
475 | } |
476 | |