fscounters.c source code [linux/fs/xfs/scrub/fscounters.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
4	* Author: Darrick J. Wong <djwong@kernel.org>
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_trans_resv.h"
11	#include "xfs_log_format.h"
12	#include "xfs_trans.h"
13	#include "xfs_mount.h"
14	#include "xfs_alloc.h"
15	#include "xfs_ialloc.h"
16	#include "xfs_health.h"
17	#include "xfs_btree.h"
18	#include "xfs_ag.h"
19	#include "xfs_rtbitmap.h"
20	#include "xfs_inode.h"
21	#include "xfs_icache.h"
22	#include "scrub/scrub.h"
23	#include "scrub/common.h"
24	#include "scrub/trace.h"
25	#include "scrub/fscounters.h"
26
27	/*
28	* FS Summary Counters
29	* ===================
30	*
31	* The basics of filesystem summary counter checking are that we iterate the
32	* AGs counting the number of free blocks, free space btree blocks, per-AG
33	* reservations, inodes, delayed allocation reservations, and free inodes.
34	* Then we compare what we computed against the in-core counters.
35	*
36	* However, the reality is that summary counters are a tricky beast to check.
37	* While we /could/ freeze the filesystem and scramble around the AGs counting
38	* the free blocks, in practice we prefer not to do that for a scan because
39	* freezing is costly. To get around this, we added a per-cpu counter of the
40	* delalloc reservations so that we can rotor around the AGs relatively
41	* quickly, and we allow the counts to be slightly off because we're not taking
42	* any locks while we do this.
43	*
44	* So the first thing we do is warm up the buffer cache in the setup routine by
45	* walking all the AGs to make sure the incore per-AG structure has been
46	* initialized. The expected value calculation then iterates the incore per-AG
47	* structures as quickly as it can. We snapshot the percpu counters before and
48	* after this operation and use the difference in counter values to guess at
49	* our tolerance for mismatch between expected and actual counter values.
50	*/
51
/*
 * Minimum slack for the lockless summary counter checks.
 *
 * The expected value computation is lockless but only browses incore values,
 * so the percpu counters should be fairly close to each other.  However, we
 * always allow ourselves to be off by at least this (arbitrary) amount.  In
 * this file it bounds how far ifree may exceed icount before
 * xchk_fscounters() flags the counters as corrupt.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE (512)
58
59	/*
60	* Make sure the per-AG structure has been initialized from the on-disk header
61	* contents and trust that the incore counters match the ondisk counters. (The
62	* AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
63	* summary counters after checking all AG headers). Do this from the setup
64	* function so that the inner AG aggregation loop runs as quickly as possible.
65	*
66	* This function runs during the setup phase /before/ we start checking any
67	* metadata.
68	*/
69	STATIC int
70	xchk_fscount_warmup(
71	struct xfs_scrub *sc)
72	{
73	struct xfs_mount *mp = sc->mp;
74	struct xfs_buf *agi_bp = NULL;
75	struct xfs_buf *agf_bp = NULL;
76	struct xfs_perag *pag = NULL;
77	xfs_agnumber_t agno;
78	int error = `0`;
79
80	for_each_perag(mp, agno, pag) {
81	if (xchk_should_terminate(sc, &error))
82	break;
83	if (xfs_perag_initialised_agi(pag) &&
84	xfs_perag_initialised_agf(pag))
85	continue;
86
87	/ Lock both AG headers. /
88	error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
89	if (error)
90	break;
91	error = xfs_alloc_read_agf(pag, sc->tp, `0`, &agf_bp);
92	if (error)
93	break;
94
95	/*
96	* These are supposed to be initialized by the header read
97	* function.
98	*/
99	if (!xfs_perag_initialised_agi(pag) \|\|
100	!xfs_perag_initialised_agf(pag)) {
101	error = -EFSCORRUPTED;
102	break;
103	}
104
105	xfs_buf_relse(agf_bp);
106	agf_bp = NULL;
107	xfs_buf_relse(agi_bp);
108	agi_bp = NULL;
109	}
110
111	if (agf_bp)
112	xfs_buf_relse(agf_bp);
113	if (agi_bp)
114	xfs_buf_relse(agi_bp);
115	if (pag)
116	xfs_perag_rele(pag);
117	return error;
118	}
119
120	static inline int
121	xchk_fsfreeze(
122	struct xfs_scrub *sc)
123	{
124	int error;
125
126	error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
127	trace_xchk_fsfreeze(sc, error);
128	return error;
129	}
130
131	static inline int
132	xchk_fsthaw(
133	struct xfs_scrub *sc)
134	{
135	int error;
136
137	/ This should always succeed, we have a kernel freeze /
138	error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
139	trace_xchk_fsthaw(sc, error);
140	return error;
141	}
142
143	/*
144	* We couldn't stabilize the filesystem long enough to sample all the variables
145	* that comprise the summary counters and compare them to the percpu counters.
146	* We need to disable all writer threads, which means taking the first two
147	* freeze levels to put userspace to sleep, and the third freeze level to
148	* prevent background threads from starting new transactions. Take one level
149	* more to prevent other callers from unfreezing the filesystem while we run.
150	*/
151	STATIC int
152	xchk_fscounters_freeze(
153	struct xfs_scrub *sc)
154	{
155	struct xchk_fscounters *fsc = sc->buf;
156	int error = `0`;
157
158	if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
159	sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
160	mnt_drop_write_file(sc->file);
161	}
162
163	/ Try to grab a kernel freeze. /
164	while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
165	if (xchk_should_terminate(sc, &error))
166	return error;
167
168	delay(HZ / `10`);
169	}
170	if (error)
171	return error;
172
173	fsc->frozen = true;
174	return `0`;
175	}
176
177	/ Thaw the filesystem after checking or repairing fscounters. /
178	STATIC void
179	xchk_fscounters_cleanup(
180	void *buf)
181	{
182	struct xchk_fscounters *fsc = buf;
183	struct xfs_scrub *sc = fsc->sc;
184	int error;
185
186	if (!fsc->frozen)
187	return;
188
189	error = xchk_fsthaw(sc);
190	if (error)
191	xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
192	else
193	fsc->frozen = false;
194	}
195
196	int
197	xchk_setup_fscounters(
198	struct xfs_scrub *sc)
199	{
200	struct xchk_fscounters *fsc;
201	int error;
202
203	/*
204	* If the AGF doesn't track btreeblks, we have to lock the AGF to count
205	* btree block usage by walking the actual btrees.
206	*/
207	if (!xfs_has_lazysbcount(sc->mp))
208	xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
209
210	sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
211	if (!sc->buf)
212	return -ENOMEM;
213	sc->buf_cleanup = xchk_fscounters_cleanup;
214	fsc = sc->buf;
215	fsc->sc = sc;
216
217	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
218
219	/ We must get the incore counters set up before we can proceed. /
220	error = xchk_fscount_warmup(sc);
221	if (error)
222	return error;
223
224	/*
225	* Pause all writer activity in the filesystem while we're scrubbing to
226	* reduce the likelihood of background perturbations to the counters
227	* throwing off our calculations.
228	*
229	* If we're repairing, we need to prevent any other thread from
230	* changing the global fs summary counters while we're repairing them.
231	* This requires the fs to be frozen, which will disable background
232	* reclaim and purge all inactive inodes.
233	*/
234	if ((sc->flags & XCHK_TRY_HARDER) \|\| xchk_could_repair(sc)) {
235	error = xchk_fscounters_freeze(sc);
236	if (error)
237	return error;
238	}
239
240	return xchk_trans_alloc_empty(sc);
241	}
242
243	/*
244	* Part 1: Collecting filesystem summary counts. For each AG, we add its
245	* summary counts (total inodes, free inodes, free data blocks) to an incore
246	* copy of the overall filesystem summary counts.
247	*
248	* To avoid false corruption reports in part 2, any failure in this part must
249	* set the INCOMPLETE flag even when a negative errno is returned. This care
250	* must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
251	* ECANCELED) that are absorbed into a scrub state flag update by
252	* xchk_*_process_error. Scrub and repair share the same incore data
253	* structures, so the INCOMPLETE flag is critical to prevent a repair based on
254	* insufficient information.
255	*/
256
257	/ Count free space btree blocks manually for pre-lazysbcount filesystems. /
258	static int
259	xchk_fscount_btreeblks(
260	struct xfs_scrub *sc,
261	struct xchk_fscounters *fsc,
262	xfs_agnumber_t agno)
263	{
264	xfs_extlen_t blocks;
265	int error;
266
267	error = xchk_ag_init_existing(sc, agno, &sc->sa);
268	if (error)
269	goto out_free;
270
271	error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
272	if (error)
273	goto out_free;
274	fsc->fdblocks += blocks - `1`;
275
276	error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
277	if (error)
278	goto out_free;
279	fsc->fdblocks += blocks - `1`;
280
281	out_free:
282	xchk_ag_free(sc, &sc->sa);
283	return error;
284	}
285
286	/*
287	* Calculate what the global in-core counters ought to be from the incore
288	* per-AG structure. Callers can compare this to the actual in-core counters
289	* to estimate by how much both in-core and on-disk counters need to be
290	* adjusted.
291	*/
292	STATIC int
293	xchk_fscount_aggregate_agcounts(
294	struct xfs_scrub *sc,
295	struct xchk_fscounters *fsc)
296	{
297	struct xfs_mount *mp = sc->mp;
298	struct xfs_perag *pag;
299	uint64_t delayed;
300	xfs_agnumber_t agno;
301	int tries = `8`;
302	int error = `0`;
303
304	retry:
305	fsc->icount = `0`;
306	fsc->ifree = `0`;
307	fsc->fdblocks = `0`;
308
309	for_each_perag(mp, agno, pag) {
310	if (xchk_should_terminate(sc, &error))
311	break;
312
313	/ This somehow got unset since the warmup? /
314	if (!xfs_perag_initialised_agi(pag) \|\|
315	!xfs_perag_initialised_agf(pag)) {
316	error = -EFSCORRUPTED;
317	break;
318	}
319
320	/ Count all the inodes /
321	fsc->icount += pag->pagi_count;
322	fsc->ifree += pag->pagi_freecount;
323
324	/ Add up the free/freelist/bnobt/cntbt blocks /
325	fsc->fdblocks += pag->pagf_freeblks;
326	fsc->fdblocks += pag->pagf_flcount;
327	if (xfs_has_lazysbcount(sc->mp)) {
328	fsc->fdblocks += pag->pagf_btreeblks;
329	} else {
330	error = xchk_fscount_btreeblks(sc, fsc, agno);
331	if (error)
332	break;
333	}
334
335	/*
336	* Per-AG reservations are taken out of the incore counters,
337	* so they must be left out of the free blocks computation.
338	*/
339	fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
340	fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
341
342	}
343	if (pag)
344	xfs_perag_rele(pag);
345	if (error) {
346	xchk_set_incomplete(sc);
347	return error;
348	}
349
350	/*
351	* The global incore space reservation is taken from the incore
352	* counters, so leave that out of the computation.
353	*/
354	fsc->fdblocks -= mp->m_resblks_avail;
355
356	/*
357	* Delayed allocation reservations are taken out of the incore counters
358	* but not recorded on disk, so leave them and their indlen blocks out
359	* of the computation.
360	*/
361	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
362	fsc->fdblocks -= delayed;
363
364	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
365	delayed);
366
367
368	/ Bail out if the values we compute are totally nonsense. /
369	if (fsc->icount < fsc->icount_min \|\| fsc->icount > fsc->icount_max \|\|
370	fsc->fdblocks > mp->m_sb.sb_dblocks \|\|
371	fsc->ifree > fsc->icount_max)
372	return -EFSCORRUPTED;
373
374	/*
375	* If ifree > icount then we probably had some perturbation in the
376	* counters while we were calculating things. We'll try a few times
377	* to maintain ifree <= icount before giving up.
378	*/
379	if (fsc->ifree > fsc->icount) {
380	if (tries--)
381	goto retry;
382	return -EDEADLOCK;
383	}
384
385	return `0`;
386	}
387
388	#ifdef CONFIG_XFS_RT
389	STATIC int
390	xchk_fscount_add_frextent(
391	struct xfs_mount *mp,
392	struct xfs_trans *tp,
393	const struct xfs_rtalloc_rec *rec,
394	void *priv)
395	{
396	struct xchk_fscounters *fsc = priv;
397	int error = `0`;
398
399	fsc->frextents += rec->ar_extcount;
400
401	xchk_should_terminate(fsc->sc, &error);
402	return error;
403	}
404
405	/ Calculate the number of free realtime extents from the realtime bitmap. /
406	STATIC int
407	xchk_fscount_count_frextents(
408	struct xfs_scrub *sc,
409	struct xchk_fscounters *fsc)
410	{
411	struct xfs_mount *mp = sc->mp;
412	int error;
413
414	fsc->frextents = `0`;
415	if (!xfs_has_realtime(mp))
416	return `0`;
417
418	xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED \| XFS_ILOCK_RTBITMAP);
419	error = xfs_rtalloc_query_all(sc->mp, sc->tp,
420	xchk_fscount_add_frextent, fsc);
421	if (error) {
422	xchk_set_incomplete(sc);
423	goto out_unlock;
424	}
425
426	out_unlock:
427	xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED \| XFS_ILOCK_RTBITMAP);
428	return error;
429	}
430	#else
431	STATIC int
432	xchk_fscount_count_frextents(
433	struct xfs_scrub *sc,
434	struct xchk_fscounters *fsc)
435	{
436	fsc->frextents = `0`;
437	return `0`;
438	}
439	#endif /* CONFIG_XFS_RT */
440
441	/*
442	* Part 2: Comparing filesystem summary counters. All we have to do here is
443	* sum the percpu counters and compare them to what we've observed.
444	*/
445
446	/*
447	* Is the @counter reasonably close to the @expected value?
448	*
449	* We neither locked nor froze anything in the filesystem while aggregating the
450	* per-AG data to compute the @expected value, which means that the counter
451	* could have changed. We know the @old_value of the summation of the counter
452	* before the aggregation, and we re-sum the counter now. If the expected
453	* value falls between the two summations, we're ok.
454	*
455	* Otherwise, we /might/ have a problem. If the change in the summations is
456	* more than we want to tolerate, the filesystem is probably busy and we should
457	* just send back INCOMPLETE and see if userspace will try again.
458	*
459	* If we're repairing then we require an exact match.
460	*/
461	static inline bool
462	xchk_fscount_within_range(
463	struct xfs_scrub *sc,
464	const int64_t old_value,
465	struct percpu_counter *counter,
466	uint64_t expected)
467	{
468	int64_t min_value, max_value;
469	int64_t curr_value = percpu_counter_sum(counter);
470
471	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
472	old_value);
473
474	/ Negative values are always wrong. /
475	if (curr_value < `0`)
476	return false;
477
478	/ Exact matches are always ok. /
479	if (curr_value == expected)
480	return true;
481
482	/ We require exact matches when repair is running. /
483	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
484	return false;
485
486	min_value = min(old_value, curr_value);
487	max_value = max(old_value, curr_value);
488
489	/ Within the before-and-after range is ok. /
490	if (expected >= min_value && expected <= max_value)
491	return true;
492
493	/ Everything else is bad. /
494	return false;
495	}
496
497	/ Check the superblock counters. /
498	int
499	xchk_fscounters(
500	struct xfs_scrub *sc)
501	{
502	struct xfs_mount *mp = sc->mp;
503	struct xchk_fscounters *fsc = sc->buf;
504	int64_t icount, ifree, fdblocks, frextents;
505	bool try_again = false;
506	int error;
507
508	/ Snapshot the percpu counters. /
509	icount = percpu_counter_sum(&mp->m_icount);
510	ifree = percpu_counter_sum(&mp->m_ifree);
511	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
512	frextents = percpu_counter_sum(&mp->m_frextents);
513
514	/ No negative values, please! /
515	if (icount < `0` \|\| ifree < `0`)
516	xchk_set_corrupt(sc);
517
518	/*
519	* If the filesystem is not frozen, the counter summation calls above
520	* can race with xfs_mod_freecounter, which subtracts a requested space
521	* reservation from the counter and undoes the subtraction if that made
522	* the counter go negative. Therefore, it's possible to see negative
523	* values here, and we should only flag that as a corruption if we
524	* froze the fs. This is much more likely to happen with frextents
525	* since there are no reserved pools.
526	*/
527	if (fdblocks < `0` \|\| frextents < `0`) {
528	if (!fsc->frozen)
529	return -EDEADLOCK;
530
531	xchk_set_corrupt(sc);
532	return `0`;
533	}
534
535	/ See if icount is obviously wrong. /
536	if (icount < fsc->icount_min \|\| icount > fsc->icount_max)
537	xchk_set_corrupt(sc);
538
539	/ See if fdblocks is obviously wrong. /
540	if (fdblocks > mp->m_sb.sb_dblocks)
541	xchk_set_corrupt(sc);
542
543	/ See if frextents is obviously wrong. /
544	if (frextents > mp->m_sb.sb_rextents)
545	xchk_set_corrupt(sc);
546
547	/*
548	* If ifree exceeds icount by more than the minimum variance then
549	* something's probably wrong with the counters.
550	*/
551	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
552	xchk_set_corrupt(sc);
553
554	/ Walk the incore AG headers to calculate the expected counters. /
555	error = xchk_fscount_aggregate_agcounts(sc, fsc);
556	if (!xchk_process_error(sc, `0`, XFS_SB_BLOCK(mp), &error))
557	return error;
558
559	/ Count the free extents counter for rt volumes. /
560	error = xchk_fscount_count_frextents(sc, fsc);
561	if (!xchk_process_error(sc, `0`, XFS_SB_BLOCK(mp), &error))
562	return error;
563	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
564	return `0`;
565
566	/*
567	* Compare the in-core counters with whatever we counted. If the fs is
568	* frozen, we treat the discrepancy as a corruption because the freeze
569	* should have stabilized the counter values. Otherwise, we need
570	* userspace to call us back having granted us freeze permission.
571	*/
572	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
573	fsc->icount)) {
574	if (fsc->frozen)
575	xchk_set_corrupt(sc);
576	else
577	try_again = true;
578	}
579
580	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
581	if (fsc->frozen)
582	xchk_set_corrupt(sc);
583	else
584	try_again = true;
585	}
586
587	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
588	fsc->fdblocks)) {
589	if (fsc->frozen)
590	xchk_set_corrupt(sc);
591	else
592	try_again = true;
593	}
594
595	if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
596	fsc->frextents)) {
597	if (fsc->frozen)
598	xchk_set_corrupt(sc);
599	else
600	try_again = true;
601	}
602
603	if (try_again)
604	return -EDEADLOCK;
605
606	return `0`;
607	}
608

source code of linux/fs/xfs/scrub/fscounters.c