reap.c source code [linux/fs/xfs/scrub/reap.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2022-2023 Oracle. All Rights Reserved.
4	* Author: Darrick J. Wong <djwong@kernel.org>
5	*/
6	#include "xfs.h"
7	#include "xfs_fs.h"
8	#include "xfs_shared.h"
9	#include "xfs_format.h"
10	#include "xfs_trans_resv.h"
11	#include "xfs_mount.h"
12	#include "xfs_btree.h"
13	#include "xfs_log_format.h"
14	#include "xfs_trans.h"
15	#include "xfs_sb.h"
16	#include "xfs_inode.h"
17	#include "xfs_alloc.h"
18	#include "xfs_alloc_btree.h"
19	#include "xfs_ialloc.h"
20	#include "xfs_ialloc_btree.h"
21	#include "xfs_rmap.h"
22	#include "xfs_rmap_btree.h"
23	#include "xfs_refcount.h"
24	#include "xfs_refcount_btree.h"
25	#include "xfs_extent_busy.h"
26	#include "xfs_ag.h"
27	#include "xfs_ag_resv.h"
28	#include "xfs_quota.h"
29	#include "xfs_qm.h"
30	#include "xfs_bmap.h"
31	#include "xfs_da_format.h"
32	#include "xfs_da_btree.h"
33	#include "xfs_attr.h"
34	#include "xfs_attr_remote.h"
35	#include "xfs_defer.h"
36	#include "scrub/scrub.h"
37	#include "scrub/common.h"
38	#include "scrub/trace.h"
39	#include "scrub/repair.h"
40	#include "scrub/bitmap.h"
41	#include "scrub/agb_bitmap.h"
42	#include "scrub/fsb_bitmap.h"
43	#include "scrub/reap.h"
44
45	/*
46	* Disposal of Blocks from Old Metadata
47	*
48	* Now that we've constructed a new btree to replace the damaged one, we want
49	* to dispose of the blocks that (we think) the old btree was using.
50	* Previously, we used the rmapbt to collect the extents (bitmap) with the
51	* rmap owner corresponding to the tree we rebuilt, collected extents for any
52	* blocks with the same rmap owner that are owned by another data structure
53	* (sublist), and subtracted sublist from bitmap. In theory the extents
54	* remaining in bitmap are the old btree's blocks.
55	*
56	* Unfortunately, it's possible that the btree was crosslinked with other
57	* blocks on disk. The rmap data can tell us if there are multiple owners, so
58	* if the rmapbt says there is an owner of this block other than @oinfo, then
59	* the block is crosslinked. Remove the reverse mapping and continue.
60	*
61	* If there is one rmap record, we can free the block, which removes the
62	* reverse mapping but doesn't add the block to the free space. Our repair
63	* strategy is to hope the other metadata objects crosslinked on this block
64	* will be rebuilt (atop different blocks), thereby removing all the cross
65	* links.
66	*
67	* If there are no rmap records at all, we also free the block. If the btree
68	* being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
69	* supposed to be a rmap record and everything is ok. For other btrees there
70	* had to have been an rmap entry for the block to have ended up on @bitmap,
71	* so if it's gone now there's something wrong and the fs will shut down.
72	*
73	* Note: If there are multiple rmap records with only the same rmap owner as
74	* the btree we're trying to rebuild and the block is indeed owned by another
75	* data structure with the same rmap owner, then the block will be in sublist
76	* and therefore doesn't need disposal. If there are multiple rmap records
77	* with only the same rmap owner but the block is not owned by something with
78	* the same rmap owner, the block will be freed.
79	*
80	* The caller is responsible for locking the AG headers/inode for the entire
81	* rebuild operation so that nothing else can sneak in and change the incore
82	* state while we're not looking. We must also invalidate any buffers
83	* associated with @bitmap.
84	*/
85
86	/ Information about reaping extents after a repair. /
87	struct xreap_state {
88	struct xfs_scrub *sc;
89
90	/ Reverse mapping owner and metadata reservation type. /
91	const struct xfs_owner_info *oinfo;
92	enum xfs_ag_resv_type resv;
93
94	/ If true, roll the transaction before reaping the next extent. /
95	bool force_roll;
96
97	/ Number of deferred reaps attached to the current transaction. /
98	unsigned int deferred;
99
100	/ Number of invalidated buffers logged to the current transaction. /
101	unsigned int invalidated;
102
103	/ Number of deferred reaps queued during the whole reap sequence. /
104	unsigned long long total_deferred;
105	};
106
107	/ Put a block back on the AGFL. /
108	STATIC int
109	xreap_put_freelist(
110	struct xfs_scrub *sc,
111	xfs_agblock_t agbno)
112	{
113	struct xfs_buf *agfl_bp;
114	int error;
115
116	/ Make sure there's space on the freelist. /
117	error = xrep_fix_freelist(sc, `0`);
118	if (error)
119	return error;
120
121	/*
122	* Since we're "freeing" a lost block onto the AGFL, we have to
123	* create an rmap for the block prior to merging it or else other
124	* parts will break.
125	*/
126	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, `1`,
127	&XFS_RMAP_OINFO_AG);
128	if (error)
129	return error;
130
131	/ Put the block on the AGFL. /
132	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
133	if (error)
134	return error;
135
136	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
137	agfl_bp, agbno, `0`);
138	if (error)
139	return error;
140	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, `1`,
141	XFS_EXTENT_BUSY_SKIP_DISCARD);
142
143	return `0`;
144	}
145
146	/ Are there any uncommitted reap operations? /
147	static inline bool xreap_dirty(const struct xreap_state *rs)
148	{
149	if (rs->force_roll)
150	return true;
151	if (rs->deferred)
152	return true;
153	if (rs->invalidated)
154	return true;
155	if (rs->total_deferred)
156	return true;
157	return false;
158	}
159
160	#define XREAP_MAX_BINVAL (2048)
161
162	/*
163	* Decide if we want to roll the transaction after reaping an extent. We don't
164	* want to overrun the transaction reservation, so we prohibit more than
165	* 128 EFIs per transaction. For the same reason, we limit the number
166	* of buffer invalidations to 2048.
167	*/
168	static inline bool xreap_want_roll(const struct xreap_state *rs)
169	{
170	if (rs->force_roll)
171	return true;
172	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
173	return true;
174	if (rs->invalidated > XREAP_MAX_BINVAL)
175	return true;
176	return false;
177	}
178
179	static inline void xreap_reset(struct xreap_state *rs)
180	{
181	rs->total_deferred += rs->deferred;
182	rs->deferred = `0`;
183	rs->invalidated = `0`;
184	rs->force_roll = false;
185	}
186
187	#define XREAP_MAX_DEFER_CHAIN (2048)
188
189	/*
190	* Decide if we want to finish the deferred ops that are attached to the scrub
191	* transaction. We don't want to queue huge chains of deferred ops because
192	* that can consume a lot of log space and kernel memory. Hence we trigger a
193	* xfs_defer_finish if there are more than 2048 deferred reap operations or the
194	* caller did some real work.
195	*/
196	static inline bool
197	xreap_want_defer_finish(const struct xreap_state *rs)
198	{
199	if (rs->force_roll)
200	return true;
201	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
202	return true;
203	return false;
204	}
205
206	static inline void xreap_defer_finish_reset(struct xreap_state *rs)
207	{
208	rs->total_deferred = `0`;
209	rs->deferred = `0`;
210	rs->invalidated = `0`;
211	rs->force_roll = false;
212	}
213
214	/ Try to invalidate the incore buffers for an extent that we're freeing. /
215	STATIC void
216	xreap_agextent_binval(
217	struct xreap_state *rs,
218	xfs_agblock_t agbno,
219	xfs_extlen_t *aglenp)
220	{
221	struct xfs_scrub *sc = rs->sc;
222	struct xfs_perag *pag = sc->sa.pag;
223	struct xfs_mount *mp = sc->mp;
224	xfs_agnumber_t agno = sc->sa.pag->pag_agno;
225	xfs_agblock_t agbno_next = agbno + *aglenp;
226	xfs_agblock_t bno = agbno;
227
228	/*
229	* Avoid invalidating AG headers and post-EOFS blocks because we never
230	* own those.
231	*/
232	if (!xfs_verify_agbno(pag, agbno) \|\|
233	!xfs_verify_agbno(pag, agbno_next - `1`))
234	return;
235
236	/*
237	* If there are incore buffers for these blocks, invalidate them. We
238	* assume that the lack of any other known owners means that the buffer
239	* can be locked without risk of deadlocking. The buffer cache cannot
240	* detect aliasing, so employ nested loops to scan for incore buffers
241	* of any plausible size.
242	*/
243	while (bno < agbno_next) {
244	xfs_agblock_t fsbcount;
245	xfs_agblock_t max_fsbs;
246
247	/*
248	* Max buffer size is the max remote xattr buffer size, which
249	* is one fs block larger than 64k.
250	*/
251	max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
252	xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
253
254	for (fsbcount = `1`; fsbcount <= max_fsbs; fsbcount++) {
255	struct xfs_buf *bp = NULL;
256	xfs_daddr_t daddr;
257	int error;
258
259	daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
260	error = xfs_buf_incore(mp->m_ddev_targp, daddr,
261	XFS_FSB_TO_BB(mp, fsbcount),
262	XBF_LIVESCAN, &bp);
263	if (error)
264	continue;
265
266	xfs_trans_bjoin(sc->tp, bp);
267	xfs_trans_binval(sc->tp, bp);
268	rs->invalidated++;
269
270	/*
271	* Stop invalidating if we've hit the limit; we should
272	* still have enough reservation left to free however
273	* far we've gotten.
274	*/
275	if (rs->invalidated > XREAP_MAX_BINVAL) {
276	*aglenp -= agbno_next - bno;
277	goto out;
278	}
279	}
280
281	bno++;
282	}
283
284	out:
285	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
286	}
287
288	/*
289	* Figure out the longest run of blocks that we can dispose of with a single
290	* call. Cross-linked blocks should have their reverse mappings removed, but
291	* single-owner extents can be freed. AGFL blocks can only be put back one at
292	* a time.
293	*/
294	STATIC int
295	xreap_agextent_select(
296	struct xreap_state *rs,
297	xfs_agblock_t agbno,
298	xfs_agblock_t agbno_next,
299	bool *crosslinked,
300	xfs_extlen_t *aglenp)
301	{
302	struct xfs_scrub *sc = rs->sc;
303	struct xfs_btree_cur *cur;
304	xfs_agblock_t bno = agbno + `1`;
305	xfs_extlen_t len = `1`;
306	int error;
307
308	/*
309	* Determine if there are any other rmap records covering the first
310	* block of this extent. If so, the block is crosslinked.
311	*/
312	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
313	sc->sa.pag);
314	error = xfs_rmap_has_other_keys(cur, agbno, `1`, rs->oinfo,
315	crosslinked);
316	if (error)
317	goto out_cur;
318
319	/ AGFL blocks can only be deal with one at a time. /
320	if (rs->resv == XFS_AG_RESV_AGFL)
321	goto out_found;
322
323	/*
324	* Figure out how many of the subsequent blocks have the same crosslink
325	* status.
326	*/
327	while (bno < agbno_next) {
328	bool also_crosslinked;
329
330	error = xfs_rmap_has_other_keys(cur, bno, `1`, rs->oinfo,
331	&also_crosslinked);
332	if (error)
333	goto out_cur;
334
335	if (*crosslinked != also_crosslinked)
336	break;
337
338	len++;
339	bno++;
340	}
341
342	out_found:
343	*aglenp = len;
344	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
345	out_cur:
346	xfs_btree_del_cursor(cur, error);
347	return error;
348	}
349
350	/*
351	* Dispose of as much of the beginning of this AG extent as possible. The
352	* number of blocks disposed of will be returned in @aglenp.
353	*/
354	STATIC int
355	xreap_agextent_iter(
356	struct xreap_state *rs,
357	xfs_agblock_t agbno,
358	xfs_extlen_t *aglenp,
359	bool crosslinked)
360	{
361	struct xfs_scrub *sc = rs->sc;
362	xfs_fsblock_t fsbno;
363	int error = `0`;
364
365	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
366
367	/*
368	* If there are other rmappings, this block is cross linked and must
369	* not be freed. Remove the reverse mapping and move on. Otherwise,
370	* we were the only owner of the block, so free the extent, which will
371	* also remove the rmap.
372	*
373	* XXX: XFS doesn't support detecting the case where a single block
374	* metadata structure is crosslinked with a multi-block structure
375	* because the buffer cache doesn't detect aliasing problems, so we
376	* can't fix 100% of crosslinking problems (yet). The verifiers will
377	* blow on writeout, the filesystem will shut down, and the admin gets
378	* to run xfs_repair.
379	*/
380	if (crosslinked) {
381	trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
382
383	rs->force_roll = true;
384
385	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
386	/*
387	* If we're unmapping CoW staging extents, remove the
388	* records from the refcountbt, which will remove the
389	* rmap record as well.
390	*/
391	xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
392	return `0`;
393	}
394
395	return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
396	*aglenp, rs->oinfo);
397	}
398
399	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
400
401	/*
402	* Invalidate as many buffers as we can, starting at agbno. If this
403	* function sets *aglenp to zero, the transaction is full of logged
404	* buffer invalidations, so we need to return early so that we can
405	* roll and retry.
406	*/
407	xreap_agextent_binval(rs, agbno, aglenp);
408	if (*aglenp == `0`) {
409	ASSERT(xreap_want_roll(rs));
410	return `0`;
411	}
412
413	/*
414	* If we're getting rid of CoW staging extents, use deferred work items
415	* to remove the refcountbt records (which removes the rmap records)
416	* and free the extent. We're not worried about the system going down
417	* here because log recovery walks the refcount btree to clean out the
418	* CoW staging extents.
419	*/
420	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
421	ASSERT(rs->resv == XFS_AG_RESV_NONE);
422
423	xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
424	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
425	rs->resv, true);
426	if (error)
427	return error;
428
429	rs->force_roll = true;
430	return `0`;
431	}
432
433	/ Put blocks back on the AGFL one at a time. /
434	if (rs->resv == XFS_AG_RESV_AGFL) {
435	ASSERT(*aglenp == `1`);
436	error = xreap_put_freelist(sc, agbno);
437	if (error)
438	return error;
439
440	rs->force_roll = true;
441	return `0`;
442	}
443
444	/*
445	* Use deferred frees to get rid of the old btree blocks to try to
446	* minimize the window in which we could crash and lose the old blocks.
447	* Add a defer ops barrier every other extent to avoid stressing the
448	* system with large EFIs.
449	*/
450	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
451	rs->resv, true);
452	if (error)
453	return error;
454
455	rs->deferred++;
456	if (rs->deferred % `2` == `0`)
457	xfs_defer_add_barrier(sc->tp);
458	return `0`;
459	}
460
461	/*
462	* Break an AG metadata extent into sub-extents by fate (crosslinked, not
463	* crosslinked), and dispose of each sub-extent separately.
464	*/
465	STATIC int
466	xreap_agmeta_extent(
467	uint32_t agbno,
468	uint32_t len,
469	void *priv)
470	{
471	struct xreap_state *rs = priv;
472	struct xfs_scrub *sc = rs->sc;
473	xfs_agblock_t agbno_next = agbno + len;
474	int error = `0`;
475
476	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
477	ASSERT(sc->ip == NULL);
478
479	while (agbno < agbno_next) {
480	xfs_extlen_t aglen;
481	bool crosslinked;
482
483	error = xreap_agextent_select(rs, agbno, agbno_next,
484	&crosslinked, &aglen);
485	if (error)
486	return error;
487
488	error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
489	if (error)
490	return error;
491
492	if (xreap_want_defer_finish(rs)) {
493	error = xrep_defer_finish(sc);
494	if (error)
495	return error;
496	xreap_defer_finish_reset(rs);
497	} else if (xreap_want_roll(rs)) {
498	error = xrep_roll_ag_trans(sc);
499	if (error)
500	return error;
501	xreap_reset(rs);
502	}
503
504	agbno += aglen;
505	}
506
507	return `0`;
508	}
509
510	/ Dispose of every block of every AG metadata extent in the bitmap. /
511	int
512	xrep_reap_agblocks(
513	struct xfs_scrub *sc,
514	struct xagb_bitmap *bitmap,
515	const struct xfs_owner_info *oinfo,
516	enum xfs_ag_resv_type type)
517	{
518	struct xreap_state rs = {
519	.sc = sc,
520	.oinfo = oinfo,
521	.resv = type,
522	};
523	int error;
524
525	ASSERT(xfs_has_rmapbt(sc->mp));
526	ASSERT(sc->ip == NULL);
527
528	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
529	if (error)
530	return error;
531
532	if (xreap_dirty(&rs))
533	return xrep_defer_finish(sc);
534
535	return `0`;
536	}
537
538	/*
539	* Break a file metadata extent into sub-extents by fate (crosslinked, not
540	* crosslinked), and dispose of each sub-extent separately. The extent must
541	* not cross an AG boundary.
542	*/
543	STATIC int
544	xreap_fsmeta_extent(
545	uint64_t fsbno,
546	uint64_t len,
547	void *priv)
548	{
549	struct xreap_state *rs = priv;
550	struct xfs_scrub *sc = rs->sc;
551	xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
552	xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
553	xfs_agblock_t agbno_next = agbno + len;
554	int error = `0`;
555
556	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
557	ASSERT(sc->ip != NULL);
558	ASSERT(!sc->sa.pag);
559
560	/*
561	* We're reaping blocks after repairing file metadata, which means that
562	* we have to init the xchk_ag structure ourselves.
563	*/
564	sc->sa.pag = xfs_perag_get(sc->mp, agno);
565	if (!sc->sa.pag)
566	return -EFSCORRUPTED;
567
568	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, `0`, &sc->sa.agf_bp);
569	if (error)
570	goto out_pag;
571
572	while (agbno < agbno_next) {
573	xfs_extlen_t aglen;
574	bool crosslinked;
575
576	error = xreap_agextent_select(rs, agbno, agbno_next,
577	&crosslinked, &aglen);
578	if (error)
579	goto out_agf;
580
581	error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
582	if (error)
583	goto out_agf;
584
585	if (xreap_want_defer_finish(rs)) {
586	/*
587	* Holds the AGF buffer across the deferred chain
588	* processing.
589	*/
590	error = xrep_defer_finish(sc);
591	if (error)
592	goto out_agf;
593	xreap_defer_finish_reset(rs);
594	} else if (xreap_want_roll(rs)) {
595	/*
596	* Hold the AGF buffer across the transaction roll so
597	* that we don't have to reattach it to the scrub
598	* context.
599	*/
600	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
601	error = xfs_trans_roll_inode(&sc->tp, sc->ip);
602	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
603	if (error)
604	goto out_agf;
605	xreap_reset(rs);
606	}
607
608	agbno += aglen;
609	}
610
611	out_agf:
612	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
613	sc->sa.agf_bp = NULL;
614	out_pag:
615	xfs_perag_put(sc->sa.pag);
616	sc->sa.pag = NULL;
617	return error;
618	}
619
620	/*
621	* Dispose of every block of every fs metadata extent in the bitmap.
622	* Do not use this to dispose of the mappings in an ondisk inode fork.
623	*/
624	int
625	xrep_reap_fsblocks(
626	struct xfs_scrub *sc,
627	struct xfsb_bitmap *bitmap,
628	const struct xfs_owner_info *oinfo)
629	{
630	struct xreap_state rs = {
631	.sc = sc,
632	.oinfo = oinfo,
633	.resv = XFS_AG_RESV_NONE,
634	};
635	int error;
636
637	ASSERT(xfs_has_rmapbt(sc->mp));
638	ASSERT(sc->ip != NULL);
639
640	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
641	if (error)
642	return error;
643
644	if (xreap_dirty(&rs))
645	return xrep_defer_finish(sc);
646
647	return `0`;
648	}
649

source code of linux/fs/xfs/scrub/reap.c