1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_trans.h"
15#include "xfs_sb.h"
16#include "xfs_inode.h"
17#include "xfs_alloc.h"
18#include "xfs_alloc_btree.h"
19#include "xfs_ialloc.h"
20#include "xfs_ialloc_btree.h"
21#include "xfs_rmap.h"
22#include "xfs_rmap_btree.h"
23#include "xfs_refcount.h"
24#include "xfs_refcount_btree.h"
25#include "xfs_extent_busy.h"
26#include "xfs_ag.h"
27#include "xfs_ag_resv.h"
28#include "xfs_quota.h"
29#include "xfs_qm.h"
30#include "xfs_bmap.h"
31#include "xfs_da_format.h"
32#include "xfs_da_btree.h"
33#include "xfs_attr.h"
34#include "xfs_attr_remote.h"
35#include "xfs_defer.h"
36#include "scrub/scrub.h"
37#include "scrub/common.h"
38#include "scrub/trace.h"
39#include "scrub/repair.h"
40#include "scrub/bitmap.h"
41#include "scrub/agb_bitmap.h"
42#include "scrub/fsb_bitmap.h"
43#include "scrub/reap.h"
44
45/*
46 * Disposal of Blocks from Old Metadata
47 *
48 * Now that we've constructed a new btree to replace the damaged one, we want
49 * to dispose of the blocks that (we think) the old btree was using.
50 * Previously, we used the rmapbt to collect the extents (bitmap) with the
51 * rmap owner corresponding to the tree we rebuilt, collected extents for any
52 * blocks with the same rmap owner that are owned by another data structure
53 * (sublist), and subtracted sublist from bitmap. In theory the extents
54 * remaining in bitmap are the old btree's blocks.
55 *
56 * Unfortunately, it's possible that the btree was crosslinked with other
57 * blocks on disk. The rmap data can tell us if there are multiple owners, so
58 * if the rmapbt says there is an owner of this block other than @oinfo, then
59 * the block is crosslinked. Remove the reverse mapping and continue.
60 *
61 * If there is one rmap record, we can free the block, which removes the
62 * reverse mapping but doesn't add the block to the free space. Our repair
63 * strategy is to hope the other metadata objects crosslinked on this block
64 * will be rebuilt (atop different blocks), thereby removing all the cross
65 * links.
66 *
67 * If there are no rmap records at all, we also free the block. If the btree
68 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
69 * supposed to be a rmap record and everything is ok. For other btrees there
70 * had to have been an rmap entry for the block to have ended up on @bitmap,
71 * so if it's gone now there's something wrong and the fs will shut down.
72 *
73 * Note: If there are multiple rmap records with only the same rmap owner as
74 * the btree we're trying to rebuild and the block is indeed owned by another
75 * data structure with the same rmap owner, then the block will be in sublist
76 * and therefore doesn't need disposal. If there are multiple rmap records
77 * with only the same rmap owner but the block is not owned by something with
78 * the same rmap owner, the block will be freed.
79 *
80 * The caller is responsible for locking the AG headers/inode for the entire
81 * rebuild operation so that nothing else can sneak in and change the incore
82 * state while we're not looking. We must also invalidate any buffers
83 * associated with @bitmap.
84 */
85
86/* Information about reaping extents after a repair. */
87struct xreap_state {
88 struct xfs_scrub *sc;
89
90 /* Reverse mapping owner and metadata reservation type. */
91 const struct xfs_owner_info *oinfo;
92 enum xfs_ag_resv_type resv;
93
94 /* If true, roll the transaction before reaping the next extent. */
95 bool force_roll;
96
97 /* Number of deferred reaps attached to the current transaction. */
98 unsigned int deferred;
99
100 /* Number of invalidated buffers logged to the current transaction. */
101 unsigned int invalidated;
102
103 /* Number of deferred reaps queued during the whole reap sequence. */
104 unsigned long long total_deferred;
105};
106
107/* Put a block back on the AGFL. */
108STATIC int
109xreap_put_freelist(
110 struct xfs_scrub *sc,
111 xfs_agblock_t agbno)
112{
113 struct xfs_buf *agfl_bp;
114 int error;
115
116 /* Make sure there's space on the freelist. */
117 error = xrep_fix_freelist(sc, 0);
118 if (error)
119 return error;
120
121 /*
122 * Since we're "freeing" a lost block onto the AGFL, we have to
123 * create an rmap for the block prior to merging it or else other
124 * parts will break.
125 */
126 error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
127 &XFS_RMAP_OINFO_AG);
128 if (error)
129 return error;
130
131 /* Put the block on the AGFL. */
132 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
133 if (error)
134 return error;
135
136 error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
137 agfl_bp, agbno, 0);
138 if (error)
139 return error;
140 xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
141 XFS_EXTENT_BUSY_SKIP_DISCARD);
142
143 return 0;
144}
145
146/* Are there any uncommitted reap operations? */
147static inline bool xreap_dirty(const struct xreap_state *rs)
148{
149 if (rs->force_roll)
150 return true;
151 if (rs->deferred)
152 return true;
153 if (rs->invalidated)
154 return true;
155 if (rs->total_deferred)
156 return true;
157 return false;
158}
159
160#define XREAP_MAX_BINVAL (2048)
161
162/*
163 * Decide if we want to roll the transaction after reaping an extent. We don't
164 * want to overrun the transaction reservation, so we prohibit more than
165 * 128 EFIs per transaction. For the same reason, we limit the number
166 * of buffer invalidations to 2048.
167 */
168static inline bool xreap_want_roll(const struct xreap_state *rs)
169{
170 if (rs->force_roll)
171 return true;
172 if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
173 return true;
174 if (rs->invalidated > XREAP_MAX_BINVAL)
175 return true;
176 return false;
177}
178
179static inline void xreap_reset(struct xreap_state *rs)
180{
181 rs->total_deferred += rs->deferred;
182 rs->deferred = 0;
183 rs->invalidated = 0;
184 rs->force_roll = false;
185}
186
187#define XREAP_MAX_DEFER_CHAIN (2048)
188
189/*
190 * Decide if we want to finish the deferred ops that are attached to the scrub
191 * transaction. We don't want to queue huge chains of deferred ops because
192 * that can consume a lot of log space and kernel memory. Hence we trigger a
193 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
194 * caller did some real work.
195 */
196static inline bool
197xreap_want_defer_finish(const struct xreap_state *rs)
198{
199 if (rs->force_roll)
200 return true;
201 if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
202 return true;
203 return false;
204}
205
206static inline void xreap_defer_finish_reset(struct xreap_state *rs)
207{
208 rs->total_deferred = 0;
209 rs->deferred = 0;
210 rs->invalidated = 0;
211 rs->force_roll = false;
212}
213
214/* Try to invalidate the incore buffers for an extent that we're freeing. */
215STATIC void
216xreap_agextent_binval(
217 struct xreap_state *rs,
218 xfs_agblock_t agbno,
219 xfs_extlen_t *aglenp)
220{
221 struct xfs_scrub *sc = rs->sc;
222 struct xfs_perag *pag = sc->sa.pag;
223 struct xfs_mount *mp = sc->mp;
224 xfs_agnumber_t agno = sc->sa.pag->pag_agno;
225 xfs_agblock_t agbno_next = agbno + *aglenp;
226 xfs_agblock_t bno = agbno;
227
228 /*
229 * Avoid invalidating AG headers and post-EOFS blocks because we never
230 * own those.
231 */
232 if (!xfs_verify_agbno(pag, agbno) ||
233 !xfs_verify_agbno(pag, agbno_next - 1))
234 return;
235
236 /*
237 * If there are incore buffers for these blocks, invalidate them. We
238 * assume that the lack of any other known owners means that the buffer
239 * can be locked without risk of deadlocking. The buffer cache cannot
240 * detect aliasing, so employ nested loops to scan for incore buffers
241 * of any plausible size.
242 */
243 while (bno < agbno_next) {
244 xfs_agblock_t fsbcount;
245 xfs_agblock_t max_fsbs;
246
247 /*
248 * Max buffer size is the max remote xattr buffer size, which
249 * is one fs block larger than 64k.
250 */
251 max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
252 xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
253
254 for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
255 struct xfs_buf *bp = NULL;
256 xfs_daddr_t daddr;
257 int error;
258
259 daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
260 error = xfs_buf_incore(mp->m_ddev_targp, daddr,
261 XFS_FSB_TO_BB(mp, fsbcount),
262 XBF_LIVESCAN, &bp);
263 if (error)
264 continue;
265
266 xfs_trans_bjoin(sc->tp, bp);
267 xfs_trans_binval(sc->tp, bp);
268 rs->invalidated++;
269
270 /*
271 * Stop invalidating if we've hit the limit; we should
272 * still have enough reservation left to free however
273 * far we've gotten.
274 */
275 if (rs->invalidated > XREAP_MAX_BINVAL) {
276 *aglenp -= agbno_next - bno;
277 goto out;
278 }
279 }
280
281 bno++;
282 }
283
284out:
285 trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
286}
287
288/*
289 * Figure out the longest run of blocks that we can dispose of with a single
290 * call. Cross-linked blocks should have their reverse mappings removed, but
291 * single-owner extents can be freed. AGFL blocks can only be put back one at
292 * a time.
293 */
294STATIC int
295xreap_agextent_select(
296 struct xreap_state *rs,
297 xfs_agblock_t agbno,
298 xfs_agblock_t agbno_next,
299 bool *crosslinked,
300 xfs_extlen_t *aglenp)
301{
302 struct xfs_scrub *sc = rs->sc;
303 struct xfs_btree_cur *cur;
304 xfs_agblock_t bno = agbno + 1;
305 xfs_extlen_t len = 1;
306 int error;
307
308 /*
309 * Determine if there are any other rmap records covering the first
310 * block of this extent. If so, the block is crosslinked.
311 */
312 cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
313 sc->sa.pag);
314 error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
315 crosslinked);
316 if (error)
317 goto out_cur;
318
319 /* AGFL blocks can only be deal with one at a time. */
320 if (rs->resv == XFS_AG_RESV_AGFL)
321 goto out_found;
322
323 /*
324 * Figure out how many of the subsequent blocks have the same crosslink
325 * status.
326 */
327 while (bno < agbno_next) {
328 bool also_crosslinked;
329
330 error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
331 &also_crosslinked);
332 if (error)
333 goto out_cur;
334
335 if (*crosslinked != also_crosslinked)
336 break;
337
338 len++;
339 bno++;
340 }
341
342out_found:
343 *aglenp = len;
344 trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
345out_cur:
346 xfs_btree_del_cursor(cur, error);
347 return error;
348}
349
350/*
351 * Dispose of as much of the beginning of this AG extent as possible. The
352 * number of blocks disposed of will be returned in @aglenp.
353 */
354STATIC int
355xreap_agextent_iter(
356 struct xreap_state *rs,
357 xfs_agblock_t agbno,
358 xfs_extlen_t *aglenp,
359 bool crosslinked)
360{
361 struct xfs_scrub *sc = rs->sc;
362 xfs_fsblock_t fsbno;
363 int error = 0;
364
365 fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
366
367 /*
368 * If there are other rmappings, this block is cross linked and must
369 * not be freed. Remove the reverse mapping and move on. Otherwise,
370 * we were the only owner of the block, so free the extent, which will
371 * also remove the rmap.
372 *
373 * XXX: XFS doesn't support detecting the case where a single block
374 * metadata structure is crosslinked with a multi-block structure
375 * because the buffer cache doesn't detect aliasing problems, so we
376 * can't fix 100% of crosslinking problems (yet). The verifiers will
377 * blow on writeout, the filesystem will shut down, and the admin gets
378 * to run xfs_repair.
379 */
380 if (crosslinked) {
381 trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
382
383 rs->force_roll = true;
384
385 if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
386 /*
387 * If we're unmapping CoW staging extents, remove the
388 * records from the refcountbt, which will remove the
389 * rmap record as well.
390 */
391 xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
392 return 0;
393 }
394
395 return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
396 *aglenp, rs->oinfo);
397 }
398
399 trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
400
401 /*
402 * Invalidate as many buffers as we can, starting at agbno. If this
403 * function sets *aglenp to zero, the transaction is full of logged
404 * buffer invalidations, so we need to return early so that we can
405 * roll and retry.
406 */
407 xreap_agextent_binval(rs, agbno, aglenp);
408 if (*aglenp == 0) {
409 ASSERT(xreap_want_roll(rs));
410 return 0;
411 }
412
413 /*
414 * If we're getting rid of CoW staging extents, use deferred work items
415 * to remove the refcountbt records (which removes the rmap records)
416 * and free the extent. We're not worried about the system going down
417 * here because log recovery walks the refcount btree to clean out the
418 * CoW staging extents.
419 */
420 if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
421 ASSERT(rs->resv == XFS_AG_RESV_NONE);
422
423 xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
424 error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
425 rs->resv, true);
426 if (error)
427 return error;
428
429 rs->force_roll = true;
430 return 0;
431 }
432
433 /* Put blocks back on the AGFL one at a time. */
434 if (rs->resv == XFS_AG_RESV_AGFL) {
435 ASSERT(*aglenp == 1);
436 error = xreap_put_freelist(sc, agbno);
437 if (error)
438 return error;
439
440 rs->force_roll = true;
441 return 0;
442 }
443
444 /*
445 * Use deferred frees to get rid of the old btree blocks to try to
446 * minimize the window in which we could crash and lose the old blocks.
447 * Add a defer ops barrier every other extent to avoid stressing the
448 * system with large EFIs.
449 */
450 error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
451 rs->resv, true);
452 if (error)
453 return error;
454
455 rs->deferred++;
456 if (rs->deferred % 2 == 0)
457 xfs_defer_add_barrier(sc->tp);
458 return 0;
459}
460
461/*
462 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
463 * crosslinked), and dispose of each sub-extent separately.
464 */
465STATIC int
466xreap_agmeta_extent(
467 uint32_t agbno,
468 uint32_t len,
469 void *priv)
470{
471 struct xreap_state *rs = priv;
472 struct xfs_scrub *sc = rs->sc;
473 xfs_agblock_t agbno_next = agbno + len;
474 int error = 0;
475
476 ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
477 ASSERT(sc->ip == NULL);
478
479 while (agbno < agbno_next) {
480 xfs_extlen_t aglen;
481 bool crosslinked;
482
483 error = xreap_agextent_select(rs, agbno, agbno_next,
484 &crosslinked, &aglen);
485 if (error)
486 return error;
487
488 error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
489 if (error)
490 return error;
491
492 if (xreap_want_defer_finish(rs)) {
493 error = xrep_defer_finish(sc);
494 if (error)
495 return error;
496 xreap_defer_finish_reset(rs);
497 } else if (xreap_want_roll(rs)) {
498 error = xrep_roll_ag_trans(sc);
499 if (error)
500 return error;
501 xreap_reset(rs);
502 }
503
504 agbno += aglen;
505 }
506
507 return 0;
508}
509
510/* Dispose of every block of every AG metadata extent in the bitmap. */
511int
512xrep_reap_agblocks(
513 struct xfs_scrub *sc,
514 struct xagb_bitmap *bitmap,
515 const struct xfs_owner_info *oinfo,
516 enum xfs_ag_resv_type type)
517{
518 struct xreap_state rs = {
519 .sc = sc,
520 .oinfo = oinfo,
521 .resv = type,
522 };
523 int error;
524
525 ASSERT(xfs_has_rmapbt(sc->mp));
526 ASSERT(sc->ip == NULL);
527
528 error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
529 if (error)
530 return error;
531
532 if (xreap_dirty(&rs))
533 return xrep_defer_finish(sc);
534
535 return 0;
536}
537
538/*
539 * Break a file metadata extent into sub-extents by fate (crosslinked, not
540 * crosslinked), and dispose of each sub-extent separately. The extent must
541 * not cross an AG boundary.
542 */
543STATIC int
544xreap_fsmeta_extent(
545 uint64_t fsbno,
546 uint64_t len,
547 void *priv)
548{
549 struct xreap_state *rs = priv;
550 struct xfs_scrub *sc = rs->sc;
551 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
552 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
553 xfs_agblock_t agbno_next = agbno + len;
554 int error = 0;
555
556 ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
557 ASSERT(sc->ip != NULL);
558 ASSERT(!sc->sa.pag);
559
560 /*
561 * We're reaping blocks after repairing file metadata, which means that
562 * we have to init the xchk_ag structure ourselves.
563 */
564 sc->sa.pag = xfs_perag_get(sc->mp, agno);
565 if (!sc->sa.pag)
566 return -EFSCORRUPTED;
567
568 error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
569 if (error)
570 goto out_pag;
571
572 while (agbno < agbno_next) {
573 xfs_extlen_t aglen;
574 bool crosslinked;
575
576 error = xreap_agextent_select(rs, agbno, agbno_next,
577 &crosslinked, &aglen);
578 if (error)
579 goto out_agf;
580
581 error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
582 if (error)
583 goto out_agf;
584
585 if (xreap_want_defer_finish(rs)) {
586 /*
587 * Holds the AGF buffer across the deferred chain
588 * processing.
589 */
590 error = xrep_defer_finish(sc);
591 if (error)
592 goto out_agf;
593 xreap_defer_finish_reset(rs);
594 } else if (xreap_want_roll(rs)) {
595 /*
596 * Hold the AGF buffer across the transaction roll so
597 * that we don't have to reattach it to the scrub
598 * context.
599 */
600 xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
601 error = xfs_trans_roll_inode(&sc->tp, sc->ip);
602 xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
603 if (error)
604 goto out_agf;
605 xreap_reset(rs);
606 }
607
608 agbno += aglen;
609 }
610
611out_agf:
612 xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
613 sc->sa.agf_bp = NULL;
614out_pag:
615 xfs_perag_put(sc->sa.pag);
616 sc->sa.pag = NULL;
617 return error;
618}
619
620/*
621 * Dispose of every block of every fs metadata extent in the bitmap.
622 * Do not use this to dispose of the mappings in an ondisk inode fork.
623 */
624int
625xrep_reap_fsblocks(
626 struct xfs_scrub *sc,
627 struct xfsb_bitmap *bitmap,
628 const struct xfs_owner_info *oinfo)
629{
630 struct xreap_state rs = {
631 .sc = sc,
632 .oinfo = oinfo,
633 .resv = XFS_AG_RESV_NONE,
634 };
635 int error;
636
637 ASSERT(xfs_has_rmapbt(sc->mp));
638 ASSERT(sc->ip != NULL);
639
640 error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
641 if (error)
642 return error;
643
644 if (xreap_dirty(&rs))
645 return xrep_defer_finish(sc);
646
647 return 0;
648}
649

source code of linux/fs/xfs/scrub/reap.c