// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"

/* Common code for the metadata scrubbers. */

/*
 * Handling operational errors.
 *
 * The *_process_error() family of functions are used to process error return
 * codes from functions called as part of a scrub operation.
 *
 * If there's no error, we return true to tell the caller that it's ok
 * to move on to the next check in its list.
 *
 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
 * caller that something bad happened, and we preserve *error so that
 * the caller can return the *error up the stack to userspace.
 *
 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
 * OFLAG_CORRUPT in sm_flags and the *error is cleared. In other words,
 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
 * not via return codes. We return false to tell the caller that
 * something bad happened. Since the error has been cleared, the caller
 * will (presumably) return that zero and scrubbing will move on to
 * whatever's next.
 *
 * ftrace can be used to record the precise metadata location and the
 * approximate code location of the failed operation.
 */
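
/*
 * Illustrative sketch (not from the original file): a scrubber typically
 * routes every operational error through this family before moving on to
 * the next check. The btree call below is only an example caller:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &has_record);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;	// zero if it was a verifier error
 *	// ...continue with the next check...
 */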

/* Check for operational errors. */
static bool
__xchk_process_error(
	struct xfs_scrub *sc,
	xfs_agnumber_t agno,
	xfs_agblock_t bno,
	int *error,
	__u32 errflag,
	void *ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(
				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
				sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly. Set error to zero and do not continue.
		 */
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
		break;
	}
	return false;
}

bool
xchk_process_error(
	struct xfs_scrub *sc,
	xfs_agnumber_t agno,
	xfs_agblock_t bno,
	int *error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_xref_process_error(
	struct xfs_scrub *sc,
	xfs_agnumber_t agno,
	xfs_agblock_t bno,
	int *error)
{
	return __xchk_process_error(sc, agno, bno, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/* Check for operational errors for a file offset. */
static bool
__xchk_fblock_process_error(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset,
	int *error,
	__u32 errflag,
	void *ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly. Set error to zero and do not continue.
		 */
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		break;
	}
	return false;
}

bool
xchk_fblock_process_error(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset,
	int *error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}

bool
xchk_fblock_xref_process_error(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset,
	int *error)
{
	return __xchk_fblock_process_error(sc, whichfork, offset, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}

/*
 * Handling scrub corruption/optimization/warning checks.
 *
 * The *_set_{corrupt,preen,warning}() family of functions are used to
 * record the presence of metadata that is incorrect (corrupt), could be
 * optimized somehow (preen), or should be flagged for administrative
 * review but is not incorrect (warn).
 *
 * ftrace can be used to record the precise metadata location and
 * approximate code location of the failed check.
 */
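
/*
 * Illustrative sketch (not from the original file): failed checks are
 * recorded through these setters instead of error returns, so a check
 * against some computed value reads:
 *
 *	if (be32_to_cpu(agf->agf_length) != computed_length)
 *		xchk_block_set_corrupt(sc, bp);
 *
 * Control flow then continues normally; userspace learns about the
 * problem from the flags in sc->sm->sm_flags.
 */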

/* Record a block which could be optimized. */
void
xchk_block_set_preen(
	struct xfs_scrub *sc,
	struct xfs_buf *bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record an inode which could be optimized. The trace data will use the
 * block location of the inode record itself.
 */
void
xchk_ino_set_preen(
	struct xfs_scrub *sc,
	xfs_ino_t ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_ino_preen(sc, ino, __return_address);
}

/* Record something being wrong with the filesystem as a whole. */
void
xchk_set_corrupt(
	struct xfs_scrub *sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fs_error(sc, 0, __return_address);
}

/* Record a corrupt block. */
void
xchk_block_set_corrupt(
	struct xfs_scrub *sc,
	struct xfs_buf *bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

#ifdef CONFIG_XFS_QUOTA
/* Record a corrupt quota counter. */
void
xchk_qcheck_set_corrupt(
	struct xfs_scrub *sc,
	unsigned int dqtype,
	xfs_dqid_t id)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
}
#endif

/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
	struct xfs_scrub *sc,
	struct xfs_buf *bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

/*
 * Record a corrupt inode. The trace data will use the block location of
 * the inode record itself.
 */
void
xchk_ino_set_corrupt(
	struct xfs_scrub *sc,
	xfs_ino_t ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record a corruption while cross-referencing with an inode. */
void
xchk_ino_xref_set_corrupt(
	struct xfs_scrub *sc,
	xfs_ino_t ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record corruption in a block indexed by a file fork. */
void
xchk_fblock_set_corrupt(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/* Record a corruption while cross-referencing a fork block. */
void
xchk_fblock_xref_set_corrupt(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/*
 * Warn about inodes that need administrative review but are not
 * incorrect.
 */
void
xchk_ino_set_warning(
	struct xfs_scrub *sc,
	xfs_ino_t ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_ino_warning(sc, ino, __return_address);
}

/* Warn about a block indexed by a file fork that needs review. */
void
xchk_fblock_set_warning(
	struct xfs_scrub *sc,
	int whichfork,
	xfs_fileoff_t offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
}

/* Signal an incomplete scrub. */
void
xchk_set_incomplete(
	struct xfs_scrub *sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
	trace_xchk_incomplete(sc, __return_address);
}

/*
 * rmap scrubbing -- compute the number of blocks with a given owner,
 * at least according to the reverse mapping data.
 */

struct xchk_rmap_ownedby_info {
	const struct xfs_owner_info *oinfo;
	xfs_filblks_t *blocks;
};

STATIC int
xchk_count_rmap_ownedby_irec(
	struct xfs_btree_cur *cur,
	const struct xfs_rmap_irec *rec,
	void *priv)
{
	struct xchk_rmap_ownedby_info *sroi = priv;
	bool irec_attr;
	bool oinfo_attr;

	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;

	if (rec->rm_owner != sroi->oinfo->oi_owner)
		return 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
		(*sroi->blocks) += rec->rm_blockcount;

	return 0;
}

/*
 * Calculate the number of blocks the rmap thinks are owned by something.
 * The caller should pass us an rmapbt cursor.
 */
int
xchk_count_rmap_ownedby_ag(
	struct xfs_scrub *sc,
	struct xfs_btree_cur *cur,
	const struct xfs_owner_info *oinfo,
	xfs_filblks_t *blocks)
{
	struct xchk_rmap_ownedby_info sroi = {
		.oinfo = oinfo,
		.blocks = blocks,
	};

	*blocks = 0;
	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
			&sroi);
}
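
/*
 * Illustrative sketch (not from the original file): a caller wanting to
 * cross-reference AG metadata against the rmap might count like this,
 * mirroring the AGF scrubber:
 *
 *	struct xfs_owner_info	oinfo;
 *	xfs_filblks_t		blocks;
 *
 *	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
 *			&blocks);
 *	// ...compare blocks against the free space btrees' accounting...
 */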

/*
 * AG scrubbing
 *
 * These helpers facilitate locking an allocation group's header
 * buffers, setting up cursors for all btrees that are present, and
 * cleaning everything up once we're through.
 */

/* Decide if we want to return an AG header read failure. */
static inline bool
want_ag_read_header_failure(
	struct xfs_scrub *sc,
	unsigned int type)
{
	/* Return all AG header read failures when scanning btrees. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
		return true;
	/*
	 * If we're scanning a given type of AG header, we only want to
	 * see read failures from that specific header. We'd like the
	 * other headers to cross-check them, but this isn't required.
	 */
	if (sc->sm->sm_type == type)
		return true;
	return false;
}

/*
 * Grab the AG header buffers for the attached perag structure.
 *
 * The headers should be released by xchk_ag_free, but as a fail safe we attach
 * all the buffers we grab to the scrub transaction so they'll all be freed
 * when we cancel it.
 */
static inline int
xchk_perag_read_headers(
	struct xfs_scrub *sc,
	struct xchk_ag *sa)
{
	int error;

	error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
		return error;

	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
		return error;

	return 0;
}

/*
 * Grab the AG headers for the attached perag structure and wait for pending
 * intents to drain.
 */
int
xchk_perag_drain_and_lock(
	struct xfs_scrub *sc)
{
	struct xchk_ag *sa = &sc->sa;
	int error = 0;

	ASSERT(sa->pag != NULL);
	ASSERT(sa->agi_bp == NULL);
	ASSERT(sa->agf_bp == NULL);

	do {
		if (xchk_should_terminate(sc, &error))
			return error;

		error = xchk_perag_read_headers(sc, sa);
		if (error)
			return error;

		/*
		 * If we've grabbed an inode for scrubbing then we assume that
		 * holding its ILOCK will suffice to coordinate with any intent
		 * chains involving this inode.
		 */
		if (sc->ip)
			return 0;

		/*
		 * Decide if this AG is quiet enough for all metadata to be
		 * consistent with each other. XFS allows the AG header buffer
		 * locks to cycle across transaction rolls while processing
		 * chains of deferred ops, which means that there could be
		 * other threads in the middle of processing a chain of
		 * deferred ops. For regular operations we are careful about
		 * ordering operations to prevent collisions between threads
		 * (which is why we don't need a per-AG lock), but scrub and
		 * repair have to serialize against chained operations.
		 *
		 * We just locked all the AG header buffers; now take a look
		 * to see if there are any intents in progress. If there are,
		 * drop the AG headers and wait for the intents to drain.
		 * Since we hold all the AG header locks for the duration of
		 * the scrub, this is the only time we have to sample the
		 * intents counter; any threads increasing it after this point
		 * can't possibly be in the middle of a chain of AG metadata
		 * updates.
		 *
		 * Obviously, this should be slanted against scrub and in favor
		 * of runtime threads.
		 */
		if (!xfs_perag_intent_busy(sa->pag))
			return 0;

		if (sa->agf_bp) {
			xfs_trans_brelse(sc->tp, sa->agf_bp);
			sa->agf_bp = NULL;
		}

		if (sa->agi_bp) {
			xfs_trans_brelse(sc->tp, sa->agi_bp);
			sa->agi_bp = NULL;
		}

		if (!(sc->flags & XCHK_FSGATES_DRAIN))
			return -ECHRNG;
		error = xfs_perag_intent_drain(sa->pag);
		if (error == -ERESTARTSYS)
			error = -EINTR;
	} while (!error);

	return error;
}

/*
 * Grab the per-AG structure, grab all AG header buffers, and wait until there
 * aren't any pending intents. Returns -ENOENT if we can't grab the perag
 * structure.
 */
int
xchk_ag_read_headers(
	struct xfs_scrub *sc,
	xfs_agnumber_t agno,
	struct xchk_ag *sa)
{
	struct xfs_mount *mp = sc->mp;

	ASSERT(!sa->pag);
	sa->pag = xfs_perag_get(mp, agno);
	if (!sa->pag)
		return -ENOENT;

	return xchk_perag_drain_and_lock(sc);
}

/* Release all the AG btree cursors. */
void
xchk_ag_btcur_free(
	struct xchk_ag *sa)
{
	if (sa->refc_cur)
		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
	if (sa->rmap_cur)
		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
	if (sa->fino_cur)
		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
	if (sa->ino_cur)
		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
	if (sa->cnt_cur)
		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
	if (sa->bno_cur)
		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);

	sa->refc_cur = NULL;
	sa->rmap_cur = NULL;
	sa->fino_cur = NULL;
	sa->ino_cur = NULL;
	sa->bno_cur = NULL;
	sa->cnt_cur = NULL;
}

/* Initialize all the btree cursors for an AG. */
void
xchk_ag_btcur_init(
	struct xfs_scrub *sc,
	struct xchk_ag *sa)
{
	struct xfs_mount *mp = sc->mp;

	if (sa->agf_bp) {
		/* Set up a bnobt cursor for cross-referencing. */
		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
				XFS_SCRUB_TYPE_BNOBT);

		/* Set up a cntbt cursor for cross-referencing. */
		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
				sa->pag);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
				XFS_SCRUB_TYPE_CNTBT);

		/* Set up a rmapbt cursor for cross-referencing. */
		if (xfs_has_rmapbt(mp)) {
			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
					XFS_SCRUB_TYPE_RMAPBT);
		}

		/* Set up a refcountbt cursor for cross-referencing. */
		if (xfs_has_reflink(mp)) {
			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
					sa->agf_bp, sa->pag);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
					XFS_SCRUB_TYPE_REFCNTBT);
		}
	}

	if (sa->agi_bp) {
		/* Set up an inobt cursor for cross-referencing. */
		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
				sa->agi_bp);
		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
				XFS_SCRUB_TYPE_INOBT);

		/* Set up a finobt cursor for cross-referencing. */
		if (xfs_has_finobt(mp)) {
			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
					sa->agi_bp);
			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
					XFS_SCRUB_TYPE_FINOBT);
		}
	}
}

/* Release the AG header context and btree cursors. */
void
xchk_ag_free(
	struct xfs_scrub *sc,
	struct xchk_ag *sa)
{
	xchk_ag_btcur_free(sa);
	xrep_reset_perag_resv(sc);
	if (sa->agf_bp) {
		xfs_trans_brelse(sc->tp, sa->agf_bp);
		sa->agf_bp = NULL;
	}
	if (sa->agi_bp) {
		xfs_trans_brelse(sc->tp, sa->agi_bp);
		sa->agi_bp = NULL;
	}
	if (sa->pag) {
		xfs_perag_put(sa->pag);
		sa->pag = NULL;
	}
}

/*
 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 * order. Locking order requires us to get the AGI before the AGF. We use the
 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 * caller passes one in (bmap scrub) or we have to create a transaction
 * ourselves. Returns ENOENT if the perag struct cannot be grabbed.
 */
int
xchk_ag_init(
	struct xfs_scrub *sc,
	xfs_agnumber_t agno,
	struct xchk_ag *sa)
{
	int error;

	error = xchk_ag_read_headers(sc, agno, sa);
	if (error)
		return error;

	xchk_ag_btcur_init(sc, sa);
	return 0;
}
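
/*
 * Illustrative sketch (not from the original file): AG-oriented scrubbers
 * pair xchk_ag_init with xchk_ag_free, roughly:
 *
 *	error = xchk_ag_init(sc, agno, &sc->sa);
 *	if (error)
 *		return error;
 *	// ...walk the AG btrees through sc->sa.*_cur...
 *	xchk_ag_free(sc, &sc->sa);	// also done by scrub teardown
 */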

/* Per-scrubber setup functions */

void
xchk_trans_cancel(
	struct xfs_scrub *sc)
{
	xfs_trans_cancel(sc->tp);
	sc->tp = NULL;
}

int
xchk_trans_alloc_empty(
	struct xfs_scrub *sc)
{
	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}

/*
 * Grab an empty transaction so that we can re-grab locked buffers if
 * one of our btrees turns out to be cyclic.
 *
 * If we're going to repair something, we need to ask for the largest possible
 * log reservation so that we can handle the worst case scenario for metadata
 * updates while rebuilding a metadata item. We also need to reserve as many
 * blocks in the head transaction as we think we're going to need to rebuild
 * the metadata object.
 */
int
xchk_trans_alloc(
	struct xfs_scrub *sc,
	uint resblks)
{
	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
				resblks, 0, 0, &sc->tp);

	return xchk_trans_alloc_empty(sc);
}

/* Set us up with a transaction and an empty context. */
int
xchk_setup_fs(
	struct xfs_scrub *sc)
{
	uint resblks;

	resblks = xrep_calc_ag_resblks(sc);
	return xchk_trans_alloc(sc, resblks);
}

/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
	struct xfs_scrub *sc,
	bool force_log)
{
	struct xfs_mount *mp = sc->mp;
	int error;

	/*
	 * If the caller asks us to checkpoint the log, do so. This
	 * expensive operation should be performed infrequently and only
	 * as a last resort. Any caller that sets force_log should
	 * document why they need to do so.
	 */
	if (force_log) {
		error = xchk_checkpoint_log(mp);
		if (error)
			return error;
	}

	error = xchk_setup_fs(sc);
	if (error)
		return error;

	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
}

/* Push everything out of the log onto disk. */
int
xchk_checkpoint_log(
	struct xfs_mount *mp)
{
	int error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all_sync(mp->m_ail);
	return 0;
}

/* Verify that an inode is allocated ondisk, then return its cached inode. */
int
xchk_iget(
	struct xfs_scrub *sc,
	xfs_ino_t inum,
	struct xfs_inode **ipp)
{
	ASSERT(sc->tp != NULL);

	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}

/*
 * Try to grab an inode in a manner that avoids races with physical inode
 * allocation. If we can't, return the locked AGI buffer so that the caller
 * can single-step the loading process to see where things went wrong.
 * Callers must have a valid scrub transaction.
 *
 * If the iget succeeds, return 0, a NULL AGI, and the inode.
 *
 * If the iget fails, return the error, the locked AGI, and a NULL inode. This
 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 * no longer allocated; or any other corruption or runtime error.
 *
 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 *
 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 */
int
xchk_iget_agi(
	struct xfs_scrub *sc,
	xfs_ino_t inum,
	struct xfs_buf **agi_bpp,
	struct xfs_inode **ipp)
{
	struct xfs_mount *mp = sc->mp;
	struct xfs_trans *tp = sc->tp;
	struct xfs_perag *pag;
	int error;

	ASSERT(sc->tp != NULL);

again:
	*agi_bpp = NULL;
	*ipp = NULL;
	error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/*
	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
	 * in the iget cache miss path.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
	xfs_perag_put(pag);
	if (error)
		return error;

	error = xfs_iget(mp, tp, inum,
			XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
	if (error == -EAGAIN) {
		/*
		 * The inode may be in core but temporarily unavailable and may
		 * require the AGI buffer before it can be returned. Drop the
		 * AGI buffer and retry the lookup.
		 *
		 * Incore lookup will fail with EAGAIN on a cache hit if the
		 * inode is queued to the inactivation list. The inactivation
		 * worker may remove the inode from the unlinked list and hence
		 * needs the AGI.
		 *
		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
		 * to allow inodegc to make progress and move the inode to
		 * IRECLAIMABLE state where xfs_iget will be able to return it
		 * again if it can lock the inode.
		 */
		xfs_trans_brelse(tp, *agi_bpp);
		delay(1);
		goto again;
	}
	if (error)
		return error;

	/* We got the inode, so we can release the AGI. */
	ASSERT(*ipp != NULL);
	xfs_trans_brelse(tp, *agi_bpp);
	*agi_bpp = NULL;
	return 0;
}

#ifdef CONFIG_XFS_QUOTA
/*
 * Try to attach dquots to this inode if we think we might want to repair it.
 * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
 * attached, a quotacheck will be scheduled.
 */
int
xchk_ino_dqattach(
	struct xfs_scrub *sc)
{
	ASSERT(sc->tp != NULL);
	ASSERT(sc->ip != NULL);

	if (!xchk_could_repair(sc))
		return 0;

	return xrep_ino_dqattach(sc);
}
#endif

/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
	struct xfs_scrub *sc,
	struct xfs_inode *ip)
{
	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
		xchk_irele(sc, ip);
		return -ENOENT;
	}

	sc->ip = ip;
	return 0;
}

/*
 * Install an already-referenced inode for scrubbing. Get our own reference to
 * the inode to make disposal simpler. The inode must not be in I_FREEING or
 * I_WILL_FREE state!
 */
int
xchk_install_live_inode(
	struct xfs_scrub *sc,
	struct xfs_inode *ip)
{
	if (!igrab(VFS_I(ip))) {
		xchk_ino_set_corrupt(sc, ip->i_ino);
		return -EFSCORRUPTED;
	}

	sc->ip = ip;
	return 0;
}

/*
 * In preparation to scrub metadata structures that hang off of an inode,
 * grab either the inode referenced in the scrub control structure or the
 * inode passed in. If the inumber does not reference an allocated inode
 * record, the function returns ENOENT to end the scrub early. The inode
 * is not locked.
 */
int
xchk_iget_for_scrubbing(
	struct xfs_scrub *sc)
{
	struct xfs_imap imap;
	struct xfs_mount *mp = sc->mp;
	struct xfs_perag *pag;
	struct xfs_buf *agi_bp;
	struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
	struct xfs_inode *ip = NULL;
	xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
	int error;

	ASSERT(sc->tp == NULL);

	/* We want to scan the inode we already had opened. */
	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
		return xchk_install_live_inode(sc, ip_in);

	/* Reject internal metadata files and obviously bad inode numbers. */
	if (xfs_internal_inum(mp, sc->sm->sm_ino))
		return -ENOENT;
	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
		return -ENOENT;

	/* Try a safe untrusted iget. */
	error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
	if (!error)
		return xchk_install_handle_inode(sc, ip);
	if (error == -ENOENT)
		return error;
	if (error != -EINVAL)
		goto out_error;

	/*
	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
	 * userspace gave us an inode number that doesn't correspond to fs
	 * space; the inode btree lacks a record for this inode; or there is a
	 * record, and it says this inode is free.
	 *
	 * We want to look up this inode in the inobt to distinguish two
	 * scenarios: (1) the inobt says the inode is free, in which case
	 * there's nothing to do; and (2) the inobt says the inode is
	 * allocated, but loading it failed due to corruption.
	 *
	 * Allocate a transaction and grab the AGI to prevent inobt activity
	 * in this AG. Retry the iget in case someone allocated a new inode
	 * after the first iget failed.
	 */
	error = xchk_trans_alloc(sc, 0);
	if (error)
		goto out_error;

	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
	if (error == 0) {
		/* Actually got the inode, so install it. */
		xchk_trans_cancel(sc);
		return xchk_install_handle_inode(sc, ip);
	}
	if (error == -ENOENT)
		goto out_gone;
	if (error != -EINVAL)
		goto out_cancel;

	/* Ensure that we have protected against inode allocation/freeing. */
	if (agi_bp == NULL) {
		ASSERT(agi_bp != NULL);
		error = -ECANCELED;
		goto out_cancel;
	}

	/*
	 * Untrusted iget failed a second time. Let's try an inobt lookup.
	 * If the inobt says the inode cannot exist inside the filesystem or
	 * is not allocated, return ENOENT to signal that the check can be
	 * skipped.
	 *
	 * If the lookup returns corruption, we'll mark this inode corrupt and
	 * exit to userspace. There's little chance of fixing anything until
	 * the inobt is straightened out, but there's nothing we can do here.
	 *
	 * If the lookup encounters any other error, exit to userspace.
	 *
	 * If the lookup succeeds, something else must be very wrong in the fs
	 * such that setting up the incore inode failed in some strange way.
	 * Treat those as corruptions.
	 */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
	if (!pag) {
		error = -EFSCORRUPTED;
		goto out_cancel;
	}

	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
			XFS_IGET_UNTRUSTED);
	xfs_perag_put(pag);
	if (error == -EINVAL || error == -ENOENT)
		goto out_gone;
	if (!error)
		error = -EFSCORRUPTED;

out_cancel:
	xchk_trans_cancel(sc);
out_error:
	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
			error, __return_address);
	return error;
out_gone:
	/* The file is gone, so there's nothing to check. */
	xchk_trans_cancel(sc);
	return -ENOENT;
}

/* Release an inode, possibly dropping it in the process. */
void
xchk_irele(
	struct xfs_scrub *sc,
	struct xfs_inode *ip)
{
	if (sc->tp) {
		/*
		 * If we are in a transaction, we /cannot/ drop the inode
		 * ourselves, because the VFS will trigger writeback, which
		 * can require a transaction. Clear DONTCACHE to force the
		 * inode to the LRU, where someone else can take care of
		 * dropping it.
		 *
		 * Note that when we grabbed our reference to the inode, it
		 * could have had an active ref and DONTCACHE set if a sysadmin
		 * is trying to coerce a change in file access mode. icache
		 * hits do not clear DONTCACHE, so we must do it here.
		 */
		spin_lock(&VFS_I(ip)->i_lock);
		VFS_I(ip)->i_state &= ~I_DONTCACHE;
		spin_unlock(&VFS_I(ip)->i_lock);
	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
		/*
		 * If this is the last reference to the inode and the caller
		 * permits it, set DONTCACHE to avoid thrashing.
		 */
		d_mark_dontcache(VFS_I(ip));
	}

	xfs_irele(ip);
}

/*
 * Set us up to scrub metadata mapped by a file's fork. Callers must not use
 * this to operate on user-accessible regular file data because the MMAPLOCK is
 * not taken.
 */
int
xchk_setup_inode_contents(
	struct xfs_scrub *sc,
	unsigned int resblks)
{
	int error;

	error = xchk_iget_for_scrubbing(sc);
	if (error)
		return error;

	/* Lock the inode so the VFS cannot touch this file. */
	xchk_ilock(sc, XFS_IOLOCK_EXCL);

	error = xchk_trans_alloc(sc, resblks);
	if (error)
		goto out;

	error = xchk_ino_dqattach(sc);
	if (error)
		goto out;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
	/* scrub teardown will unlock and release the inode for us */
	return error;
}

void
xchk_ilock(
	struct xfs_scrub *sc,
	unsigned int ilock_flags)
{
	xfs_ilock(sc->ip, ilock_flags);
	sc->ilock_flags |= ilock_flags;
}

bool
xchk_ilock_nowait(
	struct xfs_scrub *sc,
	unsigned int ilock_flags)
{
	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
		sc->ilock_flags |= ilock_flags;
		return true;
	}

	return false;
}

void
xchk_iunlock(
	struct xfs_scrub *sc,
	unsigned int ilock_flags)
{
	sc->ilock_flags &= ~ilock_flags;
	xfs_iunlock(sc->ip, ilock_flags);
}
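
/*
 * Illustrative sketch (not from the original file): these wrappers track
 * lock state in sc->ilock_flags so that scrub teardown knows what to
 * release. A hypothetical trylock/backoff loop might look like:
 *
 *	while (!xchk_ilock_nowait(sc, XFS_ILOCK_EXCL)) {
 *		if (xchk_should_terminate(sc, &error))
 *			return error;
 *		delay(1);
 *	}
 */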

/*
 * Predicate that decides if we need to evaluate the cross-reference check.
 * If there was an error accessing the cross-reference btree, just delete
 * the cursor and skip the check.
 */
bool
xchk_should_check_xref(
	struct xfs_scrub *sc,
	int *error,
	struct xfs_btree_cur **curpp)
{
	/* No point in xref if we already know we're corrupt. */
	if (xchk_skip_xref(sc->sm))
		return false;

	if (*error == 0)
		return true;

	if (curpp) {
		/* If we've already given up on xref, just bail out. */
		if (!*curpp)
			return false;

		/* xref error, delete cursor and bail out. */
		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
		*curpp = NULL;
	}

	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
	trace_xchk_xref_error(sc, *error, __return_address);

	/*
	 * Errors encountered during cross-referencing with another
	 * data structure should not cause this scrubber to abort.
	 */
	*error = 0;
	return false;
}
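
/*
 * Illustrative sketch (not from the original file): cross-reference
 * helpers usually wrap their btree queries like this; the query shown is
 * only an example:
 *
 *	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
 *		return;
 *	// ...act on the outcome...
 *
 * Any xref error is converted into OFLAG_XFAIL above and the scan
 * presses on.
 */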

/* Run the structure verifiers on in-memory buffers to detect bad memory. */
void
xchk_buffer_recheck(
	struct xfs_scrub *sc,
	struct xfs_buf *bp)
{
	xfs_failaddr_t fa;

	if (bp->b_ops == NULL) {
		xchk_block_set_corrupt(sc, bp);
		return;
	}
	if (bp->b_ops->verify_struct == NULL) {
		xchk_set_incomplete(sc);
		return;
	}
	fa = bp->b_ops->verify_struct(bp);
	if (!fa)
		return;
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}

static inline int
xchk_metadata_inode_subtype(
	struct xfs_scrub *sc,
	unsigned int scrub_type)
{
	__u32 smtype = sc->sm->sm_type;
	unsigned int sick_mask = sc->sick_mask;
	int error;

	sc->sm->sm_type = scrub_type;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		break;
	}

	sc->sick_mask = sick_mask;
	sc->sm->sm_type = smtype;
	return error;
}

/*
 * Scrub the attr/data forks of a metadata inode. The metadata inode must be
 * pointed to by sc->ip and the ILOCK must be held.
 */
int
xchk_metadata_inode_forks(
	struct xfs_scrub *sc)
{
	bool shared;
	int error;

	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return 0;

	/* Check the inode record. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Metadata inodes don't live on the rt device. */
	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They should never participate in reflink. */
	if (xfs_is_reflink_inode(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They also should never have extended attributes. */
	if (xfs_inode_hasattr(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* Invoke the data fork scrubber. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Look for incorrect shared blocks. */
	if (xfs_has_reflink(sc->mp)) {
		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
				&shared);
		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
				&error))
			return error;
		if (shared)
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
	}

	return 0;
}

/*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation. Callers must not hold any locks that intersect with the CPU
 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
 * to change kernel code.
 */
void
xchk_fsgates_enable(
	struct xfs_scrub *sc,
	unsigned int scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
	ASSERT(!(sc->flags & scrub_fsgates));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_enable();

	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_enable();

	sc->flags |= scrub_fsgates;
}
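
/*
 * Illustrative sketch (not from the original file): a setup function whose
 * scanner must serialize against chains of deferred ops (see the -ECHRNG
 * handling in xchk_perag_drain_and_lock above) would enable the drain
 * gate, typically only on a retry pass:
 *
 *	if (sc->flags & XCHK_TRY_HARDER)
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 */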

/*
 * Decide if this is a cached inode that's also allocated. The caller
 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
 * from being allocated or freed.
 *
 * Look up an inode by number in the given file system. If the inode number
 * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable. The inode is
 * allocated if i_mode is nonzero. In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 */
int
xchk_inode_is_allocated(
	struct xfs_scrub *sc,
	xfs_agino_t agino,
	bool *inuse)
{
	struct xfs_mount *mp = sc->mp;
	struct xfs_perag *pag = sc->sa.pag;
	xfs_ino_t ino;
	struct xfs_inode *ip;
	int error;

	/* caller must hold perag reference */
	if (pag == NULL) {
		ASSERT(pag != NULL);
		return -EINVAL;
	}

	/* caller must have AGI buffer */
	if (sc->sa.agi_bp == NULL) {
		ASSERT(sc->sa.agi_bp != NULL);
		return -EINVAL;
	}

	/* reject inode numbers outside existing AGs */
	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	error = -ENODATA;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		/* cache miss */
		goto out_rcu;
	}

	/*
	 * If the inode number doesn't match, the incore inode got reused
	 * during an RCU grace period and the radix tree hasn't been updated.
	 * This isn't the inode we want.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	trace_xchk_inode_is_allocated(ip);

	/*
	 * We have an incore inode that matches the inode we want, and the
	 * caller holds the perag structure and the AGI buffer. Let's check
	 * our assumptions below:
	 */

#ifdef DEBUG
	/*
	 * (1) If the incore inode is live (i.e. referenced from the dcache),
	 * it will not be INEW, nor will it be in the inactivation or reclaim
	 * machinery. The ondisk inode had better be allocated. This is the
	 * most trivial case.
	 */
	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
			     XFS_INACTIVATING))) {
		/* live inode */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * If the incore inode is INEW, there are several possibilities:
	 *
	 * (2) For a file that is being created, note that we allocate the
	 * ondisk inode before allocating, initializing, and adding the incore
	 * inode to the radix tree.
	 *
	 * (3) If the incore inode is being recycled, the inode has to be
	 * allocated because we don't allow freed inodes to be recycled.
	 * Recycling doesn't touch i_mode.
	 */
	if (ip->i_flags & XFS_INEW) {
		/* created on disk already or recycling */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
	 * inactivation has not started (!INACTIVATING), it is still allocated.
	 */
	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
	    !(ip->i_flags & XFS_INACTIVATING)) {
		/* definitely before difree */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}
#endif

	/*
	 * If the incore inode is undergoing inactivation (INACTIVATING), there
	 * are two possibilities:
	 *
	 * (5) It is before the point where it would get freed ondisk, in which
	 * case i_mode is still nonzero.
	 *
	 * (6) It has already been freed, in which case i_mode is zero.
	 *
	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
	 * and we've taken the AGI buffer lock, which prevents that from
	 * happening.
	 */

	/*
	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
	 * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
	 * reflects the ondisk state.
	 */

	/*
	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
	 * the flush code uses i_mode to format the ondisk inode.
	 */

	/*
	 * (9) If the inode is in IRECLAIM and was reachable via the radix
	 * tree, it still has the same i_mode as it did before it entered
	 * reclaim. The inode object is still alive because we hold the RCU
	 * read lock.
	 */

	*inuse = VFS_I(ip)->i_mode != 0;
	error = 0;

out_skip:
	spin_unlock(&ip->i_flags_lock);
out_rcu:
	rcu_read_unlock();
	return error;
}
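
/*
 * Illustrative sketch (not from the original file): an inobt scrubber that
 * holds the AGI could use this helper to cross-check a record against the
 * inode cache, e.g.:
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		return 0;	// not cached; check the inode buffer instead
 *	if (error)
 *		return error;
 *	// ...compare inuse against the inobt record's free mask...
 */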