1 | // SPDX-License-Identifier: GPL-2.0+ |
2 | /* |
3 | * linux/fs/jbd2/recovery.c |
4 | * |
5 | * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 |
6 | * |
7 | * Copyright 1999-2000 Red Hat Software --- All Rights Reserved |
8 | * |
9 | * Journal recovery routines for the generic filesystem journaling code; |
10 | * part of the ext2fs journaling system. |
11 | */ |
12 | |
13 | #ifndef __KERNEL__ |
14 | #include "jfs_user.h" |
15 | #else |
16 | #include <linux/time.h> |
17 | #include <linux/fs.h> |
18 | #include <linux/jbd2.h> |
19 | #include <linux/errno.h> |
20 | #include <linux/crc32.h> |
21 | #include <linux/blkdev.h> |
22 | #endif |
23 | |
24 | /* |
25 | * Maintain information about the progress of the recovery job, so that |
26 | * the different passes can carry information between them. |
27 | */ |
28 | struct recovery_info |
29 | { |
30 | tid_t start_transaction; |
31 | tid_t end_transaction; |
32 | unsigned long head_block; |
33 | |
34 | int nr_replays; |
35 | int nr_revokes; |
36 | int nr_revoke_hits; |
37 | }; |
38 | |
39 | static int do_one_pass(journal_t *journal, |
40 | struct recovery_info *info, enum passtype pass); |
41 | static int scan_revoke_records(journal_t *, struct buffer_head *, |
42 | tid_t, struct recovery_info *); |
43 | |
44 | #ifdef __KERNEL__ |
45 | |
/*
 * Release readahead buffers after use.
 *
 * Calls brelse() on each of the @n buffer heads in @b, walking the array
 * from the last entry down to the first.  A non-positive @n is a no-op.
 */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
	while (--n >= 0)
		brelse(b[n]);
}
52 | |
53 | |
54 | /* |
55 | * When reading from the journal, we are going through the block device |
56 | * layer directly and so there is no readahead being done for us. We |
57 | * need to implement any readahead ourselves if we want it to happen at |
58 | * all. Recovery is basically one long sequential read, so make sure we |
59 | * do the IO in reasonably large chunks. |
60 | * |
61 | * This is not so critical that we need to be enormously clever about |
62 | * the readahead size, though. 128K is a purely arbitrary, good-enough |
63 | * fixed value. |
64 | */ |
65 | |
66 | #define MAXBUF 8 |
67 | static int do_readahead(journal_t *journal, unsigned int start) |
68 | { |
69 | int err; |
70 | unsigned int max, nbufs, next; |
71 | unsigned long long blocknr; |
72 | struct buffer_head *bh; |
73 | |
74 | struct buffer_head * bufs[MAXBUF]; |
75 | |
76 | /* Do up to 128K of readahead */ |
77 | max = start + (128 * 1024 / journal->j_blocksize); |
78 | if (max > journal->j_total_len) |
79 | max = journal->j_total_len; |
80 | |
81 | /* Do the readahead itself. We'll submit MAXBUF buffer_heads at |
82 | * a time to the block device IO layer. */ |
83 | |
84 | nbufs = 0; |
85 | |
86 | for (next = start; next < max; next++) { |
87 | err = jbd2_journal_bmap(journal, next, &blocknr); |
88 | |
89 | if (err) { |
90 | printk(KERN_ERR "JBD2: bad block at offset %u\n" , |
91 | next); |
92 | goto failed; |
93 | } |
94 | |
95 | bh = __getblk(bdev: journal->j_dev, block: blocknr, size: journal->j_blocksize); |
96 | if (!bh) { |
97 | err = -ENOMEM; |
98 | goto failed; |
99 | } |
100 | |
101 | if (!buffer_uptodate(bh) && !buffer_locked(bh)) { |
102 | bufs[nbufs++] = bh; |
103 | if (nbufs == MAXBUF) { |
104 | bh_readahead_batch(nr: nbufs, bhs: bufs, op_flags: 0); |
105 | journal_brelse_array(b: bufs, n: nbufs); |
106 | nbufs = 0; |
107 | } |
108 | } else |
109 | brelse(bh); |
110 | } |
111 | |
112 | if (nbufs) |
113 | bh_readahead_batch(nr: nbufs, bhs: bufs, op_flags: 0); |
114 | err = 0; |
115 | |
116 | failed: |
117 | if (nbufs) |
118 | journal_brelse_array(b: bufs, n: nbufs); |
119 | return err; |
120 | } |
121 | |
122 | #endif /* __KERNEL__ */ |
123 | |
124 | |
125 | /* |
126 | * Read a block from the journal |
127 | */ |
128 | |
129 | static int jread(struct buffer_head **bhp, journal_t *journal, |
130 | unsigned int offset) |
131 | { |
132 | int err; |
133 | unsigned long long blocknr; |
134 | struct buffer_head *bh; |
135 | |
136 | *bhp = NULL; |
137 | |
138 | if (offset >= journal->j_total_len) { |
139 | printk(KERN_ERR "JBD2: corrupted journal superblock\n" ); |
140 | return -EFSCORRUPTED; |
141 | } |
142 | |
143 | err = jbd2_journal_bmap(journal, offset, &blocknr); |
144 | |
145 | if (err) { |
146 | printk(KERN_ERR "JBD2: bad block at offset %u\n" , |
147 | offset); |
148 | return err; |
149 | } |
150 | |
151 | bh = __getblk(bdev: journal->j_dev, block: blocknr, size: journal->j_blocksize); |
152 | if (!bh) |
153 | return -ENOMEM; |
154 | |
155 | if (!buffer_uptodate(bh)) { |
156 | /* |
157 | * If this is a brand new buffer, start readahead. |
158 | * Otherwise, we assume we are already reading it. |
159 | */ |
160 | bool need_readahead = !buffer_req(bh); |
161 | |
162 | bh_read_nowait(bh, op_flags: 0); |
163 | if (need_readahead) |
164 | do_readahead(journal, start: offset); |
165 | wait_on_buffer(bh); |
166 | } |
167 | |
168 | if (!buffer_uptodate(bh)) { |
169 | printk(KERN_ERR "JBD2: Failed to read block at offset %u\n" , |
170 | offset); |
171 | brelse(bh); |
172 | return -EIO; |
173 | } |
174 | |
175 | *bhp = bh; |
176 | return 0; |
177 | } |
178 | |
179 | static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) |
180 | { |
181 | struct jbd2_journal_block_tail *tail; |
182 | __be32 provided; |
183 | __u32 calculated; |
184 | |
185 | if (!jbd2_journal_has_csum_v2or3(journal: j)) |
186 | return 1; |
187 | |
188 | tail = (struct jbd2_journal_block_tail *)((char *)buf + |
189 | j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); |
190 | provided = tail->t_checksum; |
191 | tail->t_checksum = 0; |
192 | calculated = jbd2_chksum(journal: j, crc: j->j_csum_seed, address: buf, length: j->j_blocksize); |
193 | tail->t_checksum = provided; |
194 | |
195 | return provided == cpu_to_be32(calculated); |
196 | } |
197 | |
198 | /* |
199 | * Count the number of in-use tags in a journal descriptor block. |
200 | */ |
201 | |
202 | static int count_tags(journal_t *journal, struct buffer_head *bh) |
203 | { |
204 | char * tagp; |
205 | journal_block_tag_t tag; |
206 | int nr = 0, size = journal->j_blocksize; |
207 | int tag_bytes = journal_tag_bytes(journal); |
208 | |
209 | if (jbd2_journal_has_csum_v2or3(journal)) |
210 | size -= sizeof(struct jbd2_journal_block_tail); |
211 | |
212 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
213 | |
214 | while ((tagp - bh->b_data + tag_bytes) <= size) { |
215 | memcpy(&tag, tagp, sizeof(tag)); |
216 | |
217 | nr++; |
218 | tagp += tag_bytes; |
219 | if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) |
220 | tagp += 16; |
221 | |
222 | if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) |
223 | break; |
224 | } |
225 | |
226 | return nr; |
227 | } |
228 | |
229 | |
/*
 * Make sure we wrap around the log correctly!
 *
 * If @var has reached or passed (journal)->j_last, pull it back by the
 * span between j_first and j_last.  @var must be a modifiable lvalue; it
 * is evaluated more than once, so it must not have side effects.
 */
#define wrap(journal, var)						\
do {									\
	if ((var) >= (journal)->j_last)					\
		(var) -= ((journal)->j_last - (journal)->j_first);	\
} while (0)
236 | |
237 | static int fc_do_one_pass(journal_t *journal, |
238 | struct recovery_info *info, enum passtype pass) |
239 | { |
240 | unsigned int expected_commit_id = info->end_transaction; |
241 | unsigned long next_fc_block; |
242 | struct buffer_head *bh; |
243 | int err = 0; |
244 | |
245 | next_fc_block = journal->j_fc_first; |
246 | if (!journal->j_fc_replay_callback) |
247 | return 0; |
248 | |
249 | while (next_fc_block <= journal->j_fc_last) { |
250 | jbd2_debug(3, "Fast commit replay: next block %ld\n" , |
251 | next_fc_block); |
252 | err = jread(bhp: &bh, journal, offset: next_fc_block); |
253 | if (err) { |
254 | jbd2_debug(3, "Fast commit replay: read error\n" ); |
255 | break; |
256 | } |
257 | |
258 | err = journal->j_fc_replay_callback(journal, bh, pass, |
259 | next_fc_block - journal->j_fc_first, |
260 | expected_commit_id); |
261 | brelse(bh); |
262 | next_fc_block++; |
263 | if (err < 0 || err == JBD2_FC_REPLAY_STOP) |
264 | break; |
265 | err = 0; |
266 | } |
267 | |
268 | if (err) |
269 | jbd2_debug(3, "Fast commit replay failed, err = %d\n" , err); |
270 | |
271 | return err; |
272 | } |
273 | |
274 | /** |
275 | * jbd2_journal_recover - recovers a on-disk journal |
276 | * @journal: the journal to recover |
277 | * |
278 | * The primary function for recovering the log contents when mounting a |
279 | * journaled device. |
280 | * |
281 | * Recovery is done in three passes. In the first pass, we look for the |
282 | * end of the log. In the second, we assemble the list of revoke |
283 | * blocks. In the third and final pass, we replay any un-revoked blocks |
284 | * in the log. |
285 | */ |
286 | int jbd2_journal_recover(journal_t *journal) |
287 | { |
288 | int err, err2; |
289 | journal_superblock_t * sb; |
290 | |
291 | struct recovery_info info; |
292 | errseq_t wb_err; |
293 | struct address_space *mapping; |
294 | |
295 | memset(&info, 0, sizeof(info)); |
296 | sb = journal->j_superblock; |
297 | |
298 | /* |
299 | * The journal superblock's s_start field (the current log head) |
300 | * is always zero if, and only if, the journal was cleanly |
301 | * unmounted. |
302 | */ |
303 | if (!sb->s_start) { |
304 | jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n" , |
305 | be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); |
306 | journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; |
307 | journal->j_head = be32_to_cpu(sb->s_head); |
308 | return 0; |
309 | } |
310 | |
311 | wb_err = 0; |
312 | mapping = journal->j_fs_dev->bd_inode->i_mapping; |
313 | errseq_check_and_advance(eseq: &mapping->wb_err, since: &wb_err); |
314 | err = do_one_pass(journal, info: &info, pass: PASS_SCAN); |
315 | if (!err) |
316 | err = do_one_pass(journal, info: &info, pass: PASS_REVOKE); |
317 | if (!err) |
318 | err = do_one_pass(journal, info: &info, pass: PASS_REPLAY); |
319 | |
320 | jbd2_debug(1, "JBD2: recovery, exit status %d, " |
321 | "recovered transactions %u to %u\n" , |
322 | err, info.start_transaction, info.end_transaction); |
323 | jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n" , |
324 | info.nr_replays, info.nr_revoke_hits, info.nr_revokes); |
325 | |
326 | /* Restart the log at the next transaction ID, thus invalidating |
327 | * any existing commit records in the log. */ |
328 | journal->j_transaction_sequence = ++info.end_transaction; |
329 | journal->j_head = info.head_block; |
330 | jbd2_debug(1, "JBD2: last transaction %d, head block %lu\n" , |
331 | journal->j_transaction_sequence, journal->j_head); |
332 | |
333 | jbd2_journal_clear_revoke(journal); |
334 | err2 = sync_blockdev(bdev: journal->j_fs_dev); |
335 | if (!err) |
336 | err = err2; |
337 | err2 = errseq_check_and_advance(eseq: &mapping->wb_err, since: &wb_err); |
338 | if (!err) |
339 | err = err2; |
340 | /* Make sure all replayed data is on permanent storage */ |
341 | if (journal->j_flags & JBD2_BARRIER) { |
342 | err2 = blkdev_issue_flush(bdev: journal->j_fs_dev); |
343 | if (!err) |
344 | err = err2; |
345 | } |
346 | return err; |
347 | } |
348 | |
349 | /** |
350 | * jbd2_journal_skip_recovery - Start journal and wipe exiting records |
351 | * @journal: journal to startup |
352 | * |
353 | * Locate any valid recovery information from the journal and set up the |
354 | * journal structures in memory to ignore it (presumably because the |
355 | * caller has evidence that it is out of date). |
356 | * This function doesn't appear to be exported.. |
357 | * |
358 | * We perform one pass over the journal to allow us to tell the user how |
359 | * much recovery information is being erased, and to let us initialise |
360 | * the journal transaction sequence numbers to the next unused ID. |
361 | */ |
362 | int jbd2_journal_skip_recovery(journal_t *journal) |
363 | { |
364 | int err; |
365 | |
366 | struct recovery_info info; |
367 | |
368 | memset (&info, 0, sizeof(info)); |
369 | |
370 | err = do_one_pass(journal, info: &info, pass: PASS_SCAN); |
371 | |
372 | if (err) { |
373 | printk(KERN_ERR "JBD2: error %d scanning journal\n" , err); |
374 | ++journal->j_transaction_sequence; |
375 | journal->j_head = journal->j_first; |
376 | } else { |
377 | #ifdef CONFIG_JBD2_DEBUG |
378 | int dropped = info.end_transaction - |
379 | be32_to_cpu(journal->j_superblock->s_sequence); |
380 | jbd2_debug(1, |
381 | "JBD2: ignoring %d transaction%s from the journal.\n" , |
382 | dropped, (dropped == 1) ? "" : "s" ); |
383 | #endif |
384 | journal->j_transaction_sequence = ++info.end_transaction; |
385 | journal->j_head = info.head_block; |
386 | } |
387 | |
388 | journal->j_tail = 0; |
389 | return err; |
390 | } |
391 | |
392 | static inline unsigned long long read_tag_block(journal_t *journal, |
393 | journal_block_tag_t *tag) |
394 | { |
395 | unsigned long long block = be32_to_cpu(tag->t_blocknr); |
396 | if (jbd2_has_feature_64bit(j: journal)) |
397 | block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; |
398 | return block; |
399 | } |
400 | |
401 | /* |
402 | * calc_chksums calculates the checksums for the blocks described in the |
403 | * descriptor block. |
404 | */ |
405 | static int calc_chksums(journal_t *journal, struct buffer_head *bh, |
406 | unsigned long *next_log_block, __u32 *crc32_sum) |
407 | { |
408 | int i, num_blks, err; |
409 | unsigned long io_block; |
410 | struct buffer_head *obh; |
411 | |
412 | num_blks = count_tags(journal, bh); |
413 | /* Calculate checksum of the descriptor block. */ |
414 | *crc32_sum = crc32_be(crc: *crc32_sum, p: (void *)bh->b_data, len: bh->b_size); |
415 | |
416 | for (i = 0; i < num_blks; i++) { |
417 | io_block = (*next_log_block)++; |
418 | wrap(journal, *next_log_block); |
419 | err = jread(bhp: &obh, journal, offset: io_block); |
420 | if (err) { |
421 | printk(KERN_ERR "JBD2: IO error %d recovering block " |
422 | "%lu in log\n" , err, io_block); |
423 | return 1; |
424 | } else { |
425 | *crc32_sum = crc32_be(crc: *crc32_sum, p: (void *)obh->b_data, |
426 | len: obh->b_size); |
427 | } |
428 | put_bh(bh: obh); |
429 | } |
430 | return 0; |
431 | } |
432 | |
433 | static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) |
434 | { |
435 | struct commit_header *h; |
436 | __be32 provided; |
437 | __u32 calculated; |
438 | |
439 | if (!jbd2_journal_has_csum_v2or3(journal: j)) |
440 | return 1; |
441 | |
442 | h = buf; |
443 | provided = h->h_chksum[0]; |
444 | h->h_chksum[0] = 0; |
445 | calculated = jbd2_chksum(journal: j, crc: j->j_csum_seed, address: buf, length: j->j_blocksize); |
446 | h->h_chksum[0] = provided; |
447 | |
448 | return provided == cpu_to_be32(calculated); |
449 | } |
450 | |
451 | static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, |
452 | journal_block_tag3_t *tag3, |
453 | void *buf, __u32 sequence) |
454 | { |
455 | __u32 csum32; |
456 | __be32 seq; |
457 | |
458 | if (!jbd2_journal_has_csum_v2or3(journal: j)) |
459 | return 1; |
460 | |
461 | seq = cpu_to_be32(sequence); |
462 | csum32 = jbd2_chksum(journal: j, crc: j->j_csum_seed, address: (__u8 *)&seq, length: sizeof(seq)); |
463 | csum32 = jbd2_chksum(journal: j, crc: csum32, address: buf, length: j->j_blocksize); |
464 | |
465 | if (jbd2_has_feature_csum3(j)) |
466 | return tag3->t_checksum == cpu_to_be32(csum32); |
467 | else |
468 | return tag->t_checksum == cpu_to_be16(csum32); |
469 | } |
470 | |
471 | static int do_one_pass(journal_t *journal, |
472 | struct recovery_info *info, enum passtype pass) |
473 | { |
474 | unsigned int first_commit_ID, next_commit_ID; |
475 | unsigned long next_log_block, head_block; |
476 | int err, success = 0; |
477 | journal_superblock_t * sb; |
478 | journal_header_t * tmp; |
479 | struct buffer_head * bh; |
480 | unsigned int sequence; |
481 | int blocktype; |
482 | int tag_bytes = journal_tag_bytes(journal); |
483 | __u32 crc32_sum = ~0; /* Transactional Checksums */ |
484 | int descr_csum_size = 0; |
485 | int block_error = 0; |
486 | bool need_check_commit_time = false; |
487 | __u64 last_trans_commit_time = 0, commit_time; |
488 | |
489 | /* |
490 | * First thing is to establish what we expect to find in the log |
491 | * (in terms of transaction IDs), and where (in terms of log |
492 | * block offsets): query the superblock. |
493 | */ |
494 | |
495 | sb = journal->j_superblock; |
496 | next_commit_ID = be32_to_cpu(sb->s_sequence); |
497 | next_log_block = be32_to_cpu(sb->s_start); |
498 | head_block = next_log_block; |
499 | |
500 | first_commit_ID = next_commit_ID; |
501 | if (pass == PASS_SCAN) |
502 | info->start_transaction = first_commit_ID; |
503 | |
504 | jbd2_debug(1, "Starting recovery pass %d\n" , pass); |
505 | |
506 | /* |
507 | * Now we walk through the log, transaction by transaction, |
508 | * making sure that each transaction has a commit block in the |
509 | * expected place. Each complete transaction gets replayed back |
510 | * into the main filesystem. |
511 | */ |
512 | |
513 | while (1) { |
514 | int flags; |
515 | char * tagp; |
516 | journal_block_tag_t tag; |
517 | struct buffer_head * obh; |
518 | struct buffer_head * nbh; |
519 | |
520 | cond_resched(); |
521 | |
522 | /* If we already know where to stop the log traversal, |
523 | * check right now that we haven't gone past the end of |
524 | * the log. */ |
525 | |
526 | if (pass != PASS_SCAN) |
527 | if (tid_geq(x: next_commit_ID, y: info->end_transaction)) |
528 | break; |
529 | |
530 | jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n" , |
531 | next_commit_ID, next_log_block, journal->j_last); |
532 | |
533 | /* Skip over each chunk of the transaction looking |
534 | * either the next descriptor block or the final commit |
535 | * record. */ |
536 | |
537 | jbd2_debug(3, "JBD2: checking block %ld\n" , next_log_block); |
538 | err = jread(bhp: &bh, journal, offset: next_log_block); |
539 | if (err) |
540 | goto failed; |
541 | |
542 | next_log_block++; |
543 | wrap(journal, next_log_block); |
544 | |
545 | /* What kind of buffer is it? |
546 | * |
547 | * If it is a descriptor block, check that it has the |
548 | * expected sequence number. Otherwise, we're all done |
549 | * here. */ |
550 | |
551 | tmp = (journal_header_t *)bh->b_data; |
552 | |
553 | if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { |
554 | brelse(bh); |
555 | break; |
556 | } |
557 | |
558 | blocktype = be32_to_cpu(tmp->h_blocktype); |
559 | sequence = be32_to_cpu(tmp->h_sequence); |
560 | jbd2_debug(3, "Found magic %d, sequence %d\n" , |
561 | blocktype, sequence); |
562 | |
563 | if (sequence != next_commit_ID) { |
564 | brelse(bh); |
565 | break; |
566 | } |
567 | |
568 | /* OK, we have a valid descriptor block which matches |
569 | * all of the sequence number checks. What are we going |
570 | * to do with it? That depends on the pass... */ |
571 | |
572 | switch(blocktype) { |
573 | case JBD2_DESCRIPTOR_BLOCK: |
574 | /* Verify checksum first */ |
575 | if (jbd2_journal_has_csum_v2or3(journal)) |
576 | descr_csum_size = |
577 | sizeof(struct jbd2_journal_block_tail); |
578 | if (descr_csum_size > 0 && |
579 | !jbd2_descriptor_block_csum_verify(j: journal, |
580 | buf: bh->b_data)) { |
581 | /* |
582 | * PASS_SCAN can see stale blocks due to lazy |
583 | * journal init. Don't error out on those yet. |
584 | */ |
585 | if (pass != PASS_SCAN) { |
586 | pr_err("JBD2: Invalid checksum recovering block %lu in log\n" , |
587 | next_log_block); |
588 | err = -EFSBADCRC; |
589 | brelse(bh); |
590 | goto failed; |
591 | } |
592 | need_check_commit_time = true; |
593 | jbd2_debug(1, |
594 | "invalid descriptor block found in %lu\n" , |
595 | next_log_block); |
596 | } |
597 | |
598 | /* If it is a valid descriptor block, replay it |
599 | * in pass REPLAY; if journal_checksums enabled, then |
600 | * calculate checksums in PASS_SCAN, otherwise, |
601 | * just skip over the blocks it describes. */ |
602 | if (pass != PASS_REPLAY) { |
603 | if (pass == PASS_SCAN && |
604 | jbd2_has_feature_checksum(j: journal) && |
605 | !need_check_commit_time && |
606 | !info->end_transaction) { |
607 | if (calc_chksums(journal, bh, |
608 | next_log_block: &next_log_block, |
609 | crc32_sum: &crc32_sum)) { |
610 | put_bh(bh); |
611 | break; |
612 | } |
613 | put_bh(bh); |
614 | continue; |
615 | } |
616 | next_log_block += count_tags(journal, bh); |
617 | wrap(journal, next_log_block); |
618 | put_bh(bh); |
619 | continue; |
620 | } |
621 | |
622 | /* A descriptor block: we can now write all of |
623 | * the data blocks. Yay, useful work is finally |
624 | * getting done here! */ |
625 | |
626 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
627 | while ((tagp - bh->b_data + tag_bytes) |
628 | <= journal->j_blocksize - descr_csum_size) { |
629 | unsigned long io_block; |
630 | |
631 | memcpy(&tag, tagp, sizeof(tag)); |
632 | flags = be16_to_cpu(tag.t_flags); |
633 | |
634 | io_block = next_log_block++; |
635 | wrap(journal, next_log_block); |
636 | err = jread(bhp: &obh, journal, offset: io_block); |
637 | if (err) { |
638 | /* Recover what we can, but |
639 | * report failure at the end. */ |
640 | success = err; |
641 | printk(KERN_ERR |
642 | "JBD2: IO error %d recovering " |
643 | "block %lu in log\n" , |
644 | err, io_block); |
645 | } else { |
646 | unsigned long long blocknr; |
647 | |
648 | J_ASSERT(obh != NULL); |
649 | blocknr = read_tag_block(journal, |
650 | tag: &tag); |
651 | |
652 | /* If the block has been |
653 | * revoked, then we're all done |
654 | * here. */ |
655 | if (jbd2_journal_test_revoke |
656 | (journal, blocknr, |
657 | next_commit_ID)) { |
658 | brelse(bh: obh); |
659 | ++info->nr_revoke_hits; |
660 | goto skip_write; |
661 | } |
662 | |
663 | /* Look for block corruption */ |
664 | if (!jbd2_block_tag_csum_verify( |
665 | j: journal, tag: &tag, tag3: (journal_block_tag3_t *)tagp, |
666 | buf: obh->b_data, be32_to_cpu(tmp->h_sequence))) { |
667 | brelse(bh: obh); |
668 | success = -EFSBADCRC; |
669 | printk(KERN_ERR "JBD2: Invalid " |
670 | "checksum recovering " |
671 | "data block %llu in " |
672 | "journal block %lu\n" , |
673 | blocknr, io_block); |
674 | block_error = 1; |
675 | goto skip_write; |
676 | } |
677 | |
678 | /* Find a buffer for the new |
679 | * data being restored */ |
680 | nbh = __getblk(bdev: journal->j_fs_dev, |
681 | block: blocknr, |
682 | size: journal->j_blocksize); |
683 | if (nbh == NULL) { |
684 | printk(KERN_ERR |
685 | "JBD2: Out of memory " |
686 | "during recovery.\n" ); |
687 | err = -ENOMEM; |
688 | brelse(bh); |
689 | brelse(bh: obh); |
690 | goto failed; |
691 | } |
692 | |
693 | lock_buffer(bh: nbh); |
694 | memcpy(nbh->b_data, obh->b_data, |
695 | journal->j_blocksize); |
696 | if (flags & JBD2_FLAG_ESCAPE) { |
697 | *((__be32 *)nbh->b_data) = |
698 | cpu_to_be32(JBD2_MAGIC_NUMBER); |
699 | } |
700 | |
701 | BUFFER_TRACE(nbh, "marking dirty" ); |
702 | set_buffer_uptodate(nbh); |
703 | mark_buffer_dirty(bh: nbh); |
704 | BUFFER_TRACE(nbh, "marking uptodate" ); |
705 | ++info->nr_replays; |
706 | unlock_buffer(bh: nbh); |
707 | brelse(bh: obh); |
708 | brelse(bh: nbh); |
709 | } |
710 | |
711 | skip_write: |
712 | tagp += tag_bytes; |
713 | if (!(flags & JBD2_FLAG_SAME_UUID)) |
714 | tagp += 16; |
715 | |
716 | if (flags & JBD2_FLAG_LAST_TAG) |
717 | break; |
718 | } |
719 | |
720 | brelse(bh); |
721 | continue; |
722 | |
723 | case JBD2_COMMIT_BLOCK: |
724 | /* How to differentiate between interrupted commit |
725 | * and journal corruption ? |
726 | * |
727 | * {nth transaction} |
728 | * Checksum Verification Failed |
729 | * | |
730 | * ____________________ |
731 | * | | |
732 | * async_commit sync_commit |
733 | * | | |
734 | * | GO TO NEXT "Journal Corruption" |
735 | * | TRANSACTION |
736 | * | |
737 | * {(n+1)th transanction} |
738 | * | |
739 | * _______|______________ |
740 | * | | |
741 | * Commit block found Commit block not found |
742 | * | | |
743 | * "Journal Corruption" | |
744 | * _____________|_________ |
745 | * | | |
746 | * nth trans corrupt OR nth trans |
747 | * and (n+1)th interrupted interrupted |
748 | * before commit block |
749 | * could reach the disk. |
750 | * (Cannot find the difference in above |
751 | * mentioned conditions. Hence assume |
752 | * "Interrupted Commit".) |
753 | */ |
754 | commit_time = be64_to_cpu( |
755 | ((struct commit_header *)bh->b_data)->h_commit_sec); |
756 | /* |
757 | * If need_check_commit_time is set, it means we are in |
758 | * PASS_SCAN and csum verify failed before. If |
759 | * commit_time is increasing, it's the same journal, |
760 | * otherwise it is stale journal block, just end this |
761 | * recovery. |
762 | */ |
763 | if (need_check_commit_time) { |
764 | if (commit_time >= last_trans_commit_time) { |
765 | pr_err("JBD2: Invalid checksum found in transaction %u\n" , |
766 | next_commit_ID); |
767 | err = -EFSBADCRC; |
768 | brelse(bh); |
769 | goto failed; |
770 | } |
771 | ignore_crc_mismatch: |
772 | /* |
773 | * It likely does not belong to same journal, |
774 | * just end this recovery with success. |
775 | */ |
776 | jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n" , |
777 | next_commit_ID); |
778 | brelse(bh); |
779 | goto done; |
780 | } |
781 | |
782 | /* |
783 | * Found an expected commit block: if checksums |
784 | * are present, verify them in PASS_SCAN; else not |
785 | * much to do other than move on to the next sequence |
786 | * number. |
787 | */ |
788 | if (pass == PASS_SCAN && |
789 | jbd2_has_feature_checksum(j: journal)) { |
790 | struct commit_header *cbh = |
791 | (struct commit_header *)bh->b_data; |
792 | unsigned found_chksum = |
793 | be32_to_cpu(cbh->h_chksum[0]); |
794 | |
795 | if (info->end_transaction) { |
796 | journal->j_failed_commit = |
797 | info->end_transaction; |
798 | brelse(bh); |
799 | break; |
800 | } |
801 | |
802 | /* Neither checksum match nor unused? */ |
803 | if (!((crc32_sum == found_chksum && |
804 | cbh->h_chksum_type == |
805 | JBD2_CRC32_CHKSUM && |
806 | cbh->h_chksum_size == |
807 | JBD2_CRC32_CHKSUM_SIZE) || |
808 | (cbh->h_chksum_type == 0 && |
809 | cbh->h_chksum_size == 0 && |
810 | found_chksum == 0))) |
811 | goto chksum_error; |
812 | |
813 | crc32_sum = ~0; |
814 | } |
815 | if (pass == PASS_SCAN && |
816 | !jbd2_commit_block_csum_verify(j: journal, |
817 | buf: bh->b_data)) { |
818 | chksum_error: |
819 | if (commit_time < last_trans_commit_time) |
820 | goto ignore_crc_mismatch; |
821 | info->end_transaction = next_commit_ID; |
822 | info->head_block = head_block; |
823 | |
824 | if (!jbd2_has_feature_async_commit(j: journal)) { |
825 | journal->j_failed_commit = |
826 | next_commit_ID; |
827 | brelse(bh); |
828 | break; |
829 | } |
830 | } |
831 | if (pass == PASS_SCAN) { |
832 | last_trans_commit_time = commit_time; |
833 | head_block = next_log_block; |
834 | } |
835 | brelse(bh); |
836 | next_commit_ID++; |
837 | continue; |
838 | |
839 | case JBD2_REVOKE_BLOCK: |
840 | /* |
841 | * Check revoke block crc in pass_scan, if csum verify |
842 | * failed, check commit block time later. |
843 | */ |
844 | if (pass == PASS_SCAN && |
845 | !jbd2_descriptor_block_csum_verify(j: journal, |
846 | buf: bh->b_data)) { |
847 | jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n" , |
848 | next_log_block); |
849 | need_check_commit_time = true; |
850 | } |
851 | /* If we aren't in the REVOKE pass, then we can |
852 | * just skip over this block. */ |
853 | if (pass != PASS_REVOKE) { |
854 | brelse(bh); |
855 | continue; |
856 | } |
857 | |
858 | err = scan_revoke_records(journal, bh, |
859 | next_commit_ID, info); |
860 | brelse(bh); |
861 | if (err) |
862 | goto failed; |
863 | continue; |
864 | |
865 | default: |
866 | jbd2_debug(3, "Unrecognised magic %d, end of scan.\n" , |
867 | blocktype); |
868 | brelse(bh); |
869 | goto done; |
870 | } |
871 | } |
872 | |
873 | done: |
874 | /* |
875 | * We broke out of the log scan loop: either we came to the |
876 | * known end of the log or we found an unexpected block in the |
877 | * log. If the latter happened, then we know that the "current" |
878 | * transaction marks the end of the valid log. |
879 | */ |
880 | |
881 | if (pass == PASS_SCAN) { |
882 | if (!info->end_transaction) |
883 | info->end_transaction = next_commit_ID; |
884 | if (!info->head_block) |
885 | info->head_block = head_block; |
886 | } else { |
887 | /* It's really bad news if different passes end up at |
888 | * different places (but possible due to IO errors). */ |
889 | if (info->end_transaction != next_commit_ID) { |
890 | printk(KERN_ERR "JBD2: recovery pass %d ended at " |
891 | "transaction %u, expected %u\n" , |
892 | pass, next_commit_ID, info->end_transaction); |
893 | if (!success) |
894 | success = -EIO; |
895 | } |
896 | } |
897 | |
898 | if (jbd2_has_feature_fast_commit(j: journal) && pass != PASS_REVOKE) { |
899 | err = fc_do_one_pass(journal, info, pass); |
900 | if (err) |
901 | success = err; |
902 | } |
903 | |
904 | if (block_error && success == 0) |
905 | success = -EIO; |
906 | return success; |
907 | |
908 | failed: |
909 | return err; |
910 | } |
911 | |
912 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ |
913 | |
914 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, |
915 | tid_t sequence, struct recovery_info *info) |
916 | { |
917 | jbd2_journal_revoke_header_t *; |
918 | int offset, max; |
919 | unsigned csum_size = 0; |
920 | __u32 rcount; |
921 | int record_len = 4; |
922 | |
923 | header = (jbd2_journal_revoke_header_t *) bh->b_data; |
924 | offset = sizeof(jbd2_journal_revoke_header_t); |
925 | rcount = be32_to_cpu(header->r_count); |
926 | |
927 | if (jbd2_journal_has_csum_v2or3(journal)) |
928 | csum_size = sizeof(struct jbd2_journal_block_tail); |
929 | if (rcount > journal->j_blocksize - csum_size) |
930 | return -EINVAL; |
931 | max = rcount; |
932 | |
933 | if (jbd2_has_feature_64bit(j: journal)) |
934 | record_len = 8; |
935 | |
936 | while (offset + record_len <= max) { |
937 | unsigned long long blocknr; |
938 | int err; |
939 | |
940 | if (record_len == 4) |
941 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); |
942 | else |
943 | blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset))); |
944 | offset += record_len; |
945 | err = jbd2_journal_set_revoke(journal, blocknr, sequence); |
946 | if (err) |
947 | return err; |
948 | ++info->nr_revokes; |
949 | } |
950 | return 0; |
951 | } |
952 | |