1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "alloc_background.h" |
5 | #include "bkey_buf.h" |
6 | #include "btree_journal_iter.h" |
7 | #include "btree_node_scan.h" |
8 | #include "btree_update.h" |
9 | #include "btree_update_interior.h" |
10 | #include "btree_io.h" |
11 | #include "buckets.h" |
12 | #include "dirent.h" |
13 | #include "errcode.h" |
14 | #include "error.h" |
15 | #include "fs-common.h" |
16 | #include "journal_io.h" |
17 | #include "journal_reclaim.h" |
18 | #include "journal_seq_blacklist.h" |
19 | #include "logged_ops.h" |
20 | #include "move.h" |
21 | #include "quota.h" |
22 | #include "rebalance.h" |
23 | #include "recovery.h" |
24 | #include "recovery_passes.h" |
25 | #include "replicas.h" |
26 | #include "sb-clean.h" |
27 | #include "sb-downgrade.h" |
28 | #include "snapshot.h" |
29 | #include "super-io.h" |
30 | |
31 | #include <linux/sort.h> |
32 | #include <linux/stat.h> |
33 | |
/*
 * Brace initializer for a struct qstr describing the string @str: .len is set
 * to strlen(@str) and .name to @str itself.  @str is expanded twice, so it
 * should not have side effects.
 */
#define QSTR(str)						\
	{ { { .len = strlen(str) } }, .name = str }
35 | |
36 | void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) |
37 | { |
38 | u64 b = BIT_ULL(btree); |
39 | |
40 | if (!(c->sb.btrees_lost_data & b)) { |
41 | bch_err(c, "flagging btree %s lost data" , bch2_btree_id_str(btree)); |
42 | |
43 | mutex_lock(&c->sb_lock); |
44 | bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); |
45 | bch2_write_super(c); |
46 | mutex_unlock(lock: &c->sb_lock); |
47 | } |
48 | } |
49 | |
50 | /* for -o reconstruct_alloc: */ |
51 | static void bch2_reconstruct_alloc(struct bch_fs *c) |
52 | { |
53 | bch2_journal_log_msg(c, "dropping alloc info" ); |
54 | bch_info(c, "dropping and reconstructing all alloc info" ); |
55 | |
56 | mutex_lock(&c->sb_lock); |
57 | struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
58 | |
59 | __set_bit_le64(bit: BCH_RECOVERY_PASS_STABLE_check_allocations, addr: ext->recovery_passes_required); |
60 | __set_bit_le64(bit: BCH_RECOVERY_PASS_STABLE_check_alloc_info, addr: ext->recovery_passes_required); |
61 | __set_bit_le64(bit: BCH_RECOVERY_PASS_STABLE_check_lrus, addr: ext->recovery_passes_required); |
62 | __set_bit_le64(bit: BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, addr: ext->recovery_passes_required); |
63 | __set_bit_le64(bit: BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, addr: ext->recovery_passes_required); |
64 | |
65 | __set_bit_le64(bit: BCH_FSCK_ERR_ptr_to_missing_alloc_key, addr: ext->errors_silent); |
66 | __set_bit_le64(bit: BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, addr: ext->errors_silent); |
67 | __set_bit_le64(bit: BCH_FSCK_ERR_stale_dirty_ptr, addr: ext->errors_silent); |
68 | __set_bit_le64(bit: BCH_FSCK_ERR_alloc_key_data_type_wrong, addr: ext->errors_silent); |
69 | __set_bit_le64(bit: BCH_FSCK_ERR_alloc_key_gen_wrong, addr: ext->errors_silent); |
70 | __set_bit_le64(bit: BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, addr: ext->errors_silent); |
71 | __set_bit_le64(bit: BCH_FSCK_ERR_alloc_key_stripe_wrong, addr: ext->errors_silent); |
72 | __set_bit_le64(bit: BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, addr: ext->errors_silent); |
73 | __set_bit_le64(bit: BCH_FSCK_ERR_need_discard_key_wrong, addr: ext->errors_silent); |
74 | __set_bit_le64(bit: BCH_FSCK_ERR_freespace_key_wrong, addr: ext->errors_silent); |
75 | __set_bit_le64(bit: BCH_FSCK_ERR_bucket_gens_key_wrong, addr: ext->errors_silent); |
76 | __set_bit_le64(bit: BCH_FSCK_ERR_freespace_hole_missing, addr: ext->errors_silent); |
77 | __set_bit_le64(bit: BCH_FSCK_ERR_ptr_to_missing_backpointer, addr: ext->errors_silent); |
78 | __set_bit_le64(bit: BCH_FSCK_ERR_lru_entry_bad, addr: ext->errors_silent); |
79 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
80 | |
81 | bch2_write_super(c); |
82 | mutex_unlock(lock: &c->sb_lock); |
83 | |
84 | c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); |
85 | |
86 | |
87 | bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, |
88 | 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); |
89 | bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, |
90 | 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); |
91 | bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, |
92 | 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); |
93 | bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, |
94 | 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); |
95 | bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, |
96 | 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); |
97 | } |
98 | |
99 | /* |
100 | * Btree node pointers have a field to stack a pointer to the in memory btree |
101 | * node; we need to zero out this field when reading in btree nodes, or when |
102 | * reading in keys from the journal: |
103 | */ |
104 | static void zero_out_btree_mem_ptr(struct journal_keys *keys) |
105 | { |
106 | darray_for_each(*keys, i) |
107 | if (i->k->k.type == KEY_TYPE_btree_ptr_v2) |
108 | bkey_i_to_btree_ptr_v2(k: i->k)->v.mem_ptr = 0; |
109 | } |
110 | |
111 | /* journal replay: */ |
112 | |
113 | static void replay_now_at(struct journal *j, u64 seq) |
114 | { |
115 | BUG_ON(seq < j->replay_journal_seq); |
116 | |
117 | seq = min(seq, j->replay_journal_seq_end); |
118 | |
119 | while (j->replay_journal_seq < seq) |
120 | bch2_journal_pin_put(j, j->replay_journal_seq++); |
121 | } |
122 | |
123 | static int bch2_journal_replay_key(struct btree_trans *trans, |
124 | struct journal_key *k) |
125 | { |
126 | struct btree_iter iter; |
127 | unsigned iter_flags = |
128 | BTREE_ITER_INTENT| |
129 | BTREE_ITER_NOT_EXTENTS; |
130 | unsigned update_flags = BTREE_TRIGGER_NORUN; |
131 | int ret; |
132 | |
133 | if (k->overwritten) |
134 | return 0; |
135 | |
136 | trans->journal_res.seq = k->journal_seq; |
137 | |
138 | /* |
139 | * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to |
140 | * keep the key cache coherent with the underlying btree. Nothing |
141 | * besides the allocator is doing updates yet so we don't need key cache |
142 | * coherency for non-alloc btrees, and key cache fills for snapshots |
143 | * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until |
144 | * the snapshots recovery pass runs. |
145 | */ |
146 | if (!k->level && k->btree_id == BTREE_ID_alloc) |
147 | iter_flags |= BTREE_ITER_CACHED; |
148 | else |
149 | update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; |
150 | |
151 | bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, |
152 | BTREE_MAX_DEPTH, k->level, |
153 | iter_flags); |
154 | ret = bch2_btree_iter_traverse(&iter); |
155 | if (ret) |
156 | goto out; |
157 | |
158 | struct btree_path *path = btree_iter_path(trans, iter: &iter); |
159 | if (unlikely(!btree_path_node(path, k->level))) { |
160 | bch2_trans_iter_exit(trans, &iter); |
161 | bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, |
162 | BTREE_MAX_DEPTH, 0, iter_flags); |
163 | ret = bch2_btree_iter_traverse(&iter) ?: |
164 | bch2_btree_increase_depth(trans, iter.path, 0) ?: |
165 | -BCH_ERR_transaction_restart_nested; |
166 | goto out; |
167 | } |
168 | |
169 | /* Must be checked with btree locked: */ |
170 | if (k->overwritten) |
171 | goto out; |
172 | |
173 | ret = bch2_trans_update(trans, &iter, k->k, update_flags); |
174 | out: |
175 | bch2_trans_iter_exit(trans, &iter); |
176 | return ret; |
177 | } |
178 | |
179 | static int journal_sort_seq_cmp(const void *_l, const void *_r) |
180 | { |
181 | const struct journal_key *l = *((const struct journal_key **)_l); |
182 | const struct journal_key *r = *((const struct journal_key **)_r); |
183 | |
184 | return cmp_int(l->journal_seq, r->journal_seq); |
185 | } |
186 | |
187 | int bch2_journal_replay(struct bch_fs *c) |
188 | { |
189 | struct journal_keys *keys = &c->journal_keys; |
190 | DARRAY(struct journal_key *) keys_sorted = { 0 }; |
191 | struct journal *j = &c->journal; |
192 | u64 start_seq = c->journal_replay_seq_start; |
193 | u64 end_seq = c->journal_replay_seq_start; |
194 | struct btree_trans *trans = bch2_trans_get(c); |
195 | bool immediate_flush = false; |
196 | int ret = 0; |
197 | |
198 | if (keys->nr) { |
199 | ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)" , |
200 | keys->nr, start_seq, end_seq); |
201 | if (ret) |
202 | goto err; |
203 | } |
204 | |
205 | BUG_ON(!atomic_read(&keys->ref)); |
206 | |
207 | move_gap(keys, keys->nr); |
208 | |
209 | /* |
210 | * First, attempt to replay keys in sorted order. This is more |
211 | * efficient - better locality of btree access - but some might fail if |
212 | * that would cause a journal deadlock. |
213 | */ |
214 | darray_for_each(*keys, k) { |
215 | cond_resched(); |
216 | |
217 | /* |
218 | * k->allocated means the key wasn't read in from the journal, |
219 | * rather it was from early repair code |
220 | */ |
221 | if (k->allocated) |
222 | immediate_flush = true; |
223 | |
224 | /* Skip fastpath if we're low on space in the journal */ |
225 | ret = c->journal.watermark ? -1 : |
226 | commit_do(trans, NULL, NULL, |
227 | BCH_TRANS_COMMIT_no_enospc| |
228 | BCH_TRANS_COMMIT_journal_reclaim| |
229 | (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), |
230 | bch2_journal_replay_key(trans, k)); |
231 | BUG_ON(!ret && !k->overwritten); |
232 | if (ret) { |
233 | ret = darray_push(&keys_sorted, k); |
234 | if (ret) |
235 | goto err; |
236 | } |
237 | } |
238 | |
239 | /* |
240 | * Now, replay any remaining keys in the order in which they appear in |
241 | * the journal, unpinning those journal entries as we go: |
242 | */ |
243 | sort(base: keys_sorted.data, num: keys_sorted.nr, |
244 | size: sizeof(keys_sorted.data[0]), |
245 | cmp_func: journal_sort_seq_cmp, NULL); |
246 | |
247 | darray_for_each(keys_sorted, kp) { |
248 | cond_resched(); |
249 | |
250 | struct journal_key *k = *kp; |
251 | |
252 | if (k->journal_seq) |
253 | replay_now_at(j, seq: k->journal_seq); |
254 | else |
255 | replay_now_at(j, seq: j->replay_journal_seq_end); |
256 | |
257 | ret = commit_do(trans, NULL, NULL, |
258 | BCH_TRANS_COMMIT_no_enospc| |
259 | (!k->allocated |
260 | ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim |
261 | : 0), |
262 | bch2_journal_replay_key(trans, k)); |
263 | bch_err_msg(c, ret, "while replaying key at btree %s level %u:" , |
264 | bch2_btree_id_str(k->btree_id), k->level); |
265 | if (ret) |
266 | goto err; |
267 | |
268 | BUG_ON(!k->overwritten); |
269 | } |
270 | |
271 | /* |
272 | * We need to put our btree_trans before calling flush_all_pins(), since |
273 | * that will use a btree_trans internally |
274 | */ |
275 | bch2_trans_put(trans); |
276 | trans = NULL; |
277 | |
278 | if (!c->opts.retain_recovery_info && |
279 | c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) |
280 | bch2_journal_keys_put_initial(c); |
281 | |
282 | replay_now_at(j, seq: j->replay_journal_seq_end); |
283 | j->replay_journal_seq = 0; |
284 | |
285 | bch2_journal_set_replay_done(j); |
286 | |
287 | /* if we did any repair, flush it immediately */ |
288 | if (immediate_flush) { |
289 | bch2_journal_flush_all_pins(j: &c->journal); |
290 | ret = bch2_journal_meta(&c->journal); |
291 | } |
292 | |
293 | if (keys->nr) |
294 | bch2_journal_log_msg(c, "journal replay finished" ); |
295 | err: |
296 | if (trans) |
297 | bch2_trans_put(trans); |
298 | darray_exit(&keys_sorted); |
299 | bch_err_fn(c, ret); |
300 | return ret; |
301 | } |
302 | |
303 | /* journal replay early: */ |
304 | |
305 | static int journal_replay_entry_early(struct bch_fs *c, |
306 | struct jset_entry *entry) |
307 | { |
308 | int ret = 0; |
309 | |
310 | switch (entry->type) { |
311 | case BCH_JSET_ENTRY_btree_root: { |
312 | struct btree_root *r; |
313 | |
314 | while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { |
315 | ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); |
316 | if (ret) |
317 | return ret; |
318 | } |
319 | |
320 | r = bch2_btree_id_root(c, id: entry->btree_id); |
321 | |
322 | if (entry->u64s) { |
323 | r->level = entry->level; |
324 | bkey_copy(dst: &r->key, src: (struct bkey_i *) entry->start); |
325 | r->error = 0; |
326 | } else { |
327 | r->error = -BCH_ERR_btree_node_read_error; |
328 | } |
329 | r->alive = true; |
330 | break; |
331 | } |
332 | case BCH_JSET_ENTRY_usage: { |
333 | struct jset_entry_usage *u = |
334 | container_of(entry, struct jset_entry_usage, entry); |
335 | |
336 | switch (entry->btree_id) { |
337 | case BCH_FS_USAGE_reserved: |
338 | if (entry->level < BCH_REPLICAS_MAX) |
339 | c->usage_base->persistent_reserved[entry->level] = |
340 | le64_to_cpu(u->v); |
341 | break; |
342 | case BCH_FS_USAGE_inodes: |
343 | c->usage_base->b.nr_inodes = le64_to_cpu(u->v); |
344 | break; |
345 | case BCH_FS_USAGE_key_version: |
346 | atomic64_set(v: &c->key_version, |
347 | le64_to_cpu(u->v)); |
348 | break; |
349 | } |
350 | |
351 | break; |
352 | } |
353 | case BCH_JSET_ENTRY_data_usage: { |
354 | struct jset_entry_data_usage *u = |
355 | container_of(entry, struct jset_entry_data_usage, entry); |
356 | |
357 | ret = bch2_replicas_set_usage(c, &u->r, |
358 | le64_to_cpu(u->v)); |
359 | break; |
360 | } |
361 | case BCH_JSET_ENTRY_dev_usage: { |
362 | struct jset_entry_dev_usage *u = |
363 | container_of(entry, struct jset_entry_dev_usage, entry); |
364 | struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); |
365 | unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); |
366 | |
367 | for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { |
368 | ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); |
369 | ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); |
370 | ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); |
371 | } |
372 | |
373 | break; |
374 | } |
375 | case BCH_JSET_ENTRY_blacklist: { |
376 | struct jset_entry_blacklist *bl_entry = |
377 | container_of(entry, struct jset_entry_blacklist, entry); |
378 | |
379 | ret = bch2_journal_seq_blacklist_add(c, |
380 | le64_to_cpu(bl_entry->seq), |
381 | le64_to_cpu(bl_entry->seq) + 1); |
382 | break; |
383 | } |
384 | case BCH_JSET_ENTRY_blacklist_v2: { |
385 | struct jset_entry_blacklist_v2 *bl_entry = |
386 | container_of(entry, struct jset_entry_blacklist_v2, entry); |
387 | |
388 | ret = bch2_journal_seq_blacklist_add(c, |
389 | le64_to_cpu(bl_entry->start), |
390 | le64_to_cpu(bl_entry->end) + 1); |
391 | break; |
392 | } |
393 | case BCH_JSET_ENTRY_clock: { |
394 | struct jset_entry_clock *clock = |
395 | container_of(entry, struct jset_entry_clock, entry); |
396 | |
397 | atomic64_set(v: &c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); |
398 | } |
399 | } |
400 | |
401 | return ret; |
402 | } |
403 | |
404 | static int journal_replay_early(struct bch_fs *c, |
405 | struct bch_sb_field_clean *clean) |
406 | { |
407 | if (clean) { |
408 | for (struct jset_entry *entry = clean->start; |
409 | entry != vstruct_end(&clean->field); |
410 | entry = vstruct_next(entry)) { |
411 | int ret = journal_replay_entry_early(c, entry); |
412 | if (ret) |
413 | return ret; |
414 | } |
415 | } else { |
416 | struct genradix_iter iter; |
417 | struct journal_replay *i, **_i; |
418 | |
419 | genradix_for_each(&c->journal_entries, iter, _i) { |
420 | i = *_i; |
421 | |
422 | if (journal_replay_ignore(i)) |
423 | continue; |
424 | |
425 | vstruct_for_each(&i->j, entry) { |
426 | int ret = journal_replay_entry_early(c, entry); |
427 | if (ret) |
428 | return ret; |
429 | } |
430 | } |
431 | } |
432 | |
433 | bch2_fs_usage_initialize(c); |
434 | |
435 | return 0; |
436 | } |
437 | |
438 | /* sb clean section: */ |
439 | |
440 | static int read_btree_roots(struct bch_fs *c) |
441 | { |
442 | int ret = 0; |
443 | |
444 | for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { |
445 | struct btree_root *r = bch2_btree_id_root(c, id: i); |
446 | |
447 | if (!r->alive) |
448 | continue; |
449 | |
450 | if (btree_id_is_alloc(id: i) && c->opts.reconstruct_alloc) |
451 | continue; |
452 | |
453 | if (mustfix_fsck_err_on((ret = r->error), |
454 | c, btree_root_bkey_invalid, |
455 | "invalid btree root %s" , |
456 | bch2_btree_id_str(i)) || |
457 | mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), |
458 | c, btree_root_read_error, |
459 | "error reading btree root %s l=%u: %s" , |
460 | bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { |
461 | if (btree_id_is_alloc(id: i)) { |
462 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); |
463 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); |
464 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); |
465 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); |
466 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); |
467 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
468 | r->error = 0; |
469 | } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { |
470 | bch_info(c, "will run btree node scan" ); |
471 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); |
472 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); |
473 | } |
474 | |
475 | ret = 0; |
476 | bch2_btree_lost_data(c, btree: i); |
477 | } |
478 | } |
479 | |
480 | for (unsigned i = 0; i < BTREE_ID_NR; i++) { |
481 | struct btree_root *r = bch2_btree_id_root(c, id: i); |
482 | |
483 | if (!r->b && !r->error) { |
484 | r->alive = false; |
485 | r->level = 0; |
486 | bch2_btree_root_alloc_fake(c, i, 0); |
487 | } |
488 | } |
489 | fsck_err: |
490 | return ret; |
491 | } |
492 | |
493 | static bool check_version_upgrade(struct bch_fs *c) |
494 | { |
495 | unsigned latest_version = bcachefs_metadata_version_current; |
496 | unsigned latest_compatible = min(latest_version, |
497 | bch2_latest_compatible_version(c->sb.version)); |
498 | unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; |
499 | unsigned new_version = 0; |
500 | |
501 | if (old_version < bcachefs_metadata_required_upgrade_below) { |
502 | if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || |
503 | latest_compatible < bcachefs_metadata_required_upgrade_below) |
504 | new_version = latest_version; |
505 | else |
506 | new_version = latest_compatible; |
507 | } else { |
508 | switch (c->opts.version_upgrade) { |
509 | case BCH_VERSION_UPGRADE_compatible: |
510 | new_version = latest_compatible; |
511 | break; |
512 | case BCH_VERSION_UPGRADE_incompatible: |
513 | new_version = latest_version; |
514 | break; |
515 | case BCH_VERSION_UPGRADE_none: |
516 | new_version = min(old_version, latest_version); |
517 | break; |
518 | } |
519 | } |
520 | |
521 | if (new_version > old_version) { |
522 | struct printbuf buf = PRINTBUF; |
523 | |
524 | if (old_version < bcachefs_metadata_required_upgrade_below) |
525 | prt_str(out: &buf, str: "Version upgrade required:\n" ); |
526 | |
527 | if (old_version != c->sb.version) { |
528 | prt_str(out: &buf, str: "Version upgrade from " ); |
529 | bch2_version_to_text(&buf, c->sb.version_upgrade_complete); |
530 | prt_str(out: &buf, str: " to " ); |
531 | bch2_version_to_text(&buf, c->sb.version); |
532 | prt_str(out: &buf, str: " incomplete\n" ); |
533 | } |
534 | |
535 | prt_printf(&buf, "Doing %s version upgrade from " , |
536 | BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) |
537 | ? "incompatible" : "compatible" ); |
538 | bch2_version_to_text(&buf, old_version); |
539 | prt_str(out: &buf, str: " to " ); |
540 | bch2_version_to_text(&buf, new_version); |
541 | prt_newline(&buf); |
542 | |
543 | struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
544 | __le64 passes = ext->recovery_passes_required[0]; |
545 | bch2_sb_set_upgrade(c, old_version, new_version); |
546 | passes = ext->recovery_passes_required[0] & ~passes; |
547 | |
548 | if (passes) { |
549 | prt_str(out: &buf, str: " running recovery passes: " ); |
550 | prt_bitflags(&buf, bch2_recovery_passes, |
551 | bch2_recovery_passes_from_stable(le64_to_cpu(passes))); |
552 | } |
553 | |
554 | bch_info(c, "%s" , buf.buf); |
555 | |
556 | bch2_sb_upgrade(c, new_version); |
557 | |
558 | printbuf_exit(&buf); |
559 | return true; |
560 | } |
561 | |
562 | return false; |
563 | } |
564 | |
565 | int bch2_fs_recovery(struct bch_fs *c) |
566 | { |
567 | struct bch_sb_field_clean *clean = NULL; |
568 | struct jset *last_journal_entry = NULL; |
569 | u64 last_seq = 0, blacklist_seq, journal_seq; |
570 | int ret = 0; |
571 | |
572 | if (c->sb.clean) { |
573 | clean = bch2_read_superblock_clean(c); |
574 | ret = PTR_ERR_OR_ZERO(ptr: clean); |
575 | if (ret) |
576 | goto err; |
577 | |
578 | bch_info(c, "recovering from clean shutdown, journal seq %llu" , |
579 | le64_to_cpu(clean->journal_seq)); |
580 | } else { |
581 | bch_info(c, "recovering from unclean shutdown" ); |
582 | } |
583 | |
584 | if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { |
585 | bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported" ); |
586 | ret = -EINVAL; |
587 | goto err; |
588 | } |
589 | |
590 | if (!c->sb.clean && |
591 | !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { |
592 | bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix" ); |
593 | ret = -EINVAL; |
594 | goto err; |
595 | } |
596 | |
597 | if (c->opts.norecovery) |
598 | c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; |
599 | |
600 | if (!c->opts.nochanges) { |
601 | mutex_lock(&c->sb_lock); |
602 | struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
603 | bool write_sb = false; |
604 | |
605 | if (BCH_SB_HAS_TOPOLOGY_ERRORS(k: c->disk_sb.sb)) { |
606 | ext->recovery_passes_required[0] |= |
607 | cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology))); |
608 | write_sb = true; |
609 | } |
610 | |
611 | u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); |
612 | if (sb_passes) { |
613 | struct printbuf buf = PRINTBUF; |
614 | prt_str(out: &buf, str: "superblock requires following recovery passes to be run:\n " ); |
615 | prt_bitflags(&buf, bch2_recovery_passes, sb_passes); |
616 | bch_info(c, "%s" , buf.buf); |
617 | printbuf_exit(&buf); |
618 | } |
619 | |
620 | if (bch2_check_version_downgrade(c)) { |
621 | struct printbuf buf = PRINTBUF; |
622 | |
623 | prt_str(out: &buf, str: "Version downgrade required:" ); |
624 | |
625 | __le64 passes = ext->recovery_passes_required[0]; |
626 | bch2_sb_set_downgrade(c, |
627 | BCH_VERSION_MINOR(bcachefs_metadata_version_current), |
628 | BCH_VERSION_MINOR(c->sb.version)); |
629 | passes = ext->recovery_passes_required[0] & ~passes; |
630 | if (passes) { |
631 | prt_str(out: &buf, str: "\n running recovery passes: " ); |
632 | prt_bitflags(&buf, bch2_recovery_passes, |
633 | bch2_recovery_passes_from_stable(le64_to_cpu(passes))); |
634 | } |
635 | |
636 | bch_info(c, "%s" , buf.buf); |
637 | printbuf_exit(&buf); |
638 | write_sb = true; |
639 | } |
640 | |
641 | if (check_version_upgrade(c)) |
642 | write_sb = true; |
643 | |
644 | if (write_sb) |
645 | bch2_write_super(c); |
646 | |
647 | c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); |
648 | mutex_unlock(lock: &c->sb_lock); |
649 | } |
650 | |
651 | if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) |
652 | c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); |
653 | |
654 | if (c->opts.fsck) |
655 | set_bit(nr: BCH_FS_fsck_running, addr: &c->flags); |
656 | |
657 | ret = bch2_blacklist_table_initialize(c); |
658 | if (ret) { |
659 | bch_err(c, "error initializing blacklist table" ); |
660 | goto err; |
661 | } |
662 | |
663 | if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) { |
664 | struct genradix_iter iter; |
665 | struct journal_replay **i; |
666 | |
667 | bch_verbose(c, "starting journal read" ); |
668 | ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); |
669 | if (ret) |
670 | goto err; |
671 | |
672 | /* |
673 | * note: cmd_list_journal needs the blacklist table fully up to date so |
674 | * it can asterisk ignored journal entries: |
675 | */ |
676 | if (c->opts.read_journal_only) |
677 | goto out; |
678 | |
679 | genradix_for_each_reverse(&c->journal_entries, iter, i) |
680 | if (!journal_replay_ignore(i: *i)) { |
681 | last_journal_entry = &(*i)->j; |
682 | break; |
683 | } |
684 | |
685 | if (mustfix_fsck_err_on(c->sb.clean && |
686 | last_journal_entry && |
687 | !journal_entry_empty(last_journal_entry), c, |
688 | clean_but_journal_not_empty, |
689 | "filesystem marked clean but journal not empty" )) { |
690 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
691 | SET_BCH_SB_CLEAN(k: c->disk_sb.sb, v: false); |
692 | c->sb.clean = false; |
693 | } |
694 | |
695 | if (!last_journal_entry) { |
696 | fsck_err_on(!c->sb.clean, c, |
697 | dirty_but_no_journal_entries, |
698 | "no journal entries found" ); |
699 | if (clean) |
700 | goto use_clean; |
701 | |
702 | genradix_for_each_reverse(&c->journal_entries, iter, i) |
703 | if (*i) { |
704 | last_journal_entry = &(*i)->j; |
705 | (*i)->ignore_blacklisted = false; |
706 | (*i)->ignore_not_dirty= false; |
707 | /* |
708 | * This was probably a NO_FLUSH entry, |
709 | * so last_seq was garbage - but we know |
710 | * we're only using a single journal |
711 | * entry, set it here: |
712 | */ |
713 | (*i)->j.last_seq = (*i)->j.seq; |
714 | break; |
715 | } |
716 | } |
717 | |
718 | ret = bch2_journal_keys_sort(c); |
719 | if (ret) |
720 | goto err; |
721 | |
722 | if (c->sb.clean && last_journal_entry) { |
723 | ret = bch2_verify_superblock_clean(c, &clean, |
724 | last_journal_entry); |
725 | if (ret) |
726 | goto err; |
727 | } |
728 | } else { |
729 | use_clean: |
730 | if (!clean) { |
731 | bch_err(c, "no superblock clean section found" ); |
732 | ret = -BCH_ERR_fsck_repair_impossible; |
733 | goto err; |
734 | |
735 | } |
736 | blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; |
737 | } |
738 | |
739 | c->journal_replay_seq_start = last_seq; |
740 | c->journal_replay_seq_end = blacklist_seq - 1; |
741 | |
742 | if (c->opts.reconstruct_alloc) |
743 | bch2_reconstruct_alloc(c); |
744 | |
745 | zero_out_btree_mem_ptr(keys: &c->journal_keys); |
746 | |
747 | ret = journal_replay_early(c, clean); |
748 | if (ret) |
749 | goto err; |
750 | |
751 | /* |
752 | * After an unclean shutdown, skip then next few journal sequence |
753 | * numbers as they may have been referenced by btree writes that |
754 | * happened before their corresponding journal writes - those btree |
755 | * writes need to be ignored, by skipping and blacklisting the next few |
756 | * journal sequence numbers: |
757 | */ |
758 | if (!c->sb.clean) |
759 | journal_seq += 8; |
760 | |
761 | if (blacklist_seq != journal_seq) { |
762 | ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu" , |
763 | blacklist_seq, journal_seq) ?: |
764 | bch2_journal_seq_blacklist_add(c, |
765 | blacklist_seq, journal_seq); |
766 | if (ret) { |
767 | bch_err_msg(c, ret, "error creating new journal seq blacklist entry" ); |
768 | goto err; |
769 | } |
770 | } |
771 | |
772 | ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu" , |
773 | journal_seq, last_seq, blacklist_seq - 1) ?: |
774 | bch2_fs_journal_start(&c->journal, journal_seq); |
775 | if (ret) |
776 | goto err; |
777 | |
778 | /* |
779 | * Skip past versions that might have possibly been used (as nonces), |
780 | * but hadn't had their pointers written: |
781 | */ |
782 | if (c->sb.encryption_type && !c->sb.clean) |
783 | atomic64_add(i: 1 << 16, v: &c->key_version); |
784 | |
785 | ret = read_btree_roots(c); |
786 | if (ret) |
787 | goto err; |
788 | |
789 | ret = bch2_run_recovery_passes(c); |
790 | if (ret) |
791 | goto err; |
792 | |
793 | clear_bit(nr: BCH_FS_fsck_running, addr: &c->flags); |
794 | |
795 | /* fsync if we fixed errors */ |
796 | if (test_bit(BCH_FS_errors_fixed, &c->flags)) { |
797 | bch2_journal_flush_all_pins(j: &c->journal); |
798 | bch2_journal_meta(&c->journal); |
799 | } |
800 | |
801 | /* If we fixed errors, verify that fs is actually clean now: */ |
802 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && |
803 | test_bit(BCH_FS_errors_fixed, &c->flags) && |
804 | !test_bit(BCH_FS_errors_not_fixed, &c->flags) && |
805 | !test_bit(BCH_FS_error, &c->flags)) { |
806 | bch2_flush_fsck_errs(c); |
807 | |
808 | bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean" ); |
809 | clear_bit(nr: BCH_FS_errors_fixed, addr: &c->flags); |
810 | |
811 | c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; |
812 | |
813 | ret = bch2_run_recovery_passes(c); |
814 | if (ret) |
815 | goto err; |
816 | |
817 | if (test_bit(BCH_FS_errors_fixed, &c->flags) || |
818 | test_bit(BCH_FS_errors_not_fixed, &c->flags)) { |
819 | bch_err(c, "Second fsck run was not clean" ); |
820 | set_bit(nr: BCH_FS_errors_not_fixed, addr: &c->flags); |
821 | } |
822 | |
823 | set_bit(nr: BCH_FS_errors_fixed, addr: &c->flags); |
824 | } |
825 | |
826 | if (enabled_qtypes(c)) { |
827 | bch_verbose(c, "reading quotas" ); |
828 | ret = bch2_fs_quota_read(c); |
829 | if (ret) |
830 | goto err; |
831 | bch_verbose(c, "quotas done" ); |
832 | } |
833 | |
834 | mutex_lock(&c->sb_lock); |
835 | struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
836 | bool write_sb = false; |
837 | |
838 | if (BCH_SB_VERSION_UPGRADE_COMPLETE(k: c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) { |
839 | SET_BCH_SB_VERSION_UPGRADE_COMPLETE(k: c->disk_sb.sb, le16_to_cpu(c->disk_sb.sb->version)); |
840 | write_sb = true; |
841 | } |
842 | |
843 | if (!test_bit(BCH_FS_error, &c->flags) && |
844 | !(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) { |
845 | c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); |
846 | write_sb = true; |
847 | } |
848 | |
849 | if (!test_bit(BCH_FS_error, &c->flags) && |
850 | !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) { |
851 | memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); |
852 | write_sb = true; |
853 | } |
854 | |
855 | if (c->opts.fsck && |
856 | !test_bit(BCH_FS_error, &c->flags) && |
857 | c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && |
858 | ext->btrees_lost_data) { |
859 | ext->btrees_lost_data = 0; |
860 | write_sb = true; |
861 | } |
862 | |
863 | if (c->opts.fsck && |
864 | !test_bit(BCH_FS_error, &c->flags) && |
865 | !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { |
866 | SET_BCH_SB_HAS_ERRORS(k: c->disk_sb.sb, v: 0); |
867 | SET_BCH_SB_HAS_TOPOLOGY_ERRORS(k: c->disk_sb.sb, v: 0); |
868 | write_sb = true; |
869 | } |
870 | |
871 | if (write_sb) |
872 | bch2_write_super(c); |
873 | mutex_unlock(lock: &c->sb_lock); |
874 | |
875 | if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || |
876 | c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { |
877 | struct bch_move_stats stats; |
878 | |
879 | bch2_move_stats_init(&stats, "recovery" ); |
880 | |
881 | struct printbuf buf = PRINTBUF; |
882 | bch2_version_to_text(&buf, c->sb.version_min); |
883 | bch_info(c, "scanning for old btree nodes: min_version %s" , buf.buf); |
884 | printbuf_exit(&buf); |
885 | |
886 | ret = bch2_fs_read_write_early(c) ?: |
887 | bch2_scan_old_btree_nodes(c, &stats); |
888 | if (ret) |
889 | goto err; |
890 | bch_info(c, "scanning for old btree nodes done" ); |
891 | } |
892 | |
893 | if (c->journal_seq_blacklist_table && |
894 | c->journal_seq_blacklist_table->nr > 128) |
895 | queue_work(wq: system_long_wq, work: &c->journal_seq_blacklist_gc_work); |
896 | |
897 | ret = 0; |
898 | out: |
899 | bch2_flush_fsck_errs(c); |
900 | |
901 | if (!c->opts.retain_recovery_info) { |
902 | bch2_journal_keys_put_initial(c); |
903 | bch2_find_btree_nodes_exit(&c->found_btree_nodes); |
904 | } |
905 | kfree(objp: clean); |
906 | |
907 | if (!ret && |
908 | test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && |
909 | !c->opts.nochanges) { |
910 | bch2_fs_read_write_early(c); |
911 | bch2_delete_dead_snapshots_async(c); |
912 | } |
913 | |
914 | bch_err_fn(c, ret); |
915 | return ret; |
916 | err: |
917 | fsck_err: |
918 | bch2_fs_emergency_read_only(c); |
919 | goto out; |
920 | } |
921 | |
922 | int bch2_fs_initialize(struct bch_fs *c) |
923 | { |
924 | struct bch_inode_unpacked root_inode, lostfound_inode; |
925 | struct bkey_inode_buf packed_inode; |
926 | struct qstr lostfound = QSTR("lost+found" ); |
927 | int ret; |
928 | |
929 | bch_notice(c, "initializing new filesystem" ); |
930 | set_bit(nr: BCH_FS_new_fs, addr: &c->flags); |
931 | |
932 | mutex_lock(&c->sb_lock); |
933 | c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); |
934 | c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); |
935 | |
936 | bch2_check_version_downgrade(c); |
937 | |
938 | if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { |
939 | bch2_sb_upgrade(c, bcachefs_metadata_version_current); |
940 | SET_BCH_SB_VERSION_UPGRADE_COMPLETE(k: c->disk_sb.sb, bcachefs_metadata_version_current); |
941 | bch2_write_super(c); |
942 | } |
943 | mutex_unlock(lock: &c->sb_lock); |
944 | |
945 | c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; |
946 | set_bit(nr: BCH_FS_may_go_rw, addr: &c->flags); |
947 | |
948 | for (unsigned i = 0; i < BTREE_ID_NR; i++) |
949 | bch2_btree_root_alloc_fake(c, i, 0); |
950 | |
951 | for_each_member_device(c, ca) |
952 | bch2_dev_usage_init(ca); |
953 | |
954 | ret = bch2_fs_journal_alloc(c); |
955 | if (ret) |
956 | goto err; |
957 | |
958 | /* |
959 | * journal_res_get() will crash if called before this has |
960 | * set up the journal.pin FIFO and journal.cur pointer: |
961 | */ |
962 | bch2_fs_journal_start(&c->journal, 1); |
963 | bch2_journal_set_replay_done(j: &c->journal); |
964 | |
965 | ret = bch2_fs_read_write_early(c); |
966 | if (ret) |
967 | goto err; |
968 | |
969 | /* |
970 | * Write out the superblock and journal buckets, now that we can do |
971 | * btree updates |
972 | */ |
973 | bch_verbose(c, "marking superblocks" ); |
974 | ret = bch2_trans_mark_dev_sbs(c); |
975 | bch_err_msg(c, ret, "marking superblocks" ); |
976 | if (ret) |
977 | goto err; |
978 | |
979 | for_each_online_member(c, ca) |
980 | ca->new_fs_bucket_idx = 0; |
981 | |
982 | ret = bch2_fs_freespace_init(c); |
983 | if (ret) |
984 | goto err; |
985 | |
986 | ret = bch2_initialize_subvolumes(c); |
987 | if (ret) |
988 | goto err; |
989 | |
990 | bch_verbose(c, "reading snapshots table" ); |
991 | ret = bch2_snapshots_read(c); |
992 | if (ret) |
993 | goto err; |
994 | bch_verbose(c, "reading snapshots done" ); |
995 | |
996 | bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); |
997 | root_inode.bi_inum = BCACHEFS_ROOT_INO; |
998 | root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; |
999 | bch2_inode_pack(&packed_inode, &root_inode); |
1000 | packed_inode.inode.k.p.snapshot = U32_MAX; |
1001 | |
1002 | ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, flags: 0); |
1003 | bch_err_msg(c, ret, "creating root directory" ); |
1004 | if (ret) |
1005 | goto err; |
1006 | |
1007 | bch2_inode_init_early(c, &lostfound_inode); |
1008 | |
1009 | ret = bch2_trans_do(c, NULL, NULL, 0, |
1010 | bch2_create_trans(trans, |
1011 | BCACHEFS_ROOT_SUBVOL_INUM, |
1012 | &root_inode, &lostfound_inode, |
1013 | &lostfound, |
1014 | 0, 0, S_IFDIR|0700, 0, |
1015 | NULL, NULL, (subvol_inum) { 0 }, 0)); |
1016 | bch_err_msg(c, ret, "creating lost+found" ); |
1017 | if (ret) |
1018 | goto err; |
1019 | |
1020 | c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; |
1021 | |
1022 | if (enabled_qtypes(c)) { |
1023 | ret = bch2_fs_quota_read(c); |
1024 | if (ret) |
1025 | goto err; |
1026 | } |
1027 | |
1028 | ret = bch2_journal_flush(&c->journal); |
1029 | bch_err_msg(c, ret, "writing first journal entry" ); |
1030 | if (ret) |
1031 | goto err; |
1032 | |
1033 | mutex_lock(&c->sb_lock); |
1034 | SET_BCH_SB_INITIALIZED(k: c->disk_sb.sb, v: true); |
1035 | SET_BCH_SB_CLEAN(k: c->disk_sb.sb, v: false); |
1036 | |
1037 | bch2_write_super(c); |
1038 | mutex_unlock(lock: &c->sb_lock); |
1039 | |
1040 | return 0; |
1041 | err: |
1042 | bch_err_fn(c, ret); |
1043 | return ret; |
1044 | } |
1045 | |