1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "btree_update_interior.h" |
5 | #include "buckets.h" |
6 | #include "error.h" |
7 | #include "journal_io.h" |
8 | #include "replicas.h" |
9 | #include "sb-clean.h" |
10 | #include "super-io.h" |
11 | |
12 | /* |
13 | * BCH_SB_FIELD_clean: |
14 | * |
15 | * Btree roots, and a few other things, are recovered from the journal after an |
16 | * unclean shutdown - but after a clean shutdown, to avoid having to read the |
17 | * journal, we can store them in the superblock. |
18 | * |
19 | * bch_sb_field_clean simply contains a list of journal entries, stored exactly |
20 | * as they would be in the journal: |
21 | */ |
22 | |
23 | int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, |
24 | int write) |
25 | { |
26 | struct jset_entry *entry; |
27 | int ret; |
28 | |
29 | for (entry = clean->start; |
30 | entry < (struct jset_entry *) vstruct_end(&clean->field); |
31 | entry = vstruct_next(entry)) { |
32 | if (vstruct_end(entry) > vstruct_end(&clean->field)) { |
33 | bch_err(c, "journal entry (u64s %u) overran end of superblock clean section (u64s %u) by %zu" , |
34 | le16_to_cpu(entry->u64s), le32_to_cpu(clean->field.u64s), |
35 | (u64 *) vstruct_end(entry) - (u64 *) vstruct_end(&clean->field)); |
36 | bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_entry_overrun); |
37 | return -BCH_ERR_fsck_repair_unimplemented; |
38 | } |
39 | |
40 | ret = bch2_journal_entry_validate(c, NULL, entry, |
41 | le16_to_cpu(c->disk_sb.sb->version), |
42 | BCH_SB_BIG_ENDIAN(k: c->disk_sb.sb), |
43 | write); |
44 | if (ret) |
45 | return ret; |
46 | } |
47 | |
48 | return 0; |
49 | } |
50 | |
51 | static struct bkey_i *btree_root_find(struct bch_fs *c, |
52 | struct bch_sb_field_clean *clean, |
53 | struct jset *j, |
54 | enum btree_id id, unsigned *level) |
55 | { |
56 | struct bkey_i *k; |
57 | struct jset_entry *entry, *start, *end; |
58 | |
59 | if (clean) { |
60 | start = clean->start; |
61 | end = vstruct_end(&clean->field); |
62 | } else { |
63 | start = j->start; |
64 | end = vstruct_last(j); |
65 | } |
66 | |
67 | for (entry = start; entry < end; entry = vstruct_next(entry)) |
68 | if (entry->type == BCH_JSET_ENTRY_btree_root && |
69 | entry->btree_id == id) |
70 | goto found; |
71 | |
72 | return NULL; |
73 | found: |
74 | if (!entry->u64s) |
75 | return ERR_PTR(error: -EINVAL); |
76 | |
77 | k = entry->start; |
78 | *level = entry->level; |
79 | return k; |
80 | } |
81 | |
82 | int bch2_verify_superblock_clean(struct bch_fs *c, |
83 | struct bch_sb_field_clean **cleanp, |
84 | struct jset *j) |
85 | { |
86 | unsigned i; |
87 | struct bch_sb_field_clean *clean = *cleanp; |
88 | struct printbuf buf1 = PRINTBUF; |
89 | struct printbuf buf2 = PRINTBUF; |
90 | int ret = 0; |
91 | |
92 | if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, |
93 | sb_clean_journal_seq_mismatch, |
94 | "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown" , |
95 | le64_to_cpu(clean->journal_seq), |
96 | le64_to_cpu(j->seq))) { |
97 | kfree(objp: clean); |
98 | *cleanp = NULL; |
99 | return 0; |
100 | } |
101 | |
102 | for (i = 0; i < BTREE_ID_NR; i++) { |
103 | struct bkey_i *k1, *k2; |
104 | unsigned l1 = 0, l2 = 0; |
105 | |
106 | k1 = btree_root_find(c, clean, NULL, id: i, level: &l1); |
107 | k2 = btree_root_find(c, NULL, j, id: i, level: &l2); |
108 | |
109 | if (!k1 && !k2) |
110 | continue; |
111 | |
112 | printbuf_reset(buf: &buf1); |
113 | printbuf_reset(buf: &buf2); |
114 | |
115 | if (k1) |
116 | bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k: k1)); |
117 | else |
118 | prt_printf(&buf1, "(none)" ); |
119 | |
120 | if (k2) |
121 | bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k: k2)); |
122 | else |
123 | prt_printf(&buf2, "(none)" ); |
124 | |
125 | mustfix_fsck_err_on(!k1 || !k2 || |
126 | IS_ERR(k1) || |
127 | IS_ERR(k2) || |
128 | k1->k.u64s != k2->k.u64s || |
129 | memcmp(k1, k2, bkey_bytes(&k1->k)) || |
130 | l1 != l2, c, |
131 | sb_clean_btree_root_mismatch, |
132 | "superblock btree root %u doesn't match journal after clean shutdown\n" |
133 | "sb: l=%u %s\n" |
134 | "journal: l=%u %s\n" , i, |
135 | l1, buf1.buf, |
136 | l2, buf2.buf); |
137 | } |
138 | fsck_err: |
139 | printbuf_exit(&buf2); |
140 | printbuf_exit(&buf1); |
141 | return ret; |
142 | } |
143 | |
144 | struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) |
145 | { |
146 | struct bch_sb_field_clean *clean, *sb_clean; |
147 | int ret; |
148 | |
149 | mutex_lock(&c->sb_lock); |
150 | sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); |
151 | |
152 | if (fsck_err_on(!sb_clean, c, |
153 | sb_clean_missing, |
154 | "superblock marked clean but clean section not present" )) { |
155 | SET_BCH_SB_CLEAN(k: c->disk_sb.sb, v: false); |
156 | c->sb.clean = false; |
157 | mutex_unlock(lock: &c->sb_lock); |
158 | return NULL; |
159 | } |
160 | |
161 | clean = kmemdup(p: sb_clean, vstruct_bytes(&sb_clean->field), |
162 | GFP_KERNEL); |
163 | if (!clean) { |
164 | mutex_unlock(lock: &c->sb_lock); |
165 | return ERR_PTR(error: -BCH_ERR_ENOMEM_read_superblock_clean); |
166 | } |
167 | |
168 | ret = bch2_sb_clean_validate_late(c, clean, READ); |
169 | if (ret) { |
170 | mutex_unlock(lock: &c->sb_lock); |
171 | return ERR_PTR(error: ret); |
172 | } |
173 | |
174 | mutex_unlock(lock: &c->sb_lock); |
175 | |
176 | return clean; |
177 | fsck_err: |
178 | mutex_unlock(lock: &c->sb_lock); |
179 | return ERR_PTR(error: ret); |
180 | } |
181 | |
182 | void bch2_journal_super_entries_add_common(struct bch_fs *c, |
183 | struct jset_entry **end, |
184 | u64 journal_seq) |
185 | { |
186 | percpu_down_read(sem: &c->mark_lock); |
187 | |
188 | if (!journal_seq) { |
189 | for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++) |
190 | bch2_fs_usage_acc_to_base(c, i); |
191 | } else { |
192 | bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); |
193 | } |
194 | |
195 | { |
196 | struct jset_entry_usage *u = |
197 | container_of(jset_entry_init(end, sizeof(*u)), |
198 | struct jset_entry_usage, entry); |
199 | |
200 | u->entry.type = BCH_JSET_ENTRY_usage; |
201 | u->entry.btree_id = BCH_FS_USAGE_inodes; |
202 | u->v = cpu_to_le64(c->usage_base->b.nr_inodes); |
203 | } |
204 | |
205 | { |
206 | struct jset_entry_usage *u = |
207 | container_of(jset_entry_init(end, sizeof(*u)), |
208 | struct jset_entry_usage, entry); |
209 | |
210 | u->entry.type = BCH_JSET_ENTRY_usage; |
211 | u->entry.btree_id = BCH_FS_USAGE_key_version; |
212 | u->v = cpu_to_le64(atomic64_read(&c->key_version)); |
213 | } |
214 | |
215 | for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { |
216 | struct jset_entry_usage *u = |
217 | container_of(jset_entry_init(end, sizeof(*u)), |
218 | struct jset_entry_usage, entry); |
219 | |
220 | u->entry.type = BCH_JSET_ENTRY_usage; |
221 | u->entry.btree_id = BCH_FS_USAGE_reserved; |
222 | u->entry.level = i; |
223 | u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); |
224 | } |
225 | |
226 | for (unsigned i = 0; i < c->replicas.nr; i++) { |
227 | struct bch_replicas_entry_v1 *e = |
228 | cpu_replicas_entry(r: &c->replicas, i); |
229 | struct jset_entry_data_usage *u = |
230 | container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), |
231 | struct jset_entry_data_usage, entry); |
232 | |
233 | u->entry.type = BCH_JSET_ENTRY_data_usage; |
234 | u->v = cpu_to_le64(c->usage_base->replicas[i]); |
235 | unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), |
236 | "embedded variable length struct" ); |
237 | } |
238 | |
239 | for_each_member_device(c, ca) { |
240 | unsigned b = sizeof(struct jset_entry_dev_usage) + |
241 | sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; |
242 | struct jset_entry_dev_usage *u = |
243 | container_of(jset_entry_init(end, b), |
244 | struct jset_entry_dev_usage, entry); |
245 | |
246 | u->entry.type = BCH_JSET_ENTRY_dev_usage; |
247 | u->dev = cpu_to_le32(ca->dev_idx); |
248 | |
249 | for (unsigned i = 0; i < BCH_DATA_NR; i++) { |
250 | u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); |
251 | u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); |
252 | u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); |
253 | } |
254 | } |
255 | |
256 | percpu_up_read(sem: &c->mark_lock); |
257 | |
258 | for (unsigned i = 0; i < 2; i++) { |
259 | struct jset_entry_clock *clock = |
260 | container_of(jset_entry_init(end, sizeof(*clock)), |
261 | struct jset_entry_clock, entry); |
262 | |
263 | clock->entry.type = BCH_JSET_ENTRY_clock; |
264 | clock->rw = i; |
265 | clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); |
266 | } |
267 | } |
268 | |
269 | static int bch2_sb_clean_validate(struct bch_sb *sb, |
270 | struct bch_sb_field *f, |
271 | struct printbuf *err) |
272 | { |
273 | struct bch_sb_field_clean *clean = field_to_type(f, clean); |
274 | |
275 | if (vstruct_bytes(&clean->field) < sizeof(*clean)) { |
276 | prt_printf(err, "wrong size (got %zu should be %zu)" , |
277 | vstruct_bytes(&clean->field), sizeof(*clean)); |
278 | return -BCH_ERR_invalid_sb_clean; |
279 | } |
280 | |
281 | return 0; |
282 | } |
283 | |
284 | static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, |
285 | struct bch_sb_field *f) |
286 | { |
287 | struct bch_sb_field_clean *clean = field_to_type(f, clean); |
288 | struct jset_entry *entry; |
289 | |
290 | prt_printf(out, "flags: %x" , le32_to_cpu(clean->flags)); |
291 | prt_newline(out); |
292 | prt_printf(out, "journal_seq: %llu" , le64_to_cpu(clean->journal_seq)); |
293 | prt_newline(out); |
294 | |
295 | for (entry = clean->start; |
296 | entry != vstruct_end(&clean->field); |
297 | entry = vstruct_next(entry)) { |
298 | if (entry->type == BCH_JSET_ENTRY_btree_keys && |
299 | !entry->u64s) |
300 | continue; |
301 | |
302 | bch2_journal_entry_to_text(out, NULL, entry); |
303 | prt_newline(out); |
304 | } |
305 | } |
306 | |
/* Ops table hooked into the generic superblock field machinery: */
const struct bch_sb_field_ops bch_sb_field_ops_clean = {
	.validate = bch2_sb_clean_validate,
	.to_text = bch2_sb_clean_to_text,
};
311 | |
312 | int bch2_fs_mark_dirty(struct bch_fs *c) |
313 | { |
314 | int ret; |
315 | |
316 | /* |
317 | * Unconditionally write superblock, to verify it hasn't changed before |
318 | * we go rw: |
319 | */ |
320 | |
321 | mutex_lock(&c->sb_lock); |
322 | SET_BCH_SB_CLEAN(k: c->disk_sb.sb, v: false); |
323 | c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); |
324 | |
325 | ret = bch2_write_super(c); |
326 | mutex_unlock(lock: &c->sb_lock); |
327 | |
328 | return ret; |
329 | } |
330 | |
331 | void bch2_fs_mark_clean(struct bch_fs *c) |
332 | { |
333 | struct bch_sb_field_clean *sb_clean; |
334 | struct jset_entry *entry; |
335 | unsigned u64s; |
336 | int ret; |
337 | |
338 | mutex_lock(&c->sb_lock); |
339 | if (BCH_SB_CLEAN(k: c->disk_sb.sb)) |
340 | goto out; |
341 | |
342 | SET_BCH_SB_CLEAN(k: c->disk_sb.sb, v: true); |
343 | |
344 | c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); |
345 | c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); |
346 | c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); |
347 | c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); |
348 | |
349 | u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; |
350 | |
351 | sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); |
352 | if (!sb_clean) { |
353 | bch_err(c, "error resizing superblock while setting filesystem clean" ); |
354 | goto out; |
355 | } |
356 | |
357 | sb_clean->flags = 0; |
358 | sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); |
359 | |
360 | /* Trying to catch outstanding bug: */ |
361 | BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); |
362 | |
363 | entry = sb_clean->start; |
364 | bch2_journal_super_entries_add_common(c, end: &entry, journal_seq: 0); |
365 | entry = bch2_btree_roots_to_journal_entries(c, entry, 0); |
366 | BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); |
367 | |
368 | memset(entry, 0, |
369 | vstruct_end(&sb_clean->field) - (void *) entry); |
370 | |
371 | /* |
372 | * this should be in the write path, and we should be validating every |
373 | * superblock section: |
374 | */ |
375 | ret = bch2_sb_clean_validate_late(c, clean: sb_clean, WRITE); |
376 | if (ret) { |
377 | bch_err(c, "error writing marking filesystem clean: validate error" ); |
378 | goto out; |
379 | } |
380 | |
381 | bch2_write_super(c); |
382 | out: |
383 | mutex_unlock(lock: &c->sb_lock); |
384 | } |
385 | |