1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "btree_cache.h" |
5 | #include "btree_io.h" |
6 | #include "btree_journal_iter.h" |
7 | #include "btree_node_scan.h" |
8 | #include "btree_update_interior.h" |
9 | #include "buckets.h" |
10 | #include "error.h" |
11 | #include "journal_io.h" |
12 | #include "recovery_passes.h" |
13 | |
14 | #include <linux/kthread.h> |
15 | #include <linux/sort.h> |
16 | |
17 | struct find_btree_nodes_worker { |
18 | struct closure *cl; |
19 | struct find_btree_nodes *f; |
20 | struct bch_dev *ca; |
21 | }; |
22 | |
23 | static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) |
24 | { |
25 | prt_printf(out, "%s l=%u seq=%u cookie=%llx " , bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie); |
26 | bch2_bpos_to_text(out, n->min_key); |
27 | prt_str(out, str: "-" ); |
28 | bch2_bpos_to_text(out, n->max_key); |
29 | |
30 | if (n->range_updated) |
31 | prt_str(out, str: " range updated" ); |
32 | if (n->overwritten) |
33 | prt_str(out, str: " overwritten" ); |
34 | |
35 | for (unsigned i = 0; i < n->nr_ptrs; i++) { |
36 | prt_char(out, c: ' '); |
37 | bch2_extent_ptr_to_text(out, c, n->ptrs + i); |
38 | } |
39 | } |
40 | |
41 | static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) |
42 | { |
43 | printbuf_indent_add(out, 2); |
44 | darray_for_each(nodes, i) { |
45 | found_btree_node_to_text(out, c, n: i); |
46 | prt_newline(out); |
47 | } |
48 | printbuf_indent_sub(out, 2); |
49 | } |
50 | |
51 | static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) |
52 | { |
53 | struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k: k); |
54 | |
55 | set_bkey_val_u64s(k: &bp->k, val_u64s: sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); |
56 | bp->k.p = f->max_key; |
57 | bp->v.seq = cpu_to_le64(f->cookie); |
58 | bp->v.sectors_written = 0; |
59 | bp->v.flags = 0; |
60 | bp->v.min_key = f->min_key; |
61 | SET_BTREE_PTR_RANGE_UPDATED(k: &bp->v, v: f->range_updated); |
62 | memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); |
63 | } |
64 | |
65 | static bool found_btree_node_is_readable(struct btree_trans *trans, |
66 | const struct found_btree_node *f) |
67 | { |
68 | struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k; |
69 | |
70 | found_btree_node_to_key(k: &k.k, f); |
71 | |
72 | struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false); |
73 | bool ret = !IS_ERR_OR_NULL(ptr: b); |
74 | if (ret) |
75 | six_unlock_read(lock: &b->c.lock); |
76 | |
77 | /* |
78 | * We might update this node's range; if that happens, we need the node |
79 | * to be re-read so the read path can trim keys that are no longer in |
80 | * this node |
81 | */ |
82 | if (b != btree_node_root(c: trans->c, b)) |
83 | bch2_btree_node_evict(trans, &k.k); |
84 | return ret; |
85 | } |
86 | |
87 | static int found_btree_node_cmp_cookie(const void *_l, const void *_r) |
88 | { |
89 | const struct found_btree_node *l = _l; |
90 | const struct found_btree_node *r = _r; |
91 | |
92 | return cmp_int(l->btree_id, r->btree_id) ?: |
93 | cmp_int(l->level, r->level) ?: |
94 | cmp_int(l->cookie, r->cookie); |
95 | } |
96 | |
97 | /* |
98 | * Given two found btree nodes, if their sequence numbers are equal, take the |
99 | * one that's readable: |
100 | */ |
101 | static int found_btree_node_cmp_time(const struct found_btree_node *l, |
102 | const struct found_btree_node *r) |
103 | { |
104 | return cmp_int(l->seq, r->seq); |
105 | } |
106 | |
107 | static int found_btree_node_cmp_pos(const void *_l, const void *_r) |
108 | { |
109 | const struct found_btree_node *l = _l; |
110 | const struct found_btree_node *r = _r; |
111 | |
112 | return cmp_int(l->btree_id, r->btree_id) ?: |
113 | -cmp_int(l->level, r->level) ?: |
114 | bpos_cmp(l: l->min_key, r: r->min_key) ?: |
115 | -found_btree_node_cmp_time(l, r); |
116 | } |
117 | |
118 | static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, |
119 | struct bio *bio, struct btree_node *bn, u64 offset) |
120 | { |
121 | struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); |
122 | |
123 | bio_reset(bio, bdev: ca->disk_sb.bdev, opf: REQ_OP_READ); |
124 | bio->bi_iter.bi_sector = offset; |
125 | bch2_bio_map(bio, base: bn, PAGE_SIZE); |
126 | |
127 | submit_bio_wait(bio); |
128 | if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, |
129 | "IO error in try_read_btree_node() at %llu: %s" , |
130 | offset, bch2_blk_status_to_str(bio->bi_status))) |
131 | return; |
132 | |
133 | if (le64_to_cpu(bn->magic) != bset_magic(c)) |
134 | return; |
135 | |
136 | if (bch2_csum_type_is_encryption(type: BSET_CSUM_TYPE(k: &bn->keys))) { |
137 | struct nonce nonce = btree_nonce(i: &bn->keys, offset: 0); |
138 | unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; |
139 | |
140 | bch2_encrypt(c, BSET_CSUM_TYPE(k: &bn->keys), nonce, data: &bn->flags, bytes); |
141 | } |
142 | |
143 | if (btree_id_is_alloc(id: BTREE_NODE_ID(n: bn))) |
144 | return; |
145 | |
146 | if (BTREE_NODE_LEVEL(k: bn) >= BTREE_MAX_DEPTH) |
147 | return; |
148 | |
149 | rcu_read_lock(); |
150 | struct found_btree_node n = { |
151 | .btree_id = BTREE_NODE_ID(n: bn), |
152 | .level = BTREE_NODE_LEVEL(k: bn), |
153 | .seq = BTREE_NODE_SEQ(k: bn), |
154 | .cookie = le64_to_cpu(bn->keys.seq), |
155 | .min_key = bn->min_key, |
156 | .max_key = bn->max_key, |
157 | .nr_ptrs = 1, |
158 | .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, |
159 | .ptrs[0].offset = offset, |
160 | .ptrs[0].dev = ca->dev_idx, |
161 | .ptrs[0].gen = *bucket_gen(ca, b: sector_to_bucket(ca, s: offset)), |
162 | }; |
163 | rcu_read_unlock(); |
164 | |
165 | if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { |
166 | mutex_lock(&f->lock); |
167 | if (BSET_BIG_ENDIAN(k: &bn->keys) != CPU_BIG_ENDIAN) { |
168 | bch_err(c, "try_read_btree_node() can't handle endian conversion" ); |
169 | f->ret = -EINVAL; |
170 | goto unlock; |
171 | } |
172 | |
173 | if (darray_push(&f->nodes, n)) |
174 | f->ret = -ENOMEM; |
175 | unlock: |
176 | mutex_unlock(lock: &f->lock); |
177 | } |
178 | } |
179 | |
180 | static int read_btree_nodes_worker(void *p) |
181 | { |
182 | struct find_btree_nodes_worker *w = p; |
183 | struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); |
184 | struct bch_dev *ca = w->ca; |
185 | void *buf = (void *) __get_free_page(GFP_KERNEL); |
186 | struct bio *bio = bio_alloc(NULL, nr_vecs: 1, opf: 0, GFP_KERNEL); |
187 | unsigned long last_print = jiffies; |
188 | |
189 | if (!buf || !bio) { |
190 | bch_err(c, "read_btree_nodes_worker: error allocating bio/buf" ); |
191 | w->f->ret = -ENOMEM; |
192 | goto err; |
193 | } |
194 | |
195 | for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) |
196 | for (unsigned bucket_offset = 0; |
197 | bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; |
198 | bucket_offset += btree_sectors(c)) { |
199 | if (time_after(jiffies, last_print + HZ * 30)) { |
200 | u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; |
201 | u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; |
202 | |
203 | bch_info(ca, "%s: %2u%% done" , __func__, |
204 | (unsigned) div64_u64(cur_sector * 100, end_sector)); |
205 | last_print = jiffies; |
206 | } |
207 | |
208 | u64 sector = bucket * ca->mi.bucket_size + bucket_offset; |
209 | |
210 | if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && |
211 | !bch2_dev_btree_bitmap_marked_sectors(ca, start: sector, sectors: btree_sectors(c))) |
212 | continue; |
213 | |
214 | try_read_btree_node(f: w->f, ca, bio, bn: buf, offset: sector); |
215 | } |
216 | err: |
217 | bio_put(bio); |
218 | free_page((unsigned long) buf); |
219 | percpu_ref_get(ref: &ca->io_ref); |
220 | closure_put(cl: w->cl); |
221 | kfree(objp: w); |
222 | return 0; |
223 | } |
224 | |
225 | static int read_btree_nodes(struct find_btree_nodes *f) |
226 | { |
227 | struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); |
228 | struct closure cl; |
229 | int ret = 0; |
230 | |
231 | closure_init_stack(cl: &cl); |
232 | |
233 | for_each_online_member(c, ca) { |
234 | if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) |
235 | continue; |
236 | |
237 | struct find_btree_nodes_worker *w = kmalloc(size: sizeof(*w), GFP_KERNEL); |
238 | struct task_struct *t; |
239 | |
240 | if (!w) { |
241 | percpu_ref_put(ref: &ca->io_ref); |
242 | ret = -ENOMEM; |
243 | goto err; |
244 | } |
245 | |
246 | percpu_ref_get(ref: &ca->io_ref); |
247 | closure_get(cl: &cl); |
248 | w->cl = &cl; |
249 | w->f = f; |
250 | w->ca = ca; |
251 | |
252 | t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s" , ca->name); |
253 | ret = IS_ERR_OR_NULL(ptr: t); |
254 | if (ret) { |
255 | percpu_ref_put(ref: &ca->io_ref); |
256 | closure_put(cl: &cl); |
257 | f->ret = ret; |
258 | bch_err(c, "error starting kthread: %i" , ret); |
259 | break; |
260 | } |
261 | } |
262 | err: |
263 | closure_sync(cl: &cl); |
264 | return f->ret ?: ret; |
265 | } |
266 | |
267 | static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) |
268 | { |
269 | while (n + 1 < end && |
270 | found_btree_node_cmp_pos(l: n, r: n + 1) > 0) { |
271 | swap(n[0], n[1]); |
272 | n++; |
273 | } |
274 | } |
275 | |
276 | static int handle_overwrites(struct bch_fs *c, |
277 | struct found_btree_node *start, |
278 | struct found_btree_node *end) |
279 | { |
280 | struct found_btree_node *n; |
281 | again: |
282 | for (n = start + 1; |
283 | n < end && |
284 | n->btree_id == start->btree_id && |
285 | n->level == start->level && |
286 | bpos_lt(l: n->min_key, r: start->max_key); |
287 | n++) { |
288 | int cmp = found_btree_node_cmp_time(l: start, r: n); |
289 | |
290 | if (cmp > 0) { |
291 | if (bpos_cmp(l: start->max_key, r: n->max_key) >= 0) |
292 | n->overwritten = true; |
293 | else { |
294 | n->range_updated = true; |
295 | n->min_key = bpos_successor(p: start->max_key); |
296 | n->range_updated = true; |
297 | bubble_up(n, end); |
298 | goto again; |
299 | } |
300 | } else if (cmp < 0) { |
301 | BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); |
302 | |
303 | start->max_key = bpos_predecessor(p: n->min_key); |
304 | start->range_updated = true; |
305 | } else if (n->level) { |
306 | n->overwritten = true; |
307 | } else { |
308 | struct printbuf buf = PRINTBUF; |
309 | |
310 | prt_str(out: &buf, str: "overlapping btree nodes with same seq! halting\n " ); |
311 | found_btree_node_to_text(out: &buf, c, n: start); |
312 | prt_str(out: &buf, str: "\n " ); |
313 | found_btree_node_to_text(out: &buf, c, n); |
314 | bch_err(c, "%s" , buf.buf); |
315 | printbuf_exit(&buf); |
316 | return -BCH_ERR_fsck_repair_unimplemented; |
317 | } |
318 | } |
319 | |
320 | return 0; |
321 | } |
322 | |
323 | int bch2_scan_for_btree_nodes(struct bch_fs *c) |
324 | { |
325 | struct find_btree_nodes *f = &c->found_btree_nodes; |
326 | struct printbuf buf = PRINTBUF; |
327 | size_t dst; |
328 | int ret = 0; |
329 | |
330 | if (f->nodes.nr) |
331 | return 0; |
332 | |
333 | mutex_init(&f->lock); |
334 | |
335 | ret = read_btree_nodes(f); |
336 | if (ret) |
337 | return ret; |
338 | |
339 | if (!f->nodes.nr) { |
340 | bch_err(c, "%s: no btree nodes found" , __func__); |
341 | ret = -EINVAL; |
342 | goto err; |
343 | } |
344 | |
345 | if (0 && c->opts.verbose) { |
346 | printbuf_reset(buf: &buf); |
347 | prt_printf(&buf, "%s: nodes found:\n" , __func__); |
348 | found_btree_nodes_to_text(out: &buf, c, nodes: f->nodes); |
349 | bch2_print_string_as_lines(KERN_INFO, lines: buf.buf); |
350 | } |
351 | |
352 | sort(base: f->nodes.data, num: f->nodes.nr, size: sizeof(f->nodes.data[0]), cmp_func: found_btree_node_cmp_cookie, NULL); |
353 | |
354 | dst = 0; |
355 | darray_for_each(f->nodes, i) { |
356 | struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; |
357 | |
358 | if (prev && |
359 | prev->cookie == i->cookie) { |
360 | if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { |
361 | bch_err(c, "%s: found too many replicas for btree node" , __func__); |
362 | ret = -EINVAL; |
363 | goto err; |
364 | } |
365 | prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; |
366 | } else { |
367 | f->nodes.data[dst++] = *i; |
368 | } |
369 | } |
370 | f->nodes.nr = dst; |
371 | |
372 | sort(base: f->nodes.data, num: f->nodes.nr, size: sizeof(f->nodes.data[0]), cmp_func: found_btree_node_cmp_pos, NULL); |
373 | |
374 | if (0 && c->opts.verbose) { |
375 | printbuf_reset(buf: &buf); |
376 | prt_printf(&buf, "%s: nodes after merging replicas:\n" , __func__); |
377 | found_btree_nodes_to_text(out: &buf, c, nodes: f->nodes); |
378 | bch2_print_string_as_lines(KERN_INFO, lines: buf.buf); |
379 | } |
380 | |
381 | dst = 0; |
382 | darray_for_each(f->nodes, i) { |
383 | if (i->overwritten) |
384 | continue; |
385 | |
386 | ret = handle_overwrites(c, start: i, end: &darray_top(f->nodes)); |
387 | if (ret) |
388 | goto err; |
389 | |
390 | BUG_ON(i->overwritten); |
391 | f->nodes.data[dst++] = *i; |
392 | } |
393 | f->nodes.nr = dst; |
394 | |
395 | if (c->opts.verbose) { |
396 | printbuf_reset(buf: &buf); |
397 | prt_printf(&buf, "%s: nodes found after overwrites:\n" , __func__); |
398 | found_btree_nodes_to_text(out: &buf, c, nodes: f->nodes); |
399 | bch2_print_string_as_lines(KERN_INFO, lines: buf.buf); |
400 | } |
401 | |
402 | eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); |
403 | err: |
404 | printbuf_exit(&buf); |
405 | return ret; |
406 | } |
407 | |
408 | static int found_btree_node_range_start_cmp(const void *_l, const void *_r) |
409 | { |
410 | const struct found_btree_node *l = _l; |
411 | const struct found_btree_node *r = _r; |
412 | |
413 | return cmp_int(l->btree_id, r->btree_id) ?: |
414 | -cmp_int(l->level, r->level) ?: |
415 | bpos_cmp(l: l->max_key, r: r->min_key); |
416 | } |
417 | |
418 | #define for_each_found_btree_node_in_range(_f, _search, _idx) \ |
419 | for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ |
420 | sizeof((_f)->nodes.data[0]), \ |
421 | found_btree_node_range_start_cmp, &search); \ |
422 | _idx < (_f)->nodes.nr && \ |
423 | (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ |
424 | (_f)->nodes.data[_idx].level == _search.level && \ |
425 | bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ |
426 | _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) |
427 | |
428 | bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) |
429 | { |
430 | struct find_btree_nodes *f = &c->found_btree_nodes; |
431 | |
432 | struct found_btree_node search = { |
433 | .btree_id = b->c.btree_id, |
434 | .level = b->c.level, |
435 | .min_key = b->data->min_key, |
436 | .max_key = b->key.k.p, |
437 | }; |
438 | |
439 | for_each_found_btree_node_in_range(f, search, idx) |
440 | if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(k: b->data)) |
441 | return true; |
442 | return false; |
443 | } |
444 | |
445 | bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) |
446 | { |
447 | struct found_btree_node search = { |
448 | .btree_id = btree, |
449 | .level = 0, |
450 | .min_key = POS_MIN, |
451 | .max_key = SPOS_MAX, |
452 | }; |
453 | |
454 | for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) |
455 | return true; |
456 | return false; |
457 | } |
458 | |
459 | int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, |
460 | unsigned level, struct bpos node_min, struct bpos node_max) |
461 | { |
462 | if (btree_id_is_alloc(id: btree)) |
463 | return 0; |
464 | |
465 | struct find_btree_nodes *f = &c->found_btree_nodes; |
466 | |
467 | int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); |
468 | if (ret) |
469 | return ret; |
470 | |
471 | if (c->opts.verbose) { |
472 | struct printbuf buf = PRINTBUF; |
473 | |
474 | prt_printf(&buf, "recovering %s l=%u " , bch2_btree_id_str(btree), level); |
475 | bch2_bpos_to_text(&buf, node_min); |
476 | prt_str(out: &buf, str: " - " ); |
477 | bch2_bpos_to_text(&buf, node_max); |
478 | |
479 | bch_info(c, "%s(): %s" , __func__, buf.buf); |
480 | printbuf_exit(&buf); |
481 | } |
482 | |
483 | struct found_btree_node search = { |
484 | .btree_id = btree, |
485 | .level = level, |
486 | .min_key = node_min, |
487 | .max_key = node_max, |
488 | }; |
489 | |
490 | for_each_found_btree_node_in_range(f, search, idx) { |
491 | struct found_btree_node n = f->nodes.data[idx]; |
492 | |
493 | n.range_updated |= bpos_lt(l: n.min_key, r: node_min); |
494 | n.min_key = bpos_max(l: n.min_key, r: node_min); |
495 | |
496 | n.range_updated |= bpos_gt(l: n.max_key, r: node_max); |
497 | n.max_key = bpos_min(l: n.max_key, r: node_max); |
498 | |
499 | struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; |
500 | |
501 | found_btree_node_to_key(k: &tmp.k, f: &n); |
502 | |
503 | struct printbuf buf = PRINTBUF; |
504 | bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k: &tmp.k)); |
505 | bch_verbose(c, "%s(): recovering %s" , __func__, buf.buf); |
506 | printbuf_exit(&buf); |
507 | |
508 | BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL)); |
509 | |
510 | ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); |
511 | if (ret) |
512 | return ret; |
513 | } |
514 | |
515 | return 0; |
516 | } |
517 | |
518 | void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) |
519 | { |
520 | darray_exit(&f->nodes); |
521 | } |
522 | |