// SPDX-License-Identifier: GPL-2.0
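/*
 * fs/bcachefs/btree_node_scan.c
 *
 * Scan devices for btree nodes: every device that may hold btree data is read
 * in btree-node-sized chunks, and anything that looks like a valid, readable
 * btree node is recorded in c->found_btree_nodes.  Repair uses the results
 * (via bch2_get_scanned_nodes()) to reconstruct btrees whose interior nodes
 * have been lost or are unreadable.
 */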

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "error.h"
#include "journal_io.h"
#include "recovery_passes.h"

#include <linux/kthread.h>
#include <linux/sort.h>

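/*
 * Per-device scan context: one worker kthread is started per device, and
 * reports back through the shared struct find_btree_nodes and the closure.
 */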
struct find_btree_nodes_worker {
	struct closure		*cl;
	struct find_btree_nodes	*f;
	struct bch_dev		*ca;
};

static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
	bch2_bpos_to_text(out, n->min_key);
	prt_str(out, "-");
	bch2_bpos_to_text(out, n->max_key);

	if (n->range_updated)
		prt_str(out, " range updated");
	if (n->overwritten)
		prt_str(out, " overwritten");

	for (unsigned i = 0; i < n->nr_ptrs; i++) {
		prt_char(out, ' ');
		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
	}
}

static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);
	darray_for_each(nodes, i) {
		found_btree_node_to_text(out, c, i);
		prt_newline(out);
	}
	printbuf_indent_sub(out, 2);
}

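/*
 * Build a btree_ptr_v2 key from a found node, with one extent pointer per
 * discovered replica; this is the key format the btree code uses for
 * pointers to btree nodes.
 */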
static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
{
	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);

	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
	bp->k.p = f->max_key;
	bp->v.seq = cpu_to_le64(f->cookie);
	bp->v.sectors_written = 0;
	bp->v.flags = 0;
	bp->v.min_key = f->min_key;
	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}

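/*
 * Verify a candidate by reading it through the normal btree node read path:
 * if bch2_btree_node_get_noiter() succeeds, the node is usable.  Unless it's
 * the current root, the node is then evicted from the btree node cache so it
 * will be re-read later, after its range may have been trimmed.
 */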
static bool found_btree_node_is_readable(struct btree_trans *trans,
					 const struct found_btree_node *f)
{
	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;

	found_btree_node_to_key(&k.k, f);

	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
	bool ret = !IS_ERR_OR_NULL(b);
	if (ret)
		six_unlock_read(&b->c.lock);

	/*
	 * We might update this node's range; if that happens, we need the node
	 * to be re-read so the read path can trim keys that are no longer in
	 * this node
	 */
	if (b != btree_node_root(trans->c, b))
		bch2_btree_node_evict(trans, &k.k);
	return ret;
}

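/*
 * Sort order used for merging replicas: nodes with the same btree, level and
 * cookie (the node's keys.seq) are copies of the same node on different
 * devices.
 */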
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       cmp_int(l->level, r->level) ?:
	       cmp_int(l->cookie, r->cookie);
}

/*
 * Given two found btree nodes, if their sequence numbers are equal, take the
 * one that's readable:
 */
static int found_btree_node_cmp_time(const struct found_btree_node *l,
				     const struct found_btree_node *r)
{
	return cmp_int(l->seq, r->seq);
}

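/*
 * Scan order for overwrite handling: group by btree, then descending level,
 * then ascending min_key, with the newest version (highest seq) first.
 */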
static int found_btree_node_cmp_pos(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:
	       bpos_cmp(l->min_key, r->min_key) ?:
	       -found_btree_node_cmp_time(l, r);
}

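/*
 * Read one btree-node-sized chunk at @offset on @ca; if the header looks like
 * a btree node we care about (decrypting it first if necessary) and the node
 * verifies as readable, record it in f->nodes.  Only the first page is read
 * here; verification reads the full node.
 */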
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);

	submit_bio_wait(bio);
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
			       "IO error in try_read_btree_node() at %llu: %s",
			       offset, bch2_blk_status_to_str(bio->bi_status)))
		return;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		struct nonce nonce = btree_nonce(&bn->keys, 0);
		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
	}

	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
		return;

	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
		return;

	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
		mutex_lock(&f->lock);
		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
			bch_err(c, "try_read_btree_node() can't handle endian conversion");
			f->ret = -EINVAL;
			goto unlock;
		}

		if (darray_push(&f->nodes, n))
			f->ret = -ENOMEM;
unlock:
		mutex_unlock(&f->lock);
	}
}

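/*
 * Worker thread: scan a single device, probing every btree_sectors(c)-aligned
 * offset within each bucket (skipping offsets the member btree bitmap says
 * can't contain btree nodes, when available), then drop the io ref and
 * closure ref taken for us by read_btree_nodes().
 */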
static int read_btree_nodes_worker(void *p)
{
	struct find_btree_nodes_worker *w = p;
	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
	struct bch_dev *ca = w->ca;
	void *buf = (void *) __get_free_page(GFP_KERNEL);
	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
	unsigned long last_print = jiffies;

	if (!buf || !bio) {
		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
		w->f->ret = -ENOMEM;
		goto err;
	}

	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
		for (unsigned bucket_offset = 0;
		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
		     bucket_offset += btree_sectors(c)) {
			if (time_after(jiffies, last_print + HZ * 30)) {
				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;

				bch_info(ca, "%s: %2u%% done", __func__,
					 (unsigned) div64_u64(cur_sector * 100, end_sector));
				last_print = jiffies;
			}

			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;

			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
				continue;

			try_read_btree_node(w->f, ca, bio, buf, sector);
		}
err:
	bio_put(bio);
	free_page((unsigned long) buf);
	percpu_ref_put(&ca->io_ref);
	closure_put(w->cl);
	kfree(w);
	return 0;
}

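/*
 * Kick off one scan worker per online device that's allowed to hold btree
 * data, then wait for all of them to finish.
 */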
static int read_btree_nodes(struct find_btree_nodes *f)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	for_each_online_member(c, ca) {
		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
			continue;

		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
		struct task_struct *t;

		if (!w) {
			percpu_ref_put(&ca->io_ref);
			ret = -ENOMEM;
			goto err;
		}

		percpu_ref_get(&ca->io_ref);
		closure_get(&cl);
		w->cl = &cl;
		w->f = f;
		w->ca = ca;

		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
		ret = PTR_ERR_OR_ZERO(t);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			closure_put(&cl);
			f->ret = ret;
			bch_err(c, "error starting kthread: %i", ret);
			break;
		}
	}
err:
	closure_sync(&cl);
	return f->ret ?: ret;
}

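/*
 * Restore sort order after a node's min_key was bumped forward: slide it to
 * the right until the array is ordered by found_btree_node_cmp_pos() again.
 */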
static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
{
	while (n + 1 < end &&
	       found_btree_node_cmp_pos(n, n + 1) > 0) {
		swap(n[0], n[1]);
		n++;
	}
}

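/*
 * Resolve overlaps between @start and the nodes sorted after it: whichever
 * node is older (lower seq) has its range trimmed or is marked overwritten
 * entirely; overlapping leaf nodes with the same seq can't be resolved and
 * fail the scan with -BCH_ERR_fsck_repair_unimplemented.
 */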
static int handle_overwrites(struct bch_fs *c,
			     struct found_btree_node *start,
			     struct found_btree_node *end)
{
	struct found_btree_node *n;
again:
	for (n = start + 1;
	     n < end &&
	     n->btree_id == start->btree_id &&
	     n->level == start->level &&
	     bpos_lt(n->min_key, start->max_key);
	     n++) {
		int cmp = found_btree_node_cmp_time(start, n);

		if (cmp > 0) {
			if (bpos_cmp(start->max_key, n->max_key) >= 0)
				n->overwritten = true;
			else {
				n->range_updated = true;
				n->min_key = bpos_successor(start->max_key);
				bubble_up(n, end);
				goto again;
			}
		} else if (cmp < 0) {
			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);

			start->max_key = bpos_predecessor(n->min_key);
			start->range_updated = true;
		} else if (n->level) {
			n->overwritten = true;
		} else {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
			found_btree_node_to_text(&buf, c, start);
			prt_str(&buf, "\n ");
			found_btree_node_to_text(&buf, c, n);
			bch_err(c, "%s", buf.buf);
			printbuf_exit(&buf);
			return -BCH_ERR_fsck_repair_unimplemented;
		}
	}

	return 0;
}

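/*
 * Main entry point for the scan_for_btree_nodes recovery pass: scan all
 * devices, merge replicas of the same node (matched by cookie), resolve
 * overlapping ranges, and leave the surviving nodes in f->nodes sorted as an
 * eytzinger array for lookups by the helpers below.
 */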
int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;
	struct printbuf buf = PRINTBUF;
	size_t dst;
	int ret = 0;

	if (f->nodes.nr)
		return 0;

	mutex_init(&f->lock);

	ret = read_btree_nodes(f);
	if (ret)
		return ret;

	if (!f->nodes.nr) {
		bch_err(c, "%s: no btree nodes found", __func__);
		ret = -EINVAL;
		goto err;
	}

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);

	dst = 0;
	darray_for_each(f->nodes, i) {
		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;

		if (prev &&
		    prev->cookie == i->cookie) {
			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
				bch_err(c, "%s: found too many replicas for btree node", __func__);
				ret = -EINVAL;
				goto err;
			}
			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
		} else {
			f->nodes.data[dst++] = *i;
		}
	}
	f->nodes.nr = dst;

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	dst = 0;
	darray_for_each(f->nodes, i) {
		if (i->overwritten)
			continue;

		ret = handle_overwrites(c, i, &darray_top(f->nodes));
		if (ret)
			goto err;

		BUG_ON(i->overwritten);
		f->nodes.data[dst++] = *i;
	}
	f->nodes.nr = dst;

	if (c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
	printbuf_exit(&buf);
	return ret;
}

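/*
 * Comparator for range lookups in the eytzinger-sorted nodes array: orders by
 * btree, then descending level, then compares one side's max_key against the
 * other's min_key, so eytzinger0_find_gt() lands on the first node that can
 * overlap the search range.
 */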
static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:
	       bpos_cmp(l->max_key, r->min_key);
}

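/*
 * Iterate over every found node in _search's btree and level whose range
 * overlaps [_search.min_key, _search.max_key].  The nodes array must already
 * be in eytzinger order, which bch2_scan_for_btree_nodes() guarantees.
 */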
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
					      sizeof((_f)->nodes.data[0]),		\
					      found_btree_node_range_start_cmp, &_search);\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == _search.btree_id &&			\
	     (_f)->nodes.data[_idx].level == _search.level &&				\
	     bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);			\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))

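/*
 * Return true if the scan found a newer version (higher seq) of @b covering
 * any part of its range, meaning the on-disk node we read is stale.
 */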
bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;

	struct found_btree_node search = {
		.btree_id	= b->c.btree_id,
		.level		= b->c.level,
		.min_key	= b->data->min_key,
		.max_key	= b->key.k.p,
	};

	for_each_found_btree_node_in_range(f, search, idx)
		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
			return true;
	return false;
}

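/*
 * Return true if the scan found any leaf nodes at all for @btree, e.g. to
 * decide whether reconstructing the btree is worth attempting.
 */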
bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= 0,
		.min_key	= POS_MIN,
		.max_key	= SPOS_MAX,
	};

	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
		return true;
	return false;
}

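/*
 * Insert journal keys for every found node overlapping [node_min, node_max]
 * at @level of @btree, clamping each node's range to the requested interval.
 * The keys go in at level + 1, i.e. as the missing parent's pointers to these
 * nodes, so the btree read path picks them up during recovery.
 */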
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
			   unsigned level, struct bpos node_min, struct bpos node_max)
{
	if (btree_id_is_alloc(btree))
		return 0;

	struct find_btree_nodes *f = &c->found_btree_nodes;

	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
	if (ret)
		return ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
		bch2_bpos_to_text(&buf, node_min);
		prt_str(&buf, " - ");
		bch2_bpos_to_text(&buf, node_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= level,
		.min_key	= node_min,
		.max_key	= node_max,
	};

	for_each_found_btree_node_in_range(f, search, idx) {
		struct found_btree_node n = f->nodes.data[idx];

		n.range_updated |= bpos_lt(n.min_key, node_min);
		n.min_key = bpos_max(n.min_key, node_min);

		n.range_updated |= bpos_gt(n.max_key, node_max);
		n.max_key = bpos_min(n.max_key, node_max);

		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

		found_btree_node_to_key(&tmp.k, &n);

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
		printbuf_exit(&buf);

		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));

		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
		if (ret)
			return ret;
	}

	return 0;
}

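/* Free the scan results. */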
void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
	darray_exit(&f->nodes);
}

