// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "recovery.h"
#include "super-io.h"
#include "trace.h"

#include <linux/sched/mm.h>

void bch2_btree_node_io_unlock(struct btree *b)
{
	EBUG_ON(!btree_node_write_in_flight(b));

	clear_btree_node_write_in_flight_inner(b);
	clear_btree_node_write_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

void bch2_btree_node_io_lock(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
			    TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_read(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void __bch2_btree_node_wait_on_write(struct btree *b)
{
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_read(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

void bch2_btree_node_wait_on_write(struct btree *b)
{
	bch2_assert_btree_nodes_not_locked();

	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
}

static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bkey_packed *k, *p;

	if (start == end)
		return;

	for (p = start, k = bkey_p_next(start);
	     k != end;
	     p = k, k = bkey_p_next(k)) {
		struct bkey l = bkey_unpack_key(b, p);
		struct bkey r = bkey_unpack_key(b, k);

		BUG_ON(bpos_ge(l.p, bkey_start_pos(&r)));
	}
#endif
}

static void set_needs_whiteout(struct bset *i, int v)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
		k->needs_whiteout = v;
}

static void btree_bounce_free(struct bch_fs *c, size_t size,
			      bool used_mempool, void *p)
{
	if (used_mempool)
		mempool_free(p, &c->btree_bounce_pool);
	else
		kvfree(p);
}

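/*
 * Allocate a scratch buffer for sorting: first try a plain kvmalloc() with
 * GFP_NOWAIT so we never block here, and fall back to the preallocated
 * btree_bounce_pool (which is guaranteed to make progress) if that fails.
 * memalloc_nofs_save() guards against recursing into the filesystem from
 * memory reclaim.
 */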
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
				bool *used_mempool)
{
	unsigned flags = memalloc_nofs_save();
	void *p;

	BUG_ON(size > c->opts.btree_node_size);

	*used_mempool = false;
	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
	if (!p) {
		*used_mempool = true;
		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
	}
	memalloc_nofs_restore(flags);
	return p;
}

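/*
 * Sort an array of pointers to packed keys rather than the keys themselves:
 * whiteouts are sorted by pointer so we don't have to shuffle the (variable
 * size) keys around until we know their final order.
 */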
static void sort_bkey_ptrs(const struct btree *bt,
			   struct bkey_packed **ptrs, unsigned nr)
{
	unsigned n = nr, a = nr / 2, b, c, d;

	if (!a)
		return;

	/* Heap sort: see lib/sort.c: */
	while (1) {
		if (a)
			a--;
		else if (--n)
			swap(ptrs[0], ptrs[n]);
		else
			break;

		for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
			b = bch2_bkey_cmp_packed(bt,
						 ptrs[c],
						 ptrs[d]) >= 0 ? c : d;
		if (d == n)
			b = c;

		while (b != a &&
		       bch2_bkey_cmp_packed(bt,
					    ptrs[a],
					    ptrs[b]) >= 0)
			b = (b - 1) / 2;
		c = b;
		while (b != a) {
			b = (b - 1) / 2;
			swap(ptrs[b], ptrs[c]);
		}
	}
}

static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
{
	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
	bool used_mempool = false;
	size_t bytes = b->whiteout_u64s * sizeof(u64);

	if (!b->whiteout_u64s)
		return;

	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);

	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);

	for (k = unwritten_whiteouts_start(b);
	     k != unwritten_whiteouts_end(b);
	     k = bkey_p_next(k))
		*--ptrs = k;

	sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);

	k = new_whiteouts;

	while (ptrs != ptrs_end) {
		bkey_p_copy(k, *ptrs);
		k = bkey_p_next(k);
		ptrs++;
	}

	verify_no_dups(b, new_whiteouts,
		       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));

	memcpy_u64s(unwritten_whiteouts_start(b),
		    new_whiteouts, b->whiteout_u64s);

	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
}

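/*
 * A bset is only worth compacting if it contains dead (deleted) u64s. In
 * COMPACT_LAZY mode we additionally require either that the lazy heuristic
 * says it's worthwhile, or that we're already compacting and this bset hasn't
 * been written out yet (so rewriting it costs nothing extra); COMPACT_ALL
 * compacts every bset that has dead keys.
 */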
static bool should_compact_bset(struct btree *b, struct bset_tree *t,
				bool compacting, enum compact_mode mode)
{
	if (!bset_dead_u64s(b, t))
		return false;

	switch (mode) {
	case COMPACT_LAZY:
		return should_compact_bset_lazy(b, t) ||
			(compacting && !bset_written(b, bset(b, t)));
	case COMPACT_ALL:
		return true;
	default:
		BUG();
	}
}

static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
{
	struct bset_tree *t;
	bool ret = false;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;
		struct btree_node_entry *src = NULL, *dst = NULL;

		if (t != b->set && !bset_written(b, i)) {
			src = container_of(i, struct btree_node_entry, keys);
			dst = max(write_block(b),
				  (void *) btree_bkey_last(b, t - 1));
		}

		if (src != dst)
			ret = true;

		if (!should_compact_bset(b, t, ret, mode)) {
			if (src != dst) {
				memmove(dst, src, sizeof(*src) +
					le16_to_cpu(src->keys.u64s) *
					sizeof(u64));
				i = &dst->keys;
				set_btree_bset(b, t, i);
			}
			continue;
		}

		start	= btree_bkey_first(b, t);
		end	= btree_bkey_last(b, t);

		if (src != dst) {
			memmove(dst, src, sizeof(*src));
			i = &dst->keys;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_p_next(k);

			if (!bkey_deleted(k)) {
				bkey_p_copy(out, k);
				out = bkey_p_next(out);
			} else {
				BUG_ON(k->needs_whiteout);
			}
		}

		i->u64s = cpu_to_le16((u64 *) out - i->_data);
		set_btree_bset_end(b, t);
		bch2_bset_set_no_aux_tree(b, t);
		ret = true;
	}

	bch2_verify_btree_nr_keys(b);

	bch2_btree_build_aux_trees(b);

	return ret;
}

bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
			    enum compact_mode mode)
{
	return bch2_drop_whiteouts(b, mode);
}

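/*
 * Sort bsets [start_idx, end_idx) down into a single bset. When sorting the
 * entire node we sort into a full size bounce buffer and then just swap it
 * with the node's buffer; otherwise we sort into a smaller bounce buffer and
 * memcpy the result back over the first bset being sorted.
 */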
static void btree_node_sort(struct bch_fs *c, struct btree *b,
			    unsigned start_idx,
			    unsigned end_idx,
			    bool filter_whiteouts)
{
	struct btree_node *out;
	struct sort_iter_stack sort_iter;
	struct bset_tree *t;
	struct bset *start_bset = bset(b, &b->set[start_idx]);
	bool used_mempool = false;
	u64 start_time, seq = 0;
	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
	bool sorting_entire_node = start_idx == 0 &&
		end_idx == b->nsets;

	sort_iter_stack_init(&sort_iter, b);

	for (t = b->set + start_idx;
	     t < b->set + end_idx;
	     t++) {
		u64s += le16_to_cpu(bset(b, t)->u64s);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
	}

	bytes = sorting_entire_node
		? btree_buf_bytes(b)
		: __vstruct_bytes(struct btree_node, u64s);

	out = btree_bounce_alloc(c, bytes, &used_mempool);

	start_time = local_clock();

	u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);

	out->keys.u64s = cpu_to_le16(u64s);

	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);

	if (sorting_entire_node)
		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
				       start_time);

	/* Make sure we preserve bset journal_seq: */
	for (t = b->set + start_idx; t < b->set + end_idx; t++)
		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
	start_bset->journal_seq = cpu_to_le64(seq);

	if (sorting_entire_node) {
		u64s = le16_to_cpu(out->keys.u64s);

		BUG_ON(bytes != btree_buf_bytes(b));

		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		*out = *b->data;
		out->keys.u64s = cpu_to_le16(u64s);
		swap(out, b->data);
		set_btree_bset(b, b->set, &b->data->keys);
	} else {
		start_bset->u64s = out->keys.u64s;
		memcpy_u64s(start_bset->start,
			    out->keys.start,
			    le16_to_cpu(out->keys.u64s));
	}

	for (i = start_idx + 1; i < end_idx; i++)
		b->nr.bset_u64s[start_idx] +=
			b->nr.bset_u64s[i];

	b->nsets -= shift;

	for (i = start_idx + 1; i < b->nsets; i++) {
		b->nr.bset_u64s[i]	= b->nr.bset_u64s[i + shift];
		b->set[i]		= b->set[i + shift];
	}

	for (i = b->nsets; i < MAX_BSETS; i++)
		b->nr.bset_u64s[i] = 0;

	set_btree_bset_end(b, &b->set[start_idx]);
	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);

	btree_bounce_free(c, bytes, used_mempool, out);

	bch2_verify_btree_nr_keys(b);
}

void bch2_btree_sort_into(struct bch_fs *c,
			  struct btree *dst,
			  struct btree *src)
{
	struct btree_nr_keys nr;
	struct btree_node_iter src_iter;
	u64 start_time = local_clock();

	BUG_ON(dst->nsets != 1);

	bch2_bset_set_no_aux_tree(dst, dst->set);

	bch2_btree_node_iter_init_from_start(&src_iter, src);

	nr = bch2_sort_repack(btree_bset_first(dst),
			      src, &src_iter,
			      &dst->format,
			      true);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
			       start_time);

	set_btree_bset_end(dst, dst->set);

	dst->nr.live_u64s	+= nr.live_u64s;
	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
	dst->nr.packed_keys	+= nr.packed_keys;
	dst->nr.unpacked_keys	+= nr.unpacked_keys;

	bch2_verify_btree_nr_keys(dst);
}

/*
 * We're about to add another bset to the btree node, so if there are currently
 * too many bsets, sort some of them together:
 */
static bool btree_node_compact(struct bch_fs *c, struct btree *b)
{
	unsigned unwritten_idx;
	bool ret = false;

	for (unwritten_idx = 0;
	     unwritten_idx < b->nsets;
	     unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, unwritten_idx,
				b->nsets, false);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, 0, unwritten_idx, false);
		ret = true;
	}

	return ret;
}

void bch2_btree_build_aux_trees(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}

/*
 * If we have MAX_BSETS (3) bsets, should we sort them all down to just one?
 *
 * The first bset is going to be of similar order to the size of the node, the
 * last bset is bounded by btree_write_set_buffer(), which is set to keep the
 * memmove on insert from being too expensive: the middle bset should, ideally,
 * be the geometric mean of the first and the last.
 *
 * Returns true if the middle bset is greater than that geometric mean:
 */
static inline bool should_compact_all(struct bch_fs *c, struct btree *b)
{
	unsigned mid_u64s_bits =
		(ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2;

	return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits;
}

/*
 * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new bset
 * if @b doesn't already have one.
 *
 * If a sort was required (invalidating iterators), iterators are
 * reinitialized via bch2_trans_node_reinit_iter().
 */
void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_node_entry *bne;
	bool reinit_iter = false;

	EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
	BUG_ON(bset_written(b, bset(b, &b->set[1])));
	BUG_ON(btree_node_just_written(b));

	if (b->nsets == MAX_BSETS &&
	    !btree_node_write_in_flight(b) &&
	    should_compact_all(c, b)) {
		bch2_btree_node_write(c, b, SIX_LOCK_write,
				      BTREE_WRITE_init_next_bset);
		reinit_iter = true;
	}

	if (b->nsets == MAX_BSETS &&
	    btree_node_compact(c, b))
		reinit_iter = true;

	BUG_ON(b->nsets >= MAX_BSETS);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	if (reinit_iter)
		bch2_trans_node_reinit_iter(trans, b);
}

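/*
 * Error handling for btree node reads and writes: btree_err_msg() formats a
 * common header (device, btree position, node offset, bset u64s), and
 * __btree_err() decides what an error means - on the write path any
 * corruption is fatal unless errors=continue, while on the read path fixable
 * errors go through fsck_err(), and retryable errors are downgraded when we
 * have no other replica to retry from (have_retry is false).
 */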
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct bch_dev *ca,
			  struct btree *b, struct bset *i,
			  unsigned offset, int write)
{
	prt_printf(out, bch2_log_msg(c, "%s"),
		   write == READ
		   ? "error validating btree node "
		   : "corrupt btree node before write ");
	if (ca)
		prt_printf(out, "on %s ", ca->name);
	prt_printf(out, "at btree ");
	bch2_btree_pos_to_text(out, c, b);

	prt_printf(out, "\n node offset %u/%u",
		   b->written, btree_ptr_sectors_written(&b->key));
	if (i)
		prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
	prt_str(out, ": ");
}

__printf(9, 10)
static int __btree_err(int ret,
		       struct bch_fs *c,
		       struct bch_dev *ca,
		       struct btree *b,
		       struct bset *i,
		       int write,
		       bool have_retry,
		       enum bch_sb_error_id err_type,
		       const char *fmt, ...)
{
	struct printbuf out = PRINTBUF;
	va_list args;

	btree_err_msg(&out, c, ca, b, i, b->written, write);

	va_start(args, fmt);
	prt_vprintf(&out, fmt, args);
	va_end(args);

	if (write == WRITE) {
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = c->opts.errors == BCH_ON_ERROR_continue
			? 0
			: -BCH_ERR_fsck_errors_not_fixed;
		goto out;
	}

	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
		ret = -BCH_ERR_btree_node_read_err_fixable;
	if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
		ret = -BCH_ERR_btree_node_read_err_bad_node;

	if (ret != -BCH_ERR_btree_node_read_err_fixable)
		bch2_sb_error_count(c, err_type);

	switch (ret) {
	case -BCH_ERR_btree_node_read_err_fixable:
		ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
		if (ret != -BCH_ERR_fsck_fix &&
		    ret != -BCH_ERR_fsck_ignore)
			goto fsck_err;
		ret = -BCH_ERR_fsck_fix;
		break;
	case -BCH_ERR_btree_node_read_err_want_retry:
	case -BCH_ERR_btree_node_read_err_must_retry:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		break;
	case -BCH_ERR_btree_node_read_err_bad_node:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = bch2_topology_error(c);
		break;
	case -BCH_ERR_btree_node_read_err_incompatible:
		bch2_print_string_as_lines(KERN_ERR, out.buf);
		ret = -BCH_ERR_fsck_errors_not_fixed;
		break;
	default:
		BUG();
	}
out:
fsck_err:
	printbuf_exit(&out);
	return ret;
}

#define btree_err(type, c, ca, b, i, _err_type, msg, ...)		\
({									\
	int _ret = __btree_err(type, c, ca, b, i, write, have_retry,	\
			       BCH_FSCK_ERR_##_err_type,		\
			       msg, ##__VA_ARGS__);			\
									\
	if (_ret != -BCH_ERR_fsck_fix) {				\
		ret = _ret;						\
		goto fsck_err;						\
	}								\
									\
	*saw_error = true;						\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)

/*
 * When btree topology repair changes the start or end of a node, that might
 * mean we have to drop keys that are no longer inside the node:
 */
__cold
void bch2_btree_node_drop_keys_outside_node(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k;

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
				break;

		if (k != i->start) {
			unsigned shift = (u64 *) k - (u64 *) i->start;

			memmove_u64s_down(i->start, k,
					  (u64 *) vstruct_end(i) - (u64 *) k);
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
			set_btree_bset_end(b, t);
		}

		for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k))
			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
				break;

		if (k != vstruct_last(i)) {
			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
			set_btree_bset_end(b, t);
		}
	}

	/*
	 * Always rebuild search trees: eytzinger search tree nodes directly
	 * depend on the values of min/max key:
	 */
	bch2_bset_set_no_aux_tree(b, b->set);
	bch2_btree_build_aux_trees(b);
	b->nr = bch2_btree_node_count_keys(b);

	struct bkey_s_c k;
	struct bkey unpacked;
	struct btree_node_iter iter;
	for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));
	}
}

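/*
 * Validate a single bset's header: version, size and sector offset, and - for
 * the first bset, which shares the btree node header - that the node's
 * sequence number, btree id, level, min/max keys and key format match what
 * the pointer to this node said we should find:
 */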
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
			 struct btree *b, struct bset *i,
			 unsigned offset, unsigned sectors,
			 int write, bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	int ret = 0;

	btree_err_on(!bch2_version_compatible(version),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "unsupported bset version %u.%u",
		     BCH_VERSION_MAJOR(version),
		     BCH_VERSION_MINOR(version));

	if (btree_err_on(version < c->sb.version_min,
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_older_than_sb_min,
			 "bset version %u older than superblock version_min %u",
			 version, c->sb.version_min)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version_min = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	if (btree_err_on(BCH_VERSION_MAJOR(version) >
			 BCH_VERSION_MAJOR(c->sb.version),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, NULL, b, i,
			 btree_node_bset_newer_than_sb,
			 "bset version %u newer than superblock version %u",
			 version, c->sb.version)) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->version = cpu_to_le16(version);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
		     -BCH_ERR_btree_node_read_err_incompatible,
		     c, ca, b, i,
		     btree_node_unsupported_version,
		     "BSET_SEPARATE_WHITEOUTS no longer supported");

	if (btree_err_on(offset + sectors > btree_sectors(c),
			 -BCH_ERR_btree_node_read_err_fixable,
			 c, ca, b, i,
			 bset_past_end_of_btree_node,
			 "bset past end of btree node")) {
		i->u64s = 0;
		ret = 0;
		goto out;
	}

	btree_err_on(offset && !i->u64s,
		     -BCH_ERR_btree_node_read_err_fixable,
		     c, ca, b, i,
		     bset_empty,
		     "empty bset");

	btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
		     -BCH_ERR_btree_node_read_err_want_retry,
		     c, ca, b, i,
		     bset_wrong_sector_offset,
		     "bset at wrong sector offset");

	if (!offset) {
		struct btree_node *bn =
			container_of(i, struct btree_node, keys);
		/* These indicate that we read the wrong btree node: */

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			/* XXX endianness */
			btree_err_on(bp->seq != bn->keys.seq,
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     bset_bad_seq,
				     "incorrect sequence number (wrong btree node)");
		}

		btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_btree,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_level,
			     "incorrect level");

		if (!write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
			struct bch_btree_ptr_v2 *bp =
				&bkey_i_to_btree_ptr_v2(&b->key)->v;

			if (BTREE_PTR_RANGE_UPDATED(bp)) {
				b->data->min_key = bp->min_key;
				b->data->max_key = b->key.k.p;
			}

			btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
				     -BCH_ERR_btree_node_read_err_must_retry,
				     c, ca, b, NULL,
				     btree_node_bad_min_key,
				     "incorrect min_key: got %s should be %s",
				     (printbuf_reset(&buf1),
				      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
				     (printbuf_reset(&buf2),
				      bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf));
		}

		btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, i,
			     btree_node_bad_max_key,
			     "incorrect max key %s",
			     (printbuf_reset(&buf1),
			      bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));

		if (write)
			compat_btree_node(b->c.level, b->c.btree_id, version,
					  BSET_BIG_ENDIAN(i), write, bn);

		btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
			     -BCH_ERR_btree_node_read_err_bad_node,
			     c, ca, b, i,
			     btree_node_bad_format,
			     "invalid bkey format: %s\n %s", buf1.buf,
			     (printbuf_reset(&buf2),
			      bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
		printbuf_reset(&buf1);

		compat_bformat(b->c.level, b->c.btree_id, version,
			       BSET_BIG_ENDIAN(i), write,
			       &bn->format);
	}
out:
fsck_err:
	printbuf_exit(&buf2);
	printbuf_exit(&buf1);
	return ret;
}

static int bset_key_invalid(struct bch_fs *c, struct btree *b,
			    struct bkey_s_c k,
			    bool updated_range, int rw,
			    struct printbuf *err)
{
	return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
		(!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}

static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
			      struct bset *i, struct bkey_packed *k)
{
	if (bkey_p_next(k) > vstruct_last(i))
		return false;

	if (k->format > KEY_FORMAT_CURRENT)
		return false;

	if (!bkeyp_u64s_valid(&b->format, k))
		return false;

	struct printbuf buf = PRINTBUF;
	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	/* __bch2_bkey_invalid() returns nonzero if the key is invalid: */
	bool invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
	printbuf_exit(&buf);
	return !invalid;
}

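/*
 * Validate each key in a bset, in order. Keys that fail validation are
 * dropped - and since a bad k->u64s means we can no longer trust the offset
 * of the next key, we scan forward one u64 at a time until we find something
 * that again looks like a valid packed key, truncating the rest of the bset
 * if we don't:
 */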
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
			      struct bset *i, int write,
			      bool have_retry, bool *saw_error)
{
	unsigned version = le16_to_cpu(i->version);
	struct bkey_packed *k, *prev = NULL;
	struct printbuf buf = PRINTBUF;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	int ret = 0;

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s u;
		struct bkey tmp;
		unsigned next_good_key;

		if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_past_bset_end,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_bad_format,
				 "invalid bkey format %u", k->format))
			goto drop_this_key;

		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
				 -BCH_ERR_btree_node_read_err_fixable,
				 c, NULL, b, i,
				 btree_node_bkey_bad_u64s,
				 "bad k->u64s %u (min %u max %zu)", k->u64s,
				 bkeyp_key_u64s(&b->format, k),
				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
			goto drop_this_key;

		if (!write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);
		if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
			printbuf_reset(&buf);
			bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
			prt_printf(&buf, "\n ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "invalid bkey: %s", buf.buf);
			goto drop_this_key;
		}

		if (write)
			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
					 BSET_BIG_ENDIAN(i), write,
					 &b->format, k);

		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);
			prt_printf(&buf, "keys out of order: ");
			bch2_bkey_to_text(&buf, &up);
			prt_printf(&buf, " > ");
			bch2_bkey_to_text(&buf, u.k);

			if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
				      c, NULL, b, i,
				      btree_node_bkey_out_of_order,
				      "%s", buf.buf))
				goto drop_this_key;
		}

		prev = k;
		k = bkey_p_next(k);
		continue;
drop_this_key:
		next_good_key = k->u64s;

		if (!next_good_key ||
		    (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
		     version >= bcachefs_metadata_version_snapshot)) {
			/*
			 * only do scanning if bch2_bkey_compat() has nothing to
			 * do
			 */

			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
				for (next_good_key = 1;
				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
				     next_good_key++)
					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
						goto got_good_key;
			}

			/*
			 * didn't find a good key, have to truncate the rest of
			 * the bset
			 */
			next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
		}
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

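/*
 * Read path for a btree node: walk the bsets found in the buffer (stopping at
 * ptr_written if the pointer records how much was written), verifying each
 * bset's checksum and decrypting it before validating header and keys; then
 * sort all the bsets down into one with bch2_key_sort_fix_overlapping(), and
 * do a final pass that drops any keys whose values fail validation.
 */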
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			      struct btree *b, bool have_retry, bool *saw_error)
{
	struct btree_node_entry *bne;
	struct sort_iter *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool, blacklisted;
	bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
	unsigned u64s;
	unsigned ptr_written = btree_ptr_sectors_written(&b->key);
	struct printbuf buf = PRINTBUF;
	int ret = 0, retry_read = 0, write = READ;
	u64 start_time = local_clock();

	b->version_ondisk = U16_MAX;
	/* We might get called multiple times on read retry: */
	b->written = 0;

	iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
	sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);

	if (bch2_meta_read_fault("btree"))
		btree_err(-BCH_ERR_btree_node_read_err_must_retry,
			  c, ca, b, NULL,
			  btree_node_fault_injected,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     -BCH_ERR_btree_node_read_err_must_retry,
		     c, ca, b, NULL,
		     btree_node_bad_magic,
		     "bad magic: want %llx, got %llx",
		     bset_magic(c), le64_to_cpu(b->data->magic));

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bch_btree_ptr_v2 *bp =
			&bkey_i_to_btree_ptr_v2(&b->key)->v;

		bch2_bpos_to_text(&buf, b->data->min_key);
		prt_str(&buf, "-");
		bch2_bpos_to_text(&buf, b->data->max_key);

		btree_err_on(b->data->keys.seq != bp->seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "got wrong btree node (want %llx got %llx)\n"
			     "got btree %s level %llu pos %s",
			     bp->seq, b->data->keys.seq,
			     bch2_btree_id_str(BTREE_NODE_ID(b->data)),
			     BTREE_NODE_LEVEL(b->data),
			     buf.buf);
	} else {
		btree_err_on(!b->data->keys.seq,
			     -BCH_ERR_btree_node_read_err_must_retry,
			     c, ca, b, NULL,
			     btree_node_bad_seq,
			     "bad btree header: seq 0");
	}

	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		struct nonce nonce;
		bool first = !b->written;
		bool csum_bad;

		if (!b->written) {
			i = &b->data->keys;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);

			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
			csum_bad = bch2_crc_cmp(b->data->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"decrypting btree node: %s", bch2_err_str(ret)))
				goto fsck_err;

			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
				     -BCH_ERR_btree_node_read_err_incompatible,
				     c, NULL, b, NULL,
				     btree_node_unsupported_version,
				     "btree node does not have NEW_EXTENT_OVERWRITE set");

			sectors = vstruct_sectors(b->data, c->block_bits);
		} else {
			bne = write_block(b);
			i = &bne->keys;

			if (i->seq != b->data->keys.seq)
				break;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);
			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
			csum_bad = bch2_crc_cmp(bne->csum, csum);
			if (csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"decrypting btree node: %s", bch2_err_str(ret)))
				goto fsck_err;

			sectors = vstruct_sectors(bne, c->block_bits);
		}

		b->version_ondisk = min(b->version_ondisk,
					le16_to_cpu(i->version));

		ret = validate_bset(c, ca, b, i, b->written, sectors,
				    READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		if (!b->written)
			btree_node_set_format(b, b->data->format);

		ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
		if (ret)
			goto fsck_err;

		SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

		blacklisted = bch2_journal_seq_is_blacklisted(c,
					le64_to_cpu(i->journal_seq),
					true);

		btree_err_on(blacklisted && first,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     bset_blacklisted_journal_seq,
			     "first btree node bset has blacklisted journal seq (%llu)",
			     le64_to_cpu(i->journal_seq));

		btree_err_on(blacklisted && ptr_written,
			     -BCH_ERR_btree_node_read_err_fixable,
			     c, ca, b, i,
			     first_bset_blacklisted_journal_seq,
			     "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
			     le64_to_cpu(i->journal_seq),
			     b->written, b->written + sectors, ptr_written);

		b->written += sectors;

		if (blacklisted && !first)
			continue;

		sort_iter_add(iter,
			      vstruct_idx(i, 0),
			      vstruct_last(i));
	}

	if (ptr_written) {
		btree_err_on(b->written < ptr_written,
			     -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, NULL,
			     btree_node_data_missing,
			     "btree node data missing: expected %u sectors, found %u",
			     ptr_written, b->written);
	} else {
		for (bne = write_block(b);
		     bset_byte_offset(b, bne) < btree_buf_bytes(b);
		     bne = (void *) bne + block_bytes(c))
			btree_err_on(bne->keys.seq == b->data->keys.seq &&
				     !bch2_journal_seq_is_blacklisted(c,
								      le64_to_cpu(bne->keys.journal_seq),
								      true),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, NULL,
				     btree_node_bset_after_end,
				     "found bset signature after last bset");
	}

	sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
	sorted->keys.u64s = 0;

	set_btree_bset(b, b->set, &b->data->keys);

	b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);

	u64s = le16_to_cpu(sorted->keys.u64s);
	*sorted = *b->data;
	sorted->keys.u64s = cpu_to_le16(u64s);
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;

	BUG_ON(b->nr.live_u64s != u64s);

	btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);

	if (updated_range)
		bch2_btree_node_drop_keys_outside_node(b);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s u = __bkey_disassemble(b, k, &tmp);

		printbuf_reset(&buf);

		if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) ||
		    (bch2_inject_invalid_keys &&
		     !bversion_cmp(u.k->version, MAX_VERSION))) {
			printbuf_reset(&buf);

			prt_printf(&buf, "invalid bkey: ");
			bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
			prt_printf(&buf, "\n ");
			bch2_bkey_val_to_text(&buf, c, u.s_c);

			btree_err(-BCH_ERR_btree_node_read_err_fixable,
				  c, NULL, b, i,
				  btree_node_bad_bkey,
				  "%s", buf.buf);

			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_p_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			continue;
		}

		if (u.k->type == KEY_TYPE_btree_ptr_v2) {
			struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);

			bp.v->mem_ptr = 0;
		}

		k = bkey_p_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	set_needs_whiteout(btree_bset_first(b), true);

	btree_node_reset_sib_u64s(b);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
		struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);

		if (ca2->mi.state != BCH_MEMBER_STATE_rw)
			set_btree_node_need_rewrite(b);
	}

	if (!ptr_written)
		set_btree_node_need_rewrite(b);
out:
	mempool_free(iter, &c->fill_iter);
	printbuf_exit(&buf);
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
	return retry_read;
fsck_err:
	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
		retry_read = 1;
	} else {
		set_btree_node_read_error(b);
		bch2_btree_lost_data(c, b->c.btree_id);
	}
	goto out;
}

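/*
 * Read completion runs out of a workqueue. Note the goto into the middle of
 * the loop: the first pass just checks the status of the bio that was already
 * submitted, and we only reissue the read (to the next replica that
 * bch2_bkey_pick_read_device() gives us) if that attempt failed validation.
 */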
static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c	= rb->c;
	struct btree *b		= rb->b;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
	struct bio *bio		= &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	struct printbuf buf = PRINTBUF;
	bool saw_error = false;
	bool retry = false;
	bool can_retry;

	goto start;
	while (1) {
		retry = true;
		bch_info(c, "retrying read");
		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
		bio->bi_iter.bi_size	= btree_buf_bytes(b);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}
start:
		printbuf_reset(&buf);
		bch2_btree_pos_to_text(&buf, c, b);
		bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				   "btree read error %s for %s",
				   bch2_blk_status_to_str(bio->bi_status), buf.buf);
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
		rb->have_ioref = false;

		bch2_mark_io_failure(&failed, &rb->pick);

		can_retry = bch2_bkey_pick_read_device(c,
				bkey_i_to_s_c(&b->key),
				&failed, &rb->pick) > 0;

		if (!bio->bi_status &&
		    !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
			if (retry)
				bch_info(c, "retry success");
			break;
		}

		saw_error = true;

		if (!can_retry) {
			set_btree_node_read_error(b);
			bch2_btree_lost_data(c, b->c.btree_id);
			break;
		}
	}

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
			       rb->start_time);
	bio_put(&rb->bio);

	if (saw_error &&
	    !btree_node_read_error(b) &&
	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
		printbuf_reset(&buf);
		bch2_bpos_to_text(&buf, b->key.k.p);
		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
				    __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);

		bch2_btree_node_rewrite_async(c, b);
	}

	printbuf_exit(&buf);
	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);

		bch2_latency_acct(ca, rb->start_time, READ);
	}

	queue_work(c->io_complete_wq, &rb->work);
}

struct btree_node_read_all {
	struct closure		cl;
	struct bch_fs		*c;
	struct btree		*b;
	unsigned		nr;
	void			*buf[BCH_REPLICAS_MAX];
	struct bio		*bio[BCH_REPLICAS_MAX];
	blk_status_t		err[BCH_REPLICAS_MAX];
};

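/*
 * Return how many sectors of a replica's buffer were actually written: walk
 * successive bset entries for as long as their sequence number matches the
 * node header's.
 */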
static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < btree_sectors(c)) {
		if (!offset) {
			offset += vstruct_sectors(bn, c->block_bits);
		} else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

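/*
 * Check whether a bset with this node's sequence number appears after the
 * last contiguously written bset - i.e. whether a replica contains stray
 * bsets past the end of the node:
 */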
static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < btree_sectors(c)) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}
1425 | |
1426 | static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) |
1427 | { |
1428 | closure_type(ra, struct btree_node_read_all, cl); |
1429 | struct bch_fs *c = ra->c; |
1430 | struct btree *b = ra->b; |
1431 | struct printbuf buf = PRINTBUF; |
1432 | bool dump_bset_maps = false; |
1433 | bool have_retry = false; |
1434 | int ret = 0, best = -1, write = READ; |
1435 | unsigned i, written = 0, written2 = 0; |
1436 | __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 |
1437 | ? bkey_i_to_btree_ptr_v2(k: &b->key)->v.seq : 0; |
1438 | bool _saw_error = false, *saw_error = &_saw_error; |
1439 | |
1440 | for (i = 0; i < ra->nr; i++) { |
1441 | struct btree_node *bn = ra->buf[i]; |
1442 | |
1443 | if (ra->err[i]) |
1444 | continue; |
1445 | |
1446 | if (le64_to_cpu(bn->magic) != bset_magic(c) || |
1447 | (seq && seq != bn->keys.seq)) |
1448 | continue; |
1449 | |
1450 | if (best < 0) { |
1451 | best = i; |
1452 | written = btree_node_sectors_written(c, data: bn); |
1453 | continue; |
1454 | } |
1455 | |
1456 | written2 = btree_node_sectors_written(c, data: ra->buf[i]); |
1457 | if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, |
1458 | c, NULL, b, NULL, |
1459 | btree_node_replicas_sectors_written_mismatch, |
1460 | "btree node sectors written mismatch: %u != %u" , |
1461 | written, written2) || |
1462 | btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), |
1463 | -BCH_ERR_btree_node_read_err_fixable, |
1464 | c, NULL, b, NULL, |
1465 | btree_node_bset_after_end, |
1466 | "found bset signature after last bset" ) || |
1467 | btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), |
1468 | -BCH_ERR_btree_node_read_err_fixable, |
1469 | c, NULL, b, NULL, |
1470 | btree_node_replicas_data_mismatch, |
1471 | "btree node replicas content mismatch" )) |
1472 | dump_bset_maps = true; |
1473 | |
1474 | if (written2 > written) { |
1475 | written = written2; |
1476 | best = i; |
1477 | } |
1478 | } |
1479 | fsck_err: |
1480 | if (dump_bset_maps) { |
1481 | for (i = 0; i < ra->nr; i++) { |
1482 | struct btree_node *bn = ra->buf[i]; |
1483 | struct btree_node_entry *bne = NULL; |
1484 | unsigned offset = 0, sectors; |
1485 | bool gap = false; |
1486 | |
1487 | if (ra->err[i]) |
1488 | continue; |
1489 | |
1490 | printbuf_reset(buf: &buf); |
1491 | |
1492 | while (offset < btree_sectors(c)) { |
1493 | if (!offset) { |
1494 | sectors = vstruct_sectors(bn, c->block_bits); |
1495 | } else { |
1496 | bne = ra->buf[i] + (offset << 9); |
1497 | if (bne->keys.seq != bn->keys.seq) |
1498 | break; |
1499 | sectors = vstruct_sectors(bne, c->block_bits); |
1500 | } |
1501 | |
1502 | prt_printf(&buf, " %u-%u" , offset, offset + sectors); |
1503 | if (bne && bch2_journal_seq_is_blacklisted(c, |
1504 | le64_to_cpu(bne->keys.journal_seq), false)) |
1505 | prt_printf(&buf, "*" ); |
1506 | offset += sectors; |
1507 | } |
1508 | |
1509 | while (offset < btree_sectors(c)) { |
1510 | bne = ra->buf[i] + (offset << 9); |
1511 | if (bne->keys.seq == bn->keys.seq) { |
1512 | if (!gap) |
1513 | prt_printf(&buf, " GAP" ); |
1514 | gap = true; |
1515 | |
1516 | sectors = vstruct_sectors(bne, c->block_bits); |
1517 | prt_printf(&buf, " %u-%u" , offset, offset + sectors); |
1518 | if (bch2_journal_seq_is_blacklisted(c, |
1519 | le64_to_cpu(bne->keys.journal_seq), false)) |
1520 | prt_printf(&buf, "*" ); |
1521 | } |
1522 | offset++; |
1523 | } |
1524 | |
1525 | bch_err(c, "replica %u:%s" , i, buf.buf); |
1526 | } |
1527 | } |
1528 | |
1529 | if (best >= 0) { |
1530 | memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); |
1531 | ret = bch2_btree_node_read_done(c, NULL, b, have_retry: false, saw_error); |
1532 | } else { |
1533 | ret = -1; |
1534 | } |
1535 | |
1536 | if (ret) { |
1537 | set_btree_node_read_error(b); |
1538 | bch2_btree_lost_data(c, b->c.btree_id); |
1539 | } else if (*saw_error) |
1540 | bch2_btree_node_rewrite_async(c, b); |
1541 | |
1542 | for (i = 0; i < ra->nr; i++) { |
1543 | mempool_free(element: ra->buf[i], pool: &c->btree_bounce_pool); |
1544 | bio_put(ra->bio[i]); |
1545 | } |
1546 | |
1547 | closure_debug_destroy(cl: &ra->cl); |
1548 | kfree(objp: ra); |
1549 | printbuf_exit(&buf); |
1550 | |
1551 | clear_btree_node_read_in_flight(b); |
1552 | wake_up_bit(word: &b->flags, bit: BTREE_NODE_read_in_flight); |
1553 | } |
1554 | |
1555 | static void btree_node_read_all_replicas_endio(struct bio *bio) |
1556 | { |
1557 | struct btree_read_bio *rb = |
1558 | container_of(bio, struct btree_read_bio, bio); |
1559 | struct bch_fs *c = rb->c; |
1560 | struct btree_node_read_all *ra = rb->ra; |
1561 | |
1562 | if (rb->have_ioref) { |
1563 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: rb->pick.ptr.dev); |
1564 | |
1565 | bch2_latency_acct(ca, submit_time: rb->start_time, READ); |
1566 | } |
1567 | |
1568 | ra->err[rb->idx] = bio->bi_status; |
1569 | closure_put(cl: &ra->cl); |
1570 | } |
1571 | |
1572 | /* |
1573 | * XXX This allocates multiple times from the same mempools, and can deadlock |
1574 | * under sufficient memory pressure (but is only a debug path) |
1575 | */ |
1576 | static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) |
1577 | { |
1578 | struct bkey_s_c k = bkey_i_to_s_c(k: &b->key); |
1579 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
1580 | const union bch_extent_entry *entry; |
1581 | struct extent_ptr_decoded pick; |
1582 | struct btree_node_read_all *ra; |
1583 | unsigned i; |
1584 | |
1585 | ra = kzalloc(size: sizeof(*ra), GFP_NOFS); |
1586 | if (!ra) |
1587 | return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; |
1588 | |
1589 | closure_init(cl: &ra->cl, NULL); |
1590 | ra->c = c; |
1591 | ra->b = b; |
1592 | ra->nr = bch2_bkey_nr_ptrs(k); |
1593 | |
1594 | for (i = 0; i < ra->nr; i++) { |
1595 | ra->buf[i] = mempool_alloc(pool: &c->btree_bounce_pool, GFP_NOFS); |
1596 | ra->bio[i] = bio_alloc_bioset(NULL, |
1597 | nr_vecs: buf_pages(p: ra->buf[i], len: btree_buf_bytes(b)), |
1598 | opf: REQ_OP_READ|REQ_SYNC|REQ_META, |
1599 | GFP_NOFS, |
1600 | bs: &c->btree_bio); |
1601 | } |
1602 | |
1603 | i = 0; |
1604 | bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { |
1605 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: pick.ptr.dev); |
1606 | struct btree_read_bio *rb = |
1607 | container_of(ra->bio[i], struct btree_read_bio, bio); |
1608 | rb->c = c; |
1609 | rb->b = b; |
1610 | rb->ra = ra; |
1611 | rb->start_time = local_clock(); |
1612 | rb->have_ioref = bch2_dev_get_ioref(ca, READ); |
1613 | rb->idx = i; |
1614 | rb->pick = pick; |
1615 | rb->bio.bi_iter.bi_sector = pick.ptr.offset; |
1616 | rb->bio.bi_end_io = btree_node_read_all_replicas_endio; |
1617 | bch2_bio_map(bio: &rb->bio, base: ra->buf[i], btree_buf_bytes(b)); |
1618 | |
1619 | if (rb->have_ioref) { |
1620 | this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], |
1621 | bio_sectors(&rb->bio)); |
1622 | bio_set_dev(bio: &rb->bio, bdev: ca->disk_sb.bdev); |
1623 | |
1624 | closure_get(cl: &ra->cl); |
1625 | submit_bio(bio: &rb->bio); |
1626 | } else { |
1627 | ra->err[i] = BLK_STS_REMOVED; |
1628 | } |
1629 | |
1630 | i++; |
1631 | } |
1632 | |
1633 | if (sync) { |
1634 | closure_sync(cl: &ra->cl); |
1635 | btree_node_read_all_replicas_done(ws: &ra->cl.work); |
1636 | } else { |
1637 | continue_at(&ra->cl, btree_node_read_all_replicas_done, |
1638 | c->io_complete_wq); |
1639 | } |
1640 | |
1641 | return 0; |
1642 | } |
1643 | |
1644 | void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, |
1645 | bool sync) |
1646 | { |
1647 | struct bch_fs *c = trans->c; |
1648 | struct extent_ptr_decoded pick; |
1649 | struct btree_read_bio *rb; |
1650 | struct bch_dev *ca; |
1651 | struct bio *bio; |
1652 | int ret; |
1653 | |
1654 | trace_and_count(c, btree_node_read, trans, b); |
1655 | |
1656 | if (bch2_verify_all_btree_replicas && |
1657 | !btree_node_read_all_replicas(c, b, sync)) |
1658 | return; |
1659 | |
1660 | ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(k: &b->key), |
1661 | NULL, &pick); |
1662 | |
1663 | if (ret <= 0) { |
1664 | struct printbuf buf = PRINTBUF; |
1665 | |
1666 | prt_str(out: &buf, str: "btree node read error: no device to read from\n at " ); |
1667 | bch2_btree_pos_to_text(&buf, c, b); |
1668 | bch_err_ratelimited(c, "%s" , buf.buf); |
1669 | |
1670 | if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && |
1671 | c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) |
1672 | bch2_fatal_error(c); |
1673 | |
1674 | set_btree_node_read_error(b); |
1675 | bch2_btree_lost_data(c, b->c.btree_id); |
1676 | clear_btree_node_read_in_flight(b); |
1677 | wake_up_bit(word: &b->flags, bit: BTREE_NODE_read_in_flight); |
1678 | printbuf_exit(&buf); |
1679 | return; |
1680 | } |
1681 | |
1682 | ca = bch_dev_bkey_exists(c, idx: pick.ptr.dev); |
1683 | |
1684 | bio = bio_alloc_bioset(NULL, |
1685 | nr_vecs: buf_pages(p: b->data, len: btree_buf_bytes(b)), |
1686 | opf: REQ_OP_READ|REQ_SYNC|REQ_META, |
1687 | GFP_NOFS, |
1688 | bs: &c->btree_bio); |
1689 | rb = container_of(bio, struct btree_read_bio, bio); |
1690 | rb->c = c; |
1691 | rb->b = b; |
1692 | rb->ra = NULL; |
1693 | rb->start_time = local_clock(); |
1694 | rb->have_ioref = bch2_dev_get_ioref(ca, READ); |
1695 | rb->pick = pick; |
1696 | INIT_WORK(&rb->work, btree_node_read_work); |
1697 | bio->bi_iter.bi_sector = pick.ptr.offset; |
1698 | bio->bi_end_io = btree_node_read_endio; |
1699 | bch2_bio_map(bio, base: b->data, btree_buf_bytes(b)); |
1700 | |
1701 | if (rb->have_ioref) { |
1702 | this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], |
1703 | bio_sectors(bio)); |
1704 | bio_set_dev(bio, bdev: ca->disk_sb.bdev); |
1705 | |
1706 | if (sync) { |
1707 | submit_bio_wait(bio); |
1708 | bch2_latency_acct(ca, submit_time: rb->start_time, READ); |
1709 | btree_node_read_work(work: &rb->work); |
1710 | } else { |
1711 | submit_bio(bio); |
1712 | } |
1713 | } else { |
1714 | bio->bi_status = BLK_STS_REMOVED; |
1715 | |
1716 | if (sync) |
1717 | btree_node_read_work(work: &rb->work); |
1718 | else |
1719 | queue_work(wq: c->io_complete_wq, work: &rb->work); |
1720 | } |
1721 | } |
1722 | |
static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
				  const struct bkey_i *k, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, level != 0);
	bch2_btree_cache_cannibalize_unlock(trans);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	set_btree_node_read_in_flight(b);

	bch2_btree_node_read(trans, b, true);

	if (btree_node_read_error(b)) {
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		mutex_lock(&c->btree_cache.lock);
		list_move(&b->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		ret = -BCH_ERR_btree_node_read_error;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	return ret;
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			 const struct bkey_i *k, unsigned level)
{
	return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}

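/*
 * b->will_make_reachable is a tagged pointer: the pointer part is the
 * btree_update that will make this node reachable, the low bit indicates
 * that the pending write holds a closure ref on it - roughly:
 *
 *	struct btree_update *as = (void *) (v & ~1UL);	// pointer part
 *	bool write_holds_ref = v & 1;			// tag bit
 *
 * (variable names here are illustrative, not from the source). Below we
 * clear the tag bit with cmpxchg, and whichever thread cleared it drops the
 * ref.
 */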
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
				      struct btree_write *w)
{
	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);

	do {
		old = new = v;
		if (!(old & 1))
			break;

		new &= ~1UL;
	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);

	if (old & 1)
		closure_put(&((struct btree_update *) new)->cl);

	bch2_journal_pin_drop(&c->journal, &w->journal);
}

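/*
 * Write completion: if the node was redirtied and marked as needing another
 * write while this one was in flight (and nothing blocks a write), atomically
 * transition straight into the next write rather than clearing
 * write_in_flight - otherwise there would be a window where the node is dirty
 * and needs a write but none is pending.
 */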
static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_write *w = btree_prev_write(b);
	unsigned long old, new, v;
	unsigned type = 0;

	bch2_btree_complete_write(c, b, w);

	v = READ_ONCE(b->flags);
	do {
		old = new = v;

		if ((old & (1U << BTREE_NODE_dirty)) &&
		    (old & (1U << BTREE_NODE_need_write)) &&
		    !(old & (1U << BTREE_NODE_never_write)) &&
		    !(old & (1U << BTREE_NODE_write_blocked)) &&
		    !(old & (1U << BTREE_NODE_will_make_reachable))) {
			new &= ~(1U << BTREE_NODE_dirty);
			new &= ~(1U << BTREE_NODE_need_write);
			new |= (1U << BTREE_NODE_write_in_flight);
			new |= (1U << BTREE_NODE_write_in_flight_inner);
			new |= (1U << BTREE_NODE_just_written);
			new ^= (1U << BTREE_NODE_write_idx);

			type = new & BTREE_WRITE_TYPE_MASK;
			new &= ~BTREE_WRITE_TYPE_MASK;
		} else {
			new &= ~(1U << BTREE_NODE_write_in_flight);
			new &= ~(1U << BTREE_NODE_write_in_flight_inner);
		}
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	if (new & (1U << BTREE_NODE_write_in_flight))
		__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
	else
		wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}

static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_trans *trans = bch2_trans_get(c);

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
	__btree_node_write_done(c, b);
	six_unlock_read(&b->c.lock);

	bch2_trans_put(trans);
}

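/*
 * Process-context half of write completion: drop pointers to devices the
 * write failed on; if no pointers remain the write failed entirely, otherwise
 * update the node's key in the btree to drop the failed replicas (not needed
 * for a node's first write, since the key hasn't been inserted into the btree
 * yet).
 */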
static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c = wbio->wbio.c;
	struct btree *b = wbio->wbio.bio.bi_private;
	struct bch_extent_ptr *ptr;
	int ret = 0;

	btree_bounce_free(c,
		wbio->data_bytes,
		wbio->wbio.used_mempool,
		wbio->data);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
		ret = -BCH_ERR_btree_node_write_all_failed;
		goto err;
	}

	if (wbio->wbio.first_btree_write) {
		if (wbio->wbio.failed.nr) {
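			/*
			 * Intentionally empty: on a node's first write the key
			 * hasn't been inserted into the btree yet, so there is
			 * nothing to update here - presumably the interior
			 * update that makes the node reachable is what deals
			 * with the failed replicas.
			 */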
		}
	} else {
		ret = bch2_trans_do(c, NULL, NULL, 0,
			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
					BCH_WATERMARK_interior_updates|
					BCH_TRANS_COMMIT_journal_reclaim|
					BCH_TRANS_COMMIT_no_enospc|
					BCH_TRANS_COMMIT_no_check_rw,
					!wbio->wbio.failed.nr));
		if (ret)
			goto err;
	}
out:
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
	return;
err:
	set_btree_node_noevict(b);
	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
			     "writing btree node: %s", bch2_err_str(ret));
	goto out;
}

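/*
 * Per-replica write completion, may run in interrupt context: record any
 * error against the device it came from, and once the last split bio has
 * completed punt the rest of the work to btree_node_write_work().
 */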
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio = to_wbio(bio);
	struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig = parent ?: wbio;
	struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio);
	struct bch_fs *c = wbio->c;
	struct btree *b = wbio->bio.bi_private;
	struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "btree write error: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
		return;
	}

	clear_btree_node_write_in_flight_inner(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
	INIT_WORK(&wb->work, btree_node_write_work);
	queue_work(c->btree_io_complete_wq, &wb->work);
}

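/*
 * Sanity check a bset before it goes out: catching a bad key or bset here,
 * before it's written, is far cheaper than discovering the corruption at read
 * time.
 */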
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	struct printbuf buf = PRINTBUF;
	bool saw_error;
	int ret;

	ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key),
				BKEY_TYPE_btree, WRITE, &buf);

	if (ret)
		bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf);
	printbuf_exit(&buf);
	if (ret)
		return ret;

	ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
		validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
	if (ret) {
		bch2_inconsistent_error(c);
		dump_stack();
	}

	return ret;
}

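/*
 * Actually submit the write: the pointers in the key point at the start of
 * the node, but this write may be for a bset partway into it, so shift each
 * pointer by the offset (in sectors) of this write within the node.
 */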
static void btree_write_submit(struct work_struct *work)
{
	struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;

	bkey_copy(&tmp.k, &wbio->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr)
		ptr->offset += wbio->sector_offset;

	bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree,
				  &tmp.k, false);
}

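/*
 * Write out the dirty portion of a btree node. @flags carries a
 * BTREE_WRITE_* type in BTREE_WRITE_TYPE_MASK, optionally combined with
 * BTREE_WRITE_ONLY_IF_NEED or BTREE_WRITE_ALREADY_STARTED; may be called
 * with only a read lock held on the node.
 */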
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
{
	struct btree_write_bio *wbio;
	struct bset_tree *t;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	struct sort_iter_stack sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
	void *data;
	int ret;

	if (flags & BTREE_WRITE_ALREADY_STARTED)
		goto do_write;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is our
	 * "lock" against racing with other threads that may be trying to start
	 * a write, we do a write iff we clear the dirty bit. Since setting the
	 * dirty bit requires a write lock, we can't race with other threads
	 * redirtying it:
	 */
	do {
		old = new = READ_ONCE(b->flags);

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if ((flags & BTREE_WRITE_ONLY_IF_NEED) &&
		    !(old & (1 << BTREE_NODE_need_write)))
			return;

		if (old &
		    ((1 << BTREE_NODE_never_write)|
		     (1 << BTREE_NODE_write_blocked)))
			return;

		if (b->written &&
		    (old & (1 << BTREE_NODE_will_make_reachable)))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight))
			return;

		if (flags & BTREE_WRITE_ONLY_IF_NEED)
			type = new & BTREE_WRITE_TYPE_MASK;
		new &= ~BTREE_WRITE_TYPE_MASK;

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |= (1 << BTREE_NODE_write_in_flight);
		new |= (1 << BTREE_NODE_write_in_flight_inner);
		new |= (1 << BTREE_NODE_just_written);
		new ^= (1 << BTREE_NODE_write_idx);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);

	if (new & (1U << BTREE_NODE_need_write))
		return;
do_write:
	BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));

	atomic_dec(&c->btree_cache.dirty);

	BUG_ON(btree_node_fake(b));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= btree_sectors(c));
	BUG_ON(b->written & (block_sectors(c) - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

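	/*
	 * Gather every bset that hasn't been written yet and sort them into a
	 * single contiguous bset in a bounce buffer: we only ever write
	 * ranges of the node that haven't already hit disk.
	 */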
	bch2_sort_whiteouts(c, b);

	sort_iter_stack_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter.iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	BUG_ON(b->written && !seq);

	/* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
	bytes += 8;

	/* buffer must be a multiple of the block size */
	bytes = round_up(bytes, block_bytes(c));

	data = btree_bounce_alloc(c, bytes, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq = cpu_to_le64(seq);
	i->u64s = 0;

	sort_iter_add(&sort_iter.iter,
		      unwritten_whiteouts_start(b),
		      unwritten_whiteouts_end(b));
	SET_BSET_SEPARATE_WHITEOUTS(i, false);

	b->whiteout_u64s = 0;

	u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
	le16_add_cpu(&i->u64s, u64s);

	BUG_ON(!b->written && i->u64s != b->data->keys.u64s);

	set_needs_whiteout(i, false);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	if (!b->written &&
	    b->key.k.type == KEY_TYPE_btree_ptr_v2)
		BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write);

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > btree_sectors(c));
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = cpu_to_le16(c->sb.version);
	SET_BSET_OFFSET(i, b->written);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	ret = bset_encrypt(c, i, b->written << 9);
	if (bch2_fs_fatal_err_on(ret, c,
			"encrypting btree node: %s", bch2_err_str(ret)))
		goto err;

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Also on journal error, the pending write may have updates that were
	 * never journalled (interior nodes, see btree_update_nodes_written()) -
	 * it's critical that we don't do the write in that case otherwise we
	 * will have updates visible that weren't in the journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META,
				GFP_NOFS,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data = data;
	wbio->data_bytes = bytes;
	wbio->sector_offset = b->written;
	wbio->wbio.c = c;
	wbio->wbio.used_mempool = used_mempool;
	wbio->wbio.first_btree_write = !b->written;
	wbio->wbio.bio.bi_end_io = btree_node_write_endio;
	wbio->wbio.bio.bi_private = b;

	bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);

	bkey_copy(&wbio->key, &b->key);

	b->written += sectors_to_write;

	if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
			cpu_to_le16(b->written);

	atomic64_inc(&c->btree_write_stats[type].nr);
	atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);

	INIT_WORK(&wbio->work, btree_write_submit);
	queue_work(c->io_complete_wq, &wbio->work);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, bytes, used_mempool, data);
	__btree_node_write_done(c, b);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;
	struct bset_tree *t;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, 0, b->nsets, true);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t), true);

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we have
	 * to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held,
			   unsigned flags)
{
	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->c.lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

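/*
 * Wait for all in-flight reads or writes (@flag) on cached btree nodes to
 * complete. We can't sleep under rcu_read_lock(), so on finding a node with
 * the flag set we drop rcu, wait on the bit, then rescan the hash table from
 * the start.
 */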
static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
	bool ret = false;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			ret = true;
			goto restart;
		}
	rcu_read_unlock();

	return ret;
}

bool bch2_btree_flush_all_reads(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
	return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}

static const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
	BCH_BTREE_WRITE_TYPES()
	NULL
};

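/*
 * Print per-write-type stats: number of writes and mean write size, as two
 * tab-aligned columns, e.g. (type names per BCH_BTREE_WRITE_TYPES(), values
 * here are made up):
 *
 *			nr	size
 *	initial:	60	4.0 KiB
 *	journal_reclaim:	120	8.0 KiB
 */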
void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
	printbuf_tabstop_push(out, 20);
	printbuf_tabstop_push(out, 10);

	prt_tab(out);
	prt_str(out, "nr");
	prt_tab(out);
	prt_str(out, "size");
	prt_newline(out);

	for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
		u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
		u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);

		prt_printf(out, "%s:", bch2_btree_write_types[i]);
		prt_tab(out);
		prt_u64(out, nr);
		prt_tab(out);
		prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
		prt_newline(out);
	}
}