// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>

static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
	BTREE_UPDATE_MODES()
#undef x
	NULL
};

static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				  btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);

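/*
 * Get a disposable, unlocked path pointing at (btree_id, level, pos): the
 * path is created intent locked, then downgraded and fully unlocked -
 * callers splice in the nodes they're creating and take the locks
 * themselves, via six_lock_increment() on locks they already hold:
 */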
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
					      enum btree_id btree_id,
					      unsigned level,
					      struct bpos pos)
{
	btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
						  BTREE_ITER_NOPRESERVE|
						  BTREE_ITER_INTENT, _RET_IP_);
	path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);

	struct btree_path *path = trans->paths + path_idx;
	bch2_btree_path_downgrade(trans, path);
	__bch2_btree_path_unlock(trans, path);
	return path_idx;
}

/*
 * Verify that child nodes correctly span parent node's range:
 */
int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
		: b->data->min_key;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	struct bkey_buf prev;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev);
	bkey_init(&prev.k->k);
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			goto out;

		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);

		struct bpos expected_min = bkey_deleted(&prev.k->k)
			? node_min
			: bpos_successor(prev.k->k.p);

		if (!bpos_eq(expected_min, bp.v->min_key)) {
			bch2_topology_error(c);

			printbuf_reset(&buf);
			prt_str(&buf, "end of prev node doesn't match start of next node\n");
			prt_printf(&buf, "  in btree %s level %u node ",
				   bch2_btree_id_str(b->c.btree_id), b->c.level);
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			prt_str(&buf, "\n  prev ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
			prt_str(&buf, "\n  next ");
			bch2_bkey_val_to_text(&buf, c, k);

			need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
			goto topology_repair;
		}

		bch2_bkey_buf_reassemble(&prev, c, k);
		bch2_btree_and_journal_iter_advance(&iter);
	}

	if (bkey_deleted(&prev.k->k)) {
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "empty interior node\n");
		prt_printf(&buf, "  in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

		need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
		goto topology_repair;
	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "last child node doesn't end at end of parent node\n");
		prt_printf(&buf, "  in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, "\n  last key ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));

		need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
		goto topology_repair;
	}
out:
fsck_err:
	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&prev, c);
	printbuf_exit(&buf);
	return ret;
topology_repair:
	if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
		bch2_inconsistent_error(c);
		ret = -BCH_ERR_btree_need_topology_repair;
	} else {
		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
	}
	goto out;
}

/* Calculate ideal packed bkey format for new btree nodes: */

static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}

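/*
 * Compute how many u64s @nr keys would take up when repacked with @new_f:
 * currently-packed keys change size by the difference in key_u64s, and
 * currently-unpacked keys (occupying BKEY_U64s now) are assumed to pack
 * successfully with the new format - see bch2_btree_node_format_fits()
 * below for that assumption:
 */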
static size_t btree_node_u64s_with_format(struct btree_nr_keys nr,
					  struct bkey_format *old_f,
					  struct bkey_format *new_f)
{
	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) nr.unpacked_keys);

	BUG_ON(delta + nr.live_u64s < 0);

	return nr.live_u64s + delta;
}

/**
 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
 *
 * @c:		filesystem handle
 * @b:		btree node to rewrite
 * @nr:		number of keys for new node (i.e. b->nr)
 * @new_f:	bkey format to translate keys to
 *
 * Returns: true if all re-packed keys will be able to fit in a new node.
 *
 * Assumes all keys will successfully pack with the new format.
 */
static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
					struct btree_nr_keys nr,
					struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}

/* Btree node freeing/allocation: */

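/*
 * Put a node back on the btree cache freeable list: the caller holds the
 * node write locked, and the node must have no remaining dirty state,
 * blocked writes or open buckets (asserted below):
 */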
static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;

	trace_and_count(c, btree_node_free, trans, b);

	BUG_ON(btree_node_write_blocked(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}

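/*
 * Free a node that's been replaced: drop it from the hash table, then fix
 * up any paths that still have it linked in so they'll retraverse:
 */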
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree_path *path,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	unsigned i, level = b->c.level;

	bch2_btree_node_lock_write_nofail(trans, path, &b->c);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	__btree_node_free(trans, b);
	six_unlock_write(&b->c.lock);
	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}

static void bch2_btree_node_free_never_used(struct btree_update *as,
					    struct btree_trans *trans,
					    struct btree *b)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
	struct btree_path *path;
	unsigned i, level = b->c.level;

	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));

	b->will_make_reachable = 0;
	closure_put(&as->cl);

	clear_btree_node_will_make_reachable(b);
	clear_btree_node_accessed(b);
	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);

	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	bch2_btree_node_hash_remove(&c->btree_cache, b);
	mutex_unlock(&c->btree_cache.lock);

	BUG_ON(p->nr >= ARRAY_SIZE(p->b));
	p->b[p->nr++] = b;

	six_unlock_intent(&b->c.lock);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}

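/*
 * Allocate a new btree node: first try to reuse an allocation from the
 * btree reserve cache, otherwise allocate fresh sectors from the btree
 * write point; then pair the allocation with an in-memory node:
 */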
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
					     struct disk_reservation *res,
					     struct closure *cl,
					     bool interior_node,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct btree *b;
	BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets obs = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
		? BTREE_NODE_RESERVE
		: 0;
	int ret;

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		obs = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);
retry:
	ret = bch2_alloc_sectors_start_trans(trans,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      min(res->nr_replicas,
					  c->opts.metadata_replicas_required),
				      watermark, 0, cl, &wp);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &obs);
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(trans, interior_node);
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));
	BUG_ON(b->ob.nr);

	bkey_copy(&b->key, &tmp.k);
	b->ob = obs;

	return b;
}

static struct btree *bch2_btree_node_alloc(struct btree_update *as,
					   struct btree_trans *trans,
					   unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!p->nr);

	b = p->b[--p->nr];

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

	set_btree_node_accessed(b);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level = level;
	b->c.btree_id = as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr = 0;
		bp->v.seq = b->data->keys.seq;
		bp->v.sectors_written = 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_and_count(c, btree_node_alloc, trans, b);
	bch2_increment_clock(c, btree_sectors(c), WRITE);
	return b;
}

static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}

static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree_trans *trans,
						       struct btree *b)
{
	struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level);
	struct bkey_format format = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format))
		format = b->format;

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format = format;
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);
	return n;
}

static struct btree *__btree_root_alloc(struct btree_update *as,
					struct btree_trans *trans, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, trans, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	return b;
}

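/*
 * Return unused preallocated nodes: open buckets go back to the btree
 * reserve cache if there's room, and the in-memory nodes themselves are
 * freed:
 */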
static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p;

	for (p = as->prealloc_nodes;
	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
	     p++) {
		while (p->nr) {
			struct btree *b = p->b[--p->nr];

			mutex_lock(&c->btree_reserve_cache_lock);

			if (c->btree_reserve_cache_nr <
			    ARRAY_SIZE(c->btree_reserve_cache)) {
				struct btree_alloc *a =
					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

				a->ob = b->ob;
				b->ob.nr = 0;
				bkey_copy(&a->k, &b->key);
			} else {
				bch2_open_buckets_put(c, &b->ob);
			}

			mutex_unlock(&c->btree_reserve_cache_lock);

			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
			btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
			__btree_node_free(trans, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
		}
	}
}

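/*
 * Preallocate all the btree nodes (leaf and interior) this update might
 * need up front, so that allocation shouldn't fail out from under us once
 * the update proper has started:
 */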
static int bch2_btree_reserve_get(struct btree_trans *trans,
				  struct btree_update *as,
				  unsigned nr_nodes[2],
				  unsigned flags,
				  struct closure *cl)
{
	struct btree *b;
	unsigned interior;
	int ret = 0;

	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 */
	ret = bch2_btree_cache_cannibalize_lock(trans, cl);
	if (ret)
		return ret;

	for (interior = 0; interior < 2; interior++) {
		struct prealloc_nodes *p = as->prealloc_nodes + interior;

		while (p->nr < nr_nodes[interior]) {
			b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
						    interior, flags);
			if (IS_ERR(b)) {
				ret = PTR_ERR(b);
				goto err;
			}

			p->b[p->nr++] = b;
		}
	}
err:
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}

/* Asynchronous interior node update machinery */

static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_add_key(struct btree_update *as,
				 struct keylist *keys, struct btree *b)
{
	struct bkey_i *k = &b->key;

	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));

	bkey_copy(keys->top, k);
	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;

	bch2_keylist_push(keys);
}

static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
{
	for_each_keylist_key(&as->new_keys, k)
		if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
			return false;
	return true;
}

static void btree_update_new_nodes_mark_sb(struct btree_update *as)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->sb_lock);
	for_each_keylist_key(&as->new_keys, k)
		bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
}

/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
	int ret = PTR_ERR_OR_ZERO(e);
	if (ret)
		return ret;

	memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));

	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->old_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
					   BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->new_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
					   BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	return 0;
}

static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct btree_trans *trans = bch2_trans_get(c);
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	if (!btree_update_new_nodes_marked_sb(as))
		btree_update_new_nodes_mark_sb(as);

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		__le64 seq;

		b = as->old_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		seq = b->data ? b->data->keys.seq : 0;
		six_unlock_read(&b->c.lock);

		if (seq == as->old_nodes_seq[i])
			wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	ret = commit_do(trans, &as->disk_res, &journal_seq,
			BCH_WATERMARK_interior_updates|
			BCH_TRANS_COMMIT_no_enospc|
			BCH_TRANS_COMMIT_no_check_rw|
			BCH_TRANS_COMMIT_journal_reclaim,
			btree_update_nodes_written_trans(trans, as));
	bch2_trans_unlock(trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "%s", bch2_err_str(ret));
err:
	/*
	 * We have to be careful because another thread might be getting ready
	 * to free as->b and calling btree_update_reparent() on us - we'll
	 * recheck under btree_update_lock below:
	 */
	b = READ_ONCE(as->b);
	if (b) {
		btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
						as->btree_id, b->c.level, b->key.k.p);
		struct btree_path *path = trans->paths + path_idx;
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		/*
		 * Ensure transaction is unlocked before using
		 * btree_node_lock_nopath() (the use of which is always suspect,
		 * we need to work on removing this in the future)
		 *
		 * It should be, but get_unlocked_mut_path() -> bch2_path_get()
		 * calls bch2_path_upgrade(), before we call path_make_mut(), so
		 * we may rarely end up with a locked path besides the one we
		 * have here:
		 */
		bch2_trans_unlock(trans);
		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
		mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
		path->l[b->c.level].b = b;

		bch2_btree_node_lock_write_nofail(trans, path, &b->c);

		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);
		if (list_empty(&b->write_blocked))
			clear_btree_node_write_blocked(b);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				struct bset *last = btree_bset_last(b);

				last->journal_seq = cpu_to_le64(
					max(journal_seq,
					    le64_to_cpu(last->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);

		mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		btree_node_unlock(trans, path, b->c.level);
		bch2_path_put(trans, path_idx, true);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
		clear_btree_node_will_make_reachable(b);
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as, trans);
	bch2_trans_put(trans);
}

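/*
 * Run the second, transactional half of completed interior updates: we
 * only ever process the head of the unwritten list, so updates complete
 * in the order they were started:
 */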
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}

static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
	closure_type(as, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);
	BUG_ON(as->update_level_end < b->c.level);
	BUG_ON(!btree_node_dirty(b));
	BUG_ON(!b->c.level);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode = BTREE_UPDATE_node;
	as->b = b;
	as->update_level_end = b->c.level;

	set_btree_node_write_blocked(b);
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static int bch2_update_reparent_journal_pin_flush(struct journal *j,
						  struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_UPDATE_update;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
			      bch2_update_reparent_journal_pin_flush);
}

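/*
 * A new root has no parent to insert a pointer into: instead, journal the
 * new root key directly as a btree_root entry:
 */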
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_UPDATE_none);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode = BTREE_UPDATE_root;
	mutex_unlock(&c->btree_interior_update_lock);
}

/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;
	set_btree_node_will_make_reachable(b);

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_add_key(as, &as->new_keys, b);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data;
		unsigned sectors = round_up(bytes, block_bytes(c)) >> 9;

		bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written =
			cpu_to_le16(sectors);
	}
}

/*
 * Drop @b from the btree_update (if any) that was going to make it
 * reachable, dropping that update's ref on @b:
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	clear_btree_node_will_make_reachable(b);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}

static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}

static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
							      struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);
	clear_btree_node_write_blocked(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
			      bch2_btree_update_will_free_node_journal_pin_flush);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_add_key(as, &as->old_keys, b);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}

static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_UPDATE_none);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as, trans);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}

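/*
 * Start an interior update: walk up from @level_start, upgrading locks and
 * counting how many new nodes we might need at each level, then take the
 * gc lock and a disk reservation and preallocate the nodes:
 */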
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level_start, bool split, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	unsigned nr_nodes[2] = { 0, 0 };
	unsigned level_end = level_start;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
	int ret = 0;
	u32 restart_count = trans->restart_count;

	BUG_ON(!path->should_be_locked);

	if (watermark == BCH_WATERMARK_copygc)
		watermark = BCH_WATERMARK_btree_copygc;
	if (watermark < BCH_WATERMARK_btree)
		watermark = BCH_WATERMARK_btree;

	flags &= ~BCH_WATERMARK_MASK;
	flags |= watermark;

	if (watermark < BCH_WATERMARK_reclaim &&
	    test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
		if (flags & BCH_TRANS_COMMIT_journal_reclaim)
			return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);

		bch2_trans_unlock(trans);
		wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);
	}

	while (1) {
		nr_nodes[!!level_end] += 1 + split;
		level_end++;

		ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
		if (ret)
			return ERR_PTR(ret);

		if (!btree_path_node(path, level_end)) {
			/* Allocating new root? */
			nr_nodes[1] += split;
			level_end = BTREE_MAX_DEPTH;
			break;
		}

		/*
		 * Always check for space for two keys, even if we won't have to
		 * split at prior level - it might have been a merge instead:
		 */
		if (bch2_btree_node_insert_fits(path->l[level_end].b,
						BKEY_BTREE_PTR_U64s_MAX * 2))
			break;

		split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
	}

	if (!down_read_trylock(&c->gc_lock)) {
		ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
		if (ret) {
			up_read(&c->gc_lock);
			return ERR_PTR(ret);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c = c;
	as->start_time = start_time;
	as->ip_started = _RET_IP_;
	as->mode = BTREE_UPDATE_none;
	as->watermark = watermark;
	as->took_gc_lock = true;
	as->btree_id = path->btree_id;
	as->update_level_start = level_start;
	as->update_level_end = level_end;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
	if (bch2_err_matches(ret, ENOSPC) ||
	    bch2_err_matches(ret, ENOMEM)) {
		struct closure cl;

		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if (bch2_err_matches(ret, ENOSPC) &&
		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
		    watermark < BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			goto err;
		}

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);

			bch2_trans_unlock(trans);
			closure_sync(&cl);
		} while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
	}

	if (ret) {
		trace_and_count(c, btree_reserve_get_fail, trans->fn,
				_RET_IP_, nr_nodes[0] + nr_nodes[1], ret);
		goto err;
	}

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	bch2_trans_verify_not_restarted(trans, restart_count);
	return as;
err:
	bch2_btree_update_free(as, trans);
	if (!bch2_err_matches(ret, ENOSPC) &&
	    !bch2_err_matches(ret, EROFS) &&
	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
		bch_err_fn_ratelimited(c, ret);
	return ERR_PTR(ret);
}

/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	mutex_lock(&c->btree_root_lock);
	bch2_btree_id_root(c, b->c.btree_id)->b = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

static int bch2_btree_set_root(struct btree_update *as,
			       struct btree_trans *trans,
			       struct btree_path *path,
			       struct btree *b,
			       bool nofail)
{
	struct bch_fs *c = as->c;

	trace_and_count(c, btree_node_set_root, trans, b);

	struct btree *old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	if (nofail) {
		bch2_btree_node_lock_write_nofail(trans, path, &old->c);
	} else {
		int ret = bch2_btree_node_lock_write(trans, path, &old->c);
		if (ret)
			return ret;
	}

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
	return 0;
}

/* Interior node updates: */

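/*
 * Insert a single new btree pointer into an interior node, validating it
 * first; the key is also added to @as's journal entries, so the update is
 * replayed if we crash before the node is written:
 */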
static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	struct printbuf buf = PRINTBUF;
	unsigned long old, new, v;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
			      btree_node_type(b), WRITE, &buf) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "inserting invalid bkey\n  ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
		prt_printf(&buf, "\n  ");
		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
				  btree_node_type(b), WRITE, &buf);
		bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty_acct(c, b);

	v = READ_ONCE(b->flags);
	do {
		old = new = v;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_interior;
		new |= 1 << BTREE_NODE_need_write;
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	printbuf_exit(&buf);
}

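/*
 * Insert keys from the front of @keys into @b, stopping at the first key
 * past @b's end - remaining keys belong to the next node:
 */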
static void
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter node_iter,
				struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		insert = bch2_keylist_front(keys);

		if (bpos_gt(insert->k.p, b->key.k.p))
			break;

		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
		bch2_keylist_pop_front(keys);
	}
}

/*
 * Split @b's keys between two new nodes, n[0] (which takes the lower part
 * of the key range) and n[1] (the higher part), picking a pivot roughly
 * 3/5 of the way through so the lower node ends up fuller:
 */
1441 | static void __btree_split_node(struct btree_update *as, |
1442 | struct btree_trans *trans, |
1443 | struct btree *b, |
1444 | struct btree *n[2]) |
1445 | { |
1446 | struct bkey_packed *k; |
1447 | struct bpos n1_pos = POS_MIN; |
1448 | struct btree_node_iter iter; |
1449 | struct bset *bsets[2]; |
1450 | struct bkey_format_state format[2]; |
1451 | struct bkey_packed *out[2]; |
1452 | struct bkey uk; |
1453 | unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; |
1454 | struct { unsigned nr_keys, val_u64s; } nr_keys[2]; |
1455 | int i; |
1456 | |
1457 | memset(&nr_keys, 0, sizeof(nr_keys)); |
1458 | |
1459 | for (i = 0; i < 2; i++) { |
1460 | BUG_ON(n[i]->nsets != 1); |
1461 | |
1462 | bsets[i] = btree_bset_first(b: n[i]); |
1463 | out[i] = bsets[i]->start; |
1464 | |
1465 | SET_BTREE_NODE_SEQ(k: n[i]->data, v: BTREE_NODE_SEQ(k: b->data) + 1); |
1466 | bch2_bkey_format_init(&format[i]); |
1467 | } |
1468 | |
1469 | u64s = 0; |
1470 | for_each_btree_node_key(b, k, &iter) { |
1471 | if (bkey_deleted(k)) |
1472 | continue; |
1473 | |
1474 | uk = bkey_unpack_key(b, src: k); |
1475 | |
1476 | if (b->c.level && |
1477 | u64s < n1_u64s && |
1478 | u64s + k->u64s >= n1_u64s && |
1479 | bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p)) |
1480 | n1_u64s += k->u64s; |
1481 | |
1482 | i = u64s >= n1_u64s; |
1483 | u64s += k->u64s; |
1484 | if (!i) |
1485 | n1_pos = uk.p; |
1486 | bch2_bkey_format_add_key(s: &format[i], k: &uk); |
1487 | |
1488 | nr_keys[i].nr_keys++; |
1489 | nr_keys[i].val_u64s += bkeyp_val_u64s(format: &b->format, k); |
1490 | } |
1491 | |
1492 | btree_set_min(b: n[0], pos: b->data->min_key); |
1493 | btree_set_max(b: n[0], pos: n1_pos); |
1494 | btree_set_min(b: n[1], pos: bpos_successor(p: n1_pos)); |
1495 | btree_set_max(b: n[1], pos: b->data->max_key); |
1496 | |
1497 | for (i = 0; i < 2; i++) { |
1498 | bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); |
1499 | bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); |
1500 | |
1501 | n[i]->data->format = bch2_bkey_format_done(&format[i]); |
1502 | |
1503 | unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + |
1504 | nr_keys[i].val_u64s; |
1505 | if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b)) |
1506 | n[i]->data->format = b->format; |
1507 | |
1508 | btree_node_set_format(b: n[i], f: n[i]->data->format); |
1509 | } |
1510 | |
1511 | u64s = 0; |
1512 | for_each_btree_node_key(b, k, &iter) { |
1513 | if (bkey_deleted(k)) |
1514 | continue; |
1515 | |
1516 | i = u64s >= n1_u64s; |
1517 | u64s += k->u64s; |
1518 | |
1519 | if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) |
1520 | ? &b->format: &bch2_bkey_format_current, k)) |
1521 | out[i]->format = KEY_FORMAT_LOCAL_BTREE; |
1522 | else |
1523 | bch2_bkey_unpack(b, (void *) out[i], k); |
1524 | |
1525 | out[i]->needs_whiteout = false; |
1526 | |
1527 | btree_keys_account_key_add(&n[i]->nr, 0, out[i]); |
1528 | out[i] = bkey_p_next(out[i]); |
1529 | } |
1530 | |
1531 | for (i = 0; i < 2; i++) { |
1532 | bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); |
1533 | |
1534 | BUG_ON(!bsets[i]->u64s); |
1535 | |
1536 | set_btree_bset_end(b: n[i], t: n[i]->set); |
1537 | |
1538 | btree_node_reset_sib_u64s(b: n[i]); |
1539 | |
1540 | bch2_verify_btree_nr_keys(b: n[i]); |
1541 | |
1542 | BUG_ON(bch2_btree_node_check_topology(trans, n[i])); |
1543 | } |
1544 | } |
1545 | |
1546 | /* |
1547 | * For updates to interior nodes, we've got to do the insert before we split |
1548 | * because the stuff we're inserting has to be inserted atomically. Post split, |
1549 | * the keys might have to go in different nodes and the split would no longer be |
1550 | * atomic. |
1551 | * |
1552 | * Worse, if the insert is from btree node coalescing, if we do the insert after |
1553 | * we do the split (and pick the pivot) - the pivot we pick might be between |
1554 | * nodes that were coalesced, and thus in the middle of a child node post |
1555 | * coalescing: |
1556 | */ |
1557 | static void btree_split_insert_keys(struct btree_update *as, |
1558 | struct btree_trans *trans, |
1559 | btree_path_idx_t path_idx, |
1560 | struct btree *b, |
1561 | struct keylist *keys) |
1562 | { |
1563 | struct btree_path *path = trans->paths + path_idx; |
1564 | |
1565 | if (!bch2_keylist_empty(l: keys) && |
1566 | bpos_le(l: bch2_keylist_front(l: keys)->k.p, r: b->data->max_key)) { |
1567 | struct btree_node_iter node_iter; |
1568 | |
1569 | bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(l: keys)->k.p); |
1570 | |
1571 | bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); |
1572 | |
1573 | BUG_ON(bch2_btree_node_check_topology(trans, b)); |
1574 | } |
1575 | } |
1576 | |
static int btree_split(struct btree_update *as, struct btree_trans *trans,
		       btree_path_idx_t path, struct btree *b,
		       struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree *parent = btree_node_parent(trans->paths + path, b);
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	btree_path_idx_t path1 = 0, path2 = 0;
	u64 start_time = local_clock();
	int ret = 0;

	bch2_verify_btree_nr_keys(b);
	BUG_ON(!parent && (b != btree_node_root(c, b)));
	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));

	ret = bch2_btree_node_check_topology(trans, b);
	if (ret)
		return ret;

	bch2_btree_interior_update_will_free_node(as, b);

	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
		struct btree *n[2];

		trace_and_count(c, btree_node_split, trans, b);

		n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
		n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);

		__btree_split_node(as, trans, b, n);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			btree_split_insert_keys(as, trans, path, n2, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);

		bch2_btree_update_add_new_node(as, n1);
		bch2_btree_update_add_new_node(as, n2);
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
		six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path2, n2);
		/*
		 * Note that on recursive splits, parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying
		 * it out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
			n3 = __btree_root_alloc(as, trans, b->c.level + 1);

			bch2_btree_update_add_new_node(as, n3);
			six_unlock_write(&n3->c.lock);

			trans->paths[path2].locks_want++;
			BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
			six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
			mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
			bch2_btree_path_level_init(trans, trans->paths + path2, n3);

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
		}
	} else {
		trace_and_count(c, btree_node_compact, trans, b);

		n1 = bch2_btree_node_alloc_replacement(as, trans, b);

		if (keys) {
			btree_split_insert_keys(as, trans, path, n1, keys);
			BUG_ON(!bch2_keylist_empty(keys));
		}

		bch2_btree_build_aux_trees(n1);
		bch2_btree_update_add_new_node(as, n1);
		six_unlock_write(&n1->c.lock);

		path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
		six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
		mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
		bch2_btree_path_level_init(trans, trans->paths + path1, n1);

		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
	}
	/* New nodes all initialized, now make them visible: */

	if (parent) {
		/* Split a non-root node */
		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	} else if (n3) {
		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
	} else {
		/* Root filled up but didn't need to be split */
		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
	}

	if (ret)
		goto err;

	if (n3) {
		bch2_btree_update_get_open_buckets(as, n3);
		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
	}
	if (n2) {
		bch2_btree_update_get_open_buckets(as, n2);
		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
	}
	bch2_btree_update_get_open_buckets(as, n1);
	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
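	/*
	 * A sketch of that race (t1, t2 are hypothetical threads):
	 *
	 *	t1: updates the new node, unlocks it
	 *	t2: read locks the old node - still findable in the btree
	 *	    cache - and sees data that t1's update superseded
	 *
	 * Freeing the old node in memory first means a concurrent
	 * traversal can only find the new node:
	 */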
	bch2_btree_node_free_inmem(trans, trans->paths + path, b);

	if (n3)
		bch2_trans_node_add(trans, trans->paths + path, n3);
	if (n2)
		bch2_trans_node_add(trans, trans->paths + path2, n2);
	bch2_trans_node_add(trans, trans->paths + path1, n1);

	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);
out:
	if (path2) {
		__bch2_btree_path_unlock(trans, trans->paths + path2);
		bch2_path_put(trans, path2, true);
	}
	if (path1) {
		__bch2_btree_path_unlock(trans, trans->paths + path1);
		bch2_path_put(trans, path1, true);
	}

	bch2_trans_verify_locks(trans);

	bch2_time_stats_update(&c->times[n2
			       ? BCH_TIME_btree_node_split
			       : BCH_TIME_btree_node_compact],
			       start_time);
	return ret;
err:
	if (n3)
		bch2_btree_node_free_never_used(as, trans, n3);
	if (n2)
		bch2_btree_node_free_never_used(as, trans, n2);
	bch2_btree_node_free_never_used(as, trans, n1);
	goto out;
}

/**
 * bch2_btree_insert_node - insert bkeys into a given btree node
 *
 * @as:		btree_update object
 * @trans:	btree_trans object
 * @path_idx:	path that points to current node
 * @b:		node to insert keys into
 * @keys:	list of keys to insert
 *
 * Returns: 0 on success, typically transaction restart error on failure
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				  btree_path_idx_t path_idx, struct btree *b,
				  struct keylist *keys)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx, *linked;
	unsigned i;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;
	int ret;

	lockdep_assert_held(&c->gc_lock);
	BUG_ON(!btree_node_intent_locked(path, b->c.level));
	BUG_ON(!b->c.level);
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

	ret = bch2_btree_node_lock_write(trans, path, &b->c);
	if (ret)
		return ret;

	bch2_btree_node_prep_for_write(trans, path, b);

	if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
		bch2_btree_node_unlock_write(trans, path, b);
		goto split;
	}

	ret = bch2_btree_node_check_topology(trans, b);
	if (ret) {
		bch2_btree_node_unlock_write(trans, path, b);
		return ret;
	}

	bch2_btree_insert_keys_interior(as, trans, path, b,
					path->l[b->c.level].iter, keys);

	trans_for_each_path_with_node(trans, b, linked, i)
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);

	bch2_trans_verify_paths(trans);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	btree_update_updated_node(as, b);
	bch2_btree_node_unlock_write(trans, path, b);

	BUG_ON(bch2_btree_node_check_topology(trans, b));
	return 0;
split:
	/*
	 * We could attempt to avoid the transaction restart, by calling
	 * bch2_btree_path_upgrade() and allocating more nodes:
	 */
	if (b->c.level >= as->update_level_end) {
		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
	}

	return btree_split(as, trans, path_idx, b, keys);
}
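
/*
 * Caller pattern - a minimal sketch (see btree_split() above and
 * __bch2_foreground_maybe_merge() below for the real callers):
 *
 *	ret = bch2_btree_insert_node(as, trans, path_idx, parent, keys);
 *	if (ret)	// e.g. -BCH_ERR_transaction_restart_split_race
 *		goto err;	// the btree_trans commit loop retries
 *
 * A split of a node at or above as->update_level_end isn't handled here;
 * it's surfaced as a transaction restart so the whole update can be redone.
 */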

int bch2_btree_split_leaf(struct btree_trans *trans,
			  btree_path_idx_t path,
			  unsigned flags)
{
	/* btree_split & merge may both cause paths array to be reallocated */
	struct btree *b = path_l(trans->paths + path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, trans->paths + path,
				     trans->paths[path].level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	ret = btree_split(as, trans, path, b, NULL);
	if (ret) {
		bch2_btree_update_free(as, trans);
		return ret;
	}

	bch2_btree_update_done(as, trans);

	for (l = trans->paths[path].level + 1;
	     btree_node_intent_locked(&trans->paths[path], l) && !ret;
	     l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}

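/*
 * Add a new root one level up, with the current root as its only child:
 * used (e.g. by btree node scan/repair) when keys need to be inserted
 * above the current root.
 */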
static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
				   btree_path_idx_t path_idx)
{
	struct bch_fs *c = as->c;
	struct btree_path *path = trans->paths + path_idx;
	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;

	BUG_ON(!btree_node_locked(path, b->c.level));

	n = __btree_root_alloc(as, trans, b->c.level + 1);

	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	path->locks_want++;
	BUG_ON(btree_node_locked(path, n->c.level));
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, path, n);

	n->sib_u64s[0] = U16_MAX;
	n->sib_u64s[1] = U16_MAX;

	bch2_keylist_add(&as->parent_keys, &b->key);
	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);

	int ret = bch2_btree_set_root(as, trans, path, n, true);
	BUG_ON(ret);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
	bch2_trans_node_add(trans, path, n);
	six_unlock_intent(&n->c.lock);

	mutex_lock(&c->btree_cache.lock);
	list_add_tail(&b->list, &c->btree_cache.live);
	mutex_unlock(&c->btree_cache.lock);

	bch2_trans_verify_locks(trans);
}

int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;

	if (btree_node_fake(b))
		return bch2_btree_split_leaf(trans, path, flags);

	struct btree_update *as =
		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	__btree_increase_depth(as, trans, path);
	bch2_btree_update_done(as, trans);
	return 0;
}

int __bch2_foreground_maybe_merge(struct btree_trans *trans,
				  btree_path_idx_t path,
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
	struct bpos sib_pos;
	size_t sib_u64s;
	enum btree_id btree = trans->paths[path].btree_id;
	btree_path_idx_t sib_path = 0, new_path = 0;
	u64 start_time = local_clock();
	int ret = 0;

	BUG_ON(!trans->paths[path].should_be_locked);
	BUG_ON(!btree_node_locked(&trans->paths[path], level));

	/*
	 * Work around a deadlock caused by the btree write buffer not doing
	 * merges and leaving tons of merges for us to do - we really don't need
	 * to be doing merges at all from the interior update path, and if the
	 * interior update path is generating too many new interior updates we
	 * deadlock:
	 */
	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
		return 0;

	if ((flags & BCH_WATERMARK_MASK) <= BCH_WATERMARK_reclaim) {
		flags &= ~BCH_WATERMARK_MASK;
		flags |= BCH_WATERMARK_btree;
		flags |= BCH_TRANS_COMMIT_journal_reclaim;
	}

	b = trans->paths[path].l[level].b;

	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
	    (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
		b->sib_u64s[sib] = U16_MAX;
		return 0;
	}

	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	sib_path = bch2_path_get(trans, btree, sib_pos,
				 U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
	if (ret)
		goto err;

	btree_path_set_should_be_locked(trans->paths + sib_path);

	m = trans->paths[sib_path].l[level].b;

	if (btree_node_parent(trans->paths + path, b) !=
	    btree_node_parent(trans->paths + sib_path, m)) {
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

	if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bpos_to_text(&buf1, prev->data->max_key);
		bch2_bpos_to_text(&buf2, next->data->min_key);
		bch_err(c,
			"%s(): btree topology error:\n"
			" prev ends at %s\n"
			" next starts at %s",
			__func__, buf1.buf, buf2.buf);
		printbuf_exit(&buf1);
		printbuf_exit(&buf2);
		ret = bch2_topology_error(c);
		goto err;
	}

	bch2_bkey_format_init(&new_s);
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) +
		btree_node_u64s_with_format(m->nr, &m->format, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}
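
	/*
	 * Worked example with illustrative numbers (not the real
	 * constants): if the hysteresis point is 1000 u64s and the two
	 * nodes would merge to 1400, we cache 1000 + (1400 - 1000) / 2 =
	 * 1200 in sib_u64s. The cached value is only a trigger - it's
	 * decremented as keys are deleted, and a merge is reattempted
	 * once it drops below btree_foreground_merge_threshold - so
	 * halving the excess keeps it from starting too far above the
	 * threshold, while the hysteresis gap prevents merge/split
	 * ping-ponging.
	 */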

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
	b->sib_u64s[sib] = sib_u64s;

	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
		goto out;

	parent = btree_node_parent(trans->paths + path, b);
	as = bch2_btree_update_start(trans, trans->paths + path, level, false,
				     BCH_TRANS_COMMIT_no_enospc|flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_and_count(c, btree_node_merge, trans, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	n->data->format = new_f;
	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
	if (ret)
		goto err_free_update;

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);

	bch2_trans_node_add(trans, trans->paths + path, n);

	bch2_trans_verify_paths(trans);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
		ret = 0;
	if (!ret)
		ret = bch2_trans_relock(trans);
	return ret;
err_free_update:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
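/*
 * Replace @b with a newly allocated node with identical contents, written
 * to a new location on disk; callers include the data move and repair
 * paths.
 */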
int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *n, *parent;
	struct btree_update *as;
	btree_path_idx_t new_path = 0;
	int ret;

	flags |= BCH_TRANS_COMMIT_no_enospc;

	struct btree_path *path = btree_iter_path(trans, iter);
	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
	bch2_btree_update_add_new_node(as, n);
	six_unlock_write(&n->c.lock);

	new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
	mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
	bch2_btree_path_level_init(trans, trans->paths + new_path, n);

	trace_and_count(c, btree_node_rewrite, trans, b);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
	} else {
		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
	}

	if (ret)
		goto err;

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);

	bch2_trans_node_add(trans, trans->paths + iter->path, n);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as, trans);
out:
	if (new_path)
		bch2_path_put(trans, new_path, true);
	bch2_trans_downgrade(trans);
	return ret;
err:
	bch2_btree_node_free_never_used(as, trans, n);
	bch2_btree_update_free(as, trans);
	goto out;
}
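/*
 * Enough state to re-find a btree node without holding a reference to it:
 * the node is looked up again by btree_id/level/pos, and @seq guards
 * against it having since been freed and reallocated elsewhere.
 */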
struct async_btree_rewrite {
	struct bch_fs		*c;
	struct work_struct	work;
	struct list_head	list;
	enum btree_id		btree_id;
	unsigned		level;
	struct bpos		pos;
	__le64			seq;
};

static int async_btree_node_rewrite_trans(struct btree_trans *trans,
					  struct async_btree_rewrite *a)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
				  BTREE_MAX_DEPTH, a->level, 0);
	b = bch2_btree_iter_peek_node(&iter);
	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

	if (!b || b->data->keys.seq != a->seq) {
		struct printbuf buf = PRINTBUF;

		if (b)
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		else
			prt_str(&buf, "(null)");
		bch_info(c, "%s: node to rewrite not found; searching for seq %llu, got\n%s",
			 __func__, a->seq, buf.buf);
		printbuf_exit(&buf);
		goto out;
	}

	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
out:
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

static void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;
	int ret;

	ret = bch2_trans_do(c, NULL, NULL, 0,
			    async_btree_node_rewrite_trans(trans, a));
	bch_err_fn_ratelimited(c, ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
	kfree(a);
}

void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;
	int ret;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		bch_err(c, "%s: error allocating memory", __func__);
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
		mutex_lock(&c->pending_node_rewrites_lock);
		list_add(&a->list, &c->pending_node_rewrites);
		mutex_unlock(&c->pending_node_rewrites_lock);
		return;
	}

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		if (test_bit(BCH_FS_started, &c->flags)) {
			bch_err(c, "%s: error getting c->writes ref", __func__);
			kfree(a);
			return;
		}

		ret = bch2_fs_read_write_early(c);
		bch_err_msg(c, ret, "going read-write");
		if (ret) {
			kfree(a);
			return;
		}

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
	}

	queue_work(c->btree_node_rewrite_worker, &a->work);
}
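/*
 * Rewrites requested before BCH_FS_may_go_rw is set are parked on
 * c->pending_node_rewrites: kick them off here once we're going
 * read-write, or drop them (below) if we never do.
 */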
void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_node_rewrite_worker, &a->work);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}

void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);

		kfree(a);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
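/*
 * Change @b's key, i.e. the pointer to where it lives on disk. If the new
 * key hashes differently, @new_hash is a preallocated shell node inserted
 * into the hash table under the new key, so lookups keep working while the
 * commit is in flight; after the commit, @b itself is rehashed under the
 * new key and the caller releases @new_hash.
 */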
static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					unsigned commit_flags,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
					   bkey_i_to_s_c(&b->key),
					   BTREE_TRIGGER_TRANSACTIONAL) ?:
			bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
					     bkey_i_to_s(new_key),
					     BTREE_TRIGGER_TRANSACTIONAL);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
						  new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(btree_iter_path(trans, iter), b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
						      iter2.flags & BTREE_ITER_INTENT,
						      _THIS_IP_);

		struct btree_path *path2 = btree_iter_path(trans, &iter2);
		BUG_ON(path2->level != b->c.level);
		BUG_ON(!bpos_eq(path2->pos, new_key->k.p));

		btree_path_set_level_up(trans, path2);

		trans->paths_sorted = false;

		ret = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
						jset_u64s(new_key->k.u64s));
		ret = PTR_ERR_OR_ZERO(e);
		if (ret)
			return ret;

		journal_entry_set(e,
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
	if (ret)
		goto err;

	bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}

int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       unsigned commit_flags, bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = btree_iter_path(trans, iter);
	struct closure cl;
	int ret = 0;

	ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1);
	if (ret)
		return ret;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		if (ret) {
			ret = drop_locks_do(trans, (closure_sync(&cl), 0));
			if (ret)
				return ret;
		}

		new_hash = bch2_btree_node_mem_alloc(trans, false);
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key,
					   commit_flags, skip_triggers);
	--path->intent_ref;

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		list_move(&new_hash->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		six_unlock_write(&new_hash->c.lock);
		six_unlock_intent(&new_hash->c.lock);
	}
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(trans);
	return ret;
}

int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					unsigned commit_flags, bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has node been freed? */
	if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	struct bch_extent_ptr *ptr;
	bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
			    !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
					 commit_flags, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Init code: */

/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}

static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
{
	struct bch_fs *c = trans->c;
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(trans, false);
	bch2_btree_cache_cannibalize_unlock(trans);

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level	= level;
	b->c.btree_id	= id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
	return 0;
}

void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
{
	bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
}

static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
	prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   (void *) as->ip_started,
		   bch2_btree_id_str(as->btree_id),
		   as->update_level_start,
		   as->update_level_end,
		   bch2_watermarks[as->watermark],
		   bch2_btree_update_modes[as->mode],
		   as->nodes_written,
		   closure_nr_remaining(&as->cl),
		   as->journal.seq);
}

void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		bch2_btree_update_to_text(out, as);
	mutex_unlock(&c->btree_interior_update_lock);
}

static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}

void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry)
{
	struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);

	mutex_lock(&c->btree_root_lock);

	r->level = entry->level;
	r->alive = true;
	bkey_copy(&r->key, (struct bkey_i *) entry->start);

	mutex_unlock(&c->btree_root_lock);
}

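/*
 * Append a BCH_JSET_ENTRY_btree_root entry at @end for every live btree
 * root, except those whose btree ID is set in the @skip bitmask; returns
 * the new end of the entries.
 */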
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *end,
				    unsigned long skip)
{
	unsigned i;

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->alive && !test_bit(i, &skip)) {
			journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
					  i, r->level, &r->key, r->key.k.u64s);
			end = vstruct_next(end);
		}
	}

	mutex_unlock(&c->btree_root_lock);

	return end;
}

void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	if (c->btree_node_rewrite_worker)
		destroy_workqueue(c->btree_node_rewrite_worker);
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->pending_node_rewrites);
	mutex_init(&c->pending_node_rewrites_lock);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
	if (!c->btree_interior_update_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	c->btree_node_rewrite_worker =
		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
	if (!c->btree_node_rewrite_worker)
		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
				      sizeof(struct btree_update)))
		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;

	return 0;
}