// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "snapshot.h"

#include <linux/prefetch.h>

static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bch_fs *c = trans->c;
	struct bkey u;
	struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);

	if (unlikely(trans->journal_replay_not_finished)) {
		struct bkey_i *j_k =
			bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);

		if (j_k)
			k = bkey_i_to_s_c(j_k);
	}

	u = *k.k;
	u.needs_whiteout = i->old_k.needs_whiteout;

	BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
	BUG_ON(i->old_v != k.v);
#endif
}

static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
{
	return (trans->paths + i->path)->l + i->level;
}

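/*
 * Updates in a transaction are kept sorted by btree, then by position, so
 * entries that land in the same leaf node are adjacent in trans->updates.
 * The two helpers below exploit that ordering so that each leaf is write
 * locked, unlocked and space-accounted exactly once per commit.
 */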
static inline bool same_leaf_as_prev(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i != trans->updates &&
		insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
}

static inline bool same_leaf_as_next(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i + 1 < trans->updates + trans->nr_updates &&
		insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
}

inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
					   struct btree_path *path,
					   struct btree *b)
{
	struct bch_fs *c = trans->c;

	if (unlikely(btree_node_just_written(b)) &&
	    bch2_btree_post_write_cleanup(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	/*
	 * If the last bset has been written, or if it's gotten too big - start
	 * a new bset to insert into:
	 */
	if (want_new_bset(c, b))
		bch2_btree_init_next(trans, b);
}

static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
	while (--i >= trans->updates) {
		if (same_leaf_as_prev(trans, i))
			continue;

		bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
	}

	trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
	return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}

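/*
 * Take the write lock on every leaf node we're inserting into, in btree/key
 * order - taking locks in a consistent order is what lets us avoid deadlocking
 * against other threads. If a nonblocking lock attempt fails,
 * trans_lock_write_fail() unwinds the locks already taken and restarts the
 * transaction.
 */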
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
	EBUG_ON(trans->write_locked);

	trans_for_each_update(trans, i) {
		if (same_leaf_as_prev(trans, i))
			continue;

		if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
			return trans_lock_write_fail(trans, i);

		if (!i->cached)
			bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
	}

	trans->write_locked = true;
	return 0;
}

static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
	if (likely(trans->write_locked)) {
		trans_for_each_update(trans, i)
			if (!same_leaf_as_prev(trans, i))
				bch2_btree_node_unlock_write_inlined(trans,
						trans->paths + i->path, insert_l(trans, i)->b);
		trans->write_locked = false;
	}
}

/* Inserting into a given leaf node (last stage of insert): */

/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct btree_node_iter *node_iter,
				struct bkey_i *insert)
{
	struct bkey_packed *k;
	unsigned clobber_u64s = 0, new_u64s = 0;

	EBUG_ON(btree_node_just_written(b));
	EBUG_ON(bset_written(b, btree_bset_last(b)));
	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
	EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
	EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
	EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
	EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));

	k = bch2_btree_node_iter_peek_all(node_iter, b);
	if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
		k = NULL;

	/* @k is the key being overwritten/deleted, if any: */
	EBUG_ON(k && bkey_deleted(k));

	/* Deleting, but not found? nothing to do: */
	if (bkey_deleted(&insert->k) && !k)
		return false;

	if (bkey_deleted(&insert->k)) {
		/* Deleting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		if (k->needs_whiteout)
			push_whiteout(b, insert->k.p);
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			bch2_bset_delete(b, k, clobber_u64s);
			goto fix_iter;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}

		return true;
	}

	if (k) {
		/* Overwriting: */
		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		insert->k.needs_whiteout = k->needs_whiteout;
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			goto overwrite;
		} else {
			bch2_btree_path_fix_key_modified(trans, b, k);
		}
	}

	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
	new_u64s = k->u64s;
fix_iter:
	if (clobber_u64s != new_u64s)
		bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
					 clobber_u64s, new_u64s);
	return true;
}

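/*
 * Journal pin flush callbacks: journal reclaim calls these when it needs the
 * journal entry pinned by a dirty btree node to be freed up - flushing the
 * pin means writing out the btree node. Each node has two btree_write
 * structures (writes[0] and writes[1]), so a write already in flight and a
 * newer set of dirty keys can pin two different journal sequence numbers at
 * once; flush0/flush1 below select which of the two is being flushed.
 */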
static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			      unsigned i, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct btree_write *w = container_of(pin, struct btree_write, journal);
	struct btree *b = container_of(w, struct btree, writes[i]);
	struct btree_trans *trans = bch2_trans_get(c);
	unsigned long old, new, v;
	unsigned idx = w - b->writes;

	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
	v = READ_ONCE(b->flags);

	do {
		old = new = v;

		if (!(old & (1 << BTREE_NODE_dirty)) ||
		    !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
		    w->journal.seq != seq)
			break;

		new &= ~BTREE_WRITE_TYPE_MASK;
		new |= BTREE_WRITE_journal_reclaim;
		new |= 1 << BTREE_NODE_need_write;
	} while ((v = cmpxchg(&b->flags, old, new)) != old);

	btree_node_write_if_need(c, b, SIX_LOCK_read);
	six_unlock_read(&b->c.lock);

	bch2_trans_put(trans);
	return 0;
}

int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 0, seq);
}

int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 1, seq);
}

inline void bch2_btree_add_journal_pin(struct bch_fs *c,
				       struct btree *b, u64 seq)
{
	struct btree_write *w = btree_current_write(b);

	bch2_journal_pin_add(&c->journal, seq, &w->journal,
			     btree_node_write_idx(b) == 0
			     ? bch2_btree_node_flush0
			     : bch2_btree_node_flush1);
}

/**
 * bch2_btree_insert_key_leaf() - insert a key into a leaf node
 * @trans: btree transaction object
 * @path: path pointing to @insert's pos
 * @insert: key to insert
 * @journal_seq: sequence number of journal reservation
 */
inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
				       struct btree_path *path,
				       struct bkey_i *insert,
				       u64 journal_seq)
{
	struct bch_fs *c = trans->c;
	struct btree *b = path_l(path)->b;
	struct bset_tree *t = bset_tree_last(b);
	struct bset *i = bset(b, t);
	int old_u64s = bset_u64s(t);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
					&path_l(path)->iter, insert)))
		return;

	i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));

	bch2_btree_add_journal_pin(c, b, journal_seq);

	if (unlikely(!btree_node_dirty(b))) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_btree_node_dirty_acct(c, b);
	}

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) bset_u64s(t) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);
}

/* Cached btree updates: */

/* Normal update interface: */

static inline void btree_insert_entry_checks(struct btree_trans *trans,
					     struct btree_insert_entry *i)
{
	struct btree_path *path = trans->paths + i->path;

	BUG_ON(!bpos_eq(i->k->k.p, path->pos));
	BUG_ON(i->cached != path->cached);
	BUG_ON(i->level != path->level);
	BUG_ON(i->btree_id != path->btree_id);
	EBUG_ON(!i->level &&
		btree_type_has_snapshots(i->btree_id) &&
		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
		i->k->k.p.snapshot &&
		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
}

static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
						      unsigned flags)
{
	return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
				    trans->journal_u64s, flags);
}

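/*
 * When the journal_transaction_names option is enabled, we log the name of
 * the function that started the transaction in a BCH_JSET_ENTRY_log entry,
 * which helps match journal contents back to the code that generated them.
 * JSET_ENTRY_LOG_U64s bounds how much of the name gets logged.
 */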
#define JSET_ENTRY_LOG_U64s	4

static noinline void journal_transaction_name(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct jset_entry *entry =
		bch2_journal_add_entry(j, &trans->journal_res,
				       BCH_JSET_ENTRY_log, 0, 0,
				       JSET_ENTRY_LOG_U64s);
	struct jset_entry_log *l =
		container_of(entry, struct jset_entry_log, entry);

	strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}

static inline int btree_key_can_insert(struct btree_trans *trans,
				       struct btree *b, unsigned u64s)
{
	if (!bch2_btree_node_insert_fits(b, u64s))
		return -BCH_ERR_btree_insert_btree_node_full;

	return 0;
}

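/*
 * Growing the key cache buffer may require a GFP_KERNEL allocation, which we
 * can't do while holding btree locks - so the slowpath drops all locks,
 * allocates, then relocks. Any pointers into the old buffer held by pending
 * updates have to be repointed at the new one.
 */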
static noinline int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
				     struct btree_path *path, unsigned new_u64s)
{
	struct bkey_cached *ck = (void *) path->l[0].b;
	struct bkey_i *new_k;
	int ret;

	bch2_trans_unlock_write(trans);
	bch2_trans_unlock(trans);

	new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
	if (!new_k) {
		bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
			bch2_btree_id_str(path->btree_id), new_u64s);
		return -BCH_ERR_ENOMEM_btree_key_cache_insert;
	}

	ret = bch2_trans_relock(trans) ?:
		bch2_trans_lock_write(trans);
	if (unlikely(ret)) {
		kfree(new_k);
		return ret;
	}

	memcpy(new_k, ck->k, ck->u64s * sizeof(u64));

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	kfree(ck->k);
	ck->u64s = new_u64s;
	ck->k = new_k;
	return 0;
}

static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
				       struct btree_path *path, unsigned u64s)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;
	unsigned new_u64s;
	struct bkey_i *new_k;
	unsigned watermark = flags & BCH_WATERMARK_MASK;

	EBUG_ON(path->level);

	if (watermark < BCH_WATERMARK_reclaim &&
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
	    bch2_btree_key_cache_must_wait(c))
		return -BCH_ERR_btree_insert_need_journal_reclaim;

	/*
	 * bch2_varint_decode can read past the end of the buffer by at most 7
	 * bytes (it won't be used):
	 */
	u64s += 1;

	if (u64s <= ck->u64s)
		return 0;

	new_u64s = roundup_pow_of_two(u64s);
	new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_k))
		return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);

	trans_for_each_update(trans, i)
		if (i->old_v == &ck->k->v)
			i->old_v = &new_k->v;

	ck->u64s = new_u64s;
	ck->k = new_k;
	return 0;
}

/* Triggers: */

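/*
 * Mem triggers run with btree write locks held and (normally) a journal
 * reservation already taken; they update in-memory accounting, and a failure
 * here is treated as a fatal filesystem error. Transactional triggers, below,
 * run earlier - before locks are taken - and may queue additional btree
 * updates on the transaction.
 */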
static int run_one_mem_trigger(struct btree_trans *trans,
			       struct btree_insert_entry *i,
			       unsigned flags)
{
	struct bkey_s_c old = { &i->old_k, i->old_v };
	struct bkey_i *new = i->k;
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
	int ret;

	verify_update_old_key(trans, i);

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (old_ops->trigger == new_ops->trigger) {
		ret = bch2_key_trigger(trans, i->btree_id, i->level,
				old, bkey_i_to_s(new),
				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
	} else {
		ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
				bkey_i_to_s(new), flags) ?:
			bch2_key_trigger_old(trans, i->btree_id, i->level,
				old, flags);
	}

	return ret;
}

static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
				 bool overwrite)
{
	/*
	 * Transactional triggers create new btree_insert_entries, so we can't
	 * pass them a pointer to a btree_insert_entry, that memory is going to
	 * move:
	 */
	struct bkey old_k = i->old_k;
	struct bkey_s_c old = { &old_k, i->old_v };
	const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
	const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
	unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;

	verify_update_old_key(trans, i);

	if ((i->flags & BTREE_TRIGGER_NORUN) ||
	    !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
		return 0;

	if (!i->insert_trigger_run &&
	    !i->overwrite_trigger_run &&
	    old_ops->trigger == new_ops->trigger) {
		i->overwrite_trigger_run = true;
		i->insert_trigger_run = true;
		return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
					BTREE_TRIGGER_INSERT|
					BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
	} else if (overwrite && !i->overwrite_trigger_run) {
		i->overwrite_trigger_run = true;
		return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
	} else if (!overwrite && !i->insert_trigger_run) {
		i->insert_trigger_run = true;
		return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
	} else {
		return 0;
	}
}

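/*
 * Run transactional triggers for every update to @btree_id until we hit a
 * fixpoint: a trigger may itself queue more updates, so we keep rescanning
 * until a full pass runs no new triggers. run_one_trans_trigger() returns > 0
 * when it actually ran something, 0 when there was nothing left to do.
 */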
static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
			      unsigned btree_id_start)
{
	bool trans_trigger_run;
	int ret, overwrite;

	for (overwrite = 1; overwrite >= 0; --overwrite) {

		/*
		 * Running triggers will append more updates to the list of updates as
		 * we're walking it:
		 */
		do {
			trans_trigger_run = false;

			for (unsigned i = btree_id_start;
			     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
			     i++) {
				if (trans->updates[i].btree_id != btree_id)
					continue;

				ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
				if (ret < 0)
					return ret;
				if (ret)
					trans_trigger_run = true;
			}
		} while (trans_trigger_run);
	}

	return 0;
}

static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
	unsigned btree_id = 0, btree_id_start = 0;
	int ret = 0;

	/*
	 * For a given btree, this algorithm runs insert triggers before
	 * overwrite triggers: this is so that when extents are being moved
	 * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
	 * they are re-added.
	 */
	for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
		if (btree_id == BTREE_ID_alloc)
			continue;

		while (btree_id_start < trans->nr_updates &&
		       trans->updates[btree_id_start].btree_id < btree_id)
			btree_id_start++;

		ret = run_btree_triggers(trans, btree_id, btree_id_start);
		if (ret)
			return ret;
	}

	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
		struct btree_insert_entry *i = trans->updates + idx;

		if (i->btree_id > BTREE_ID_alloc)
			break;
		if (i->btree_id == BTREE_ID_alloc) {
			ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
			if (ret)
				return ret;
			break;
		}
	}

#ifdef CONFIG_BCACHEFS_DEBUG
	trans_for_each_update(trans, i)
		BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
		       (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
		       (!i->insert_trigger_run || !i->overwrite_trigger_run));
#endif
	return 0;
}

static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
	trans_for_each_update(trans, i) {
		/*
		 * XXX: synchronization of cached update triggers with gc
		 * XXX: synchronization of interior node updates with gc
		 */
		BUG_ON(i->cached || i->level);

		if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
		    gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
			int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
			if (ret)
				return ret;
		}
	}

	return 0;
}

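/*
 * The core of the commit path: runs with write locks held on every leaf being
 * inserted into. Order matters here - we check that all the inserts fit
 * before taking a journal reservation, run atomic (mem) triggers and commit
 * hooks, then copy the keys into the journal reservation and finally into the
 * btree nodes themselves. Once the journal reservation is taken we're
 * committed: failures past that point are treated as fatal errors.
 */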
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
			       struct btree_insert_entry **stopped_at,
			       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_trans_commit_hook *h;
	unsigned u64s = 0;
	int ret;

	if (race_fault()) {
		trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
		return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
	}

	/*
	 * Check if the insert will fit in the leaf node with the write lock
	 * held, otherwise another thread could write the node changing the
	 * amount of space available:
	 */

	prefetch(&trans->c->journal.flags);

	trans_for_each_update(trans, i) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, i))
			u64s = 0;

		u64s += i->k->k.u64s;
		ret = !i->cached
			? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
			: btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
		if (ret) {
			*stopped_at = i;
			return ret;
		}

		i->k->k.needs_whiteout = false;
	}

	/*
	 * Don't get journal reservation until after we know insert will
	 * succeed:
	 */
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
		ret = bch2_trans_journal_res_get(trans,
				(flags & BCH_WATERMARK_MASK)|
				JOURNAL_RES_GET_NONBLOCK);
		if (ret)
			return ret;

		if (unlikely(trans->journal_transaction_names))
			journal_transaction_name(trans);
	}

	/*
	 * Not allowed to fail after we've gotten our journal reservation - we
	 * have to use it:
	 */

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
	    !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
		if (bch2_journal_seq_verify)
			trans_for_each_update(trans, i)
				i->k->k.version.lo = trans->journal_res.seq;
		else if (bch2_inject_invalid_keys)
			trans_for_each_update(trans, i)
				i->k->k.version = MAX_VERSION;
	}

	if (trans->fs_usage_deltas &&
	    bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
		return -BCH_ERR_btree_insert_need_mark_replicas;

	/* XXX: we only want to run this if deltas are nonzero */
	bch2_trans_account_disk_usage_change(trans);

	h = trans->hooks;
	while (h) {
		ret = h->fn(trans, h);
		if (ret)
			goto revert_fs_usage;
		h = h->next;
	}

	trans_for_each_update(trans, i)
		if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
			ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
			if (ret)
				goto fatal_err;
		}

	if (unlikely(c->gc_pos.phase)) {
		ret = bch2_trans_commit_run_gc_triggers(trans);
		if (ret)
			goto fatal_err;
	}

	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
		struct journal *j = &c->journal;
		struct jset_entry *entry;

		trans_for_each_update(trans, i) {
			if (i->key_cache_already_flushed)
				continue;

			if (i->flags & BTREE_UPDATE_NOJOURNAL)
				continue;

			verify_update_old_key(trans, i);

			if (trans->journal_transaction_names) {
				entry = bch2_journal_add_entry(j, &trans->journal_res,
						       BCH_JSET_ENTRY_overwrite,
						       i->btree_id, i->level,
						       i->old_k.u64s);
				bkey_reassemble((struct bkey_i *) entry->start,
						(struct bkey_s_c) { &i->old_k, i->old_v });
			}

			entry = bch2_journal_add_entry(j, &trans->journal_res,
					       BCH_JSET_ENTRY_btree_keys,
					       i->btree_id, i->level,
					       i->k->k.u64s);
			bkey_copy((struct bkey_i *) entry->start, i->k);
		}

		memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
				  trans->journal_entries,
				  trans->journal_entries_u64s);

		trans->journal_res.offset += trans->journal_entries_u64s;
		trans->journal_res.u64s -= trans->journal_entries_u64s;

		if (trans->journal_seq)
			*trans->journal_seq = trans->journal_res.seq;
	}

	trans_for_each_update(trans, i) {
		struct btree_path *path = trans->paths + i->path;

		if (!i->cached) {
			bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
		} else if (!i->key_cache_already_flushed)
			bch2_btree_insert_key_cached(trans, flags, i);
		else {
			bch2_btree_key_cache_drop(trans, path);
			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		}
	}

	return 0;
fatal_err:
	bch2_fatal_error(c);
revert_fs_usage:
	if (trans->fs_usage_deltas)
		bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
	return ret;
}

static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
	trans_for_each_update(trans, i)
		bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
}

static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
						   enum bkey_invalid_flags flags,
						   struct btree_insert_entry *i,
						   struct printbuf *err)
{
	struct bch_fs *c = trans->c;

	printbuf_reset(err);
	prt_printf(err, "invalid bkey on insert from %s -> %ps",
		   trans->fn, (void *) i->ip_allocated);
	prt_newline(err);
	printbuf_indent_add(err, 2);

	bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
	prt_newline(err);

	bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
	bch2_print_string_as_lines(KERN_ERR, err->buf);

	bch2_inconsistent_error(c);
	bch2_dump_trans_updates(trans);

	return -EINVAL;
}

static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
							    struct jset_entry *i)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
	prt_newline(&buf);
	printbuf_indent_add(&buf, 2);

	bch2_journal_entry_to_text(&buf, c, i);
	prt_newline(&buf);

	bch2_print_string_as_lines(KERN_ERR, buf.buf);

	bch2_inconsistent_error(c);
	bch2_dump_trans_updates(trans);

	return -EINVAL;
}

static int bch2_trans_commit_journal_pin_flush(struct journal *j,
				struct journal_entry_pin *_pin, u64 seq)
{
	return 0;
}

/*
 * Get journal reservation, take write locks, and attempt to do btree update(s):
 */
static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
				       struct btree_insert_entry **stopped_at,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	int ret = 0, u64s_delta = 0;

	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
		struct btree_insert_entry *i = trans->updates + idx;
		if (i->cached)
			continue;

		u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
		u64s_delta -= i->old_btree_u64s;

		if (!same_leaf_as_next(trans, i)) {
			if (u64s_delta <= 0) {
				ret = bch2_foreground_maybe_merge(trans, i->path,
							i->level, flags);
				if (unlikely(ret))
					return ret;
			}

			u64s_delta = 0;
		}
	}

	ret = bch2_trans_lock_write(trans);
	if (unlikely(ret))
		return ret;

	ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);

	if (!ret && unlikely(trans->journal_replay_not_finished))
		bch2_drop_overwrites_from_journal(trans);

	bch2_trans_unlock_write(trans);

	if (!ret && trans->journal_pin)
		bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
				     trans->journal_pin,
				     bch2_trans_commit_journal_pin_flush);

	/*
	 * Drop journal reservation after dropping write locks, since dropping
	 * the journal reservation may kick off a journal write:
	 */
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
		bch2_journal_res_put(&c->journal, &trans->journal_res);

	return ret;
}

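/*
 * Returns nonzero when it's safe to stop waiting for journal reclaim: either
 * the journal has errored out, or the key cache has drained enough that we no
 * longer have to wait on it; otherwise kick reclaim and keep waiting.
 */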
static int journal_reclaim_wait_done(struct bch_fs *c)
{
	int ret = bch2_journal_error(&c->journal) ?:
		!bch2_btree_key_cache_must_wait(c);

	if (!ret)
		journal_reclaim_kick(&c->journal);
	return ret;
}

static noinline
int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
			    struct btree_insert_entry *i,
			    int ret, unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;

	switch (ret) {
	case -BCH_ERR_btree_insert_btree_node_full:
		ret = bch2_btree_split_leaf(trans, i->path, flags);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			trace_and_count(c, trans_restart_btree_node_split, trans,
					trace_ip, trans->paths + i->path);
		break;
	case -BCH_ERR_btree_insert_need_mark_replicas:
		ret = drop_locks_do(trans,
			bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
		break;
	case -BCH_ERR_journal_res_get_blocked:
		/*
		 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
		 * flag
		 */
		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
		    watermark < BCH_WATERMARK_reclaim) {
			ret = -BCH_ERR_journal_reclaim_would_deadlock;
			break;
		}

		ret = drop_locks_do(trans,
			bch2_trans_journal_res_get(trans,
					(flags & BCH_WATERMARK_MASK)|
					JOURNAL_RES_GET_CHECK));
		break;
	case -BCH_ERR_btree_insert_need_journal_reclaim:
		bch2_trans_unlock(trans);

		trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);

		wait_event_freezable(c->journal.reclaim_wait,
				     (ret = journal_reclaim_wait_done(c)));
		if (ret < 0)
			break;

		ret = bch2_trans_relock(trans);
		break;
	default:
		BUG_ON(ret >= 0);
		break;
	}

	BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
				(flags & BCH_TRANS_COMMIT_no_enospc), c,
		"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));

	return ret;
}

static noinline int
bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
	    test_bit(BCH_FS_started, &c->flags))
		return -BCH_ERR_erofs_trans_commit;

	ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
	if (ret)
		return ret;

	bch2_write_ref_get(c, BCH_WRITE_REF_trans);
	return 0;
}

/*
 * This is for updates done in the early part of fsck - btree_gc - before we've
 * gone RW; we only add the new key to the list of keys for journal replay to
 * do.
 */
static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	trans_for_each_update(trans, i) {
		ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
		if (ret)
			break;
	}

	return ret;
}

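/*
 * Illustrative sketch (not part of this file): callers normally reach this
 * via the bch2_trans_commit()/bch2_trans_do() wrappers rather than calling it
 * directly, along the lines of:
 *
 *	ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 *		bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &new_key->k_i, 0));
 *
 * The wrapper names and flags above are assumptions from elsewhere in the
 * tree; the contract here is just: queue updates on @trans, then commit.
 */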
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
	struct btree_insert_entry *errored_at = NULL;
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (!trans->nr_updates &&
	    !trans->journal_entries_u64s)
		goto out_reset;

	memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));

	ret = bch2_trans_commit_run_triggers(trans);
	if (ret)
		goto out_reset;

	trans_for_each_update(trans, i) {
		struct printbuf buf = PRINTBUF;
		enum bkey_invalid_flags invalid_flags = 0;

		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
			invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;

		if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
					       i->bkey_type, invalid_flags, &buf)))
			ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
		btree_insert_entry_checks(trans, i);
		printbuf_exit(&buf);

		if (ret)
			return ret;
	}

	for (struct jset_entry *i = trans->journal_entries;
	     i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
	     i = vstruct_next(i)) {
		enum bkey_invalid_flags invalid_flags = 0;

		if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
			invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;

		if (unlikely(bch2_journal_entry_validate(c, NULL, i,
					bcachefs_metadata_version_current,
					CPU_BIG_ENDIAN, invalid_flags)))
			ret = bch2_trans_commit_journal_entry_invalid(trans, i);

		if (ret)
			return ret;
	}

	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
		ret = do_bch2_trans_commit_to_journal_replay(trans);
		goto out_reset;
	}

	if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
	    unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
		ret = bch2_trans_commit_get_rw_cold(trans, flags);
		if (ret)
			goto out_reset;
	}

	EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));

	trans->journal_u64s = trans->journal_entries_u64s;
	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
	if (trans->journal_transaction_names)
		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);

	trans_for_each_update(trans, i) {
		struct btree_path *path = trans->paths + i->path;

		EBUG_ON(!path->should_be_locked);

		ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
		if (unlikely(ret))
			goto out;

		EBUG_ON(!btree_node_intent_locked(path, i->level));

		if (i->key_cache_already_flushed)
			continue;

		if (i->flags & BTREE_UPDATE_NOJOURNAL)
			continue;

		/* we're going to journal the key being updated: */
		trans->journal_u64s += jset_u64s(i->k->k.u64s);

		/* and we're also going to log the overwrite: */
		if (trans->journal_transaction_names)
			trans->journal_u64s += jset_u64s(i->old_k.u64s);
	}

	if (trans->extra_disk_res) {
		ret = bch2_disk_reservation_add(c, trans->disk_res,
				trans->extra_disk_res,
				(flags & BCH_TRANS_COMMIT_no_enospc)
				? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			goto err;
	}
retry:
	errored_at = NULL;
	bch2_trans_verify_not_in_restart(trans);
	if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
		memset(&trans->journal_res, 0, sizeof(trans->journal_res));

	ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);

	/* make sure we didn't drop or screw up locks: */
	bch2_trans_verify_locks(trans);

	if (ret)
		goto err;

	trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
	if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
	if (!ret)
		bch2_trans_downgrade(trans);
	bch2_trans_reset_updates(trans);

	return ret;
err:
	ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
	if (ret)
		goto out;

	/*
	 * We might have done another transaction commit in the error path -
	 * i.e. btree write buffer flush - which will have made use of
	 * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
	 * how the journal sequence number to pin is passed in - so we must
	 * restart:
	 */
	if (flags & BCH_TRANS_COMMIT_no_journal_res) {
		ret = -BCH_ERR_transaction_restart_nested;
		goto out;
	}

	goto retry;
}