/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H

#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_update.h"

#define BTREE_UPDATE_NODES_MAX		((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)

#define BTREE_UPDATE_JOURNAL_RES	(BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
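
/*
 * Sizing sketch (illustrative, not upstream prose; assumes BTREE_MAX_DEPTH = 4
 * and GC_MERGE_NODES = 4): BTREE_UPDATE_NODES_MAX works out to
 * (4 - 2) * 2 + 4 = 8 - roughly two nodes per interior level a split can
 * cascade through, plus headroom for node coalescing.  The journal
 * reservation then budgets, per node, one maximally-sized btree pointer key
 * plus one u64, presumably for the jset_entry header.
 */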

int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);

#define BTREE_UPDATE_MODES()	\
	x(none)			\
	x(node)			\
	x(root)			\
	x(update)

enum btree_update_mode {
#define x(n)	BTREE_UPDATE_##n,
	BTREE_UPDATE_MODES()
#undef x
};

/*
 * Tracks an in-progress split/rewrite of a btree node and the update to the
 * parent node:
 *
 * When we split/rewrite a node, we do all the updates in memory without
 * waiting for any writes to complete - we allocate the new node(s) and update
 * the parent node, possibly recursively up to the root.
 *
 * The end result is that we have one or more new nodes being written -
 * possibly several, if there were multiple splits - and then a write (updating
 * an interior node) which will make all these new nodes visible.
 *
 * Additionally, as we split/rewrite nodes we free the old nodes - but the old
 * nodes can't be freed (their space on disk can't be reclaimed) until the
 * update to the interior node that makes the new node visible completes -
 * until then, the old nodes are still reachable on disk.
 */
struct btree_update {
	struct closure			cl;
	struct bch_fs			*c;
	u64				start_time;
	unsigned long			ip_started;

	struct list_head		list;
	struct list_head		unwritten_list;

	enum btree_update_mode		mode;
	enum bch_watermark		watermark;
	unsigned			nodes_written:1;
	unsigned			took_gc_lock:1;

	enum btree_id			btree_id;
	unsigned			update_level_start;
	unsigned			update_level_end;

	struct disk_reservation		disk_res;

	/*
	 * BTREE_UPDATE_node:
	 * The update that made the new nodes visible was a regular update to an
	 * existing interior node - @b. We can't write out the update to @b
	 * until the new nodes we created are finished writing, so we block @b
	 * from writing by putting this btree_update on the
	 * @b->write_blocked list with @write_blocked_list:
	 */
	struct btree			*b;
	struct list_head		write_blocked_list;

	/*
	 * We may be freeing nodes that were dirty, and thus had journal entries
	 * pinned: we need to transfer the oldest of those pins to the
	 * btree_update operation, and release it when the new node(s)
	 * are all persistent and reachable:
	 */
	struct journal_entry_pin	journal;

	/* Preallocated nodes we reserve when we start the update: */
	struct prealloc_nodes {
		struct btree		*b[BTREE_UPDATE_NODES_MAX];
		unsigned		nr;
	}				prealloc_nodes[2];

	/* Nodes being freed: */
	struct keylist			old_keys;
	u64				_old_keys[BTREE_UPDATE_NODES_MAX *
						  BKEY_BTREE_PTR_U64s_MAX];

	/* Nodes being added: */
	struct keylist			new_keys;
	u64				_new_keys[BTREE_UPDATE_NODES_MAX *
						  BKEY_BTREE_PTR_U64s_MAX];

	/* New nodes, that will be made reachable by this update: */
	struct btree			*new_nodes[BTREE_UPDATE_NODES_MAX];
	unsigned			nr_new_nodes;

	struct btree			*old_nodes[BTREE_UPDATE_NODES_MAX];
	__le64				old_nodes_seq[BTREE_UPDATE_NODES_MAX];
	unsigned			nr_old_nodes;

	open_bucket_idx_t		open_buckets[BTREE_UPDATE_NODES_MAX *
						     BCH_REPLICAS_MAX];
	open_bucket_idx_t		nr_open_buckets;

	unsigned			journal_u64s;
	u64				journal_entries[BTREE_UPDATE_JOURNAL_RES];

	/* Only here to reduce stack usage on recursive splits: */
	struct keylist			parent_keys;
	/*
	 * Enough room for btree_split's keys without realloc - btree node
	 * pointers never have crc/compression info, so we only need to account
	 * for the pointers for three keys
	 */
	u64				inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
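
/*
 * Rough lifecycle sketch (illustrative only; the helpers named here live in
 * btree_update_interior.c and their exact signatures may differ):
 *
 *	as = bch2_btree_update_start(trans, path, level, true, flags);
 *	n  = __bch2_btree_node_alloc_replacement(as, trans, b, format);
 *	// fill @n, record it in as->new_keys/new_nodes, record @b in
 *	// as->old_keys/old_nodes, insert the new pointer into the parent
 *	bch2_btree_update_done(as, trans);
 *	// completion is asynchronous: once the new nodes are written, the
 *	// parent update goes out and the old nodes become reclaimable
 */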

struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
						  struct btree_trans *,
						  struct btree *,
						  struct bkey_format);

int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);

int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);

int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
				  unsigned, unsigned, enum btree_node_sibling);

static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
					btree_path_idx_t path_idx,
					unsigned level, unsigned flags,
					enum btree_node_sibling sib)
{
	struct btree_path *path = trans->paths + path_idx;
	struct btree *b;

	EBUG_ON(!btree_node_locked(path, level));

	b = path->l[level].b;
	if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
		return 0;

	return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}

static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
					      btree_path_idx_t path,
					      unsigned level,
					      unsigned flags)
{
	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_prev_sib) ?:
	       bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_next_sib);
}

int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
			    struct btree *, unsigned);
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
			       struct btree *, struct bkey_i *,
			       unsigned, bool);
int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
					struct bkey_i *, unsigned, bool);

void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);

static inline unsigned btree_update_reserve_required(struct bch_fs *c,
						     struct btree *b)
{
	unsigned depth = btree_node_root(c, b)->c.level + 1;

	/*
	 * Number of nodes we might have to allocate in a worst case btree
	 * split operation - we split all the way up to the root, then allocate
	 * a new root, unless we're already at max depth:
	 */
	if (depth < BTREE_MAX_DEPTH)
		return (depth - b->c.level) * 2 + 1;
	else
		return (depth - b->c.level) * 2 - 1;
}
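
/*
 * Worked example (illustrative): with the root at level 2, depth = 3.  A leaf
 * split can cascade through levels 0..2 (two nodes per level) and then demand
 * a brand new root: (3 - 0) * 2 + 1 = 7 nodes.  At BTREE_MAX_DEPTH no new
 * root can be added, so the root can only be replaced by a single new node
 * rather than split in two - hence the "* 2 - 1" branch.
 */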

static inline void btree_node_reset_sib_u64s(struct btree *b)
{
	b->sib_u64s[0] = b->nr.live_u64s;
	b->sib_u64s[1] = b->nr.live_u64s;
}

static inline void *btree_data_end(struct btree *b)
{
	return (void *) b->data + btree_buf_bytes(b);
}

static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
	return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}

static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
	return btree_data_end(b);
}

static inline void *write_block(struct btree *b)
{
	return (void *) b->data + (b->written << 9);
}
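
/*
 * Buffer layout implied by the helpers above (sketch):
 *
 *	b->data          write_block(b)     unwritten_whiteouts_start(b)
 *	|                |                  |
 *	v                v                  v
 *	+----------------+---------+~~~~~~~~+---------------------+
 *	| written bsets  |  dirty  |  free  | unwritten whiteouts |
 *	+----------------+---------+~~~~~~~~+---------------------+
 *	                                                          ^
 *	                                          btree_data_end(b)
 *
 * b->written counts 512-byte sectors already on disk; whiteouts for keys in
 * written bsets accumulate at the very end of the buffer, growing downwards,
 * until the next write.
 */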

static inline bool __btree_addr_written(struct btree *b, void *p)
{
	return p < write_block(b);
}

static inline bool bset_written(struct btree *b, struct bset *i)
{
	return __btree_addr_written(b, i);
}

static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
{
	return __btree_addr_written(b, k);
}

static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
	ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
		b->whiteout_u64s;
	ssize_t total = btree_buf_bytes(b) >> 3;

	/* Always leave one extra u64 for bch2_varint_decode: */
	used++;

	return total - used;
}
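
/*
 * Illustrative arithmetic (assumed 64KiB node buffer): total = 65536 >> 3 =
 * 8192 u64s.  If @end sits 60000 bytes into the buffer and 16 u64s of
 * unwritten whiteouts are pending, used = 60000 / 8 + 16 + 1 = 7517, leaving
 * 8192 - 7517 = 675 u64s of free space.
 */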

static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
	ssize_t remaining = __bch2_btree_u64s_remaining(b,
				btree_bkey_last(b, bset_tree_last(b)));

	BUG_ON(remaining < 0);

	if (bset_written(b, btree_bset_last(b)))
		return 0;

	return remaining;
}

#define BTREE_WRITE_SET_U64s_BITS	9

static inline unsigned btree_write_set_buffer(struct btree *b)
{
	/*
	 * Could buffer up larger amounts of keys for btrees with larger keys,
	 * pending benchmarking:
	 */
	return 8 << BTREE_WRITE_SET_U64s_BITS;
}
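
/*
 * i.e. 8 << 9 = 4096 bytes: want_new_bset() below compares the last bset's
 * size in bytes against this, and the remaining space in u64s against one
 * eighth of it (4096 >> 3 = 512 u64s).
 */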

static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
	struct bset_tree *t = bset_tree_last(b);
	struct btree_node_entry *bne = max(write_block(b),
			(void *) btree_bkey_last(b, bset_tree_last(b)));
	ssize_t remaining_space =
		__bch2_btree_u64s_remaining(b, bne->keys.start);

	if (unlikely(bset_written(b, bset(b, t)))) {
		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
			return bne;
	} else {
		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
		    remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
			return bne;
	}

	return NULL;
}
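
/*
 * Typical usage sketch (illustrative; this mirrors the leaf insert path,
 * where bch2_btree_init_next() performs the actual bset switch):
 *
 *	// with a write lock held on @b:
 *	if (unlikely(want_new_bset(c, b)))
 *		bch2_btree_init_next(trans, b);
 */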

static inline void push_whiteout(struct btree *b, struct bpos pos)
{
	struct bkey_packed k;

	BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
	EBUG_ON(btree_node_just_written(b));

	if (!bkey_pack_pos(&k, pos, b)) {
		struct bkey *u = (void *) &k;

		bkey_init(u);
		u->p = pos;
	}

	k.needs_whiteout = true;

	b->whiteout_u64s += k.u64s;
	bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
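
/*
 * Intent sketch (not upstream prose): deleting a key that already exists in a
 * written bset can't just drop it from memory - the deletion itself must be
 * persisted, so a whiteout for @pos is prepended to the unwritten whiteouts
 * region (see unwritten_whiteouts_start() above).  A caller might do
 * something like (hypothetical):
 *
 *	if (bkey_written(b, k))
 *		push_whiteout(b, insert->k.p);
 */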

/*
 * write lock must be held on @b (else the dirty bset that we were going to
 * insert into could be written out from under us)
 */
static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
	if (unlikely(btree_node_need_rewrite(b)))
		return false;

	return u64s <= bch2_btree_keys_u64s_remaining(b);
}

void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);

bool bch2_btree_interior_updates_flush(struct bch_fs *);

void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
					struct jset_entry *, unsigned long);

void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

void bch2_fs_btree_interior_update_exit(struct bch_fs *);
void bch2_fs_btree_interior_update_init_early(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);

#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */