// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
#include "trace.h"

#include <linux/random.h>
#include <linux/prefetch.h>

static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *,
				       btree_path_idx_t, btree_path_idx_t);

static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
#ifdef TRACK_PATH_ALLOCATED
	return iter->ip_allocated;
#else
	return 0;
#endif
}

static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
static void bch2_trans_srcu_lock(struct btree_trans *);

static inline int __btree_path_cmp(const struct btree_path *l,
				   enum btree_id r_btree_id,
				   bool r_cached,
				   struct bpos r_pos,
				   unsigned r_level)
{
	/*
	 * Must match lock ordering as defined by __bch2_btree_node_lock:
	 */
	return  cmp_int(l->btree_id, r_btree_id) ?:
		cmp_int((int) l->cached, (int) r_cached) ?:
		bpos_cmp(l->pos, r_pos) ?:
	       -cmp_int(l->level, r_level);
}

static inline int btree_path_cmp(const struct btree_path *l,
				 const struct btree_path *r)
{
	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
}
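
/*
 * Worked example (values invented for illustration): given two paths with
 * equal btree_id, cached and pos, the one at the higher level sorts first,
 * because of the negated level comparison above - a path at level 1 orders
 * before a path at level 0, matching the top-down lock ordering.
 */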

static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}
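
/*
 * Worked example (positions invented for illustration): with
 * BTREE_ITER_ALL_SNAPSHOTS the snapshot field is the least significant part
 * of the search key, so bpos_successor(SPOS(1, 2, 3)) is SPOS(1, 2, 4).
 * Without it, bpos_nosnap_successor() steps to the next (inode, offset) -
 * SPOS(1, 3, 0) - and the snapshot field is then pinned back to
 * iter->snapshot, since the iterator yields one key per position.
 */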

static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    !bkey_eq(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}

static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
{
	return bpos_lt(path->pos, b->data->min_key);
}

static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
{
	return bpos_gt(path->pos, b->key.k.p);
}

static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
{
	return path->btree_id == b->c.btree_id &&
		!btree_path_pos_before_node(path, b) &&
		!btree_path_pos_after_node(path, b);
}

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_path_verify_cached(struct btree_trans *trans,
					  struct btree_path *path)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(path, 0);

	if (!bch2_btree_node_relock(trans, path, 0))
		return;

	ck = (void *) path->l[0].b;
	BUG_ON(ck->key.btree_id != path->btree_id ||
	       !bkey_eq(ck->key.pos, path->pos));

	if (!locked)
		btree_node_unlock(trans, path, 0);
}

static void bch2_btree_path_verify_level(struct btree_trans *trans,
					 struct btree_path *path, unsigned level)
{
	struct btree_path_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
	struct printbuf buf3 = PRINTBUF;
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	l	= &path->l[level];
	tmp	= l->iter;
	locked	= btree_node_locked(path, level);

	if (path->cached) {
		if (!level)
			bch2_btree_path_verify_cached(trans, path);
		return;
	}

	if (!btree_path_node(path, level))
		return;

	if (!bch2_btree_node_relock_notrace(trans, path, level))
		return;

	BUG_ON(!btree_path_pos_in_node(path, l->b));

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past deleted keys:
	 */
	p = level
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		msg = "after";
		goto err;
	}

	if (!locked)
		btree_node_unlock(trans, path, level);
	return;
err:
	bch2_bpos_to_text(&buf1, path->pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);

		bch2_bkey_to_text(&buf2, &uk);
	} else {
		prt_printf(&buf2, "(none)");
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);

		bch2_bkey_to_text(&buf3, &uk);
	} else {
		prt_printf(&buf3, "(none)");
	}

	panic("path should be %s key at level %u:\n"
	      "path pos %s\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level, buf1.buf, buf2.buf, buf3.buf);
}

static void bch2_btree_path_verify(struct btree_trans *trans,
				   struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	EBUG_ON(path->btree_id >= BTREE_ID_NR);

	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
		if (!path->l[i].b) {
			BUG_ON(!path->cached &&
			       bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
			break;
		}

		bch2_btree_path_verify_level(trans, path, i);
	}

	bch2_btree_path_verify_locks(path);
}

void bch2_trans_verify_paths(struct btree_trans *trans)
{
	struct btree_path *path;
	unsigned iter;

	trans_for_each_path(trans, path, iter)
		bch2_btree_path_verify(trans, path);
}

static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

	BUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);

	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       !btree_type_has_snapshot_field(iter->btree_id));

	if (iter->update_path)
		bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
	bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}

static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
	       !iter->pos.snapshot);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
	       bkey_gt(iter->pos, iter->k.p));
}

static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter copy;
	struct bkey_s_c prev;
	int ret = 0;

	if (!bch2_debug_check_iterators)
		return 0;

	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
		return 0;

	if (bkey_err(k) || !k.k)
		return 0;

	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
					  iter->snapshot,
					  k.k->p.snapshot));

	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_ALL_SNAPSHOTS);
	prev = bch2_btree_iter_prev(&copy);
	if (!prev.k)
		goto out;

	ret = bkey_err(prev);
	if (ret)
		goto out;

	if (bkey_eq(prev.k->p, k.k->p) &&
	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
				      prev.k->p.snapshot) > 0) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bkey_to_text(&buf1, k.k);
		bch2_bkey_to_text(&buf2, prev.k);

		panic("iter snap %u\n"
		      "k    %s\n"
		      "prev %s\n",
		      iter->snapshot,
		      buf1.buf, buf2.buf);
	}
out:
	bch2_trans_iter_exit(trans, &copy);
	return ret;
}

void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			    struct bpos pos, bool key_cache)
{
	struct btree_path *path;
	struct trans_for_each_path_inorder_iter iter;
	struct printbuf buf = PRINTBUF;

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, iter) {
		int cmp = cmp_int(path->btree_id, id) ?:
			cmp_int(path->cached, key_cache);

		if (cmp > 0)
			break;
		if (cmp < 0)
			continue;

		if (!btree_node_locked(path, 0) ||
		    !path->should_be_locked)
			continue;

		if (!key_cache) {
			if (bkey_ge(pos, path->l[0].b->data->min_key) &&
			    bkey_le(pos, path->l[0].b->key.k.p))
				return;
		} else {
			if (bkey_eq(pos, path->pos))
				return;
		}
	}

	bch2_dump_trans_paths_updates(trans);
	bch2_bpos_to_text(&buf, pos);

	panic("not locked: %s %s%s\n",
	      bch2_btree_id_str(id), buf.buf,
	      key_cache ? " cached" : "");
}

#else

static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }

#endif

/* Btree path: fixups after btree updates */

static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_path_level *l = &path->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);
}

void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path_with_node(trans, b, path, i) {
		__bch2_btree_path_fix_key_modified(path, b, where);
		bch2_btree_path_verify_level(trans, path, b->c.level);
	}
}

static void __bch2_btree_node_iter_fix(struct btree_path *path,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to re-add it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    b->c.level) {
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}
}
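
/*
 * Worked example for the fixup above (offsets invented for illustration): if
 * a key at offset 10 with clobber_u64s = 3 is replaced by one with
 * new_u64s = 5, then shift = +2; iterator positions past the old key
 * (set->k >= 13) move forward by 2, positions inside the clobbered range are
 * repointed at the new key or just past it depending on how it compares
 * against path->pos, and positions before offset 10 are untouched.
 */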

void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_path *path,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_path *linked;
	unsigned i;

	if (node_iter != &path->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(path, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_path_with_node(trans, b, linked, i) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_path_verify_level(trans, linked, b->c.level);
	}
}

/* Btree path level: pointer to a particular btree node and node iter */

static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
						  struct btree_path_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	return bkey_disassemble(l->b, k, u);
}

static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
							struct btree_path_level *l,
							struct bkey *u)
{
	return __btree_iter_unpack(c, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_peek(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->key.k.p;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->data->min_key;
	trans->paths_sorted = false;
	bch2_btree_path_verify_level(trans, path, l - path->l);
	return k;
}

static inline bool btree_path_advance_to_pos(struct btree_path *path,
					     struct btree_path_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}
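
/*
 * Example: __bch2_btree_path_set_pos() calls this with max_advance = 8 -
 * when the new position is only a few keys ahead, stepping the node iterator
 * forward is cheaper than redoing the binary search with
 * bch2_btree_node_iter_init().
 */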

static inline void __btree_path_level_init(struct btree_path *path,
					   unsigned level)
{
	struct btree_path_level *l = &path->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first
	 * non-whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);
}

void bch2_btree_path_level_init(struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	BUG_ON(path->cached);

	EBUG_ON(!btree_path_pos_in_node(path, b));

	path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
	path->l[b->c.level].b = b;
	__btree_path_level_init(path, b->c.level);
}

/* Btree path: fixups after btree node updates: */

static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
	struct bch_fs *c = trans->c;

	trans_for_each_update(trans, i)
		if (!i->cached &&
		    i->level	== b->c.level &&
		    i->btree_id	== b->c.btree_id &&
		    bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
		    bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
			i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;

			if (unlikely(trans->journal_replay_not_finished)) {
				struct bkey_i *j_k =
					bch2_journal_keys_peek_slot(c, i->btree_id, i->level,
								    i->k->k.p);

				if (j_k) {
					i->old_k = j_k->k;
					i->old_v = &j_k->v;
				}
			}
		}
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_trans_node_add(struct btree_trans *trans,
			 struct btree_path *path,
			 struct btree *b)
{
	struct btree_path *prev;

	BUG_ON(!btree_path_pos_in_node(path, b));

	while ((prev = prev_btree_path(trans, path)) &&
	       btree_path_pos_in_node(prev, b))
		path = prev;

	for (;
	     path && btree_path_pos_in_node(path, b);
	     path = next_btree_path(trans, path))
		if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
			enum btree_node_locked_type t =
				btree_lock_want(path, b->c.level);

			if (t != BTREE_NODE_UNLOCKED) {
				btree_node_unlock(trans, path, b->c.level);
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(trans, path, b->c.level, t);
			}

			bch2_btree_path_level_init(trans, path, b);
		}

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;
	unsigned i;

	trans_for_each_path_with_node(trans, b, path, i)
		__btree_path_level_init(path, b->c.level);

	bch2_trans_revalidate_updates_in_node(trans, b);
}

/* Btree path: traverse, set_pos: */

static inline int btree_path_lock_root(struct btree_trans *trans,
				       struct btree_path *path,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
	enum six_lock_type lock_type;
	unsigned i;
	int ret;

	EBUG_ON(path->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			path->level = depth_want;
			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(path, path->level);
		ret = btree_node_lock(trans, path, &b->c,
				      path->level, lock_type, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ret;
			BUG();
		}

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)
				path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
			path->l[path->level].b = b;
			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;

			mark_btree_node_locked(trans, path, path->level,
					       (enum btree_node_locked_type) lock_type);
			bch2_btree_path_level_init(trans, path, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}
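
/*
 * The loop above is the usual "read pointer, lock, revalidate" pattern for
 * locking an object that can be freed or replaced underneath us; in
 * miniature (illustrative sketch only, lock()/unlock() are placeholders):
 *
 *	do {
 *		b = READ_ONCE(*rootp);
 *		lock(&b->c.lock);
 *		if (b == READ_ONCE(*rootp))	// still the root?
 *			break;			// yes: done
 *		unlock(&b->c.lock);		// no: it changed; retry
 *	} while (1);
 */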

noinline
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (path->level > 1 ? 0 : 2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_started, &c->flags)
		? (path->level > 1 ? 0 : 2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(trans, path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
{
	struct btree_path_level *l = &path->l[plevel];
	bool locked = btree_node_locked(path, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(trans, path, plevel);
}

static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags,
						     struct bkey_buf *out)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);

	bch2_bkey_buf_reassemble(out, c, k);

	if ((flags & BTREE_ITER_PREFETCH) &&
	    c->opts.btree_node_prefetch)
		ret = btree_path_prefetch_j(trans, path, &jiter);

	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}

static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree *b;
	unsigned level = path->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(path, level);
	struct bkey_buf tmp;
	int ret;

	EBUG_ON(!btree_node_locked(path, path->level));

	bch2_bkey_buf_init(&tmp);

	if (unlikely(trans->journal_replay_not_finished)) {
		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
		if (ret)
			goto err;
	} else {
		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
		if (!k) {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "node not found at pos ");
			bch2_bpos_to_text(&buf, path->pos);
			prt_str(&buf, " within parent node ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));

			bch2_fs_fatal_error(c, "%s", buf.buf);
			printbuf_exit(&buf);
			ret = -BCH_ERR_btree_need_topology_repair;
			goto err;
		}

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);

		if ((flags & BTREE_ITER_PREFETCH) &&
		    c->opts.btree_node_prefetch) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				goto err;
		}
	}

	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		goto err;

	if (likely(!trans->journal_replay_not_finished &&
		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	mark_btree_node_locked(trans, path, level,
			       (enum btree_node_locked_type) lock_type);
	path->level = level;
	bch2_btree_path_level_init(trans, path, b);

	bch2_btree_path_verify_locks(path);
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;
	unsigned long trace_ip = _RET_IP_;
	unsigned i;
	int ret = 0;

	if (trans->in_traverse_all)
		return -BCH_ERR_transaction_restart_in_traverse_all;

	trans->in_traverse_all = true;
retry_all:
	trans->restarted = 0;
	trans->last_restarted_ip = 0;

	trans_for_each_path(trans, path, i)
		path->should_be_locked = false;

	btree_trans_sort_paths(trans);

	bch2_trans_unlock(trans);
	cond_resched();

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		btree_path_idx_t idx = trans->sorted[i];

		/*
		 * Traversing a path can cause another path to be added at about
		 * the same position:
		 */
		if (trans->paths[idx].uptodate) {
			__btree_path_get(&trans->paths[idx], false);
			ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
			__btree_path_put(&trans->paths[idx], false);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, ENOMEM))
				goto retry_all;
			if (ret)
				goto err;
		} else {
			i++;
		}
	}

	/*
	 * We used to assert that all paths had been traversed here
	 * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
	 * path->should_be_locked is not set yet, we might have unlocked and
	 * then failed to relock a path - that's fine.
	 */
err:
	bch2_btree_cache_cannibalize_unlock(trans);

	trans->in_traverse_all = false;

	trace_and_count(c, trans_traverse_all, trans, trace_ip);
	return ret;
}

static inline bool btree_path_check_pos_in_node(struct btree_path *path,
						unsigned l, int check_pos)
{
	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
		return false;
	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
		return false;
	return true;
}

static inline bool btree_path_good_node(struct btree_trans *trans,
					struct btree_path *path,
					unsigned l, int check_pos)
{
	return is_btree_node(path, l) &&
		bch2_btree_node_relock(trans, path, l) &&
		btree_path_check_pos_in_node(path, l, check_pos);
}

static void btree_path_set_level_down(struct btree_trans *trans,
				      struct btree_path *path,
				      unsigned new_level)
{
	unsigned l;

	path->level = new_level;

	for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(trans, path, l);

	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
	bch2_btree_path_verify(trans, path);
}

static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans,
							 struct btree_path *path,
							 int check_pos)
{
	unsigned i, l = path->level;
again:
	while (btree_path_node(path, l) &&
	       !btree_path_good_node(trans, path, l, check_pos))
		__btree_path_set_level_up(trans, path, l++);

	/* If we need intent locks, take them too: */
	for (i = l + 1;
	     i < path->locks_want && btree_path_node(path, i);
	     i++)
		if (!bch2_btree_node_relock(trans, path, i)) {
			while (l <= i)
				__btree_path_set_level_up(trans, path, l++);
			goto again;
		}

	return l;
}

static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
						     struct btree_path *path,
						     int check_pos)
{
	return likely(btree_node_locked(path, path->level) &&
		      btree_path_check_pos_in_node(path, path->level, check_pos))
		? path->level
		: __btree_path_up_until_good_node(trans, path, check_pos);
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
int bch2_btree_path_traverse_one(struct btree_trans *trans,
				 btree_path_idx_t path_idx,
				 unsigned flags,
				 unsigned long trace_ip)
{
	struct btree_path *path = &trans->paths[path_idx];
	unsigned depth_want = path->level;
	int ret = -((int) trans->restarted);

	if (unlikely(ret))
		goto out;

	if (unlikely(!trans->srcu_held))
		bch2_trans_srcu_lock(trans);

	/*
	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the path without a transaction restart:
	 */
	if (path->should_be_locked) {
		ret = bch2_btree_path_relock(trans, path, trace_ip);
		goto out;
	}

	if (path->cached) {
		ret = bch2_btree_path_traverse_cached(trans, path, flags);
		goto out;
	}

	path = &trans->paths[path_idx];

	if (unlikely(path->level >= BTREE_MAX_DEPTH))
		goto out_uptodate;

	path->level = btree_path_up_until_good_node(trans, path, 0);

	EBUG_ON(btree_path_node(path, path->level) &&
		!btree_node_locked(path, path->level));

	/*
	 * Note: path->nodes[path->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_path_lock_root() comes next and that it can't fail
	 */
	while (path->level > depth_want) {
		ret = btree_path_node(path, path->level)
			? btree_path_down(trans, path, flags, trace_ip)
			: btree_path_lock_root(trans, path, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1) {
				/*
				 * No nodes at this level - got to the end of
				 * the btree:
				 */
				ret = 0;
				goto out;
			}

			__bch2_btree_path_unlock(trans, path);
			path->level = depth_want;
			path->l[path->level].b = ERR_PTR(ret);
			goto out;
		}
	}
out_uptodate:
	path->uptodate = BTREE_ITER_UPTODATE;
out:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
		panic("ret %s (%i) trans->restarted %s (%i)\n",
		      bch2_err_str(ret), ret,
		      bch2_err_str(trans->restarted), trans->restarted);
	bch2_btree_path_verify(trans, path);
	return ret;
}
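
/*
 * Usage sketch (illustrative; not a definitive caller): traversal is normally
 * driven through the bch2_btree_path_traverse() inline wrapper, inside the
 * standard transaction restart loop:
 *
 *	do {
 *		bch2_trans_begin(trans);
 *		ret = bch2_btree_path_traverse(trans, path_idx, flags);
 *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 */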

static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
				   struct btree_path *src)
{
	unsigned i, offset = offsetof(struct btree_path, pos);

	memcpy((void *) dst + offset,
	       (void *) src + offset,
	       sizeof(struct btree_path) - offset);

	for (i = 0; i < BTREE_MAX_DEPTH; i++) {
		unsigned t = btree_node_locked_type(dst, i);

		if (t != BTREE_NODE_UNLOCKED)
			six_lock_increment(&dst->l[i].b->c.lock, t);
	}
}

static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
					 bool intent)
{
	btree_path_idx_t new = btree_path_alloc(trans, src);
	btree_path_copy(trans, trans->paths + new, trans->paths + src);
	__btree_path_get(trans->paths + new, intent);
	return new;
}

__flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
			btree_path_idx_t path, bool intent, unsigned long ip)
{
	__btree_path_put(trans->paths + path, intent);
	path = btree_path_clone(trans, path, intent);
	trans->paths[path].preserve = false;
	return path;
}

btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
			  btree_path_idx_t path_idx, struct bpos new_pos,
			  bool intent, unsigned long ip)
{
	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);

	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(!trans->paths[path_idx].ref);

	path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);

	struct btree_path *path = trans->paths + path_idx;
	path->pos		= new_pos;
	trans->paths_sorted	= false;

	if (unlikely(path->cached)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		goto out;
	}

	unsigned level = btree_path_up_until_good_node(trans, path, cmp);

	if (btree_path_node(path, level)) {
		struct btree_path_level *l = &path->l[level];

		BUG_ON(!btree_node_locked(path, level));
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_path_advance_to_pos(path, l, 8))
			bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

		/*
		 * Iterators to interior nodes should always be pointed at the
		 * first non-whiteout:
		 */
		if (unlikely(level))
			bch2_btree_node_iter_peek(&l->iter, l->b);
	}

	if (unlikely(level != path->level)) {
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		__bch2_btree_path_unlock(trans, path);
	}
out:
	bch2_btree_path_verify(trans, path);
	return path_idx;
}

/* Btree path: main interface: */

static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && !btree_path_cmp(sib, path))
		return sib;

	return NULL;
}

static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
{
	struct btree_path *sib;

	sib = prev_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	sib = next_btree_path(trans, path);
	if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b)
		return sib;

	return NULL;
}

static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
{
	__bch2_btree_path_unlock(trans, trans->paths + path);
	btree_path_list_remove(trans, trans->paths + path);
	__clear_bit(path, trans->paths_allocated);
}

void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
	struct btree_path *path = trans->paths + path_idx, *dup;

	if (!__btree_path_put(path, intent))
		return;

	dup = path->preserve
		? have_path_at_pos(trans, path)
		: have_node_at_pos(trans, path);

	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
		return;

	if (path->should_be_locked &&
	    !trans->restarted &&
	    (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
		return;

	if (dup) {
		dup->preserve		|= path->preserve;
		dup->should_be_locked	|= path->should_be_locked;
	}

	__bch2_path_free(trans, path_idx);
}

static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
				 bool intent)
{
	if (!__btree_path_put(trans->paths + path, intent))
		return;

	__bch2_path_free(trans, path);
}

void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
{
	panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
	      trans->restart_count, restart_count,
	      (void *) trans->last_begin_ip);
}

void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
	panic("in transaction restart: %s, last restarted by %pS\n",
	      bch2_err_str(trans->restarted),
	      (void *) trans->last_restarted_ip);
}

noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
	prt_printf(buf, "transaction updates for %s journal seq %llu",
		   trans->fn, trans->journal_res.seq);
	prt_newline(buf);
	printbuf_indent_add(buf, 2);

	trans_for_each_update(trans, i) {
		struct bkey_s_c old = { &i->old_k, i->old_v };

		prt_printf(buf, "update: btree=%s cached=%u %pS",
			   bch2_btree_id_str(i->btree_id),
			   i->cached,
			   (void *) i->ip_allocated);
		prt_newline(buf);

		prt_printf(buf, " old ");
		bch2_bkey_val_to_text(buf, trans->c, old);
		prt_newline(buf);

		prt_printf(buf, " new ");
		bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k));
		prt_newline(buf);
	}

	for (struct jset_entry *e = trans->journal_entries;
	     e != btree_trans_journal_entries_top(trans);
	     e = vstruct_next(e))
		bch2_journal_entry_to_text(buf, trans->c, e);

	printbuf_indent_sub(buf, 2);
}

noinline __cold
void bch2_dump_trans_updates(struct btree_trans *trans)
{
	struct printbuf buf = PRINTBUF;

	bch2_trans_updates_to_text(&buf, trans);
	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
	struct btree_path *path = trans->paths + path_idx;

	prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
		   path_idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
		   bch2_btree_id_str(path->btree_id),
		   path->level);
	bch2_bpos_to_text(out, path->pos);

	prt_printf(out, " locks %u", path->nodes_locked);
#ifdef TRACK_PATH_ALLOCATED
	prt_printf(out, " %pS", (void *) path->ip_allocated);
#endif
	prt_newline(out);
}

static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
				bool nosort)
{
	struct trans_for_each_path_inorder_iter iter;

	if (!nosort)
		btree_trans_sort_paths(trans);

	trans_for_each_path_idx_inorder(trans, iter)
		bch2_btree_path_to_text(out, trans, iter.path_idx);
}

noinline __cold
void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
{
	__bch2_trans_paths_to_text(out, trans, false);
}

static noinline __cold
void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
{
	struct printbuf buf = PRINTBUF;

	__bch2_trans_paths_to_text(&buf, trans, nosort);
	bch2_trans_updates_to_text(&buf, trans);

	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

noinline __cold
void bch2_dump_trans_paths_updates(struct btree_trans *trans)
{
	__bch2_dump_trans_paths_updates(trans, false);
}

noinline __cold
static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
	struct btree_transaction_stats *s = btree_trans_stats(trans);
	struct printbuf buf = PRINTBUF;
	size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);

	bch2_trans_paths_to_text(&buf, trans);

	if (!buf.allocation_failure) {
		mutex_lock(&s->lock);
		if (nr > s->nr_max_paths) {
			s->nr_max_paths = nr;
			swap(s->max_paths_text, buf.buf);
		}
		mutex_unlock(&s->lock);
	}

	printbuf_exit(&buf);

	trans->nr_paths_max = nr;
}

noinline __cold
int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
{
	if (trace_trans_restart_too_many_iters_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_trans_paths_to_text(&buf, trans);
		trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
		printbuf_exit(&buf);
	}

	count_event(trans->c, trans_restart_too_many_iters);

	return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}

static noinline void btree_path_overflow(struct btree_trans *trans)
{
	bch2_dump_trans_paths_updates(trans);
	bch_err(trans->c, "trans path overflow");
}

static noinline void btree_paths_realloc(struct btree_trans *trans)
{
	unsigned nr = trans->nr_paths * 2;

	void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
			   sizeof(struct btree_trans_paths) +
			   nr * sizeof(struct btree_path) +
			   nr * sizeof(btree_path_idx_t) + 8 +
			   nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);

	unsigned long *paths_allocated = p;
	memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
	p += BITS_TO_LONGS(nr) * sizeof(unsigned long);

	p += sizeof(struct btree_trans_paths);
	struct btree_path *paths = p;
	*trans_paths_nr(paths) = nr;
	memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
	p += nr * sizeof(struct btree_path);

	btree_path_idx_t *sorted = p;
	memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
	p += nr * sizeof(btree_path_idx_t) + 8;

	struct btree_insert_entry *updates = p;
	memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));

	unsigned long *old = trans->paths_allocated;

	rcu_assign_pointer(trans->paths_allocated,	paths_allocated);
	rcu_assign_pointer(trans->paths,		paths);
	rcu_assign_pointer(trans->sorted,		sorted);
	rcu_assign_pointer(trans->updates,		updates);

	trans->nr_paths = nr;

	if (old != trans->_paths_allocated)
		kfree_rcu_mightsleep(old);
}

static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
						btree_path_idx_t pos)
{
	btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);

	if (unlikely(idx == trans->nr_paths)) {
		if (trans->nr_paths == BTREE_ITER_MAX) {
			btree_path_overflow(trans);
			return 0;
		}

		btree_paths_realloc(trans);
	}

	/*
	 * Do this before marking the new path as allocated, since it won't be
	 * initialized yet:
	 */
	if (unlikely(idx > trans->nr_paths_max))
		bch2_trans_update_max_paths(trans);

	__set_bit(idx, trans->paths_allocated);

	struct btree_path *path = &trans->paths[idx];
	path->ref		= 0;
	path->intent_ref	= 0;
	path->nodes_locked	= 0;

	btree_path_list_add(trans, pos, idx);
	trans->paths_sorted = false;
	return idx;
}
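
/*
 * Allocation sketch: paths live in a flat array tracked by a bitmap, so
 * allocating one is find_first_zero_bit() + __set_bit(); when the bitmap
 * fills up, btree_paths_realloc() above doubles the arrays, up to
 * BTREE_ITER_MAX.
 */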

btree_path_idx_t bch2_path_get(struct btree_trans *trans,
			       enum btree_id btree_id, struct bpos pos,
			       unsigned locks_want, unsigned level,
			       unsigned flags, unsigned long ip)
{
	struct btree_path *path;
	bool cached = flags & BTREE_ITER_CACHED;
	bool intent = flags & BTREE_ITER_INTENT;
	struct trans_for_each_path_inorder_iter iter;
	btree_path_idx_t path_pos = 0, path_idx;

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_locks(trans);

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder(trans, path, iter) {
		if (__btree_path_cmp(path,
				     btree_id,
				     cached,
				     pos,
				     level) > 0)
			break;

		path_pos = iter.path_idx;
	}

	if (path_pos &&
	    trans->paths[path_pos].cached	== cached &&
	    trans->paths[path_pos].btree_id	== btree_id &&
	    trans->paths[path_pos].level	== level) {
		__btree_path_get(trans->paths + path_pos, intent);
		path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
		path = trans->paths + path_idx;
	} else {
		path_idx = btree_path_alloc(trans, path_pos);
		path = trans->paths + path_idx;

		__btree_path_get(path, intent);
		path->pos			= pos;
		path->btree_id			= btree_id;
		path->cached			= cached;
		path->uptodate			= BTREE_ITER_NEED_TRAVERSE;
		path->should_be_locked		= false;
		path->level			= level;
		path->locks_want		= locks_want;
		path->nodes_locked		= 0;
		for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
			path->l[i].b		= ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
		path->ip_allocated		= ip;
#endif
		trans->paths_sorted		= false;
	}

	if (!(flags & BTREE_ITER_NOPRESERVE))
		path->preserve = true;

	if (path->intent_ref)
		locks_want = max(locks_want, level + 1);

	/*
	 * If the path has locks_want greater than requested, we don't downgrade
	 * it here - on transaction restart because btree node split needs to
	 * upgrade locks, we might be putting/getting the iterator again.
	 * Downgrading iterators only happens via bch2_trans_downgrade(), after
	 * a successful transaction commit.
	 */

	locks_want = min(locks_want, BTREE_MAX_DEPTH);
	if (locks_want > path->locks_want)
		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);

	return path_idx;
}
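
/*
 * Usage sketch (illustrative only; the btree, position and error handling
 * are assumptions for the example):
 *
 *	btree_path_idx_t idx = bch2_path_get(trans, BTREE_ID_extents,
 *					     POS(inum, offset), 1, 0,
 *					     BTREE_ITER_INTENT, _THIS_IP_);
 *	ret = bch2_btree_path_traverse(trans, idx, 0);
 *	...
 *	bch2_path_put(trans, idx, true);
 */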
1680 | |
1681 | struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) |
1682 | { |
1683 | |
1684 | struct btree_path_level *l = path_l(path); |
1685 | struct bkey_packed *_k; |
1686 | struct bkey_s_c k; |
1687 | |
1688 | if (unlikely(!l->b)) |
1689 | return bkey_s_c_null; |
1690 | |
1691 | EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); |
1692 | EBUG_ON(!btree_node_locked(path, path->level)); |
1693 | |
1694 | if (!path->cached) { |
1695 | _k = bch2_btree_node_iter_peek_all(iter: &l->iter, b: l->b); |
1696 | k = _k ? bkey_disassemble(b: l->b, k: _k, u) : bkey_s_c_null; |
1697 | |
1698 | EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); |
1699 | |
		if (!k.k || !bpos_eq(path->pos, k.k->p))
1701 | goto hole; |
1702 | } else { |
1703 | struct bkey_cached *ck = (void *) path->l[0].b; |
1704 | |
1705 | EBUG_ON(ck && |
1706 | (path->btree_id != ck->key.btree_id || |
1707 | !bkey_eq(path->pos, ck->key.pos))); |
1708 | if (!ck || !ck->valid) |
1709 | return bkey_s_c_null; |
1710 | |
1711 | *u = ck->k->k; |
		k = bkey_i_to_s_c(ck->k);
1713 | } |
1714 | |
1715 | return k; |
1716 | hole: |
	bkey_init(u);
1718 | u->p = path->pos; |
1719 | return (struct bkey_s_c) { u, NULL }; |
1720 | } |
1721 | |
1722 | /* Btree iterators: */ |
1723 | |
1724 | int __must_check |
1725 | __bch2_btree_iter_traverse(struct btree_iter *iter) |
1726 | { |
	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1728 | } |
1729 | |
1730 | int __must_check |
1731 | bch2_btree_iter_traverse(struct btree_iter *iter) |
1732 | { |
1733 | struct btree_trans *trans = iter->trans; |
1734 | int ret; |
1735 | |
	iter->path = bch2_btree_path_set_pos(trans, iter->path,
					btree_iter_search_key(iter),
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
1742 | if (ret) |
1743 | return ret; |
1744 | |
1745 | struct btree_path *path = btree_iter_path(trans, iter); |
	if (btree_path_node(path, path->level))
1747 | btree_path_set_should_be_locked(path); |
1748 | return 0; |
1749 | } |
1750 | |
1751 | /* Iterate across nodes (leaf and interior nodes) */ |
1752 | |
1753 | struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) |
1754 | { |
1755 | struct btree_trans *trans = iter->trans; |
1756 | struct btree *b = NULL; |
1757 | int ret; |
1758 | |
1759 | EBUG_ON(trans->paths[iter->path].cached); |
1760 | bch2_btree_iter_verify(iter); |
1761 | |
	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1763 | if (ret) |
1764 | goto err; |
1765 | |
1766 | struct btree_path *path = btree_iter_path(trans, iter); |
	b = btree_path_node(path, path->level);
1768 | if (!b) |
1769 | goto out; |
1770 | |
1771 | BUG_ON(bpos_lt(b->key.k.p, iter->pos)); |
1772 | |
	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
1780 | out: |
1781 | bch2_btree_iter_verify_entry_exit(iter); |
1782 | bch2_btree_iter_verify(iter); |
1783 | |
1784 | return b; |
1785 | err: |
	b = ERR_PTR(ret);
1787 | goto out; |
1788 | } |
1789 | |
1790 | struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) |
1791 | { |
1792 | struct btree *b; |
1793 | |
1794 | while (b = bch2_btree_iter_peek_node(iter), |
1795 | bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) |
1796 | bch2_trans_begin(iter->trans); |
1797 | |
1798 | return b; |
1799 | } |
1800 | |
1801 | struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) |
1802 | { |
1803 | struct btree_trans *trans = iter->trans; |
1804 | struct btree *b = NULL; |
1805 | int ret; |
1806 | |
1807 | EBUG_ON(trans->paths[iter->path].cached); |
1808 | bch2_trans_verify_not_in_restart(trans); |
1809 | bch2_btree_iter_verify(iter); |
1810 | |
1811 | struct btree_path *path = btree_iter_path(trans, iter); |
1812 | |
1813 | /* already at end? */ |
	if (!btree_path_node(path, path->level))
1815 | return NULL; |
1816 | |
1817 | /* got to end? */ |
	if (!btree_path_node(path, path->level + 1)) {
1819 | btree_path_set_level_up(trans, path); |
1820 | return NULL; |
1821 | } |
1822 | |
	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
		__bch2_btree_path_unlock(trans, path);
		path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
		path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
1830 | goto err; |
1831 | } |
1832 | |
	b = btree_path_node(path, path->level + 1);

	if (bpos_eq(iter->pos, b->key.k.p)) {
		__btree_path_set_level_up(trans, path, path->level++);
1837 | } else { |
1838 | /* |
1839 | * Haven't gotten to the end of the parent node: go back down to |
1840 | * the next child node |
1841 | */ |
		iter->path = bch2_btree_path_set_pos(trans, iter->path,
					bpos_successor(iter->pos),
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

		path = btree_iter_path(trans, iter);
		btree_path_set_level_down(trans, path, iter->min_depth);

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
1851 | if (ret) |
1852 | goto err; |
1853 | |
1854 | path = btree_iter_path(trans, iter); |
1855 | b = path->l[path->level].b; |
1856 | } |
1857 | |
	bkey_init(&iter->k);
	iter->k.p = iter->pos = b->key.k.p;

	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));
	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
1865 | EBUG_ON(btree_iter_path(trans, iter)->uptodate); |
1866 | out: |
1867 | bch2_btree_iter_verify_entry_exit(iter); |
1868 | bch2_btree_iter_verify(iter); |
1869 | |
1870 | return b; |
1871 | err: |
	b = ERR_PTR(ret);
1873 | goto out; |
1874 | } |
1875 | |
1876 | /* Iterate across keys (in leaf nodes only) */ |
1877 | |
1878 | inline bool bch2_btree_iter_advance(struct btree_iter *iter) |
1879 | { |
	struct bpos pos = iter->k.p;
	bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
		     ? bpos_eq(pos, SPOS_MAX)
		     : bkey_eq(pos, SPOS_MAX));
1884 | |
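	/*
	 * For extents, iter->k.p is the end of the extent just returned, i.e.
	 * already the next position to search from; otherwise step past the
	 * key we just returned:
	 */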
	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_successor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
1888 | return ret; |
1889 | } |
1890 | |
1891 | inline bool bch2_btree_iter_rewind(struct btree_iter *iter) |
1892 | { |
	struct bpos pos = bkey_start_pos(&iter->k);
	bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
		     ? bpos_eq(pos, POS_MIN)
		     : bkey_eq(pos, POS_MIN));

	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_predecessor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
1901 | return ret; |
1902 | } |
1903 | |
1904 | static noinline |
1905 | void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, |
1906 | struct bkey_s_c *k) |
1907 | { |
	struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
1909 | |
1910 | trans_for_each_update(trans, i) |
1911 | if (!i->key_cache_already_flushed && |
1912 | i->btree_id == iter->btree_id && |
		    bpos_le(i->k->k.p, iter->pos) &&
		    bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
			iter->k = i->k->k;
			*k = bkey_i_to_s_c(i->k);
1917 | } |
1918 | } |
1919 | |
1920 | static noinline |
1921 | void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, |
1922 | struct bkey_s_c *k) |
1923 | { |
1924 | struct btree_path *path = btree_iter_path(trans, iter); |
1925 | struct bpos end = path_l(path)->b->key.k.p; |
1926 | |
1927 | trans_for_each_update(trans, i) |
1928 | if (!i->key_cache_already_flushed && |
1929 | i->btree_id == iter->btree_id && |
		    bpos_ge(i->k->k.p, path->pos) &&
		    bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
			iter->k = i->k->k;
			*k = bkey_i_to_s_c(i->k);
1934 | } |
1935 | } |
1936 | |
1937 | static noinline |
1938 | void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter, |
1939 | struct bkey_s_c *k) |
1940 | { |
1941 | trans_for_each_update(trans, i) |
1942 | if (!i->key_cache_already_flushed && |
1943 | i->btree_id == iter->btree_id && |
		    bpos_eq(i->k->k.p, iter->pos)) {
			iter->k = i->k->k;
			*k = bkey_i_to_s_c(i->k);
1947 | } |
1948 | } |
1949 | |
1950 | static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, |
1951 | struct btree_iter *iter, |
1952 | struct bpos end_pos) |
1953 | { |
1954 | struct btree_path *path = btree_iter_path(trans, iter); |
1955 | |
1956 | return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, |
1957 | path->level, |
1958 | path->pos, |
1959 | end_pos, |
1960 | &iter->journal_idx); |
1961 | } |
1962 | |
1963 | static noinline |
1964 | struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, |
1965 | struct btree_iter *iter) |
1966 | { |
1967 | struct btree_path *path = btree_iter_path(trans, iter); |
	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
1969 | |
1970 | if (k) { |
1971 | iter->k = k->k; |
1972 | return bkey_i_to_s_c(k); |
1973 | } else { |
1974 | return bkey_s_c_null; |
1975 | } |
1976 | } |
1977 | |
1978 | static noinline |
1979 | struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, |
1980 | struct btree_iter *iter, |
1981 | struct bkey_s_c k) |
1982 | { |
1983 | struct btree_path *path = btree_iter_path(trans, iter); |
1984 | struct bkey_i *next_journal = |
1985 | bch2_btree_journal_peek(trans, iter, |
				k.k ? k.k->p : path_l(path)->b->key.k.p);
1987 | |
1988 | if (next_journal) { |
1989 | iter->k = next_journal->k; |
		k = bkey_i_to_s_c(next_journal);
1991 | } |
1992 | |
1993 | return k; |
1994 | } |
1995 | |
1996 | /* |
 * Checks the btree key cache for a key at @pos, returning it if present or
 * bkey_s_c_null if not:
1999 | */ |
2000 | static noinline |
2001 | struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) |
2002 | { |
2003 | struct btree_trans *trans = iter->trans; |
2004 | struct bch_fs *c = trans->c; |
2005 | struct bkey u; |
2006 | struct bkey_s_c k; |
2007 | int ret; |
2008 | |
2009 | if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && |
	    bpos_eq(iter->pos, pos))
2011 | return bkey_s_c_null; |
2012 | |
2013 | if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) |
2014 | return bkey_s_c_null; |
2015 | |
2016 | if (!iter->key_cache_path) |
		iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
						     iter->flags & BTREE_ITER_INTENT, 0,
						     iter->flags|BTREE_ITER_CACHED|
						     BTREE_ITER_CACHED_NOFILL,
						     _THIS_IP_);
2022 | |
	iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
				       iter->flags|BTREE_ITER_CACHED) ?:
		bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
2030 | if (unlikely(ret)) |
2031 | return bkey_s_c_err(ret); |
2032 | |
	btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);

	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
2036 | if (k.k && !bkey_err(k)) { |
2037 | iter->k = u; |
2038 | k.k = &iter->k; |
2039 | } |
2040 | return k; |
2041 | } |
2042 | |
2043 | static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) |
2044 | { |
2045 | struct btree_trans *trans = iter->trans; |
2046 | struct bkey_s_c k, k2; |
2047 | int ret; |
2048 | |
2049 | EBUG_ON(btree_iter_path(trans, iter)->cached); |
2050 | bch2_btree_iter_verify(iter); |
2051 | |
2052 | while (1) { |
2053 | struct btree_path_level *l; |
2054 | |
		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
2063 | k = bkey_s_c_err(ret); |
2064 | goto out; |
2065 | } |
2066 | |
2067 | struct btree_path *path = btree_iter_path(trans, iter); |
2068 | l = path_l(path); |
2069 | |
2070 | if (unlikely(!l->b)) { |
2071 | /* No btree nodes at requested level: */ |
2072 | bch2_btree_iter_set_pos(iter, SPOS_MAX); |
2073 | k = bkey_s_c_null; |
2074 | goto out; |
2075 | } |
2076 | |
2077 | btree_path_set_should_be_locked(path); |
2078 | |
		k = btree_path_level_peek_all(trans->c, l, &iter->k);

		if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
			k = k2;
			ret = bkey_err(k);
			if (ret) {
				bch2_btree_iter_set_pos(iter, iter->pos);
2088 | goto out; |
2089 | } |
2090 | } |
2091 | |
2092 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) |
2093 | k = btree_trans_peek_journal(trans, iter, k); |
2094 | |
2095 | if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && |
2096 | trans->nr_updates)) |
			bch2_btree_trans_peek_updates(trans, iter, &k);
2098 | |
2099 | if (k.k && bkey_deleted(k.k)) { |
2100 | /* |
2101 | * If we've got a whiteout, and it's after the search |
2102 | * key, advance the search key to the whiteout instead |
2103 | * of just after the whiteout - it might be a btree |
2104 | * whiteout, with a real key at the same position, since |
			 * in the btree, deleted keys sort before non-deleted keys.
2106 | */ |
			search_key = !bpos_eq(search_key, k.k->p)
				? k.k->p
				: bpos_successor(k.k->p);
2110 | continue; |
2111 | } |
2112 | |
2113 | if (likely(k.k)) { |
2114 | break; |
2115 | } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { |
2116 | /* Advance to next leaf node: */ |
			search_key = bpos_successor(l->b->key.k.p);
2118 | } else { |
2119 | /* End of btree: */ |
2120 | bch2_btree_iter_set_pos(iter, SPOS_MAX); |
2121 | k = bkey_s_c_null; |
2122 | goto out; |
2123 | } |
2124 | } |
2125 | out: |
2126 | bch2_btree_iter_verify(iter); |
2127 | |
2128 | return k; |
2129 | } |
2130 | |
2131 | /** |
2132 | * bch2_btree_iter_peek_upto() - returns first key greater than or equal to |
2133 | * iterator's current position |
2134 | * @iter: iterator to peek from |
2135 | * @end: search limit: returns keys less than or equal to @end |
2136 | * |
2137 | * Returns: key if found, or an error extractable with bkey_err(). |
2138 | */ |
2139 | struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) |
2140 | { |
2141 | struct btree_trans *trans = iter->trans; |
2142 | struct bpos search_key = btree_iter_search_key(iter); |
2143 | struct bkey_s_c k; |
2144 | struct bpos iter_pos; |
2145 | int ret; |
2146 | |
2147 | EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); |
2148 | |
2149 | if (iter->update_path) { |
		bch2_path_put_nokeep(trans, iter->update_path,
				     iter->flags & BTREE_ITER_INTENT);
2152 | iter->update_path = 0; |
2153 | } |
2154 | |
2155 | bch2_btree_iter_verify_entry_exit(iter); |
2156 | |
2157 | while (1) { |
2158 | k = __bch2_btree_iter_peek(iter, search_key); |
2159 | if (unlikely(!k.k)) |
2160 | goto end; |
2161 | if (unlikely(bkey_err(k))) |
2162 | goto out_no_locked; |
2163 | |
2164 | /* |
2165 | * We need to check against @end before FILTER_SNAPSHOTS because |
		 * if we get to a different inode than requested, we might be
2167 | * seeing keys for a different snapshot tree that will all be |
2168 | * filtered out. |
2169 | * |
2170 | * But we can't do the full check here, because bkey_start_pos() |
2171 | * isn't monotonically increasing before FILTER_SNAPSHOTS, and |
2172 | * that's what we check against in extents mode: |
2173 | */ |
2174 | if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) |
2175 | ? bkey_gt(k.k->p, end) |
2176 | : k.k->p.inode > end.inode)) |
2177 | goto end; |
2178 | |
2179 | if (iter->update_path && |
		    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
			bch2_path_put_nokeep(trans, iter->update_path,
					     iter->flags & BTREE_ITER_INTENT);
2183 | iter->update_path = 0; |
2184 | } |
2185 | |
2186 | if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && |
2187 | (iter->flags & BTREE_ITER_INTENT) && |
2188 | !(iter->flags & BTREE_ITER_IS_EXTENTS) && |
2189 | !iter->update_path) { |
2190 | struct bpos pos = k.k->p; |
2191 | |
2192 | if (pos.snapshot < iter->snapshot) { |
				search_key = bpos_successor(k.k->p);
2194 | continue; |
2195 | } |
2196 | |
2197 | pos.snapshot = iter->snapshot; |
2198 | |
2199 | /* |
2200 | * advance, same as on exit for iter->path, but only up |
2201 | * to snapshot |
2202 | */ |
			__btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
			iter->update_path = iter->path;

			iter->update_path = bch2_btree_path_set_pos(trans,
						iter->update_path, pos,
						iter->flags & BTREE_ITER_INTENT,
2209 | _THIS_IP_); |
			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
2211 | if (unlikely(ret)) { |
2212 | k = bkey_s_c_err(ret); |
2213 | goto out_no_locked; |
2214 | } |
2215 | } |
2216 | |
2217 | /* |
2218 | * We can never have a key in a leaf node at POS_MAX, so |
2219 | * we don't have to check these successor() calls: |
2220 | */ |
		if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
		    !bch2_snapshot_is_ancestor(trans->c,
					       iter->snapshot,
					       k.k->p.snapshot)) {
			search_key = bpos_successor(k.k->p);
2226 | continue; |
2227 | } |
2228 | |
2229 | if (bkey_whiteout(k.k) && |
2230 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { |
			search_key = bkey_successor(iter, k.k->p);
2232 | continue; |
2233 | } |
2234 | |
2235 | /* |
	 * iter->pos should be monotonically increasing, and always be
2237 | * equal to the key we just returned - except extents can |
2238 | * straddle iter->pos: |
2239 | */ |
2240 | if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) |
2241 | iter_pos = k.k->p; |
2242 | else |
		iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
2244 | |
2245 | if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) |
2246 | ? bkey_gt(iter_pos, end) |
2247 | : bkey_ge(iter_pos, end))) |
2248 | goto end; |
2249 | |
2250 | break; |
2251 | } |
2252 | |
2253 | iter->pos = iter_pos; |
2254 | |
	iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
				iter->flags & BTREE_ITER_INTENT,
				btree_iter_ip_allocated(iter));
2258 | |
	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
2260 | out_no_locked: |
2261 | if (iter->update_path) { |
		ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
2263 | if (unlikely(ret)) |
2264 | k = bkey_s_c_err(ret); |
2265 | else |
			btree_path_set_should_be_locked(trans->paths + iter->update_path);
2267 | } |
2268 | |
2269 | if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) |
2270 | iter->pos.snapshot = iter->snapshot; |
2271 | |
2272 | ret = bch2_btree_iter_verify_ret(iter, k); |
2273 | if (unlikely(ret)) { |
		bch2_btree_iter_set_pos(iter, iter->pos);
2275 | k = bkey_s_c_err(ret); |
2276 | } |
2277 | |
2278 | bch2_btree_iter_verify_entry_exit(iter); |
2279 | |
2280 | return k; |
2281 | end: |
	bch2_btree_iter_set_pos(iter, end);
2283 | k = bkey_s_c_null; |
2284 | goto out_no_locked; |
2285 | } |
2286 | |
2287 | /** |
2288 | * bch2_btree_iter_next() - returns first key greater than iterator's current |
2289 | * position |
2290 | * @iter: iterator to peek from |
2291 | * |
2292 | * Returns: key if found, or an error extractable with bkey_err(). |
2293 | */ |
2294 | struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) |
2295 | { |
2296 | if (!bch2_btree_iter_advance(iter)) |
2297 | return bkey_s_c_null; |
2298 | |
2299 | return bch2_btree_iter_peek(iter); |
2300 | } |
2301 | |
2302 | /** |
2303 | * bch2_btree_iter_peek_prev() - returns first key less than or equal to |
2304 | * iterator's current position |
2305 | * @iter: iterator to peek from |
2306 | * |
2307 | * Returns: key if found, or an error extractable with bkey_err(). |
2308 | */ |
2309 | struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) |
2310 | { |
2311 | struct btree_trans *trans = iter->trans; |
2312 | struct bpos search_key = iter->pos; |
2313 | struct bkey_s_c k; |
2314 | struct bkey saved_k; |
2315 | const struct bch_val *saved_v; |
2316 | btree_path_idx_t saved_path = 0; |
2317 | int ret; |
2318 | |
2319 | EBUG_ON(btree_iter_path(trans, iter)->cached || |
2320 | btree_iter_path(trans, iter)->level); |
2321 | |
2322 | if (iter->flags & BTREE_ITER_WITH_JOURNAL) |
2323 | return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); |
2324 | |
2325 | bch2_btree_iter_verify(iter); |
2326 | bch2_btree_iter_verify_entry_exit(iter); |
2327 | |
2328 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) |
2329 | search_key.snapshot = U32_MAX; |
2330 | |
2331 | while (1) { |
		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
						iter->flags & BTREE_ITER_INTENT,
						btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
2340 | k = bkey_s_c_err(ret); |
2341 | goto out_no_locked; |
2342 | } |
2343 | |
2344 | struct btree_path *path = btree_iter_path(trans, iter); |
2345 | |
		k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
		if (!k.k ||
		    ((iter->flags & BTREE_ITER_IS_EXTENTS)
		     ? bpos_ge(bkey_start_pos(k.k), search_key)
		     : bpos_gt(k.k->p, search_key)))
			k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
2352 | |
2353 | if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && |
2354 | trans->nr_updates)) |
			bch2_btree_trans_peek_prev_updates(trans, iter, &k);
2356 | |
2357 | if (likely(k.k)) { |
2358 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { |
2359 | if (k.k->p.snapshot == iter->snapshot) |
2360 | goto got_key; |
2361 | |
2362 | /* |
2363 | * If we have a saved candidate, and we're no |
2364 | * longer at the same _key_ (not pos), return |
2365 | * that candidate |
2366 | */ |
				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
					bch2_path_put_nokeep(trans, iter->path,
						      iter->flags & BTREE_ITER_INTENT);
2370 | iter->path = saved_path; |
2371 | saved_path = 0; |
2372 | iter->k = saved_k; |
2373 | k.v = saved_v; |
2374 | goto got_key; |
2375 | } |
2376 | |
				if (bch2_snapshot_is_ancestor(trans->c,
							      iter->snapshot,
							      k.k->p.snapshot)) {
					if (saved_path)
						bch2_path_put_nokeep(trans, saved_path,
						      iter->flags & BTREE_ITER_INTENT);
					saved_path = btree_path_clone(trans, iter->path,
								iter->flags & BTREE_ITER_INTENT);
2385 | path = btree_iter_path(trans, iter); |
2386 | saved_k = *k.k; |
2387 | saved_v = k.v; |
2388 | } |
2389 | |
				search_key = bpos_predecessor(k.k->p);
2391 | continue; |
2392 | } |
2393 | got_key: |
2394 | if (bkey_whiteout(k.k) && |
2395 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { |
				search_key = bkey_predecessor(iter, k.k->p);
2397 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) |
2398 | search_key.snapshot = U32_MAX; |
2399 | continue; |
2400 | } |
2401 | |
2402 | btree_path_set_should_be_locked(path); |
2403 | break; |
2404 | } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { |
2405 | /* Advance to previous leaf node: */ |
			search_key = bpos_predecessor(path->l[0].b->data->min_key);
2407 | } else { |
2408 | /* Start of btree: */ |
2409 | bch2_btree_iter_set_pos(iter, POS_MIN); |
2410 | k = bkey_s_c_null; |
2411 | goto out_no_locked; |
2412 | } |
2413 | } |
2414 | |
2415 | EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); |
2416 | |
2417 | /* Extents can straddle iter->pos: */ |
	if (bkey_lt(k.k->p, iter->pos))
2419 | iter->pos = k.k->p; |
2420 | |
2421 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) |
2422 | iter->pos.snapshot = iter->snapshot; |
2423 | out_no_locked: |
2424 | if (saved_path) |
		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
2426 | |
2427 | bch2_btree_iter_verify_entry_exit(iter); |
2428 | bch2_btree_iter_verify(iter); |
2429 | |
2430 | return k; |
2431 | } |
2432 | |
2433 | /** |
2434 | * bch2_btree_iter_prev() - returns first key less than iterator's current |
2435 | * position |
2436 | * @iter: iterator to peek from |
2437 | * |
2438 | * Returns: key if found, or an error extractable with bkey_err(). |
2439 | */ |
2440 | struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) |
2441 | { |
2442 | if (!bch2_btree_iter_rewind(iter)) |
2443 | return bkey_s_c_null; |
2444 | |
2445 | return bch2_btree_iter_peek_prev(iter); |
2446 | } |
2447 | |
2448 | struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) |
2449 | { |
2450 | struct btree_trans *trans = iter->trans; |
2451 | struct bpos search_key; |
2452 | struct bkey_s_c k; |
2453 | int ret; |
2454 | |
2455 | bch2_btree_iter_verify(iter); |
2456 | bch2_btree_iter_verify_entry_exit(iter); |
2457 | EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); |
2458 | |
2459 | /* extents can't span inode numbers: */ |
2460 | if ((iter->flags & BTREE_ITER_IS_EXTENTS) && |
2461 | unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { |
2462 | if (iter->pos.inode == KEY_INODE_MAX) |
2463 | return bkey_s_c_null; |
2464 | |
		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
2466 | } |
2467 | |
2468 | search_key = btree_iter_search_key(iter); |
	iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_INTENT,
					btree_iter_ip_allocated(iter));

	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
2474 | if (unlikely(ret)) { |
2475 | k = bkey_s_c_err(ret); |
2476 | goto out_no_locked; |
2477 | } |
2478 | |
2479 | if ((iter->flags & BTREE_ITER_CACHED) || |
2480 | !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { |
2481 | k = bkey_s_c_null; |
2482 | |
2483 | if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) && |
2484 | trans->nr_updates)) { |
			bch2_btree_trans_peek_slot_updates(trans, iter, &k);
2486 | if (k.k) |
2487 | goto out; |
2488 | } |
2489 | |
2490 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && |
2491 | (k = btree_trans_peek_slot_journal(trans, iter)).k) |
2492 | goto out; |
2493 | |
2494 | if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && |
		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
2496 | if (!bkey_err(k)) |
2497 | iter->k = *k.k; |
2498 | /* We're not returning a key from iter->path: */ |
2499 | goto out_no_locked; |
2500 | } |
2501 | |
		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
2503 | if (unlikely(!k.k)) |
2504 | goto out_no_locked; |
2505 | } else { |
2506 | struct bpos next; |
2507 | struct bpos end = iter->pos; |
2508 | |
2509 | if (iter->flags & BTREE_ITER_IS_EXTENTS) |
2510 | end.offset = U64_MAX; |
2511 | |
2512 | EBUG_ON(btree_iter_path(trans, iter)->level); |
2513 | |
2514 | if (iter->flags & BTREE_ITER_INTENT) { |
2515 | struct btree_iter iter2; |
2516 | |
2517 | bch2_trans_copy_iter(&iter2, iter); |
			k = bch2_btree_iter_peek_upto(&iter2, end);
2519 | |
2520 | if (k.k && !bkey_err(k)) { |
2521 | swap(iter->key_cache_path, iter2.key_cache_path); |
2522 | iter->k = iter2.k; |
2523 | k.k = &iter->k; |
2524 | } |
2525 | bch2_trans_iter_exit(trans, &iter2); |
2526 | } else { |
2527 | struct bpos pos = iter->pos; |
2528 | |
			k = bch2_btree_iter_peek_upto(iter, end);
			if (unlikely(bkey_err(k)))
				bch2_btree_iter_set_pos(iter, pos);
2532 | else |
2533 | iter->pos = pos; |
2534 | } |
2535 | |
2536 | if (unlikely(bkey_err(k))) |
2537 | goto out_no_locked; |
2538 | |
		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
2540 | |
		if (bkey_lt(iter->pos, next)) {
			bkey_init(&iter->k);
			iter->k.p = iter->pos;
2544 | |
2545 | if (iter->flags & BTREE_ITER_IS_EXTENTS) { |
				bch2_key_resize(&iter->k,
2547 | min_t(u64, KEY_SIZE_MAX, |
2548 | (next.inode == iter->pos.inode |
2549 | ? next.offset |
2550 | : KEY_OFFSET_MAX) - |
2551 | iter->pos.offset)); |
2552 | EBUG_ON(!iter->k.size); |
2553 | } |
2554 | |
2555 | k = (struct bkey_s_c) { &iter->k, NULL }; |
2556 | } |
2557 | } |
2558 | out: |
	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
2560 | out_no_locked: |
2561 | bch2_btree_iter_verify_entry_exit(iter); |
2562 | bch2_btree_iter_verify(iter); |
2563 | ret = bch2_btree_iter_verify_ret(iter, k); |
2564 | if (unlikely(ret)) |
2565 | return bkey_s_c_err(ret); |
2566 | |
2567 | return k; |
2568 | } |
2569 | |
2570 | struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) |
2571 | { |
2572 | if (!bch2_btree_iter_advance(iter)) |
2573 | return bkey_s_c_null; |
2574 | |
2575 | return bch2_btree_iter_peek_slot(iter); |
2576 | } |
2577 | |
2578 | struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) |
2579 | { |
2580 | if (!bch2_btree_iter_rewind(iter)) |
2581 | return bkey_s_c_null; |
2582 | |
2583 | return bch2_btree_iter_peek_slot(iter); |
2584 | } |
2585 | |
2586 | struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) |
2587 | { |
2588 | struct bkey_s_c k; |
2589 | |
	while (btree_trans_too_many_iters(iter->trans) ||
	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
2592 | bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) |
2593 | bch2_trans_begin(iter->trans); |
2594 | |
2595 | return k; |
2596 | } |
2597 | |
2598 | /* new transactional stuff: */ |
2599 | |
2600 | #ifdef CONFIG_BCACHEFS_DEBUG |
2601 | static void btree_trans_verify_sorted_refs(struct btree_trans *trans) |
2602 | { |
2603 | struct btree_path *path; |
2604 | unsigned i; |
2605 | |
2606 | BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); |
2607 | |
2608 | trans_for_each_path(trans, path, i) { |
2609 | BUG_ON(path->sorted_idx >= trans->nr_sorted); |
2610 | BUG_ON(trans->sorted[path->sorted_idx] != i); |
2611 | } |
2612 | |
2613 | for (i = 0; i < trans->nr_sorted; i++) { |
2614 | unsigned idx = trans->sorted[i]; |
2615 | |
2616 | BUG_ON(!test_bit(idx, trans->paths_allocated)); |
2617 | BUG_ON(trans->paths[idx].sorted_idx != i); |
2618 | } |
2619 | } |
2620 | |
2621 | static void btree_trans_verify_sorted(struct btree_trans *trans) |
2622 | { |
2623 | struct btree_path *path, *prev = NULL; |
2624 | struct trans_for_each_path_inorder_iter iter; |
2625 | |
2626 | if (!bch2_debug_check_iterators) |
2627 | return; |
2628 | |
2629 | trans_for_each_path_inorder(trans, path, iter) { |
2630 | if (prev && btree_path_cmp(prev, path) > 0) { |
2631 | __bch2_dump_trans_paths_updates(trans, true); |
2632 | panic("trans paths out of order!\n" ); |
2633 | } |
2634 | prev = path; |
2635 | } |
2636 | } |
2637 | #else |
2638 | static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} |
2639 | static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} |
2640 | #endif |
2641 | |
2642 | void __bch2_btree_trans_sort_paths(struct btree_trans *trans) |
2643 | { |
2644 | int i, l = 0, r = trans->nr_sorted, inc = 1; |
2645 | bool swapped; |
2646 | |
2647 | btree_trans_verify_sorted_refs(trans); |
2648 | |
2649 | if (trans->paths_sorted) |
2650 | goto out; |
2651 | |
2652 | /* |
2653 | * Cocktail shaker sort: this is efficient because iterators will be |
2654 | * mostly sorted. |
2655 | */ |
2656 | do { |
2657 | swapped = false; |
2658 | |
2659 | for (i = inc > 0 ? l : r - 2; |
2660 | i + 1 < r && i >= l; |
2661 | i += inc) { |
			if (btree_path_cmp(trans->paths + trans->sorted[i],
					   trans->paths + trans->sorted[i + 1]) > 0) {
2664 | swap(trans->sorted[i], trans->sorted[i + 1]); |
2665 | trans->paths[trans->sorted[i]].sorted_idx = i; |
2666 | trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; |
2667 | swapped = true; |
2668 | } |
2669 | } |
2670 | |
2671 | if (inc > 0) |
2672 | --r; |
2673 | else |
2674 | l++; |
2675 | inc = -inc; |
2676 | } while (swapped); |
2677 | |
2678 | trans->paths_sorted = true; |
2679 | out: |
2680 | btree_trans_verify_sorted(trans); |
2681 | } |
2682 | |
2683 | static inline void btree_path_list_remove(struct btree_trans *trans, |
2684 | struct btree_path *path) |
2685 | { |
2686 | EBUG_ON(path->sorted_idx >= trans->nr_sorted); |
2687 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
2688 | trans->nr_sorted--; |
	memmove_u64s_down_small(trans->sorted + path->sorted_idx,
				trans->sorted + path->sorted_idx + 1,
2691 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, |
2692 | sizeof(u64) / sizeof(btree_path_idx_t))); |
2693 | #else |
2694 | array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); |
2695 | #endif |
2696 | for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) |
2697 | trans->paths[trans->sorted[i]].sorted_idx = i; |
2698 | } |
2699 | |
2700 | static inline void btree_path_list_add(struct btree_trans *trans, |
2701 | btree_path_idx_t pos, |
2702 | btree_path_idx_t path_idx) |
2703 | { |
2704 | struct btree_path *path = trans->paths + path_idx; |
2705 | |
2706 | path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; |
2707 | |
2708 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
	memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
			      trans->sorted + path->sorted_idx,
2711 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, |
2712 | sizeof(u64) / sizeof(btree_path_idx_t))); |
2713 | trans->nr_sorted++; |
2714 | trans->sorted[path->sorted_idx] = path_idx; |
2715 | #else |
2716 | array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); |
2717 | #endif |
2718 | |
2719 | for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) |
2720 | trans->paths[trans->sorted[i]].sorted_idx = i; |
2721 | |
2722 | btree_trans_verify_sorted_refs(trans); |
2723 | } |
2724 | |
2725 | void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) |
2726 | { |
2727 | if (iter->update_path) |
		bch2_path_put_nokeep(trans, iter->update_path,
			      iter->flags & BTREE_ITER_INTENT);
	if (iter->path)
		bch2_path_put(trans, iter->path,
			      iter->flags & BTREE_ITER_INTENT);
	if (iter->key_cache_path)
		bch2_path_put(trans, iter->key_cache_path,
			      iter->flags & BTREE_ITER_INTENT);
2736 | iter->path = 0; |
2737 | iter->update_path = 0; |
2738 | iter->key_cache_path = 0; |
2739 | iter->trans = NULL; |
2740 | } |
2741 | |
2742 | void bch2_trans_iter_init_outlined(struct btree_trans *trans, |
2743 | struct btree_iter *iter, |
2744 | enum btree_id btree_id, struct bpos pos, |
2745 | unsigned flags) |
2746 | { |
	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
				    bch2_btree_iter_flags(trans, btree_id, flags),
2749 | _RET_IP_); |
2750 | } |
2751 | |
2752 | void bch2_trans_node_iter_init(struct btree_trans *trans, |
2753 | struct btree_iter *iter, |
2754 | enum btree_id btree_id, |
2755 | struct bpos pos, |
2756 | unsigned locks_want, |
2757 | unsigned depth, |
2758 | unsigned flags) |
2759 | { |
2760 | flags |= BTREE_ITER_NOT_EXTENTS; |
2761 | flags |= __BTREE_ITER_ALL_SNAPSHOTS; |
2762 | flags |= BTREE_ITER_ALL_SNAPSHOTS; |
2763 | |
	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
				    __bch2_btree_iter_flags(trans, btree_id, flags),
2766 | _RET_IP_); |
2767 | |
2768 | iter->min_depth = depth; |
2769 | |
2770 | struct btree_path *path = btree_iter_path(trans, iter); |
2771 | BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); |
2772 | BUG_ON(path->level != depth); |
2773 | BUG_ON(iter->min_depth != depth); |
2774 | } |
2775 | |
2776 | void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) |
2777 | { |
2778 | struct btree_trans *trans = src->trans; |
2779 | |
2780 | *dst = *src; |
2781 | #ifdef TRACK_PATH_ALLOCATED |
2782 | dst->ip_allocated = _RET_IP_; |
2783 | #endif |
	if (src->path)
		__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
	if (src->update_path)
		__btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
2788 | dst->key_cache_path = 0; |
2789 | } |
2790 | |
2791 | void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) |
2792 | { |
2793 | struct bch_fs *c = trans->c; |
2794 | unsigned new_top = trans->mem_top + size; |
2795 | unsigned old_bytes = trans->mem_bytes; |
2796 | unsigned new_bytes = roundup_pow_of_two(new_top); |
2797 | int ret; |
2798 | void *new_mem; |
2799 | void *p; |
2800 | |
2801 | WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); |
2802 | |
2803 | struct btree_transaction_stats *s = btree_trans_stats(trans); |
2804 | s->max_mem = max(s->max_mem, new_bytes); |
2805 | |
2806 | if (trans->used_mempool) { |
2807 | if (trans->mem_bytes >= new_bytes) |
2808 | goto out_change_top; |
2809 | |
		/* No more space in the mempool item; need to kmalloc a new one */
		new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
2812 | if (unlikely(!new_mem)) { |
2813 | bch2_trans_unlock(trans); |
2814 | |
		new_mem = kmalloc(new_bytes, GFP_KERNEL);
		if (!new_mem)
			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);

		ret = bch2_trans_relock(trans);
		if (ret) {
			kfree(new_mem);
			return ERR_PTR(ret);
		}
	}
	memcpy(new_mem, trans->mem, trans->mem_top);
	trans->used_mempool = false;
	mempool_free(trans->mem, &c->btree_trans_mem_pool);
2828 | goto out_new_mem; |
2829 | } |
2830 | |
	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
	if (unlikely(!new_mem)) {
		bch2_trans_unlock(trans);

		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			new_bytes = BTREE_TRANS_MEM_MAX;
			memcpy(new_mem, trans->mem, trans->mem_top);
			trans->used_mempool = true;
			kfree(trans->mem);
2842 | } |
2843 | |
2844 | if (!new_mem) |
			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		ret = bch2_trans_relock(trans);
		if (ret)
			return ERR_PTR(ret);
2853 | } |
2854 | out_new_mem: |
2855 | trans->mem = new_mem; |
2856 | trans->mem_bytes = new_bytes; |
2857 | |
2858 | if (old_bytes) { |
2859 | trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); |
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
2861 | } |
2862 | out_change_top: |
2863 | p = trans->mem + trans->mem_top; |
2864 | trans->mem_top += size; |
2865 | memset(p, 0, size); |
2866 | return p; |
2867 | } |
2868 | |
2869 | static inline void check_srcu_held_too_long(struct btree_trans *trans) |
2870 | { |
2871 | WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), |
2872 | "btree trans held srcu lock (delaying memory reclaim) for %lu seconds" , |
2873 | (jiffies - trans->srcu_lock_time) / HZ); |
2874 | } |
2875 | |
2876 | void bch2_trans_srcu_unlock(struct btree_trans *trans) |
2877 | { |
2878 | if (trans->srcu_held) { |
2879 | struct bch_fs *c = trans->c; |
2880 | struct btree_path *path; |
2881 | unsigned i; |
2882 | |
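		/*
		 * Unlocked cached-btree pointers are only guarded against
		 * reclaim by SRCU; once we drop it they may be freed, so
		 * poison them to force a re-traverse:
		 */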
2883 | trans_for_each_path(trans, path, i) |
			if (path->cached && !btree_node_locked(path, 0))
				path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);

		check_srcu_held_too_long(trans);
		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2889 | trans->srcu_held = false; |
2890 | } |
2891 | } |
2892 | |
2893 | static void bch2_trans_srcu_lock(struct btree_trans *trans) |
2894 | { |
2895 | if (!trans->srcu_held) { |
		trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
2897 | trans->srcu_lock_time = jiffies; |
2898 | trans->srcu_held = true; |
2899 | } |
2900 | } |
2901 | |
2902 | /** |
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
2904 | * @trans: transaction to reset |
2905 | * |
2906 | * Returns: current restart counter, to be used with trans_was_restarted() |
2907 | * |
 * While iterating over nodes or updating nodes, an attempt to lock a btree
 * node may return BCH_ERR_transaction_restart when the trylock fails. When
 * this occurs, bch2_trans_begin() should be called and the transaction
 * retried.
2911 | */ |
2912 | u32 bch2_trans_begin(struct btree_trans *trans) |
2913 | { |
2914 | struct btree_path *path; |
2915 | unsigned i; |
2916 | u64 now; |
2917 | |
2918 | bch2_trans_reset_updates(trans); |
2919 | |
2920 | trans->restart_count++; |
2921 | trans->mem_top = 0; |
2922 | trans->journal_entries = NULL; |
2923 | |
2924 | trans_for_each_path(trans, path, i) { |
2925 | path->should_be_locked = false; |
2926 | |
2927 | /* |
2928 | * If the transaction wasn't restarted, we're presuming to be |
		 * doing something new: don't keep iterators except the ones
		 * that are in use - and those on the subvolumes btree:
2931 | */ |
2932 | if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) |
2933 | path->preserve = false; |
2934 | |
2935 | /* |
2936 | * XXX: we probably shouldn't be doing this if the transaction |
2937 | * was restarted, but currently we still overflow transaction |
2938 | * iterators if we do that |
2939 | */ |
2940 | if (!path->ref && !path->preserve) |
2941 | __bch2_path_free(trans, path: i); |
2942 | else |
2943 | path->preserve = false; |
2944 | } |
2945 | |
2946 | now = local_clock(); |
2947 | |
2948 | if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) && |
2949 | time_after64(now, trans->last_begin_time + 10)) |
		__bch2_time_stats_update(&btree_trans_stats(trans)->duration,
2951 | trans->last_begin_time, now); |
2952 | |
2953 | if (!trans->restarted && |
2954 | (need_resched() || |
2955 | time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) { |
2956 | drop_locks_do(trans, (cond_resched(), 0)); |
2957 | now = local_clock(); |
2958 | } |
2959 | trans->last_begin_time = now; |
2960 | |
2961 | if (unlikely(trans->srcu_held && |
2962 | time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) |
2963 | bch2_trans_srcu_unlock(trans); |
2964 | |
2965 | trans->last_begin_ip = _RET_IP_; |
2966 | if (trans->restarted) { |
2967 | bch2_btree_path_traverse_all(trans); |
2968 | trans->notrace_relock_fail = false; |
2969 | } |
2970 | |
2971 | return trans->restart_count; |
2972 | } |
2973 | |
2974 | const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" }; |
2975 | |
2976 | unsigned bch2_trans_get_fn_idx(const char *fn) |
2977 | { |
2978 | for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) |
2979 | if (!bch2_btree_transaction_fns[i] || |
2980 | bch2_btree_transaction_fns[i] == fn) { |
2981 | bch2_btree_transaction_fns[i] = fn; |
2982 | return i; |
2983 | } |
2984 | |
2985 | pr_warn_once("BCH_TRANSACTIONS_NR not big enough!" ); |
2986 | return 0; |
2987 | } |
2988 | |
2989 | struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) |
2990 | __acquires(&c->btree_trans_barrier) |
2991 | { |
2992 | struct btree_trans *trans; |
2993 | |
2994 | if (IS_ENABLED(__KERNEL__)) { |
2995 | trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); |
2996 | if (trans) { |
2997 | memset(trans, 0, offsetof(struct btree_trans, list)); |
2998 | goto got_trans; |
2999 | } |
3000 | } |
3001 | |
	trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
	memset(trans, 0, sizeof(*trans));
	closure_init_stack(&trans->ref);

	seqmutex_lock(&c->btree_trans_lock);
3007 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { |
3008 | struct btree_trans *pos; |
3009 | pid_t pid = current->pid; |
3010 | |
3011 | trans->locking_wait.task = current; |
3012 | |
3013 | list_for_each_entry(pos, &c->btree_trans_list, list) { |
3014 | struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); |
3015 | /* |
3016 | * We'd much prefer to be stricter here and completely |
3017 | * disallow multiple btree_trans in the same thread - |
3018 | * but the data move path calls bch2_write when we |
3019 | * already have a btree_trans initialized. |
3020 | */ |
3021 | BUG_ON(pos_task && |
3022 | pid == pos_task->pid && |
3023 | bch2_trans_locked(pos)); |
3024 | |
3025 | if (pos_task && pid < pos_task->pid) { |
				list_add_tail(&trans->list, &pos->list);
3027 | goto list_add_done; |
3028 | } |
3029 | } |
3030 | } |
	list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
	seqmutex_unlock(&c->btree_trans_lock);
3034 | got_trans: |
3035 | trans->c = c; |
3036 | trans->last_begin_time = local_clock(); |
3037 | trans->fn_idx = fn_idx; |
3038 | trans->locking_wait.task = current; |
3039 | trans->journal_replay_not_finished = |
3040 | unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && |
		atomic_inc_not_zero(&c->journal_keys.ref);
3042 | trans->nr_paths = ARRAY_SIZE(trans->_paths); |
3043 | trans->paths_allocated = trans->_paths_allocated; |
3044 | trans->sorted = trans->_sorted; |
3045 | trans->paths = trans->_paths; |
3046 | trans->updates = trans->_updates; |
3047 | |
	*trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
3049 | |
3050 | trans->paths_allocated[0] = 1; |
3051 | |
3052 | if (fn_idx < BCH_TRANSACTIONS_NR) { |
3053 | trans->fn = bch2_btree_transaction_fns[fn_idx]; |
3054 | |
3055 | struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; |
3056 | |
3057 | if (s->max_mem) { |
3058 | unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); |
3059 | |
			trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
3061 | if (likely(trans->mem)) |
3062 | trans->mem_bytes = expected_mem_bytes; |
3063 | } |
3064 | |
3065 | trans->nr_paths_max = s->nr_max_paths; |
3066 | trans->journal_entries_size = s->journal_entries_size; |
3067 | } |
3068 | |
	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
3070 | trans->srcu_lock_time = jiffies; |
3071 | trans->srcu_held = true; |
3072 | return trans; |
3073 | } |
3074 | |
3075 | static void check_btree_paths_leaked(struct btree_trans *trans) |
3076 | { |
3077 | #ifdef CONFIG_BCACHEFS_DEBUG |
3078 | struct bch_fs *c = trans->c; |
3079 | struct btree_path *path; |
3080 | unsigned i; |
3081 | |
3082 | trans_for_each_path(trans, path, i) |
3083 | if (path->ref) |
3084 | goto leaked; |
3085 | return; |
3086 | leaked: |
3087 | bch_err(c, "btree paths leaked from %s!" , trans->fn); |
3088 | trans_for_each_path(trans, path, i) |
3089 | if (path->ref) |
3090 | printk(KERN_ERR " btree %s %pS\n" , |
3091 | bch2_btree_id_str(path->btree_id), |
3092 | (void *) path->ip_allocated); |
3093 | /* Be noisy about this: */ |
3094 | bch2_fatal_error(c); |
3095 | #endif |
3096 | } |
3097 | |
3098 | void bch2_trans_put(struct btree_trans *trans) |
3099 | __releases(&c->btree_trans_barrier) |
3100 | { |
3101 | struct bch_fs *c = trans->c; |
3102 | |
3103 | bch2_trans_unlock(trans); |
3104 | |
3105 | trans_for_each_update(trans, i) |
		__btree_path_put(trans->paths + i->path, true);
3107 | trans->nr_updates = 0; |
3108 | trans->locking_wait.task = NULL; |
3109 | |
3110 | check_btree_paths_leaked(trans); |
3111 | |
3112 | if (trans->srcu_held) { |
3113 | check_srcu_held_too_long(trans); |
		srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
3115 | } |
3116 | |
3117 | if (trans->fs_usage_deltas) { |
3118 | if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == |
3119 | REPLICAS_DELTA_LIST_MAX) |
			mempool_free(trans->fs_usage_deltas,
				     &c->replicas_delta_pool);
		else
			kfree(trans->fs_usage_deltas);
3124 | } |
3125 | |
3126 | if (unlikely(trans->journal_replay_not_finished)) |
3127 | bch2_journal_keys_put(c); |
3128 | |
3129 | unsigned long *paths_allocated = trans->paths_allocated; |
3130 | trans->paths_allocated = NULL; |
3131 | trans->paths = NULL; |
3132 | |
3133 | if (paths_allocated != trans->_paths_allocated) |
3134 | kvfree_rcu_mightsleep(paths_allocated); |
3135 | |
3136 | if (trans->used_mempool) |
		mempool_free(trans->mem, &c->btree_trans_mem_pool);
	else
		kfree(trans->mem);
3140 | |
3141 | /* Userspace doesn't have a real percpu implementation: */ |
3142 | if (IS_ENABLED(__KERNEL__)) |
3143 | trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); |
3144 | |
3145 | if (trans) { |
		closure_sync(&trans->ref);

		seqmutex_lock(&c->btree_trans_lock);
		list_del(&trans->list);
		seqmutex_unlock(&c->btree_trans_lock);

		mempool_free(trans, &c->btree_trans_pool);
3153 | } |
3154 | } |
3155 | |
3156 | static void __maybe_unused |
3157 | bch2_btree_bkey_cached_common_to_text(struct printbuf *out, |
3158 | struct btree_bkey_cached_common *b) |
3159 | { |
3160 | struct six_lock_count c = six_lock_counts(&b->lock); |
3161 | struct task_struct *owner; |
3162 | pid_t pid; |
3163 | |
3164 | rcu_read_lock(); |
3165 | owner = READ_ONCE(b->lock.owner); |
3166 | pid = owner ? owner->pid : 0; |
3167 | rcu_read_unlock(); |
3168 | |
3169 | prt_tab(out); |
3170 | prt_printf(out, "%px %c l=%u %s:" , b, b->cached ? 'c' : 'b', |
3171 | b->level, bch2_btree_id_str(b->btree_id)); |
3172 | bch2_bpos_to_text(out, btree_node_pos(b)); |
3173 | |
3174 | prt_tab(out); |
3175 | prt_printf(out, " locks %u:%u:%u held by pid %u" , |
3176 | c.n[0], c.n[1], c.n[2], pid); |
3177 | } |
3178 | |
3179 | void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) |
3180 | { |
3181 | struct btree_bkey_cached_common *b; |
3182 | static char lock_types[] = { 'r', 'i', 'w' }; |
3183 | struct task_struct *task = READ_ONCE(trans->locking_wait.task); |
3184 | unsigned l, idx; |
3185 | |
3186 | /* before rcu_read_lock(): */ |
3187 | bch2_printbuf_make_room(out, 4096); |
3188 | |
3189 | if (!out->nr_tabstops) { |
3190 | printbuf_tabstop_push(out, 16); |
3191 | printbuf_tabstop_push(out, 32); |
3192 | } |
3193 | |
3194 | prt_printf(out, "%i %s\n" , task ? task->pid : 0, trans->fn); |
3195 | |
3196 | /* trans->paths is rcu protected vs. freeing */ |
3197 | rcu_read_lock(); |
3198 | out->atomic++; |
3199 | |
3200 | struct btree_path *paths = rcu_dereference(trans->paths); |
3201 | if (!paths) |
3202 | goto out; |
3203 | |
3204 | unsigned long *paths_allocated = trans_paths_allocated(paths); |
3205 | |
3206 | trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { |
3207 | struct btree_path *path = paths + idx; |
3208 | if (!path->nodes_locked) |
3209 | continue; |
3210 | |
3211 | prt_printf(out, " path %u %c l=%u %s:" , |
3212 | idx, |
3213 | path->cached ? 'c' : 'b', |
3214 | path->level, |
3215 | bch2_btree_id_str(path->btree_id)); |
3216 | bch2_bpos_to_text(out, path->pos); |
3217 | prt_newline(out); |
3218 | |
3219 | for (l = 0; l < BTREE_MAX_DEPTH; l++) { |
			if (btree_node_locked(path, l) &&
			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
				prt_printf(out, " %c l=%u ",
3223 | lock_types[btree_node_locked_type(path, l)], l); |
3224 | bch2_btree_bkey_cached_common_to_text(out, b); |
3225 | prt_newline(out); |
3226 | } |
3227 | } |
3228 | } |
3229 | |
3230 | b = READ_ONCE(trans->locking); |
3231 | if (b) { |
3232 | prt_printf(out, " blocked for %lluus on" , |
3233 | div_u64(local_clock() - trans->locking_wait.start_time, |
3234 | 1000)); |
3235 | prt_newline(out); |
3236 | prt_printf(out, " %c" , lock_types[trans->locking_wait.lock_want]); |
3237 | bch2_btree_bkey_cached_common_to_text(out, b); |
3238 | prt_newline(out); |
3239 | } |
3240 | out: |
3241 | --out->atomic; |
3242 | rcu_read_unlock(); |
3243 | } |
3244 | |
3245 | void bch2_fs_btree_iter_exit(struct bch_fs *c) |
3246 | { |
3247 | struct btree_transaction_stats *s; |
3248 | struct btree_trans *trans; |
3249 | int cpu; |
3250 | |
3251 | if (c->btree_trans_bufs) |
3252 | for_each_possible_cpu(cpu) { |
3253 | struct btree_trans *trans = |
3254 | per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; |
3255 | |
3256 | if (trans) { |
			closure_sync(&trans->ref);

			seqmutex_lock(&c->btree_trans_lock);
			list_del(&trans->list);
			seqmutex_unlock(&c->btree_trans_lock);
		}
		kfree(trans);
	}
	free_percpu(c->btree_trans_bufs);
3266 | |
3267 | trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); |
3268 | if (trans) |
3269 | panic(fmt: "%s leaked btree_trans\n" , trans->fn); |
3270 | |
3271 | for (s = c->btree_transaction_stats; |
3272 | s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); |
3273 | s++) { |
		kfree(s->max_paths_text);
3275 | bch2_time_stats_exit(&s->lock_hold_times); |
3276 | } |
3277 | |
3278 | if (c->btree_trans_barrier_initialized) |
3279 | cleanup_srcu_struct(ssp: &c->btree_trans_barrier); |
3280 | mempool_exit(pool: &c->btree_trans_mem_pool); |
3281 | mempool_exit(pool: &c->btree_trans_pool); |
3282 | } |
3283 | |
3284 | void bch2_fs_btree_iter_init_early(struct bch_fs *c) |
3285 | { |
3286 | struct btree_transaction_stats *s; |
3287 | |
3288 | for (s = c->btree_transaction_stats; |
3289 | s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); |
3290 | s++) { |
3291 | bch2_time_stats_init(&s->duration); |
3292 | bch2_time_stats_init(&s->lock_hold_times); |
3293 | mutex_init(&s->lock); |
3294 | } |
3295 | |
	INIT_LIST_HEAD(&c->btree_trans_list);
3297 | seqmutex_init(&c->btree_trans_lock); |
3298 | } |
3299 | |
3300 | int bch2_fs_btree_iter_init(struct bch_fs *c) |
3301 | { |
3302 | int ret; |
3303 | |
3304 | c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); |
3305 | if (!c->btree_trans_bufs) |
3306 | return -ENOMEM; |
3307 | |
	ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
					sizeof(struct btree_trans)) ?:
		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
					  BTREE_TRANS_MEM_MAX) ?:
3312 | init_srcu_struct(&c->btree_trans_barrier); |
3313 | if (!ret) |
3314 | c->btree_trans_barrier_initialized = true; |
3315 | return ret; |
3316 | } |
3317 | |