// SPDX-License-Identifier: GPL-2.0
/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "btree_cache.h"
#include "bset.h"
#include "eytzinger.h"
#include "trace.h"
#include "util.h"

#include <asm/unaligned.h>
#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>

static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
						  struct btree *);

static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
{
	unsigned n = ARRAY_SIZE(iter->data);

	while (n && __btree_node_iter_set_end(iter, n - 1))
		--n;

	return n;
}

struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
	return bch2_bkey_to_bset_inlined(b, k);
}

/*
 * There are never duplicate live keys in the btree - but including keys that
 * have been flagged as deleted (and will be cleaned up later) we _will_ see
 * duplicates.
 *
 * Thus the sort order is: usual key comparison first, but for keys that compare
 * equal the deleted key(s) come first, and the (at most one) live version comes
 * last.
 *
 * The main reason for this is insertion: to handle overwrites, we first iterate
 * over keys that compare equal to our insert key, and then insert immediately
 * prior to the first key greater than the key we're inserting - our insert
 * position will be after all keys that compare equal to our insert key, which
 * by the time we actually do the insert will all be deleted.
 */
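
/*
 * For illustration only (a sketch, not real data): three keys at the same
 * position, two of them deleted, sort as
 *
 *	(POS(1, 8), deleted) (POS(1, 8), deleted) (POS(1, 8), live) (POS(1, 9), ...)
 *
 * and an insert at POS(1, 8) goes immediately before (POS(1, 9), ...), i.e.
 * after every key that compares equal to it - which, by the time the insert
 * happens, are all deleted.
 */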

void bch2_dump_bset(struct bch_fs *c, struct btree *b,
		    struct bset *i, unsigned set)
{
	struct bkey_packed *_k, *_n;
	struct bkey uk, n;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;

	if (!i->u64s)
		return;

	for (_k = i->start;
	     _k < vstruct_last(i);
	     _k = _n) {
		_n = bkey_p_next(_k);

		if (!_k->u64s) {
			printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
			       _k->_data - i->_data);
			break;
		}

		k = bkey_disassemble(b, _k, &uk);

		printbuf_reset(&buf);
		if (c)
			bch2_bkey_val_to_text(&buf, c, k);
		else
			bch2_bkey_to_text(&buf, k.k);
		printk(KERN_ERR "block %u key %5zu: %s\n", set,
		       _k->_data - i->_data, buf.buf);

		if (_n == vstruct_last(i))
			continue;

		n = bkey_unpack_key(b, _n);

		if (bpos_lt(n.p, k.k->p)) {
			printk(KERN_ERR "Key skipped backwards\n");
			continue;
		}

		if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p))
			printk(KERN_ERR "Duplicate keys\n");
	}

	printbuf_exit(&buf);
}

void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
{
	struct bset_tree *t;

	console_lock();
	for_each_bset(b, t)
		bch2_dump_bset(c, b, bset(b, t), t - b->set);
	console_unlock();
}

void bch2_dump_btree_node_iter(struct btree *b,
			       struct btree_node_iter *iter)
{
	struct btree_node_iter_set *set;
	struct printbuf buf = PRINTBUF;

	printk(KERN_ERR "btree node iter with %u/%u sets:\n",
	       __btree_node_iter_used(iter), b->nsets);

	btree_node_iter_for_each(iter, set) {
		struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
		struct bset_tree *t = bch2_bkey_to_bset(b, k);
		struct bkey uk = bkey_unpack_key(b, k);

		printbuf_reset(&buf);
		bch2_bkey_to_text(&buf, &uk);
		printk(KERN_ERR "set %zu key %u: %s\n",
		       t - b->set, set->k, buf.buf);
	}

	printbuf_exit(&buf);
}

struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
{
	struct bset_tree *t;
	struct bkey_packed *k;
	struct btree_nr_keys nr = {};

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k))
				btree_keys_account_key_add(&nr, t - b->set, k);
	return nr;
}

#ifdef CONFIG_BCACHEFS_DEBUG

void __bch2_verify_btree_nr_keys(struct btree *b)
{
	struct btree_nr_keys nr = bch2_btree_node_count_keys(b);

	BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}

static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
					    struct btree *b)
{
	struct btree_node_iter iter = *_iter;
	const struct bkey_packed *k, *n;

	k = bch2_btree_node_iter_peek_all(&iter, b);
	__bch2_btree_node_iter_advance(&iter, b);
	n = bch2_btree_node_iter_peek_all(&iter, b);

	bkey_unpack_key(b, k);

	if (n &&
	    bkey_iter_cmp(b, k, n) > 0) {
		struct btree_node_iter_set *set;
		struct bkey ku = bkey_unpack_key(b, k);
		struct bkey nu = bkey_unpack_key(b, n);
		struct printbuf buf1 = PRINTBUF;
		struct printbuf buf2 = PRINTBUF;

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &ku);
		bch2_bkey_to_text(&buf2, &nu);
		printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
		       buf1.buf, buf2.buf);
		printk(KERN_ERR "iter was:");

		btree_node_iter_for_each(_iter, set) {
			struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
			struct bset_tree *t = bch2_bkey_to_bset(b, k2);
			printk(" [%zi %zi]", t - b->set,
			       k2->_data - bset(b, t)->_data);
		}
		panic("\n");
	}
}

void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
				 struct btree *b)
{
	struct btree_node_iter_set *set, *s2;
	struct bkey_packed *k, *p;
	struct bset_tree *t;

	if (bch2_btree_node_iter_end(iter))
		return;

	/* Verify no duplicates: */
	btree_node_iter_for_each(iter, set) {
		BUG_ON(set->k > set->end);
		btree_node_iter_for_each(iter, s2)
			BUG_ON(set != s2 && set->end == s2->end);
	}

	/* Verify that set->end is correct: */
	btree_node_iter_for_each(iter, set) {
		for_each_bset(b, t)
			if (set->end == t->end_offset)
				goto found;
		BUG();
found:
		BUG_ON(set->k < btree_bkey_first_offset(t) ||
		       set->k >= t->end_offset);
	}

	/* Verify iterator is sorted: */
	btree_node_iter_for_each(iter, set)
		BUG_ON(set != iter->data &&
		       btree_node_iter_cmp(b, set[-1], set[0]) > 0);

	k = bch2_btree_node_iter_peek_all(iter, b);

	for_each_bset(b, t) {
		if (iter->data[0].end == t->end_offset)
			continue;

		p = bch2_bkey_prev_all(b, t,
			bch2_btree_node_iter_bset_pos(iter, b, t));

		BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
	}
}

void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
			    struct bkey_packed *insert, unsigned clobber_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset(b, where);
	struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
	struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
	struct printbuf buf1 = PRINTBUF;
	struct printbuf buf2 = PRINTBUF;
#if 0
	BUG_ON(prev &&
	       bkey_iter_cmp(b, prev, insert) > 0);
#else
	if (prev &&
	    bkey_iter_cmp(b, prev, insert) > 0) {
		struct bkey k1 = bkey_unpack_key(b, prev);
		struct bkey k2 = bkey_unpack_key(b, insert);

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &k1);
		bch2_bkey_to_text(&buf2, &k2);

		panic("prev > insert:\n"
		      "prev key %s\n"
		      "insert key %s\n",
		      buf1.buf, buf2.buf);
	}
#endif
#if 0
	BUG_ON(next != btree_bkey_last(b, t) &&
	       bkey_iter_cmp(b, insert, next) > 0);
#else
	if (next != btree_bkey_last(b, t) &&
	    bkey_iter_cmp(b, insert, next) > 0) {
		struct bkey k1 = bkey_unpack_key(b, insert);
		struct bkey k2 = bkey_unpack_key(b, next);

		bch2_dump_btree_node(NULL, b);
		bch2_bkey_to_text(&buf1, &k1);
		bch2_bkey_to_text(&buf2, &k2);

		panic("insert > next:\n"
		      "insert key %s\n"
		      "next key %s\n",
		      buf1.buf, buf2.buf);
	}
#endif
}

#else

static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
						   struct btree *b) {}

#endif

/* Auxiliary search trees */

#define BFLOAT_FAILED_UNPACKED	U8_MAX
#define BFLOAT_FAILED		U8_MAX

struct bkey_float {
	u8		exponent;
	u8		key_offset;
	u16		mantissa;
};
#define BKEY_MANTISSA_BITS	16

static unsigned bkey_float_byte_offset(unsigned idx)
{
	return idx * sizeof(struct bkey_float);
}

struct ro_aux_tree {
	u8			nothing[0];
	struct bkey_float	f[];
};

struct rw_aux_tree {
	u16		offset;
	struct bpos	k;
};

static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
{
	BUG_ON(t->aux_data_offset == U16_MAX);

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		return t->aux_data_offset;
	case BSET_RO_AUX_TREE:
		return t->aux_data_offset +
			DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
				     t->size * sizeof(u8), 8);
	case BSET_RW_AUX_TREE:
		return t->aux_data_offset +
			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
	default:
		BUG();
	}
}

static unsigned bset_aux_tree_buf_start(const struct btree *b,
					const struct bset_tree *t)
{
	return t == b->set
		? DIV_ROUND_UP(b->unpack_fn_len, 8)
		: bset_aux_tree_buf_end(t - 1);
}

static void *__aux_tree_base(const struct btree *b,
			     const struct bset_tree *t)
{
	return b->aux_data + t->aux_data_offset * 8;
}

static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
					    const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

	return __aux_tree_base(b, t);
}

static u8 *ro_aux_tree_prev(const struct btree *b,
			    const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);

	return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}

static struct bkey_float *bkey_float(const struct btree *b,
				     const struct bset_tree *t,
				     unsigned idx)
{
	return ro_aux_tree_base(b, t)->f + idx;
}

static void bset_aux_tree_verify(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	const struct bset_tree *t;

	for_each_bset(b, t) {
		if (t->aux_data_offset == U16_MAX)
			continue;

		BUG_ON(t != b->set &&
		       t[-1].aux_data_offset == U16_MAX);

		BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
		BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
		BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
	}
#endif
}

void bch2_btree_keys_init(struct btree *b)
{
	unsigned i;

	b->nsets = 0;
	memset(&b->nr, 0, sizeof(b->nr));

	for (i = 0; i < MAX_BSETS; i++)
		b->set[i].data_offset = U16_MAX;

	bch2_bset_set_no_aux_tree(b, b->set);
}

/* Binary tree stuff for auxiliary search trees */

/*
 * Cacheline/offset <-> bkey pointer arithmetic:
 *
 * t->tree is a binary search tree in an array; each node corresponds to a key
 * in one cacheline in t->set (BSET_CACHELINE bytes).
 *
 * This means we don't have to store the full index of the key that a node in
 * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline,
 * and then bkey_float->key_offset gives us the offset within that cacheline,
 * in units of 8 bytes.
 *
 * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
 * make this work.
 *
 * To construct the bfloat for an arbitrary key we need to know what the key
 * immediately preceding it is: we have to check if the two keys differ in the
 * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
 * of the previous key so we can walk backwards to it from t->tree[j]'s key.
 */
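
/*
 * A worked example with assumed numbers, for illustration only: with
 * BSET_CACHELINE = 256, a node j in the aux tree whose
 * eytzinger1_to_inorder() index is 3 and whose bkey_float->key_offset is 5
 * refers to the key 3 * 256 + 5 * 8 bytes past the start of cacheline 0 -
 * exactly the sum that bset_cacheline() and cacheline_to_bkey() below
 * compute.
 */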

static inline void *bset_cacheline(const struct btree *b,
				   const struct bset_tree *t,
				   unsigned cacheline)
{
	return (void *) round_down((unsigned long) btree_bkey_first(b, t),
				   L1_CACHE_BYTES) +
		cacheline * BSET_CACHELINE;
}

static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
					     const struct bset_tree *t,
					     unsigned cacheline,
					     unsigned offset)
{
	return bset_cacheline(b, t, cacheline) + offset * 8;
}

static unsigned bkey_to_cacheline(const struct btree *b,
				  const struct bset_tree *t,
				  const struct bkey_packed *k)
{
	return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
}

static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
					  const struct bset_tree *t,
					  unsigned cacheline,
					  const struct bkey_packed *k)
{
	return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
}

static unsigned bkey_to_cacheline_offset(const struct btree *b,
					 const struct bset_tree *t,
					 unsigned cacheline,
					 const struct bkey_packed *k)
{
	size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);

	EBUG_ON(m > U8_MAX);
	return m;
}

static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
					       const struct bset_tree *t,
					       unsigned j)
{
	return cacheline_to_bkey(b, t,
			__eytzinger1_to_inorder(j, t->size - 1, t->extra),
			bkey_float(b, t, j)->key_offset);
}

static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
					     const struct bset_tree *t,
					     unsigned j)
{
	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];

	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
}

static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
				       const struct bset_tree *t)
{
	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);

	return __aux_tree_base(b, t);
}

/*
 * For the write set - the one we're currently inserting keys into - we don't
 * maintain a full search tree, we just keep a simple lookup table in t->prev.
 */
static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
					  struct bset_tree *t,
					  unsigned j)
{
	return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
}

static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
			    unsigned j, struct bkey_packed *k)
{
	EBUG_ON(k >= btree_bkey_last(b, t));

	rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
		.offset	= __btree_node_key_to_offset(b, k),
		.k	= bkey_unpack_pos(b, k),
	};
}
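
/*
 * For illustration only (a sketch, not real data): with roughly one entry per
 * cacheline's worth of keys, the rw aux tree might look like
 *
 *	{ .offset =  8, .k = POS(1, 0)    }
 *	{ .offset = 40, .k = POS(1, 4096) }
 *	{ .offset = 72, .k = POS(2, 512)  }
 *
 * offsets are strictly increasing, and each .k caches the unpacked position
 * of the key at .offset so lookups can binary search on positions without
 * unpacking any keys.
 */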

static void bch2_bset_verify_rw_aux_tree(struct btree *b,
					 struct bset_tree *t)
{
	struct bkey_packed *k = btree_bkey_first(b, t);
	unsigned j = 0;

	if (!bch2_expensive_debug_checks)
		return;

	BUG_ON(bset_has_ro_aux_tree(t));

	if (!bset_has_rw_aux_tree(t))
		return;

	BUG_ON(t->size < 1);
	BUG_ON(rw_aux_to_bkey(b, t, j) != k);

	goto start;
	while (1) {
		if (rw_aux_to_bkey(b, t, j) == k) {
			BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k,
					bkey_unpack_pos(b, k)));
start:
			if (++j == t->size)
				break;

			BUG_ON(rw_aux_tree(b, t)[j].offset <=
			       rw_aux_tree(b, t)[j - 1].offset);
		}

		k = bkey_p_next(k);
		BUG_ON(k >= btree_bkey_last(b, t));
	}
}

/* returns idx of first entry >= offset: */
static unsigned rw_aux_tree_bsearch(struct btree *b,
				    struct bset_tree *t,
				    unsigned offset)
{
	unsigned bset_offs = offset - btree_bkey_first_offset(t);
	unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
	unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;

	EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
	EBUG_ON(!t->size);
	EBUG_ON(idx > t->size);

	while (idx < t->size &&
	       rw_aux_tree(b, t)[idx].offset < offset)
		idx++;

	while (idx &&
	       rw_aux_tree(b, t)[idx - 1].offset >= offset)
		idx--;

	EBUG_ON(idx < t->size &&
		rw_aux_tree(b, t)[idx].offset < offset);
	EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
	EBUG_ON(idx + 1 < t->size &&
		rw_aux_tree(b, t)[idx].offset ==
		rw_aux_tree(b, t)[idx + 1].offset);

	return idx;
}

static inline unsigned bkey_mantissa(const struct bkey_packed *k,
				     const struct bkey_float *f,
				     unsigned idx)
{
	u64 v;

	EBUG_ON(!bkey_packed(k));

	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));

	/*
	 * In little endian, we're shifting off low bits (and then the bits we
	 * want are at the low end), in big endian we're shifting off high bits
	 * (and then the bits we want are at the high end, so we shift them
	 * back down):
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	v >>= f->exponent & 7;
#else
	v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
#endif
	return (u16) v;
}
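
/*
 * Worked example for bkey_mantissa(), little endian, assumed numbers: with
 * f->exponent == 13 we load the u64 at byte offset 13 >> 3 == 1 into the
 * packed key, then shift right by 13 & 7 == 5, leaving the 16 mantissa bits
 * we want in the low bits of v.
 */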

static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
					unsigned j,
					struct bkey_packed *min_key,
					struct bkey_packed *max_key)
{
	struct bkey_float *f = bkey_float(b, t, j);
	struct bkey_packed *m = tree_to_bkey(b, t, j);
	struct bkey_packed *l = is_power_of_2(j)
		? min_key
		: tree_to_prev_bkey(b, t, j >> ffs(j));
	struct bkey_packed *r = is_power_of_2(j + 1)
		? max_key
		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
	unsigned mantissa;
	int shift, exponent, high_bit;

	/*
	 * for failed bfloats, the lookup code falls back to comparing against
	 * the original key.
	 */

	if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
	    !b->nr_key_bits) {
		f->exponent = BFLOAT_FAILED_UNPACKED;
		return;
	}

	/*
	 * The greatest differing bit of l and r is the first bit we must
	 * include in the bfloat mantissa we're creating in order to do
	 * comparisons - that bit always becomes the high bit of
	 * bfloat->mantissa, and thus the exponent we're calculating here is
	 * the position of what will become the low bit in bfloat->mantissa:
	 *
	 * Note that this may be negative - we may be running off the low end
	 * of the key: we handle this later:
	 */
	high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
		       min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
	exponent = high_bit - (BKEY_MANTISSA_BITS - 1);

	/*
	 * Then we calculate the actual shift value, from the start of the key
	 * (k->_data), to get the key bits starting at exponent:
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;

	EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
#else
	shift = high_bit_offset +
		b->nr_key_bits -
		exponent -
		BKEY_MANTISSA_BITS;

	EBUG_ON(shift < KEY_PACKED_BITS_START);
#endif
	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);

	f->exponent = shift;
	mantissa = bkey_mantissa(m, f, j);

	/*
	 * If we've got garbage bits, set them to all 1s - it's legal for the
	 * bfloat to compare larger than the original key, but not smaller:
	 */
	if (exponent < 0)
		mantissa |= ~(~0U << -exponent);

	f->mantissa = mantissa;
}
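
/*
 * Worked example for the garbage bits fixup above, with assumed numbers: if
 * high_bit == 10, then exponent == 10 - (16 - 1) == -5, i.e. the mantissa
 * window runs 5 bits off the low end of the key. mantissa |= ~(~0U << 5)
 * forces those 5 low bits to 1s, so the bfloat can only compare larger than
 * the original key, never smaller - the invariant the lookup code relies on.
 */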

/* bytes remaining - only valid for last bset: */
static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	bset_aux_tree_verify(b);

	return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}

static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	return __bset_tree_capacity(b, t) /
		(sizeof(struct bkey_float) + sizeof(u8));
}

static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
	return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}

static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
	struct bkey_packed *k;

	t->size = 1;
	t->extra = BSET_RW_AUX_TREE_VAL;
	rw_aux_tree(b, t)[0].offset =
		__btree_node_key_to_offset(b, btree_bkey_first(b, t));

	bset_tree_for_each_key(b, t, k) {
		if (t->size == bset_rw_tree_capacity(b, t))
			break;

		if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
		    L1_CACHE_BYTES)
			rw_aux_tree_set(b, t, t->size++, k);
	}
}

static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
	struct bkey_i min_key, max_key;
	unsigned cacheline = 1;

	t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
		      bset_ro_tree_capacity(b, t));
retry:
	if (t->size < 2) {
		t->size = 0;
		t->extra = BSET_NO_AUX_TREE_VAL;
		return;
	}

	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;

	/* First we figure out where the first key in each cacheline is */
	eytzinger1_for_each(j, t->size - 1) {
		while (bkey_to_cacheline(b, t, k) < cacheline)
			prev = k, k = bkey_p_next(k);

		if (k >= btree_bkey_last(b, t)) {
			/* XXX: this path sucks */
			t->size--;
			goto retry;
		}

		ro_aux_tree_prev(b, t)[j] = prev->u64s;
		bkey_float(b, t, j)->key_offset =
			bkey_to_cacheline_offset(b, t, cacheline++, k);

		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
		EBUG_ON(tree_to_bkey(b, t, j) != k);
	}

	while (k != btree_bkey_last(b, t))
		prev = k, k = bkey_p_next(k);

	if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
		bkey_init(&min_key.k);
		min_key.k.p = b->data->min_key;
	}

	if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
		bkey_init(&max_key.k);
		max_key.k.p = b->data->max_key;
	}

	/* Then we build the tree */
	eytzinger1_for_each(j, t->size - 1)
		make_bfloat(b, t, j,
			    bkey_to_packed(&min_key),
			    bkey_to_packed(&max_key));
}

static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
{
	struct bset_tree *i;

	for (i = b->set; i != t; i++)
		BUG_ON(bset_has_rw_aux_tree(i));

	bch2_bset_set_no_aux_tree(b, t);

	/* round up to next cacheline: */
	t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
				      SMP_CACHE_BYTES / sizeof(u64));

	bset_aux_tree_verify(b);
}

void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
			      bool writeable)
{
	if (writeable
	    ? bset_has_rw_aux_tree(t)
	    : bset_has_ro_aux_tree(t))
		return;

	bset_alloc_tree(b, t);

	if (!__bset_tree_capacity(b, t))
		return;

	if (writeable)
		__build_rw_aux_tree(b, t);
	else
		__build_ro_aux_tree(b, t);

	bset_aux_tree_verify(b);
}

void bch2_bset_init_first(struct btree *b, struct bset *i)
{
	struct bset_tree *t;

	BUG_ON(b->nsets);

	memset(i, 0, sizeof(*i));
	get_random_bytes(&i->seq, sizeof(i->seq));
	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

	t = &b->set[b->nsets++];
	set_btree_bset(b, t, i);
}

void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
	struct bset *i = &bne->keys;
	struct bset_tree *t;

	BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
	BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
	BUG_ON(b->nsets >= MAX_BSETS);

	memset(i, 0, sizeof(*i));
	i->seq = btree_bset_first(b)->seq;
	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);

	t = &b->set[b->nsets++];
	set_btree_bset(b, t, i);
}

/*
 * find _some_ key in the same bset as @k that precedes @k - not necessarily the
 * immediate predecessor:
 */
static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
				       struct bkey_packed *k)
{
	struct bkey_packed *p;
	unsigned offset;
	int j;

	EBUG_ON(k < btree_bkey_first(b, t) ||
		k > btree_bkey_last(b, t));

	if (k == btree_bkey_first(b, t))
		return NULL;

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		p = btree_bkey_first(b, t);
		break;
	case BSET_RO_AUX_TREE:
		j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));

		do {
			p = j ? tree_to_bkey(b, t,
					__inorder_to_eytzinger1(j--,
							t->size - 1, t->extra))
			      : btree_bkey_first(b, t);
		} while (p >= k);
		break;
	case BSET_RW_AUX_TREE:
		offset = __btree_node_key_to_offset(b, k);
		j = rw_aux_tree_bsearch(b, t, offset);
		p = j ? rw_aux_to_bkey(b, t, j - 1)
		      : btree_bkey_first(b, t);
		break;
	}

	return p;
}

struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
					  struct bset_tree *t,
					  struct bkey_packed *k,
					  unsigned min_key_type)
{
	struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;

	while ((p = __bkey_prev(b, t, k)) && !ret) {
		for (i = p; i != k; i = bkey_p_next(i))
			if (i->type >= min_key_type)
				ret = i;

		k = p;
	}

	if (bch2_expensive_debug_checks) {
		BUG_ON(ret >= orig_k);

		for (i = ret
			? bkey_p_next(ret)
			: btree_bkey_first(b, t);
		     i != orig_k;
		     i = bkey_p_next(i))
			BUG_ON(i->type >= min_key_type);
	}

	return ret;
}

/* Insert */

static void bch2_bset_fix_lookup_table(struct btree *b,
				       struct bset_tree *t,
				       struct bkey_packed *_where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	int shift = new_u64s - clobber_u64s;
	unsigned l, j, where = __btree_node_key_to_offset(b, _where);

	EBUG_ON(bset_has_ro_aux_tree(t));

	if (!bset_has_rw_aux_tree(t))
		return;

	/* returns first entry >= where */
	l = rw_aux_tree_bsearch(b, t, where);

	if (!l) /* never delete first entry */
		l++;
	else if (l < t->size &&
		 where < t->end_offset &&
		 rw_aux_tree(b, t)[l].offset == where)
		rw_aux_tree_set(b, t, l++, _where);

	/* l now > where */

	for (j = l;
	     j < t->size &&
	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
	     j++)
		;

	if (j < t->size &&
	    rw_aux_tree(b, t)[j].offset + shift ==
	    rw_aux_tree(b, t)[l - 1].offset)
		j++;

	memmove(&rw_aux_tree(b, t)[l],
		&rw_aux_tree(b, t)[j],
		(void *) &rw_aux_tree(b, t)[t->size] -
		(void *) &rw_aux_tree(b, t)[j]);
	t->size -= j - l;

	for (j = l; j < t->size; j++)
		rw_aux_tree(b, t)[j].offset += shift;

	EBUG_ON(l < t->size &&
		rw_aux_tree(b, t)[l].offset ==
		rw_aux_tree(b, t)[l - 1].offset);

	if (t->size < bset_rw_tree_capacity(b, t) &&
	    (l < t->size
	     ? rw_aux_tree(b, t)[l].offset
	     : t->end_offset) -
	    rw_aux_tree(b, t)[l - 1].offset >
	    L1_CACHE_BYTES / sizeof(u64)) {
		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
		struct bkey_packed *end = l < t->size
			? rw_aux_to_bkey(b, t, l)
			: btree_bkey_last(b, t);
		struct bkey_packed *k = start;

		while (1) {
			k = bkey_p_next(k);
			if (k == end)
				break;

			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
				memmove(&rw_aux_tree(b, t)[l + 1],
					&rw_aux_tree(b, t)[l],
					(void *) &rw_aux_tree(b, t)[t->size] -
					(void *) &rw_aux_tree(b, t)[l]);
				t->size++;
				rw_aux_tree_set(b, t, l, k);
				break;
			}
		}
	}

	bch2_bset_verify_rw_aux_tree(b, t);
	bset_aux_tree_verify(b);
}

void bch2_bset_insert(struct btree *b,
		      struct btree_node_iter *iter,
		      struct bkey_packed *where,
		      struct bkey_i *insert,
		      unsigned clobber_u64s)
{
	struct bkey_format *f = &b->format;
	struct bset_tree *t = bset_tree_last(b);
	struct bkey_packed packed, *src = bkey_to_packed(insert);

	bch2_bset_verify_rw_aux_tree(b, t);
	bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);

	if (bch2_bkey_pack_key(&packed, &insert->k, f))
		src = &packed;

	if (!bkey_deleted(&insert->k))
		btree_keys_account_key_add(&b->nr, t - b->set, src);

	if (src->u64s != clobber_u64s) {
		u64 *src_p = (u64 *) where->_data + clobber_u64s;
		u64 *dst_p = (u64 *) where->_data + src->u64s;

		EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
			(int) clobber_u64s - src->u64s);

		memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
		le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
		set_btree_bset_end(b, t);
	}

	memcpy_u64s_small(where, src,
			  bkeyp_key_u64s(f, src));
	memcpy_u64s(bkeyp_val(f, where), &insert->v,
		    bkeyp_val_u64s(f, src));

	if (src->u64s != clobber_u64s)
		bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);

	bch2_verify_btree_nr_keys(b);
}

void bch2_bset_delete(struct btree *b,
		      struct bkey_packed *where,
		      unsigned clobber_u64s)
{
	struct bset_tree *t = bset_tree_last(b);
	u64 *src_p = (u64 *) where->_data + clobber_u64s;
	u64 *dst_p = where->_data;

	bch2_bset_verify_rw_aux_tree(b, t);

	EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);

	memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
	le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
	set_btree_bset_end(b, t);

	bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
}

/* Lookup */

__flatten
static struct bkey_packed *bset_search_write_set(const struct btree *b,
						 struct bset_tree *t,
						 struct bpos *search)
{
	unsigned l = 0, r = t->size;

	while (l + 1 != r) {
		unsigned m = (l + r) >> 1;

		if (bpos_lt(rw_aux_tree(b, t)[m].k, *search))
			l = m;
		else
			r = m;
	}

	return rw_aux_to_bkey(b, t, l);
}

static inline void prefetch_four_cachelines(void *p)
{
#ifdef CONFIG_X86_64
	asm("prefetcht0 (-127 + 64 * 0)(%0);"
	    "prefetcht0 (-127 + 64 * 1)(%0);"
	    "prefetcht0 (-127 + 64 * 2)(%0);"
	    "prefetcht0 (-127 + 64 * 3)(%0);"
	    :
	    : "r" (p + 127));
#else
	prefetch(p + L1_CACHE_BYTES * 0);
	prefetch(p + L1_CACHE_BYTES * 1);
	prefetch(p + L1_CACHE_BYTES * 2);
	prefetch(p + L1_CACHE_BYTES * 3);
#endif
}

static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
					      const struct bkey_float *f,
					      unsigned idx)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;

	return f->exponent > key_bits_start;
#else
	unsigned key_bits_end = high_bit_offset + b->nr_key_bits;

	return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
#endif
}

__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
					    const struct bset_tree *t,
					    const struct bpos *search,
					    const struct bkey_packed *packed_search)
{
	struct ro_aux_tree *base = ro_aux_tree_base(b, t);
	struct bkey_float *f;
	struct bkey_packed *k;
	unsigned inorder, n = 1, l, r;
	int cmp;

	do {
		if (likely(n << 4 < t->size))
			prefetch(&base->f[n << 4]);

		f = &base->f[n];
		if (unlikely(f->exponent >= BFLOAT_FAILED))
			goto slowpath;

		l = f->mantissa;
		r = bkey_mantissa(packed_search, f, n);

		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
			goto slowpath;

		n = n * 2 + (l < r);
		continue;
slowpath:
		k = tree_to_bkey(b, t, n);
		cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
		if (!cmp)
			return k;

		n = n * 2 + (cmp < 0);
	} while (n < t->size);

	inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);

	/*
	 * n would have been the node we recursed to - the low bit tells us if
	 * we recursed left or recursed right.
	 */
	if (likely(!(n & 1))) {
		--inorder;
		if (unlikely(!inorder))
			return btree_bkey_first(b, t);

		f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
	}

	return cacheline_to_bkey(b, t, inorder, f->key_offset);
}
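
/*
 * A sketch of the traversal above, with assumed numbers: starting at n == 1,
 * each iteration compares 16 bit mantissas and descends to child 2n or
 * 2n + 1, so with t->size == 128 the loop does about log2(128) == 7
 * iterations, touching one 4 byte bkey_float per level instead of a full
 * packed key (until a failed bfloat forces the slowpath comparison).
 */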

static __always_inline __flatten
struct bkey_packed *__bch2_bset_search(struct btree *b,
				       struct bset_tree *t,
				       struct bpos *search,
				       const struct bkey_packed *lossy_packed_search)
{

	/*
	 * First, we search for a cacheline, then we do a linear search within
	 * that cacheline.
	 *
	 * To search for the cacheline, there are three different possibilities:
	 * * The set is too small to have a search tree, so we just do a linear
	 *   search over the whole set.
	 * * The set is the one we're currently inserting into; keeping a full
	 *   auxiliary search tree up to date would be too expensive, so we
	 *   use a much simpler lookup table to do a binary search -
	 *   bset_search_write_set().
	 * * Or we use the auxiliary search tree we constructed earlier -
	 *   bset_search_tree()
	 */

	switch (bset_aux_tree_type(t)) {
	case BSET_NO_AUX_TREE:
		return btree_bkey_first(b, t);
	case BSET_RW_AUX_TREE:
		return bset_search_write_set(b, t, search);
	case BSET_RO_AUX_TREE:
		return bset_search_tree(b, t, search, lossy_packed_search);
	default:
		BUG();
	}
}

static __always_inline __flatten
struct bkey_packed *bch2_bset_search_linear(struct btree *b,
					    struct bset_tree *t,
					    struct bpos *search,
					    struct bkey_packed *packed_search,
					    const struct bkey_packed *lossy_packed_search,
					    struct bkey_packed *m)
{
	if (lossy_packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_cmp_p_or_unp(b, m,
					      lossy_packed_search, search) < 0)
			m = bkey_p_next(m);

	if (!packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_pos_cmp(b, m, search) < 0)
			m = bkey_p_next(m);

	if (bch2_expensive_debug_checks) {
		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);

		BUG_ON(prev &&
		       bkey_iter_cmp_p_or_unp(b, prev,
					      packed_search, search) >= 0);
	}

	return m;
}

/* Btree node iterator */

static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
					       struct btree *b,
					       const struct bkey_packed *k,
					       const struct bkey_packed *end)
{
	if (k != end) {
		struct btree_node_iter_set *pos;

		btree_node_iter_for_each(iter, pos)
			;

		BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
		*pos = (struct btree_node_iter_set) {
			__btree_node_key_to_offset(b, k),
			__btree_node_key_to_offset(b, end)
		};
	}
}

void bch2_btree_node_iter_push(struct btree_node_iter *iter,
			       struct btree *b,
			       const struct bkey_packed *k,
			       const struct bkey_packed *end)
{
	__bch2_btree_node_iter_push(iter, b, k, end);
	bch2_btree_node_iter_sort(iter, b);
}

noinline __flatten __cold
static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
					     struct btree *b, struct bpos *search)
{
	struct bkey_packed *k;

	trace_bkey_pack_pos_fail(search);

	bch2_btree_node_iter_init_from_start(iter, b);

	while ((k = bch2_btree_node_iter_peek(iter, b)) &&
	       bkey_iter_pos_cmp(b, k, search) < 0)
		bch2_btree_node_iter_advance(iter, b);
}

/**
 * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
 * given position
 *
 * @iter:	iterator to initialize
 * @b:		btree node to search
 * @search:	search key
 *
 * Main entry point to the lookup code for individual btree nodes:
 *
 * NOTE:
 *
 * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
 * keys. This doesn't matter for most code, but it does matter for lookups.
 *
 * Some adjacent keys with a string of equal keys:
 *	i j k k k k l m
 *
 * If you search for k, the lookup code isn't guaranteed to return you any
 * specific k. The lookup code is conceptually doing a binary search and
 * iterating backwards is very expensive so if the pivot happens to land at the
 * last k that's what you'll get.
 *
 * This works out ok, but it's something to be aware of:
 *
 *  - For non extents, we guarantee that the live key comes last - see
 *    btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
 *    see will only be deleted keys you don't care about.
 *
 *  - For extents, deleted keys sort last (see the comment at the top of this
 *    file). But when you're searching for extents, you actually want the first
 *    key strictly greater than your search key - an extent that compares equal
 *    to the search key is going to have 0 sectors after the search key.
 *
 *    But this does mean that we can't just search for
 *    bpos_successor(start_of_range) to get the first extent that overlaps with
 *    the range we want - if we're unlucky and there's an extent that ends
 *    exactly where we searched, then there could be a deleted key at the same
 *    position and we'd get that when we search instead of the preceding extent
 *    we needed.
 *
 *    So we've got to search for start_of_range, then after the lookup iterate
 *    past any extents that compare equal to the position we searched for.
 */
__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
			       struct btree *b, struct bpos *search)
{
	struct bkey_packed p, *packed_search = NULL;
	struct btree_node_iter_set *pos = iter->data;
	struct bkey_packed *k[MAX_BSETS];
	unsigned i;

	EBUG_ON(bpos_lt(*search, b->data->min_key));
	EBUG_ON(bpos_gt(*search, b->data->max_key));
	bset_aux_tree_verify(b);

	memset(iter, 0, sizeof(*iter));

	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
	case BKEY_PACK_POS_EXACT:
		packed_search = &p;
		break;
	case BKEY_PACK_POS_SMALLER:
		packed_search = NULL;
		break;
	case BKEY_PACK_POS_FAIL:
		btree_node_iter_init_pack_failed(iter, b, search);
		return;
	}

	for (i = 0; i < b->nsets; i++) {
		k[i] = __bch2_bset_search(b, b->set + i, search, &p);
		prefetch_four_cachelines(k[i]);
	}

	for (i = 0; i < b->nsets; i++) {
		struct bset_tree *t = b->set + i;
		struct bkey_packed *end = btree_bkey_last(b, t);

		k[i] = bch2_bset_search_linear(b, t, search,
					       packed_search, &p, k[i]);
		if (k[i] != end)
			*pos++ = (struct btree_node_iter_set) {
				__btree_node_key_to_offset(b, k[i]),
				__btree_node_key_to_offset(b, end)
			};
	}

	bch2_btree_node_iter_sort(iter, b);
}
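
/*
 * Example usage, a sketch only - process_key() is hypothetical, and locking
 * and error handling are elided:
 *
 *	struct btree_node_iter iter;
 *	struct bkey_packed *k;
 *
 *	bch2_btree_node_iter_init(&iter, b, &search);
 *	while ((k = bch2_btree_node_iter_peek(&iter, b))) {
 *		process_key(b, k);
 *		bch2_btree_node_iter_advance(&iter, b);
 *	}
 */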

void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
					  struct btree *b)
{
	struct bset_tree *t;

	memset(iter, 0, sizeof(*iter));

	for_each_bset(b, t)
		__bch2_btree_node_iter_push(iter, b,
					    btree_bkey_first(b, t),
					    btree_bkey_last(b, t));
	bch2_btree_node_iter_sort(iter, b);
}

struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
						  struct btree *b,
						  struct bset_tree *t)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset)
			return __btree_node_offset_to_key(b, set->k);

	return btree_bkey_last(b, t);
}

static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
					    struct btree *b,
					    unsigned first)
{
	bool ret;

	if ((ret = (btree_node_iter_cmp(b,
					iter->data[first],
					iter->data[first + 1]) > 0)))
		swap(iter->data[first], iter->data[first + 1]);
	return ret;
}

void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
			       struct btree *b)
{
	/* unrolled bubble sort: */

	if (!__btree_node_iter_set_end(iter, 2)) {
		btree_node_iter_sort_two(iter, b, 0);
		btree_node_iter_sort_two(iter, b, 1);
	}

	if (!__btree_node_iter_set_end(iter, 1))
		btree_node_iter_sort_two(iter, b, 0);
}

void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
				   struct btree_node_iter_set *set)
{
	struct btree_node_iter_set *last =
		iter->data + ARRAY_SIZE(iter->data) - 1;

	memmove(&set[0], &set[1], (void *) last - (void *) set);
	*last = (struct btree_node_iter_set) { 0, 0 };
}

static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
						  struct btree *b)
{
	iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;

	EBUG_ON(iter->data->k > iter->data->end);

	if (unlikely(__btree_node_iter_set_end(iter, 0))) {
		/* avoid an expensive memmove call: */
		iter->data[0] = iter->data[1];
		iter->data[1] = iter->data[2];
		iter->data[2] = (struct btree_node_iter_set) { 0, 0 };
		return;
	}

	if (__btree_node_iter_set_end(iter, 1))
		return;

	if (!btree_node_iter_sort_two(iter, b, 0))
		return;

	if (__btree_node_iter_set_end(iter, 2))
		return;

	btree_node_iter_sort_two(iter, b, 1);
}

void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
				  struct btree *b)
{
	if (bch2_expensive_debug_checks) {
		bch2_btree_node_iter_verify(iter, b);
		bch2_btree_node_iter_next_check(iter, b);
	}

	__bch2_btree_node_iter_advance(iter, b);
}

/*
 * Expensive: walks every bset to find the previous key.
 */
struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
						  struct btree *b)
{
	struct bkey_packed *k, *prev = NULL;
	struct btree_node_iter_set *set;
	struct bset_tree *t;
	unsigned end = 0;

	if (bch2_expensive_debug_checks)
		bch2_btree_node_iter_verify(iter, b);

	for_each_bset(b, t) {
		k = bch2_bkey_prev_all(b, t,
			bch2_btree_node_iter_bset_pos(iter, b, t));
		if (k &&
		    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
			prev = k;
			end = t->end_offset;
		}
	}

	if (!prev)
		return NULL;

	/*
	 * We're manually memmoving instead of just calling sort() to ensure the
	 * prev we picked ends up in slot 0 - sort won't necessarily put it
	 * there because of duplicate deleted keys:
	 */
	btree_node_iter_for_each(iter, set)
		if (set->end == end)
			goto found;

	BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
found:
	BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));

	memmove(&iter->data[1],
		&iter->data[0],
		(void *) set - (void *) &iter->data[0]);

	iter->data[0].k = __btree_node_key_to_offset(b, prev);
	iter->data[0].end = end;

	if (bch2_expensive_debug_checks)
		bch2_btree_node_iter_verify(iter, b);
	return prev;
}

struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
					      struct btree *b)
{
	struct bkey_packed *prev;

	do {
		prev = bch2_btree_node_iter_prev_all(iter, b);
	} while (prev && bkey_deleted(prev));

	return prev;
}

struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
						 struct btree *b,
						 struct bkey *u)
{
	struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);

	return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
}

/* Mergesort */

void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats)
{
	const struct bset_tree *t;

	for_each_bset(b, t) {
		enum bset_aux_tree_type type = bset_aux_tree_type(t);
		size_t j;

		stats->sets[type].nr++;
		stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
			sizeof(u64);

		if (bset_has_ro_aux_tree(t)) {
			stats->floats += t->size - 1;

			for (j = 1; j < t->size; j++)
				stats->failed +=
					bkey_float(b, t, j)->exponent ==
					BFLOAT_FAILED;
		}
	}
}

void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
			 struct bkey_packed *k)
{
	struct bset_tree *t = bch2_bkey_to_bset(b, k);
	struct bkey uk;
	unsigned j, inorder;

	if (!bset_has_ro_aux_tree(t))
		return;

	inorder = bkey_to_cacheline(b, t, k);
	if (!inorder || inorder >= t->size)
		return;

	j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
	if (k != tree_to_bkey(b, t, j))
		return;

	switch (bkey_float(b, t, j)->exponent) {
	case BFLOAT_FAILED:
		uk = bkey_unpack_key(b, k);
		prt_printf(out,
			   " failed unpacked at depth %u\n"
			   "\t",
			   ilog2(j));
		bch2_bpos_to_text(out, uk.p);
		prt_printf(out, "\n");
		break;
	}
}