btree.c source code [linux/drivers/md/bcache/btree.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4	*
5	* Uses a block device as cache for other block devices; optimized for SSDs.
6	* All allocation is done in buckets, which should match the erase block size
7	* of the device.
8	*
9	* Buckets containing cached data are kept on a heap sorted by priority;
10	* bucket priority is increased on cache hit, and periodically all the buckets
11	* on the heap have their priority scaled down. This currently is just used as
12	* an LRU but in the future should allow for more intelligent heuristics.
13	*
14	* Buckets have an 8 bit counter; freeing is accomplished by incrementing the
15	* counter. Garbage collection is used to remove stale pointers.
16	*
17	* Indexing is done via a btree; nodes are not necessarily fully sorted, rather
18	* as keys are inserted we only sort the pages that have not yet been written.
19	* When garbage collection is run, we resort the entire node.
20	*
21	* All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
22	*/
23
24	#include "bcache.h"
25	#include "btree.h"
26	#include "debug.h"
27	#include "extents.h"
28
29	#include <linux/slab.h>
30	#include <linux/bitops.h>
31	#include <linux/hash.h>
32	#include <linux/kthread.h>
33	#include <linux/prefetch.h>
34	#include <linux/random.h>
35	#include <linux/rcupdate.h>
36	#include <linux/sched/clock.h>
37	#include <linux/rculist.h>
38	#include <linux/delay.h>
39	#include <trace/events/bcache.h>
40
41	/*
42	* Todo:
43	* register_bcache: Return errors out to userspace correctly
44	*
45	* Writeback: don't undirty key until after a cache flush
46	*
47	* Create an iterator for key pointers
48	*
49	* On btree write error, mark bucket such that it won't be freed from the cache
50	*
51	* Journalling:
52	* Check for bad keys in replay
53	* Propagate barriers
54	* Refcount journal entries in journal_replay
55	*
56	* Garbage collection:
57	* Finish incremental gc
58	* Gc should free old UUIDs, data for invalid UUIDs
59	*
60	* Provide a way to list backing device UUIDs we have data cached for, and
61	* probably how long it's been since we've seen them, and a way to invalidate
62	* dirty data for devices that will never be attached again
63	*
64	* Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
65	* that based on that and how much dirty data we have we can keep writeback
66	* from being starved
67	*
68	* Add a tracepoint or somesuch to watch for writeback starvation
69	*
70	* When btree depth > 1 and splitting an interior node, we have to make sure
71	* alloc_bucket() cannot fail. This should be true but is not completely
72	* obvious.
73	*
74	* Plugging?
75	*
76	* If data write is less than hard sector size of ssd, round up offset in open
77	* bucket to the next whole sector
78	*
79	* Superblock needs to be fleshed out for multiple cache devices
80	*
81	* Add a sysfs tunable for the number of writeback IOs in flight
82	*
83	* Add a sysfs tunable for the number of open data buckets
84	*
85	* IO tracking: Can we track when one process is doing io on behalf of another?
86	* IO tracking: Don't use just an average, weigh more recent stuff higher
87	*
88	* Test module load/unload
89	*/
90
/* Numeric limits; their users lie outside the part of the file shown here. */
#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72
#define MAX_GC_TIMES		100
#define MIN_GC_NODES		100
#define GC_SLEEP_MS		100

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

/*
 * Value used to hash and to compare btree nodes in the in-memory cache:
 * the first pointer of @k shifted right by the cache set's bucket_bits,
 * OR-ed with PTR_GEN() of that pointer.
 */
#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> (c)->bucket_bits) | PTR_GEN(k, 0))

/* Workqueue on which the delayed btree node writes (b->work) are queued. */
static struct workqueue_struct *btree_io_wq;

/* True when the level of node @b is not above the ->lock level of @s. */
#define insert_lock(s, b)	((b)->level <= (s)->lock)
105
106
107	static inline struct bset write_block(struct* btree *b)
108	{
109	return ((void ) btree_bset_first(b)) + b->written block_bytes(b->c->cache);
110	}
111
112	static void bch_btree_init_next(struct btree *b)
113	{
114	/ If not a leaf node, always sort /
115	if (b->level && b->keys.nsets)
116	bch_btree_sort(b: &b->keys, state: &b->c->sort);
117	else
118	bch_btree_sort_lazy(b: &b->keys, state: &b->c->sort);
119
120	if (b->written < btree_blocks(b))
121	bch_bset_init_next(b: &b->keys, i: write_block(b),
122	magic: bset_magic(sb: &b->c->cache->sb));
123
124	}
125
/* Btree key manipulation */
127
128	void bkey_put(struct cache_set c, struct* bkey *k)
129	{
130	unsigned int i;
131
132	for (i = `0`; i < KEY_PTRS(k); i++)
133	if (ptr_available(c, k, i))
134	atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
135	}
136
/* Btree IO */
138
139	static uint64_t btree_csum_set(struct btree b, struct* bset *i)
140	{
141	uint64_t crc = b->key.ptr[`0`];
142	void data = (void* ) i + `8`, end = bset_bkey_last(i);
143
144	crc = crc64_be(crc, p: data, len: end - data);
145	return crc ^ `0xffffffffffffffffULL`;
146	}
147
148	void bch_btree_node_read_done(struct btree *b)
149	{
150	const char *err = "bad btree header";
151	struct bset *i = btree_bset_first(b);
152	struct btree_iter *iter;
153
154	/*
155	* c->fill_iter can allocate an iterator with more memory space
156	* than static MAX_BSETS.
157	* See the comment arount cache_set->fill_iter.
158	*/
159	iter = mempool_alloc(pool: &b->c->fill_iter, GFP_NOIO);
160	iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
161	iter->used = `0`;
162
163	#ifdef CONFIG_BCACHE_DEBUG
164	iter->b = &b->keys;
165	#endif
166
167	if (!i->seq)
168	goto err;
169
170	for (;
171	b->written < btree_blocks(b) && i->seq == b->keys.set[`0`].data->seq;
172	i = write_block(b)) {
173	err = "unsupported bset version";
174	if (i->version > BCACHE_BSET_VERSION)
175	goto err;
176
177	err = "bad btree header";
178	if (b->written + set_blocks(i, block_bytes(b->c->cache)) >
179	btree_blocks(b))
180	goto err;
181
182	err = "bad magic";
183	if (i->magic != bset_magic(sb: &b->c->cache->sb))
184	goto err;
185
186	err = "bad checksum";
187	switch (i->version) {
188	case `0`:
189	if (i->csum != csum_set(i))
190	goto err;
191	break;
192	case BCACHE_BSET_VERSION:
193	if (i->csum != btree_csum_set(b, i))
194	goto err;
195	break;
196	}
197
198	err = "empty set";
199	if (i != b->keys.set[`0`].data && !i->keys)
200	goto err;
201
202	bch_btree_iter_push(iter, k: i->start, bset_bkey_last(i));
203
204	b->written += set_blocks(i, block_bytes(b->c->cache));
205	}
206
207	err = "corrupted btree";
208	for (i = write_block(b);
209	bset_sector_offset(b: &b->keys, i) < KEY_SIZE(k: &b->key);
210	i = ((void *) i) + block_bytes(b->c->cache))
211	if (i->seq == b->keys.set[`0`].data->seq)
212	goto err;
213
214	bch_btree_sort_and_fix_extents(b: &b->keys, iter, state: &b->c->sort);
215
216	i = b->keys.set[`0`].data;
217	err = "short btree key";
218	if (b->keys.set[`0`].size &&
219	bkey_cmp(l: &b->key, r: &b->keys.set[`0`].end) < `0`)
220	goto err;
221
222	if (b->written < btree_blocks(b))
223	bch_bset_init_next(b: &b->keys, i: write_block(b),
224	magic: bset_magic(sb: &b->c->cache->sb));
225	out:
226	mempool_free(element: iter, pool: &b->c->fill_iter);
227	return;
228	err:
229	set_btree_node_io_error(b);
230	bch_cache_set_error(c: b->c, fmt: "%s at bucket %zu, block %u, %u keys",
231	err, PTR_BUCKET_NR(c: b->c, k: &b->key, ptr: `0`),
232	bset_block_offset(b, i), i->keys);
233	goto out;
234	}
235
236	static void btree_node_read_endio(struct bio *bio)
237	{
238	struct closure *cl = bio->bi_private;
239
240	closure_put(cl);
241	}
242
243	static void bch_btree_node_read(struct btree *b)
244	{
245	uint64_t start_time = local_clock();
246	struct closure cl;
247	struct bio *bio;
248
249	trace_bcache_btree_read(b);
250
251	closure_init_stack(cl: &cl);
252
253	bio = bch_bbio_alloc(c: b->c);
254	bio->bi_iter.bi_size = KEY_SIZE(k: &b->key) << `9`;
255	bio->bi_end_io = btree_node_read_endio;
256	bio->bi_private = &cl;
257	bio->bi_opf = REQ_OP_READ \| REQ_META;
258
259	bch_bio_map(bio, base: b->keys.set[`0`].data);
260
261	bch_submit_bbio(bio, c: b->c, k: &b->key, ptr: `0`);
262	closure_sync(cl: &cl);
263
264	if (bio->bi_status)
265	set_btree_node_io_error(b);
266
267	bch_bbio_free(bio, c: b->c);
268
269	if (btree_node_io_error(b))
270	goto err;
271
272	bch_btree_node_read_done(b);
273	bch_time_stats_update(stats: &b->c->btree_read_time, time: start_time);
274
275	return;
276	err:
277	bch_cache_set_error(c: b->c, fmt: "io error reading bucket %zu",
278	PTR_BUCKET_NR(c: b->c, k: &b->key, ptr: `0`));
279	}
280
281	static void btree_complete_write(struct btree b, struct* btree_write *w)
282	{
283	if (w->prio_blocked &&
284	!atomic_sub_return(i: w->prio_blocked, v: &b->c->prio_blocked))
285	wake_up_allocators(c: b->c);
286
287	if (w->journal) {
288	atomic_dec_bug(w->journal);
289	__closure_wake_up(list: &b->c->journal.wait);
290	}
291
292	w->prio_blocked = `0`;
293	w->journal = NULL;
294	}
295
296	static void btree_node_write_unlock(struct closure *cl)
297	{
298	struct btree b = container_of(cl, struct* btree, io);
299
300	up(sem: &b->io_mutex);
301	}
302
303	static void __btree_node_write_done(struct closure *cl)
304	{
305	struct btree b = container_of(cl, struct* btree, io);
306	struct btree_write *w = btree_prev_write(b);
307
308	bch_bbio_free(bio: b->bio, c: b->c);
309	b->bio = NULL;
310	btree_complete_write(b, w);
311
312	if (btree_node_dirty(b))
313	queue_delayed_work(wq: btree_io_wq, dwork: &b->work, delay: `30` * HZ);
314
315	closure_return_with_destructor(cl, btree_node_write_unlock);
316	}
317
318	static void btree_node_write_done(struct closure *cl)
319	{
320	struct btree b = container_of(cl, struct* btree, io);
321
322	bio_free_pages(bio: b->bio);
323	__btree_node_write_done(cl);
324	}
325
326	static void btree_node_write_endio(struct bio *bio)
327	{
328	struct closure *cl = bio->bi_private;
329	struct btree b = container_of(cl, struct* btree, io);
330
331	if (bio->bi_status)
332	set_btree_node_io_error(b);
333
334	bch_bbio_count_io_errors(c: b->c, bio, error: bio->bi_status, m: "writing btree");
335	closure_put(cl);
336	}
337
338	static void do_btree_node_write(struct btree *b)
339	{
340	struct closure *cl = &b->io;
341	struct bset *i = btree_bset_last(b);
342	BKEY_PADDED(key) k;
343
344	i->version = BCACHE_BSET_VERSION;
345	i->csum = btree_csum_set(b, i);
346
347	BUG_ON(b->bio);
348	b->bio = bch_bbio_alloc(c: b->c);
349
350	b->bio->bi_end_io = btree_node_write_endio;
351	b->bio->bi_private = cl;
352	b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache));
353	b->bio->bi_opf = REQ_OP_WRITE \| REQ_META \| REQ_FUA;
354	bch_bio_map(bio: b->bio, base: i);
355
356	/*
357	* If we're appending to a leaf node, we don't technically need FUA -
358	* this write just needs to be persisted before the next journal write,
359	* which will be marked FLUSH\|FUA.
360	*
361	* Similarly if we're writing a new btree root - the pointer is going to
362	* be in the next journal entry.
363	*
364	* But if we're writing a new btree node (that isn't a root) or
365	* appending to a non leaf btree node, we need either FUA or a flush
366	* when we write the parent with the new pointer. FUA is cheaper than a
367	* flush, and writes appending to leaf nodes aren't blocking anything so
368	* just make all btree node writes FUA to keep things sane.
369	*/
370
371	bkey_copy(&k.key, &b->key);
372	SET_PTR_OFFSET(k: &k.key, i: `0`, v: PTR_OFFSET(k: &k.key, i: `0`) +
373	bset_sector_offset(b: &b->keys, i));
374
375	if (!bch_bio_alloc_pages(bio: b->bio, __GFP_NOWARN\|GFP_NOWAIT)) {
376	struct bio_vec *bv;
377	void addr = (void* ) ((unsigned* long) i & ~(PAGE_SIZE - `1`));
378	struct bvec_iter_all iter_all;
379
380	bio_for_each_segment_all(bv, b->bio, iter_all) {
381	memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
382	addr += PAGE_SIZE;
383	}
384
385	bch_submit_bbio(bio: b->bio, c: b->c, k: &k.key, ptr: `0`);
386
387	continue_at(cl, btree_node_write_done, NULL);
388	} else {
389	/*
390	* No problem for multipage bvec since the bio is
391	* just allocated
392	*/
393	b->bio->bi_vcnt = `0`;
394	bch_bio_map(bio: b->bio, base: i);
395
396	bch_submit_bbio(bio: b->bio, c: b->c, k: &k.key, ptr: `0`);
397
398	closure_sync(cl);
399	continue_at_nobarrier(cl, __btree_node_write_done, NULL);
400	}
401	}
402
403	void __bch_btree_node_write(struct btree b, struct* closure *parent)
404	{
405	struct bset *i = btree_bset_last(b);
406
407	lockdep_assert_held(&b->write_lock);
408
409	trace_bcache_btree_write(b);
410
411	BUG_ON(current->bio_list);
412	BUG_ON(b->written >= btree_blocks(b));
413	BUG_ON(b->written && !i->keys);
414	BUG_ON(btree_bset_first(b)->seq != i->seq);
415	bch_check_keys(&b->keys, "writing");
416
417	cancel_delayed_work(dwork: &b->work);
418
419	/ If caller isn't waiting for write, parent refcount is cache set /
420	down(sem: &b->io_mutex);
421	closure_init(cl: &b->io, parent: parent ?: &b->c->cl);
422
423	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
424	change_bit(nr: BTREE_NODE_write_idx, addr: &b->flags);
425
426	do_btree_node_write(b);
427
428	atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size,
429	v: &b->c->cache->btree_sectors_written);
430
431	b->written += set_blocks(i, block_bytes(b->c->cache));
432	}
433
434	void bch_btree_node_write(struct btree b, struct* closure *parent)
435	{
436	unsigned int nsets = b->keys.nsets;
437
438	lockdep_assert_held(&b->lock);
439
440	__bch_btree_node_write(b, parent);
441
442	/*
443	* do verify if there was more than one set initially (i.e. we did a
444	* sort) and we sorted down to a single set:
445	*/
446	if (nsets && !b->keys.nsets)
447	bch_btree_verify(b);
448
449	bch_btree_init_next(b);
450	}
451
452	static void bch_btree_node_write_sync(struct btree *b)
453	{
454	struct closure cl;
455
456	closure_init_stack(cl: &cl);
457
458	mutex_lock(&b->write_lock);
459	bch_btree_node_write(b, parent: &cl);
460	mutex_unlock(lock: &b->write_lock);
461
462	closure_sync(cl: &cl);
463	}
464
465	static void btree_node_write_work(struct work_struct *w)
466	{
467	struct btree b = container_of(to_delayed_work(w), struct* btree, work);
468
469	mutex_lock(&b->write_lock);
470	if (btree_node_dirty(b))
471	__bch_btree_node_write(b, NULL);
472	mutex_unlock(lock: &b->write_lock);
473	}
474
475	static void bch_btree_leaf_dirty(struct btree b, atomic_t journal_ref)
476	{
477	struct bset *i = btree_bset_last(b);
478	struct btree_write *w = btree_current_write(b);
479
480	lockdep_assert_held(&b->write_lock);
481
482	BUG_ON(!b->written);
483	BUG_ON(!i->keys);
484
485	if (!btree_node_dirty(b))
486	queue_delayed_work(wq: btree_io_wq, dwork: &b->work, delay: `30` * HZ);
487
488	set_btree_node_dirty(b);
489
490	/*
491	* w->journal is always the oldest journal pin of all bkeys
492	* in the leaf node, to make sure the oldest jset seq won't
493	* be increased before this btree node is flushed.
494	*/
495	if (journal_ref) {
496	if (w->journal &&
497	journal_pin_cmp(b->c, w->journal, journal_ref)) {
498	atomic_dec_bug(w->journal);
499	w->journal = NULL;
500	}
501
502	if (!w->journal) {
503	w->journal = journal_ref;
504	atomic_inc(v: w->journal);
505	}
506	}
507
508	/ Force write if set is too big /
509	if (set_bytes(i) > PAGE_SIZE - `48` &&
510	!current->bio_list)
511	bch_btree_node_write(b, NULL);
512	}
513
514	/*
515	* Btree in memory cache - allocation/freeing
516	* mca -> memory cache
517	*/
518
/*
 * mca_reserve - number of cached btree nodes that must be kept in reserve:
 * 8 per level of the root (at least one level's worth) plus 16.
 */
#define mca_reserve(c)	(((!IS_ERR_OR_NULL((c)->root) && (c)->root->level) \
			  ? (c)->root->level : 1) * 8 + 16)
/* mca_can_free - cached nodes in use beyond the reserve, never negative. */
#define mca_can_free(c)						\
	max_t(int, 0, (c)->btree_cache_used - mca_reserve(c))
523
524	static void mca_data_free(struct btree *b)
525	{
526	BUG_ON(b->io_mutex.count != `1`);
527
528	bch_btree_keys_free(b: &b->keys);
529
530	b->c->btree_cache_used--;
531	list_move(list: &b->list, head: &b->c->btree_cache_freed);
532	}
533
534	static void mca_bucket_free(struct btree *b)
535	{
536	BUG_ON(btree_node_dirty(b));
537
538	b->key.ptr[`0`] = `0`;
539	hlist_del_init_rcu(n: &b->hash);
540	list_move(list: &b->list, head: &b->c->btree_cache_freeable);
541	}
542
543	static unsigned int btree_order(struct bkey *k)
544	{
545	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: `1`);
546	}
547
548	static void mca_data_alloc(struct btree b, struct* bkey *k, gfp_t gfp)
549	{
550	if (!bch_btree_keys_alloc(b: &b->keys,
551	max_t(unsigned int,
552	ilog2(b->c->btree_pages),
553	btree_order(k)),
554	gfp)) {
555	b->c->btree_cache_used++;
556	list_move(list: &b->list, head: &b->c->btree_cache);
557	} else {
558	list_move(list: &b->list, head: &b->c->btree_cache_freed);
559	}
560	}
561
/* Three-way comparison: -1, 0 or 1 for l < r, l == r, l > r. */
#define cmp_int(l, r)		(((l) > (r)) - ((l) < (r)))
563
#ifdef CONFIG_PROVE_LOCKING
/*
 * lockdep ordering callback for btree node locks: nodes compare by level
 * with the sign inverted, ties are broken by key order.
 */
static int btree_lock_cmp_fn(const struct lockdep_map *_a,
			     const struct lockdep_map *_b)
{
	const struct btree *a = container_of(_a, struct btree, lock.dep_map);
	const struct btree *b = container_of(_b, struct btree, lock.dep_map);

	return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
}

/* lockdep print callback: appends the node's level and key position. */
static void btree_lock_print_fn(const struct lockdep_map *map)
{
	const struct btree *b = container_of(map, struct btree, lock.dep_map);

	printk(KERN_CONT " l=%u %llu:%llu", b->level,
	       KEY_INODE(&b->key), KEY_OFFSET(&b->key));
}
#endif
582
583	static struct btree mca_bucket_alloc(struct* cache_set *c,
584	struct bkey *k, gfp_t gfp)
585	{
586	/*
587	* kzalloc() is necessary here for initialization,
588	* see code comments in bch_btree_keys_init().
589	*/
590	struct btree b = kzalloc(size: sizeof(struct* btree), flags: gfp);
591
592	if (!b)
593	return NULL;
594
595	init_rwsem(&b->lock);
596	lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
597	mutex_init(&b->write_lock);
598	lockdep_set_novalidate_class(&b->write_lock);
599	INIT_LIST_HEAD(list: &b->list);
600	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
601	b->c = c;
602	sema_init(sem: &b->io_mutex, val: `1`);
603
604	mca_data_alloc(b, k, gfp);
605	return b;
606	}
607
608	static int mca_reap(struct btree b, unsigned* int min_order, bool flush)
609	{
610	struct closure cl;
611
612	closure_init_stack(cl: &cl);
613	lockdep_assert_held(&b->c->bucket_lock);
614
615	if (!down_write_trylock(sem: &b->lock))
616	return -ENOMEM;
617
618	BUG_ON(btree_node_dirty(b) && !b->keys.set[`0`].data);
619
620	if (b->keys.page_order < min_order)
621	goto out_unlock;
622
623	if (!flush) {
624	if (btree_node_dirty(b))
625	goto out_unlock;
626
627	if (down_trylock(sem: &b->io_mutex))
628	goto out_unlock;
629	up(sem: &b->io_mutex);
630	}
631
632	retry:
633	/*
634	* BTREE_NODE_dirty might be cleared in btree_flush_btree() by
635	* __bch_btree_node_write(). To avoid an extra flush, acquire
636	* b->write_lock before checking BTREE_NODE_dirty bit.
637	*/
638	mutex_lock(&b->write_lock);
639	/*
640	* If this btree node is selected in btree_flush_write() by journal
641	* code, delay and retry until the node is flushed by journal code
642	* and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
643	*/
644	if (btree_node_journal_flush(b)) {
645	pr_debug("bnode %p is flushing by journal, retry\n", b);
646	mutex_unlock(lock: &b->write_lock);
647	udelay(`1`);
648	goto retry;
649	}
650
651	if (btree_node_dirty(b))
652	__bch_btree_node_write(b, parent: &cl);
653	mutex_unlock(lock: &b->write_lock);
654
655	closure_sync(cl: &cl);
656
657	/ wait for any in flight btree write /
658	down(sem: &b->io_mutex);
659	up(sem: &b->io_mutex);
660
661	return `0`;
662	out_unlock:
663	rw_unlock(w: true, b);
664	return -ENOMEM;
665	}
666
667	static unsigned long bch_mca_scan(struct shrinker *shrink,
668	struct shrink_control *sc)
669	{
670	struct cache_set *c = shrink->private_data;
671	struct btree b, t;
672	unsigned long i, nr = sc->nr_to_scan;
673	unsigned long freed = `0`;
674	unsigned int btree_cache_used;
675
676	if (c->shrinker_disabled)
677	return SHRINK_STOP;
678
679	if (c->btree_cache_alloc_lock)
680	return SHRINK_STOP;
681
682	/ Return -1 if we can't do anything right now /
683	if (sc->gfp_mask & __GFP_IO)
684	mutex_lock(&c->bucket_lock);
685	else if (!mutex_trylock(lock: &c->bucket_lock))
686	return -`1`;
687
688	/*
689	* It's _really_ critical that we don't free too many btree nodes - we
690	* have to always leave ourselves a reserve. The reserve is how we
691	* guarantee that allocating memory for a new btree node can always
692	* succeed, so that inserting keys into the btree can always succeed and
693	* IO can always make forward progress:
694	*/
695	nr /= c->btree_pages;
696	if (nr == `0`)
697	nr = `1`;
698	nr = min_t(unsigned long, nr, mca_can_free(c));
699
700	i = `0`;
701	btree_cache_used = c->btree_cache_used;
702	list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
703	if (nr <= `0`)
704	goto out;
705
706	if (!mca_reap(b, min_order: `0`, flush: false)) {
707	mca_data_free(b);
708	rw_unlock(w: true, b);
709	freed++;
710	}
711	nr--;
712	i++;
713	}
714
715	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
716	if (nr <= `0` \|\| i >= btree_cache_used)
717	goto out;
718
719	if (!mca_reap(b, min_order: `0`, flush: false)) {
720	mca_bucket_free(b);
721	mca_data_free(b);
722	rw_unlock(w: true, b);
723	freed++;
724	}
725
726	nr--;
727	i++;
728	}
729	out:
730	mutex_unlock(lock: &c->bucket_lock);
731	return freed * c->btree_pages;
732	}
733
734	static unsigned long bch_mca_count(struct shrinker *shrink,
735	struct shrink_control *sc)
736	{
737	struct cache_set *c = shrink->private_data;
738
739	if (c->shrinker_disabled)
740	return `0`;
741
742	if (c->btree_cache_alloc_lock)
743	return `0`;
744
745	return mca_can_free(c) * c->btree_pages;
746	}
747
748	void bch_btree_cache_free(struct cache_set *c)
749	{
750	struct btree *b;
751	struct closure cl;
752
753	closure_init_stack(cl: &cl);
754
755	if (c->shrink)
756	shrinker_free(shrinker: c->shrink);
757
758	mutex_lock(&c->bucket_lock);
759
760	#ifdef CONFIG_BCACHE_DEBUG
761	if (c->verify_data)
762	list_move(list: &c->verify_data->list, head: &c->btree_cache);
763
764	free_pages(addr: (unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb)));
765	#endif
766
767	list_splice(list: &c->btree_cache_freeable,
768	head: &c->btree_cache);
769
770	while (!list_empty(head: &c->btree_cache)) {
771	b = list_first_entry(&c->btree_cache, struct btree, list);
772
773	/*
774	* This function is called by cache_set_free(), no I/O
775	* request on cache now, it is unnecessary to acquire
776	* b->write_lock before clearing BTREE_NODE_dirty anymore.
777	*/
778	if (btree_node_dirty(b)) {
779	btree_complete_write(b, w: btree_current_write(b));
780	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
781	}
782	mca_data_free(b);
783	}
784
785	while (!list_empty(head: &c->btree_cache_freed)) {
786	b = list_first_entry(&c->btree_cache_freed,
787	struct btree, list);
788	list_del(entry: &b->list);
789	cancel_delayed_work_sync(dwork: &b->work);
790	kfree(objp: b);
791	}
792
793	mutex_unlock(lock: &c->bucket_lock);
794	}
795
796	int bch_btree_cache_alloc(struct cache_set *c)
797	{
798	unsigned int i;
799
800	for (i = `0`; i < mca_reserve(c); i++)
801	if (!mca_bucket_alloc(c, k: &ZERO_KEY, GFP_KERNEL))
802	return -ENOMEM;
803
804	list_splice_init(list: &c->btree_cache,
805	head: &c->btree_cache_freeable);
806
807	#ifdef CONFIG_BCACHE_DEBUG
808	mutex_init(&c->verify_lock);
809
810	c->verify_ondisk = (void *)
811	__get_free_pages(GFP_KERNEL\|__GFP_COMP,
812	ilog2(meta_bucket_pages(&c->cache->sb)));
813	if (!c->verify_ondisk) {
814	/*
815	* Don't worry about the mca_rereserve buckets
816	* allocated in previous for-loop, they will be
817	* handled properly in bch_cache_set_unregister().
818	*/
819	return -ENOMEM;
820	}
821
822	c->verify_data = mca_bucket_alloc(c, k: &ZERO_KEY, GFP_KERNEL);
823
824	if (c->verify_data &&
825	c->verify_data->keys.set->data)
826	list_del_init(entry: &c->verify_data->list);
827	else
828	c->verify_data = NULL;
829	#endif
830
831	c->shrink = shrinker_alloc(flags: `0`, fmt: "md-bcache:%pU", c->set_uuid);
832	if (!c->shrink) {
833	pr_warn("bcache: %s: could not allocate shrinker\n", __func__);
834	return `0`;
835	}
836
837	c->shrink->count_objects = bch_mca_count;
838	c->shrink->scan_objects = bch_mca_scan;
839	c->shrink->seeks = `4`;
840	c->shrink->batch = c->btree_pages * `2`;
841	c->shrink->private_data = c;
842
843	shrinker_register(shrinker: c->shrink);
844
845	return `0`;
846	}
847
/* Btree in memory cache - hash table */
849
850	static struct hlist_head mca_hash(struct* cache_set c, struct* bkey *k)
851	{
852	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
853	}
854
855	static struct btree mca_find(struct* cache_set c, struct* bkey *k)
856	{
857	struct btree *b;
858
859	rcu_read_lock();
860	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
861	if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
862	goto out;
863	b = NULL;
864	out:
865	rcu_read_unlock();
866	return b;
867	}
868
869	static int mca_cannibalize_lock(struct cache_set c, struct* btree_op *op)
870	{
871	spin_lock(lock: &c->btree_cannibalize_lock);
872	if (likely(c->btree_cache_alloc_lock == NULL)) {
873	c->btree_cache_alloc_lock = current;
874	} else if (c->btree_cache_alloc_lock != current) {
875	if (op)
876	prepare_to_wait(wq_head: &c->btree_cache_wait, wq_entry: &op->wait,
877	TASK_UNINTERRUPTIBLE);
878	spin_unlock(lock: &c->btree_cannibalize_lock);
879	return -EINTR;
880	}
881	spin_unlock(lock: &c->btree_cannibalize_lock);
882
883	return `0`;
884	}
885
886	static struct btree mca_cannibalize(struct* cache_set c, struct* btree_op *op,
887	struct bkey *k)
888	{
889	struct btree *b;
890
891	trace_bcache_btree_cache_cannibalize(c);
892
893	if (mca_cannibalize_lock(c, op))
894	return ERR_PTR(error: -EINTR);
895
896	list_for_each_entry_reverse(b, &c->btree_cache, list)
897	if (!mca_reap(b, min_order: btree_order(k), flush: false))
898	return b;
899
900	list_for_each_entry_reverse(b, &c->btree_cache, list)
901	if (!mca_reap(b, min_order: btree_order(k), flush: true))
902	return b;
903
904	WARN(`1`, "btree cache cannibalize failed\n");
905	return ERR_PTR(error: -ENOMEM);
906	}
907
908	/*
909	* We can only have one thread cannibalizing other cached btree nodes at a time,
910	* or we'll deadlock. We use an open coded mutex to ensure that, which a
911	* cannibalize_bucket() will take. This means every time we unlock the root of
912	* the btree, we need to release this lock if we have it held.
913	*/
914	void bch_cannibalize_unlock(struct cache_set *c)
915	{
916	spin_lock(lock: &c->btree_cannibalize_lock);
917	if (c->btree_cache_alloc_lock == current) {
918	c->btree_cache_alloc_lock = NULL;
919	wake_up(&c->btree_cache_wait);
920	}
921	spin_unlock(lock: &c->btree_cannibalize_lock);
922	}
923
924	static struct btree mca_alloc(struct* cache_set c, struct* btree_op *op,
925	struct bkey k, int* level)
926	{
927	struct btree *b;
928
929	BUG_ON(current->bio_list);
930
931	lockdep_assert_held(&c->bucket_lock);
932
933	if (mca_find(c, k))
934	return NULL;
935
936	/ btree_free() doesn't free memory; it sticks the node on the end of*
937	* the list. Check if there's any freed nodes there:
938	*/
939	list_for_each_entry(b, &c->btree_cache_freeable, list)
940	if (!mca_reap(b, min_order: btree_order(k), flush: false))
941	goto out;
942
943	/ We never free struct btree itself, just the memory that holds the on*
944	* disk node. Check the freed list before allocating a new one:
945	*/
946	list_for_each_entry(b, &c->btree_cache_freed, list)
947	if (!mca_reap(b, min_order: `0`, flush: false)) {
948	mca_data_alloc(b, k, __GFP_NOWARN\|GFP_NOIO);
949	if (!b->keys.set[`0`].data)
950	goto err;
951	else
952	goto out;
953	}
954
955	b = mca_bucket_alloc(c, k, __GFP_NOWARN\|GFP_NOIO);
956	if (!b)
957	goto err;
958
959	BUG_ON(!down_write_trylock(&b->lock));
960	if (!b->keys.set->data)
961	goto err;
962	out:
963	BUG_ON(b->io_mutex.count != `1`);
964
965	bkey_copy(&b->key, k);
966	list_move(list: &b->list, head: &c->btree_cache);
967	hlist_del_init_rcu(n: &b->hash);
968	hlist_add_head_rcu(n: &b->hash, h: mca_hash(c, k));
969
970	lock_set_subclass(lock: &b->lock.dep_map, subclass: level + `1`, _THIS_IP_);
971	b->parent = (void *) ~`0UL`;
972	b->flags = `0`;
973	b->written = `0`;
974	b->level = level;
975
976	if (!b->level)
977	bch_btree_keys_init(b: &b->keys, ops: &bch_extent_keys_ops,
978	expensive_debug_checks: &b->c->expensive_debug_checks);
979	else
980	bch_btree_keys_init(b: &b->keys, ops: &bch_btree_keys_ops,
981	expensive_debug_checks: &b->c->expensive_debug_checks);
982
983	return b;
984	err:
985	if (b)
986	rw_unlock(w: true, b);
987
988	b = mca_cannibalize(c, op, k);
989	if (!IS_ERR(ptr: b))
990	goto out;
991
992	return b;
993	}
994
995	/*
996	* bch_btree_node_get - find a btree node in the cache and lock it, reading it
997	* in from disk if necessary.
998	*
999	* If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
1000	*
1001	* The btree node will have either a read or a write lock held, depending on
1002	* level and op->lock.
1003	*/
1004	struct btree bch_btree_node_get(struct* cache_set c, struct* btree_op *op,
1005	struct bkey k, int* level, bool write,
1006	struct btree *parent)
1007	{
1008	int i = `0`;
1009	struct btree *b;
1010
1011	BUG_ON(level < `0`);
1012	retry:
1013	b = mca_find(c, k);
1014
1015	if (!b) {
1016	if (current->bio_list)
1017	return ERR_PTR(error: -EAGAIN);
1018
1019	mutex_lock(&c->bucket_lock);
1020	b = mca_alloc(c, op, k, level);
1021	mutex_unlock(lock: &c->bucket_lock);
1022
1023	if (!b)
1024	goto retry;
1025	if (IS_ERR(ptr: b))
1026	return b;
1027
1028	bch_btree_node_read(b);
1029
1030	if (!write)
1031	downgrade_write(sem: &b->lock);
1032	} else {
1033	rw_lock(w: write, b, level);
1034	if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1035	rw_unlock(w: write, b);
1036	goto retry;
1037	}
1038	BUG_ON(b->level != level);
1039	}
1040
1041	if (btree_node_io_error(b)) {
1042	rw_unlock(w: write, b);
1043	return ERR_PTR(error: -EIO);
1044	}
1045
1046	BUG_ON(!b->written);
1047
1048	b->parent = parent;
1049
1050	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1051	prefetch(b->keys.set[i].tree);
1052	prefetch(b->keys.set[i].data);
1053	}
1054
1055	for (; i <= b->keys.nsets; i++)
1056	prefetch(b->keys.set[i].data);
1057
1058	return b;
1059	}
1060
1061	static void btree_node_prefetch(struct btree parent, struct* bkey *k)
1062	{
1063	struct btree *b;
1064
1065	mutex_lock(&parent->c->bucket_lock);
1066	b = mca_alloc(c: parent->c, NULL, k, level: parent->level - `1`);
1067	mutex_unlock(lock: &parent->c->bucket_lock);
1068
1069	if (!IS_ERR_OR_NULL(ptr: b)) {
1070	b->parent = parent;
1071	bch_btree_node_read(b);
1072	rw_unlock(w: true, b);
1073	}
1074	}
1075
/* Btree alloc */
1077
1078	static void btree_node_free(struct btree *b)
1079	{
1080	trace_bcache_btree_node_free(b);
1081
1082	BUG_ON(b == b->c->root);
1083
1084	retry:
1085	mutex_lock(&b->write_lock);
1086	/*
1087	* If the btree node is selected and flushing in btree_flush_write(),
1088	* delay and retry until the BTREE_NODE_journal_flush bit cleared,
1089	* then it is safe to free the btree node here. Otherwise this btree
1090	* node will be in race condition.
1091	*/
1092	if (btree_node_journal_flush(b)) {
1093	mutex_unlock(lock: &b->write_lock);
1094	pr_debug("bnode %p journal_flush set, retry\n", b);
1095	udelay(`1`);
1096	goto retry;
1097	}
1098
1099	if (btree_node_dirty(b)) {
1100	btree_complete_write(b, w: btree_current_write(b));
1101	clear_bit(nr: BTREE_NODE_dirty, addr: &b->flags);
1102	}
1103
1104	mutex_unlock(lock: &b->write_lock);
1105
1106	cancel_delayed_work(dwork: &b->work);
1107
1108	mutex_lock(&b->c->bucket_lock);
1109	bch_bucket_free(c: b->c, k: &b->key);
1110	mca_bucket_free(b);
1111	mutex_unlock(lock: &b->c->bucket_lock);
1112	}
1113
1114	struct btree __bch_btree_node_alloc(struct* cache_set c, struct* btree_op *op,
1115	int level, bool wait,
1116	struct btree *parent)
1117	{
1118	BKEY_PADDED(key) k;
1119	struct btree *b;
1120
1121	mutex_lock(&c->bucket_lock);
1122	retry:
1123	/ return ERR_PTR(-EAGAIN) when it fails /
1124	b = ERR_PTR(error: -EAGAIN);
1125	if (__bch_bucket_alloc_set(c, reserve: RESERVE_BTREE, k: &k.key, wait))
1126	goto err;
1127
1128	bkey_put(c, k: &k.key);
1129	SET_KEY_SIZE(k: &k.key, v: c->btree_pages * PAGE_SECTORS);
1130
1131	b = mca_alloc(c, op, k: &k.key, level);
1132	if (IS_ERR(ptr: b))
1133	goto err_free;
1134
1135	if (!b) {
1136	cache_bug(c,
1137	"Tried to allocate bucket that was in btree cache");
1138	goto retry;
1139	}
1140
1141	b->parent = parent;
1142	bch_bset_init_next(b: &b->keys, i: b->keys.set->data, magic: bset_magic(sb: &b->c->cache->sb));
1143
1144	mutex_unlock(lock: &c->bucket_lock);
1145
1146	trace_bcache_btree_node_alloc(b);
1147	return b;
1148	err_free:
1149	bch_bucket_free(c, k: &k.key);
1150	err:
1151	mutex_unlock(lock: &c->bucket_lock);
1152
1153	trace_bcache_btree_node_alloc_fail(c);
1154	return b;
1155	}
1156
1157	static struct btree bch_btree_node_alloc(struct* cache_set *c,
1158	struct btree_op op, int* level,
1159	struct btree *parent)
1160	{
1161	return __bch_btree_node_alloc(c, op, level, wait: op != NULL, parent);
1162	}
1163
1164	static struct btree btree_node_alloc_replacement(struct* btree *b,
1165	struct btree_op *op)
1166	{
1167	struct btree *n = bch_btree_node_alloc(c: b->c, op, level: b->level, parent: b->parent);
1168
1169	if (!IS_ERR(ptr: n)) {
1170	mutex_lock(&n->write_lock);
1171	bch_btree_sort_into(b: &b->keys, new: &n->keys, state: &b->c->sort);
1172	bkey_copy_key(dest: &n->key, src: &b->key);
1173	mutex_unlock(lock: &n->write_lock);
1174	}
1175
1176	return n;
1177	}
1178
1179	static void make_btree_freeing_key(struct btree b, struct* bkey *k)
1180	{
1181	unsigned int i;
1182
1183	mutex_lock(&b->c->bucket_lock);
1184
1185	atomic_inc(v: &b->c->prio_blocked);
1186
1187	bkey_copy(k, &b->key);
1188	bkey_copy_key(dest: k, src: &ZERO_KEY);
1189
1190	for (i = `0`; i < KEY_PTRS(k); i++)
1191	SET_PTR_GEN(k, i,
1192	v: bch_inc_gen(ca: b->c->cache,
1193	b: PTR_BUCKET(c: b->c, k: &b->key, ptr: i)));
1194
1195	mutex_unlock(lock: &b->c->bucket_lock);
1196	}
1197
1198	static int btree_check_reserve(struct btree b, struct* btree_op *op)
1199	{
1200	struct cache_set *c = b->c;
1201	struct cache *ca = c->cache;
1202	unsigned int reserve = (c->root->level - b->level) * `2` + `1`;
1203
1204	mutex_lock(&c->bucket_lock);
1205
1206	if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1207	if (op)
1208	prepare_to_wait(wq_head: &c->btree_cache_wait, wq_entry: &op->wait,
1209	TASK_UNINTERRUPTIBLE);
1210	mutex_unlock(lock: &c->bucket_lock);
1211	return -EINTR;
1212	}
1213
1214	mutex_unlock(lock: &c->bucket_lock);
1215
1216	return mca_cannibalize_lock(c: b->c, op);
1217	}
1218
/* Garbage collection */
1220
1221	static uint8_t __bch_btree_mark_key(struct cache_set c, int* level,
1222	struct bkey *k)
1223	{
1224	uint8_t stale = `0`;
1225	unsigned int i;
1226	struct bucket *g;
1227
1228	/*
1229	* ptr_invalid() can't return true for the keys that mark btree nodes as
1230	* freed, but since ptr_bad() returns true we'll never actually use them
1231	* for anything and thus we don't want mark their pointers here
1232	*/
1233	if (!bkey_cmp(l: k, r: &ZERO_KEY))
1234	return stale;
1235
1236	for (i = `0`; i < KEY_PTRS(k); i++) {
1237	if (!ptr_available(c, k, i))
1238	continue;
1239
1240	g = PTR_BUCKET(c, k, ptr: i);
1241
1242	if (gen_after(a: g->last_gc, b: PTR_GEN(k, i)))
1243	g->last_gc = PTR_GEN(k, i);
1244
1245	if (ptr_stale(c, k, i)) {
1246	stale = max(stale, ptr_stale(c, k, i));
1247	continue;
1248	}
1249
1250	cache_bug_on(GC_MARK(g) &&
1251	(GC_MARK(g) == GC_MARK_METADATA) != (level != `0`),
1252	c, "inconsistent ptrs: mark = %llu, level = %i",
1253	GC_MARK(g), level);
1254
1255	if (level)
1256	SET_GC_MARK(k: g, GC_MARK_METADATA);
1257	else if (KEY_DIRTY(k))
1258	SET_GC_MARK(k: g, GC_MARK_DIRTY);
1259	else if (!GC_MARK(k: g))
1260	SET_GC_MARK(k: g, GC_MARK_RECLAIMABLE);
1261
1262	/ guard against overflow /
1263	SET_GC_SECTORS_USED(k: g, min_t(unsigned int,
1264	GC_SECTORS_USED(g) + KEY_SIZE(k),
1265	MAX_GC_SECTORS_USED));
1266
1267	BUG_ON(!GC_SECTORS_USED(g));
1268	}
1269
1270	return stale;
1271	}
1272
/* GC-mark key @k using the cache set and level of btree node @b. */
#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k)
1274
1275	void bch_initial_mark_key(struct cache_set c, int* level, struct bkey *k)
1276	{
1277	unsigned int i;
1278
1279	for (i = `0`; i < KEY_PTRS(k); i++)
1280	if (ptr_available(c, k, i) &&
1281	!ptr_stale(c, k, i)) {
1282	struct bucket *b = PTR_BUCKET(c, k, ptr: i);
1283
1284	b->gen = PTR_GEN(k, i);
1285
1286	if (level && bkey_cmp(l: k, r: &ZERO_KEY))
1287	b->prio = BTREE_PRIO;
1288	else if (!level && b->prio == BTREE_PRIO)
1289	b->prio = INITIAL_PRIO;
1290	}
1291
1292	__bch_btree_mark_key(c, level, k);
1293	}
1294
1295	void bch_update_bucket_in_use(struct cache_set c, struct* gc_stat *stats)
1296	{
1297	stats->in_use = (c->nbuckets - c->avail_nbuckets) * `100` / c->nbuckets;
1298	}
1299
1300	static bool btree_gc_mark_node(struct btree b, struct* gc_stat *gc)
1301	{
1302	uint8_t stale = `0`;
1303	unsigned int keys = `0`, good_keys = `0`;
1304	struct bkey *k;
1305	struct btree_iter iter;
1306	struct bset_tree *t;
1307
1308	gc->nodes++;
1309
1310	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1311	stale = max(stale, btree_mark_key(b, k));
1312	keys++;
1313
1314	if (bch_ptr_bad(b: &b->keys, k))
1315	continue;
1316
1317	gc->key_bytes += bkey_u64s(k);
1318	gc->nkeys++;
1319	good_keys++;
1320
1321	gc->data += KEY_SIZE(k);
1322	}
1323
1324	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1325	btree_bug_on(t->size &&
1326	bset_written(&b->keys, t) &&
1327	bkey_cmp(&b->key, &t->end) < `0`,
1328	b, "found short btree key in gc");
1329
1330	if (b->c->gc_always_rewrite)
1331	return true;
1332
1333	if (stale > `10`)
1334	return true;
1335
1336	if ((keys - good_keys) * `2` > keys)
1337	return true;
1338
1339	return false;
1340	}
1341
/* Size of the window of sibling nodes GC keeps around for coalescing. */
#define GC_MERGE_NODES 4U
1343
/* One slot of the GC coalescing window. */
struct gc_merge_info {
	/* Node in this slot; GC also stores NULL or an ERR_PTR for "empty". */
	struct btree *b;
	/* Size of the node's keys in u64s, as set from btree_gc_count_keys(). */
	unsigned int keys;
};
1348
1349	static int bch_btree_insert_node(struct btree b, struct* btree_op *op,
1350	struct keylist *insert_keys,
1351	atomic_t *journal_ref,
1352	struct bkey *replace_key);
1353
1354	static int btree_gc_coalesce(struct btree b, struct* btree_op *op,
1355	struct gc_stat gc, struct* gc_merge_info *r)
1356	{
1357	unsigned int i, nodes = `0`, keys = `0`, blocks;
1358	struct btree *new_nodes[GC_MERGE_NODES];
1359	struct keylist keylist;
1360	struct closure cl;
1361	struct bkey *k;
1362
1363	bch_keylist_init(l: &keylist);
1364
1365	if (btree_check_reserve(b, NULL))
1366	return `0`;
1367
1368	memset(new_nodes, `0`, sizeof(new_nodes));
1369	closure_init_stack(cl: &cl);
1370
1371	while (nodes < GC_MERGE_NODES && !IS_ERR(ptr: r[nodes].b))
1372	keys += r[nodes++].keys;
1373
1374	blocks = btree_default_blocks(b->c) * `2` / `3`;
1375
1376	if (nodes < `2` \|\|
1377	__set_blocks(b->keys.set[`0`].data, keys,
1378	block_bytes(b->c->cache)) > blocks * (nodes - `1`))
1379	return `0`;
1380
1381	for (i = `0`; i < nodes; i++) {
1382	new_nodes[i] = btree_node_alloc_replacement(b: r[i].b, NULL);
1383	if (IS_ERR(ptr: new_nodes[i]))
1384	goto out_nocoalesce;
1385	}
1386
1387	/*
1388	* We have to check the reserve here, after we've allocated our new
1389	* nodes, to make sure the insert below will succeed - we also check
1390	* before as an optimization to potentially avoid a bunch of expensive
1391	* allocs/sorts
1392	*/
1393	if (btree_check_reserve(b, NULL))
1394	goto out_nocoalesce;
1395
1396	for (i = `0`; i < nodes; i++)
1397	mutex_lock(&new_nodes[i]->write_lock);
1398
1399	for (i = nodes - `1`; i > `0`; --i) {
1400	struct bset *n1 = btree_bset_first(b: new_nodes[i]);
1401	struct bset *n2 = btree_bset_first(b: new_nodes[i - `1`]);
1402	struct bkey k, last = NULL;
1403
1404	keys = `0`;
1405
1406	if (i > `1`) {
1407	for (k = n2->start;
1408	k < bset_bkey_last(n2);
1409	k = bkey_next(k)) {
1410	if (__set_blocks(n1, n1->keys + keys +
1411	bkey_u64s(k),
1412	block_bytes(b->c->cache)) > blocks)
1413	break;
1414
1415	last = k;
1416	keys += bkey_u64s(k);
1417	}
1418	} else {
1419	/*
1420	* Last node we're not getting rid of - we're getting
1421	* rid of the node at r[0]. Have to try and fit all of
1422	* the remaining keys into this node; we can't ensure
1423	* they will always fit due to rounding and variable
1424	* length keys (shouldn't be possible in practice,
1425	* though)
1426	*/
1427	if (__set_blocks(n1, n1->keys + n2->keys,
1428	block_bytes(b->c->cache)) >
1429	btree_blocks(new_nodes[i]))
1430	goto out_unlock_nocoalesce;
1431
1432	keys = n2->keys;
1433	/ Take the key of the node we're getting rid of /
1434	last = &r->b->key;
1435	}
1436
1437	BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) >
1438	btree_blocks(new_nodes[i]));
1439
1440	if (last)
1441	bkey_copy_key(dest: &new_nodes[i]->key, src: last);
1442
1443	memcpy(bset_bkey_last(n1),
1444	n2->start,
1445	(void ) bset_bkey_idx(n2, keys) - (void* *) n2->start);
1446
1447	n1->keys += keys;
1448	r[i].keys = n1->keys;
1449
1450	memmove(n2->start,
1451	bset_bkey_idx(n2, keys),
1452	(void *) bset_bkey_last(n2) -
1453	(void *) bset_bkey_idx(n2, keys));
1454
1455	n2->keys -= keys;
1456
1457	if (__bch_keylist_realloc(l: &keylist,
1458	u64s: bkey_u64s(k: &new_nodes[i]->key)))
1459	goto out_unlock_nocoalesce;
1460
1461	bch_btree_node_write(b: new_nodes[i], parent: &cl);
1462	bch_keylist_add(l: &keylist, k: &new_nodes[i]->key);
1463	}
1464
1465	for (i = `0`; i < nodes; i++)
1466	mutex_unlock(lock: &new_nodes[i]->write_lock);
1467
1468	closure_sync(cl: &cl);
1469
1470	/ We emptied out this node /
1471	BUG_ON(btree_bset_first(new_nodes[`0`])->keys);
1472	btree_node_free(b: new_nodes[`0`]);
1473	rw_unlock(w: true, b: new_nodes[`0`]);
1474	new_nodes[`0`] = NULL;
1475
1476	for (i = `0`; i < nodes; i++) {
1477	if (__bch_keylist_realloc(l: &keylist, u64s: bkey_u64s(k: &r[i].b->key)))
1478	goto out_nocoalesce;
1479
1480	make_btree_freeing_key(b: r[i].b, k: keylist.top);
1481	bch_keylist_push(l: &keylist);
1482	}
1483
1484	bch_btree_insert_node(b, op, insert_keys: &keylist, NULL, NULL);
1485	BUG_ON(!bch_keylist_empty(&keylist));
1486
1487	for (i = `0`; i < nodes; i++) {
1488	btree_node_free(b: r[i].b);
1489	rw_unlock(w: true, b: r[i].b);
1490
1491	r[i].b = new_nodes[i];
1492	}
1493
1494	memmove(r, r + `1`, sizeof(r[`0`]) * (nodes - `1`));
1495	r[nodes - `1`].b = ERR_PTR(error: -EINTR);
1496
1497	trace_bcache_btree_gc_coalesce(nodes);
1498	gc->nodes--;
1499
1500	bch_keylist_free(l: &keylist);
1501
1502	/ Invalidated our iterator /
1503	return -EINTR;
1504
1505	out_unlock_nocoalesce:
1506	for (i = `0`; i < nodes; i++)
1507	mutex_unlock(lock: &new_nodes[i]->write_lock);
1508
1509	out_nocoalesce:
1510	closure_sync(cl: &cl);
1511
1512	while ((k = bch_keylist_pop(l: &keylist)))
1513	if (!bkey_cmp(l: k, r: &ZERO_KEY))
1514	atomic_dec(v: &b->c->prio_blocked);
1515	bch_keylist_free(l: &keylist);
1516
1517	for (i = `0`; i < nodes; i++)
1518	if (!IS_ERR(ptr: new_nodes[i])) {
1519	btree_node_free(b: new_nodes[i]);
1520	rw_unlock(w: true, b: new_nodes[i]);
1521	}
1522	return `0`;
1523	}
1524
1525	static int btree_gc_rewrite_node(struct btree b, struct* btree_op *op,
1526	struct btree *replace)
1527	{
1528	struct keylist keys;
1529	struct btree *n;
1530
1531	if (btree_check_reserve(b, NULL))
1532	return `0`;
1533
1534	n = btree_node_alloc_replacement(b: replace, NULL);
1535
1536	/ recheck reserve after allocating replacement node /
1537	if (btree_check_reserve(b, NULL)) {
1538	btree_node_free(b: n);
1539	rw_unlock(w: true, b: n);
1540	return `0`;
1541	}
1542
1543	bch_btree_node_write_sync(b: n);
1544
1545	bch_keylist_init(l: &keys);
1546	bch_keylist_add(l: &keys, k: &n->key);
1547
1548	make_btree_freeing_key(b: replace, k: keys.top);
1549	bch_keylist_push(l: &keys);
1550
1551	bch_btree_insert_node(b, op, insert_keys: &keys, NULL, NULL);
1552	BUG_ON(!bch_keylist_empty(&keys));
1553
1554	btree_node_free(b: replace);
1555	rw_unlock(w: true, b: n);
1556
1557	/ Invalidated our iterator /
1558	return -EINTR;
1559	}
1560
1561	static unsigned int btree_gc_count_keys(struct btree *b)
1562	{
1563	struct bkey *k;
1564	struct btree_iter iter;
1565	unsigned int ret = `0`;
1566
1567	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1568	ret += bkey_u64s(k);
1569
1570	return ret;
1571	}
1572
1573	static size_t btree_gc_min_nodes(struct cache_set *c)
1574	{
1575	size_t min_nodes;
1576
1577	/*
1578	* Since incremental GC would stop 100ms when front
1579	* side I/O comes, so when there are many btree nodes,
1580	* if GC only processes constant (100) nodes each time,
1581	* GC would last a long time, and the front side I/Os
1582	* would run out of the buckets (since no new bucket
1583	* can be allocated during GC), and be blocked again.
1584	* So GC should not process constant nodes, but varied
1585	* nodes according to the number of btree nodes, which
1586	* realized by dividing GC into constant(100) times,
1587	* so when there are many btree nodes, GC can process
1588	* more nodes each time, otherwise, GC will process less
1589	* nodes each time (but no less than MIN_GC_NODES)
1590	*/
1591	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
1592	if (min_nodes < MIN_GC_NODES)
1593	min_nodes = MIN_GC_NODES;
1594
1595	return min_nodes;
1596	}
1597
1598
1599	static int btree_gc_recurse(struct btree b, struct* btree_op *op,
1600	struct closure writes, struct* gc_stat *gc)
1601	{
1602	int ret = `0`;
1603	bool should_rewrite;
1604	struct bkey *k;
1605	struct btree_iter iter;
1606	struct gc_merge_info r[GC_MERGE_NODES];
1607	struct gc_merge_info i, last = r + ARRAY_SIZE(r) - `1`;
1608
1609	bch_btree_iter_init(b: &b->keys, iter: &iter, search: &b->c->gc_done);
1610
1611	for (i = r; i < r + ARRAY_SIZE(r); i++)
1612	i->b = ERR_PTR(error: -EINTR);
1613
1614	while (`1`) {
1615	k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys, fn: bch_ptr_bad);
1616	if (k) {
1617	r->b = bch_btree_node_get(c: b->c, op, k, level: b->level - `1`,
1618	write: true, parent: b);
1619	if (IS_ERR(ptr: r->b)) {
1620	ret = PTR_ERR(ptr: r->b);
1621	break;
1622	}
1623
1624	r->keys = btree_gc_count_keys(b: r->b);
1625
1626	ret = btree_gc_coalesce(b, op, gc, r);
1627	if (ret)
1628	break;
1629	}
1630
1631	if (!last->b)
1632	break;
1633
1634	if (!IS_ERR(ptr: last->b)) {
1635	should_rewrite = btree_gc_mark_node(b: last->b, gc);
1636	if (should_rewrite) {
1637	ret = btree_gc_rewrite_node(b, op, replace: last->b);
1638	if (ret)
1639	break;
1640	}
1641
1642	if (last->b->level) {
1643	ret = btree_gc_recurse(b: last->b, op, writes, gc);
1644	if (ret)
1645	break;
1646	}
1647
1648	bkey_copy_key(dest: &b->c->gc_done, src: &last->b->key);
1649
1650	/*
1651	* Must flush leaf nodes before gc ends, since replace
1652	* operations aren't journalled
1653	*/
1654	mutex_lock(&last->b->write_lock);
1655	if (btree_node_dirty(b: last->b))
1656	bch_btree_node_write(b: last->b, parent: writes);
1657	mutex_unlock(lock: &last->b->write_lock);
1658	rw_unlock(w: true, b: last->b);
1659	}
1660
1661	memmove(r + `1`, r, sizeof(r[`0`]) * (GC_MERGE_NODES - `1`));
1662	r->b = NULL;
1663
1664	if (atomic_read(v: &b->c->search_inflight) &&
1665	gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(c: b->c)) {
1666	gc->nodes_pre = gc->nodes;
1667	ret = -EAGAIN;
1668	break;
1669	}
1670
1671	if (need_resched()) {
1672	ret = -EAGAIN;
1673	break;
1674	}
1675	}
1676
1677	for (i = r; i < r + ARRAY_SIZE(r); i++)
1678	if (!IS_ERR_OR_NULL(ptr: i->b)) {
1679	mutex_lock(&i->b->write_lock);
1680	if (btree_node_dirty(b: i->b))
1681	bch_btree_node_write(b: i->b, parent: writes);
1682	mutex_unlock(lock: &i->b->write_lock);
1683	rw_unlock(w: true, b: i->b);
1684	}
1685
1686	return ret;
1687	}
1688
1689	static int bch_btree_gc_root(struct btree b, struct* btree_op *op,
1690	struct closure writes, struct* gc_stat *gc)
1691	{
1692	struct btree *n = NULL;
1693	int ret = `0`;
1694	bool should_rewrite;
1695
1696	should_rewrite = btree_gc_mark_node(b, gc);
1697	if (should_rewrite) {
1698	n = btree_node_alloc_replacement(b, NULL);
1699
1700	if (!IS_ERR(ptr: n)) {
1701	bch_btree_node_write_sync(b: n);
1702
1703	bch_btree_set_root(b: n);
1704	btree_node_free(b);
1705	rw_unlock(w: true, b: n);
1706
1707	return -EINTR;
1708	}
1709	}
1710
1711	__bch_btree_mark_key(c: b->c, level: b->level + `1`, k: &b->key);
1712
1713	if (b->level) {
1714	ret = btree_gc_recurse(b, op, writes, gc);
1715	if (ret)
1716	return ret;
1717	}
1718
1719	bkey_copy_key(dest: &b->c->gc_done, src: &b->key);
1720
1721	return ret;
1722	}
1723
1724	static void btree_gc_start(struct cache_set *c)
1725	{
1726	struct cache *ca;
1727	struct bucket *b;
1728
1729	if (!c->gc_mark_valid)
1730	return;
1731
1732	mutex_lock(&c->bucket_lock);
1733
1734	c->gc_mark_valid = `0`;
1735	c->gc_done = ZERO_KEY;
1736
1737	ca = c->cache;
1738	for_each_bucket(b, ca) {
1739	b->last_gc = b->gen;
1740	if (!atomic_read(v: &b->pin)) {
1741	SET_GC_MARK(k: b, v: `0`);
1742	SET_GC_SECTORS_USED(k: b, v: `0`);
1743	}
1744	}
1745
1746	mutex_unlock(lock: &c->bucket_lock);
1747	}
1748
1749	static void bch_btree_gc_finish(struct cache_set *c)
1750	{
1751	struct bucket *b;
1752	struct cache *ca;
1753	unsigned int i, j;
1754	uint64_t *k;
1755
1756	mutex_lock(&c->bucket_lock);
1757
1758	set_gc_sectors(c);
1759	c->gc_mark_valid = `1`;
1760	c->need_gc = `0`;
1761
1762	for (i = `0`; i < KEY_PTRS(k: &c->uuid_bucket); i++)
1763	SET_GC_MARK(k: PTR_BUCKET(c, k: &c->uuid_bucket, ptr: i),
1764	GC_MARK_METADATA);
1765
1766	/ don't reclaim buckets to which writeback keys point /
1767	rcu_read_lock();
1768	for (i = `0`; i < c->devices_max_used; i++) {
1769	struct bcache_device *d = c->devices[i];
1770	struct cached_dev *dc;
1771	struct keybuf_key w, n;
1772
1773	if (!d \|\| UUID_FLASH_ONLY(k: &c->uuids[i]))
1774	continue;
1775	dc = container_of(d, struct cached_dev, disk);
1776
1777	spin_lock(lock: &dc->writeback_keys.lock);
1778	rbtree_postorder_for_each_entry_safe(w, n,
1779	&dc->writeback_keys.keys, node)
1780	for (j = `0`; j < KEY_PTRS(k: &w->key); j++)
1781	SET_GC_MARK(k: PTR_BUCKET(c, k: &w->key, ptr: j),
1782	GC_MARK_DIRTY);
1783	spin_unlock(lock: &dc->writeback_keys.lock);
1784	}
1785	rcu_read_unlock();
1786
1787	c->avail_nbuckets = `0`;
1788
1789	ca = c->cache;
1790	ca->invalidate_needs_gc = `0`;
1791
1792	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
1793	SET_GC_MARK(k: ca->buckets + *k, GC_MARK_METADATA);
1794
1795	for (k = ca->prio_buckets;
1796	k < ca->prio_buckets + prio_buckets(ca) * `2`; k++)
1797	SET_GC_MARK(k: ca->buckets + *k, GC_MARK_METADATA);
1798
1799	for_each_bucket(b, ca) {
1800	c->need_gc = max(c->need_gc, bucket_gc_gen(b));
1801
1802	if (atomic_read(v: &b->pin))
1803	continue;
1804
1805	BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1806
1807	if (!GC_MARK(k: b) \|\| GC_MARK(k: b) == GC_MARK_RECLAIMABLE)
1808	c->avail_nbuckets++;
1809	}
1810
1811	mutex_unlock(lock: &c->bucket_lock);
1812	}
1813
1814	static void bch_btree_gc(struct cache_set *c)
1815	{
1816	int ret;
1817	struct gc_stat stats;
1818	struct closure writes;
1819	struct btree_op op;
1820	uint64_t start_time = local_clock();
1821
1822	trace_bcache_gc_start(c);
1823
1824	memset(&stats, `0`, sizeof(struct gc_stat));
1825	closure_init_stack(cl: &writes);
1826	bch_btree_op_init(op: &op, SHRT_MAX);
1827
1828	btree_gc_start(c);
1829
1830	/ if CACHE_SET_IO_DISABLE set, gc thread should stop too /
1831	do {
1832	ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
1833	closure_sync(cl: &writes);
1834	cond_resched();
1835
1836	if (ret == -EAGAIN)
1837	schedule_timeout_interruptible(timeout: msecs_to_jiffies
1838	(GC_SLEEP_MS));
1839	else if (ret)
1840	pr_warn("gc failed!\n");
1841	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1842
1843	bch_btree_gc_finish(c);
1844	wake_up_allocators(c);
1845
1846	bch_time_stats_update(stats: &c->btree_gc_time, time: start_time);
1847
1848	stats.key_bytes = sizeof*(uint64_t);
1849	stats.data <<= `9`;
1850	bch_update_bucket_in_use(c, stats: &stats);
1851	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1852
1853	trace_bcache_gc_end(c);
1854
1855	bch_moving_gc(c);
1856	}
1857
1858	static bool gc_should_run(struct cache_set *c)
1859	{
1860	struct cache *ca = c->cache;
1861
1862	if (ca->invalidate_needs_gc)
1863	return true;
1864
1865	if (atomic_read(v: &c->sectors_to_gc) < `0`)
1866	return true;
1867
1868	return false;
1869	}
1870
1871	static int bch_gc_thread(void *arg)
1872	{
1873	struct cache_set *c = arg;
1874
1875	while (`1`) {
1876	wait_event_interruptible(c->gc_wait,
1877	kthread_should_stop() \|\|
1878	test_bit(CACHE_SET_IO_DISABLE, &c->flags) \|\|
1879	gc_should_run(c));
1880
1881	if (kthread_should_stop() \|\|
1882	test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1883	break;
1884
1885	set_gc_sectors(c);
1886	bch_btree_gc(c);
1887	}
1888
1889	wait_for_kthread_stop();
1890	return `0`;
1891	}
1892
1893	int bch_gc_thread_start(struct cache_set *c)
1894	{
1895	c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1896	return PTR_ERR_OR_ZERO(ptr: c->gc_thread);
1897	}
1898
/* Initial partial gc */
1900
1901	static int bch_btree_check_recurse(struct btree b, struct* btree_op *op)
1902	{
1903	int ret = `0`;
1904	struct bkey k, p = NULL;
1905	struct btree_iter iter;
1906
1907	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1908	bch_initial_mark_key(c: b->c, level: b->level, k);
1909
1910	bch_initial_mark_key(c: b->c, level: b->level + `1`, k: &b->key);
1911
1912	if (b->level) {
1913	bch_btree_iter_init(b: &b->keys, iter: &iter, NULL);
1914
1915	do {
1916	k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys,
1917	fn: bch_ptr_bad);
1918	if (k) {
1919	btree_node_prefetch(parent: b, k);
1920	/*
1921	* initiallize c->gc_stats.nodes
1922	* for incremental GC
1923	*/
1924	b->c->gc_stats.nodes++;
1925	}
1926
1927	if (p)
1928	ret = bcache_btree(check_recurse, p, b, op);
1929
1930	p = k;
1931	} while (p && !ret);
1932	}
1933
1934	return ret;
1935	}
1936
1937
1938	static int bch_btree_check_thread(void *arg)
1939	{
1940	int ret;
1941	struct btree_check_info *info = arg;
1942	struct btree_check_state *check_state = info->state;
1943	struct cache_set *c = check_state->c;
1944	struct btree_iter iter;
1945	struct bkey k, p;
1946	int cur_idx, prev_idx, skip_nr;
1947
1948	k = p = NULL;
1949	cur_idx = prev_idx = `0`;
1950	ret = `0`;
1951
1952	/ root node keys are checked before thread created /
1953	bch_btree_iter_init(b: &c->root->keys, iter: &iter, NULL);
1954	k = bch_btree_iter_next_filter(iter: &iter, b: &c->root->keys, fn: bch_ptr_bad);
1955	BUG_ON(!k);
1956
1957	p = k;
1958	while (k) {
1959	/*
1960	* Fetch a root node key index, skip the keys which
1961	* should be fetched by other threads, then check the
1962	* sub-tree indexed by the fetched key.
1963	*/
1964	spin_lock(lock: &check_state->idx_lock);
1965	cur_idx = check_state->key_idx;
1966	check_state->key_idx++;
1967	spin_unlock(lock: &check_state->idx_lock);
1968
1969	skip_nr = cur_idx - prev_idx;
1970
1971	while (skip_nr) {
1972	k = bch_btree_iter_next_filter(iter: &iter,
1973	b: &c->root->keys,
1974	fn: bch_ptr_bad);
1975	if (k)
1976	p = k;
1977	else {
1978	/*
1979	* No more keys to check in root node,
1980	* current checking threads are enough,
1981	* stop creating more.
1982	*/
1983	atomic_set(v: &check_state->enough, i: `1`);
1984	/ Update check_state->enough earlier /
1985	smp_mb__after_atomic();
1986	goto out;
1987	}
1988	skip_nr--;
1989	cond_resched();
1990	}
1991
1992	if (p) {
1993	struct btree_op op;
1994
1995	btree_node_prefetch(parent: c->root, k: p);
1996	c->gc_stats.nodes++;
1997	bch_btree_op_init(op: &op, write_lock_level: `0`);
1998	ret = bcache_btree(check_recurse, p, c->root, &op);
1999	/*
2000	* The op may be added to cache_set's btree_cache_wait
2001	* in mca_cannibalize(), must ensure it is removed from
2002	* the list and release btree_cache_alloc_lock before
2003	* free op memory.
2004	* Otherwise, the btree_cache_wait will be damaged.
2005	*/
2006	bch_cannibalize_unlock(c);
2007	finish_wait(wq_head: &c->btree_cache_wait, wq_entry: &(&op)->wait);
2008	if (ret)
2009	goto out;
2010	}
2011	p = NULL;
2012	prev_idx = cur_idx;
2013	cond_resched();
2014	}
2015
2016	out:
2017	info->result = ret;
2018	/ update check_state->started among all CPUs /
2019	smp_mb__before_atomic();
2020	if (atomic_dec_and_test(v: &check_state->started))
2021	wake_up(&check_state->wait);
2022
2023	return ret;
2024	}
2025
2026
2027
2028	static int bch_btree_chkthread_nr(void)
2029	{
2030	int n = num_online_cpus()/`2`;
2031
2032	if (n == `0`)
2033	n = `1`;
2034	else if (n > BCH_BTR_CHKTHREAD_MAX)
2035	n = BCH_BTR_CHKTHREAD_MAX;
2036
2037	return n;
2038	}
2039
2040	int bch_btree_check(struct cache_set *c)
2041	{
2042	int ret = `0`;
2043	int i;
2044	struct bkey *k = NULL;
2045	struct btree_iter iter;
2046	struct btree_check_state check_state;
2047
2048	/ check and mark root node keys /
2049	for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
2050	bch_initial_mark_key(c, level: c->root->level, k);
2051
2052	bch_initial_mark_key(c, level: c->root->level + `1`, k: &c->root->key);
2053
2054	if (c->root->level == `0`)
2055	return `0`;
2056
2057	memset(&check_state, `0`, sizeof(struct btree_check_state));
2058	check_state.c = c;
2059	check_state.total_threads = bch_btree_chkthread_nr();
2060	check_state.key_idx = `0`;
2061	spin_lock_init(&check_state.idx_lock);
2062	atomic_set(v: &check_state.started, i: `0`);
2063	atomic_set(v: &check_state.enough, i: `0`);
2064	init_waitqueue_head(&check_state.wait);
2065
2066	rw_lock(w: `0`, b: c->root, level: c->root->level);
2067	/*
2068	* Run multiple threads to check btree nodes in parallel,
2069	* if check_state.enough is non-zero, it means current
2070	* running check threads are enough, unncessary to create
2071	* more.
2072	*/
2073	for (i = `0`; i < check_state.total_threads; i++) {
2074	/ fetch latest check_state.enough earlier /
2075	smp_mb__before_atomic();
2076	if (atomic_read(v: &check_state.enough))
2077	break;
2078
2079	check_state.infos[i].result = `0`;
2080	check_state.infos[i].state = &check_state;
2081
2082	check_state.infos[i].thread =
2083	kthread_run(bch_btree_check_thread,
2084	&check_state.infos[i],
2085	"bch_btrchk[%d]", i);
2086	if (IS_ERR(ptr: check_state.infos[i].thread)) {
2087	pr_err("fails to run thread bch_btrchk[%d]\n", i);
2088	for (--i; i >= `0`; i--)
2089	kthread_stop(k: check_state.infos[i].thread);
2090	ret = -ENOMEM;
2091	goto out;
2092	}
2093	atomic_inc(v: &check_state.started);
2094	}
2095
2096	/*
2097	* Must wait for all threads to stop.
2098	*/
2099	wait_event(check_state.wait, atomic_read(&check_state.started) == `0`);
2100
2101	for (i = `0`; i < check_state.total_threads; i++) {
2102	if (check_state.infos[i].result) {
2103	ret = check_state.infos[i].result;
2104	goto out;
2105	}
2106	}
2107
2108	out:
2109	rw_unlock(w: `0`, b: c->root);
2110	return ret;
2111	}
2112
2113	void bch_initial_gc_finish(struct cache_set *c)
2114	{
2115	struct cache *ca = c->cache;
2116	struct bucket *b;
2117
2118	bch_btree_gc_finish(c);
2119
2120	mutex_lock(&c->bucket_lock);
2121
2122	/*
2123	* We need to put some unused buckets directly on the prio freelist in
2124	* order to get the allocator thread started - it needs freed buckets in
2125	* order to rewrite the prios and gens, and it needs to rewrite prios
2126	* and gens in order to free buckets.
2127	*
2128	* This is only safe for buckets that have no live data in them, which
2129	* there should always be some of.
2130	*/
2131	for_each_bucket(b, ca) {
2132	if (fifo_full(&ca->free[RESERVE_PRIO]) &&
2133	fifo_full(&ca->free[RESERVE_BTREE]))
2134	break;
2135
2136	if (bch_can_invalidate_bucket(ca, b) &&
2137	!GC_MARK(k: b)) {
2138	__bch_invalidate_one_bucket(ca, b);
2139	if (!fifo_push(&ca->free[RESERVE_PRIO],
2140	b - ca->buckets))
2141	fifo_push(&ca->free[RESERVE_BTREE],
2142	b - ca->buckets);
2143	}
2144	}
2145
2146	mutex_unlock(lock: &c->bucket_lock);
2147	}
2148
/* Btree insertion */
2150
2151	static bool btree_insert_key(struct btree b, struct* bkey *k,
2152	struct bkey *replace_key)
2153	{
2154	unsigned int status;
2155
2156	BUG_ON(bkey_cmp(k, &b->key) > `0`);
2157
2158	status = bch_btree_insert_key(b: &b->keys, k, replace_key);
2159	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
2160	bch_check_keys(&b->keys, "%u for %s", status,
2161	replace_key ? "replace" : "insert");
2162
2163	trace_bcache_btree_insert_key(b, k, op: replace_key != NULL,
2164	status);
2165	return true;
2166	} else
2167	return false;
2168	}
2169
2170	static size_t insert_u64s_remaining(struct btree *b)
2171	{
2172	long ret = bch_btree_keys_u64s_remaining(b: &b->keys);
2173
2174	/*
2175	* Might land in the middle of an existing extent and have to split it
2176	*/
2177	if (b->keys.ops->is_extents)
2178	ret -= KEY_MAX_U64S;
2179
2180	return max(ret, `0L`);
2181	}
2182
2183	static bool bch_btree_insert_keys(struct btree b, struct* btree_op *op,
2184	struct keylist *insert_keys,
2185	struct bkey *replace_key)
2186	{
2187	bool ret = false;
2188	int oldsize = bch_count_data(b: &b->keys);
2189
2190	while (!bch_keylist_empty(l: insert_keys)) {
2191	struct bkey *k = insert_keys->keys;
2192
2193	if (bkey_u64s(k) > insert_u64s_remaining(b))
2194	break;
2195
2196	if (bkey_cmp(l: k, r: &b->key) <= `0`) {
2197	if (!b->level)
2198	bkey_put(c: b->c, k);
2199
2200	ret \|= btree_insert_key(b, k, replace_key);
2201	bch_keylist_pop_front(l: insert_keys);
2202	} else if (bkey_cmp(l: &START_KEY(k), r: &b->key) < `0`) {
2203	BKEY_PADDED(key) temp;
2204	bkey_copy(&temp.key, insert_keys->keys);
2205
2206	bch_cut_back(where: &b->key, k: &temp.key);
2207	bch_cut_front(where: &b->key, k: insert_keys->keys);
2208
2209	ret \|= btree_insert_key(b, k: &temp.key, replace_key);
2210	break;
2211	} else {
2212	break;
2213	}
2214	}
2215
2216	if (!ret)
2217	op->insert_collision = true;
2218
2219	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2220
2221	BUG_ON(bch_count_data(&b->keys) < oldsize);
2222	return ret;
2223	}
2224
2225	static int btree_split(struct btree b, struct* btree_op *op,
2226	struct keylist *insert_keys,
2227	struct bkey *replace_key)
2228	{
2229	bool split;
2230	struct btree n1, n2 = NULL, *n3 = NULL;
2231	uint64_t start_time = local_clock();
2232	struct closure cl;
2233	struct keylist parent_keys;
2234
2235	closure_init_stack(cl: &cl);
2236	bch_keylist_init(l: &parent_keys);
2237
2238	if (btree_check_reserve(b, op)) {
2239	if (!b->level)
2240	return -EINTR;
2241	else
2242	WARN(`1`, "insufficient reserve for split\n");
2243	}
2244
2245	n1 = btree_node_alloc_replacement(b, op);
2246	if (IS_ERR(ptr: n1))
2247	goto err;
2248
2249	split = set_blocks(btree_bset_first(n1),
2250	block_bytes(n1->c->cache)) > (btree_blocks(b) * `4`) / `5`;
2251
2252	if (split) {
2253	unsigned int keys = `0`;
2254
2255	trace_bcache_btree_node_split(b, keys: btree_bset_first(b: n1)->keys);
2256
2257	n2 = bch_btree_node_alloc(c: b->c, op, level: b->level, parent: b->parent);
2258	if (IS_ERR(ptr: n2))
2259	goto err_free1;
2260
2261	if (!b->parent) {
2262	n3 = bch_btree_node_alloc(c: b->c, op, level: b->level + `1`, NULL);
2263	if (IS_ERR(ptr: n3))
2264	goto err_free2;
2265	}
2266
2267	mutex_lock(&n1->write_lock);
2268	mutex_lock(&n2->write_lock);
2269
2270	bch_btree_insert_keys(b: n1, op, insert_keys, replace_key);
2271
2272	/*
2273	* Has to be a linear search because we don't have an auxiliary
2274	* search tree yet
2275	*/
2276
2277	while (keys < (btree_bset_first(b: n1)->keys * `3`) / `5`)
2278	keys += bkey_u64s(k: bset_bkey_idx(i: btree_bset_first(b: n1),
2279	idx: keys));
2280
2281	bkey_copy_key(dest: &n1->key,
2282	src: bset_bkey_idx(i: btree_bset_first(b: n1), idx: keys));
2283	keys += bkey_u64s(k: bset_bkey_idx(i: btree_bset_first(b: n1), idx: keys));
2284
2285	btree_bset_first(b: n2)->keys = btree_bset_first(b: n1)->keys - keys;
2286	btree_bset_first(b: n1)->keys = keys;
2287
2288	memcpy(btree_bset_first(n2)->start,
2289	bset_bkey_last(btree_bset_first(n1)),
2290	btree_bset_first(n2)->keys * sizeof(uint64_t));
2291
2292	bkey_copy_key(dest: &n2->key, src: &b->key);
2293
2294	bch_keylist_add(l: &parent_keys, k: &n2->key);
2295	bch_btree_node_write(b: n2, parent: &cl);
2296	mutex_unlock(lock: &n2->write_lock);
2297	rw_unlock(w: true, b: n2);
2298	} else {
2299	trace_bcache_btree_node_compact(b, keys: btree_bset_first(b: n1)->keys);
2300
2301	mutex_lock(&n1->write_lock);
2302	bch_btree_insert_keys(b: n1, op, insert_keys, replace_key);
2303	}
2304
2305	bch_keylist_add(l: &parent_keys, k: &n1->key);
2306	bch_btree_node_write(b: n1, parent: &cl);
2307	mutex_unlock(lock: &n1->write_lock);
2308
2309	if (n3) {
2310	/ Depth increases, make a new root /
2311	mutex_lock(&n3->write_lock);
2312	bkey_copy_key(dest: &n3->key, src: &MAX_KEY);
2313	bch_btree_insert_keys(b: n3, op, insert_keys: &parent_keys, NULL);
2314	bch_btree_node_write(b: n3, parent: &cl);
2315	mutex_unlock(lock: &n3->write_lock);
2316
2317	closure_sync(cl: &cl);
2318	bch_btree_set_root(b: n3);
2319	rw_unlock(w: true, b: n3);
2320	} else if (!b->parent) {
2321	/ Root filled up but didn't need to be split /
2322	closure_sync(cl: &cl);
2323	bch_btree_set_root(b: n1);
2324	} else {
2325	/ Split a non root node /
2326	closure_sync(cl: &cl);
2327	make_btree_freeing_key(b, k: parent_keys.top);
2328	bch_keylist_push(l: &parent_keys);
2329
2330	bch_btree_insert_node(b: b->parent, op, insert_keys: &parent_keys, NULL, NULL);
2331	BUG_ON(!bch_keylist_empty(&parent_keys));
2332	}
2333
2334	btree_node_free(b);
2335	rw_unlock(w: true, b: n1);
2336
2337	bch_time_stats_update(stats: &b->c->btree_split_time, time: start_time);
2338
2339	return `0`;
2340	err_free2:
2341	bkey_put(c: b->c, k: &n2->key);
2342	btree_node_free(b: n2);
2343	rw_unlock(w: true, b: n2);
2344	err_free1:
2345	bkey_put(c: b->c, k: &n1->key);
2346	btree_node_free(b: n1);
2347	rw_unlock(w: true, b: n1);
2348	err:
2349	WARN(`1`, "bcache: btree split failed (level %u)", b->level);
2350
2351	if (n3 == ERR_PTR(error: -EAGAIN) \|\|
2352	n2 == ERR_PTR(error: -EAGAIN) \|\|
2353	n1 == ERR_PTR(error: -EAGAIN))
2354	return -EAGAIN;
2355
2356	return -ENOMEM;
2357	}
2358
2359	static int bch_btree_insert_node(struct btree b, struct* btree_op *op,
2360	struct keylist *insert_keys,
2361	atomic_t *journal_ref,
2362	struct bkey *replace_key)
2363	{
2364	struct closure cl;
2365
2366	BUG_ON(b->level && replace_key);
2367
2368	closure_init_stack(cl: &cl);
2369
2370	mutex_lock(&b->write_lock);
2371
2372	if (write_block(b) != btree_bset_last(b) &&
2373	b->keys.last_set_unwritten)
2374	bch_btree_init_next(b); / just wrote a set /
2375
2376	if (bch_keylist_nkeys(l: insert_keys) > insert_u64s_remaining(b)) {
2377	mutex_unlock(lock: &b->write_lock);
2378	goto split;
2379	}
2380
2381	BUG_ON(write_block(b) != btree_bset_last(b));
2382
2383	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2384	if (!b->level)
2385	bch_btree_leaf_dirty(b, journal_ref);
2386	else
2387	bch_btree_node_write(b, parent: &cl);
2388	}
2389
2390	mutex_unlock(lock: &b->write_lock);
2391
2392	/ wait for btree node write if necessary, after unlock /
2393	closure_sync(cl: &cl);
2394
2395	return `0`;
2396	split:
2397	if (current->bio_list) {
2398	op->lock = b->c->root->level + `1`;
2399	return -EAGAIN;
2400	} else if (op->lock <= b->c->root->level) {
2401	op->lock = b->c->root->level + `1`;
2402	return -EINTR;
2403	} else {
2404	/ Invalidated all iterators /
2405	int ret = btree_split(b, op, insert_keys, replace_key);
2406
2407	if (bch_keylist_empty(l: insert_keys))
2408	return `0`;
2409	else if (!ret)
2410	return -EINTR;
2411	return ret;
2412	}
2413	}
2414
2415	int bch_btree_insert_check_key(struct btree b, struct* btree_op *op,
2416	struct bkey *check_key)
2417	{
2418	int ret = -EINTR;
2419	uint64_t btree_ptr = b->key.ptr[`0`];
2420	unsigned long seq = b->seq;
2421	struct keylist insert;
2422	bool upgrade = op->lock == -`1`;
2423
2424	bch_keylist_init(l: &insert);
2425
2426	if (upgrade) {
2427	rw_unlock(w: false, b);
2428	rw_lock(w: true, b, level: b->level);
2429
2430	if (b->key.ptr[`0`] != btree_ptr \|\|
2431	b->seq != seq + `1`) {
2432	op->lock = b->level;
2433	goto out;
2434	}
2435	}
2436
2437	SET_KEY_PTRS(k: check_key, v: `1`);
2438	get_random_bytes(buf: &check_key->ptr[`0`], len: sizeof(uint64_t));
2439
2440	SET_PTR_DEV(k: check_key, i: `0`, PTR_CHECK_DEV);
2441
2442	bch_keylist_add(l: &insert, k: check_key);
2443
2444	ret = bch_btree_insert_node(b, op, insert_keys: &insert, NULL, NULL);
2445
2446	BUG_ON(!ret && !bch_keylist_empty(&insert));
2447	out:
2448	if (upgrade)
2449	downgrade_write(sem: &b->lock);
2450	return ret;
2451	}
2452
2453	struct btree_insert_op {
2454	struct btree_op op;
2455	struct keylist *keys;
2456	atomic_t *journal_ref;
2457	struct bkey *replace_key;
2458	};
2459
2460	static int btree_insert_fn(struct btree_op b_op, struct* btree *b)
2461	{
2462	struct btree_insert_op *op = container_of(b_op,
2463	struct btree_insert_op, op);
2464
2465	int ret = bch_btree_insert_node(b, op: &op->op, insert_keys: op->keys,
2466	journal_ref: op->journal_ref, replace_key: op->replace_key);
2467	if (ret && !bch_keylist_empty(l: op->keys))
2468	return ret;
2469	else
2470	return MAP_DONE;
2471	}
2472
2473	int bch_btree_insert(struct cache_set c, struct* keylist *keys,
2474	atomic_t journal_ref, struct* bkey *replace_key)
2475	{
2476	struct btree_insert_op op;
2477	int ret = `0`;
2478
2479	BUG_ON(current->bio_list);
2480	BUG_ON(bch_keylist_empty(keys));
2481
2482	bch_btree_op_init(op: &op.op, write_lock_level: `0`);
2483	op.keys = keys;
2484	op.journal_ref = journal_ref;
2485	op.replace_key = replace_key;
2486
2487	while (!ret && !bch_keylist_empty(l: keys)) {
2488	op.op.lock = `0`;
2489	ret = bch_btree_map_leaf_nodes(op: &op.op, c,
2490	from: &START_KEY(keys->keys),
2491	fn: btree_insert_fn);
2492	}
2493
2494	if (ret) {
2495	struct bkey *k;
2496
2497	pr_err("error %i\n", ret);
2498
2499	while ((k = bch_keylist_pop(l: keys)))
2500	bkey_put(c, k);
2501	} else if (op.op.insert_collision)
2502	ret = -ESRCH;
2503
2504	return ret;
2505	}
2506
2507	void bch_btree_set_root(struct btree *b)
2508	{
2509	unsigned int i;
2510	struct closure cl;
2511
2512	closure_init_stack(cl: &cl);
2513
2514	trace_bcache_btree_set_root(b);
2515
2516	BUG_ON(!b->written);
2517
2518	for (i = `0`; i < KEY_PTRS(k: &b->key); i++)
2519	BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2520
2521	mutex_lock(&b->c->bucket_lock);
2522	list_del_init(entry: &b->list);
2523	mutex_unlock(lock: &b->c->bucket_lock);
2524
2525	b->c->root = b;
2526
2527	bch_journal_meta(c: b->c, cl: &cl);
2528	closure_sync(cl: &cl);
2529	}
2530
/* Map across nodes or keys */
2532
2533	static int bch_btree_map_nodes_recurse(struct btree b, struct* btree_op *op,
2534	struct bkey *from,
2535	btree_map_nodes_fn fn, int* flags)
2536	{
2537	int ret = MAP_CONTINUE;
2538
2539	if (b->level) {
2540	struct bkey *k;
2541	struct btree_iter iter;
2542
2543	bch_btree_iter_init(b: &b->keys, iter: &iter, search: from);
2544
2545	while ((k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys,
2546	fn: bch_ptr_bad))) {
2547	ret = bcache_btree(map_nodes_recurse, k, b,
2548	op, from, fn, flags);
2549	from = NULL;
2550
2551	if (ret != MAP_CONTINUE)
2552	return ret;
2553	}
2554	}
2555
2556	if (!b->level \|\| flags == MAP_ALL_NODES)
2557	ret = fn(op, b);
2558
2559	return ret;
2560	}
2561
2562	int __bch_btree_map_nodes(struct btree_op op, struct* cache_set *c,
2563	struct bkey from, btree_map_nodes_fn fn, int flags)
2564	{
2565	return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
2566	}
2567
2568	int bch_btree_map_keys_recurse(struct btree b, struct* btree_op *op,
2569	struct bkey from, btree_map_keys_fn fn,
2570	int flags)
2571	{
2572	int ret = MAP_CONTINUE;
2573	struct bkey *k;
2574	struct btree_iter iter;
2575
2576	bch_btree_iter_init(b: &b->keys, iter: &iter, search: from);
2577
2578	while ((k = bch_btree_iter_next_filter(iter: &iter, b: &b->keys, fn: bch_ptr_bad))) {
2579	ret = !b->level
2580	? fn(op, b, k)
2581	: bcache_btree(map_keys_recurse, k,
2582	b, op, from, fn, flags);
2583	from = NULL;
2584
2585	if (ret != MAP_CONTINUE)
2586	return ret;
2587	}
2588
2589	if (!b->level && (flags & MAP_END_KEY))
2590	ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2591	KEY_OFFSET(&b->key), `0`));
2592
2593	return ret;
2594	}
2595
2596	int bch_btree_map_keys(struct btree_op op, struct* cache_set *c,
2597	struct bkey from, btree_map_keys_fn fn, int flags)
2598	{
2599	return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
2600	}
2601
/* Keybuf code */
2603
2604	static inline int keybuf_cmp(struct keybuf_key l, struct* keybuf_key *r)
2605	{
2606	/ Overlapping keys compare equal /
2607	if (bkey_cmp(l: &l->key, r: &START_KEY(&r->key)) <= `0`)
2608	return -`1`;
2609	if (bkey_cmp(l: &START_KEY(&l->key), r: &r->key) >= `0`)
2610	return `1`;
2611	return `0`;
2612	}
2613
2614	static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2615	struct keybuf_key *r)
2616	{
2617	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -`1`, `1`);
2618	}
2619
2620	struct refill {
2621	struct btree_op op;
2622	unsigned int nr_found;
2623	struct keybuf *buf;
2624	struct bkey *end;
2625	keybuf_pred_fn *pred;
2626	};
2627
2628	static int refill_keybuf_fn(struct btree_op op, struct* btree *b,
2629	struct bkey *k)
2630	{
2631	struct refill refill = container_of(op, struct* refill, op);
2632	struct keybuf *buf = refill->buf;
2633	int ret = MAP_CONTINUE;
2634
2635	if (bkey_cmp(l: k, r: refill->end) > `0`) {
2636	ret = MAP_DONE;
2637	goto out;
2638	}
2639
2640	if (!KEY_SIZE(k)) / end key /
2641	goto out;
2642
2643	if (refill->pred(buf, k)) {
2644	struct keybuf_key *w;
2645
2646	spin_lock(lock: &buf->lock);
2647
2648	w = array_alloc(&buf->freelist);
2649	if (!w) {
2650	spin_unlock(lock: &buf->lock);
2651	return MAP_DONE;
2652	}
2653
2654	w->private = NULL;
2655	bkey_copy(&w->key, k);
2656
2657	if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2658	array_free(&buf->freelist, w);
2659	else
2660	refill->nr_found++;
2661
2662	if (array_freelist_empty(&buf->freelist))
2663	ret = MAP_DONE;
2664
2665	spin_unlock(lock: &buf->lock);
2666	}
2667	out:
2668	buf->last_scanned = *k;
2669	return ret;
2670	}
2671
2672	void bch_refill_keybuf(struct cache_set c, struct* keybuf *buf,
2673	struct bkey end, keybuf_pred_fn pred)
2674	{
2675	struct bkey start = buf->last_scanned;
2676	struct refill refill;
2677
2678	cond_resched();
2679
2680	bch_btree_op_init(op: &refill.op, write_lock_level: -`1`);
2681	refill.nr_found = `0`;
2682	refill.buf = buf;
2683	refill.end = end;
2684	refill.pred = pred;
2685
2686	bch_btree_map_keys(op: &refill.op, c, from: &buf->last_scanned,
2687	fn: refill_keybuf_fn, MAP_END_KEY);
2688
2689	trace_bcache_keyscan(nr_found: refill.nr_found,
2690	start_inode: KEY_INODE(k: &start), start_offset: KEY_OFFSET(k: &start),
2691	end_inode: KEY_INODE(k: &buf->last_scanned),
2692	end_offset: KEY_OFFSET(k: &buf->last_scanned));
2693
2694	spin_lock(lock: &buf->lock);
2695
2696	if (!RB_EMPTY_ROOT(&buf->keys)) {
2697	struct keybuf_key *w;
2698
2699	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2700	buf->start = START_KEY(&w->key);
2701
2702	w = RB_LAST(&buf->keys, struct keybuf_key, node);
2703	buf->end = w->key;
2704	} else {
2705	buf->start = MAX_KEY;
2706	buf->end = MAX_KEY;
2707	}
2708
2709	spin_unlock(lock: &buf->lock);
2710	}
2711
2712	static void __bch_keybuf_del(struct keybuf buf, struct* keybuf_key *w)
2713	{
2714	rb_erase(&w->node, &buf->keys);
2715	array_free(&buf->freelist, w);
2716	}
2717
2718	void bch_keybuf_del(struct keybuf buf, struct* keybuf_key *w)
2719	{
2720	spin_lock(lock: &buf->lock);
2721	__bch_keybuf_del(buf, w);
2722	spin_unlock(lock: &buf->lock);
2723	}
2724
2725	bool bch_keybuf_check_overlapping(struct keybuf buf, struct* bkey *start,
2726	struct bkey *end)
2727	{
2728	bool ret = false;
2729	struct keybuf_key p, w, s;
2730
2731	s.key = *start;
2732
2733	if (bkey_cmp(l: end, r: &buf->start) <= `0` \|\|
2734	bkey_cmp(l: start, r: &buf->end) >= `0`)
2735	return false;
2736
2737	spin_lock(lock: &buf->lock);
2738	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2739
2740	while (w && bkey_cmp(l: &START_KEY(&w->key), r: end) < `0`) {
2741	p = w;
2742	w = RB_NEXT(w, node);
2743
2744	if (p->private)
2745	ret = true;
2746	else
2747	__bch_keybuf_del(buf, w: p);
2748	}
2749
2750	spin_unlock(lock: &buf->lock);
2751	return ret;
2752	}
2753
2754	struct keybuf_key bch_keybuf_next(struct* keybuf *buf)
2755	{
2756	struct keybuf_key *w;
2757
2758	spin_lock(lock: &buf->lock);
2759
2760	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2761
2762	while (w && w->private)
2763	w = RB_NEXT(w, node);
2764
2765	if (w)
2766	w->private = ERR_PTR(error: -EINTR);
2767
2768	spin_unlock(lock: &buf->lock);
2769	return w;
2770	}
2771
2772	struct keybuf_key bch_keybuf_next_rescan(struct* cache_set *c,
2773	struct keybuf *buf,
2774	struct bkey *end,
2775	keybuf_pred_fn *pred)
2776	{
2777	struct keybuf_key *ret;
2778
2779	while (`1`) {
2780	ret = bch_keybuf_next(buf);
2781	if (ret)
2782	break;
2783
2784	if (bkey_cmp(l: &buf->last_scanned, r: end) >= `0`) {
2785	pr_debug("scan finished\n");
2786	break;
2787	}
2788
2789	bch_refill_keybuf(c, buf, end, pred);
2790	}
2791
2792	return ret;
2793	}
2794
2795	void bch_keybuf_init(struct keybuf *buf)
2796	{
2797	buf->last_scanned = MAX_KEY;
2798	buf->keys = RB_ROOT;
2799
2800	spin_lock_init(&buf->lock);
2801	array_allocator_init(&buf->freelist);
2802	}
2803
2804	void bch_btree_exit(void)
2805	{
2806	if (btree_io_wq)
2807	destroy_workqueue(wq: btree_io_wq);
2808	}
2809
2810	int __init bch_btree_init(void)
2811	{
2812	btree_io_wq = alloc_workqueue(fmt: "bch_btree_io", flags: WQ_MEM_RECLAIM, max_active: `0`);
2813	if (!btree_io_wq)
2814	return -ENOMEM;
2815
2816	return `0`;
2817	}
2818

source code of linux/drivers/md/bcache/btree.c