// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "sb-members.h"
#include "trace.h"

#include <linux/kthread.h>
#include <linux/sched/mm.h>

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
				   enum journal_space_from from)
{
	switch (from) {
	case journal_space_discarded:
		return ja->discard_idx;
	case journal_space_clean_ondisk:
		return ja->dirty_idx_ondisk;
	case journal_space_clean:
		return ja->dirty_idx;
	default:
		BUG();
	}
}

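/*
 * Number of buckets available for new journal writes on @ja: the distance,
 * in the ring of ja->nr buckets, from the bucket after cur_idx up to the
 * index selected by @from (e.g. with nr = 8, cur_idx = 6 and discard_idx = 2,
 * (2 - 6 - 1 + 8) % 8 = 3 buckets are free):
 */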
unsigned bch2_journal_dev_buckets_available(struct journal *j,
					    struct journal_device *ja,
					    enum journal_space_from from)
{
	unsigned available = (journal_space_from(ja, from) -
			      ja->cur_idx - 1 + ja->nr) % ja->nr;

	/*
	 * Don't use the last bucket unless writing the new last_seq
	 * will make another bucket available:
	 */
	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
		--available;

	return available;
}

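/*
 * Raise the journal watermark to BCH_WATERMARK_reclaim - reserving the
 * remaining space for journal reclaim - when clean space falls to a quarter
 * of the journal, the pin fifo is more than 3/4 full, or the btree write
 * buffer needs flushing:
 */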
void bch2_journal_set_watermark(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool low_on_space = j->space[journal_space_clean].total * 4 <=
		j->space[journal_space_total].total;
	bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
	bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
	unsigned watermark = low_on_space || low_on_pin || low_on_wb
		? BCH_WATERMARK_reclaim
		: BCH_WATERMARK_stripe;

	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
		trace_and_count(c, journal_full, c);

	mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);

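	/*
	 * Publish the new watermark; swap() leaves the old value in
	 * @watermark, so if the watermark just went down, wake anyone who may
	 * now be able to allocate journal space:
	 */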
	swap(watermark, j->watermark);
	if (watermark > j->watermark)
		journal_wake(j);
}

static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
			    enum journal_space_from from)
{
	struct journal_device *ja = &ca->journal;
	unsigned sectors, buckets, unwritten;
	u64 seq;

	if (from == journal_space_total)
		return (struct journal_space) {
			.next_entry	= ca->mi.bucket_size,
			.total		= ca->mi.bucket_size * ja->nr,
		};

	buckets = bch2_journal_dev_buckets_available(j, ja, from);
	sectors = ja->sectors_free;

	/*
	 * Note that we don't allocate the space for a journal entry
	 * until we write it out - thus, account for it here:
	 */
	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;

		if (!unwritten)
			continue;

		/* entry won't fit on this device, skip: */
		if (unwritten > ca->mi.bucket_size)
			continue;

		if (unwritten >= sectors) {
			if (!buckets) {
				sectors = 0;
				break;
			}

			buckets--;
			sectors = ca->mi.bucket_size;
		}

		sectors -= unwritten;
	}

	if (sectors < ca->mi.bucket_size && buckets) {
		buckets--;
		sectors = ca->mi.bucket_size;
	}

	return (struct journal_space) {
		.next_entry	= sectors,
		.total		= sectors + buckets * ca->mi.bucket_size,
	};
}

static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
			    enum journal_space_from from)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned pos, nr_devs = 0;
	struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];

	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));

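	/* Insertion sort each device's space into dev_space, largest first: */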
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		if (!ca->journal.nr)
			continue;

		space = journal_dev_space_available(j, ca, from);
		if (!space.next_entry)
			continue;

		for (pos = 0; pos < nr_devs; pos++)
			if (space.total > dev_space[pos].total)
				break;

		array_insert_item(dev_space, nr_devs, pos, space);
	}
	rcu_read_unlock();

	if (nr_devs < nr_devs_want)
		return (struct journal_space) { 0, 0 };

	/*
	 * We sorted largest to smallest, and we want the smallest out of the
	 * @nr_devs_want largest devices:
	 */
	return dev_space[nr_devs_want - 1];
}

void bch2_journal_space_available(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned clean, clean_ondisk, total;
	unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
				      j->buf[1].buf_size >> 9);
	unsigned nr_online = 0, nr_devs_want;
	bool can_discard = false;
	int ret = 0;

	lockdep_assert_held(&j->lock);

	rcu_read_lock();
	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

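		/*
		 * Advance dirty_idx (and dirty_idx_ondisk) past buckets whose
		 * newest entry is older than last_seq (resp. last_seq_ondisk) -
		 * those buckets are no longer needed for recovery:
		 */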
		while (ja->dirty_idx != ja->cur_idx &&
		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

		if (ja->discard_idx != ja->dirty_idx_ondisk)
			can_discard = true;

		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
		nr_online++;
	}
	rcu_read_unlock();

	j->can_discard = can_discard;

	if (nr_online < metadata_replicas_required(c)) {
		ret = JOURNAL_ERR_insufficient_devices;
		goto out;
	}

	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

	for (unsigned i = 0; i < journal_space_nr; i++)
		j->space[i] = __journal_space_available(j, nr_devs_want, i);

	clean_ondisk	= j->space[journal_space_clean_ondisk].total;
	clean		= j->space[journal_space_clean].total;
	total		= j->space[journal_space_total].total;

	if (!j->space[journal_space_discarded].next_entry)
		ret = JOURNAL_ERR_journal_full;

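	/*
	 * Non-flush journal writes don't advance last_seq_ondisk, so only
	 * allow skipping the pre-write flush while the space a flush would
	 * reclaim (clean - clean_ondisk) is a small fraction of the journal:
	 */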
	if ((j->space[journal_space_clean_ondisk].next_entry <
	     j->space[journal_space_clean_ondisk].total) &&
	    (clean - clean_ondisk <= total / 8) &&
	    (clean_ondisk * 2 > clean))
		set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
	else
		clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);

	bch2_journal_set_watermark(j);
out:
	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
	j->cur_entry_error	= ret;

	if (!ret)
		journal_wake(j);
}

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->discard_idx != ja->dirty_idx_ondisk;
	spin_unlock(&j->lock);

	return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	mutex_lock(&j->discard_lock);

	for_each_rw_member(c, ca) {
		struct journal_device *ja = &ca->journal;

		while (should_discard_bucket(j, ja)) {
			if (!c->opts.nochanges &&
			    ca->mi.discard &&
			    bdev_max_discard_sectors(ca->disk_sb.bdev))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->discard_idx]),
					ca->mi.bucket_size, GFP_NOFS);

			spin_lock(&j->lock);
			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

			bch2_journal_space_available(j);
			spin_unlock(&j->lock);
		}
	}

	mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

void bch2_journal_reclaim_fast(struct journal *j)
{
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!fifo_empty(&j->pin) &&
	       j->pin.front <= j->seq_ondisk &&
	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
		j->pin.front++;
		popped = true;
	}

	if (popped)
		bch2_journal_space_available(j);
}

bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	return atomic_dec_and_test(&pin_list->count);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	if (__bch2_journal_pin_put(j, seq)) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}

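/*
 * Returns true if the caller should run bch2_journal_reclaim_fast() - i.e. if
 * this was the last pin on the front pin list, so last_seq can now advance:
 */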
static inline bool __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list;

	if (!journal_pin_active(pin))
		return false;

	if (j->flush_in_progress == pin)
		j->flush_in_progress_dropped = true;

	pin_list = journal_seq_pin(j, pin->seq);
	pin->seq = 0;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	return atomic_dec_and_test(&pin_list->count) &&
		pin_list == &fifo_peek_front(&j->pin);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	if (__journal_pin_drop(j, pin))
		bch2_journal_reclaim_fast(j);
	spin_unlock(&j->lock);
}

static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
{
	if (fn == bch2_btree_node_flush0 ||
	    fn == bch2_btree_node_flush1)
		return JOURNAL_PIN_btree;
	else if (fn == bch2_btree_key_cache_journal_flush)
		return JOURNAL_PIN_key_cache;
	else
		return JOURNAL_PIN_other;
}

static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
					       struct journal_entry_pin *pin,
					       journal_pin_flush_fn flush_fn,
					       enum journal_pin_type type)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	/*
	 * flush_fn is how we identify journal pins in debugfs, so must always
	 * exist, even if it doesn't do anything:
	 */
	BUG_ON(!flush_fn);

	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;
	list_add(&pin->list, &pin_list->list[type]);
}

void bch2_journal_pin_copy(struct journal *j,
			   struct journal_entry_pin *dst,
			   struct journal_entry_pin *src,
			   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	u64 seq = READ_ONCE(src->seq);

	if (seq < journal_last_seq(j)) {
		/*
		 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
		 * the src pin - with the pin dropped, the entry to pin might no
		 * longer exist, but that means there's no longer anything to
		 * copy and we can bail out here:
		 */
		spin_unlock(&j->lock);
		return;
	}

	bool reclaim = __journal_pin_drop(j, dst);

	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	if (seq == journal_last_seq(j))
		journal_wake(j);
	spin_unlock(&j->lock);
}

void bch2_journal_pin_set(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	BUG_ON(seq < journal_last_seq(j));

	bool reclaim = __journal_pin_drop(j, pin);

	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);
	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	if (seq == journal_last_seq(j))
		journal_wake(j);

	spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 * @j: journal object
 * @pin: pin to flush
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
	BUG_ON(journal_pin_active(pin));

	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

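/*
 * @allowed_below_seq and @allowed_above_seq are bitmasks of
 * enum journal_pin_type: which pin types may be flushed from entries at or
 * below @seq_to_flush, and which from entries above it:
 */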
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j,
		     u64 seq_to_flush,
		     unsigned allowed_below_seq,
		     unsigned allowed_above_seq,
		     u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;
	unsigned i;

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
		if (*seq > seq_to_flush && !allowed_above_seq)
			break;

		for (i = 0; i < JOURNAL_PIN_NR; i++)
			if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
			    ((1U << i) & allowed_above_seq)) {
				ret = list_first_entry_or_null(&pin_list->list[i],
					struct journal_entry_pin, list);
				if (ret)
					return ret;
			}
	}

	return NULL;
}

/* returns the number of pins flushed: */
static size_t journal_flush_pins(struct journal *j,
				 u64 seq_to_flush,
				 unsigned allowed_below_seq,
				 unsigned allowed_above_seq,
				 unsigned min_any,
				 unsigned min_key_cache)
{
	struct journal_entry_pin *pin;
	size_t nr_flushed = 0;
	journal_pin_flush_fn flush_fn;
	u64 seq;
	int err;

	lockdep_assert_held(&j->reclaim_lock);

	while (1) {
		unsigned allowed_above = allowed_above_seq;
		unsigned allowed_below = allowed_below_seq;

		if (min_any) {
			allowed_above |= ~0;
			allowed_below |= ~0;
		}

		if (min_key_cache) {
			allowed_above |= 1U << JOURNAL_PIN_key_cache;
			allowed_below |= 1U << JOURNAL_PIN_key_cache;
		}

		cond_resched();

		j->last_flushed = jiffies;

		spin_lock(&j->lock);
		pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
		if (pin) {
			BUG_ON(j->flush_in_progress);
			j->flush_in_progress = pin;
			j->flush_in_progress_dropped = false;
			flush_fn = pin->flush;
		}
		spin_unlock(&j->lock);

		if (!pin)
			break;

		if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
			min_key_cache--;

		if (min_any)
			min_any--;

		err = flush_fn(j, pin, seq);

		spin_lock(&j->lock);
		/* Pin might have been dropped or rearmed: */
		if (likely(!err && !j->flush_in_progress_dropped))
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
		j->flush_in_progress = NULL;
		j->flush_in_progress_dropped = false;
		spin_unlock(&j->lock);

		wake_up(&j->pin_flush_wait);

		if (err)
			break;

		nr_flushed++;
	}

	return nr_flushed;
}

static u64 journal_seq_to_flush(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	u64 seq_to_flush = 0;

	spin_lock(&j->lock);

	for_each_rw_member(c, ca) {
		struct journal_device *ja = &ca->journal;
		unsigned nr_buckets, bucket_to_flush;

		if (!ja->nr)
			continue;

		/* Try to keep the journal at most half full: */
		nr_buckets = ja->nr / 2;

		nr_buckets = min(nr_buckets, ja->nr);

		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
		seq_to_flush = max(seq_to_flush,
				   ja->bucket_seq[bucket_to_flush]);
	}

	/* Also flush if the pin fifo is more than half full */
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	return seq_to_flush;
}

/**
 * __bch2_journal_reclaim - free up journal buckets
 * @j: journal object
 * @direct: direct or background reclaim?
 * @kicked: requested to run since we last ran?
 * Returns: 0 on success, or -EIO if the journal has been shutdown
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	u64 seq_to_flush;
	size_t min_nr, min_key_cache, nr_flushed;
	unsigned flags;
	int ret = 0;

	/*
	 * We can't invoke memory reclaim while holding the reclaim_lock -
	 * journal reclaim is required to make progress for memory reclaim
	 * (cleaning the caches), so we can't get stuck in memory reclaim while
	 * we're holding the reclaim lock:
	 */
	lockdep_assert_held(&j->reclaim_lock);
	flags = memalloc_noreclaim_save();

	do {
		if (kthread && kthread_should_stop())
			break;

		if (bch2_journal_error(j)) {
			ret = -EIO;
			break;
		}

		bch2_journal_do_discards(j);

		seq_to_flush = journal_seq_to_flush(j);
		min_nr = 0;

		/*
		 * If it's been longer than journal_reclaim_delay since we last
		 * flushed, make sure to flush at least one journal pin:
		 */
		if (time_after(jiffies, j->last_flushed +
			       msecs_to_jiffies(c->opts.journal_reclaim_delay)))
			min_nr = 1;

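		/*
		 * Similarly, flush at least one pin if the journal is under
		 * pressure (watermark raised) or more than half the btree
		 * cache is dirty:
		 */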
		if (j->watermark != BCH_WATERMARK_stripe)
			min_nr = 1;

		if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
			min_nr = 1;

		min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);

		trace_and_count(c, journal_reclaim_start, c,
				direct, kicked,
				min_nr, min_key_cache,
				atomic_read(&c->btree_cache.dirty),
				c->btree_cache.used,
				atomic_long_read(&c->btree_key_cache.nr_dirty),
				atomic_long_read(&c->btree_key_cache.nr_keys));

		nr_flushed = journal_flush_pins(j, seq_to_flush,
						~0, 0,
						min_nr, min_key_cache);

		if (direct)
			j->nr_direct_reclaim += nr_flushed;
		else
			j->nr_background_reclaim += nr_flushed;
		trace_and_count(c, journal_reclaim_finish, c, nr_flushed);

		if (nr_flushed)
			wake_up(&j->reclaim_wait);
	} while ((min_nr || min_key_cache) && nr_flushed && !direct);

	memalloc_noreclaim_restore(flags);

	return ret;
}

int bch2_journal_reclaim(struct journal *j)
{
	return __bch2_journal_reclaim(j, true, true);
}

static int bch2_journal_reclaim_thread(void *arg)
{
	struct journal *j = arg;
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned long delay, now;
	bool journal_empty;
	int ret = 0;

	set_freezable();

	j->last_flushed = jiffies;

	while (!ret && !kthread_should_stop()) {
		bool kicked = j->reclaim_kicked;

		j->reclaim_kicked = false;

		mutex_lock(&j->reclaim_lock);
		ret = __bch2_journal_reclaim(j, false, kicked);
		mutex_unlock(&j->reclaim_lock);

		now = jiffies;
		delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
		j->next_reclaim = j->last_flushed + delay;

		if (!time_in_range(j->next_reclaim, now, now + delay))
			j->next_reclaim = now + delay;

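		/*
		 * Sleep until we're kicked, until there are pins to reclaim,
		 * or until it's time for the next periodic run:
		 */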
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
			if (kthread_should_stop())
				break;
			if (j->reclaim_kicked)
				break;

			spin_lock(&j->lock);
			journal_empty = fifo_empty(&j->pin);
			spin_unlock(&j->lock);

			if (journal_empty)
				schedule();
			else if (time_after(j->next_reclaim, jiffies))
				schedule_timeout(j->next_reclaim - jiffies);
			else
				break;
		}
		__set_current_state(TASK_RUNNING);
	}

	return 0;
}

void bch2_journal_reclaim_stop(struct journal *j)
{
	struct task_struct *p = j->reclaim_thread;

	j->reclaim_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_journal_reclaim_start(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct task_struct *p;
	int ret;

	if (j->reclaim_thread)
		return 0;

	p = kthread_create(bch2_journal_reclaim_thread, j,
			   "bch-reclaim/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating journal reclaim thread");
	if (ret)
		return ret;

	get_task_struct(p);
	j->reclaim_thread = p;
	wake_up_process(p);
	return 0;
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      bool *did_work)
{
	int ret;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&j->reclaim_lock);

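	/*
	 * Flush key cache and other pins first: flushing them may dirty btree
	 * nodes, which the second pass then writes out:
	 */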
	if (journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_key_cache)|
			       (1U << JOURNAL_PIN_other), 0, 0, 0) ||
	    journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
		*did_work = true;

	if (seq_to_flush > journal_cur_seq(j))
		bch2_journal_entry_close(j);

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		!fifo_used(&j->pin);

	spin_unlock(&j->lock);
	mutex_unlock(&j->reclaim_lock);

	return ret;
}

bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	/* time_stats this */
	bool did_work = false;

	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return false;

	closure_wait_event(&j->async_wait,
		journal_flush_done(j, seq_to_flush, &did_work));

	return did_work;
}

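/*
 * Flush all journal entries that reference @dev_idx (or, if @dev_idx < 0, all
 * entries with fewer than metadata_replicas copies), then garbage collect the
 * journal replicas entries:
 */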
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	bch2_journal_flush_pins(j, seq);

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);

	/*
	 * Now that we've populated replicas_gc, write to the journal to mark
	 * active journal devices. This handles the case where the journal might
	 * be empty. Otherwise we could clear all journal replicas and
	 * temporarily put the fs into an unrecoverable state. Journal recovery
	 * expects to find devices marked for journal data on unclean mount.
	 */
	ret = bch2_journal_meta(&c->journal);
	if (ret)
		goto err;

	seq = 0;
	spin_lock(&j->lock);
	while (!ret) {
		struct bch_replicas_padded replicas;

		seq = max(seq, journal_last_seq(j));
		if (seq >= j->pin.back)
			break;
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 journal_seq_pin(j, seq)->devs);
		seq++;

		if (replicas.e.nr_devs) {
			spin_unlock(&j->lock);
			ret = bch2_mark_replicas(c, &replicas.e);
			spin_lock(&j->lock);
		}
	}
	spin_unlock(&j->lock);
err:
	ret = bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}