journal_io.c source code [linux/fs/bcachefs/journal_io.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include "bcachefs.h"
3	#include "alloc_background.h"
4	#include "alloc_foreground.h"
5	#include "btree_io.h"
6	#include "btree_update_interior.h"
7	#include "btree_write_buffer.h"
8	#include "buckets.h"
9	#include "checksum.h"
10	#include "disk_groups.h"
11	#include "error.h"
12	#include "journal.h"
13	#include "journal_io.h"
14	#include "journal_reclaim.h"
15	#include "journal_seq_blacklist.h"
16	#include "replicas.h"
17	#include "sb-clean.h"
18	#include "trace.h"
19
20	void bch2_journal_ptrs_to_text(struct printbuf out, struct* bch_fs *c,
21	struct journal_replay *j)
22	{
23	darray_for_each(j->ptrs, i) {
24	struct bch_dev *ca = bch_dev_bkey_exists(c, idx: i->dev);
25	u64 offset;
26
27	div64_u64_rem(dividend: i->sector, divisor: ca->mi.bucket_size, remainder: &offset);
28
29	if (i != j->ptrs.data)
30	prt_printf(out, " ");
31	prt_printf(out, "%u:%u:%u (sector %llu)",
32	i->dev, i->bucket, i->bucket_offset, i->sector);
33	}
34	}
35
36	static void bch2_journal_replay_to_text(struct printbuf out, struct* bch_fs *c,
37	struct journal_replay *j)
38	{
39	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
40
41	bch2_journal_ptrs_to_text(out, c, j);
42
43	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
44	struct jset_entry_datetime *datetime =
45	container_of(entry, struct jset_entry_datetime, entry);
46	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
47	break;
48	}
49	}
50
51	static struct nonce journal_nonce(const struct jset *jset)
52	{
53	return (struct nonce) {{
54	[`0`] = `0`,
55	[`1`] = ((__le32 *) &jset->seq)[`0`],
56	[`2`] = ((__le32 *) &jset->seq)[`1`],
57	[`3`] = BCH_NONCE_JOURNAL,
58	}};
59	}
60
61	static bool jset_csum_good(struct bch_fs c, struct* jset j, struct* bch_csum *csum)
62	{
63	if (!bch2_checksum_type_valid(c, type: JSET_CSUM_TYPE(k: j))) {
64	csum = (struct* bch_csum) {};
65	return false;
66	}
67
68	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
69	return !bch2_crc_cmp(l: j->csum, r: *csum);
70	}
71
72	static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
73	{
74	return (seq - c->journal_entries_base_seq) & (~`0U` >> `1`);
75	}
76
77	static void __journal_replay_free(struct bch_fs *c,
78	struct journal_replay *i)
79	{
80	struct journal_replay **p =
81	genradix_ptr(&c->journal_entries,
82	journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
83
84	BUG_ON(*p != i);
85	*p = NULL;
86	kvfree(addr: i);
87	}
88
89	static void journal_replay_free(struct bch_fs c, struct* journal_replay *i, bool blacklisted)
90	{
91	if (blacklisted)
92	i->ignore_blacklisted = true;
93	else
94	i->ignore_not_dirty = true;
95
96	if (!c->opts.read_entire_journal)
97	__journal_replay_free(c, i);
98	}
99
100	struct journal_list {
101	struct closure cl;
102	u64 last_seq;
103	struct mutex lock;
104	int ret;
105	};
106
/* Non-error return values of journal_entry_add(): */
#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5
109
110	/*
111	* Given a journal entry we just read, add it to the list of journal entries to
112	* be replayed:
113	*/
114	static int journal_entry_add(struct bch_fs c, struct* bch_dev *ca,
115	struct journal_ptr entry_ptr,
116	struct journal_list jlist, struct* jset *j)
117	{
118	struct genradix_iter iter;
119	struct journal_replay *_i, i, *dup;
120	size_t bytes = vstruct_bytes(j);
121	u64 last_seq = !JSET_NO_FLUSH(k: j) ? le64_to_cpu(j->last_seq) : `0`;
122	struct printbuf buf = PRINTBUF;
123	int ret = JOURNAL_ENTRY_ADD_OK;
124
125	/ Is this entry older than the range we need? /
126	if (!c->opts.read_entire_journal &&
127	le64_to_cpu(j->seq) < jlist->last_seq)
128	return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
129
130	/*
131	* genradixes are indexed by a ulong, not a u64, so we can't index them
132	* by sequence number directly: Assume instead that they will all fall
133	* within the range of +-2billion of the filrst one we find.
134	*/
135	if (!c->journal_entries_base_seq)
136	c->journal_entries_base_seq = max_t(s64, `1`, le64_to_cpu(j->seq) - S32_MAX);
137
138	/ Drop entries we don't need anymore /
139	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
140	genradix_for_each_from(&c->journal_entries, iter, _i,
141	journal_entry_radix_idx(c, jlist->last_seq)) {
142	i = *_i;
143
144	if (journal_replay_ignore(i))
145	continue;
146
147	if (le64_to_cpu(i->j.seq) >= last_seq)
148	break;
149
150	journal_replay_free(c, i, blacklisted: false);
151	}
152	}
153
154	jlist->last_seq = max(jlist->last_seq, last_seq);
155
156	_i = genradix_ptr_alloc(&c->journal_entries,
157	journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
158	GFP_KERNEL);
159	if (!_i)
160	return -BCH_ERR_ENOMEM_journal_entry_add;
161
162	/*
163	* Duplicate journal entries? If so we want the one that didn't have a
164	* checksum error:
165	*/
166	dup = *_i;
167	if (dup) {
168	bool identical = bytes == vstruct_bytes(&dup->j) &&
169	!memcmp(p: j, q: &dup->j, size: bytes);
170	bool not_identical = !identical &&
171	entry_ptr.csum_good &&
172	dup->csum_good;
173
174	bool same_device = false;
175	darray_for_each(dup->ptrs, ptr)
176	if (ptr->dev == ca->dev_idx)
177	same_device = true;
178
179	ret = darray_push(&dup->ptrs, entry_ptr);
180	if (ret)
181	goto out;
182
183	bch2_journal_replay_to_text(out: &buf, c, j: dup);
184
185	fsck_err_on(same_device,
186	c, journal_entry_dup_same_device,
187	"duplicate journal entry on same device\n %s",
188	buf.buf);
189
190	fsck_err_on(not_identical,
191	c, journal_entry_replicas_data_mismatch,
192	"found duplicate but non identical journal entries\n %s",
193	buf.buf);
194
195	if (entry_ptr.csum_good && !identical)
196	goto replace;
197
198	goto out;
199	}
200	replace:
201	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
202	if (!i)
203	return -BCH_ERR_ENOMEM_journal_entry_add;
204
205	darray_init(&i->ptrs);
206	i->csum_good = entry_ptr.csum_good;
207	i->ignore_blacklisted = false;
208	i->ignore_not_dirty = false;
209	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
210
211	if (dup) {
212	/ The first ptr should represent the jset we kept: /
213	darray_for_each(dup->ptrs, ptr)
214	darray_push(&i->ptrs, *ptr);
215	__journal_replay_free(c, i: dup);
216	} else {
217	darray_push(&i->ptrs, entry_ptr);
218	}
219
220	*_i = i;
221	out:
222	fsck_err:
223	printbuf_exit(&buf);
224	return ret;
225	}
226
227	/ this fills in a range with empty jset_entries: /
228	static void journal_entry_null_range(void start, void* *end)
229	{
230	struct jset_entry *entry;
231
232	for (entry = start; entry != end; entry = vstruct_next(entry))
233	memset(entry, `0`, sizeof(*entry));
234	}
235
/*
 * Non-error results of jset validation:
 * REREAD - the jset is larger than what has been read so far
 * NONE   - no jset here (magic does not match)
 * BAD    - a jset was found but failed validation
 */
#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7
239
240	static void journal_entry_err_msg(struct printbuf *out,
241	u32 version,
242	struct jset *jset,
243	struct jset_entry *entry)
244	{
245	prt_str(out, str: "invalid journal entry, version=");
246	bch2_version_to_text(out, version);
247
248	if (entry) {
249	prt_str(out, str: " type=");
250	bch2_prt_jset_entry_type(out, entry->type);
251	}
252
253	if (!jset) {
254	prt_printf(out, " in superblock");
255	} else {
256
257	prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
258
259	if (entry)
260	prt_printf(out, " offset=%zi/%u",
261	(u64 *) entry - jset->_data,
262	le32_to_cpu(jset->u64s));
263	}
264
265	prt_str(out, str: ": ");
266	}
267
/*
 * Report a journal validation error.
 *
 * The caller must have `flags`, `ret` and a `fsck_err` label in scope: on the
 * read path the error goes through mustfix_fsck_err(); on the write path it
 * is counted and logged, and if bch2_fs_inconsistent() says so, ret is set
 * and control jumps to fsck_err.
 *
 * Evaluates to true when control falls through.
 */
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (flags & BKEY_INVALID_WRITE) {				\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			/* don't leak the message when bailing out */	\
			printbuf_exit(&_buf);				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})
292
/* Evaluate journal_entry_err() only when @cond holds; otherwise false. */
#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

/* Returned by journal_validate_key() when it dropped the key it was given. */
#define FSCK_DELETED_KEY	5
297
298	static int journal_validate_key(struct bch_fs *c,
299	struct jset *jset,
300	struct jset_entry *entry,
301	unsigned level, enum btree_id btree_id,
302	struct bkey_i *k,
303	unsigned version, int big_endian,
304	enum bkey_invalid_flags flags)
305	{
306	int write = flags & BKEY_INVALID_WRITE;
307	void *next = vstruct_next(entry);
308	struct printbuf buf = PRINTBUF;
309	int ret = `0`;
310
311	if (journal_entry_err_on(!k->k.u64s,
312	c, version, jset, entry,
313	journal_entry_bkey_u64s_0,
314	"k->u64s 0")) {
315	entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
316	journal_entry_null_range(vstruct_next(entry), end: next);
317	return FSCK_DELETED_KEY;
318	}
319
320	if (journal_entry_err_on((void *) bkey_next(k) >
321	(void *) vstruct_next(entry),
322	c, version, jset, entry,
323	journal_entry_bkey_past_end,
324	"extends past end of journal entry")) {
325	entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
326	journal_entry_null_range(vstruct_next(entry), end: next);
327	return FSCK_DELETED_KEY;
328	}
329
330	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
331	c, version, jset, entry,
332	journal_entry_bkey_bad_format,
333	"bad format %u", k->k.format)) {
334	le16_add_cpu(var: &entry->u64s, val: -((u16) k->k.u64s));
335	memmove(k, bkey_next(k), next - (void *) bkey_next(k));
336	journal_entry_null_range(vstruct_next(entry), end: next);
337	return FSCK_DELETED_KEY;
338	}
339
340	if (!write)
341	bch2_bkey_compat(level, btree_id, version, big_endian,
342	write, NULL, k: bkey_to_packed(k));
343
344	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
345	__btree_node_type(level, id: btree_id), write, &buf)) {
346	printbuf_reset(buf: &buf);
347	journal_entry_err_msg(out: &buf, version, jset, entry);
348	prt_newline(&buf);
349	printbuf_indent_add(&buf, `2`);
350
351	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
352	prt_newline(&buf);
353	bch2_bkey_invalid(c, bkey_i_to_s_c(k),
354	__btree_node_type(level, id: btree_id), write, &buf);
355
356	mustfix_fsck_err(c, journal_entry_bkey_invalid,
357	"%s", buf.buf);
358
359	le16_add_cpu(var: &entry->u64s, val: -((u16) k->k.u64s));
360	memmove(k, bkey_next(k), next - (void *) bkey_next(k));
361	journal_entry_null_range(vstruct_next(entry), end: next);
362
363	printbuf_exit(&buf);
364	return FSCK_DELETED_KEY;
365	}
366
367	if (write)
368	bch2_bkey_compat(level, btree_id, version, big_endian,
369	write, NULL, k: bkey_to_packed(k));
370	fsck_err:
371	printbuf_exit(&buf);
372	return ret;
373	}
374
375	static int journal_entry_btree_keys_validate(struct bch_fs *c,
376	struct jset *jset,
377	struct jset_entry *entry,
378	unsigned version, int big_endian,
379	enum bkey_invalid_flags flags)
380	{
381	struct bkey_i *k = entry->start;
382
383	while (k != vstruct_last(entry)) {
384	int ret = journal_validate_key(c, jset, entry,
385	level: entry->level,
386	btree_id: entry->btree_id,
387	k, version, big_endian,
388	flags: flags\|BKEY_INVALID_JOURNAL);
389	if (ret == FSCK_DELETED_KEY)
390	continue;
391
392	k = bkey_next(k);
393	}
394
395	return `0`;
396	}
397
398	static void journal_entry_btree_keys_to_text(struct printbuf out, struct* bch_fs *c,
399	struct jset_entry *entry)
400	{
401	bool first = true;
402
403	jset_entry_for_each_key(entry, k) {
404	if (!first) {
405	prt_newline(out);
406	bch2_prt_jset_entry_type(out, entry->type);
407	prt_str(out, str: ": ");
408	}
409	prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
410	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
411	first = false;
412	}
413	}
414
415	static int journal_entry_btree_root_validate(struct bch_fs *c,
416	struct jset *jset,
417	struct jset_entry *entry,
418	unsigned version, int big_endian,
419	enum bkey_invalid_flags flags)
420	{
421	struct bkey_i *k = entry->start;
422	int ret = `0`;
423
424	if (journal_entry_err_on(!entry->u64s \|\|
425	le16_to_cpu(entry->u64s) != k->k.u64s,
426	c, version, jset, entry,
427	journal_entry_btree_root_bad_size,
428	"invalid btree root journal entry: wrong number of keys")) {
429	void *next = vstruct_next(entry);
430	/*
431	* we don't want to null out this jset_entry,
432	* just the contents, so that later we can tell
433	* we were _supposed_ to have a btree root
434	*/
435	entry->u64s = `0`;
436	journal_entry_null_range(vstruct_next(entry), end: next);
437	return `0`;
438	}
439
440	ret = journal_validate_key(c, jset, entry, level: `1`, btree_id: entry->btree_id, k,
441	version, big_endian, flags);
442	if (ret == FSCK_DELETED_KEY)
443	ret = `0`;
444	fsck_err:
445	return ret;
446	}
447
/* A btree root entry is printed the same way as a btree_keys entry. */
static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
453
454	static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
455	struct jset *jset,
456	struct jset_entry *entry,
457	unsigned version, int big_endian,
458	enum bkey_invalid_flags flags)
459	{
460	/ obsolete, don't care: /
461	return `0`;
462	}
463
/* Nothing is printed for prio_ptrs entries. */
static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}
468
469	static int journal_entry_blacklist_validate(struct bch_fs *c,
470	struct jset *jset,
471	struct jset_entry *entry,
472	unsigned version, int big_endian,
473	enum bkey_invalid_flags flags)
474	{
475	int ret = `0`;
476
477	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != `1`,
478	c, version, jset, entry,
479	journal_entry_blacklist_bad_size,
480	"invalid journal seq blacklist entry: bad size")) {
481	journal_entry_null_range(start: entry, vstruct_next(entry));
482	}
483	fsck_err:
484	return ret;
485	}
486
487	static void journal_entry_blacklist_to_text(struct printbuf out, struct* bch_fs *c,
488	struct jset_entry *entry)
489	{
490	struct jset_entry_blacklist *bl =
491	container_of(entry, struct jset_entry_blacklist, entry);
492
493	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
494	}
495
496	static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
497	struct jset *jset,
498	struct jset_entry *entry,
499	unsigned version, int big_endian,
500	enum bkey_invalid_flags flags)
501	{
502	struct jset_entry_blacklist_v2 *bl_entry;
503	int ret = `0`;
504
505	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != `2`,
506	c, version, jset, entry,
507	journal_entry_blacklist_v2_bad_size,
508	"invalid journal seq blacklist entry: bad size")) {
509	journal_entry_null_range(start: entry, vstruct_next(entry));
510	goto out;
511	}
512
513	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
514
515	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
516	le64_to_cpu(bl_entry->end),
517	c, version, jset, entry,
518	journal_entry_blacklist_v2_start_past_end,
519	"invalid journal seq blacklist entry: start > end")) {
520	journal_entry_null_range(start: entry, vstruct_next(entry));
521	}
522	out:
523	fsck_err:
524	return ret;
525	}
526
527	static void journal_entry_blacklist_v2_to_text(struct printbuf out, struct* bch_fs *c,
528	struct jset_entry *entry)
529	{
530	struct jset_entry_blacklist_v2 *bl =
531	container_of(entry, struct jset_entry_blacklist_v2, entry);
532
533	prt_printf(out, "start=%llu end=%llu",
534	le64_to_cpu(bl->start),
535	le64_to_cpu(bl->end));
536	}
537
538	static int journal_entry_usage_validate(struct bch_fs *c,
539	struct jset *jset,
540	struct jset_entry *entry,
541	unsigned version, int big_endian,
542	enum bkey_invalid_flags flags)
543	{
544	struct jset_entry_usage *u =
545	container_of(entry, struct jset_entry_usage, entry);
546	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
547	int ret = `0`;
548
549	if (journal_entry_err_on(bytes < sizeof(*u),
550	c, version, jset, entry,
551	journal_entry_usage_bad_size,
552	"invalid journal entry usage: bad size")) {
553	journal_entry_null_range(start: entry, vstruct_next(entry));
554	return ret;
555	}
556
557	fsck_err:
558	return ret;
559	}
560
561	static void journal_entry_usage_to_text(struct printbuf out, struct* bch_fs *c,
562	struct jset_entry *entry)
563	{
564	struct jset_entry_usage *u =
565	container_of(entry, struct jset_entry_usage, entry);
566
567	prt_str(out, str: "type=");
568	bch2_prt_fs_usage_type(out, u->entry.btree_id);
569	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
570	}
571
572	static int journal_entry_data_usage_validate(struct bch_fs *c,
573	struct jset *jset,
574	struct jset_entry *entry,
575	unsigned version, int big_endian,
576	enum bkey_invalid_flags flags)
577	{
578	struct jset_entry_data_usage *u =
579	container_of(entry, struct jset_entry_data_usage, entry);
580	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
581	struct printbuf err = PRINTBUF;
582	int ret = `0`;
583
584	if (journal_entry_err_on(bytes < sizeof(*u) \|\|
585	bytes < sizeof(*u) + u->r.nr_devs,
586	c, version, jset, entry,
587	journal_entry_data_usage_bad_size,
588	"invalid journal entry usage: bad size")) {
589	journal_entry_null_range(start: entry, vstruct_next(entry));
590	goto out;
591	}
592
593	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
594	c, version, jset, entry,
595	journal_entry_data_usage_bad_size,
596	"invalid journal entry usage: %s", err.buf)) {
597	journal_entry_null_range(start: entry, vstruct_next(entry));
598	goto out;
599	}
600	out:
601	fsck_err:
602	printbuf_exit(&err);
603	return ret;
604	}
605
606	static void journal_entry_data_usage_to_text(struct printbuf out, struct* bch_fs *c,
607	struct jset_entry *entry)
608	{
609	struct jset_entry_data_usage *u =
610	container_of(entry, struct jset_entry_data_usage, entry);
611
612	bch2_replicas_entry_to_text(out, &u->r);
613	prt_printf(out, "=%llu", le64_to_cpu(u->v));
614	}
615
616	static int journal_entry_clock_validate(struct bch_fs *c,
617	struct jset *jset,
618	struct jset_entry *entry,
619	unsigned version, int big_endian,
620	enum bkey_invalid_flags flags)
621	{
622	struct jset_entry_clock *clock =
623	container_of(entry, struct jset_entry_clock, entry);
624	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
625	int ret = `0`;
626
627	if (journal_entry_err_on(bytes != sizeof(*clock),
628	c, version, jset, entry,
629	journal_entry_clock_bad_size,
630	"bad size")) {
631	journal_entry_null_range(start: entry, vstruct_next(entry));
632	return ret;
633	}
634
635	if (journal_entry_err_on(clock->rw > `1`,
636	c, version, jset, entry,
637	journal_entry_clock_bad_rw,
638	"bad rw")) {
639	journal_entry_null_range(start: entry, vstruct_next(entry));
640	return ret;
641	}
642
643	fsck_err:
644	return ret;
645	}
646
647	static void journal_entry_clock_to_text(struct printbuf out, struct* bch_fs *c,
648	struct jset_entry *entry)
649	{
650	struct jset_entry_clock *clock =
651	container_of(entry, struct jset_entry_clock, entry);
652
653	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
654	}
655
656	static int journal_entry_dev_usage_validate(struct bch_fs *c,
657	struct jset *jset,
658	struct jset_entry *entry,
659	unsigned version, int big_endian,
660	enum bkey_invalid_flags flags)
661	{
662	struct jset_entry_dev_usage *u =
663	container_of(entry, struct jset_entry_dev_usage, entry);
664	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
665	unsigned expected = sizeof(*u);
666	unsigned dev;
667	int ret = `0`;
668
669	if (journal_entry_err_on(bytes < expected,
670	c, version, jset, entry,
671	journal_entry_dev_usage_bad_size,
672	"bad size (%u < %u)",
673	bytes, expected)) {
674	journal_entry_null_range(start: entry, vstruct_next(entry));
675	return ret;
676	}
677
678	dev = le32_to_cpu(u->dev);
679
680	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
681	c, version, jset, entry,
682	journal_entry_dev_usage_bad_dev,
683	"bad dev")) {
684	journal_entry_null_range(start: entry, vstruct_next(entry));
685	return ret;
686	}
687
688	if (journal_entry_err_on(u->pad,
689	c, version, jset, entry,
690	journal_entry_dev_usage_bad_pad,
691	"bad pad")) {
692	journal_entry_null_range(start: entry, vstruct_next(entry));
693	return ret;
694	}
695
696	fsck_err:
697	return ret;
698	}
699
700	static void journal_entry_dev_usage_to_text(struct printbuf out, struct* bch_fs *c,
701	struct jset_entry *entry)
702	{
703	struct jset_entry_dev_usage *u =
704	container_of(entry, struct jset_entry_dev_usage, entry);
705	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
706
707	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
708
709	for (i = `0`; i < nr_types; i++) {
710	bch2_prt_data_type(out, i);
711	prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
712	le64_to_cpu(u->d[i].buckets),
713	le64_to_cpu(u->d[i].sectors),
714	le64_to_cpu(u->d[i].fragmented));
715	}
716	}
717
718	static int journal_entry_log_validate(struct bch_fs *c,
719	struct jset *jset,
720	struct jset_entry *entry,
721	unsigned version, int big_endian,
722	enum bkey_invalid_flags flags)
723	{
724	return `0`;
725	}
726
727	static void journal_entry_log_to_text(struct printbuf out, struct* bch_fs *c,
728	struct jset_entry *entry)
729	{
730	struct jset_entry_log l = container_of(entry, struct* jset_entry_log, entry);
731	unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
732
733	prt_printf(out, "%.*s", bytes, l->d);
734	}
735
736	static int journal_entry_overwrite_validate(struct bch_fs *c,
737	struct jset *jset,
738	struct jset_entry *entry,
739	unsigned version, int big_endian,
740	enum bkey_invalid_flags flags)
741	{
742	return journal_entry_btree_keys_validate(c, jset, entry,
743	version, big_endian, READ);
744	}
745
/* An overwrite entry is printed the same way as a btree_keys entry. */
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
751
752	static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
753	struct jset *jset,
754	struct jset_entry *entry,
755	unsigned version, int big_endian,
756	enum bkey_invalid_flags flags)
757	{
758	return journal_entry_btree_keys_validate(c, jset, entry,
759	version, big_endian, READ);
760	}
761
/* A write buffer entry is printed the same way as a btree_keys entry. */
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
						    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
767
768	static int journal_entry_datetime_validate(struct bch_fs *c,
769	struct jset *jset,
770	struct jset_entry *entry,
771	unsigned version, int big_endian,
772	enum bkey_invalid_flags flags)
773	{
774	unsigned bytes = vstruct_bytes(entry);
775	unsigned expected = `16`;
776	int ret = `0`;
777
778	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
779	c, version, jset, entry,
780	journal_entry_dev_usage_bad_size,
781	"bad size (%u < %u)",
782	bytes, expected)) {
783	journal_entry_null_range(start: entry, vstruct_next(entry));
784	return ret;
785	}
786	fsck_err:
787	return ret;
788	}
789
790	static void journal_entry_datetime_to_text(struct printbuf out, struct* bch_fs *c,
791	struct jset_entry *entry)
792	{
793	struct jset_entry_datetime *datetime =
794	container_of(entry, struct jset_entry_datetime, entry);
795
796	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
797	}
798
/* Per entry type callbacks: one to validate an entry, one to print it. */
struct jset_entry_ops {
	/* arguments: fs, jset, entry, version, big_endian, flags */
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			enum bkey_invalid_flags);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};
805
806	static const struct jset_entry_ops bch2_jset_entry_ops[] = {
807	#define x(f, nr) \
808	[BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
809	.validate = journal_entry_##f##_validate, \
810	.to_text = journal_entry_##f##_to_text, \
811	},
812	BCH_JSET_ENTRY_TYPES()
813	#undef x
814	};
815
816	int bch2_journal_entry_validate(struct bch_fs *c,
817	struct jset *jset,
818	struct jset_entry *entry,
819	unsigned version, int big_endian,
820	enum bkey_invalid_flags flags)
821	{
822	return entry->type < BCH_JSET_ENTRY_NR
823	? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
824	version, big_endian, flags)
825	: `0`;
826	}
827
828	void bch2_journal_entry_to_text(struct printbuf out, struct* bch_fs *c,
829	struct jset_entry *entry)
830	{
831	bch2_prt_jset_entry_type(out, entry->type);
832
833	if (entry->type < BCH_JSET_ENTRY_NR) {
834	prt_str(out, str: ": ");
835	bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
836	}
837	}
838
839	static int jset_validate_entries(struct bch_fs c, struct* jset *jset,
840	enum bkey_invalid_flags flags)
841	{
842	unsigned version = le32_to_cpu(jset->version);
843	int ret = `0`;
844
845	vstruct_for_each(jset, entry) {
846	if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
847	c, version, jset, entry,
848	journal_entry_past_jset_end,
849	"journal entry extends past end of jset")) {
850	jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
851	break;
852	}
853
854	ret = bch2_journal_entry_validate(c, jset, entry,
855	version, big_endian: JSET_BIG_ENDIAN(k: jset), flags);
856	if (ret)
857	break;
858	}
859	fsck_err:
860	return ret;
861	}
862
863	static int jset_validate(struct bch_fs *c,
864	struct bch_dev *ca,
865	struct jset *jset, u64 sector,
866	enum bkey_invalid_flags flags)
867	{
868	unsigned version;
869	int ret = `0`;
870
871	if (le64_to_cpu(jset->magic) != jset_magic(c))
872	return JOURNAL_ENTRY_NONE;
873
874	version = le32_to_cpu(jset->version);
875	if (journal_entry_err_on(!bch2_version_compatible(version),
876	c, version, jset, NULL,
877	jset_unsupported_version,
878	"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
879	ca ? ca->name : c->name,
880	sector, le64_to_cpu(jset->seq),
881	BCH_VERSION_MAJOR(version),
882	BCH_VERSION_MINOR(version))) {
883	/ don't try to continue: /
884	return -EINVAL;
885	}
886
887	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
888	c, version, jset, NULL,
889	jset_unknown_csum,
890	"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
891	ca ? ca->name : c->name,
892	sector, le64_to_cpu(jset->seq),
893	JSET_CSUM_TYPE(jset)))
894	ret = JOURNAL_ENTRY_BAD;
895
896	/ last_seq is ignored when JSET_NO_FLUSH is true /
897	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
898	le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
899	c, version, jset, NULL,
900	jset_last_seq_newer_than_seq,
901	"invalid journal entry: last_seq > seq (%llu > %llu)",
902	le64_to_cpu(jset->last_seq),
903	le64_to_cpu(jset->seq))) {
904	jset->last_seq = jset->seq;
905	return JOURNAL_ENTRY_BAD;
906	}
907
908	ret = jset_validate_entries(c, jset, flags);
909	fsck_err:
910	return ret;
911	}
912
913	static int jset_validate_early(struct bch_fs *c,
914	struct bch_dev *ca,
915	struct jset *jset, u64 sector,
916	unsigned bucket_sectors_left,
917	unsigned sectors_read)
918	{
919	size_t bytes = vstruct_bytes(jset);
920	unsigned version;
921	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
922	int ret = `0`;
923
924	if (le64_to_cpu(jset->magic) != jset_magic(c))
925	return JOURNAL_ENTRY_NONE;
926
927	version = le32_to_cpu(jset->version);
928	if (journal_entry_err_on(!bch2_version_compatible(version),
929	c, version, jset, NULL,
930	jset_unsupported_version,
931	"%s sector %llu seq %llu: unknown journal entry version %u.%u",
932	ca ? ca->name : c->name,
933	sector, le64_to_cpu(jset->seq),
934	BCH_VERSION_MAJOR(version),
935	BCH_VERSION_MINOR(version))) {
936	/ don't try to continue: /
937	return -EINVAL;
938	}
939
940	if (bytes > (sectors_read << `9`) &&
941	sectors_read < bucket_sectors_left)
942	return JOURNAL_ENTRY_REREAD;
943
944	if (journal_entry_err_on(bytes > bucket_sectors_left << `9`,
945	c, version, jset, NULL,
946	jset_past_bucket_end,
947	"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
948	ca ? ca->name : c->name,
949	sector, le64_to_cpu(jset->seq), bytes))
950	le32_add_cpu(var: &jset->u64s,
951	val: -((bytes - (bucket_sectors_left << `9`)) / `8`));
952	fsck_err:
953	return ret;
954	}
955
956	struct journal_read_buf {
957	void *data;
958	size_t size;
959	};
960
961	static int journal_read_buf_realloc(struct journal_read_buf *b,
962	size_t new_size)
963	{
964	void *n;
965
966	/ the bios are sized for this many pages, max: /
967	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
968	return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
969
970	new_size = roundup_pow_of_two(new_size);
971	n = kvmalloc(size: new_size, GFP_KERNEL);
972	if (!n)
973	return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
974
975	kvfree(addr: b->data);
976	b->data = n;
977	b->size = new_size;
978	return `0`;
979	}
980
981	static int journal_read_bucket(struct bch_dev *ca,
982	struct journal_read_buf *buf,
983	struct journal_list *jlist,
984	unsigned bucket)
985	{
986	struct bch_fs *c = ca->fs;
987	struct journal_device *ja = &ca->journal;
988	struct jset *j = NULL;
989	unsigned sectors, sectors_read = `0`;
990	u64 offset = bucket_to_sector(ca, b: ja->buckets[bucket]),
991	end = offset + ca->mi.bucket_size;
992	bool saw_bad = false, csum_good;
993	struct printbuf err = PRINTBUF;
994	int ret = `0`;
995
996	pr_debug("reading %u", bucket);
997
998	while (offset < end) {
999	if (!sectors_read) {
1000	struct bio *bio;
1001	unsigned nr_bvecs;
1002	reread:
1003	sectors_read = min_t(unsigned,
1004	end - offset, buf->size >> `9`);
1005	nr_bvecs = buf_pages(p: buf->data, len: sectors_read << `9`);
1006
1007	bio = bio_kmalloc(nr_vecs: nr_bvecs, GFP_KERNEL);
1008	bio_init(bio, bdev: ca->disk_sb.bdev, table: bio->bi_inline_vecs, max_vecs: nr_bvecs, opf: REQ_OP_READ);
1009
1010	bio->bi_iter.bi_sector = offset;
1011	bch2_bio_map(bio, base: buf->data, sectors_read << `9`);
1012
1013	ret = submit_bio_wait(bio);
1014	kfree(objp: bio);
1015
1016	if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
1017	"journal read error: sector %llu",
1018	offset) \|\|
1019	bch2_meta_read_fault("journal")) {
1020	/*
1021	* We don't error out of the recovery process
1022	* here, since the relevant journal entry may be
1023	* found on a different device, and missing or
1024	* no journal entries will be handled later
1025	*/
1026	goto out;
1027	}
1028
1029	j = buf->data;
1030	}
1031
1032	ret = jset_validate_early(c, ca, jset: j, sector: offset,
1033	bucket_sectors_left: end - offset, sectors_read);
1034	switch (ret) {
1035	case `0`:
1036	sectors = vstruct_sectors(j, c->block_bits);
1037	break;
1038	case JOURNAL_ENTRY_REREAD:
1039	if (vstruct_bytes(j) > buf->size) {
1040	ret = journal_read_buf_realloc(b: buf,
1041	vstruct_bytes(j));
1042	if (ret)
1043	goto err;
1044	}
1045	goto reread;
1046	case JOURNAL_ENTRY_NONE:
1047	if (!saw_bad)
1048	goto out;
1049	/*
1050	* On checksum error we don't really trust the size
1051	* field of the journal entry we read, so try reading
1052	* again at next block boundary:
1053	*/
1054	sectors = block_sectors(c);
1055	goto next_block;
1056	default:
1057	goto err;
1058	}
1059
1060	/*
1061	* This happens sometimes if we don't have discards on -
1062	* when we've partially overwritten a bucket with new
1063	* journal entries. We don't need the rest of the
1064	* bucket:
1065	*/
1066	if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
1067	goto out;
1068
1069	ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
1070
1071	enum bch_csum_type csum_type = JSET_CSUM_TYPE(k: j);
1072	struct bch_csum csum;
1073	csum_good = jset_csum_good(c, j, csum: &csum);
1074
1075	if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
1076	"%s",
1077	(printbuf_reset(&err),
1078	prt_str(&err, "journal "),
1079	bch2_csum_err_msg(&err, csum_type, j->csum, csum),
1080	err.buf)))
1081	saw_bad = true;
1082
1083	ret = bch2_encrypt(c, JSET_CSUM_TYPE(k: j), journal_nonce(jset: j),
1084	data: j->encrypted_start,
1085	vstruct_end(j) - (void *) j->encrypted_start);
1086	bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
1087
1088	mutex_lock(&jlist->lock);
1089	ret = journal_entry_add(c, ca, entry_ptr: (struct journal_ptr) {
1090	.csum_good = csum_good,
1091	.dev = ca->dev_idx,
1092	.bucket = bucket,
1093	.bucket_offset = offset -
1094	bucket_to_sector(ca, b: ja->buckets[bucket]),
1095	.sector = offset,
1096	}, jlist, j);
1097	mutex_unlock(lock: &jlist->lock);
1098
1099	switch (ret) {
1100	case JOURNAL_ENTRY_ADD_OK:
1101	break;
1102	case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
1103	break;
1104	default:
1105	goto err;
1106	}
1107	next_block:
1108	pr_debug("next");
1109	offset += sectors;
1110	sectors_read -= sectors;
1111	j = ((void *) j) + (sectors << `9`);
1112	}
1113
1114	out:
1115	ret = `0`;
1116	err:
1117	printbuf_exit(&err);
1118	return ret;
1119	}
1120
1121	static CLOSURE_CALLBACK(bch2_journal_read_device)
1122	{
1123	closure_type(ja, struct journal_device, read);
1124	struct bch_dev ca = container_of(ja, struct* bch_dev, journal);
1125	struct bch_fs *c = ca->fs;
1126	struct journal_list *jlist =
1127	container_of(cl->parent, struct journal_list, cl);
1128	struct journal_replay r, *_r;
1129	struct genradix_iter iter;
1130	struct journal_read_buf buf = { NULL, `0` };
1131	unsigned i;
1132	int ret = `0`;
1133
1134	if (!ja->nr)
1135	goto out;
1136
1137	ret = journal_read_buf_realloc(b: &buf, PAGE_SIZE);
1138	if (ret)
1139	goto err;
1140
1141	pr_debug("%u journal buckets", ja->nr);
1142
1143	for (i = `0`; i < ja->nr; i++) {
1144	ret = journal_read_bucket(ca, buf: &buf, jlist, bucket: i);
1145	if (ret)
1146	goto err;
1147	}
1148
1149	ja->sectors_free = ca->mi.bucket_size;
1150
1151	mutex_lock(&jlist->lock);
1152	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
1153	r = *_r;
1154
1155	if (!r)
1156	continue;
1157
1158	darray_for_each(r->ptrs, i)
1159	if (i->dev == ca->dev_idx) {
1160	unsigned wrote = bucket_remainder(ca, s: i->sector) +
1161	vstruct_sectors(&r->j, c->block_bits);
1162
1163	ja->cur_idx = i->bucket;
1164	ja->sectors_free = ca->mi.bucket_size - wrote;
1165	goto found;
1166	}
1167	}
1168	found:
1169	mutex_unlock(lock: &jlist->lock);
1170
1171	if (ja->bucket_seq[ja->cur_idx] &&
1172	ja->sectors_free == ca->mi.bucket_size) {
1173	#if 0
1174	/*
1175	* Debug code for ZNS support, where we (probably) want to be
1176	* correlated where we stopped in the journal to the zone write
1177	* points:
1178	*/
1179	bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
1180	bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
1181	for (i = `0`; i < `3`; i++) {
1182	unsigned idx = (ja->cur_idx + ja->nr - `1` + i) % ja->nr;
1183
1184	bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
1185	}
1186	#endif
1187	ja->sectors_free = `0`;
1188	}
1189
1190	/*
1191	* Set dirty_idx to indicate the entire journal is full and needs to be
1192	* reclaimed - journal reclaim will immediately reclaim whatever isn't
1193	* pinned when it first runs:
1194	*/
1195	ja->discard_idx = ja->dirty_idx_ondisk =
1196	ja->dirty_idx = (ja->cur_idx + `1`) % ja->nr;
1197	out:
1198	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
1199	kvfree(addr: buf.data);
1200	percpu_ref_put(ref: &ca->io_ref);
1201	closure_return(cl);
1202	return;
1203	err:
1204	mutex_lock(&jlist->lock);
1205	jlist->ret = ret;
1206	mutex_unlock(lock: &jlist->lock);
1207	goto out;
1208	}
1209
1210	int bch2_journal_read(struct bch_fs *c,
1211	u64 *last_seq,
1212	u64 *blacklist_seq,
1213	u64 *start_seq)
1214	{
1215	struct journal_list jlist;
1216	struct journal_replay i, _i, prev = NULL;
1217	struct genradix_iter radix_iter;
1218	struct printbuf buf = PRINTBUF;
1219	bool degraded = false, last_write_torn = false;
1220	u64 seq;
1221	int ret = `0`;
1222
1223	closure_init_stack(cl: &jlist.cl);
1224	mutex_init(&jlist.lock);
1225	jlist.last_seq = `0`;
1226	jlist.ret = `0`;
1227
1228	for_each_member_device(c, ca) {
1229	if (!c->opts.fsck &&
1230	!(bch2_dev_has_data(c, ca) & (`1` << BCH_DATA_journal)))
1231	continue;
1232
1233	if ((ca->mi.state == BCH_MEMBER_STATE_rw \|\|
1234	ca->mi.state == BCH_MEMBER_STATE_ro) &&
1235	percpu_ref_tryget(ref: &ca->io_ref))
1236	closure_call(cl: &ca->journal.read,
1237	fn: bch2_journal_read_device,
1238	wq: system_unbound_wq,
1239	parent: &jlist.cl);
1240	else
1241	degraded = true;
1242	}
1243
1244	closure_sync(cl: &jlist.cl);
1245
1246	if (jlist.ret)
1247	return jlist.ret;
1248
1249	*last_seq = `0`;
1250	*start_seq = `0`;
1251	*blacklist_seq = `0`;
1252
1253	/*
1254	* Find most recent flush entry, and ignore newer non flush entries -
1255	* those entries will be blacklisted:
1256	*/
1257	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
1258	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
1259
1260	i = *_i;
1261
1262	if (journal_replay_ignore(i))
1263	continue;
1264
1265	if (!*start_seq)
1266	blacklist_seq = start_seq = le64_to_cpu(i->j.seq) + `1`;
1267
1268	if (JSET_NO_FLUSH(k: &i->j)) {
1269	i->ignore_blacklisted = true;
1270	continue;
1271	}
1272
1273	if (!last_write_torn && !i->csum_good) {
1274	last_write_torn = true;
1275	i->ignore_blacklisted = true;
1276	continue;
1277	}
1278
1279	if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
1280	c, le32_to_cpu(i->j.version), &i->j, NULL,
1281	jset_last_seq_newer_than_seq,
1282	"invalid journal entry: last_seq > seq (%llu > %llu)",
1283	le64_to_cpu(i->j.last_seq),
1284	le64_to_cpu(i->j.seq)))
1285	i->j.last_seq = i->j.seq;
1286
1287	*last_seq = le64_to_cpu(i->j.last_seq);
1288	*blacklist_seq = le64_to_cpu(i->j.seq) + `1`;
1289	break;
1290	}
1291
1292	if (!*start_seq) {
1293	bch_info(c, "journal read done, but no entries found");
1294	return `0`;
1295	}
1296
1297	if (!*last_seq) {
1298	fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
1299	"journal read done, but no entries found after dropping non-flushes");
1300	return `0`;
1301	}
1302
1303	bch_info(c, "journal read done, replaying entries %llu-%llu",
1304	last_seq, blacklist_seq - `1`);
1305
1306	if (start_seq != blacklist_seq)
1307	bch_info(c, "dropped unflushed entries %llu-%llu",
1308	blacklist_seq, start_seq - `1`);
1309
1310	/ Drop blacklisted entries and entries older than last_seq: /
1311	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1312	i = *_i;
1313
1314	if (journal_replay_ignore(i))
1315	continue;
1316
1317	seq = le64_to_cpu(i->j.seq);
1318	if (seq < *last_seq) {
1319	journal_replay_free(c, i, blacklisted: false);
1320	continue;
1321	}
1322
1323	if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
1324	fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
1325	jset_seq_blacklisted,
1326	"found blacklisted journal entry %llu", seq);
1327	i->ignore_blacklisted = true;
1328	}
1329	}
1330
1331	/ Check for missing entries: /
1332	seq = *last_seq;
1333	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1334	i = *_i;
1335
1336	if (journal_replay_ignore(i))
1337	continue;
1338
1339	BUG_ON(seq > le64_to_cpu(i->j.seq));
1340
1341	while (seq < le64_to_cpu(i->j.seq)) {
1342	u64 missing_start, missing_end;
1343	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
1344
1345	while (seq < le64_to_cpu(i->j.seq) &&
1346	bch2_journal_seq_is_blacklisted(c, seq, false))
1347	seq++;
1348
1349	if (seq == le64_to_cpu(i->j.seq))
1350	break;
1351
1352	missing_start = seq;
1353
1354	while (seq < le64_to_cpu(i->j.seq) &&
1355	!bch2_journal_seq_is_blacklisted(c, seq, false))
1356	seq++;
1357
1358	if (prev) {
1359	bch2_journal_ptrs_to_text(out: &buf1, c, j: prev);
1360	prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
1361	} else
1362	prt_printf(&buf1, "(none)");
1363	bch2_journal_ptrs_to_text(out: &buf2, c, j: i);
1364
1365	missing_end = seq - `1`;
1366	fsck_err(c, journal_entries_missing,
1367	"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
1368	" prev at %s\n"
1369	" next at %s",
1370	missing_start, missing_end,
1371	last_seq, blacklist_seq - `1`,
1372	buf1.buf, buf2.buf);
1373
1374	printbuf_exit(&buf1);
1375	printbuf_exit(&buf2);
1376	}
1377
1378	prev = i;
1379	seq++;
1380	}
1381
1382	genradix_for_each(&c->journal_entries, radix_iter, _i) {
1383	struct bch_replicas_padded replicas = {
1384	.e.data_type = BCH_DATA_journal,
1385	.e.nr_required = `1`,
1386	};
1387
1388	i = *_i;
1389	if (journal_replay_ignore(i))
1390	continue;
1391
1392	darray_for_each(i->ptrs, ptr) {
1393	struct bch_dev *ca = bch_dev_bkey_exists(c, idx: ptr->dev);
1394
1395	if (!ptr->csum_good)
1396	bch_err_dev_offset(ca, ptr->sector,
1397	"invalid journal checksum, seq %llu%s",
1398	le64_to_cpu(i->j.seq),
1399	i->csum_good ? " (had good copy on another device)" : "");
1400	}
1401
1402	ret = jset_validate(c,
1403	ca: bch_dev_bkey_exists(c, idx: i->ptrs.data[`0`].dev),
1404	jset: &i->j,
1405	sector: i->ptrs.data[`0`].sector,
1406	READ);
1407	if (ret)
1408	goto err;
1409
1410	darray_for_each(i->ptrs, ptr)
1411	replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
1412
1413	bch2_replicas_entry_sort(&replicas.e);
1414
1415	printbuf_reset(buf: &buf);
1416	bch2_replicas_entry_to_text(&buf, &replicas.e);
1417
1418	if (!degraded &&
1419	!bch2_replicas_marked(c, &replicas.e) &&
1420	(le64_to_cpu(i->j.seq) == *last_seq \|\|
1421	fsck_err(c, journal_entry_replicas_not_marked,
1422	"superblock not marked as containing replicas for journal entry %llu\n %s",
1423	le64_to_cpu(i->j.seq), buf.buf))) {
1424	ret = bch2_mark_replicas(c, &replicas.e);
1425	if (ret)
1426	goto err;
1427	}
1428	}
1429	err:
1430	fsck_err:
1431	printbuf_exit(&buf);
1432	return ret;
1433	}
1434
/* journal write: */
1436
1437	static void __journal_write_alloc(struct journal *j,
1438	struct journal_buf *w,
1439	struct dev_alloc_list *devs_sorted,
1440	unsigned sectors,
1441	unsigned *replicas,
1442	unsigned replicas_want)
1443	{
1444	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1445	struct journal_device *ja;
1446	struct bch_dev *ca;
1447	unsigned i;
1448
1449	if (*replicas >= replicas_want)
1450	return;
1451
1452	for (i = `0`; i < devs_sorted->nr; i++) {
1453	ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
1454	if (!ca)
1455	continue;
1456
1457	ja = &ca->journal;
1458
1459	/*
1460	* Check that we can use this device, and aren't already using
1461	* it:
1462	*/
1463	if (!ca->mi.durability \|\|
1464	ca->mi.state != BCH_MEMBER_STATE_rw \|\|
1465	!ja->nr \|\|
1466	bch2_bkey_has_device_c(bkey_i_to_s_c(k: &w->key), ca->dev_idx) \|\|
1467	sectors > ja->sectors_free)
1468	continue;
1469
1470	bch2_dev_stripe_increment(ca, &j->wp.stripe);
1471
1472	bch2_bkey_append_ptr(k: &w->key,
1473	ptr: (struct bch_extent_ptr) {
1474	.offset = bucket_to_sector(ca,
1475	b: ja->buckets[ja->cur_idx]) +
1476	ca->mi.bucket_size -
1477	ja->sectors_free,
1478	.dev = ca->dev_idx,
1479	});
1480
1481	ja->sectors_free -= sectors;
1482	ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1483
1484	*replicas += ca->mi.durability;
1485
1486	if (*replicas >= replicas_want)
1487	break;
1488	}
1489	}
1490
1491	/**
1492	* journal_write_alloc - decide where to write next journal entry
1493	*
1494	* @j: journal object
1495	* @w: journal buf (entry to be written)
1496	*
1497	* Returns: 0 on success, or -EROFS on failure
1498	*/
1499	static int journal_write_alloc(struct journal j, struct* journal_buf *w)
1500	{
1501	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1502	struct bch_devs_mask devs;
1503	struct journal_device *ja;
1504	struct bch_dev *ca;
1505	struct dev_alloc_list devs_sorted;
1506	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1507	unsigned target = c->opts.metadata_target ?:
1508	c->opts.foreground_target;
1509	unsigned i, replicas = `0`, replicas_want =
1510	READ_ONCE(c->opts.metadata_replicas);
1511	unsigned replicas_need = min_t(unsigned, replicas_want,
1512	READ_ONCE(c->opts.metadata_replicas_required));
1513
1514	rcu_read_lock();
1515	retry:
1516	devs = target_rw_devs(c, data_type: BCH_DATA_journal, target);
1517
1518	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
1519
1520	__journal_write_alloc(j, w, devs_sorted: &devs_sorted,
1521	sectors, replicas: &replicas, replicas_want);
1522
1523	if (replicas >= replicas_want)
1524	goto done;
1525
1526	for (i = `0`; i < devs_sorted.nr; i++) {
1527	ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
1528	if (!ca)
1529	continue;
1530
1531	ja = &ca->journal;
1532
1533	if (sectors > ja->sectors_free &&
1534	sectors <= ca->mi.bucket_size &&
1535	bch2_journal_dev_buckets_available(j, ja,
1536	journal_space_discarded)) {
1537	ja->cur_idx = (ja->cur_idx + `1`) % ja->nr;
1538	ja->sectors_free = ca->mi.bucket_size;
1539
1540	/*
1541	* ja->bucket_seq[ja->cur_idx] must always have
1542	* something sensible:
1543	*/
1544	ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1545	}
1546	}
1547
1548	__journal_write_alloc(j, w, devs_sorted: &devs_sorted,
1549	sectors, replicas: &replicas, replicas_want);
1550
1551	if (replicas < replicas_want && target) {
1552	/ Retry from all devices: /
1553	target = `0`;
1554	goto retry;
1555	}
1556	done:
1557	rcu_read_unlock();
1558
1559	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
1560
1561	return replicas >= replicas_need ? `0` : -EROFS;
1562	}
1563
1564	static void journal_buf_realloc(struct journal j, struct* journal_buf *buf)
1565	{
1566	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1567
1568	/ we aren't holding j->lock: /
1569	unsigned new_size = READ_ONCE(j->buf_size_want);
1570	void *new_buf;
1571
1572	if (buf->buf_size >= new_size)
1573	return;
1574
1575	size_t btree_write_buffer_size = new_size / `64`;
1576
1577	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
1578	return;
1579
1580	new_buf = kvmalloc(size: new_size, GFP_NOFS\|__GFP_NOWARN);
1581	if (!new_buf)
1582	return;
1583
1584	memcpy(new_buf, buf->data, buf->buf_size);
1585
1586	spin_lock(lock: &j->lock);
1587	swap(buf->data, new_buf);
1588	swap(buf->buf_size, new_size);
1589	spin_unlock(lock: &j->lock);
1590
1591	kvfree(addr: new_buf);
1592	}
1593
1594	static inline struct journal_buf journal_last_unwritten_buf(struct* journal *j)
1595	{
1596	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1597	}
1598
1599	static CLOSURE_CALLBACK(journal_write_done)
1600	{
1601	closure_type(w, struct journal_buf, io);
1602	struct journal j = container_of(w, struct* journal, buf[w->idx]);
1603	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1604	struct bch_replicas_padded replicas;
1605	union journal_res_state old, new;
1606	u64 v, seq = le64_to_cpu(w->data->seq);
1607	int err = `0`;
1608
1609	bch2_time_stats_update(stats: !JSET_NO_FLUSH(k: w->data)
1610	? j->flush_write_time
1611	: j->noflush_write_time, start: j->write_start_time);
1612
1613	if (!w->devs_written.nr) {
1614	bch_err(c, "unable to write journal to sufficient devices");
1615	err = -EIO;
1616	} else {
1617	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1618	w->devs_written);
1619	if (bch2_mark_replicas(c, &replicas.e))
1620	err = -EIO;
1621	}
1622
1623	if (err)
1624	bch2_fatal_error(c);
1625
1626	closure_debug_destroy(cl);
1627
1628	spin_lock(lock: &j->lock);
1629	if (seq >= j->pin.front)
1630	journal_seq_pin(j, seq)->devs = w->devs_written;
1631	if (err && (!j->err_seq \|\| seq < j->err_seq))
1632	j->err_seq = seq;
1633	w->write_done = true;
1634
1635	bool completed = false;
1636
1637	for (seq = journal_last_unwritten_seq(j);
1638	seq <= journal_cur_seq(j);
1639	seq++) {
1640	w = j->buf + (seq & JOURNAL_BUF_MASK);
1641	if (!w->write_done)
1642	break;
1643
1644	if (!j->err_seq && !JSET_NO_FLUSH(k: w->data)) {
1645	j->flushed_seq_ondisk = seq;
1646	j->last_seq_ondisk = w->last_seq;
1647
1648	bch2_do_discards(c);
1649	closure_wake_up(list: &c->freelist_wait);
1650	bch2_reset_alloc_cursors(c);
1651	}
1652
1653	j->seq_ondisk = seq;
1654
1655	/*
1656	* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1657	* more buckets:
1658	*
1659	* Must come before signaling write completion, for
1660	* bch2_fs_journal_stop():
1661	*/
1662	if (j->watermark != BCH_WATERMARK_stripe)
1663	journal_reclaim_kick(j: &c->journal);
1664
1665	v = atomic64_read(v: &j->reservations.counter);
1666	do {
1667	old.v = new.v = v;
1668	BUG_ON(journal_state_count(new, new.unwritten_idx));
1669	BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
1670
1671	new.unwritten_idx++;
1672	} while ((v = atomic64_cmpxchg(v: &j->reservations.counter, old: old.v, new: new.v)) != old.v);
1673
1674	closure_wake_up(list: &w->wait);
1675	completed = true;
1676	}
1677
1678	if (completed) {
1679	bch2_journal_reclaim_fast(j);
1680	bch2_journal_space_available(j);
1681
1682	track_event_change(stats: &c->times[BCH_TIME_blocked_journal_max_in_flight], v: false);
1683
1684	journal_wake(j);
1685	}
1686
1687	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1688	new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1689	struct journal_buf *buf = journal_cur_buf(j);
1690	long delta = buf->expires - jiffies;
1691
1692	/*
1693	* We don't close a journal entry to write it while there's
1694	* previous entries still in flight - the current journal entry
1695	* might want to be written now:
1696	*/
1697	mod_delayed_work(wq: j->wq, dwork: &j->write_work, max(`0L`, delta));
1698	}
1699
1700	spin_unlock(lock: &j->lock);
1701	}
1702
1703	static void journal_write_endio(struct bio *bio)
1704	{
1705	struct journal_bio jbio = container_of(bio, struct* journal_bio, bio);
1706	struct bch_dev *ca = jbio->ca;
1707	struct journal *j = &ca->fs->journal;
1708	struct journal_buf *w = j->buf + jbio->buf_idx;
1709
1710	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
1711	"error writing journal entry %llu: %s",
1712	le64_to_cpu(w->data->seq),
1713	bch2_blk_status_to_str(bio->bi_status)) \|\|
1714	bch2_meta_write_fault("journal")) {
1715	unsigned long flags;
1716
1717	spin_lock_irqsave(&j->err_lock, flags);
1718	bch2_dev_list_drop_dev(devs: &w->devs_written, dev: ca->dev_idx);
1719	spin_unlock_irqrestore(lock: &j->err_lock, flags);
1720	}
1721
1722	closure_put(cl: &w->io);
1723	percpu_ref_put(ref: &ca->io_ref);
1724	}
1725
1726	static CLOSURE_CALLBACK(journal_write_submit)
1727	{
1728	closure_type(w, struct journal_buf, io);
1729	struct journal j = container_of(w, struct* journal, buf[w->idx]);
1730	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1731	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1732
1733	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1734	struct bch_dev *ca = bch_dev_bkey_exists(c, idx: ptr->dev);
1735	struct journal_device *ja = &ca->journal;
1736
1737	if (!percpu_ref_tryget(ref: &ca->io_ref)) {
1738	/ XXX: fix this /
1739	bch_err(c, "missing device for journal write\n");
1740	continue;
1741	}
1742
1743	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1744	sectors);
1745
1746	struct bio *bio = &ja->bio[w->idx]->bio;
1747	bio_reset(bio, bdev: ca->disk_sb.bdev, opf: REQ_OP_WRITE\|REQ_SYNC\|REQ_META);
1748	bio->bi_iter.bi_sector = ptr->offset;
1749	bio->bi_end_io = journal_write_endio;
1750	bio->bi_private = ca;
1751
1752	BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1753	ca->prev_journal_sector = bio->bi_iter.bi_sector;
1754
1755	if (!JSET_NO_FLUSH(k: w->data))
1756	bio->bi_opf \|= REQ_FUA;
1757	if (!JSET_NO_FLUSH(k: w->data) && !w->separate_flush)
1758	bio->bi_opf \|= REQ_PREFLUSH;
1759
1760	bch2_bio_map(bio, base: w->data, sectors << `9`);
1761
1762	trace_and_count(c, journal_write, bio);
1763	closure_bio_submit(bio, cl);
1764
1765	ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1766	}
1767
1768	continue_at(cl, journal_write_done, j->wq);
1769	}
1770
1771	static CLOSURE_CALLBACK(journal_write_preflush)
1772	{
1773	closure_type(w, struct journal_buf, io);
1774	struct journal j = container_of(w, struct* journal, buf[w->idx]);
1775	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1776
1777	if (j->seq_ondisk + `1` != le64_to_cpu(w->data->seq)) {
1778	spin_lock(lock: &j->lock);
1779	closure_wait(list: &j->async_wait, cl);
1780	spin_unlock(lock: &j->lock);
1781
1782	continue_at(cl, journal_write_preflush, j->wq);
1783	return;
1784	}
1785
1786	if (w->separate_flush) {
1787	for_each_rw_member(c, ca) {
1788	percpu_ref_get(ref: &ca->io_ref);
1789
1790	struct journal_device *ja = &ca->journal;
1791	struct bio *bio = &ja->bio[w->idx]->bio;
1792	bio_reset(bio, bdev: ca->disk_sb.bdev,
1793	opf: REQ_OP_WRITE\|REQ_SYNC\|REQ_META\|REQ_PREFLUSH);
1794	bio->bi_end_io = journal_write_endio;
1795	bio->bi_private = ca;
1796	closure_bio_submit(bio, cl);
1797	}
1798
1799	continue_at(cl, journal_write_submit, j->wq);
1800	} else {
1801	/*
1802	* no need to punt to another work item if we're not waiting on
1803	* preflushes
1804	*/
1805	journal_write_submit(ws: &cl->work);
1806	}
1807	}
1808
1809	static int bch2_journal_write_prep(struct journal j, struct* journal_buf *w)
1810	{
1811	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1812	struct jset_entry start, end;
1813	struct jset *jset = w->data;
1814	struct journal_keys_to_wb wb = { NULL };
1815	unsigned sectors, bytes, u64s;
1816	unsigned long btree_roots_have = `0`;
1817	bool validate_before_checksum = false;
1818	u64 seq = le64_to_cpu(jset->seq);
1819	int ret;
1820
1821	/*
1822	* Simple compaction, dropping empty jset_entries (from journal
1823	* reservations that weren't fully used) and merging jset_entries that
1824	* can be.
1825	*
1826	* If we wanted to be really fancy here, we could sort all the keys in
1827	* the jset and drop keys that were overwritten - probably not worth it:
1828	*/
1829	vstruct_for_each(jset, i) {
1830	unsigned u64s = le16_to_cpu(i->u64s);
1831
1832	/ Empty entry: /
1833	if (!u64s)
1834	continue;
1835
1836	/*
1837	* New btree roots are set by journalling them; when the journal
1838	* entry gets written we have to propagate them to
1839	* c->btree_roots
1840	*
1841	* But, every journal entry we write has to contain all the
1842	* btree roots (at least for now); so after we copy btree roots
1843	* to c->btree_roots we have to get any missing btree roots and
1844	* add them to this journal entry:
1845	*/
1846	switch (i->type) {
1847	case BCH_JSET_ENTRY_btree_root:
1848	bch2_journal_entry_to_btree_root(c, i);
1849	__set_bit(i->btree_id, &btree_roots_have);
1850	break;
1851	case BCH_JSET_ENTRY_write_buffer_keys:
1852	EBUG_ON(!w->need_flush_to_write_buffer);
1853
1854	if (!wb.wb)
1855	bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
1856
1857	jset_entry_for_each_key(i, k) {
1858	ret = bch2_journal_key_to_wb(c, dst: &wb, btree: i->btree_id, k);
1859	if (ret) {
1860	bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
1861	bch2_err_str(ret));
1862	bch2_journal_keys_to_write_buffer_end(c, &wb);
1863	return ret;
1864	}
1865	}
1866	i->type = BCH_JSET_ENTRY_btree_keys;
1867	break;
1868	}
1869	}
1870
1871	if (wb.wb)
1872	bch2_journal_keys_to_write_buffer_end(c, &wb);
1873
1874	spin_lock(lock: &c->journal.lock);
1875	w->need_flush_to_write_buffer = false;
1876	spin_unlock(lock: &c->journal.lock);
1877
1878	start = end = vstruct_last(jset);
1879
1880	end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
1881
1882	struct jset_entry_datetime *d =
1883	container_of(jset_entry_init(&end, sizeof(d)), struct* jset_entry_datetime, entry);
1884	d->entry.type = BCH_JSET_ENTRY_datetime;
1885	d->seconds = cpu_to_le64(ktime_get_real_seconds());
1886
1887	bch2_journal_super_entries_add_common(c, &end, seq);
1888	u64s = (u64 ) end - (u64 ) start;
1889
1890	WARN_ON(u64s > j->entry_u64s_reserved);
1891
1892	le32_add_cpu(var: &jset->u64s, val: u64s);
1893
1894	sectors = vstruct_sectors(jset, c->block_bits);
1895	bytes = vstruct_bytes(jset);
1896
1897	if (sectors > w->sectors) {
1898	bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
1899	vstruct_bytes(jset), w->sectors << `9`,
1900	u64s, w->u64s_reserved, j->entry_u64s_reserved);
1901	return -EINVAL;
1902	}
1903
1904	jset->magic = cpu_to_le64(jset_magic(c));
1905	jset->version = cpu_to_le32(c->sb.version);
1906
1907	SET_JSET_BIG_ENDIAN(k: jset, CPU_BIG_ENDIAN);
1908	SET_JSET_CSUM_TYPE(k: jset, v: bch2_meta_checksum_type(c));
1909
1910	if (!JSET_NO_FLUSH(k: jset) && journal_entry_empty(j: jset))
1911	j->last_empty_seq = seq;
1912
1913	if (bch2_csum_type_is_encryption(type: JSET_CSUM_TYPE(k: jset)))
1914	validate_before_checksum = true;
1915
1916	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1917	validate_before_checksum = true;
1918
1919	if (validate_before_checksum &&
1920	(ret = jset_validate(c, NULL, jset, sector: `0`, WRITE)))
1921	return ret;
1922
1923	ret = bch2_encrypt(c, JSET_CSUM_TYPE(k: jset), journal_nonce(jset),
1924	data: jset->encrypted_start,
1925	vstruct_end(jset) - (void *) jset->encrypted_start);
1926	if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
1927	return ret;
1928
1929	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1930	journal_nonce(jset), jset);
1931
1932	if (!validate_before_checksum &&
1933	(ret = jset_validate(c, NULL, jset, sector: `0`, WRITE)))
1934	return ret;
1935
1936	memset((void *) jset + bytes, `0`, (sectors << `9`) - bytes);
1937	return `0`;
1938	}
1939
1940	static int bch2_journal_write_pick_flush(struct journal j, struct* journal_buf *w)
1941	{
1942	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1943	int error = bch2_journal_error(j);
1944
1945	/*
1946	* If the journal is in an error state - we did an emergency shutdown -
1947	* we prefer to continue doing journal writes. We just mark them as
1948	* noflush so they'll never be used, but they'll still be visible by the
1949	* list_journal tool - this helps in debugging.
1950	*
1951	* There's a caveat: the first journal write after marking the
1952	* superblock dirty must always be a flush write, because on startup
1953	* from a clean shutdown we didn't necessarily read the journal and the
1954	* new journal write might overwrite whatever was in the journal
1955	* previously - we can't leave the journal without any flush writes in
1956	* it.
1957	*
1958	* So if we're in an error state, and we're still starting up, we don't
1959	* write anything at all.
1960	*/
1961	if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
1962	return -EIO;
1963
1964	if (error \|\|
1965	w->noflush \|\|
1966	(!w->must_flush &&
1967	(jiffies - j->last_flush_write) < msecs_to_jiffies(m: c->opts.journal_flush_delay) &&
1968	test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
1969	w->noflush = true;
1970	SET_JSET_NO_FLUSH(k: w->data, v: true);
1971	w->data->last_seq = `0`;
1972	w->last_seq = `0`;
1973
1974	j->nr_noflush_writes++;
1975	} else {
1976	w->must_flush = true;
1977	j->last_flush_write = jiffies;
1978	j->nr_flush_writes++;
1979	clear_bit(nr: JOURNAL_NEED_FLUSH_WRITE, addr: &j->flags);
1980	}
1981
1982	return `0`;
1983	}
1984
1985	CLOSURE_CALLBACK(bch2_journal_write)
1986	{
1987	closure_type(w, struct journal_buf, io);
1988	struct journal j = container_of(w, struct* journal, buf[w->idx]);
1989	struct bch_fs c = container_of(j, struct* bch_fs, journal);
1990	struct bch_replicas_padded replicas;
1991	struct printbuf journal_debug_buf = PRINTBUF;
1992	unsigned nr_rw_members = `0`;
1993	int ret;
1994
1995	for_each_rw_member(c, ca)
1996	nr_rw_members++;
1997
1998	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
1999	BUG_ON(!w->write_started);
2000	BUG_ON(w->write_allocated);
2001	BUG_ON(w->write_done);
2002
2003	j->write_start_time = local_clock();
2004
2005	spin_lock(lock: &j->lock);
2006	if (nr_rw_members > `1`)
2007	w->separate_flush = true;
2008
2009	ret = bch2_journal_write_pick_flush(j, w);
2010	spin_unlock(lock: &j->lock);
2011	if (ret)
2012	goto err;
2013
2014	mutex_lock(&j->buf_lock);
2015	journal_buf_realloc(j, buf: w);
2016
2017	ret = bch2_journal_write_prep(j, w);
2018	mutex_unlock(lock: &j->buf_lock);
2019	if (ret)
2020	goto err;
2021
2022	j->entry_bytes_written += vstruct_bytes(w->data);
2023
2024	while (`1`) {
2025	spin_lock(lock: &j->lock);
2026	ret = journal_write_alloc(j, w);
2027	if (!ret \|\| !j->can_discard)
2028	break;
2029
2030	spin_unlock(lock: &j->lock);
2031	bch2_journal_do_discards(j);
2032	}
2033
2034	if (ret) {
2035	__bch2_journal_debug_to_text(&journal_debug_buf, j);
2036	spin_unlock(lock: &j->lock);
2037	bch_err(c, "Unable to allocate journal write:\n%s",
2038	journal_debug_buf.buf);
2039	printbuf_exit(&journal_debug_buf);
2040	goto err;
2041	}
2042
2043	/*
2044	* write is allocated, no longer need to account for it in
2045	* bch2_journal_space_available():
2046	*/
2047	w->sectors = `0`;
2048	w->write_allocated = true;
2049
2050	/*
2051	* journal entry has been compacted and allocated, recalculate space
2052	* available:
2053	*/
2054	bch2_journal_space_available(j);
2055	bch2_journal_do_writes(j);
2056	spin_unlock(lock: &j->lock);
2057
2058	w->devs_written = bch2_bkey_devs(k: bkey_i_to_s_c(k: &w->key));
2059
2060	if (c->opts.nochanges)
2061	goto no_io;
2062
2063	/*
2064	* Mark journal replicas before we submit the write to guarantee
2065	* recovery will find the journal entries after a crash.
2066	*/
2067	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
2068	w->devs_written);
2069	ret = bch2_mark_replicas(c, &replicas.e);
2070	if (ret)
2071	goto err;
2072
2073	if (!JSET_NO_FLUSH(k: w->data))
2074	continue_at(cl, journal_write_preflush, j->wq);
2075	else
2076	continue_at(cl, journal_write_submit, j->wq);
2077	return;
2078	no_io:
2079	continue_at(cl, journal_write_done, j->wq);
2080	return;
2081	err:
2082	bch2_fatal_error(c);
2083	continue_at(cl, journal_write_done, j->wq);
2084	}
2085

source code of linux/fs/bcachefs/journal_io.c