/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H

/*
 * bcachefs on disk data structures
 *
 * OVERVIEW:
 *
 * There are three main types of on disk data structures in bcachefs (this is
 * reduced from 5 in bcache)
 *
 * - superblock
 * - journal
 * - btree
 *
 * The btree is the primary structure; most metadata exists as keys in the
 * various btrees. There are only a small number of btrees, they're not
 * sharded - we have one btree for extents, another for inodes, et cetera.
 *
 * SUPERBLOCK:
 *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem, prior to reading the journal/btree
 * roots.
 *
 * The superblock is extensible, and most of the contents of the superblock are
 * in variable length, type tagged fields; see struct bch_sb_field.
 *
 * Backup superblocks do not reside in a fixed location; also, superblocks do
 * not have a fixed size. To locate backup superblocks we have struct
 * bch_sb_layout; we store a copy of this inside every superblock, and also
 * before the first superblock.
 *
 * JOURNAL:
 *
 * The journal primarily records btree updates in the order they occurred;
 * journal replay consists of just iterating over all the keys in the open
 * journal entries and re-inserting them into the btrees.
 *
 * The journal also contains entry types for the btree roots, and blacklisted
 * journal sequence numbers (see journal_seq_blacklist.c).
 *
 * BTREE:
 *
 * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
 * 256k-512k) and log structured. We use struct btree_node for writing the first
 * entry in a given node (offset 0), and struct btree_node_entry for all
 * subsequent writes.
 *
 * After the header, btree node entries contain a list of keys in sorted order.
 * Values are stored inline with the keys; since values are variable length (and
 * keys effectively are variable length too, due to packing) we can't do random
 * access without building up additional in memory tables in the btree node read
 * path.
 *
 * BTREE KEYS (struct bkey):
 *
 * The various btrees share a common format for the key - so as to avoid
 * switching in fastpath lookup/comparison code - but define their own
 * structures for the key values.
 *
 * The size of a key/value pair is stored as a u8 in units of u64s, so the max
 * size is just under 2k. The common part also contains a type tag for the
 * value, and a format field indicating whether the key is packed or not (and
 * also meant to allow adding new key fields in the future, if desired).
 *
 * bkeys, when stored within a btree node, may also be packed. In that case, the
 * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
 * be generous with field sizes in the common part of the key format (64 bit
 * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
 */

#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
#include "vstructs.h"

#ifdef __KERNEL__
typedef uuid_t __uuid_t;
#endif

#define BITMASK(name, type, field, offset, end)				\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
									\
static inline __u64 name(const type *k)				\
{									\
	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
}									\
									\
static inline void SET_##name(type *k, __u64 v)			\
{									\
	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
	k->field |= (v & ~(~0ULL << (end - offset))) << offset;	\
}

#define LE_BITMASK(_bits, name, type, field, offset, end)		\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
									\
static inline __u64 name(const type *k)				\
{									\
	return (__le##_bits##_to_cpu(k->field) >> offset) &		\
		~(~0ULL << (end - offset));				\
}									\
									\
static inline void SET_##name(type *k, __u64 v)			\
{									\
	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
									\
	new &= ~(~(~0ULL << (end - offset)) << offset);			\
	new |= (v & ~(~0ULL << (end - offset))) << offset;		\
	k->field = __cpu_to_le##_bits(new);				\
}

#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
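/*
 * Illustrative expansion (editor's example, hand-expanded from the macro
 * above): LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
 * generates, roughly,
 *
 *	static inline __u64 BCH_MEMBER_DISCARD(const struct bch_member *k)
 *	{
 *		return (__le64_to_cpu(k->flags) >> 14) & 1;
 *	}
 *
 *	static inline void SET_BCH_MEMBER_DISCARD(struct bch_member *k, __u64 v)
 *	{
 *		__u64 new = __le64_to_cpu(k->flags);
 *
 *		new &= ~(1ULL << 14);
 *		new |= (v & 1) << 14;
 *		k->flags = __cpu_to_le64(new);
 *	}
 *
 * i.e. a getter/setter pair for bits [offset, end) of a little-endian field,
 * plus BCH_MEMBER_DISCARD_OFFSET/_BITS/_MAX constants.
 */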

struct bkey_format {
	__u8		key_u64s;
	__u8		nr_fields;
	/* One unused slot for now: */
	__u8		bits_per_field[6];
	__le64		field_offset[6];
};

/* Btree keys - all units are in sectors */

struct bpos {
	/*
	 * Word order matches machine byte order - btree code treats a bpos as a
	 * single large integer, for search/comparison purposes
	 *
	 * Note that wherever a bpos is embedded in another on disk data
	 * structure, it has to be byte swabbed when reading in metadata that
	 * wasn't written in native endian order:
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u32		snapshot;
	__u64		offset;
	__u64		inode;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u64		inode;
	__u64		offset;		/* Points to end of extent - sectors */
	__u32		snapshot;
#else
#error edit for your odd byteorder.
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;

#define KEY_INODE_MAX			((__u64)~0ULL)
#define KEY_OFFSET_MAX			((__u64)~0ULL)
#define KEY_SNAPSHOT_MAX		((__u32)~0U)
#define KEY_SIZE_MAX			((__u32)~0U)

static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
	return (struct bpos) {
		.inode		= inode,
		.offset		= offset,
		.snapshot	= snapshot,
	};
}

#define POS_MIN				SPOS(0, 0, 0)
#define POS_MAX				SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
#define SPOS_MAX			SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
#define POS(_inode, _offset)		SPOS(_inode, _offset, 0)
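/*
 * Example: extents for inode 4096 are looked up starting from POS(4096, 0);
 * since bpos compares as a single large integer with inode as the most
 * significant field, all keys for one inode sort together, ordered by offset
 * and then by snapshot.
 */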

/* Empty placeholder struct, for container_of() */
struct bch_val {
	__u64		__nothing[0];
};

struct bversion {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u64		lo;
	__u32		hi;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u32		hi;
	__u64		lo;
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;

struct bkey {
	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#else
#error edit for your odd byteorder.
#endif

	/* Type of the value */
	__u8		type;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u8		pad[1];

	struct bversion	version;
	__u32		size;		/* extent size, in sectors */
	struct bpos	p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	struct bpos	p;
	__u32		size;		/* extent size, in sectors */
	struct bversion	version;

	__u8		pad[1];
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/*
 * The big-endian version of bkey can't be compiled by rustc with the "aligned"
 * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
 * So for Rust compatibility, don't include this. It can be included in the LE
 * version because the "packed" attr is redundant in that case.
 *
 * History: (quoting Kent)
 *
 * Specifically, when I was designing bkey, I wanted the header to be no
 * bigger than necessary so that bkey_packed could use the rest. That means that
 * decently often extent keys will fit into only 8 bytes, instead of spilling over
 * to 16.
 *
 * But packed_bkey treats the part after the header - the packed section -
 * as a single multi word, variable length integer. And bkey, the unpacked
 * version, is just a special case version of a bkey_packed; all the packed
 * bkey code will work on keys in any packed format, the in-memory
 * representation of an unpacked key also is just one type of packed key...
 *
 * So that constrains the key part of a big endian bkey to start right
 * after the header.
 *
 * If we ever do a bkey_v2 and need to expand the header by another byte for
 * some reason - that will clean up this wart.
 */
__aligned(8)
#endif
;

struct bkey_packed {
	__u64		_data[0];

	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */

	/*
	 * XXX: next incompat on disk format change, switch format and
	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
	 * bits of the bitfield
	 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#endif

	/* Type of the value */
	__u8		type;
	__u8		key_start[0];

	/*
	 * We copy bkeys with struct assignment in various places, and while
	 * that shouldn't be done with packed bkeys we can't disallow it in C,
	 * and it's legal to cast a bkey to a bkey_packed - so padding it out
	 * to the same size as struct bkey should hopefully be safest.
	 */
	__u8		pad[sizeof(struct bkey) - 3];
} __packed __aligned(8);

typedef struct {
	__le64		lo;
	__le64		hi;
} bch_le128;

#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX			U8_MAX
#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)

#define KEY_PACKED_BITS_START		24

#define KEY_FORMAT_LOCAL_BTREE		0
#define KEY_FORMAT_CURRENT		1

enum bch_bkey_fields {
	BKEY_FIELD_INODE,
	BKEY_FIELD_OFFSET,
	BKEY_FIELD_SNAPSHOT,
	BKEY_FIELD_SIZE,
	BKEY_FIELD_VERSION_HI,
	BKEY_FIELD_VERSION_LO,
	BKEY_NR_FIELDS,
};

#define bkey_format_field(name, field)					\
	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)

#define BKEY_FORMAT_CURRENT						\
((struct bkey_format) {						\
	.key_u64s	= BKEY_U64s,					\
	.nr_fields	= BKEY_NR_FIELDS,				\
	.bits_per_field = {						\
		bkey_format_field(INODE,	p.inode),		\
		bkey_format_field(OFFSET,	p.offset),		\
		bkey_format_field(SNAPSHOT,	p.snapshot),		\
		bkey_format_field(SIZE,		size),			\
		bkey_format_field(VERSION_HI,	version.hi),		\
		bkey_format_field(VERSION_LO,	version.lo),		\
	},								\
})
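/*
 * With the current struct bkey layout, bits_per_field evaluates to
 * { 64, 64, 32, 32, 32, 64 } (inode, offset, snapshot, size, version.hi,
 * version.lo) - i.e. the in-memory format stores every field unpacked, at
 * full width.
 */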

/* bkey with inline value */
struct bkey_i {
	__u64			_data[0];

	struct bkey	k;
	struct bch_val	v;
};

#define POS_KEY(_pos)							\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= _pos,						\
})

#define KEY(_inode, _offset, _size)					\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= POS(_inode, _offset),				\
	.size		= _size,					\
})

static inline void bkey_init(struct bkey *k)
{
	*k = KEY(0, 0, 0);
}

#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))

#define __BKEY_PADDED(key, pad)					\
	struct bkey_i key; __u64 key ## _pad[pad]
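/*
 * Usage sketch (illustrative, not upstream code): struct bkey_i ends in a zero
 * length value, so callers that need room for an inline value allocate padding
 * after it:
 *
 *	struct { __BKEY_PADDED(k, 8); } tmp;
 *
 *	bkey_init(&tmp.k.k);
 *	tmp.k.k.p = POS(inode, offset);
 *
 * (inode/offset being whatever the caller is inserting); bkey_bytes(&tmp.k.k)
 * then returns the combined key+value size, since u64s counts both.
 */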

/*
 * - DELETED keys are used internally to mark keys that should be ignored but
 *   override keys in composition order. Their version number is ignored.
 *
 * - DISCARDED keys indicate that the data is all 0s because it has been
 *   discarded. DISCARDs may have a version; if the version is nonzero the key
 *   will be persistent, otherwise the key will be dropped whenever the btree
 *   node is rewritten (like DELETED keys).
 *
 * - ERROR: any read of the data returns a read error, as the data was lost due
 *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
 *   by new writes or cluster-wide GC. Node repair can also overwrite them with
 *   the same or a more recent version number, but not with an older version
 *   number.
 *
 * - WHITEOUT: for hash table btrees
 */
#define BCH_BKEY_TYPES()				\
	x(deleted,		0)			\
	x(whiteout,		1)			\
	x(error,		2)			\
	x(cookie,		3)			\
	x(hash_whiteout,	4)			\
	x(btree_ptr,		5)			\
	x(extent,		6)			\
	x(reservation,		7)			\
	x(inode,		8)			\
	x(inode_generation,	9)			\
	x(dirent,		10)			\
	x(xattr,		11)			\
	x(alloc,		12)			\
	x(quota,		13)			\
	x(stripe,		14)			\
	x(reflink_p,		15)			\
	x(reflink_v,		16)			\
	x(inline_data,		17)			\
	x(btree_ptr_v2,		18)			\
	x(indirect_inline_data,	19)			\
	x(alloc_v2,		20)			\
	x(subvolume,		21)			\
	x(snapshot,		22)			\
	x(inode_v2,		23)			\
	x(alloc_v3,		24)			\
	x(set,			25)			\
	x(lru,			26)			\
	x(alloc_v4,		27)			\
	x(backpointer,		28)			\
	x(inode_v3,		29)			\
	x(bucket_gens,		30)			\
	x(snapshot_tree,	31)			\
	x(logged_op_truncate,	32)			\
	x(logged_op_finsert,	33)

enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name	= nr,
	BCH_BKEY_TYPES()
#undef x
	KEY_TYPE_MAX,
};
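/*
 * The x() lists in this file expand by defining x() and invoking the list;
 * here that produces KEY_TYPE_deleted = 0, KEY_TYPE_whiteout = 1,
 * KEY_TYPE_error = 2, ..., KEY_TYPE_logged_op_finsert = 33, then KEY_TYPE_MAX.
 * The same pattern is used throughout this file.
 */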

struct bch_deleted {
	struct bch_val		v;
};

struct bch_whiteout {
	struct bch_val		v;
};

struct bch_error {
	struct bch_val		v;
};

struct bch_cookie {
	struct bch_val		v;
	__le64			cookie;
};

struct bch_hash_whiteout {
	struct bch_val		v;
};

struct bch_set {
	struct bch_val		v;
};

/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
	__le64			lo;
	__le64			hi;
} __packed __aligned(8);

struct bch_backpointer {
	struct bch_val		v;
	__u8			btree_id;
	__u8			level;
	__u8			data_type;
	__u64			bucket_offset:40;
	__u32			bucket_len;
	struct bpos		pos;
} __packed __aligned(8);

/* LRU btree: */

struct bch_lru {
	struct bch_val		v;
	__le64			idx;
} __packed __aligned(8);

#define LRU_ID_STRIPES		(1U << 16)

/* Optional/variable size superblock sections: */

struct bch_sb_field {
	__u64			_data[0];
	__le32			u64s;
	__le32			type;
};
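/*
 * Sketch of how the variable length fields are walked (illustrative only - the
 * real code uses the vstruct helpers): u64s counts the entire field, header
 * included, so fields are laid out consecutively and a reader advances by
 * u64s * 8 bytes:
 *
 *	struct bch_sb_field *f = sb->start;
 *
 *	while ((void *) f < (void *) sb->start +
 *			__le32_to_cpu(sb->u64s) * sizeof(__u64)) {
 *		unsigned type = __le32_to_cpu(f->type);
 *
 *		(dispatch on type here)
 *		f = (void *) f + __le32_to_cpu(f->u64s) * sizeof(__u64);
 *	}
 */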

#define BCH_SB_FIELDS()				\
	x(journal,			0)	\
	x(members_v1,			1)	\
	x(crypt,			2)	\
	x(replicas_v0,			3)	\
	x(quota,			4)	\
	x(disk_groups,			5)	\
	x(clean,			6)	\
	x(replicas,			7)	\
	x(journal_seq_blacklist,	8)	\
	x(journal_v2,			9)	\
	x(counters,			10)	\
	x(members_v2,			11)	\
	x(errors,			12)	\
	x(ext,				13)	\
	x(downgrade,			14)

#include "alloc_background_format.h"
#include "extents_format.h"
#include "reflink_format.h"
#include "ec_format.h"
#include "inode_format.h"
#include "dirent_format.h"
#include "xattr_format.h"
#include "quota_format.h"
#include "logged_ops_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"

enum bch_sb_field_type {
#define x(f, nr)	BCH_SB_FIELD_##f = nr,
	BCH_SB_FIELDS()
#undef x
	BCH_SB_FIELD_NR
};

/*
 * Most superblock fields are replicated in all device's superblocks - a few are
 * not:
 */
#define BCH_SINGLE_DEVICE_SB_FIELDS		\
	((1U << BCH_SB_FIELD_journal)|		\
	 (1U << BCH_SB_FIELD_journal_v2))

/* BCH_SB_FIELD_journal: */

struct bch_sb_field_journal {
	struct bch_sb_field	field;
	__le64			buckets[];
};

struct bch_sb_field_journal_v2 {
	struct bch_sb_field	field;

	struct bch_sb_field_journal_v2_entry {
		__le64		start;
		__le64		nr;
	}			d[];
};

/* BCH_SB_FIELD_members_v1: */

#define BCH_MIN_NR_NBUCKETS	(1 << 6)

#define BCH_IOPS_MEASUREMENTS()			\
	x(seqread,	0)			\
	x(seqwrite,	1)			\
	x(randread,	2)			\
	x(randwrite,	3)

enum bch_iops_measurement {
#define x(t, n) BCH_IOPS_##t = n,
	BCH_IOPS_MEASUREMENTS()
#undef x
	BCH_IOPS_NR
};

#define BCH_MEMBER_ERROR_TYPES()		\
	x(read,		0)			\
	x(write,	1)			\
	x(checksum,	2)

enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
	BCH_MEMBER_ERROR_TYPES()
#undef x
	BCH_MEMBER_ERROR_NR
};

struct bch_member {
	__uuid_t		uuid;
	__le64			nbuckets;	/* device size */
	__le16			first_bucket;	/* index of first bucket used */
	__le16			bucket_size;	/* sectors */
	__u8			btree_bitmap_shift;
	__u8			pad[3];
	__le64			last_mount;	/* time_t */

	__le64			flags;
	__le32			iops[4];
	__le64			errors[BCH_MEMBER_ERROR_NR];
	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
	__le64			errors_reset_time;
	__le64			seq;
	__le64			btree_allocated_bitmap;
};

#define BCH_MEMBER_V1_BYTES	56

LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags,  0,  4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags, 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags, 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
					struct bch_member, flags, 30, 31)

#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif

#define BCH_MEMBER_STATES()			\
	x(rw,		0)			\
	x(ro,		1)			\
	x(failed,	2)			\
	x(spare,	3)

enum bch_member_state {
#define x(t, n) BCH_MEMBER_STATE_##t = n,
	BCH_MEMBER_STATES()
#undef x
	BCH_MEMBER_STATE_NR
};

struct bch_sb_field_members_v1 {
	struct bch_sb_field	field;
	struct bch_member	_members[]; //Members are now variable size
};

struct bch_sb_field_members_v2 {
	struct bch_sb_field	field;
	__le16			member_bytes; //size of single member entry
	u8			pad[6];
	struct bch_member	_members[];
};

/* BCH_SB_FIELD_crypt: */

struct nonce {
	__le32			d[4];
};

struct bch_key {
	__le64			key[4];
};

#define BCH_KEY_MAGIC						\
	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|		\
	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|		\
	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|		\
	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))
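/*
 * i.e. the bytes "bch**key" read as a little-endian integer - an unencrypted
 * key field is recognizable by that ASCII string in a hexdump.
 */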

struct bch_encrypted_key {
	__le64			magic;
	struct bch_key		key;
};

/*
 * If this field is present in the superblock, it stores an encryption key which
 * is used to encrypt all other data/metadata. The key will normally be encrypted
 * with the key userspace provides, but if encryption has been turned off we'll
 * just store the master key unencrypted in the superblock so we can access the
 * previously encrypted data.
 */
struct bch_sb_field_crypt {
	struct bch_sb_field	field;

	__le64			flags;
	__le64			kdf_flags;
	struct bch_encrypted_key key;
};

LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);

enum bch_kdf_types {
	BCH_KDF_SCRYPT		= 0,
	BCH_KDF_NR		= 1,
};

/* stored as base 2 log of scrypt params: */
LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

/* BCH_SB_FIELD_replicas: */

#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)

enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};

static inline bool data_type_is_empty(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		return true;
	default:
		return false;
	}
}

static inline bool data_type_is_hidden(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_sb:
	case BCH_DATA_journal:
		return true;
	default:
		return false;
	}
}

struct bch_replicas_entry_v0 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			devs[];
} __packed;

struct bch_sb_field_replicas_v0 {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);

struct bch_replicas_entry_v1 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			nr_required;
	__u8			devs[];
} __packed;

#define replicas_entry_bytes(_i)					\
	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
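/*
 * Example: a v1 entry describing user data on three devices has
 * replicas_entry_bytes() == 3 + 3 == 6 (data_type, nr_devs and nr_required,
 * plus one byte per device index); entries are packed back to back with no
 * padding.
 */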

struct bch_sb_field_replicas {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);

/* BCH_SB_FIELD_disk_groups: */

#define BCH_SB_LABEL_SIZE		32

struct bch_disk_group {
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			flags[2];
} __packed __aligned(8);

LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)

struct bch_sb_field_disk_groups {
	struct bch_sb_field	field;
	struct bch_disk_group	entries[];
} __packed __aligned(8);

/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
 */
struct jset_entry {
	__le16			u64s;
	__u8			btree_id;
	__u8			level;
	__u8			type; /* designates what this jset holds */
	__u8			pad[3];

	struct bkey_i		start[0];
	__u64			_data[];
};

struct bch_sb_field_clean {
	struct bch_sb_field	field;

	__le32			flags;
	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;
	__le64			journal_seq;

	struct jset_entry	start[0];
	__u64			_data[];
};

struct journal_seq_blacklist_entry {
	__le64			start;
	__le64			end;
};

struct bch_sb_field_journal_seq_blacklist {
	struct bch_sb_field	field;
	struct journal_seq_blacklist_entry start[];
};

struct bch_sb_field_errors {
	struct bch_sb_field	field;
	struct bch_sb_field_error_entry {
		__le64		v;
		__le64		last_error_time;
	}			entries[];
};

LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,	struct bch_sb_field_error_entry, v,  0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,	struct bch_sb_field_error_entry, v, 16, 64);

struct bch_sb_field_ext {
	struct bch_sb_field	field;
	__le64			recovery_passes_required[2];
	__le64			errors_silent[8];
	__le64			btrees_lost_data;
};

struct bch_sb_field_downgrade_entry {
	__le16			version;
	__le64			recovery_passes[2];
	__le16			nr_errors;
	__le16			errors[] __counted_by(nr_errors);
} __packed __aligned(2);

struct bch_sb_field_downgrade {
	struct bch_sb_field	field;
	struct bch_sb_field_downgrade_entry entries[];
};

/* Superblock: */

/*
 * New versioning scheme:
 * One common version number for all on disk data structures - superblock, btree
 * nodes, journal entries
 */
#define BCH_VERSION_MAJOR(_v)		((__u16) ((_v) >> 10))
#define BCH_VERSION_MINOR(_v)		((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor)	(((_major) << 10)|(_minor) << 0)
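/*
 * Example: BCH_VERSION(1, 7) == (1 << 10)|7 == 1031; BCH_VERSION_MINOR()
 * recovers the low 10 bits and BCH_VERSION_MAJOR() the bits above them.
 */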

/*
 * field 1:	version name
 * field 2:	BCH_VERSION(major, minor)
 * field 3:	recovery passes required on upgrade
 */
#define BCH_METADATA_VERSIONS()						\
	x(bkey_renumber,		BCH_VERSION(0, 10))		\
	x(inode_btree_change,		BCH_VERSION(0, 11))		\
	x(snapshot,			BCH_VERSION(0, 12))		\
	x(inode_backpointers,		BCH_VERSION(0, 13))		\
	x(btree_ptr_sectors_written,	BCH_VERSION(0, 14))		\
	x(snapshot_2,			BCH_VERSION(0, 15))		\
	x(reflink_p_fix,		BCH_VERSION(0, 16))		\
	x(subvol_dirent,		BCH_VERSION(0, 17))		\
	x(inode_v2,			BCH_VERSION(0, 18))		\
	x(freespace,			BCH_VERSION(0, 19))		\
	x(alloc_v4,			BCH_VERSION(0, 20))		\
	x(new_data_types,		BCH_VERSION(0, 21))		\
	x(backpointers,			BCH_VERSION(0, 22))		\
	x(inode_v3,			BCH_VERSION(0, 23))		\
	x(unwritten_extents,		BCH_VERSION(0, 24))		\
	x(bucket_gens,			BCH_VERSION(0, 25))		\
	x(lru_v2,			BCH_VERSION(0, 26))		\
	x(fragmentation_lru,		BCH_VERSION(0, 27))		\
	x(no_bps_in_alloc_keys,		BCH_VERSION(0, 28))		\
	x(snapshot_trees,		BCH_VERSION(0, 29))		\
	x(major_minor,			BCH_VERSION(1,  0))		\
	x(snapshot_skiplists,		BCH_VERSION(1,  1))		\
	x(deleted_inodes,		BCH_VERSION(1,  2))		\
	x(rebalance_work,		BCH_VERSION(1,  3))		\
	x(member_seq,			BCH_VERSION(1,  4))		\
	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
	x(mi_btree_bitmap,		BCH_VERSION(1,  7))

enum bcachefs_metadata_version {
	bcachefs_metadata_version_min = 9,
#define x(t, n)	bcachefs_metadata_version_##t = n,
	BCH_METADATA_VERSIONS()
#undef x
	bcachefs_metadata_version_max
};

static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;

#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)

#define BCH_SB_SECTOR			8
#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */

struct bch_sb_layout {
	__uuid_t		magic;	/* bcachefs superblock UUID */
	__u8			layout_type;
	__u8			sb_max_size_bits; /* base 2 of 512 byte sectors */
	__u8			nr_superblocks;
	__u8			pad[5];
	__le64			sb_offset[61];
} __packed __aligned(8);
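/*
 * Example: sb_max_size_bits == 11 allows each superblock to grow to
 * 512 << 11 == 1MB; sb_offset[] holds the sector of every superblock copy,
 * nr_superblocks of them.
 */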

#define BCH_SB_LAYOUT_SECTOR	7

/*
 * @offset	- sector where this sb was written
 * @version	- on disk format version
 * @version_min	- Oldest metadata version this filesystem contains; so we can
 *		  safely drop compatibility code and refuse to mount filesystems
 *		  we'd need it for
 * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
 * @uuid	- used for generating various magic numbers and identifying
 *		  member devices, never changes
 * @user_uuid	- user visible UUID, may be changed
 * @label	- filesystem label
 * @seq		- identifies most recent superblock, incremented each time
 *		  superblock is written
 * @features	- enabled incompatible features
 */
struct bch_sb {
	struct bch_csum		csum;
	__le16			version;
	__le16			version_min;
	__le16			pad[2];
	__uuid_t		magic;
	__uuid_t		uuid;
	__uuid_t		user_uuid;
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			offset;
	__le64			seq;

	__le16			block_size;
	__u8			dev_idx;
	__u8			nr_devices;
	__le32			u64s;

	__le64			time_base_lo;
	__le32			time_base_hi;
	__le32			time_precision;

	__le64			flags[7];
	__le64			write_time;
	__le64			features[2];
	__le64			compat[2];

	struct bch_sb_layout	layout;

	struct bch_sb_field	start[0];
	__le64			_data[];
} __packed __aligned(8);

/*
 * Flags:
 * BCH_SB_INITIALIZED	- set on first mount
 * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
 *			  behaviour of mount/recovery path:
 * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
 * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
 * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
 *			  DATA/META_CSUM_TYPE. Also indicates encryption
 *			  algorithm in use, if/when we get more than one
 */

LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size, 0, 16);

LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);

LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);

LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);

LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);

LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);

LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);

LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);

LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);

LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);

LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);

/*
 * Max size of an extent that may require bouncing to read or write
 * (checksummed, compressed): 64k
 */
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
					struct bch_sb, flags[1], 14, 20);

LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);

LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);

LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
					struct bch_sb, flags[2],  0,  4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);

LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);

LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
					struct bch_sb, flags[4], 60, 64);

LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
					struct bch_sb, flags[5],  0, 16);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
}

static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
		(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
}

/*
 * Features:
 *
 * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
 * reflink:			gates KEY_TYPE_reflink
 * inline_data:			gates KEY_TYPE_inline_data
 * new_siphash:			gates BCH_STR_HASH_siphash
 * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
 */
#define BCH_SB_FEATURES()			\
	x(lz4,				0)	\
	x(gzip,				1)	\
	x(zstd,				2)	\
	x(atomic_nlink,			3)	\
	x(ec,				4)	\
	x(journal_seq_blacklist_v3,	5)	\
	x(reflink,			6)	\
	x(new_siphash,			7)	\
	x(inline_data,			8)	\
	x(new_extent_overwrite,		9)	\
	x(incompressible,		10)	\
	x(btree_ptr_v2,			11)	\
	x(extents_above_btree_updates,	12)	\
	x(btree_updates_journalled,	13)	\
	x(reflink_inline_data,		14)	\
	x(new_varint,			15)	\
	x(journal_no_flush,		16)	\
	x(alloc_v2,			17)	\
	x(extents_across_btree_nodes,	18)

#define BCH_SB_FEATURES_ALWAYS				\
	((1ULL << BCH_FEATURE_new_extent_overwrite)|	\
	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
	 (1ULL << BCH_FEATURE_btree_updates_journalled)|\
	 (1ULL << BCH_FEATURE_alloc_v2)|\
	 (1ULL << BCH_FEATURE_extents_across_btree_nodes))

#define BCH_SB_FEATURES_ALL				\
	(BCH_SB_FEATURES_ALWAYS|			\
	 (1ULL << BCH_FEATURE_new_siphash)|		\
	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
	 (1ULL << BCH_FEATURE_new_varint)|		\
	 (1ULL << BCH_FEATURE_journal_no_flush))

enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
	BCH_SB_FEATURES()
#undef x
	BCH_FEATURE_NR,
};

#define BCH_SB_COMPAT()					\
	x(alloc_info,				0)	\
	x(alloc_metadata,			1)	\
	x(extents_above_btree_updates_done,	2)	\
	x(bformat_overflow_done,		3)

enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
	BCH_SB_COMPAT()
#undef x
	BCH_COMPAT_NR,
};

/* options: */

#define BCH_VERSION_UPGRADE_OPTS()	\
	x(compatible,		0)	\
	x(incompatible,		1)	\
	x(none,			2)

enum bch_version_upgrade_opts {
#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
	BCH_VERSION_UPGRADE_OPTS()
#undef x
};

#define BCH_REPLICAS_MAX		4U

#define BCH_BKEY_PTRS_MAX		16U

#define BCH_ERROR_ACTIONS()		\
	x(continue,		0)	\
	x(ro,			1)	\
	x(panic,		2)

enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
	BCH_ERROR_ACTIONS()
#undef x
	BCH_ON_ERROR_NR
};

#define BCH_STR_HASH_TYPES()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash_old,		2)	\
	x(siphash,		3)

enum bch_str_hash_type {
#define x(t, n) BCH_STR_HASH_##t = n,
	BCH_STR_HASH_TYPES()
#undef x
	BCH_STR_HASH_NR
};

#define BCH_STR_HASH_OPTS()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash,		2)

enum bch_str_hash_opts {
#define x(t, n) BCH_STR_HASH_OPT_##t = n,
	BCH_STR_HASH_OPTS()
#undef x
	BCH_STR_HASH_OPT_NR
};

#define BCH_CSUM_TYPES()			\
	x(none,				0)	\
	x(crc32c_nonzero,		1)	\
	x(crc64_nonzero,		2)	\
	x(chacha20_poly1305_80,		3)	\
	x(chacha20_poly1305_128,	4)	\
	x(crc32c,			5)	\
	x(crc64,			6)	\
	x(xxhash,			7)

enum bch_csum_type {
#define x(t, n) BCH_CSUM_##t = n,
	BCH_CSUM_TYPES()
#undef x
	BCH_CSUM_NR
};

static const __maybe_unused unsigned bch_crc_bytes[] = {
	[BCH_CSUM_none]				= 0,
	[BCH_CSUM_crc32c_nonzero]		= 4,
	[BCH_CSUM_crc32c]			= 4,
	[BCH_CSUM_crc64_nonzero]		= 8,
	[BCH_CSUM_crc64]			= 8,
	[BCH_CSUM_xxhash]			= 8,
	[BCH_CSUM_chacha20_poly1305_80]		= 10,
	[BCH_CSUM_chacha20_poly1305_128]	= 16,
};

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
	switch (type) {
	case BCH_CSUM_chacha20_poly1305_80:
	case BCH_CSUM_chacha20_poly1305_128:
		return true;
	default:
		return false;
	}
}

#define BCH_CSUM_OPTS()			\
	x(none,			0)	\
	x(crc32c,		1)	\
	x(crc64,		2)	\
	x(xxhash,		3)

enum bch_csum_opts {
#define x(t, n) BCH_CSUM_OPT_##t = n,
	BCH_CSUM_OPTS()
#undef x
	BCH_CSUM_OPT_NR
};

#define BCH_COMPRESSION_TYPES()		\
	x(none,			0)	\
	x(lz4_old,		1)	\
	x(gzip,			2)	\
	x(lz4,			3)	\
	x(zstd,			4)	\
	x(incompressible,	5)

enum bch_compression_type {
#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
	BCH_COMPRESSION_TYPES()
#undef x
	BCH_COMPRESSION_TYPE_NR
};

#define BCH_COMPRESSION_OPTS()		\
	x(none,		0)		\
	x(lz4,		1)		\
	x(gzip,		2)		\
	x(zstd,		3)

enum bch_compression_opts {
#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
	BCH_COMPRESSION_OPTS()
#undef x
	BCH_COMPRESSION_OPT_NR
};

/*
 * Magic numbers
 *
 * The various other data structures have their own magic numbers, which are
 * xored with the first part of the cache set's UUID
 */

#define BCACHE_MAGIC							\
	UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,				\
		  0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
#define BCHFS_MAGIC							\
	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)

#define BCACHEFS_STATFS_MAGIC		0xca451a4e

#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)

static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{
	__le64 ret;

	memcpy(&ret, &sb->uuid, sizeof(ret));
	return ret;
}

static inline __u64 __jset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
}

static inline __u64 __bset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
}
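/*
 * So every filesystem gets its own journal/btree magic numbers - the first 8
 * bytes of its internal uuid, xored with the global JSET_MAGIC/BSET_MAGIC
 * constants - and stale metadata from a different filesystem on the same
 * device won't be mistaken for ours.
 */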

/* Journal */

#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))

#define BCH_JSET_ENTRY_TYPES()			\
	x(btree_keys,		0)		\
	x(btree_root,		1)		\
	x(prio_ptrs,		2)		\
	x(blacklist,		3)		\
	x(blacklist_v2,		4)		\
	x(usage,		5)		\
	x(data_usage,		6)		\
	x(clock,		7)		\
	x(dev_usage,		8)		\
	x(log,			9)		\
	x(overwrite,		10)		\
	x(write_buffer_keys,	11)		\
	x(datetime,		12)

enum bch_jset_entry_type {
#define x(f, nr)	BCH_JSET_ENTRY_##f = nr,
	BCH_JSET_ENTRY_TYPES()
#undef x
	BCH_JSET_ENTRY_NR
};

static inline bool jset_entry_is_key(struct jset_entry *e)
{
	switch (e->type) {
	case BCH_JSET_ENTRY_btree_keys:
	case BCH_JSET_ENTRY_btree_root:
	case BCH_JSET_ENTRY_overwrite:
	case BCH_JSET_ENTRY_write_buffer_keys:
		return true;
	}

	return false;
}

/*
 * Journal sequence numbers can be blacklisted: bsets record the max sequence
 * number of all the journal entries they contain updates for, so that on
 * recovery we can ignore those bsets that contain index updates newer than what
 * made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it, and
 * then record that we skipped it so that the next time we crash and recover we
 * don't think there was a missing journal entry.
 */
struct jset_entry_blacklist {
	struct jset_entry	entry;
	__le64			seq;
};

struct jset_entry_blacklist_v2 {
	struct jset_entry	entry;
	__le64			start;
	__le64			end;
};

#define BCH_FS_USAGE_TYPES()			\
	x(reserved,		0)		\
	x(inodes,		1)		\
	x(key_version,		2)

enum bch_fs_usage_type {
#define x(f, nr)	BCH_FS_USAGE_##f = nr,
	BCH_FS_USAGE_TYPES()
#undef x
	BCH_FS_USAGE_NR
};

struct jset_entry_usage {
	struct jset_entry	entry;
	__le64			v;
} __packed;

struct jset_entry_data_usage {
	struct jset_entry	entry;
	__le64			v;
	struct bch_replicas_entry_v1 r;
} __packed;

struct jset_entry_clock {
	struct jset_entry	entry;
	__u8			rw;
	__u8			pad[7];
	__le64			time;
} __packed;

struct jset_entry_dev_usage_type {
	__le64			buckets;
	__le64			sectors;
	__le64			fragmented;
} __packed;

struct jset_entry_dev_usage {
	struct jset_entry	entry;
	__le32			dev;
	__u32			pad;

	__le64			_buckets_ec;		/* No longer used */
	__le64			_buckets_unavailable;	/* No longer used */

	struct jset_entry_dev_usage_type d[];
};

static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
{
	return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
		sizeof(struct jset_entry_dev_usage_type);
}

struct jset_entry_log {
	struct jset_entry	entry;
	u8			d[];
} __packed __aligned(8);

struct jset_entry_datetime {
	struct jset_entry	entry;
	__le64			seconds;
} __packed __aligned(8);

/*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 */
struct jset {
	struct bch_csum		csum;

	__le64			magic;
	__le64			seq;
	__le32			version;
	__le32			flags;

	__le32			u64s; /* size of d[] in u64s */

	__u8			encrypted_start[0];

	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;

	/* Sequence number of oldest dirty journal entry */
	__le64			last_seq;

	struct jset_entry	start[0];
	__u64			_data[];
} __packed __aligned(8);

LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);

#define BCH_JOURNAL_BUCKETS_MIN		8

/* Btree: */

enum btree_id_flags {
	BTREE_ID_EXTENTS	= BIT(0),
	BTREE_ID_SNAPSHOTS	= BIT(1),
	BTREE_ID_SNAPSHOT_FIELD	= BIT(2),
	BTREE_ID_DATA		= BIT(3),
};

#define BCH_BTREE_IDS()								\
	x(extents,		0,	BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_error)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_extent)|						\
	  BIT_ULL(KEY_TYPE_reservation)|					\
	  BIT_ULL(KEY_TYPE_reflink_p)|						\
	  BIT_ULL(KEY_TYPE_inline_data))					\
	x(inodes,		1,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_inode)|						\
	  BIT_ULL(KEY_TYPE_inode_v2)|						\
	  BIT_ULL(KEY_TYPE_inode_v3)|						\
	  BIT_ULL(KEY_TYPE_inode_generation))					\
	x(dirents,		2,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_dirent))						\
	x(xattrs,		3,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_xattr))						\
	x(alloc,		4,	0,					\
	  BIT_ULL(KEY_TYPE_alloc)|						\
	  BIT_ULL(KEY_TYPE_alloc_v2)|						\
	  BIT_ULL(KEY_TYPE_alloc_v3)|						\
	  BIT_ULL(KEY_TYPE_alloc_v4))						\
	x(quotas,		5,	0,					\
	  BIT_ULL(KEY_TYPE_quota))						\
	x(stripes,		6,	0,					\
	  BIT_ULL(KEY_TYPE_stripe))						\
	x(reflink,		7,	BTREE_ID_EXTENTS|BTREE_ID_DATA,		\
	  BIT_ULL(KEY_TYPE_reflink_v)|						\
	  BIT_ULL(KEY_TYPE_indirect_inline_data)|				\
	  BIT_ULL(KEY_TYPE_error))						\
	x(subvolumes,		8,	0,					\
	  BIT_ULL(KEY_TYPE_subvolume))						\
	x(snapshots,		9,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot))						\
	x(lru,			10,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(freespace,		11,	BTREE_ID_EXTENTS,			\
	  BIT_ULL(KEY_TYPE_set))						\
	x(need_discard,		12,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(backpointers,		13,	0,					\
	  BIT_ULL(KEY_TYPE_backpointer))					\
	x(bucket_gens,		14,	0,					\
	  BIT_ULL(KEY_TYPE_bucket_gens))					\
	x(snapshot_trees,	15,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set))						\
	x(logged_ops,		17,	0,					\
	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
	  BIT_ULL(KEY_TYPE_logged_op_finsert))					\
	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))			\
	x(subvolume_children,	19,	0,					\
	  BIT_ULL(KEY_TYPE_set))

enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
	BCH_BTREE_IDS()
#undef x
	BTREE_ID_NR
};

static inline bool btree_id_is_alloc(enum btree_id id)
{
	switch (id) {
	case BTREE_ID_alloc:
	case BTREE_ID_backpointers:
	case BTREE_ID_need_discard:
	case BTREE_ID_freespace:
	case BTREE_ID_bucket_gens:
		return true;
	default:
		return false;
	}
}

#define BTREE_MAX_DEPTH		4U

/* Btree nodes */

/*
 * Btree nodes
 *
 * On disk a btree node is a list/log of these; within each set the keys are
 * sorted
 */
struct bset {
	__le64			seq;

	/*
	 * Highest journal entry this bset contains keys for.
	 * If on recovery we don't see that journal entry, this bset is ignored:
	 * this allows us to preserve the order of all index updates after a
	 * crash, since the journal records a total order of all index updates
	 * and anything that didn't make it to the journal doesn't get used.
	 */
	__le64			journal_seq;

	__le32			flags;
	__le16			version;
	__le16			u64s; /* count of d[] in u64s */

	struct bkey_packed	start[0];
	__u64			_data[];
} __packed __aligned(8);
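/*
 * Keys within a bset can only be walked sequentially - bkeys are variable
 * length, so there's no random access (see the comment at the top of this
 * file). A sketch, given a struct bset *i (the real code has iteration
 * helpers for this):
 *
 *	struct bkey_packed *k = i->start;
 *
 *	while ((void *) k < (void *) i->start +
 *			__le16_to_cpu(i->u64s) * sizeof(__u64)) {
 *		(process k)
 *		k = (void *) k + k->u64s * sizeof(__u64);
 *	}
 */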

LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);

LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
				struct bset, flags, 5, 6);

/* Sector offset within the btree node: */
LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);

struct btree_node {
	struct bch_csum		csum;
	__le64			magic;

	/* this flags field is encrypted, unlike bset->flags: */
	__le64			flags;

	/* Closed interval: */
	struct bpos		min_key;
	struct bpos		max_key;
	struct bch_extent_ptr	_ptr; /* not used anymore */
	struct bkey_format	format;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
				struct btree_node, flags,  8,  9);
LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
/* 25-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);

static inline __u64 BTREE_NODE_ID(struct btree_node *n)
{
	return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
}

static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
{
	SET_BTREE_NODE_ID_LO(n, v);
	SET_BTREE_NODE_ID_HI(n, v >> 4);
}

struct btree_node_entry {
	struct bch_csum		csum;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

#endif /* _BCACHEFS_FORMAT_H */