1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _BCACHEFS_EXTENTS_FORMAT_H |
3 | #define _BCACHEFS_EXTENTS_FORMAT_H |
4 | |
5 | /* |
6 | * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally |
7 | * preceded by checksum/compression information (bch_extent_crc32 or |
8 | * bch_extent_crc64). |
9 | * |
10 | * One major determining factor in the format of extents is how we handle and |
11 | * represent extents that have been partially overwritten and thus trimmed: |
12 | * |
13 | * If an extent is not checksummed or compressed, when the extent is trimmed we |
14 | * don't have to remember the extent we originally allocated and wrote: we can |
15 | * merely adjust ptr->offset to point to the start of the data that is currently |
16 | * live. The size field in struct bkey records the current (live) size of the |
17 | * extent, and is also used to mean "size of region on disk that we point to" in |
18 | * this case. |
19 | * |
20 | * Thus an extent that is not checksummed or compressed will consist only of a |
21 | * list of bch_extent_ptrs, with none of the fields in |
22 | * bch_extent_crc32/bch_extent_crc64. |
23 | * |
24 | * When an extent is checksummed or compressed, it's not possible to read only |
25 | * the data that is currently live: we have to read the entire extent that was |
26 | * originally written, and then return only the part of the extent that is |
27 | * currently live. |
28 | * |
29 | * Thus, in addition to the current size of the extent in struct bkey, we need |
30 | * to store the size of the originally allocated space - this is the |
31 | * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, |
32 | * when the extent is trimmed, instead of modifying the offset field of the |
33 | * pointer, we keep a second smaller offset field - "offset into the original |
34 | * extent of the currently live region". |
35 | * |
36 | * The other major determining factor is replication and data migration: |
37 | * |
38 | * Each pointer may have its own bch_extent_crc32/64. When doing a replicated |
39 | * write, we will initially write all the replicas in the same format, with the |
40 | * same checksum type and compression format - however, when copygc runs later (or |
41 | * tiering/cache promotion, anything that moves data), it is not in general |
42 | * going to rewrite all the pointers at once - one of the replicas may be in a |
43 | * bucket on one device that has very little fragmentation while another lives |
44 | * in a bucket that has become heavily fragmented, and thus is being rewritten |
45 | * sooner than the rest. |
46 | * |
47 | * Thus it will only move a subset of the pointers (or in the case of |
48 | * tiering/cache promotion perhaps add a single pointer without dropping any |
49 | * current pointers), and if the extent has been partially overwritten it must |
50 | * write only the currently live portion (or copygc would not be able to reduce |
51 | * fragmentation!) - which necessitates a different bch_extent_crc format for |
52 | * the new pointer. |
53 | * |
54 | * But in the interests of space efficiency, we don't want to store one |
55 | * bch_extent_crc for each pointer if we don't have to. |
56 | * |
57 | * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and |
58 | * bch_extent_ptrs appended arbitrarily one after the other. We determine the |
59 | * type of a given entry with a scheme similar to utf8 (except we're encoding a |
60 | * type, not a size), encoding the type in the position of the first set bit: |
61 | * |
62 | * bch_extent_crc32 - 0b1 |
63 | * bch_extent_ptr - 0b10 |
64 | * bch_extent_crc64 - 0b100 |
65 | * |
66 | * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and |
67 | * bch_extent_crc64 is the least constrained). |
68 | * |
69 | * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, |
70 | * until the next bch_extent_crc32/64. |
71 | * |
72 | * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer |
73 | * is neither checksummed nor compressed. |
74 | */ |
75 | |
/*
 * x-macro list of extent entry types: x(name, nr).
 *
 * Expanded below to generate enum bch_extent_entry_type, and again in
 * union bch_extent_entry to declare one member per entry type.
 */
#define BCH_EXTENT_ENTRY_TYPES()	\
	x(ptr,		0)		\
	x(crc32,	1)		\
	x(crc64,	2)		\
	x(crc128,	3)		\
	x(stripe_ptr,	4)		\
	x(rebalance,	5)
/* Number of entries in the list above; keep in sync when adding types: */
#define BCH_EXTENT_ENTRY_MAX	6
84 | |
/* Integer tags (BCH_EXTENT_ENTRY_ptr, BCH_EXTENT_ENTRY_crc32, ...): */
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};
90 | |
/*
 * Smallest (most bit-constrained) crc entry variant; only a 32 bit checksum,
 * no nonce.
 *
 * Compressed/uncompressed size are stored biased by 1 (a zero-size extent
 * cannot exist, so the bias gains one extra representable size).
 */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u32 type:2,			/* entry type tag; see first-set-bit encoding above */
	_compressed_size:7,		/* size of region on disk, biased by 1 */
	_uncompressed_size:7,		/* size of original (uncompressed) data, biased by 1 */
	offset:7,			/* offset of live region within the original extent */
	_unused:1,
	csum_type:4,			/* checksum algorithm */
	compression_type:4;		/* compression algorithm */
	__u32 csum;			/* checksum of the originally written extent */
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches LE: */
	__u32 csum;
	__u32 compression_type:4,
	csum_type:4,
	_unused:1,
	offset:7,
	_uncompressed_size:7,
	_compressed_size:7,
	type:2;
#endif
} __packed __aligned(8);

/* Limits implied by the 7 bit size fields; no nonce field exists here: */
#define CRC32_SIZE_MAX (1U << 7)
#define CRC32_NONCE_MAX 0
116 | |
/*
 * Mid-size crc entry: 9 bit sizes/offset, 10 bit nonce, and an 80 bit
 * checksum split across csum_hi (16 bits) and csum_lo (64 bits).
 */
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:3,			/* entry type tag; see first-set-bit encoding above */
	_compressed_size:9,		/* size of region on disk, biased by 1 */
	_uncompressed_size:9,		/* size of original (uncompressed) data, biased by 1 */
	offset:9,			/* offset of live region within the original extent */
	nonce:10,			/* nonce, presumably for encryption -- TODO confirm */
	csum_type:4,			/* checksum algorithm */
	compression_type:4,		/* compression algorithm */
	csum_hi:16;			/* high 16 bits of the checksum */
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches LE: */
	__u64 csum_hi:16,
	compression_type:4,
	csum_type:4,
	nonce:10,
	offset:9,
	_uncompressed_size:9,
	_compressed_size:9,
	type:3;
#endif
	__u64 csum_lo;			/* low 64 bits of the checksum */
} __packed __aligned(8);

/* Limits implied by the 9 bit size and 10 bit nonce fields: */
#define CRC64_SIZE_MAX (1U << 9)
#define CRC64_NONCE_MAX ((1U << 10) - 1)
142 | |
/*
 * Largest (least constrained) crc entry: 13 bit sizes/offset/nonce, and a
 * full struct bch_csum for the checksum.
 */
struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:4,			/* entry type tag; see first-set-bit encoding above */
	_compressed_size:13,		/* size of region on disk, biased by 1 */
	_uncompressed_size:13,		/* size of original (uncompressed) data, biased by 1 */
	offset:13,			/* offset of live region within the original extent */
	nonce:13,			/* nonce, presumably for encryption -- TODO confirm */
	csum_type:4,			/* checksum algorithm */
	compression_type:4;		/* compression algorithm */
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches LE: */
	__u64 compression_type:4,
	csum_type:4,
	nonce:13,
	offset:13,
	_uncompressed_size:13,
	_compressed_size:13,
	type:4;
#endif
	struct bch_csum csum;		/* full-width checksum */
} __packed __aligned(8);

/* Limits implied by the 13 bit size and nonce fields: */
#define CRC128_SIZE_MAX (1U << 13)
#define CRC128_NONCE_MAX ((1U << 13) - 1)
166 | |
167 | /* |
168 | * @reservation - pointer hasn't been written to, just reserved |
169 | */ |
170 | struct bch_extent_ptr { |
171 | #if defined(__LITTLE_ENDIAN_BITFIELD) |
172 | __u64 type:1, |
173 | cached:1, |
174 | unused:1, |
175 | unwritten:1, |
176 | offset:44, /* 8 petabytes */ |
177 | dev:8, |
178 | gen:8; |
179 | #elif defined (__BIG_ENDIAN_BITFIELD) |
180 | __u64 gen:8, |
181 | dev:8, |
182 | offset:44, |
183 | unwritten:1, |
184 | unused:1, |
185 | cached:1, |
186 | type:1; |
187 | #endif |
188 | } __packed __aligned(8); |
189 | |
/*
 * Erasure-coding entry: links the extent to a stripe.
 */
struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:5,			/* entry type tag; see first-set-bit encoding above */
	block:8,			/* presumably block index within the stripe -- TODO confirm */
	redundancy:4,			/* presumably nr of redundant (parity) blocks -- TODO confirm */
	idx:47;				/* stripe index */
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches LE: */
	__u64 idx:47,
	redundancy:4,
	block:8,
	type:5;
#endif
};
203 | |
/*
 * Rebalance hint entry: records target/compression options for background
 * data movement -- NOTE(review): exact consumer semantics live outside this
 * header, confirm against the rebalance code.
 */
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64 type:6,			/* entry type tag; see first-set-bit encoding above */
	unused:34,
	compression:8, /* enum bch_compression_opt */
	target:16;			/* presumably a target/disk-group id -- TODO confirm */
#elif defined (__BIG_ENDIAN_BITFIELD)
	/* Same fields, reversed so the on-disk bit layout matches LE: */
	__u64 target:16,
	compression:8,
	unused:34,
	type:6;
#endif
};
217 | |
/*
 * Union of all extent entry types.  The 'type' word overlays the low bits of
 * whichever entry is present, so the entry kind can be decoded from the
 * position of the first set bit without knowing the type in advance.  On
 * 32-bit big-endian the low-order word of the 64 bit entry is the second
 * long, hence the pad.
 */
union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
	unsigned long type;
#elif __BITS_PER_LONG == 32
	struct {
		unsigned long pad;
		unsigned long type;
	};
#else
#error edit for your odd byteorder.
#endif

/* One member per entry type: struct bch_extent_ptr ptr; etc. */
#define x(f, n) struct bch_extent_##f f;
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};
234 | |
/*
 * Btree node pointer (v1): just a list of bch_extent_ptrs, no crc entries
 * (btree pointers don't carry checksums -- see comment further down).
 */
struct bch_btree_ptr {
	struct bch_val v;

	/* _data aliases start[] to allow raw access to the value as u64s: */
	__u64 _data[0];
	struct bch_extent_ptr start[];
} __packed __aligned(8);
241 | |
/*
 * Btree node pointer, v2: adds sequence number, sectors written, flags and
 * the minimum key of the node, followed by the list of pointers.
 */
struct bch_btree_ptr_v2 {
	struct bch_val v;

	__u64 mem_ptr;		/* NOTE(review): looks like in-memory scratch, not on-disk state -- confirm */
	__le64 seq;		/* presumably btree node sequence number -- TODO confirm */
	__le16 sectors_written;
	__le16 flags;		/* see LE16_BITMASK accessors below */
	struct bpos min_key;	/* presumably minimum key covered by the node -- TODO confirm */
	/* _data aliases start[] to allow raw access to the value as u64s: */
	__u64 _data[0];
	struct bch_extent_ptr start[];
} __packed __aligned(8);

/* Accessor for bit 0 of bch_btree_ptr_v2.flags: */
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
255 | |
/*
 * An extent value: a packed sequence of bch_extent_crc32/64/128, ptr,
 * stripe_ptr and rebalance entries (union bch_extent_entry), decoded by
 * each entry's type tag -- see the format comment at the top of this file.
 */
struct bch_extent {
	struct bch_val v;

	/* _data aliases start[] to allow raw access to the value as u64s: */
	__u64 _data[0];
	union bch_extent_entry start[];
} __packed __aligned(8);
262 | |
/*
 * Maximum size (in u64s) a single pointer could be: the largest crc entry
 * (crc128) plus the pointer it applies to.
 */
#define BKEY_EXTENT_PTR_U64s_MAX\
	((sizeof(struct bch_extent_crc128) + \
	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))

/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \
	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))

/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)

/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
	((sizeof(struct bch_btree_ptr_v2) + \
	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX \
	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
281 | |
/*
 * Reservation value: presumably accounts space for a future write (replica
 * count, no pointers yet) -- NOTE(review): confirm against users of this key
 * type.
 */
struct bch_reservation {
	struct bch_val v;

	__le32 generation;
	__u8 nr_replicas;	/* number of replicas reserved for */
	__u8 pad[3];		/* explicit padding to keep the value 8-byte sized */
} __packed __aligned(8);
289 | |
290 | struct bch_inline_data { |
291 | struct bch_val v; |
292 | u8 data[]; |
293 | }; |
294 | |
295 | #endif /* _BCACHEFS_EXTENTS_FORMAT_H */ |
296 | |