1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _BCACHEFS_H |
3 | #define _BCACHEFS_H |
4 | |
5 | /* |
6 | * SOME HIGH LEVEL CODE DOCUMENTATION: |
7 | * |
8 | * Bcache mostly works with cache sets, cache devices, and backing devices. |
9 | * |
10 | * Support for multiple cache devices hasn't quite been finished off yet, but |
 * it's about 95% plumbed through. A cache set and its cache devices are sort
 * of like an md raid array and its component devices. Most of the code doesn't
 * care about individual cache devices; the main abstraction is the cache set.
14 | * |
 * Support for multiple cache devices is intended to give us the ability to
 * mirror dirty cached data and metadata, without mirroring clean cached data.
17 | * |
18 | * Backing devices are different, in that they have a lifetime independent of a |
19 | * cache set. When you register a newly formatted backing device it'll come up |
20 | * in passthrough mode, and then you can attach and detach a backing device from |
21 | * a cache set at runtime - while it's mounted and in use. Detaching implicitly |
22 | * invalidates any cached data for that backing device. |
23 | * |
24 | * A cache set can have multiple (many) backing devices attached to it. |
25 | * |
 * There are also flash only volumes - this is the reason for the distinction
27 | * between struct cached_dev and struct bcache_device. A flash only volume |
28 | * works much like a bcache device that has a backing device, except the |
29 | * "cached" data is always dirty. The end result is that we get thin |
30 | * provisioning with very little additional code. |
31 | * |
32 | * Flash only volumes work but they're not production ready because the moving |
33 | * garbage collector needs more work. More on that later. |
34 | * |
35 | * BUCKETS/ALLOCATION: |
36 | * |
37 | * Bcache is primarily designed for caching, which means that in normal |
38 | * operation all of our available space will be allocated. Thus, we need an |
39 | * efficient way of deleting things from the cache so we can write new things to |
40 | * it. |
41 | * |
42 | * To do this, we first divide the cache device up into buckets. A bucket is the |
 * unit of allocation; they're typically around 1 MB - anywhere from 128k to 2M+
44 | * works efficiently. |
45 | * |
46 | * Each bucket has a 16 bit priority, and an 8 bit generation associated with |
47 | * it. The gens and priorities for all the buckets are stored contiguously and |
48 | * packed on disk (in a linked list of buckets - aside from the superblock, all |
49 | * of bcache's metadata is stored in buckets). |
50 | * |
51 | * The priority is used to implement an LRU. We reset a bucket's priority when |
 * we allocate it or on a cache hit, and every so often we decrement the priority
53 | * of each bucket. It could be used to implement something more sophisticated, |
54 | * if anyone ever gets around to it. |
55 | * |
56 | * The generation is used for invalidating buckets. Each pointer also has an 8 |
57 | * bit generation embedded in it; for a pointer to be considered valid, its gen |
58 | * must match the gen of the bucket it points into. Thus, to reuse a bucket all |
59 | * we have to do is increment its gen (and write its new gen to disk; we batch |
60 | * this up). |
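 *
 * A minimal sketch of that gen check, with simplified stand-in types (the
 * real pointer and bucket layouts live in bcachefs_format.h and
 * buckets_types.h, and the real check, ptr_stale(), also has to worry about
 * the gen wrapping around):
 *
 *	struct example_ptr	{ u64 offset; u8 dev; u8 gen; };
 *	struct example_bucket	{ u8 gen; };
 *
 *	static inline bool example_ptr_stale(struct example_bucket *b,
 *					     struct example_ptr *ptr)
 *	{
 *		return b->gen != ptr->gen;
 *	}
 *
 * Bumping a bucket's gen makes every existing pointer into that bucket stale
 * at once, which is what makes invalidating a bucket cheap.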
61 | * |
62 | * Bcache is entirely COW - we never write twice to a bucket, even buckets that |
63 | * contain metadata (including btree nodes). |
64 | * |
65 | * THE BTREE: |
66 | * |
 * Bcache is in large part designed around the btree.
68 | * |
69 | * At a high level, the btree is just an index of key -> ptr tuples. |
70 | * |
71 | * Keys represent extents, and thus have a size field. Keys also have a variable |
72 | * number of pointers attached to them (potentially zero, which is handy for |
73 | * invalidating the cache). |
74 | * |
75 | * The key itself is an inode:offset pair. The inode number corresponds to a |
76 | * backing device or a flash only volume. The offset is the ending offset of the |
77 | * extent within the inode - not the starting offset; this makes lookups |
78 | * slightly more convenient. |
79 | * |
80 | * Pointers contain the cache device id, the offset on that device, and an 8 bit |
81 | * generation number. More on the gen later. |
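 *
 * To illustrate the inode:offset (end of extent) convention - field names
 * here are simplified stand-ins, not the actual bkey/extent layout:
 *
 *	struct example_extent_key {
 *		u64	inode;
 *		u64	offset;		// END of the extent, in sectors
 *		u32	size;		// length, in sectors
 *	};
 *
 *	static inline u64 example_extent_start(struct example_extent_key *k)
 *	{
 *		return k->offset - k->size;	// extent covers [start, offset)
 *	}
 *
 * Indexing by end offset means a lookup only has to find the first key whose
 * offset is strictly greater than the position being searched for.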
82 | * |
83 | * Index lookups are not fully abstracted - cache lookups in particular are |
84 | * still somewhat mixed in with the btree code, but things are headed in that |
85 | * direction. |
86 | * |
87 | * Updates are fairly well abstracted, though. There are two different ways of |
88 | * updating the btree; insert and replace. |
89 | * |
90 | * BTREE_INSERT will just take a list of keys and insert them into the btree - |
91 | * overwriting (possibly only partially) any extents they overlap with. This is |
92 | * used to update the index after a write. |
93 | * |
94 | * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is |
95 | * overwriting a key that matches another given key. This is used for inserting |
96 | * data into the cache after a cache miss, and for background writeback, and for |
97 | * the moving garbage collector. |
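 *
 * Roughly, in pseudocode (this is not the actual interface):
 *
 *	BTREE_INSERT(new):		index[new.range] = new
 *	BTREE_REPLACE(old, new):	if (index[old.range] == old)
 *						index[old.range] = new
 *					else
 *						fail
 *
 * so a cache miss insertion or a data move that raced with a foreground write
 * simply fails, instead of clobbering the newer data.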
98 | * |
99 | * There is no "delete" operation; deleting things from the index is |
 * accomplished either by invalidating pointers (by incrementing a bucket's
101 | * gen) or by inserting a key with 0 pointers - which will overwrite anything |
102 | * previously present at that location in the index. |
103 | * |
104 | * This means that there are always stale/invalid keys in the btree. They're |
105 | * filtered out by the code that iterates through a btree node, and removed when |
106 | * a btree node is rewritten. |
107 | * |
108 | * BTREE NODES: |
109 | * |
110 | * Our unit of allocation is a bucket, and we can't arbitrarily allocate and |
111 | * free smaller than a bucket - so, that's how big our btree nodes are. |
112 | * |
113 | * (If buckets are really big we'll only use part of the bucket for a btree node |
114 | * - no less than 1/4th - but a bucket still contains no more than a single |
115 | * btree node. I'd actually like to change this, but for now we rely on the |
116 | * bucket's gen for deleting btree nodes when we rewrite/split a node.) |
117 | * |
118 | * Anyways, btree nodes are big - big enough to be inefficient with a textbook |
119 | * btree implementation. |
120 | * |
121 | * The way this is solved is that btree nodes are internally log structured; we |
122 | * can append new keys to an existing btree node without rewriting it. This |
123 | * means each set of keys we write is sorted, but the node is not. |
124 | * |
125 | * We maintain this log structure in memory - keeping 1Mb of keys sorted would |
126 | * be expensive, and we have to distinguish between the keys we have written and |
127 | * the keys we haven't. So to do a lookup in a btree node, we have to search |
128 | * each sorted set. But we do merge written sets together lazily, so the cost of |
129 | * these extra searches is quite low (normally most of the keys in a btree node |
130 | * will be in one big set, and then there'll be one or two sets that are much |
131 | * smaller). |
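 *
 * A lookup within a node is therefore, roughly (hypothetical helpers - the
 * real code lives in bset.c):
 *
 *	for (each sorted set in the node)
 *		binary search that set for the search key;
 *	combine the per-set candidates (resolving overlaps and duplicates is
 *	the iterator code's job);
 *
 * and the lazy merging of written sets keeps the number of sets - and thus
 * the number of binary searches per lookup - small.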
132 | * |
133 | * This log structure makes bcache's btree more of a hybrid between a |
134 | * conventional btree and a compacting data structure, with some of the |
135 | * advantages of both. |
136 | * |
137 | * GARBAGE COLLECTION: |
138 | * |
139 | * We can't just invalidate any bucket - it might contain dirty data or |
140 | * metadata. If it once contained dirty data, other writes might overwrite it |
141 | * later, leaving no valid pointers into that bucket in the index. |
142 | * |
143 | * Thus, the primary purpose of garbage collection is to find buckets to reuse. |
 * It also counts how much valid data each bucket currently contains, so that
145 | * allocation can reuse buckets sooner when they've been mostly overwritten. |
146 | * |
147 | * It also does some things that are really internal to the btree |
148 | * implementation. If a btree node contains pointers that are stale by more than |
149 | * some threshold, it rewrites the btree node to avoid the bucket's generation |
150 | * wrapping around. It also merges adjacent btree nodes if they're empty enough. |
151 | * |
152 | * THE JOURNAL: |
153 | * |
154 | * Bcache's journal is not necessary for consistency; we always strictly |
155 | * order metadata writes so that the btree and everything else is consistent on |
156 | * disk in the event of an unclean shutdown, and in fact bcache had writeback |
157 | * caching (with recovery from unclean shutdown) before journalling was |
158 | * implemented. |
159 | * |
160 | * Rather, the journal is purely a performance optimization; we can't complete a |
161 | * write until we've updated the index on disk, otherwise the cache would be |
162 | * inconsistent in the event of an unclean shutdown. This means that without the |
163 | * journal, on random write workloads we constantly have to update all the leaf |
164 | * nodes in the btree, and those writes will be mostly empty (appending at most |
 * a few keys each) - highly inefficient in terms of the amount of metadata written,
166 | * and it puts more strain on the various btree resorting/compacting code. |
167 | * |
168 | * The journal is just a log of keys we've inserted; on startup we just reinsert |
169 | * all the keys in the open journal entries. That means that when we're updating |
170 | * a node in the btree, we can wait until a 4k block of keys fills up before |
171 | * writing them out. |
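 *
 * Replay is correspondingly simple - roughly (the real version lives in
 * recovery.c):
 *
 *	for (each open journal entry, in sequence order)
 *		for (each key in that entry)
 *			redo the btree insert for that key;
 *
 * i.e. replay is just ordinary btree updates, not a separate recovery format.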
172 | * |
173 | * For simplicity, we only journal updates to leaf nodes; updates to parent |
174 | * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth |
175 | * the complexity to deal with journalling them (in particular, journal replay) |
176 | * - updates to non leaf nodes just happen synchronously (see btree_split()). |
177 | */ |
178 | |
179 | #undef pr_fmt |
180 | #ifdef __KERNEL__ |
181 | #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ |
182 | #else |
183 | #define pr_fmt(fmt) "%s() " fmt "\n", __func__ |
184 | #endif |
185 | |
186 | #include <linux/backing-dev-defs.h> |
187 | #include <linux/bug.h> |
188 | #include <linux/bio.h> |
189 | #include <linux/closure.h> |
190 | #include <linux/kobject.h> |
191 | #include <linux/list.h> |
192 | #include <linux/math64.h> |
193 | #include <linux/mutex.h> |
194 | #include <linux/percpu-refcount.h> |
195 | #include <linux/percpu-rwsem.h> |
196 | #include <linux/refcount.h> |
197 | #include <linux/rhashtable.h> |
198 | #include <linux/rwsem.h> |
199 | #include <linux/semaphore.h> |
200 | #include <linux/seqlock.h> |
201 | #include <linux/shrinker.h> |
202 | #include <linux/srcu.h> |
203 | #include <linux/types.h> |
204 | #include <linux/workqueue.h> |
205 | #include <linux/zstd.h> |
206 | |
207 | #include "bcachefs_format.h" |
208 | #include "errcode.h" |
209 | #include "fifo.h" |
210 | #include "nocow_locking_types.h" |
211 | #include "opts.h" |
212 | #include "recovery_passes_types.h" |
213 | #include "sb-errors_types.h" |
214 | #include "seqmutex.h" |
215 | #include "time_stats.h" |
216 | #include "util.h" |
217 | |
218 | #ifdef CONFIG_BCACHEFS_DEBUG |
219 | #define BCH_WRITE_REF_DEBUG |
220 | #endif |
221 | |
222 | #ifndef dynamic_fault |
223 | #define dynamic_fault(...) 0 |
224 | #endif |
225 | |
226 | #define race_fault(...) dynamic_fault("bcachefs:race") |
227 | |
228 | #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) |
229 | |
230 | #define trace_and_count(_c, _name, ...) \ |
231 | do { \ |
232 | count_event(_c, _name); \ |
233 | trace_##_name(__VA_ARGS__); \ |
234 | } while (0) |
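
/*
 * Example usage (illustrative - "some_event" is a placeholder): the name
 * passed to trace_and_count() must exist both as a tracepoint and as a
 * counter, since the macro expands to trace_<name>(...) plus an increment of
 * counters[BCH_COUNTER_<name>]:
 *
 *	trace_and_count(c, some_event, c, other_args);
 */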
235 | |
236 | #define bch2_fs_init_fault(name) \ |
237 | dynamic_fault("bcachefs:bch_fs_init:" name) |
238 | #define bch2_meta_read_fault(name) \ |
239 | dynamic_fault("bcachefs:meta:read:" name) |
240 | #define bch2_meta_write_fault(name) \ |
241 | dynamic_fault("bcachefs:meta:write:" name) |
242 | |
243 | #ifdef __KERNEL__ |
244 | #define BCACHEFS_LOG_PREFIX |
245 | #endif |
246 | |
247 | #ifdef BCACHEFS_LOG_PREFIX |
248 | |
249 | #define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) |
250 | #define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) |
251 | #define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) |
252 | #define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) |
253 | #define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ |
254 | "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) |
255 | |
256 | #else |
257 | |
258 | #define bch2_log_msg(_c, fmt) fmt |
259 | #define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) |
260 | #define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) |
261 | #define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) |
262 | #define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ |
263 | "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) |
264 | |
265 | #endif |
266 | |
267 | #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") |
268 | |
269 | __printf(2, 3) |
270 | void bch2_print_opts(struct bch_opts *, const char *, ...); |
271 | |
272 | __printf(2, 3) |
273 | void __bch2_print(struct bch_fs *c, const char *fmt, ...); |
274 | |
275 | #define maybe_dev_to_fs(_c) _Generic((_c), \ |
276 | struct bch_dev *: ((struct bch_dev *) (_c))->fs, \ |
277 | struct bch_fs *: (_c)) |
278 | |
279 | #define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__) |
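
/*
 * maybe_dev_to_fs() uses _Generic() so that bch2_print() accepts either a
 * struct bch_fs * or a struct bch_dev * as its first argument, e.g.
 * (illustrative):
 *
 *	bch2_print(ca, ...);	- resolves to __bch2_print(ca->fs, ...)
 *	bch2_print(c, ...);	- passed through unchanged
 */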
280 | |
281 | #define bch2_print_ratelimited(_c, ...) \ |
282 | do { \ |
283 | static DEFINE_RATELIMIT_STATE(_rs, \ |
284 | DEFAULT_RATELIMIT_INTERVAL, \ |
285 | DEFAULT_RATELIMIT_BURST); \ |
286 | \ |
287 | if (__ratelimit(&_rs)) \ |
288 | bch2_print(_c, __VA_ARGS__); \ |
289 | } while (0) |
290 | |
291 | #define bch_info(c, fmt, ...) \ |
292 | bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) |
293 | #define bch_notice(c, fmt, ...) \ |
294 | bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) |
295 | #define bch_warn(c, fmt, ...) \ |
296 | bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) |
297 | #define bch_warn_ratelimited(c, fmt, ...) \ |
298 | bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) |
299 | |
300 | #define bch_err(c, fmt, ...) \ |
301 | bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) |
302 | #define bch_err_dev(ca, fmt, ...) \ |
303 | bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) |
304 | #define bch_err_dev_offset(ca, _offset, fmt, ...) \ |
305 | bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) |
306 | #define bch_err_inum(c, _inum, fmt, ...) \ |
307 | bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) |
308 | #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ |
309 | bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) |
310 | |
311 | #define bch_err_ratelimited(c, fmt, ...) \ |
312 | bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) |
313 | #define bch_err_dev_ratelimited(ca, fmt, ...) \ |
314 | bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) |
315 | #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ |
316 | bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) |
317 | #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ |
318 | bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) |
319 | #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ |
320 | bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) |
321 | |
322 | static inline bool should_print_err(int err) |
323 | { |
324 | return err && !bch2_err_matches(err, BCH_ERR_transaction_restart); |
325 | } |
326 | |
327 | #define bch_err_fn(_c, _ret) \ |
328 | do { \ |
329 | if (should_print_err(_ret)) \ |
330 | bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ |
331 | } while (0) |
332 | |
333 | #define bch_err_fn_ratelimited(_c, _ret) \ |
334 | do { \ |
335 | if (should_print_err(_ret)) \ |
336 | bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ |
337 | } while (0) |
338 | |
339 | #define bch_err_msg(_c, _ret, _msg, ...) \ |
340 | do { \ |
341 | if (should_print_err(_ret)) \ |
342 | bch_err(_c, "%s(): error " _msg " %s", __func__, \ |
343 | ##__VA_ARGS__, bch2_err_str(_ret)); \ |
344 | } while (0) |
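
/*
 * Typical usage (a sketch - some_operation() is a placeholder): functions
 * that return a bcachefs errcode can report failures on the way out with
 *
 *	int ret = some_operation(c);
 *	bch_err_msg(c, ret, "doing some operation");
 *
 * or, when no extra context is needed, bch_err_fn(c, ret). Both are no-ops
 * for transaction restarts, which are expected and retried rather than
 * reported (see should_print_err() above).
 */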
345 | |
346 | #define bch_verbose(c, fmt, ...) \ |
347 | do { \ |
348 | if ((c)->opts.verbose) \ |
349 | bch_info(c, fmt, ##__VA_ARGS__); \ |
350 | } while (0) |
351 | |
352 | #define pr_verbose_init(opts, fmt, ...) \ |
353 | do { \ |
354 | if (opt_get(opts, verbose)) \ |
355 | pr_info(fmt, ##__VA_ARGS__); \ |
356 | } while (0) |
357 | |
358 | /* Parameters that are useful for debugging, but should always be compiled in: */ |
359 | #define BCH_DEBUG_PARAMS_ALWAYS() \ |
360 | BCH_DEBUG_PARAM(key_merging_disabled, \ |
361 | "Disables merging of extents") \ |
362 | BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ |
363 | "Causes mark and sweep to compact and rewrite every " \ |
364 | "btree node it traverses") \ |
365 | BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ |
366 | "Disables rewriting of btree nodes during mark and sweep")\ |
367 | BCH_DEBUG_PARAM(btree_shrinker_disabled, \ |
368 | "Disables the shrinker callback for the btree node cache")\ |
369 | BCH_DEBUG_PARAM(verify_btree_ondisk, \ |
370 | "Reread btree nodes at various points to verify the " \ |
371 | "mergesort in the read path against modifications " \ |
372 | "done in memory") \ |
373 | BCH_DEBUG_PARAM(verify_all_btree_replicas, \ |
374 | "When reading btree nodes, read all replicas and " \ |
375 | "compare them") \ |
376 | BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ |
377 | "Don't use the write buffer for backpointers, enabling "\ |
378 | "extra runtime checks") |
379 | |
380 | /* Parameters that should only be compiled in debug mode: */ |
381 | #define BCH_DEBUG_PARAMS_DEBUG() \ |
382 | BCH_DEBUG_PARAM(expensive_debug_checks, \ |
383 | "Enables various runtime debugging checks that " \ |
384 | "significantly affect performance") \ |
385 | BCH_DEBUG_PARAM(debug_check_iterators, \ |
386 | "Enables extra verification for btree iterators") \ |
387 | BCH_DEBUG_PARAM(debug_check_btree_accounting, \ |
388 | "Verify btree accounting for keys within a node") \ |
389 | BCH_DEBUG_PARAM(journal_seq_verify, \ |
390 | "Store the journal sequence number in the version " \ |
391 | "number of every btree key, and verify that btree " \ |
392 | "update ordering is preserved during recovery") \ |
393 | BCH_DEBUG_PARAM(inject_invalid_keys, \ |
394 | "Store the journal sequence number in the version " \ |
395 | "number of every btree key, and verify that btree " \ |
396 | "update ordering is preserved during recovery") \ |
397 | BCH_DEBUG_PARAM(test_alloc_startup, \ |
398 | "Force allocator startup to use the slowpath where it" \ |
399 | "can't find enough free buckets without invalidating" \ |
400 | "cached data") \ |
401 | BCH_DEBUG_PARAM(force_reconstruct_read, \ |
402 | "Force reads to use the reconstruct path, when reading" \ |
403 | "from erasure coded extents") \ |
404 | BCH_DEBUG_PARAM(test_restart_gc, \ |
405 | "Test restarting mark and sweep gc when bucket gens change") |
406 | |
407 | #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() |
408 | |
409 | #ifdef CONFIG_BCACHEFS_DEBUG |
410 | #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() |
411 | #else |
412 | #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() |
413 | #endif |
414 | |
415 | #define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; |
416 | BCH_DEBUG_PARAMS() |
417 | #undef BCH_DEBUG_PARAM |
418 | |
419 | #ifndef CONFIG_BCACHEFS_DEBUG |
420 | #define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; |
421 | BCH_DEBUG_PARAMS_DEBUG() |
422 | #undef BCH_DEBUG_PARAM |
423 | #endif |
424 | |
425 | #define BCH_TIME_STATS() \ |
426 | x(btree_node_mem_alloc) \ |
427 | x(btree_node_split) \ |
428 | x(btree_node_compact) \ |
429 | x(btree_node_merge) \ |
430 | x(btree_node_sort) \ |
431 | x(btree_node_read) \ |
432 | x(btree_node_read_done) \ |
433 | x(btree_interior_update_foreground) \ |
434 | x(btree_interior_update_total) \ |
435 | x(btree_gc) \ |
436 | x(data_write) \ |
437 | x(data_read) \ |
438 | x(data_promote) \ |
439 | x(journal_flush_write) \ |
440 | x(journal_noflush_write) \ |
441 | x(journal_flush_seq) \ |
442 | x(blocked_journal_low_on_space) \ |
443 | x(blocked_journal_low_on_pin) \ |
444 | x(blocked_journal_max_in_flight) \ |
445 | x(blocked_allocate) \ |
446 | x(blocked_allocate_open_bucket) \ |
447 | x(blocked_write_buffer_full) \ |
448 | x(nocow_lock_contended) |
449 | |
450 | enum bch_time_stats { |
451 | #define x(name) BCH_TIME_##name, |
452 | BCH_TIME_STATS() |
453 | #undef x |
454 | BCH_TIME_STAT_NR |
455 | }; |
456 | |
457 | #include "alloc_types.h" |
458 | #include "btree_types.h" |
459 | #include "btree_node_scan_types.h" |
460 | #include "btree_write_buffer_types.h" |
461 | #include "buckets_types.h" |
462 | #include "buckets_waiting_for_journal_types.h" |
463 | #include "clock_types.h" |
464 | #include "disk_groups_types.h" |
465 | #include "ec_types.h" |
466 | #include "journal_types.h" |
467 | #include "keylist_types.h" |
468 | #include "quota_types.h" |
469 | #include "rebalance_types.h" |
470 | #include "replicas_types.h" |
471 | #include "subvolume_types.h" |
472 | #include "super_types.h" |
473 | #include "thread_with_file_types.h" |
474 | |
475 | /* Number of nodes btree coalesce will try to coalesce at once */ |
476 | #define GC_MERGE_NODES 4U |
477 | |
478 | /* Maximum number of nodes we might need to allocate atomically: */ |
479 | #define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) |
480 | |
481 | /* Size of the freelist we allocate btree nodes from: */ |
482 | #define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) |
483 | |
484 | #define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) |
485 | |
486 | struct btree; |
487 | |
488 | enum gc_phase { |
489 | GC_PHASE_NOT_RUNNING, |
490 | GC_PHASE_START, |
491 | GC_PHASE_SB, |
492 | |
493 | GC_PHASE_BTREE_stripes, |
494 | GC_PHASE_BTREE_extents, |
495 | GC_PHASE_BTREE_inodes, |
496 | GC_PHASE_BTREE_dirents, |
497 | GC_PHASE_BTREE_xattrs, |
498 | GC_PHASE_BTREE_alloc, |
499 | GC_PHASE_BTREE_quotas, |
500 | GC_PHASE_BTREE_reflink, |
501 | GC_PHASE_BTREE_subvolumes, |
502 | GC_PHASE_BTREE_snapshots, |
503 | GC_PHASE_BTREE_lru, |
504 | GC_PHASE_BTREE_freespace, |
505 | GC_PHASE_BTREE_need_discard, |
506 | GC_PHASE_BTREE_backpointers, |
507 | GC_PHASE_BTREE_bucket_gens, |
508 | GC_PHASE_BTREE_snapshot_trees, |
509 | GC_PHASE_BTREE_deleted_inodes, |
510 | GC_PHASE_BTREE_logged_ops, |
511 | GC_PHASE_BTREE_rebalance_work, |
512 | GC_PHASE_BTREE_subvolume_children, |
513 | |
514 | GC_PHASE_PENDING_DELETE, |
515 | }; |
516 | |
517 | struct gc_pos { |
518 | enum gc_phase phase; |
519 | struct bpos pos; |
520 | unsigned level; |
521 | }; |
522 | |
523 | struct reflink_gc { |
524 | u64 offset; |
525 | u32 size; |
526 | u32 refcount; |
527 | }; |
528 | |
529 | typedef GENRADIX(struct reflink_gc) reflink_gc_table; |
530 | |
531 | struct io_count { |
532 | u64 sectors[2][BCH_DATA_NR]; |
533 | }; |
534 | |
535 | struct bch_dev { |
536 | struct kobject kobj; |
537 | struct percpu_ref ref; |
538 | struct completion ref_completion; |
539 | struct percpu_ref io_ref; |
540 | struct completion io_ref_completion; |
541 | |
542 | struct bch_fs *fs; |
543 | |
544 | u8 dev_idx; |
545 | /* |
546 | * Cached version of this device's member info from superblock |
547 | * Committed by bch2_write_super() -> bch_fs_mi_update() |
548 | */ |
549 | struct bch_member_cpu mi; |
550 | atomic64_t errors[BCH_MEMBER_ERROR_NR]; |
551 | |
552 | __uuid_t uuid; |
553 | char name[BDEVNAME_SIZE]; |
554 | |
555 | struct bch_sb_handle disk_sb; |
556 | struct bch_sb *sb_read_scratch; |
557 | int sb_write_error; |
558 | dev_t dev; |
559 | atomic_t flush_seq; |
560 | |
561 | struct bch_devs_mask self; |
562 | |
563 | /* biosets used in cloned bios for writing multiple replicas */ |
564 | struct bio_set replica_set; |
565 | |
566 | /* |
567 | * Buckets: |
568 | * Per-bucket arrays are protected by c->mark_lock, bucket_lock and |
569 | * gc_lock, for device resize - holding any is sufficient for access: |
570 | * Or rcu_read_lock(), but only for ptr_stale(): |
571 | */ |
572 | struct bucket_array __rcu *buckets_gc; |
573 | struct bucket_gens __rcu *bucket_gens; |
574 | u8 *oldest_gen; |
575 | unsigned long *buckets_nouse; |
576 | struct rw_semaphore bucket_lock; |
577 | |
578 | struct bch_dev_usage *usage_base; |
579 | struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; |
580 | struct bch_dev_usage __percpu *usage_gc; |
581 | |
582 | /* Allocator: */ |
583 | u64 new_fs_bucket_idx; |
584 | u64 alloc_cursor; |
585 | |
586 | unsigned nr_open_buckets; |
587 | unsigned nr_btree_reserve; |
588 | |
589 | size_t inc_gen_needs_gc; |
590 | size_t inc_gen_really_needs_gc; |
591 | size_t buckets_waiting_on_journal; |
592 | |
593 | atomic64_t rebalance_work; |
594 | |
595 | struct journal_device journal; |
596 | u64 prev_journal_sector; |
597 | |
598 | struct work_struct io_error_work; |
599 | |
600 | /* The rest of this all shows up in sysfs */ |
601 | atomic64_t cur_latency[2]; |
602 | struct bch2_time_stats_quantiles io_latency[2]; |
603 | |
604 | #define CONGESTED_MAX 1024 |
605 | atomic_t congested; |
606 | u64 congested_last; |
607 | |
608 | struct io_count __percpu *io_done; |
609 | }; |
610 | |
611 | /* |
612 | * initial_gc_unfixed |
613 | * error |
614 | * topology error |
615 | */ |
616 | |
617 | #define BCH_FS_FLAGS() \ |
618 | x(new_fs) \ |
619 | x(started) \ |
620 | x(may_go_rw) \ |
621 | x(rw) \ |
622 | x(was_rw) \ |
623 | x(stopping) \ |
624 | x(emergency_ro) \ |
625 | x(going_ro) \ |
626 | x(write_disable_complete) \ |
627 | x(clean_shutdown) \ |
628 | x(fsck_running) \ |
629 | x(initial_gc_unfixed) \ |
630 | x(need_another_gc) \ |
631 | x(need_delete_dead_snapshots) \ |
632 | x(error) \ |
633 | x(topology_error) \ |
634 | x(errors_fixed) \ |
635 | x(errors_not_fixed) |
636 | |
637 | enum bch_fs_flags { |
638 | #define x(n) BCH_FS_##n, |
639 | BCH_FS_FLAGS() |
640 | #undef x |
641 | }; |
642 | |
643 | struct btree_debug { |
644 | unsigned id; |
645 | }; |
646 | |
647 | #define BCH_TRANSACTIONS_NR 128 |
648 | |
649 | struct btree_transaction_stats { |
650 | struct bch2_time_stats duration; |
651 | struct bch2_time_stats lock_hold_times; |
652 | struct mutex lock; |
653 | unsigned nr_max_paths; |
654 | unsigned journal_entries_size; |
655 | unsigned max_mem; |
656 | char *max_paths_text; |
657 | }; |
658 | |
659 | struct bch_fs_pcpu { |
660 | u64 sectors_available; |
661 | }; |
662 | |
663 | struct journal_seq_blacklist_table { |
664 | size_t nr; |
665 | struct journal_seq_blacklist_table_entry { |
666 | u64 start; |
667 | u64 end; |
668 | bool dirty; |
669 | } entries[]; |
670 | }; |
671 | |
672 | struct journal_keys { |
673 | /* must match layout in darray_types.h */ |
674 | size_t nr, size; |
675 | struct journal_key { |
676 | u64 journal_seq; |
677 | u32 journal_offset; |
678 | enum btree_id btree_id:8; |
679 | unsigned level:8; |
680 | bool allocated; |
681 | bool overwritten; |
682 | struct bkey_i *k; |
683 | } *data; |
684 | /* |
685 | * Gap buffer: instead of all the empty space in the array being at the |
686 | * end of the buffer - from @nr to @size - the empty space is at @gap. |
687 | * This means that sequential insertions are O(n) instead of O(n^2). |
688 | */ |
689 | size_t gap; |
690 | atomic_t ref; |
691 | bool initial_ref_held; |
692 | }; |
693 | |
694 | struct btree_trans_buf { |
695 | struct btree_trans *trans; |
696 | }; |
697 | |
698 | #define REPLICAS_DELTA_LIST_MAX (1U << 16) |
699 | |
700 | #define BCACHEFS_ROOT_SUBVOL_INUM \ |
701 | ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) |
702 | |
703 | #define BCH_WRITE_REFS() \ |
704 | x(trans) \ |
705 | x(write) \ |
706 | x(promote) \ |
707 | x(node_rewrite) \ |
708 | x(stripe_create) \ |
709 | x(stripe_delete) \ |
710 | x(reflink) \ |
711 | x(fallocate) \ |
712 | x(fsync) \ |
713 | x(dio_write) \ |
714 | x(discard) \ |
715 | x(discard_fast) \ |
716 | x(invalidate) \ |
717 | x(delete_dead_snapshots) \ |
718 | x(snapshot_delete_pagecache) \ |
719 | x(sysfs) \ |
720 | x(btree_write_buffer) |
721 | |
722 | enum bch_write_ref { |
723 | #define x(n) BCH_WRITE_REF_##n, |
724 | BCH_WRITE_REFS() |
725 | #undef x |
726 | BCH_WRITE_REF_NR, |
727 | }; |
728 | |
729 | struct bch_fs { |
730 | struct closure cl; |
731 | |
732 | struct list_head list; |
733 | struct kobject kobj; |
734 | struct kobject counters_kobj; |
735 | struct kobject internal; |
736 | struct kobject opts_dir; |
737 | struct kobject time_stats; |
738 | unsigned long flags; |
739 | |
740 | int minor; |
741 | struct device *chardev; |
742 | struct super_block *vfs_sb; |
743 | dev_t dev; |
744 | char name[40]; |
745 | struct stdio_redirect *stdio; |
746 | struct task_struct *stdio_filter; |
747 | |
748 | /* ro/rw, add/remove/resize devices: */ |
749 | struct rw_semaphore state_lock; |
750 | |
751 | /* Counts outstanding writes, for clean transition to read-only */ |
752 | #ifdef BCH_WRITE_REF_DEBUG |
753 | atomic_long_t writes[BCH_WRITE_REF_NR]; |
754 | #else |
755 | struct percpu_ref writes; |
756 | #endif |
757 | /* |
	 * Analogous to c->writes, for asynchronous ops that don't necessarily
759 | * need fs to be read-write |
760 | */ |
761 | refcount_t ro_ref; |
762 | wait_queue_head_t ro_ref_wait; |
763 | |
764 | struct work_struct read_only_work; |
765 | |
766 | struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; |
767 | |
768 | struct bch_replicas_cpu replicas; |
769 | struct bch_replicas_cpu replicas_gc; |
770 | struct mutex replicas_gc_lock; |
771 | mempool_t replicas_delta_pool; |
772 | |
773 | struct journal_entry_res btree_root_journal_res; |
774 | struct journal_entry_res replicas_journal_res; |
775 | struct journal_entry_res clock_journal_res; |
776 | struct journal_entry_res dev_usage_journal_res; |
777 | |
778 | struct bch_disk_groups_cpu __rcu *disk_groups; |
779 | |
780 | struct bch_opts opts; |
781 | |
782 | /* Updated by bch2_sb_update():*/ |
783 | struct { |
784 | __uuid_t uuid; |
785 | __uuid_t user_uuid; |
786 | |
787 | u16 version; |
788 | u16 version_min; |
789 | u16 version_upgrade_complete; |
790 | |
791 | u8 nr_devices; |
792 | u8 clean; |
793 | |
794 | u8 encryption_type; |
795 | |
796 | u64 time_base_lo; |
797 | u32 time_base_hi; |
798 | unsigned time_units_per_sec; |
799 | unsigned nsec_per_time_unit; |
800 | u64 features; |
801 | u64 compat; |
802 | unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; |
803 | u64 btrees_lost_data; |
804 | } sb; |
805 | |
806 | |
807 | struct bch_sb_handle disk_sb; |
808 | |
809 | unsigned short block_bits; /* ilog2(block_size) */ |
810 | |
811 | u16 btree_foreground_merge_threshold; |
812 | |
813 | struct closure sb_write; |
814 | struct mutex sb_lock; |
815 | |
816 | /* snapshot.c: */ |
817 | struct snapshot_table __rcu *snapshots; |
818 | struct mutex snapshot_table_lock; |
819 | struct rw_semaphore snapshot_create_lock; |
820 | |
821 | struct work_struct snapshot_delete_work; |
822 | struct work_struct snapshot_wait_for_pagecache_and_delete_work; |
823 | snapshot_id_list snapshots_unlinked; |
824 | struct mutex snapshots_unlinked_lock; |
825 | |
826 | /* BTREE CACHE */ |
827 | struct bio_set btree_bio; |
828 | struct workqueue_struct *io_complete_wq; |
829 | |
830 | struct btree_root btree_roots_known[BTREE_ID_NR]; |
	DARRAY(struct btree_root) btree_roots_extra;
832 | struct mutex btree_root_lock; |
833 | |
834 | struct btree_cache btree_cache; |
835 | |
836 | /* |
837 | * Cache of allocated btree nodes - if we allocate a btree node and |
838 | * don't use it, if we free it that space can't be reused until going |
839 | * _all_ the way through the allocator (which exposes us to a livelock |
	 * when allocation of btree reserves fails halfway through) - instead, we
841 | * can stick them here: |
842 | */ |
843 | struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; |
844 | unsigned btree_reserve_cache_nr; |
845 | struct mutex btree_reserve_cache_lock; |
846 | |
847 | mempool_t btree_interior_update_pool; |
848 | struct list_head btree_interior_update_list; |
849 | struct list_head btree_interior_updates_unwritten; |
850 | struct mutex btree_interior_update_lock; |
851 | struct closure_waitlist btree_interior_update_wait; |
852 | |
853 | struct workqueue_struct *btree_interior_update_worker; |
854 | struct work_struct btree_interior_update_work; |
855 | |
856 | struct workqueue_struct *btree_node_rewrite_worker; |
857 | |
858 | struct list_head pending_node_rewrites; |
859 | struct mutex pending_node_rewrites_lock; |
860 | |
861 | /* btree_io.c: */ |
862 | spinlock_t btree_write_error_lock; |
863 | struct btree_write_stats { |
864 | atomic64_t nr; |
865 | atomic64_t bytes; |
866 | } btree_write_stats[BTREE_WRITE_TYPE_NR]; |
867 | |
868 | /* btree_iter.c: */ |
869 | struct seqmutex btree_trans_lock; |
870 | struct list_head btree_trans_list; |
871 | mempool_t btree_trans_pool; |
872 | mempool_t btree_trans_mem_pool; |
873 | struct btree_trans_buf __percpu *btree_trans_bufs; |
874 | |
875 | struct srcu_struct btree_trans_barrier; |
876 | bool btree_trans_barrier_initialized; |
877 | |
878 | struct btree_key_cache btree_key_cache; |
879 | unsigned btree_key_cache_btrees; |
880 | |
881 | struct btree_write_buffer btree_write_buffer; |
882 | |
883 | struct workqueue_struct *btree_update_wq; |
884 | struct workqueue_struct *btree_io_complete_wq; |
885 | /* copygc needs its own workqueue for index updates.. */ |
886 | struct workqueue_struct *copygc_wq; |
887 | /* |
888 | * Use a dedicated wq for write ref holder tasks. Required to avoid |
889 | * dependency problems with other wq tasks that can block on ref |
890 | * draining, such as read-only transition. |
891 | */ |
892 | struct workqueue_struct *write_ref_wq; |
893 | |
894 | /* ALLOCATION */ |
895 | struct bch_devs_mask rw_devs[BCH_DATA_NR]; |
896 | |
897 | u64 capacity; /* sectors */ |
898 | |
899 | /* |
900 | * When capacity _decreases_ (due to a disk being removed), we |
901 | * increment capacity_gen - this invalidates outstanding reservations |
902 | * and forces them to be revalidated |
903 | */ |
904 | u32 capacity_gen; |
905 | unsigned bucket_size_max; |
906 | |
907 | atomic64_t sectors_available; |
908 | struct mutex sectors_available_lock; |
909 | |
910 | struct bch_fs_pcpu __percpu *pcpu; |
911 | |
912 | struct percpu_rw_semaphore mark_lock; |
913 | |
914 | seqcount_t usage_lock; |
915 | struct bch_fs_usage *usage_base; |
916 | struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; |
917 | struct bch_fs_usage __percpu *usage_gc; |
918 | u64 __percpu *online_reserved; |
919 | |
920 | /* single element mempool: */ |
921 | struct mutex usage_scratch_lock; |
922 | struct bch_fs_usage_online *usage_scratch; |
923 | |
924 | struct io_clock io_clock[2]; |
925 | |
926 | /* JOURNAL SEQ BLACKLIST */ |
927 | struct journal_seq_blacklist_table * |
928 | journal_seq_blacklist_table; |
929 | struct work_struct journal_seq_blacklist_gc_work; |
930 | |
931 | /* ALLOCATOR */ |
932 | spinlock_t freelist_lock; |
933 | struct closure_waitlist freelist_wait; |
934 | |
935 | open_bucket_idx_t open_buckets_freelist; |
936 | open_bucket_idx_t open_buckets_nr_free; |
937 | struct closure_waitlist open_buckets_wait; |
938 | struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; |
939 | open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; |
940 | |
941 | open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; |
942 | open_bucket_idx_t open_buckets_partial_nr; |
943 | |
944 | struct write_point btree_write_point; |
945 | struct write_point rebalance_write_point; |
946 | |
947 | struct write_point write_points[WRITE_POINT_MAX]; |
948 | struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; |
949 | struct mutex write_points_hash_lock; |
950 | unsigned write_points_nr; |
951 | |
952 | struct buckets_waiting_for_journal buckets_waiting_for_journal; |
953 | struct work_struct invalidate_work; |
954 | struct work_struct discard_work; |
955 | struct mutex discard_buckets_in_flight_lock; |
956 | DARRAY(struct bpos) discard_buckets_in_flight; |
957 | struct work_struct discard_fast_work; |
958 | |
959 | /* GARBAGE COLLECTION */ |
960 | struct task_struct *gc_thread; |
961 | atomic_t kick_gc; |
962 | unsigned long gc_count; |
963 | |
964 | enum btree_id gc_gens_btree; |
965 | struct bpos gc_gens_pos; |
966 | |
967 | /* |
968 | * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] |
969 | * has been marked by GC. |
970 | * |
971 | * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) |
972 | * |
973 | * Protected by gc_pos_lock. Only written to by GC thread, so GC thread |
974 | * can read without a lock. |
975 | */ |
976 | seqcount_t gc_pos_lock; |
977 | struct gc_pos gc_pos; |
978 | |
979 | /* |
980 | * The allocation code needs gc_mark in struct bucket to be correct, but |
981 | * it's not while a gc is in progress. |
982 | */ |
983 | struct rw_semaphore gc_lock; |
984 | struct mutex gc_gens_lock; |
985 | |
986 | /* IO PATH */ |
987 | struct semaphore io_in_flight; |
988 | struct bio_set bio_read; |
989 | struct bio_set bio_read_split; |
990 | struct bio_set bio_write; |
991 | struct mutex bio_bounce_pages_lock; |
992 | mempool_t bio_bounce_pages; |
993 | struct bucket_nocow_lock_table |
994 | nocow_locks; |
995 | struct rhashtable promote_table; |
996 | |
997 | mempool_t compression_bounce[2]; |
998 | mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; |
999 | mempool_t decompress_workspace; |
1000 | size_t zstd_workspace_size; |
1001 | |
1002 | struct crypto_shash *sha256; |
1003 | struct crypto_sync_skcipher *chacha20; |
1004 | struct crypto_shash *poly1305; |
1005 | |
1006 | atomic64_t key_version; |
1007 | |
1008 | mempool_t large_bkey_pool; |
1009 | |
1010 | /* MOVE.C */ |
1011 | struct list_head moving_context_list; |
1012 | struct mutex moving_context_lock; |
1013 | |
1014 | /* REBALANCE */ |
1015 | struct bch_fs_rebalance rebalance; |
1016 | |
1017 | /* COPYGC */ |
1018 | struct task_struct *copygc_thread; |
1019 | struct write_point copygc_write_point; |
1020 | s64 copygc_wait_at; |
1021 | s64 copygc_wait; |
1022 | bool copygc_running; |
1023 | wait_queue_head_t copygc_running_wq; |
1024 | |
1025 | /* STRIPES: */ |
1026 | GENRADIX(struct stripe) stripes; |
1027 | GENRADIX(struct gc_stripe) gc_stripes; |
1028 | |
1029 | struct hlist_head ec_stripes_new[32]; |
1030 | spinlock_t ec_stripes_new_lock; |
1031 | |
1032 | ec_stripes_heap ec_stripes_heap; |
1033 | struct mutex ec_stripes_heap_lock; |
1034 | |
1035 | /* ERASURE CODING */ |
1036 | struct list_head ec_stripe_head_list; |
1037 | struct mutex ec_stripe_head_lock; |
1038 | |
1039 | struct list_head ec_stripe_new_list; |
1040 | struct mutex ec_stripe_new_lock; |
1041 | wait_queue_head_t ec_stripe_new_wait; |
1042 | |
1043 | struct work_struct ec_stripe_create_work; |
1044 | u64 ec_stripe_hint; |
1045 | |
1046 | struct work_struct ec_stripe_delete_work; |
1047 | |
1048 | struct bio_set ec_bioset; |
1049 | |
1050 | /* REFLINK */ |
1051 | reflink_gc_table reflink_gc_table; |
1052 | size_t reflink_gc_nr; |
1053 | |
1054 | /* fs.c */ |
1055 | struct list_head vfs_inodes_list; |
1056 | struct mutex vfs_inodes_lock; |
1057 | |
1058 | /* VFS IO PATH - fs-io.c */ |
1059 | struct bio_set writepage_bioset; |
1060 | struct bio_set dio_write_bioset; |
1061 | struct bio_set dio_read_bioset; |
1062 | struct bio_set nocow_flush_bioset; |
1063 | |
1064 | /* QUOTAS */ |
1065 | struct bch_memquota_type quotas[QTYP_NR]; |
1066 | |
1067 | /* RECOVERY */ |
1068 | u64 journal_replay_seq_start; |
1069 | u64 journal_replay_seq_end; |
1070 | /* |
1071 | * Two different uses: |
1072 | * "Has this fsck pass?" - i.e. should this type of error be an |
1073 | * emergency read-only |
1074 | * And, in certain situations fsck will rewind to an earlier pass: used |
1075 | * for signaling to the toplevel code which pass we want to run now. |
1076 | */ |
1077 | enum bch_recovery_pass curr_recovery_pass; |
1078 | /* bitmap of explicitly enabled recovery passes: */ |
1079 | u64 recovery_passes_explicit; |
1080 | /* bitmask of recovery passes that we actually ran */ |
1081 | u64 recovery_passes_complete; |
1082 | /* never rewinds version of curr_recovery_pass */ |
1083 | enum bch_recovery_pass recovery_pass_done; |
1084 | struct semaphore online_fsck_mutex; |
1085 | |
1086 | /* DEBUG JUNK */ |
1087 | struct dentry *fs_debug_dir; |
1088 | struct dentry *btree_debug_dir; |
1089 | struct btree_debug btree_debug[BTREE_ID_NR]; |
1090 | struct btree *verify_data; |
1091 | struct btree_node *verify_ondisk; |
1092 | struct mutex verify_lock; |
1093 | |
1094 | u64 *unused_inode_hints; |
1095 | unsigned inode_shard_bits; |
1096 | |
1097 | /* |
1098 | * A btree node on disk could have too many bsets for an iterator to fit |
1099 | * on the stack - have to dynamically allocate them |
1100 | */ |
1101 | mempool_t fill_iter; |
1102 | |
1103 | mempool_t btree_bounce_pool; |
1104 | |
1105 | struct journal journal; |
1106 | GENRADIX(struct journal_replay *) journal_entries; |
1107 | u64 journal_entries_base_seq; |
1108 | struct journal_keys journal_keys; |
1109 | struct list_head journal_iters; |
1110 | |
1111 | struct find_btree_nodes found_btree_nodes; |
1112 | |
1113 | u64 last_bucket_seq_cleanup; |
1114 | |
1115 | u64 counters_on_mount[BCH_COUNTER_NR]; |
1116 | u64 __percpu *counters; |
1117 | |
1118 | unsigned btree_gc_periodic:1; |
1119 | unsigned copy_gc_enabled:1; |
1120 | bool promote_whole_extents; |
1121 | |
1122 | struct bch2_time_stats times[BCH_TIME_STAT_NR]; |
1123 | |
1124 | struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; |
1125 | |
1126 | /* ERRORS */ |
1127 | struct list_head fsck_error_msgs; |
1128 | struct mutex fsck_error_msgs_lock; |
1129 | bool fsck_alloc_msgs_err; |
1130 | |
1131 | bch_sb_errors_cpu fsck_error_counts; |
1132 | struct mutex fsck_error_counts_lock; |
1133 | }; |
1134 | |
1135 | extern struct wait_queue_head bch2_read_only_wait; |
1136 | |
1137 | static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) |
1138 | { |
1139 | #ifdef BCH_WRITE_REF_DEBUG |
	atomic_long_inc(&c->writes[ref]);
1141 | #else |
1142 | percpu_ref_get(&c->writes); |
1143 | #endif |
1144 | } |
1145 | |
1146 | static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) |
1147 | { |
1148 | #ifdef BCH_WRITE_REF_DEBUG |
1149 | return !test_bit(BCH_FS_going_ro, &c->flags) && |
		atomic_long_inc_not_zero(&c->writes[ref]);
1151 | #else |
1152 | return percpu_ref_tryget(&c->writes); |
1153 | #endif |
1154 | } |
1155 | |
1156 | static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) |
1157 | { |
1158 | #ifdef BCH_WRITE_REF_DEBUG |
1159 | return !test_bit(BCH_FS_going_ro, &c->flags) && |
		atomic_long_inc_not_zero(&c->writes[ref]);
1161 | #else |
1162 | return percpu_ref_tryget_live(&c->writes); |
1163 | #endif |
1164 | } |
1165 | |
1166 | static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) |
1167 | { |
1168 | #ifdef BCH_WRITE_REF_DEBUG |
	long v = atomic_long_dec_return(&c->writes[ref]);
1170 | |
1171 | BUG_ON(v < 0); |
1172 | if (v) |
1173 | return; |
1174 | for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) |
		if (atomic_long_read(&c->writes[i]))
1176 | return; |
1177 | |
	set_bit(BCH_FS_write_disable_complete, &c->flags);
1179 | wake_up(&bch2_read_only_wait); |
1180 | #else |
1181 | percpu_ref_put(&c->writes); |
1182 | #endif |
1183 | } |
1184 | |
1185 | static inline bool bch2_ro_ref_tryget(struct bch_fs *c) |
1186 | { |
1187 | if (test_bit(BCH_FS_stopping, &c->flags)) |
1188 | return false; |
1189 | |
	return refcount_inc_not_zero(&c->ro_ref);
1191 | } |
1192 | |
1193 | static inline void bch2_ro_ref_put(struct bch_fs *c) |
1194 | { |
	if (refcount_dec_and_test(&c->ro_ref))
1196 | wake_up(&c->ro_ref_wait); |
1197 | } |
1198 | |
1199 | static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) |
1200 | { |
1201 | #ifndef NO_BCACHEFS_FS |
1202 | if (c->vfs_sb) |
1203 | c->vfs_sb->s_bdi->ra_pages = ra_pages; |
1204 | #endif |
1205 | } |
1206 | |
1207 | static inline unsigned bucket_bytes(const struct bch_dev *ca) |
1208 | { |
1209 | return ca->mi.bucket_size << 9; |
1210 | } |
1211 | |
1212 | static inline unsigned block_bytes(const struct bch_fs *c) |
1213 | { |
1214 | return c->opts.block_size; |
1215 | } |
1216 | |
1217 | static inline unsigned block_sectors(const struct bch_fs *c) |
1218 | { |
1219 | return c->opts.block_size >> 9; |
1220 | } |
1221 | |
1222 | static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) |
1223 | { |
1224 | return c->btree_key_cache_btrees & (1U << btree); |
1225 | } |
1226 | |
1227 | static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) |
1228 | { |
1229 | struct timespec64 t; |
1230 | s32 rem; |
1231 | |
1232 | time += c->sb.time_base_lo; |
1233 | |
	t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
1235 | t.tv_nsec = rem * c->sb.nsec_per_time_unit; |
1236 | return t; |
1237 | } |
1238 | |
1239 | static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) |
1240 | { |
1241 | return (ts.tv_sec * c->sb.time_units_per_sec + |
1242 | (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; |
1243 | } |
1244 | |
1245 | static inline s64 bch2_current_time(const struct bch_fs *c) |
1246 | { |
1247 | struct timespec64 now; |
1248 | |
	ktime_get_coarse_real_ts64(&now);
	return timespec_to_bch2_time(c, now);
1251 | } |
1252 | |
1253 | static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) |
1254 | { |
1255 | return dev < c->sb.nr_devices && c->devs[dev]; |
1256 | } |
1257 | |
1258 | static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c) |
1259 | { |
1260 | struct stdio_redirect *stdio = c->stdio; |
1261 | |
1262 | if (c->stdio_filter && c->stdio_filter != current) |
1263 | stdio = NULL; |
1264 | return stdio; |
1265 | } |
1266 | |
1267 | static inline unsigned metadata_replicas_required(struct bch_fs *c) |
1268 | { |
1269 | return min(c->opts.metadata_replicas, |
1270 | c->opts.metadata_replicas_required); |
1271 | } |
1272 | |
1273 | static inline unsigned data_replicas_required(struct bch_fs *c) |
1274 | { |
1275 | return min(c->opts.data_replicas, |
1276 | c->opts.data_replicas_required); |
1277 | } |
1278 | |
1279 | #define BKEY_PADDED_ONSTACK(key, pad) \ |
1280 | struct { struct bkey_i key; __u64 key ## _pad[pad]; } |
1281 | |
1282 | #endif /* _BCACHEFS_H */ |
1283 | |