1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | |
3 | #ifndef BTRFS_FS_H |
4 | #define BTRFS_FS_H |
5 | |
6 | #include <linux/blkdev.h> |
7 | #include <linux/fs.h> |
8 | #include <linux/btrfs_tree.h> |
9 | #include <linux/sizes.h> |
10 | #include "extent-io-tree.h" |
11 | #include "extent_map.h" |
12 | #include "async-thread.h" |
13 | #include "block-rsv.h" |
14 | |
/*
 * Upper bound on the size of a single extent. btrfs_fs_info::max_extent_size
 * is documented as being this value on regular filesystems.
 */
#define BTRFS_MAX_EXTENT_SIZE SZ_128M

/* NOTE(review): presumably the lowest possible generation number - confirm against users. */
#define BTRFS_OLDEST_GENERATION	0ULL

/* NOTE(review): presumably the size reported for an empty directory - confirm against users. */
#define BTRFS_EMPTY_DIR_SIZE 0

/* NOTE(review): presumably compared against btrfs_fs_info::dirty_metadata_bytes - confirm. */
#define BTRFS_DIRTY_METADATA_THRESH		SZ_32M

/*
 * NOTE(review): presumably the byte offset of the primary super block on a
 * device - confirm.
 *
 * BTRFS_SUPER_INFO_SIZE is tied to the structure layout: the static_assert
 * below breaks the build if sizeof(struct btrfs_super_block) differs from it.
 */
#define BTRFS_SUPER_INFO_OFFSET			SZ_64K
#define BTRFS_SUPER_INFO_SIZE			4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);

/*
 * Number of metadata items necessary for an unlink operation:
 *
 * 1 for the possible orphan item
 * 1 for the dir item
 * 1 for the dir index
 * 1 for the inode ref
 * 1 for the inode
 * 1 for the parent inode
 *
 * The six entries above add up to the value of the macro.
 */
#define BTRFS_UNLINK_METADATA_UNITS		6

/*
 * The reserved space at the beginning of each device. It covers the primary
 * super block and leaves space for potential use by other tools like
 * bootloaders or to lower potential damage of accidental overwrite.
 */
#define BTRFS_DEVICE_RANGE_RESERVED			(SZ_1M)
/*
 * Runtime (in-memory) states of filesystem
 *
 * The enumerators are consecutive integers starting at 0.
 * NOTE(review): they look like bit numbers for btrfs_fs_info::fs_state (an
 * unsigned long) rather than masks - confirm against the users.
 */
enum {
	/*
	 * Filesystem is being remounted, allow to skip some operations, like
	 * defrag
	 */
	BTRFS_FS_STATE_REMOUNTING,
	/* Filesystem in RO mode */
	BTRFS_FS_STATE_RO,
	/* Track if a transaction abort has been reported on this filesystem */
	BTRFS_FS_STATE_TRANS_ABORTED,
	/*
	 * Bio operations should be blocked on this filesystem because a source
	 * or target device is being destroyed as part of a device replace
	 */
	BTRFS_FS_STATE_DEV_REPLACING,
	/* The btrfs_fs_info created for self-tests */
	BTRFS_FS_STATE_DUMMY_FS_INFO,

	/*
	 * NOTE(review): undocumented in the original; presumably set when
	 * checksums are not available/verified - confirm.
	 */
	BTRFS_FS_STATE_NO_CSUMS,

	/* Indicates there was an error cleaning up a log tree. */
	BTRFS_FS_STATE_LOG_CLEANUP_ERROR,

	/* Not a state: as the last enumerator it equals the number of states. */
	BTRFS_FS_STATE_COUNT
};
73 | |
/*
 * Filesystem-wide runtime flags. The enumerators are consecutive integers
 * starting at 0.
 *
 * NOTE(review): presumably bit numbers for btrfs_fs_info::flags (an unsigned
 * long) - confirm against the users.
 */
enum {
	BTRFS_FS_CLOSING_START,
	BTRFS_FS_CLOSING_DONE,
	BTRFS_FS_LOG_RECOVERING,
	BTRFS_FS_OPEN,
	BTRFS_FS_QUOTA_ENABLED,
	BTRFS_FS_UPDATE_UUID_TREE_GEN,
	BTRFS_FS_CREATING_FREE_SPACE_TREE,
	BTRFS_FS_BTREE_ERR,
	BTRFS_FS_LOG1_ERR,
	BTRFS_FS_LOG2_ERR,
	BTRFS_FS_QUOTA_OVERRIDE,
	/* Used to record internally whether fs has been frozen */
	BTRFS_FS_FROZEN,
	/*
	 * Indicate that balance has been set up from the ioctl and is in the
	 * main phase. The fs_info::balance_ctl is initialized.
	 */
	BTRFS_FS_BALANCE_RUNNING,

	/*
	 * Indicate that relocation of a chunk has started, it's set per chunk
	 * and is toggled between chunks.
	 */
	BTRFS_FS_RELOC_RUNNING,

	/* Indicate that the cleaner thread is awake and doing something. */
	BTRFS_FS_CLEANER_RUNNING,

	/*
	 * The checksumming has an optimized version and is considered fast,
	 * so we don't need to offload checksums to workqueues.
	 */
	BTRFS_FS_CSUM_IMPL_FAST,

	/* Indicate that the discard workqueue can service discards. */
	BTRFS_FS_DISCARD_RUNNING,

	/* Indicate that we need to cleanup space cache v1 */
	BTRFS_FS_CLEANUP_SPACE_CACHE_V1,

	/* Indicate that we can't trust the free space tree for caching yet */
	BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,

	/* Indicate whether there are any tree modification log users */
	BTRFS_FS_TREE_MOD_LOG_USERS,

	/* Indicate that we want the transaction kthread to commit right now. */
	BTRFS_FS_COMMIT_TRANS,

	/* Indicate we have half completed snapshot deletions pending. */
	BTRFS_FS_UNFINISHED_DROPS,

	/* Indicate we have to finish a zone to do next allocation. */
	BTRFS_FS_NEED_ZONE_FINISH,

	/* Indicate that we want to commit the transaction. */
	BTRFS_FS_NEED_TRANS_COMMIT,

	/* This is set when active zone tracking is needed. */
	BTRFS_FS_ACTIVE_ZONE_TRACKING,

	/*
	 * Indicate if we have some features changed, this is mostly for
	 * cleaner thread to update the sysfs interface.
	 */
	BTRFS_FS_FEATURE_CHANGED,

	/*
	 * Indicate that we have found a tree block which is only aligned to
	 * sectorsize, but not to nodesize. This should be rare nowadays.
	 */
	BTRFS_FS_UNALIGNED_TREE_BLOCK,

	/* The two enumerators below only exist when BITS_PER_LONG == 32. */
#if BITS_PER_LONG == 32
	/* Indicate if we have error/warn message printed on 32bit systems */
	BTRFS_FS_32BIT_ERROR,
	BTRFS_FS_32BIT_WARN,
#endif
};
154 | |
/*
 * Flags for mount options.
 *
 * Each enumerator is a distinct single-bit mask (1UL << n); bits 0 to 29 are
 * in use.
 * NOTE(review): presumably stored in btrfs_fs_info::mount_opt - confirm.
 *
 * Note: don't forget to add new options to btrfs_show_options()
 */
enum {
	BTRFS_MOUNT_NODATASUM			= (1UL << 0),
	BTRFS_MOUNT_NODATACOW			= (1UL << 1),
	BTRFS_MOUNT_NOBARRIER			= (1UL << 2),
	BTRFS_MOUNT_SSD				= (1UL << 3),
	BTRFS_MOUNT_DEGRADED			= (1UL << 4),
	BTRFS_MOUNT_COMPRESS			= (1UL << 5),
	BTRFS_MOUNT_NOTREELOG			= (1UL << 6),
	BTRFS_MOUNT_FLUSHONCOMMIT		= (1UL << 7),
	BTRFS_MOUNT_SSD_SPREAD			= (1UL << 8),
	BTRFS_MOUNT_NOSSD			= (1UL << 9),
	BTRFS_MOUNT_DISCARD_SYNC		= (1UL << 10),
	BTRFS_MOUNT_FORCE_COMPRESS		= (1UL << 11),
	BTRFS_MOUNT_SPACE_CACHE			= (1UL << 12),
	BTRFS_MOUNT_CLEAR_CACHE			= (1UL << 13),
	BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED	= (1UL << 14),
	BTRFS_MOUNT_ENOSPC_DEBUG		= (1UL << 15),
	BTRFS_MOUNT_AUTO_DEFRAG			= (1UL << 16),
	BTRFS_MOUNT_USEBACKUPROOT		= (1UL << 17),
	BTRFS_MOUNT_SKIP_BALANCE		= (1UL << 18),
	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	= (1UL << 19),
	BTRFS_MOUNT_RESCAN_UUID_TREE		= (1UL << 20),
	BTRFS_MOUNT_FRAGMENT_DATA		= (1UL << 21),
	BTRFS_MOUNT_FRAGMENT_METADATA		= (1UL << 22),
	BTRFS_MOUNT_FREE_SPACE_TREE		= (1UL << 23),
	BTRFS_MOUNT_NOLOGREPLAY			= (1UL << 24),
	BTRFS_MOUNT_REF_VERIFY			= (1UL << 25),
	BTRFS_MOUNT_DISCARD_ASYNC		= (1UL << 26),
	BTRFS_MOUNT_IGNOREBADROOTS		= (1UL << 27),
	BTRFS_MOUNT_IGNOREDATACSUMS		= (1UL << 28),
	BTRFS_MOUNT_NODISCARD			= (1UL << 29),
};
192 | |
/*
 * Compat flags that we support.  If any incompat flags are set other than the
 * ones specified below then we will fail to mount
 *
 * No plain compat features are defined here: the supported, safe-to-set and
 * safe-to-clear masks are all empty.
 * NOTE(review): the *_SAFE_SET / *_SAFE_CLEAR masks presumably list the bits
 * that may be changed on a mounted filesystem - confirm against the users.
 */
#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL

/* Mask of the compat_ro feature bits this code supports. */
#define BTRFS_FEATURE_COMPAT_RO_SUPP			\
	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE |	\
	 BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
	 BTRFS_FEATURE_COMPAT_RO_VERITY |		\
	 BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)

/* Both compat_ro safe-set and safe-clear masks are empty. */
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
209 | |
/*
 * Incompat feature bits that are supported regardless of the debug
 * configuration. BTRFS_FEATURE_INCOMPAT_SUPP below is built on top of this
 * mask.
 */
#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE		\
	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD |		\
	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA |	\
	 BTRFS_FEATURE_INCOMPAT_NO_HOLES	|	\
	 BTRFS_FEATURE_INCOMPAT_METADATA_UUID	|	\
	 BTRFS_FEATURE_INCOMPAT_RAID1C34	|	\
	 BTRFS_FEATURE_INCOMPAT_ZONED		|	\
	 BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * Features under development, like raid stripe tree and extent tree v2
	 * support, are enabled only under CONFIG_BTRFS_DEBUG.
	 */
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE |	\
	 BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
	 BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)

#else

/* Without CONFIG_BTRFS_DEBUG only the stable set is supported. */
#define BTRFS_FEATURE_INCOMPAT_SUPP		\
	(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE)

#endif

/* Only EXTENDED_IREF is in the safe-set mask; the safe-clear mask is empty. */
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET			\
	(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR		0ULL
246 | |
/*
 * Default tunables.
 * NOTE(review): presumably the initial values of
 * btrfs_fs_info::commit_interval (seconds?) and btrfs_fs_info::max_inline
 * (bytes?) - units are not visible here, confirm against the users.
 */
#define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
#define BTRFS_DEFAULT_MAX_INLINE	(2048)
249 | |
/*
 * State of a device replace operation. Embedded in struct btrfs_fs_info as
 * the dev_replace member.
 */
struct btrfs_dev_replace {
	/*
	 * See #define above
	 * NOTE(review): no matching #define is visible in this header;
	 * presumably the BTRFS_IOCTL_DEV_REPLACE_STATE_* values - confirm.
	 */
	u64 replace_state;
	/* Seconds since 1-Jan-1970 */
	time64_t time_started;
	/* Seconds since 1-Jan-1970 */
	time64_t time_stopped;
	atomic64_t num_write_errors;
	atomic64_t num_uncorrectable_read_errors;

	u64 cursor_left;
	u64 committed_cursor_left;
	u64 cursor_left_last_write_of_item;
	u64 cursor_right;

	/*
	 * See #define above
	 * NOTE(review): no matching #define is visible in this header;
	 * presumably the *_CONT_READING_FROM_SRCDEV_MODE_* values - confirm.
	 */
	u64 cont_reading_from_srcdev_mode;

	int is_valid;
	int item_needs_writeback;
	/* Source and target device of the replace operation. */
	struct btrfs_device *srcdev;
	struct btrfs_device *tgtdev;

	struct mutex lock_finishing_cancel_unmount;
	struct rw_semaphore rwsem;

	struct btrfs_scrub_progress scrub_progress;

	struct percpu_counter bio_counter;
	wait_queue_head_t replace_wait;
};
281 | |
/*
 * Free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes. They are used for all metadata
 * allocations. In ssd_spread mode they are also used for data allocations.
 *
 * struct btrfs_fs_info embeds two of these: data_alloc_cluster and
 * meta_alloc_cluster.
 */
struct btrfs_free_cluster {
	spinlock_t lock;
	spinlock_t refill_lock;
	struct rb_root root;

	/* Largest extent in this cluster */
	u64 max_size;

	/* First extent starting offset */
	u64 window_start;

	/* We did a full search and couldn't create a cluster */
	bool fragmented;

	/* Block group this cluster was allocated from (see the list below). */
	struct btrfs_block_group *block_group;
	/*
	 * When a cluster is allocated from a block group, we put the cluster
	 * onto a list in the block group so that it can be freed before the
	 * block group is freed.
	 */
	struct list_head block_group_list;
};
309 | |
/* Discard control. */
/*
 * Async discard uses multiple lists to differentiate the discard filter
 * parameters.  Index 0 is for completely free block groups where we need to
 * ensure the entire block group is trimmed without being lossy.  Indices
 * afterwards represent monotonically decreasing discard filter sizes to
 * prioritize what should be discarded next.
 *
 * BTRFS_DISCARD_INDEX_UNUSED names index 0 and BTRFS_DISCARD_INDEX_START is
 * the first of the indices that follow it.
 */
#define BTRFS_NR_DISCARD_LISTS		3
#define BTRFS_DISCARD_INDEX_UNUSED	0
#define BTRFS_DISCARD_INDEX_START	1

/*
 * Bookkeeping for async discard. Embedded in struct btrfs_fs_info as the
 * discard_ctl member.
 */
struct btrfs_discard_ctl {
	struct workqueue_struct *discard_workers;
	struct delayed_work work;
	spinlock_t lock;
	struct btrfs_block_group *block_group;
	/* One list per discard index, see the comment above the defines. */
	struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
	u64 prev_discard;
	u64 prev_discard_time;
	atomic_t discardable_extents;
	atomic64_t discardable_bytes;
	u64 max_discard_size;
	u64 delay_ms;
	u32 iops_limit;
	u32 kbps_limit;
	u64 discard_extent_bytes;
	u64 discard_bitmap_bytes;
	atomic64_t discard_bytes_saved;
};
340 | |
/*
 * Exclusive operations (device replace, resize, device add/remove, balance)
 *
 * BTRFS_EXCLOP_NONE is the first enumerator and therefore 0. The currently
 * running type is kept in btrfs_fs_info::exclusive_operation, which is
 * documented there as protected by super_lock.
 */
enum btrfs_exclusive_operation {
	BTRFS_EXCLOP_NONE,
	BTRFS_EXCLOP_BALANCE_PAUSED,
	BTRFS_EXCLOP_BALANCE,
	BTRFS_EXCLOP_DEV_ADD,
	BTRFS_EXCLOP_DEV_REMOVE,
	BTRFS_EXCLOP_DEV_REPLACE,
	BTRFS_EXCLOP_RESIZE,
	BTRFS_EXCLOP_SWAP_ACTIVATE,
};
354 | |
/*
 * Store data about transaction commits, exported via sysfs.
 *
 * Embedded in struct btrfs_fs_info as commit_stats, where it is documented
 * that updates are not protected by any lock. All durations are in
 * nanoseconds.
 */
struct btrfs_commit_stats {
	/* Total number of commits */
	u64 commit_count;
	/* The maximum commit duration so far in ns */
	u64 max_commit_dur;
	/* The last commit duration in ns */
	u64 last_commit_dur;
	/* The total commit duration in ns */
	u64 total_commit_dur;
};
366 | |
/*
 * Central in-memory bookkeeping of a filesystem: tree roots, block
 * reservations, transaction state, locks, worker pools and the per-feature
 * state for balance, scrub, qgroups, discard, device replace and zoned mode.
 */
struct btrfs_fs_info {
	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
	/*
	 * NOTE(review): presumably a bitmap indexed by the BTRFS_FS_* enum
	 * values - confirm against the users.
	 */
	unsigned long flags;
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	struct btrfs_root *dev_root;
	struct btrfs_root *fs_root;
	struct btrfs_root *quota_root;
	struct btrfs_root *uuid_root;
	struct btrfs_root *data_reloc_root;
	struct btrfs_root *block_group_root;
	struct btrfs_root *stripe_root;

	/* The log root tree is a directory of all the other log roots */
	struct btrfs_root *log_root_tree;

	/* The tree that holds the global roots (csum, extent, etc) */
	rwlock_t global_root_lock;
	struct rb_root global_root_tree;

	spinlock_t fs_roots_radix_lock;
	struct radix_tree_root fs_roots_radix;

	/* Block group cache stuff */
	rwlock_t block_group_cache_lock;
	struct rb_root_cached block_group_cache_tree;

	/* Keep track of unallocated space */
	atomic64_t free_chunk_space;

	/* Track ranges which are used by log trees blocks/logged data extents */
	struct extent_io_tree excluded_extents;

	/* logical->physical extent mapping */
	struct extent_map_tree mapping_tree;

	/*
	 * Block reservation for extent, checksum, root tree and delayed dir
	 * index item.
	 */
	struct btrfs_block_rsv global_block_rsv;
	/* Block reservation for metadata operations */
	struct btrfs_block_rsv trans_block_rsv;
	/* Block reservation for chunk tree */
	struct btrfs_block_rsv chunk_block_rsv;
	/* Block reservation for delayed operations */
	struct btrfs_block_rsv delayed_block_rsv;
	/* Block reservation for delayed refs */
	struct btrfs_block_rsv delayed_refs_rsv;

	struct btrfs_block_rsv empty_block_rsv;

	/*
	 * Updated while holding the lock 'trans_lock'. Due to the life cycle of
	 * a transaction, it can be directly read while holding a transaction
	 * handle, everywhere else must be read with btrfs_get_fs_generation().
	 * Should always be updated using btrfs_set_fs_generation().
	 */
	u64 generation;
	/*
	 * Always use btrfs_get_last_trans_committed() and
	 * btrfs_set_last_trans_committed() to read and update this field.
	 */
	u64 last_trans_committed;
	/*
	 * Generation of the last transaction used for block group relocation
	 * since the filesystem was last mounted (or 0 if none happened yet).
	 * Must be written and read while holding btrfs_fs_info::commit_root_sem.
	 */
	u64 last_reloc_trans;

	/*
	 * This is updated to the current trans every time a full commit is
	 * required instead of the faster short fsync log commits
	 */
	u64 last_trans_log_full_commit;
	/*
	 * NOTE(review): presumably a mask of the BTRFS_MOUNT_* flags -
	 * confirm against the users.
	 */
	unsigned long mount_opt;

	/* 4-bit bitfield, so only values 0 to 15 can be represented. */
	unsigned long compress_type:4;
	unsigned int compress_level;
	u32 commit_interval;
	/*
	 * It is a suggestive number, the read side is safe even if it gets a
	 * wrong number because we will write out the data into a regular
	 * extent. The write side (mount/remount) is under ->s_umount lock,
	 * so it is also safe.
	 */
	u64 max_inline;

	struct btrfs_transaction *running_transaction;
	wait_queue_head_t transaction_throttle;
	wait_queue_head_t transaction_wait;
	wait_queue_head_t transaction_blocked_wait;
	wait_queue_head_t async_submit_wait;

	/*
	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
	 * when they are updated.
	 *
	 * Because the flags are never cleared, we do not need to use the lock
	 * on the read side.
	 * NOTE(review): __btrfs_clear_fs_incompat() and
	 * __btrfs_clear_fs_compat_ro() are declared in this header, so the
	 * "never cleared" statement should be re-checked.
	 *
	 * We also do not need the lock when we mount the fs, because there is
	 * no other task which will update the flags.
	 */
	spinlock_t super_lock;
	struct btrfs_super_block *super_copy;
	struct btrfs_super_block *super_for_commit;
	struct super_block *sb;
	struct inode *btree_inode;
	struct mutex tree_log_mutex;
	struct mutex transaction_kthread_mutex;
	struct mutex cleaner_mutex;
	struct mutex chunk_mutex;

	/*
	 * This is taken to make sure we don't set block groups ro after the
	 * free space cache has been allocated on them.
	 */
	struct mutex ro_block_group_mutex;

	/*
	 * This is used during read/modify/write to make sure no two ios are
	 * trying to mod the same stripe at the same time.
	 */
	struct btrfs_stripe_hash_table *stripe_hash_table;

	/*
	 * This protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make sure the
	 * commit code doesn't find the list temporarily empty because another
	 * function happens to be doing non-waiting preflush before jumping
	 * into the main commit.
	 */
	struct mutex ordered_operations_mutex;

	struct rw_semaphore commit_root_sem;

	struct rw_semaphore cleanup_work_sem;

	struct rw_semaphore subvol_sem;

	spinlock_t trans_lock;
	/*
	 * The reloc mutex goes with the trans lock, it is taken during commit
	 * to protect us from the relocation code.
	 */
	struct mutex reloc_mutex;

	struct list_head trans_list;
	struct list_head dead_roots;
	struct list_head caching_block_groups;

	spinlock_t delayed_iput_lock;
	struct list_head delayed_iputs;
	atomic_t nr_delayed_iputs;
	wait_queue_head_t delayed_iputs_wait;

	atomic64_t tree_mod_seq;

	/* This protects tree_mod_log and tree_mod_seq_list */
	rwlock_t tree_mod_log_lock;
	struct rb_root tree_mod_log;
	struct list_head tree_mod_seq_list;

	atomic_t async_delalloc_pages;

	/* This is used to protect the following list -- ordered_roots. */
	spinlock_t ordered_root_lock;

	/*
	 * All fs/file tree roots in which there are data=ordered extents
	 * pending writeback are added into this list.
	 *
	 * These can span multiple transactions and basically include every
	 * dirty data page that isn't from nodatacow.
	 */
	struct list_head ordered_roots;

	struct mutex delalloc_root_mutex;
	spinlock_t delalloc_root_lock;
	/* All fs/file tree roots that have delalloc inodes. */
	struct list_head delalloc_roots;

	/*
	 * There is a pool of worker threads for checksumming during writes and
	 * a pool for checksumming after reads.  This is because readers can
	 * run with FS locks held, and the writers may be waiting for those
	 * locks.  We don't want ordering in the pending list to cause
	 * deadlocks, and so the two are serviced separately.
	 *
	 * A third pool does submit_bio to avoid deadlocking with the other two.
	 */
	struct btrfs_workqueue *workers;
	struct btrfs_workqueue *delalloc_workers;
	struct btrfs_workqueue *flush_workers;
	struct workqueue_struct *endio_workers;
	struct workqueue_struct *endio_meta_workers;
	struct workqueue_struct *rmw_workers;
	struct workqueue_struct *compressed_write_workers;
	struct btrfs_workqueue *endio_write_workers;
	struct btrfs_workqueue *endio_freespace_worker;
	struct btrfs_workqueue *caching_workers;

	/*
	 * Fixup workers take dirty pages that didn't properly go through the
	 * cow mechanism and make them safe to write.  It happens for the
	 * sys_munmap function call path.
	 */
	struct btrfs_workqueue *fixup_workers;
	struct btrfs_workqueue *delayed_workers;

	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
	u32 thread_pool_size;

	struct kobject *space_info_kobj;
	struct kobject *qgroups_kobj;
	struct kobject *discard_kobj;

	/* Used to keep from writing metadata until there is a nice batch */
	struct percpu_counter dirty_metadata_bytes;
	struct percpu_counter delalloc_bytes;
	struct percpu_counter ordered_bytes;
	s32 dirty_metadata_batch;
	s32 delalloc_batch;

	/* Protected by 'trans_lock'. */
	struct list_head dirty_cowonly_roots;

	struct btrfs_fs_devices *fs_devices;

	/*
	 * The space_info list is effectively read only after initial setup.
	 * It is populated at mount time and cleaned up after all block groups
	 * are removed.  RCU is used to protect it.
	 */
	struct list_head space_info;

	struct btrfs_space_info *data_sinfo;

	struct reloc_control *reloc_ctl;

	/* data_alloc_cluster is only used in ssd_spread mode */
	struct btrfs_free_cluster data_alloc_cluster;

	/* All metadata allocations go through this cluster. */
	struct btrfs_free_cluster meta_alloc_cluster;

	/* Auto defrag inodes go here. */
	spinlock_t defrag_inodes_lock;
	struct rb_root defrag_inodes;
	atomic_t defrag_running;

	/* Used to protect avail_{data, metadata, system}_alloc_bits */
	seqlock_t profiles_lock;
	/*
	 * These three are in extended format (availability of single chunks is
	 * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
	 * by corresponding BTRFS_BLOCK_GROUP_* bits)
	 */
	u64 avail_data_alloc_bits;
	u64 avail_metadata_alloc_bits;
	u64 avail_system_alloc_bits;

	/* Balance state */
	spinlock_t balance_lock;
	struct mutex balance_mutex;
	atomic_t balance_pause_req;
	atomic_t balance_cancel_req;
	struct btrfs_balance_control *balance_ctl;
	wait_queue_head_t balance_wait_q;

	/* Cancellation requests for chunk relocation */
	atomic_t reloc_cancel_req;

	u32 data_chunk_allocations;
	u32 metadata_ratio;

	void *bdev_holder;

	/* Private scrub information */
	struct mutex scrub_lock;
	atomic_t scrubs_running;
	atomic_t scrub_pause_req;
	atomic_t scrubs_paused;
	atomic_t scrub_cancel_req;
	wait_queue_head_t scrub_pause_wait;
	/*
	 * The worker pointers are NULL iff the refcount is 0, ie. scrub is not
	 * running.
	 */
	refcount_t scrub_workers_refcnt;
	struct workqueue_struct *scrub_workers;
	struct btrfs_subpage_info *subpage_info;

	struct btrfs_discard_ctl discard_ctl;

	/* Is qgroup tracking in a consistent state? */
	u64 qgroup_flags;

	/* Holds configuration and tracking. Protected by qgroup_lock. */
	struct rb_root qgroup_tree;
	spinlock_t qgroup_lock;

	/*
	 * Used to avoid frequently calling ulist_alloc()/ulist_free()
	 * when doing qgroup accounting, it must be protected by qgroup_lock.
	 */
	struct ulist *qgroup_ulist;

	/*
	 * Protect user change for quota operations. If a transaction is needed,
	 * it must be started before locking this lock.
	 */
	struct mutex qgroup_ioctl_lock;

	/* List of dirty qgroups to be written at next commit. */
	struct list_head dirty_qgroups;

	/* Used by qgroup for an efficient tree traversal. */
	u64 qgroup_seq;

	/* Qgroup rescan items. */
	/* Protects the progress item */
	struct mutex qgroup_rescan_lock;
	struct btrfs_key qgroup_rescan_progress;
	struct btrfs_workqueue *qgroup_rescan_workers;
	struct completion qgroup_rescan_completion;
	struct btrfs_work qgroup_rescan_work;
	/* Protected by qgroup_rescan_lock */
	bool qgroup_rescan_running;
	u8 qgroup_drop_subtree_thres;
	u64 qgroup_enable_gen;

	/*
	 * If this is not 0, then it indicates a serious filesystem error has
	 * happened and it contains that error (negative errno value).
	 */
	int fs_error;

	/*
	 * Filesystem state
	 * NOTE(review): presumably a bitmap indexed by the BTRFS_FS_STATE_*
	 * enum values - confirm against the users.
	 */
	unsigned long fs_state;

	struct btrfs_delayed_root *delayed_root;

	/* Extent buffer radix tree */
	spinlock_t buffer_lock;
	/* Entries are eb->start / sectorsize */
	struct radix_tree_root buffer_radix;

	/* Next backup root to be overwritten */
	int backup_root_index;

	/* Device replace state */
	struct btrfs_dev_replace dev_replace;

	struct semaphore uuid_tree_rescan_sem;

	/* Used to reclaim the metadata space in the background. */
	struct work_struct async_reclaim_work;
	struct work_struct async_data_reclaim_work;
	struct work_struct preempt_reclaim_work;

	/* Reclaim partially filled block groups in the background */
	struct work_struct reclaim_bgs_work;
	struct list_head reclaim_bgs;
	int bg_reclaim_threshold;

	spinlock_t unused_bgs_lock;
	struct list_head unused_bgs;
	struct mutex unused_bg_unpin_mutex;
	/* Protect block groups that are going to be deleted */
	struct mutex reclaim_bgs_lock;

	/* Cached block sizes */
	u32 nodesize;
	u32 sectorsize;
	/* ilog2 of sectorsize, use to avoid 64bit division */
	u32 sectorsize_bits;
	u32 csum_size;
	u32 csums_per_leaf;
	u32 stripesize;

	/*
	 * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
	 * filesystem, on zoned it depends on the device constraints.
	 */
	u64 max_extent_size;

	/* Block groups and devices containing active swapfiles. */
	spinlock_t swapfile_pins_lock;
	struct rb_root swapfile_pins;

	struct crypto_shash *csum_shash;

	/* Type of exclusive operation running, protected by super_lock */
	enum btrfs_exclusive_operation exclusive_operation;

	/*
	 * Zone size > 0 when in ZONED mode, otherwise it's used for a check
	 * if the mode is enabled
	 */
	u64 zone_size;

	/* Constraints for ZONE_APPEND commands: */
	struct queue_limits limits;
	u64 max_zone_append_size;

	struct mutex zoned_meta_io_lock;
	spinlock_t treelog_bg_lock;
	u64 treelog_bg;

	/*
	 * Start of the dedicated data relocation block group, protected by
	 * relocation_bg_lock.
	 */
	spinlock_t relocation_bg_lock;
	u64 data_reloc_bg;
	struct mutex zoned_data_reloc_io_lock;

	struct btrfs_block_group *active_meta_bg;
	struct btrfs_block_group *active_system_bg;

	u64 nr_global_roots;

	spinlock_t zone_active_bgs_lock;
	struct list_head zone_active_bgs;

	/* Updates are not protected by any lock */
	struct btrfs_commit_stats commit_stats;

	/*
	 * Last generation where we dropped a non-relocation root.
	 * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
	 * to change it and to read it, respectively.
	 */
	u64 last_root_drop_gen;

	/*
	 * Annotations for transaction events (structures are empty when
	 * compiled without lockdep).
	 */
	struct lockdep_map btrfs_trans_num_writers_map;
	struct lockdep_map btrfs_trans_num_extwriters_map;
	struct lockdep_map btrfs_state_change_map[4];
	struct lockdep_map btrfs_trans_pending_ordered_map;
	struct lockdep_map btrfs_ordered_extent_map;

	/* Members below exist only with CONFIG_BTRFS_FS_REF_VERIFY. */
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	spinlock_t ref_verify_lock;
	struct rb_root block_tree;
#endif

	/* Members below exist only with CONFIG_BTRFS_DEBUG. */
#ifdef CONFIG_BTRFS_DEBUG
	struct kobject *debug_kobj;
	struct list_head allocated_roots;

	spinlock_t eb_leak_lock;
	struct list_head allocated_ebs;
#endif
};
829 | |
830 | static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) |
831 | { |
832 | return READ_ONCE(fs_info->generation); |
833 | } |
834 | |
835 | static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) |
836 | { |
837 | WRITE_ONCE(fs_info->generation, gen); |
838 | } |
839 | |
840 | static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) |
841 | { |
842 | return READ_ONCE(fs_info->last_trans_committed); |
843 | } |
844 | |
845 | static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) |
846 | { |
847 | WRITE_ONCE(fs_info->last_trans_committed, gen); |
848 | } |
849 | |
850 | static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, |
851 | u64 gen) |
852 | { |
853 | WRITE_ONCE(fs_info->last_root_drop_gen, gen); |
854 | } |
855 | |
856 | static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) |
857 | { |
858 | return READ_ONCE(fs_info->last_root_drop_gen); |
859 | } |
860 | |
861 | /* |
862 | * Take the number of bytes to be checksummed and figure out how many leaves |
863 | * it would require to store the csums for that many bytes. |
864 | */ |
865 | static inline u64 btrfs_csum_bytes_to_leaves( |
866 | const struct btrfs_fs_info *fs_info, u64 csum_bytes) |
867 | { |
868 | const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; |
869 | |
870 | return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); |
871 | } |
872 | |
873 | /* |
874 | * Use this if we would be adding new items, as we could split nodes as we cow |
875 | * down the tree. |
876 | */ |
877 | static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, |
878 | unsigned num_items) |
879 | { |
880 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; |
881 | } |
882 | |
883 | /* |
884 | * Doing a truncate or a modification won't result in new nodes or leaves, just |
885 | * what we need for COW. |
886 | */ |
887 | static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, |
888 | unsigned num_items) |
889 | { |
890 | return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; |
891 | } |
892 | |
/*
 * Largest extent item size for root @r: one sixteenth of the leaf data size
 * of r->fs_info, minus the size of a struct btrfs_item.
 *
 * @r is parenthesized so that any pointer expression (e.g. &roots[i] or
 * p + 1) can be passed, not just a plain identifier.
 */
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE((r)->fs_info) >> 4) - \
					sizeof(struct btrfs_item))
895 | |
896 | static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) |
897 | { |
898 | return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; |
899 | } |
900 | |
901 | /* |
902 | * Count how many fs_info->max_extent_size cover the @size |
903 | */ |
904 | static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) |
905 | { |
906 | #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS |
907 | if (!fs_info) |
908 | return div_u64(dividend: size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); |
909 | #endif |
910 | |
911 | return div_u64(dividend: size + fs_info->max_extent_size - 1, divisor: fs_info->max_extent_size); |
912 | } |
913 | |
/*
 * Exclusive operation helpers.  Only the prototypes live in this header; see
 * the definitions for the exact semantics.
 */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
			enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
				 enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
			  enum btrfs_exclusive_operation op);
922 | |
923 | /* Compatibility and incompatibility defines */ |
924 | void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, |
925 | const char *name); |
926 | void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, |
927 | const char *name); |
928 | void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, |
929 | const char *name); |
930 | void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, |
931 | const char *name); |
932 | |
/*
 * Test feature bits in (fs_info)->super_copy.
 *
 * Both macros evaluate to 1 if any bit of @flags is set in the corresponding
 * super block flags field and to 0 otherwise.
 */
#define __btrfs_fs_incompat(fs_info, flags)				\
	((btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)) != 0)

#define __btrfs_fs_compat_ro(fs_info, flags)				\
	((btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)) != 0)

/*
 * Wrappers taking the short feature name @opt.  The name is pasted onto
 * BTRFS_FEATURE_INCOMPAT_ to form the flag constant; the set and clear
 * variants additionally pass the name as a string.
 */
#define btrfs_set_fs_incompat(__fs_info, opt)				\
	__btrfs_set_fs_incompat((__fs_info),				\
				BTRFS_FEATURE_INCOMPAT_##opt, #opt)

#define btrfs_clear_fs_incompat(__fs_info, opt)				\
	__btrfs_clear_fs_incompat((__fs_info),				\
				  BTRFS_FEATURE_INCOMPAT_##opt, #opt)

#define btrfs_fs_incompat(fs_info, opt)					\
	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)

/* Same as above, with the name pasted onto BTRFS_FEATURE_COMPAT_RO_. */
#define btrfs_set_fs_compat_ro(__fs_info, opt)				\
	__btrfs_set_fs_compat_ro((__fs_info),				\
				 BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

#define btrfs_clear_fs_compat_ro(__fs_info, opt)			\
	__btrfs_clear_fs_compat_ro((__fs_info),				\
				   BTRFS_FEATURE_COMPAT_RO_##opt, #opt)

#define btrfs_fs_compat_ro(fs_info, opt)				\
	__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
956 | |
/*
 * Mount option helpers.  @opt is the short option name; it is pasted onto
 * BTRFS_MOUNT_ to form the bit constant.
 *
 * btrfs_clear_opt() and btrfs_set_opt() modify the option word @o in place.
 * btrfs_raw_test_opt() and btrfs_test_opt() evaluate to the masked value (not
 * normalized to 0/1) of @o and of (fs_info)->mount_opt respectively.
 */
#define btrfs_clear_opt(o, opt)						\
	((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt)						\
	((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt)					\
	((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt)					\
	((fs_info)->mount_opt & BTRFS_MOUNT_##opt)
962 | |
/*
 * Set mount option @opt in (fs_info)->mount_opt.  If the option was not set
 * before, the message @fmt (with its arguments) is first emitted through
 * btrfs_info().
 *
 * @fs_info is evaluated more than once, so it must not have side effects.  It
 * is parenthesized before the member access so that any pointer expression
 * can be passed.
 */
#define btrfs_set_and_info(fs_info, opt, fmt, args...)			\
do {									\
	if (!btrfs_test_opt(fs_info, opt))				\
		btrfs_info(fs_info, fmt, ##args);			\
	btrfs_set_opt((fs_info)->mount_opt, opt);			\
} while (0)
969 | |
/*
 * Clear mount option @opt in (fs_info)->mount_opt.  If the option was set
 * before, the message @fmt (with its arguments) is first emitted through
 * btrfs_info().
 *
 * @fs_info is evaluated more than once, so it must not have side effects.  It
 * is parenthesized before the member access so that any pointer expression
 * can be passed.
 */
#define btrfs_clear_and_info(fs_info, opt, fmt, args...)		\
do {									\
	if (btrfs_test_opt(fs_info, opt))				\
		btrfs_info(fs_info, fmt, ##args);			\
	btrfs_clear_opt((fs_info)->mount_opt, opt);			\
} while (0)
976 | |
977 | static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) |
978 | { |
979 | /* Do it this way so we only ever do one test_bit in the normal case. */ |
980 | if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { |
981 | if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) |
982 | return 2; |
983 | return 1; |
984 | } |
985 | return 0; |
986 | } |
987 | |
988 | /* |
989 | * If we remount the fs to be R/O or umount the fs, the cleaner needn't do |
990 | * anything except sleeping. This function is used to check the status of |
991 | * the fs. |
992 | * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, |
993 | * since setting and checking for SB_RDONLY in the superblock's flags is not |
994 | * atomic. |
995 | */ |
996 | static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) |
997 | { |
998 | return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || |
999 | btrfs_fs_closing(fs_info); |
1000 | } |
1001 | |
1002 | static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) |
1003 | { |
1004 | clear_and_wake_up_bit(bit: BTRFS_FS_UNFINISHED_DROPS, word: &fs_info->flags); |
1005 | } |
1006 | |
/* Evaluates to (fs_info)->fs_error, loaded with READ_ONCE(). */
#define BTRFS_FS_ERROR(fs_info)						\
	(READ_ONCE((fs_info)->fs_error))

/*
 * Evaluates to non-zero if BTRFS_FS_STATE_LOG_CLEANUP_ERROR is set in
 * (fs_info)->fs_state.  The test is annotated as unlikely().
 */
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info)				\
	(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,		\
			   &(fs_info)->fs_state)))
1012 | |
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

/* With the sanity tests built in, EXPORT_FOR_TESTS expands to nothing. */
#define EXPORT_FOR_TESTS

/*
 * Return non-zero if BTRFS_FS_STATE_DUMMY_FS_INFO is set in
 * fs_info->fs_state.
 */
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
	return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}

void btrfs_test_destroy_inode(struct inode *inode);

#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */

/* Without the sanity tests, EXPORT_FOR_TESTS expands to "static". */
#define EXPORT_FOR_TESTS static

/* Without the sanity tests this always returns 0; @fs_info is not used. */
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
	return 0;
}

#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
1033 | |
1034 | #endif |
1035 | |