block-group.c source code [linux/fs/btrfs/block-group.c]

1	// SPDX-License-Identifier: GPL-2.0
2
3	#include <linux/sizes.h>
4	#include <linux/list_sort.h>
5	#include "misc.h"
6	#include "ctree.h"
7	#include "block-group.h"
8	#include "space-info.h"
9	#include "disk-io.h"
10	#include "free-space-cache.h"
11	#include "free-space-tree.h"
12	#include "volumes.h"
13	#include "transaction.h"
14	#include "ref-verify.h"
15	#include "sysfs.h"
16	#include "tree-log.h"
17	#include "delalloc-space.h"
18	#include "discard.h"
19	#include "raid56.h"
20	#include "zoned.h"
21	#include "fs.h"
22	#include "accessors.h"
23	#include "extent-tree.h"
24
25	#ifdef CONFIG_BTRFS_DEBUG
26	int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
27	{
28	struct btrfs_fs_info *fs_info = block_group->fs_info;
29
30	return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
31	block_group->flags & BTRFS_BLOCK_GROUP_METADATA) \|\|
32	(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
33	block_group->flags & BTRFS_BLOCK_GROUP_DATA);
34	}
35	#endif
36
37	/*
38	* Return target flags in extended format or 0 if restripe for this chunk_type
39	* is not in progress
40	*
41	* Should be called with balance_lock held
42	*/
43	static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
44	{
45	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
46	u64 target = `0`;
47
48	if (!bctl)
49	return `0`;
50
51	if (flags & BTRFS_BLOCK_GROUP_DATA &&
52	bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
53	target = BTRFS_BLOCK_GROUP_DATA \| bctl->data.target;
54	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
55	bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
56	target = BTRFS_BLOCK_GROUP_SYSTEM \| bctl->sys.target;
57	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
58	bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
59	target = BTRFS_BLOCK_GROUP_METADATA \| bctl->meta.target;
60	}
61
62	return target;
63	}
64
65	/*
66	* @flags: available profiles in extended format (see ctree.h)
67	*
68	* Return reduced profile in chunk format. If profile changing is in progress
69	* (either running or paused) picks the target profile (if it's already
70	* available), otherwise falls back to plain reducing.
71	*/
72	static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
73	{
74	u64 num_devices = fs_info->fs_devices->rw_devices;
75	u64 target;
76	u64 raid_type;
77	u64 allowed = `0`;
78
79	/*
80	* See if restripe for this chunk_type is in progress, if so try to
81	* reduce to the target profile
82	*/
83	spin_lock(lock: &fs_info->balance_lock);
84	target = get_restripe_target(fs_info, flags);
85	if (target) {
86	spin_unlock(lock: &fs_info->balance_lock);
87	return extended_to_chunk(flags: target);
88	}
89	spin_unlock(lock: &fs_info->balance_lock);
90
91	/ First, mask out the RAID levels which aren't possible /
92	for (raid_type = `0`; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
93	if (num_devices >= btrfs_raid_array[raid_type].devs_min)
94	allowed \|= btrfs_raid_array[raid_type].bg_flag;
95	}
96	allowed &= flags;
97
98	/ Select the highest-redundancy RAID level. /
99	if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
100	allowed = BTRFS_BLOCK_GROUP_RAID1C4;
101	else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
102	allowed = BTRFS_BLOCK_GROUP_RAID6;
103	else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
104	allowed = BTRFS_BLOCK_GROUP_RAID1C3;
105	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
106	allowed = BTRFS_BLOCK_GROUP_RAID5;
107	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
108	allowed = BTRFS_BLOCK_GROUP_RAID10;
109	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
110	allowed = BTRFS_BLOCK_GROUP_RAID1;
111	else if (allowed & BTRFS_BLOCK_GROUP_DUP)
112	allowed = BTRFS_BLOCK_GROUP_DUP;
113	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
114	allowed = BTRFS_BLOCK_GROUP_RAID0;
115
116	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
117
118	return extended_to_chunk(flags: flags \| allowed);
119	}
120
121	u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
122	{
123	unsigned seq;
124	u64 flags;
125
126	do {
127	flags = orig_flags;
128	seq = read_seqbegin(sl: &fs_info->profiles_lock);
129
130	if (flags & BTRFS_BLOCK_GROUP_DATA)
131	flags \|= fs_info->avail_data_alloc_bits;
132	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
133	flags \|= fs_info->avail_system_alloc_bits;
134	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
135	flags \|= fs_info->avail_metadata_alloc_bits;
136	} while (read_seqretry(sl: &fs_info->profiles_lock, start: seq));
137
138	return btrfs_reduce_alloc_profile(fs_info, flags);
139	}
140
141	void btrfs_get_block_group(struct btrfs_block_group *cache)
142	{
143	refcount_inc(r: &cache->refs);
144	}
145
146	void btrfs_put_block_group(struct btrfs_block_group *cache)
147	{
148	if (refcount_dec_and_test(r: &cache->refs)) {
149	WARN_ON(cache->pinned > `0`);
150	/*
151	* If there was a failure to cleanup a log tree, very likely due
152	* to an IO failure on a writeback attempt of one or more of its
153	* extent buffers, we could not do proper (and cheap) unaccounting
154	* of their reserved space, so don't warn on reserved > 0 in that
155	* case.
156	*/
157	if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) \|\|
158	!BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
159	WARN_ON(cache->reserved > `0`);
160
161	/*
162	* A block_group shouldn't be on the discard_list anymore.
163	* Remove the block_group from the discard_list to prevent us
164	* from causing a panic due to NULL pointer dereference.
165	*/
166	if (WARN_ON(!list_empty(&cache->discard_list)))
167	btrfs_discard_cancel_work(discard_ctl: &cache->fs_info->discard_ctl,
168	block_group: cache);
169
170	kfree(objp: cache->free_space_ctl);
171	btrfs_free_chunk_map(map: cache->physical_map);
172	kfree(objp: cache);
173	}
174	}
175
176	/*
177	* This adds the block group to the fs_info rb tree for the block group cache
178	*/
179	static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
180	struct btrfs_block_group *block_group)
181	{
182	struct rb_node **p;
183	struct rb_node *parent = NULL;
184	struct btrfs_block_group *cache;
185	bool leftmost = true;
186
187	ASSERT(block_group->length != `0`);
188
189	write_lock(&info->block_group_cache_lock);
190	p = &info->block_group_cache_tree.rb_root.rb_node;
191
192	while (*p) {
193	parent = *p;
194	cache = rb_entry(parent, struct btrfs_block_group, cache_node);
195	if (block_group->start < cache->start) {
196	p = &(*p)->rb_left;
197	} else if (block_group->start > cache->start) {
198	p = &(*p)->rb_right;
199	leftmost = false;
200	} else {
201	write_unlock(&info->block_group_cache_lock);
202	return -EEXIST;
203	}
204	}
205
206	rb_link_node(node: &block_group->cache_node, parent, rb_link: p);
207	rb_insert_color_cached(node: &block_group->cache_node,
208	root: &info->block_group_cache_tree, leftmost);
209
210	write_unlock(&info->block_group_cache_lock);
211
212	return `0`;
213	}
214
215	/*
216	* This will return the block group at or after bytenr if contains is 0, else
217	* it will return the block group that contains the bytenr
218	*/
219	static struct btrfs_block_group *block_group_cache_tree_search(
220	struct btrfs_fs_info info, u64 bytenr, int* contains)
221	{
222	struct btrfs_block_group cache, ret = NULL;
223	struct rb_node *n;
224	u64 end, start;
225
226	read_lock(&info->block_group_cache_lock);
227	n = info->block_group_cache_tree.rb_root.rb_node;
228
229	while (n) {
230	cache = rb_entry(n, struct btrfs_block_group, cache_node);
231	end = cache->start + cache->length - `1`;
232	start = cache->start;
233
234	if (bytenr < start) {
235	if (!contains && (!ret \|\| start < ret->start))
236	ret = cache;
237	n = n->rb_left;
238	} else if (bytenr > start) {
239	if (contains && bytenr <= end) {
240	ret = cache;
241	break;
242	}
243	n = n->rb_right;
244	} else {
245	ret = cache;
246	break;
247	}
248	}
249	if (ret)
250	btrfs_get_block_group(cache: ret);
251	read_unlock(&info->block_group_cache_lock);
252
253	return ret;
254	}
255
256	/*
257	* Return the block group that starts at or after bytenr
258	*/
259	struct btrfs_block_group *btrfs_lookup_first_block_group(
260	struct btrfs_fs_info *info, u64 bytenr)
261	{
262	return block_group_cache_tree_search(info, bytenr, contains: `0`);
263	}
264
265	/*
266	* Return the block group that contains the given bytenr
267	*/
268	struct btrfs_block_group *btrfs_lookup_block_group(
269	struct btrfs_fs_info *info, u64 bytenr)
270	{
271	return block_group_cache_tree_search(info, bytenr, contains: `1`);
272	}
273
274	struct btrfs_block_group *btrfs_next_block_group(
275	struct btrfs_block_group *cache)
276	{
277	struct btrfs_fs_info *fs_info = cache->fs_info;
278	struct rb_node *node;
279
280	read_lock(&fs_info->block_group_cache_lock);
281
282	/ If our block group was removed, we need a full search. /
283	if (RB_EMPTY_NODE(&cache->cache_node)) {
284	const u64 next_bytenr = cache->start + cache->length;
285
286	read_unlock(&fs_info->block_group_cache_lock);
287	btrfs_put_block_group(cache);
288	return btrfs_lookup_first_block_group(info: fs_info, bytenr: next_bytenr);
289	}
290	node = rb_next(&cache->cache_node);
291	btrfs_put_block_group(cache);
292	if (node) {
293	cache = rb_entry(node, struct btrfs_block_group, cache_node);
294	btrfs_get_block_group(cache);
295	} else
296	cache = NULL;
297	read_unlock(&fs_info->block_group_cache_lock);
298	return cache;
299	}
300
301	/*
302	* Check if we can do a NOCOW write for a given extent.
303	*
304	* @fs_info: The filesystem information object.
305	* @bytenr: Logical start address of the extent.
306	*
307	* Check if we can do a NOCOW write for the given extent, and increments the
308	* number of NOCOW writers in the block group that contains the extent, as long
309	* as the block group exists and it's currently not in read-only mode.
310	*
311	* Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
312	* is responsible for calling btrfs_dec_nocow_writers() later.
313	*
314	* Or NULL if we can not do a NOCOW write
315	*/
316	struct btrfs_block_group btrfs_inc_nocow_writers(struct* btrfs_fs_info *fs_info,
317	u64 bytenr)
318	{
319	struct btrfs_block_group *bg;
320	bool can_nocow = true;
321
322	bg = btrfs_lookup_block_group(info: fs_info, bytenr);
323	if (!bg)
324	return NULL;
325
326	spin_lock(lock: &bg->lock);
327	if (bg->ro)
328	can_nocow = false;
329	else
330	atomic_inc(v: &bg->nocow_writers);
331	spin_unlock(lock: &bg->lock);
332
333	if (!can_nocow) {
334	btrfs_put_block_group(cache: bg);
335	return NULL;
336	}
337
338	/ No put on block group, done by btrfs_dec_nocow_writers(). /
339	return bg;
340	}
341
342	/*
343	* Decrement the number of NOCOW writers in a block group.
344	*
345	* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
346	* and on the block group returned by that call. Typically this is called after
347	* creating an ordered extent for a NOCOW write, to prevent races with scrub and
348	* relocation.
349	*
350	* After this call, the caller should not use the block group anymore. It it wants
351	* to use it, then it should get a reference on it before calling this function.
352	*/
353	void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
354	{
355	if (atomic_dec_and_test(v: &bg->nocow_writers))
356	wake_up_var(var: &bg->nocow_writers);
357
358	/ For the lookup done by a previous call to btrfs_inc_nocow_writers(). /
359	btrfs_put_block_group(cache: bg);
360	}
361
362	void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
363	{
364	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
365	}
366
367	void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
368	const u64 start)
369	{
370	struct btrfs_block_group *bg;
371
372	bg = btrfs_lookup_block_group(info: fs_info, bytenr: start);
373	ASSERT(bg);
374	if (atomic_dec_and_test(v: &bg->reservations))
375	wake_up_var(var: &bg->reservations);
376	btrfs_put_block_group(cache: bg);
377	}
378
379	void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
380	{
381	struct btrfs_space_info *space_info = bg->space_info;
382
383	ASSERT(bg->ro);
384
385	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
386	return;
387
388	/*
389	* Our block group is read only but before we set it to read only,
390	* some task might have had allocated an extent from it already, but it
391	* has not yet created a respective ordered extent (and added it to a
392	* root's list of ordered extents).
393	* Therefore wait for any task currently allocating extents, since the
394	* block group's reservations counter is incremented while a read lock
395	* on the groups' semaphore is held and decremented after releasing
396	* the read access on that semaphore and creating the ordered extent.
397	*/
398	down_write(sem: &space_info->groups_sem);
399	up_write(sem: &space_info->groups_sem);
400
401	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
402	}
403
404	struct btrfs_caching_control *btrfs_get_caching_control(
405	struct btrfs_block_group *cache)
406	{
407	struct btrfs_caching_control *ctl;
408
409	spin_lock(lock: &cache->lock);
410	if (!cache->caching_ctl) {
411	spin_unlock(lock: &cache->lock);
412	return NULL;
413	}
414
415	ctl = cache->caching_ctl;
416	refcount_inc(r: &ctl->count);
417	spin_unlock(lock: &cache->lock);
418	return ctl;
419	}
420
421	static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
422	{
423	if (refcount_dec_and_test(r: &ctl->count))
424	kfree(objp: ctl);
425	}
426
427	/*
428	* When we wait for progress in the block group caching, its because our
429	* allocation attempt failed at least once. So, we must sleep and let some
430	* progress happen before we try again.
431	*
432	* This function will sleep at least once waiting for new free space to show
433	* up, and then it will check the block group free space numbers for our min
434	* num_bytes. Another option is to have it go ahead and look in the rbtree for
435	* a free extent of a given size, but this is a good start.
436	*
437	* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
438	* any of the information in this block group.
439	*/
440	void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
441	u64 num_bytes)
442	{
443	struct btrfs_caching_control *caching_ctl;
444	int progress;
445
446	caching_ctl = btrfs_get_caching_control(cache);
447	if (!caching_ctl)
448	return;
449
450	/*
451	* We've already failed to allocate from this block group, so even if
452	* there's enough space in the block group it isn't contiguous enough to
453	* allow for an allocation, so wait for at least the next wakeup tick,
454	* or for the thing to be done.
455	*/
456	progress = atomic_read(v: &caching_ctl->progress);
457
458	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) \|\|
459	(progress != atomic_read(&caching_ctl->progress) &&
460	(cache->free_space_ctl->free_space >= num_bytes)));
461
462	btrfs_put_caching_control(ctl: caching_ctl);
463	}
464
465	static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
466	struct btrfs_caching_control *caching_ctl)
467	{
468	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
469	return cache->cached == BTRFS_CACHE_ERROR ? -EIO : `0`;
470	}
471
472	static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
473	{
474	struct btrfs_caching_control *caching_ctl;
475	int ret;
476
477	caching_ctl = btrfs_get_caching_control(cache);
478	if (!caching_ctl)
479	return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : `0`;
480	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
481	btrfs_put_caching_control(ctl: caching_ctl);
482	return ret;
483	}
484
485	#ifdef CONFIG_BTRFS_DEBUG
486	static void fragment_free_space(struct btrfs_block_group *block_group)
487	{
488	struct btrfs_fs_info *fs_info = block_group->fs_info;
489	u64 start = block_group->start;
490	u64 len = block_group->length;
491	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
492	fs_info->nodesize : fs_info->sectorsize;
493	u64 step = chunk << `1`;
494
495	while (len > chunk) {
496	btrfs_remove_free_space(block_group, bytenr: start, size: chunk);
497	start += step;
498	if (len < step)
499	len = `0`;
500	else
501	len -= step;
502	}
503	}
504	#endif
505
506	/*
507	* Add a free space range to the in memory free space cache of a block group.
508	* This checks if the range contains super block locations and any such
509	* locations are not added to the free space cache.
510	*
511	* @block_group: The target block group.
512	* @start: Start offset of the range.
513	* @end: End offset of the range (exclusive).
514	* @total_added_ret: Optional pointer to return the total amount of space
515	* added to the block group's free space cache.
516	*
517	* Returns 0 on success or < 0 on error.
518	*/
519	int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
520	u64 end, u64 *total_added_ret)
521	{
522	struct btrfs_fs_info *info = block_group->fs_info;
523	u64 extent_start, extent_end, size;
524	int ret;
525
526	if (total_added_ret)
527	*total_added_ret = `0`;
528
529	while (start < end) {
530	if (!find_first_extent_bit(tree: &info->excluded_extents, start,
531	start_ret: &extent_start, end_ret: &extent_end,
532	bits: EXTENT_DIRTY \| EXTENT_UPTODATE,
533	NULL))
534	break;
535
536	if (extent_start <= start) {
537	start = extent_end + `1`;
538	} else if (extent_start > start && extent_start < end) {
539	size = extent_start - start;
540	ret = btrfs_add_free_space_async_trimmed(block_group,
541	bytenr: start, size);
542	if (ret)
543	return ret;
544	if (total_added_ret)
545	*total_added_ret += size;
546	start = extent_end + `1`;
547	} else {
548	break;
549	}
550	}
551
552	if (start < end) {
553	size = end - start;
554	ret = btrfs_add_free_space_async_trimmed(block_group, bytenr: start,
555	size);
556	if (ret)
557	return ret;
558	if (total_added_ret)
559	*total_added_ret += size;
560	}
561
562	return `0`;
563	}
564
565	/*
566	* Get an arbitrary extent item index / max_index through the block group
567	*
568	* @block_group the block group to sample from
569	* @index: the integral step through the block group to grab from
570	* @max_index: the granularity of the sampling
571	* @key: return value parameter for the item we find
572	*
573	* Pre-conditions on indices:
574	* 0 <= index <= max_index
575	* 0 < max_index
576	*
577	* Returns: 0 on success, 1 if the search didn't yield a useful item, negative
578	* error code on error.
579	*/
580	static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
581	struct btrfs_block_group *block_group,
582	int index, int max_index,
583	struct btrfs_key *found_key)
584	{
585	struct btrfs_fs_info *fs_info = block_group->fs_info;
586	struct btrfs_root *extent_root;
587	u64 search_offset;
588	u64 search_end = block_group->start + block_group->length;
589	struct btrfs_path *path;
590	struct btrfs_key search_key;
591	int ret = `0`;
592
593	ASSERT(index >= `0`);
594	ASSERT(index <= max_index);
595	ASSERT(max_index > `0`);
596	lockdep_assert_held(&caching_ctl->mutex);
597	lockdep_assert_held_read(&fs_info->commit_root_sem);
598
599	path = btrfs_alloc_path();
600	if (!path)
601	return -ENOMEM;
602
603	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
604	BTRFS_SUPER_INFO_OFFSET));
605
606	path->skip_locking = `1`;
607	path->search_commit_root = `1`;
608	path->reada = READA_FORWARD;
609
610	search_offset = index * div_u64(dividend: block_group->length, divisor: max_index);
611	search_key.objectid = block_group->start + search_offset;
612	search_key.type = BTRFS_EXTENT_ITEM_KEY;
613	search_key.offset = `0`;
614
615	btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
616	/ Success; sampled an extent item in the block group /
617	if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
618	found_key->objectid >= block_group->start &&
619	found_key->objectid + found_key->offset <= search_end)
620	break;
621
622	/ We can't possibly find a valid extent item anymore /
623	if (found_key->objectid >= search_end) {
624	ret = `1`;
625	break;
626	}
627	}
628
629	lockdep_assert_held(&caching_ctl->mutex);
630	lockdep_assert_held_read(&fs_info->commit_root_sem);
631	btrfs_free_path(p: path);
632	return ret;
633	}
634
635	/*
636	* Best effort attempt to compute a block group's size class while caching it.
637	*
638	* @block_group: the block group we are caching
639	*
640	* We cannot infer the size class while adding free space extents, because that
641	* logic doesn't care about contiguous file extents (it doesn't differentiate
642	* between a 100M extent and 100 contiguous 1M extents). So we need to read the
643	* file extent items. Reading all of them is quite wasteful, because usually
644	* only a handful are enough to give a good answer. Therefore, we just grab 5 of
645	* them at even steps through the block group and pick the smallest size class
646	* we see. Since size class is best effort, and not guaranteed in general,
647	* inaccuracy is acceptable.
648	*
649	* To be more explicit about why this algorithm makes sense:
650	*
651	* If we are caching in a block group from disk, then there are three major cases
652	* to consider:
653	* 1. the block group is well behaved and all extents in it are the same size
654	* class.
655	* 2. the block group is mostly one size class with rare exceptions for last
656	* ditch allocations
657	* 3. the block group was populated before size classes and can have a totally
658	* arbitrary mix of size classes.
659	*
660	* In case 1, looking at any extent in the block group will yield the correct
661	* result. For the mixed cases, taking the minimum size class seems like a good
662	* approximation, since gaps from frees will be usable to the size class. For
663	* 2., a small handful of file extents is likely to yield the right answer. For
664	* 3, we can either read every file extent, or admit that this is best effort
665	* anyway and try to stay fast.
666	*
667	* Returns: 0 on success, negative error code on error.
668	*/
669	static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
670	struct btrfs_block_group *block_group)
671	{
672	struct btrfs_fs_info *fs_info = block_group->fs_info;
673	struct btrfs_key key;
674	int i;
675	u64 min_size = block_group->length;
676	enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
677	int ret;
678
679	if (!btrfs_block_group_should_use_size_class(bg: block_group))
680	return `0`;
681
682	lockdep_assert_held(&caching_ctl->mutex);
683	lockdep_assert_held_read(&fs_info->commit_root_sem);
684	for (i = `0`; i < `5`; ++i) {
685	ret = sample_block_group_extent_item(caching_ctl, block_group, index: i, max_index: `5`, found_key: &key);
686	if (ret < `0`)
687	goto out;
688	if (ret > `0`)
689	continue;
690	min_size = min_t(u64, min_size, key.offset);
691	size_class = btrfs_calc_block_group_size_class(size: min_size);
692	}
693	if (size_class != BTRFS_BG_SZ_NONE) {
694	spin_lock(lock: &block_group->lock);
695	block_group->size_class = size_class;
696	spin_unlock(lock: &block_group->lock);
697	}
698	out:
699	return ret;
700	}
701
702	static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
703	{
704	struct btrfs_block_group *block_group = caching_ctl->block_group;
705	struct btrfs_fs_info *fs_info = block_group->fs_info;
706	struct btrfs_root *extent_root;
707	struct btrfs_path *path;
708	struct extent_buffer *leaf;
709	struct btrfs_key key;
710	u64 total_found = `0`;
711	u64 last = `0`;
712	u32 nritems;
713	int ret;
714	bool wakeup = true;
715
716	path = btrfs_alloc_path();
717	if (!path)
718	return -ENOMEM;
719
720	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
721	extent_root = btrfs_extent_root(fs_info, bytenr: last);
722
723	#ifdef CONFIG_BTRFS_DEBUG
724	/*
725	* If we're fragmenting we don't want to make anybody think we can
726	* allocate from this block group until we've had a chance to fragment
727	* the free space.
728	*/
729	if (btrfs_should_fragment_free_space(block_group))
730	wakeup = false;
731	#endif
732	/*
733	* We don't want to deadlock with somebody trying to allocate a new
734	* extent for the extent root while also trying to search the extent
735	* root to add free space. So we skip locking and search the commit
736	* root, since its read-only
737	*/
738	path->skip_locking = `1`;
739	path->search_commit_root = `1`;
740	path->reada = READA_FORWARD;
741
742	key.objectid = last;
743	key.offset = `0`;
744	key.type = BTRFS_EXTENT_ITEM_KEY;
745
746	next:
747	ret = btrfs_search_slot(NULL, root: extent_root, key: &key, p: path, ins_len: `0`, cow: `0`);
748	if (ret < `0`)
749	goto out;
750
751	leaf = path->nodes[`0`];
752	nritems = btrfs_header_nritems(eb: leaf);
753
754	while (`1`) {
755	if (btrfs_fs_closing(fs_info) > `1`) {
756	last = (u64)-`1`;
757	break;
758	}
759
760	if (path->slots[`0`] < nritems) {
761	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
762	} else {
763	ret = btrfs_find_next_key(root: extent_root, path, key: &key, lowest_level: `0`, min_trans: `0`);
764	if (ret)
765	break;
766
767	if (need_resched() \|\|
768	rwsem_is_contended(sem: &fs_info->commit_root_sem)) {
769	btrfs_release_path(p: path);
770	up_read(sem: &fs_info->commit_root_sem);
771	mutex_unlock(lock: &caching_ctl->mutex);
772	cond_resched();
773	mutex_lock(&caching_ctl->mutex);
774	down_read(sem: &fs_info->commit_root_sem);
775	goto next;
776	}
777
778	ret = btrfs_next_leaf(root: extent_root, path);
779	if (ret < `0`)
780	goto out;
781	if (ret)
782	break;
783	leaf = path->nodes[`0`];
784	nritems = btrfs_header_nritems(eb: leaf);
785	continue;
786	}
787
788	if (key.objectid < last) {
789	key.objectid = last;
790	key.offset = `0`;
791	key.type = BTRFS_EXTENT_ITEM_KEY;
792	btrfs_release_path(p: path);
793	goto next;
794	}
795
796	if (key.objectid < block_group->start) {
797	path->slots[`0`]++;
798	continue;
799	}
800
801	if (key.objectid >= block_group->start + block_group->length)
802	break;
803
804	if (key.type == BTRFS_EXTENT_ITEM_KEY \|\|
805	key.type == BTRFS_METADATA_ITEM_KEY) {
806	u64 space_added;
807
808	ret = btrfs_add_new_free_space(block_group, start: last,
809	end: key.objectid, total_added_ret: &space_added);
810	if (ret)
811	goto out;
812	total_found += space_added;
813	if (key.type == BTRFS_METADATA_ITEM_KEY)
814	last = key.objectid +
815	fs_info->nodesize;
816	else
817	last = key.objectid + key.offset;
818
819	if (total_found > CACHING_CTL_WAKE_UP) {
820	total_found = `0`;
821	if (wakeup) {
822	atomic_inc(v: &caching_ctl->progress);
823	wake_up(&caching_ctl->wait);
824	}
825	}
826	}
827	path->slots[`0`]++;
828	}
829
830	ret = btrfs_add_new_free_space(block_group, start: last,
831	end: block_group->start + block_group->length,
832	NULL);
833	out:
834	btrfs_free_path(p: path);
835	return ret;
836	}
837
838	static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
839	{
840	clear_extent_bits(tree: &bg->fs_info->excluded_extents, start: bg->start,
841	end: bg->start + bg->length - `1`, bits: EXTENT_UPTODATE);
842	}
843
844	static noinline void caching_thread(struct btrfs_work *work)
845	{
846	struct btrfs_block_group *block_group;
847	struct btrfs_fs_info *fs_info;
848	struct btrfs_caching_control *caching_ctl;
849	int ret;
850
851	caching_ctl = container_of(work, struct btrfs_caching_control, work);
852	block_group = caching_ctl->block_group;
853	fs_info = block_group->fs_info;
854
855	mutex_lock(&caching_ctl->mutex);
856	down_read(sem: &fs_info->commit_root_sem);
857
858	load_block_group_size_class(caching_ctl, block_group);
859	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
860	ret = load_free_space_cache(block_group);
861	if (ret == `1`) {
862	ret = `0`;
863	goto done;
864	}
865
866	/*
867	* We failed to load the space cache, set ourselves to
868	* CACHE_STARTED and carry on.
869	*/
870	spin_lock(lock: &block_group->lock);
871	block_group->cached = BTRFS_CACHE_STARTED;
872	spin_unlock(lock: &block_group->lock);
873	wake_up(&caching_ctl->wait);
874	}
875
876	/*
877	* If we are in the transaction that populated the free space tree we
878	* can't actually cache from the free space tree as our commit root and
879	* real root are the same, so we could change the contents of the blocks
880	* while caching. Instead do the slow caching in this case, and after
881	* the transaction has committed we will be safe.
882	*/
883	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
884	!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
885	ret = load_free_space_tree(caching_ctl);
886	else
887	ret = load_extent_tree_free(caching_ctl);
888	done:
889	spin_lock(lock: &block_group->lock);
890	block_group->caching_ctl = NULL;
891	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
892	spin_unlock(lock: &block_group->lock);
893
894	#ifdef CONFIG_BTRFS_DEBUG
895	if (btrfs_should_fragment_free_space(block_group)) {
896	u64 bytes_used;
897
898	spin_lock(lock: &block_group->space_info->lock);
899	spin_lock(lock: &block_group->lock);
900	bytes_used = block_group->length - block_group->used;
901	block_group->space_info->bytes_used += bytes_used >> `1`;
902	spin_unlock(lock: &block_group->lock);
903	spin_unlock(lock: &block_group->space_info->lock);
904	fragment_free_space(block_group);
905	}
906	#endif
907
908	up_read(sem: &fs_info->commit_root_sem);
909	btrfs_free_excluded_extents(bg: block_group);
910	mutex_unlock(lock: &caching_ctl->mutex);
911
912	wake_up(&caching_ctl->wait);
913
914	btrfs_put_caching_control(ctl: caching_ctl);
915	btrfs_put_block_group(cache: block_group);
916	}
917
918	int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
919	{
920	struct btrfs_fs_info *fs_info = cache->fs_info;
921	struct btrfs_caching_control *caching_ctl = NULL;
922	int ret = `0`;
923
924	/ Allocator for zoned filesystems does not use the cache at all /
925	if (btrfs_is_zoned(fs_info))
926	return `0`;
927
928	caching_ctl = kzalloc(size: sizeof(*caching_ctl), GFP_NOFS);
929	if (!caching_ctl)
930	return -ENOMEM;
931
932	INIT_LIST_HEAD(list: &caching_ctl->list);
933	mutex_init(&caching_ctl->mutex);
934	init_waitqueue_head(&caching_ctl->wait);
935	caching_ctl->block_group = cache;
936	refcount_set(r: &caching_ctl->count, n: `2`);
937	atomic_set(v: &caching_ctl->progress, i: `0`);
938	btrfs_init_work(work: &caching_ctl->work, func: caching_thread, NULL);
939
940	spin_lock(lock: &cache->lock);
941	if (cache->cached != BTRFS_CACHE_NO) {
942	kfree(objp: caching_ctl);
943
944	caching_ctl = cache->caching_ctl;
945	if (caching_ctl)
946	refcount_inc(r: &caching_ctl->count);
947	spin_unlock(lock: &cache->lock);
948	goto out;
949	}
950	WARN_ON(cache->caching_ctl);
951	cache->caching_ctl = caching_ctl;
952	cache->cached = BTRFS_CACHE_STARTED;
953	spin_unlock(lock: &cache->lock);
954
955	write_lock(&fs_info->block_group_cache_lock);
956	refcount_inc(r: &caching_ctl->count);
957	list_add_tail(new: &caching_ctl->list, head: &fs_info->caching_block_groups);
958	write_unlock(&fs_info->block_group_cache_lock);
959
960	btrfs_get_block_group(cache);
961
962	btrfs_queue_work(wq: fs_info->caching_workers, work: &caching_ctl->work);
963	out:
964	if (wait && caching_ctl)
965	ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
966	if (caching_ctl)
967	btrfs_put_caching_control(ctl: caching_ctl);
968
969	return ret;
970	}
971
972	static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
973	{
974	u64 extra_flags = chunk_to_extended(flags) &
975	BTRFS_EXTENDED_PROFILE_MASK;
976
977	write_seqlock(sl: &fs_info->profiles_lock);
978	if (flags & BTRFS_BLOCK_GROUP_DATA)
979	fs_info->avail_data_alloc_bits &= ~extra_flags;
980	if (flags & BTRFS_BLOCK_GROUP_METADATA)
981	fs_info->avail_metadata_alloc_bits &= ~extra_flags;
982	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
983	fs_info->avail_system_alloc_bits &= ~extra_flags;
984	write_sequnlock(sl: &fs_info->profiles_lock);
985	}
986
987	/*
988	* Clear incompat bits for the following feature(s):
989	*
990	* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
991	* in the whole filesystem
992	*
993	* - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
994	*/
995	static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
996	{
997	bool found_raid56 = false;
998	bool found_raid1c34 = false;
999
1000	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) \|\|
1001	(flags & BTRFS_BLOCK_GROUP_RAID1C3) \|\|
1002	(flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
1003	struct list_head *head = &fs_info->space_info;
1004	struct btrfs_space_info *sinfo;
1005
1006	list_for_each_entry_rcu(sinfo, head, list) {
1007	down_read(sem: &sinfo->groups_sem);
1008	if (!list_empty(head: &sinfo->block_groups[BTRFS_RAID_RAID5]))
1009	found_raid56 = true;
1010	if (!list_empty(head: &sinfo->block_groups[BTRFS_RAID_RAID6]))
1011	found_raid56 = true;
1012	if (!list_empty(head: &sinfo->block_groups[BTRFS_RAID_RAID1C3]))
1013	found_raid1c34 = true;
1014	if (!list_empty(head: &sinfo->block_groups[BTRFS_RAID_RAID1C4]))
1015	found_raid1c34 = true;
1016	up_read(sem: &sinfo->groups_sem);
1017	}
1018	if (!found_raid56)
1019	btrfs_clear_fs_incompat(fs_info, RAID56);
1020	if (!found_raid1c34)
1021	btrfs_clear_fs_incompat(fs_info, RAID1C34);
1022	}
1023	}
1024
1025	static int remove_block_group_item(struct btrfs_trans_handle *trans,
1026	struct btrfs_path *path,
1027	struct btrfs_block_group *block_group)
1028	{
1029	struct btrfs_fs_info *fs_info = trans->fs_info;
1030	struct btrfs_root *root;
1031	struct btrfs_key key;
1032	int ret;
1033
1034	root = btrfs_block_group_root(fs_info);
1035	key.objectid = block_group->start;
1036	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1037	key.offset = block_group->length;
1038
1039	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
1040	if (ret > `0`)
1041	ret = -ENOENT;
1042	if (ret < `0`)
1043	return ret;
1044
1045	ret = btrfs_del_item(trans, root, path);
1046	return ret;
1047	}
1048
1049	int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1050	struct btrfs_chunk_map *map)
1051	{
1052	struct btrfs_fs_info *fs_info = trans->fs_info;
1053	struct btrfs_path *path;
1054	struct btrfs_block_group *block_group;
1055	struct btrfs_free_cluster *cluster;
1056	struct inode *inode;
1057	struct kobject *kobj = NULL;
1058	int ret;
1059	int index;
1060	int factor;
1061	struct btrfs_caching_control *caching_ctl = NULL;
1062	bool remove_map;
1063	bool remove_rsv = false;
1064
1065	block_group = btrfs_lookup_block_group(info: fs_info, bytenr: map->start);
1066	if (!block_group)
1067	return -ENOENT;
1068
1069	BUG_ON(!block_group->ro);
1070
1071	trace_btrfs_remove_block_group(bg_cache: block_group);
1072	/*
1073	* Free the reserved super bytes from this block group before
1074	* remove it.
1075	*/
1076	btrfs_free_excluded_extents(bg: block_group);
1077	btrfs_free_ref_tree_range(fs_info, start: block_group->start,
1078	len: block_group->length);
1079
1080	index = btrfs_bg_flags_to_raid_index(flags: block_group->flags);
1081	factor = btrfs_bg_type_to_factor(flags: block_group->flags);
1082
1083	/ make sure this block group isn't part of an allocation cluster /
1084	cluster = &fs_info->data_alloc_cluster;
1085	spin_lock(lock: &cluster->refill_lock);
1086	btrfs_return_cluster_to_free_space(block_group, cluster);
1087	spin_unlock(lock: &cluster->refill_lock);
1088
1089	/*
1090	* make sure this block group isn't part of a metadata
1091	* allocation cluster
1092	*/
1093	cluster = &fs_info->meta_alloc_cluster;
1094	spin_lock(lock: &cluster->refill_lock);
1095	btrfs_return_cluster_to_free_space(block_group, cluster);
1096	spin_unlock(lock: &cluster->refill_lock);
1097
1098	btrfs_clear_treelog_bg(bg: block_group);
1099	btrfs_clear_data_reloc_bg(bg: block_group);
1100
1101	path = btrfs_alloc_path();
1102	if (!path) {
1103	ret = -ENOMEM;
1104	goto out;
1105	}
1106
1107	/*
1108	* get the inode first so any iput calls done for the io_list
1109	* aren't the final iput (no unlinks allowed now)
1110	*/
1111	inode = lookup_free_space_inode(block_group, path);
1112
1113	mutex_lock(&trans->transaction->cache_write_mutex);
1114	/*
1115	* Make sure our free space cache IO is done before removing the
1116	* free space inode
1117	*/
1118	spin_lock(lock: &trans->transaction->dirty_bgs_lock);
1119	if (!list_empty(head: &block_group->io_list)) {
1120	list_del_init(entry: &block_group->io_list);
1121
1122	WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
1123
1124	spin_unlock(lock: &trans->transaction->dirty_bgs_lock);
1125	btrfs_wait_cache_io(trans, block_group, path);
1126	btrfs_put_block_group(cache: block_group);
1127	spin_lock(lock: &trans->transaction->dirty_bgs_lock);
1128	}
1129
1130	if (!list_empty(head: &block_group->dirty_list)) {
1131	list_del_init(entry: &block_group->dirty_list);
1132	remove_rsv = true;
1133	btrfs_put_block_group(cache: block_group);
1134	}
1135	spin_unlock(lock: &trans->transaction->dirty_bgs_lock);
1136	mutex_unlock(lock: &trans->transaction->cache_write_mutex);
1137
1138	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
1139	if (ret)
1140	goto out;
1141
1142	write_lock(&fs_info->block_group_cache_lock);
1143	rb_erase_cached(node: &block_group->cache_node,
1144	root: &fs_info->block_group_cache_tree);
1145	RB_CLEAR_NODE(&block_group->cache_node);
1146
1147	/ Once for the block groups rbtree /
1148	btrfs_put_block_group(cache: block_group);
1149
1150	write_unlock(&fs_info->block_group_cache_lock);
1151
1152	down_write(sem: &block_group->space_info->groups_sem);
1153	/*
1154	* we must use list_del_init so people can check to see if they
1155	* are still on the list after taking the semaphore
1156	*/
1157	list_del_init(entry: &block_group->list);
1158	if (list_empty(head: &block_group->space_info->block_groups[index])) {
1159	kobj = block_group->space_info->block_group_kobjs[index];
1160	block_group->space_info->block_group_kobjs[index] = NULL;
1161	clear_avail_alloc_bits(fs_info, flags: block_group->flags);
1162	}
1163	up_write(sem: &block_group->space_info->groups_sem);
1164	clear_incompat_bg_bits(fs_info, flags: block_group->flags);
1165	if (kobj) {
1166	kobject_del(kobj);
1167	kobject_put(kobj);
1168	}
1169
1170	if (block_group->cached == BTRFS_CACHE_STARTED)
1171	btrfs_wait_block_group_cache_done(cache: block_group);
1172
1173	write_lock(&fs_info->block_group_cache_lock);
1174	caching_ctl = btrfs_get_caching_control(cache: block_group);
1175	if (!caching_ctl) {
1176	struct btrfs_caching_control *ctl;
1177
1178	list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
1179	if (ctl->block_group == block_group) {
1180	caching_ctl = ctl;
1181	refcount_inc(r: &caching_ctl->count);
1182	break;
1183	}
1184	}
1185	}
1186	if (caching_ctl)
1187	list_del_init(entry: &caching_ctl->list);
1188	write_unlock(&fs_info->block_group_cache_lock);
1189
1190	if (caching_ctl) {
1191	/ Once for the caching bgs list and once for us. /
1192	btrfs_put_caching_control(ctl: caching_ctl);
1193	btrfs_put_caching_control(ctl: caching_ctl);
1194	}
1195
1196	spin_lock(lock: &trans->transaction->dirty_bgs_lock);
1197	WARN_ON(!list_empty(&block_group->dirty_list));
1198	WARN_ON(!list_empty(&block_group->io_list));
1199	spin_unlock(lock: &trans->transaction->dirty_bgs_lock);
1200
1201	btrfs_remove_free_space_cache(block_group);
1202
1203	spin_lock(lock: &block_group->space_info->lock);
1204	list_del_init(entry: &block_group->ro_list);
1205
1206	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1207	WARN_ON(block_group->space_info->total_bytes
1208	< block_group->length);
1209	WARN_ON(block_group->space_info->bytes_readonly
1210	< block_group->length - block_group->zone_unusable);
1211	WARN_ON(block_group->space_info->bytes_zone_unusable
1212	< block_group->zone_unusable);
1213	WARN_ON(block_group->space_info->disk_total
1214	< block_group->length * factor);
1215	}
1216	block_group->space_info->total_bytes -= block_group->length;
1217	block_group->space_info->bytes_readonly -=
1218	(block_group->length - block_group->zone_unusable);
1219	block_group->space_info->bytes_zone_unusable -=
1220	block_group->zone_unusable;
1221	block_group->space_info->disk_total -= block_group->length * factor;
1222
1223	spin_unlock(lock: &block_group->space_info->lock);
1224
1225	/*
1226	* Remove the free space for the block group from the free space tree
1227	* and the block group's item from the extent tree before marking the
1228	* block group as removed. This is to prevent races with tasks that
1229	* freeze and unfreeze a block group, this task and another task
1230	* allocating a new block group - the unfreeze task ends up removing
1231	* the block group's extent map before the task calling this function
1232	* deletes the block group item from the extent tree, allowing for
1233	* another task to attempt to create another block group with the same
1234	* item key (and failing with -EEXIST and a transaction abort).
1235	*/
1236	ret = remove_block_group_free_space(trans, block_group);
1237	if (ret)
1238	goto out;
1239
1240	ret = remove_block_group_item(trans, path, block_group);
1241	if (ret < `0`)
1242	goto out;
1243
1244	spin_lock(lock: &block_group->lock);
1245	set_bit(nr: BLOCK_GROUP_FLAG_REMOVED, addr: &block_group->runtime_flags);
1246
1247	/*
1248	* At this point trimming or scrub can't start on this block group,
1249	* because we removed the block group from the rbtree
1250	* fs_info->block_group_cache_tree so no one can't find it anymore and
1251	* even if someone already got this block group before we removed it
1252	* from the rbtree, they have already incremented block_group->frozen -
1253	* if they didn't, for the trimming case they won't find any free space
1254	* entries because we already removed them all when we called
1255	* btrfs_remove_free_space_cache().
1256	*
1257	* And we must not remove the chunk map from the fs_info->mapping_tree
1258	* to prevent the same logical address range and physical device space
1259	* ranges from being reused for a new block group. This is needed to
1260	* avoid races with trimming and scrub.
1261	*
1262	* An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1263	* completely transactionless, so while it is trimming a range the
1264	* currently running transaction might finish and a new one start,
1265	* allowing for new block groups to be created that can reuse the same
1266	* physical device locations unless we take this special care.
1267	*
1268	* There may also be an implicit trim operation if the file system
1269	* is mounted with -odiscard. The same protections must remain
1270	* in place until the extents have been discarded completely when
1271	* the transaction commit has completed.
1272	*/
1273	remove_map = (atomic_read(v: &block_group->frozen) == `0`);
1274	spin_unlock(lock: &block_group->lock);
1275
1276	if (remove_map)
1277	btrfs_remove_chunk_map(fs_info, map);
1278
1279	out:
1280	/ Once for the lookup reference /
1281	btrfs_put_block_group(cache: block_group);
1282	if (remove_rsv)
1283	btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
1284	btrfs_free_path(p: path);
1285	return ret;
1286	}
1287
1288	struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1289	struct btrfs_fs_info fs_info, const* u64 chunk_offset)
1290	{
1291	struct btrfs_root *root = btrfs_block_group_root(fs_info);
1292	struct btrfs_chunk_map *map;
1293	unsigned int num_items;
1294
1295	map = btrfs_find_chunk_map(fs_info, logical: chunk_offset, length: `1`);
1296	ASSERT(map != NULL);
1297	ASSERT(map->start == chunk_offset);
1298
1299	/*
1300	* We need to reserve 3 + N units from the metadata space info in order
1301	* to remove a block group (done at btrfs_remove_chunk() and at
1302	* btrfs_remove_block_group()), which are used for:
1303	*
1304	* 1 unit for adding the free space inode's orphan (located in the tree
1305	* of tree roots).
1306	* 1 unit for deleting the block group item (located in the extent
1307	* tree).
1308	* 1 unit for deleting the free space item (located in tree of tree
1309	* roots).
1310	* N units for deleting N device extent items corresponding to each
1311	* stripe (located in the device tree).
1312	*
1313	* In order to remove a block group we also need to reserve units in the
1314	* system space info in order to update the chunk tree (update one or
1315	* more device items and remove one chunk item), but this is done at
1316	* btrfs_remove_chunk() through a call to check_system_chunk().
1317	*/
1318	num_items = `3` + map->num_stripes;
1319	btrfs_free_chunk_map(map);
1320
1321	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1322	}
1323
1324	/*
1325	* Mark block group @cache read-only, so later write won't happen to block
1326	* group @cache.
1327	*
1328	* If @force is not set, this function will only mark the block group readonly
1329	* if we have enough free space (1M) in other metadata/system block groups.
1330	* If @force is not set, this function will mark the block group readonly
1331	* without checking free space.
1332	*
1333	* NOTE: This function doesn't care if other block groups can contain all the
1334	* data in this block group. That check should be done by relocation routine,
1335	* not this function.
1336	*/
1337	static int inc_block_group_ro(struct btrfs_block_group cache, int* force)
1338	{
1339	struct btrfs_space_info *sinfo = cache->space_info;
1340	u64 num_bytes;
1341	int ret = -ENOSPC;
1342
1343	spin_lock(lock: &sinfo->lock);
1344	spin_lock(lock: &cache->lock);
1345
1346	if (cache->swap_extents) {
1347	ret = -ETXTBSY;
1348	goto out;
1349	}
1350
1351	if (cache->ro) {
1352	cache->ro++;
1353	ret = `0`;
1354	goto out;
1355	}
1356
1357	num_bytes = cache->length - cache->reserved - cache->pinned -
1358	cache->bytes_super - cache->zone_unusable - cache->used;
1359
1360	/*
1361	* Data never overcommits, even in mixed mode, so do just the straight
1362	* check of left over space in how much we have allocated.
1363	*/
1364	if (force) {
1365	ret = `0`;
1366	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1367	u64 sinfo_used = btrfs_space_info_used(s_info: sinfo, may_use_included: true);
1368
1369	/*
1370	* Here we make sure if we mark this bg RO, we still have enough
1371	* free space as buffer.
1372	*/
1373	if (sinfo_used + num_bytes <= sinfo->total_bytes)
1374	ret = `0`;
1375	} else {
1376	/*
1377	* We overcommit metadata, so we need to do the
1378	* btrfs_can_overcommit check here, and we need to pass in
1379	* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1380	* leeway to allow us to mark this block group as read only.
1381	*/
1382	if (btrfs_can_overcommit(fs_info: cache->fs_info, space_info: sinfo, bytes: num_bytes,
1383	flush: BTRFS_RESERVE_NO_FLUSH))
1384	ret = `0`;
1385	}
1386
1387	if (!ret) {
1388	sinfo->bytes_readonly += num_bytes;
1389	if (btrfs_is_zoned(fs_info: cache->fs_info)) {
1390	/ Migrate zone_unusable bytes to readonly /
1391	sinfo->bytes_readonly += cache->zone_unusable;
1392	sinfo->bytes_zone_unusable -= cache->zone_unusable;
1393	cache->zone_unusable = `0`;
1394	}
1395	cache->ro++;
1396	list_add_tail(new: &cache->ro_list, head: &sinfo->ro_bgs);
1397	}
1398	out:
1399	spin_unlock(lock: &cache->lock);
1400	spin_unlock(lock: &sinfo->lock);
1401	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1402	btrfs_info(cache->fs_info,
1403	"unable to make block group %llu ro", cache->start);
1404	btrfs_dump_space_info(fs_info: cache->fs_info, info: cache->space_info, bytes: `0`, dump_block_groups: `0`);
1405	}
1406	return ret;
1407	}
1408
1409	static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1410	struct btrfs_block_group *bg)
1411	{
1412	struct btrfs_fs_info *fs_info = bg->fs_info;
1413	struct btrfs_transaction *prev_trans = NULL;
1414	const u64 start = bg->start;
1415	const u64 end = start + bg->length - `1`;
1416	int ret;
1417
1418	spin_lock(lock: &fs_info->trans_lock);
1419	if (trans->transaction->list.prev != &fs_info->trans_list) {
1420	prev_trans = list_last_entry(&trans->transaction->list,
1421	struct btrfs_transaction, list);
1422	refcount_inc(r: &prev_trans->use_count);
1423	}
1424	spin_unlock(lock: &fs_info->trans_lock);
1425
1426	/*
1427	* Hold the unused_bg_unpin_mutex lock to avoid racing with
1428	* btrfs_finish_extent_commit(). If we are at transaction N, another
1429	* task might be running finish_extent_commit() for the previous
1430	* transaction N - 1, and have seen a range belonging to the block
1431	* group in pinned_extents before we were able to clear the whole block
1432	* group range from pinned_extents. This means that task can lookup for
1433	* the block group after we unpinned it from pinned_extents and removed
1434	* it, leading to an error at unpin_extent_range().
1435	*/
1436	mutex_lock(&fs_info->unused_bg_unpin_mutex);
1437	if (prev_trans) {
1438	ret = clear_extent_bits(tree: &prev_trans->pinned_extents, start, end,
1439	bits: EXTENT_DIRTY);
1440	if (ret)
1441	goto out;
1442	}
1443
1444	ret = clear_extent_bits(tree: &trans->transaction->pinned_extents, start, end,
1445	bits: EXTENT_DIRTY);
1446	out:
1447	mutex_unlock(lock: &fs_info->unused_bg_unpin_mutex);
1448	if (prev_trans)
1449	btrfs_put_transaction(transaction: prev_trans);
1450
1451	return ret == `0`;
1452	}
1453
1454	/*
1455	* Process the unused_bgs list and remove any that don't have any allocated
1456	* space inside of them.
1457	*/
1458	void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1459	{
1460	LIST_HEAD(retry_list);
1461	struct btrfs_block_group *block_group;
1462	struct btrfs_space_info *space_info;
1463	struct btrfs_trans_handle *trans;
1464	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1465	int ret = `0`;
1466
1467	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1468	return;
1469
1470	if (btrfs_fs_closing(fs_info))
1471	return;
1472
1473	/*
1474	* Long running balances can keep us blocked here for eternity, so
1475	* simply skip deletion if we're unable to get the mutex.
1476	*/
1477	if (!mutex_trylock(lock: &fs_info->reclaim_bgs_lock))
1478	return;
1479
1480	spin_lock(lock: &fs_info->unused_bgs_lock);
1481	while (!list_empty(head: &fs_info->unused_bgs)) {
1482	u64 used;
1483	int trimming;
1484
1485	block_group = list_first_entry(&fs_info->unused_bgs,
1486	struct btrfs_block_group,
1487	bg_list);
1488	list_del_init(entry: &block_group->bg_list);
1489
1490	space_info = block_group->space_info;
1491
1492	if (ret \|\| btrfs_mixed_space_info(space_info)) {
1493	btrfs_put_block_group(cache: block_group);
1494	continue;
1495	}
1496	spin_unlock(lock: &fs_info->unused_bgs_lock);
1497
1498	btrfs_discard_cancel_work(discard_ctl: &fs_info->discard_ctl, block_group);
1499
1500	/ Don't want to race with allocators so take the groups_sem /
1501	down_write(sem: &space_info->groups_sem);
1502
1503	/*
1504	* Async discard moves the final block group discard to be prior
1505	* to the unused_bgs code path. Therefore, if it's not fully
1506	* trimmed, punt it back to the async discard lists.
1507	*/
1508	if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1509	!btrfs_is_free_space_trimmed(block_group)) {
1510	trace_btrfs_skip_unused_block_group(bg_cache: block_group);
1511	up_write(sem: &space_info->groups_sem);
1512	/ Requeue if we failed because of async discard /
1513	btrfs_discard_queue_work(discard_ctl: &fs_info->discard_ctl,
1514	block_group);
1515	goto next;
1516	}
1517
1518	spin_lock(lock: &space_info->lock);
1519	spin_lock(lock: &block_group->lock);
1520	if (btrfs_is_block_group_used(bg: block_group) \|\| block_group->ro \|\|
1521	list_is_singular(head: &block_group->list)) {
1522	/*
1523	* We want to bail if we made new allocations or have
1524	* outstanding allocations in this block group. We do
1525	* the ro check in case balance is currently acting on
1526	* this block group.
1527	*
1528	* Also bail out if this is the only block group for its
1529	* type, because otherwise we would lose profile
1530	* information from fs_info->avail_*_alloc_bits and the
1531	* next block group of this type would be created with a
1532	* "single" profile (even if we're in a raid fs) because
1533	* fs_info->avail_*_alloc_bits would be 0.
1534	*/
1535	trace_btrfs_skip_unused_block_group(bg_cache: block_group);
1536	spin_unlock(lock: &block_group->lock);
1537	spin_unlock(lock: &space_info->lock);
1538	up_write(sem: &space_info->groups_sem);
1539	goto next;
1540	}
1541
1542	/*
1543	* The block group may be unused but there may be space reserved
1544	* accounting with the existence of that block group, that is,
1545	* space_info->bytes_may_use was incremented by a task but no
1546	* space was yet allocated from the block group by the task.
1547	* That space may or may not be allocated, as we are generally
1548	* pessimistic about space reservation for metadata as well as
1549	* for data when using compression (as we reserve space based on
1550	* the worst case, when data can't be compressed, and before
1551	* actually attempting compression, before starting writeback).
1552	*
1553	* So check if the total space of the space_info minus the size
1554	* of this block group is less than the used space of the
1555	* space_info - if that's the case, then it means we have tasks
1556	* that might be relying on the block group in order to allocate
1557	* extents, and add back the block group to the unused list when
1558	* we finish, so that we retry later in case no tasks ended up
1559	* needing to allocate extents from the block group.
1560	*/
1561	used = btrfs_space_info_used(s_info: space_info, may_use_included: true);
1562	if (space_info->total_bytes - block_group->length < used &&
1563	block_group->zone_unusable < block_group->length) {
1564	/*
1565	* Add a reference for the list, compensate for the ref
1566	* drop under the "next" label for the
1567	* fs_info->unused_bgs list.
1568	*/
1569	btrfs_get_block_group(cache: block_group);
1570	list_add_tail(new: &block_group->bg_list, head: &retry_list);
1571
1572	trace_btrfs_skip_unused_block_group(bg_cache: block_group);
1573	spin_unlock(lock: &block_group->lock);
1574	spin_unlock(lock: &space_info->lock);
1575	up_write(sem: &space_info->groups_sem);
1576	goto next;
1577	}
1578
1579	spin_unlock(lock: &block_group->lock);
1580	spin_unlock(lock: &space_info->lock);
1581
1582	/ We don't want to force the issue, only flip if it's ok. /
1583	ret = inc_block_group_ro(cache: block_group, force: `0`);
1584	up_write(sem: &space_info->groups_sem);
1585	if (ret < `0`) {
1586	ret = `0`;
1587	goto next;
1588	}
1589
1590	ret = btrfs_zone_finish(block_group);
1591	if (ret < `0`) {
1592	btrfs_dec_block_group_ro(cache: block_group);
1593	if (ret == -EAGAIN)
1594	ret = `0`;
1595	goto next;
1596	}
1597
1598	/*
1599	* Want to do this before we do anything else so we can recover
1600	* properly if we fail to join the transaction.
1601	*/
1602	trans = btrfs_start_trans_remove_block_group(fs_info,
1603	chunk_offset: block_group->start);
1604	if (IS_ERR(ptr: trans)) {
1605	btrfs_dec_block_group_ro(cache: block_group);
1606	ret = PTR_ERR(ptr: trans);
1607	goto next;
1608	}
1609
1610	/*
1611	* We could have pending pinned extents for this block group,
1612	* just delete them, we don't care about them anymore.
1613	*/
1614	if (!clean_pinned_extents(trans, bg: block_group)) {
1615	btrfs_dec_block_group_ro(cache: block_group);
1616	goto end_trans;
1617	}
1618
1619	/*
1620	* At this point, the block_group is read only and should fail
1621	* new allocations. However, btrfs_finish_extent_commit() can
1622	* cause this block_group to be placed back on the discard
1623	* lists because now the block_group isn't fully discarded.
1624	* Bail here and try again later after discarding everything.
1625	*/
1626	spin_lock(lock: &fs_info->discard_ctl.lock);
1627	if (!list_empty(head: &block_group->discard_list)) {
1628	spin_unlock(lock: &fs_info->discard_ctl.lock);
1629	btrfs_dec_block_group_ro(cache: block_group);
1630	btrfs_discard_queue_work(discard_ctl: &fs_info->discard_ctl,
1631	block_group);
1632	goto end_trans;
1633	}
1634	spin_unlock(lock: &fs_info->discard_ctl.lock);
1635
1636	/ Reset pinned so btrfs_put_block_group doesn't complain /
1637	spin_lock(lock: &space_info->lock);
1638	spin_lock(lock: &block_group->lock);
1639
1640	btrfs_space_info_update_bytes_pinned(fs_info, sinfo: space_info,
1641	bytes: -block_group->pinned);
1642	space_info->bytes_readonly += block_group->pinned;
1643	block_group->pinned = `0`;
1644
1645	spin_unlock(lock: &block_group->lock);
1646	spin_unlock(lock: &space_info->lock);
1647
1648	/*
1649	* The normal path here is an unused block group is passed here,
1650	* then trimming is handled in the transaction commit path.
1651	* Async discard interposes before this to do the trimming
1652	* before coming down the unused block group path as trimming
1653	* will no longer be done later in the transaction commit path.
1654	*/
1655	if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1656	goto flip_async;
1657
1658	/*
1659	* DISCARD can flip during remount. On zoned filesystems, we
1660	* need to reset sequential-required zones.
1661	*/
1662	trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) \|\|
1663	btrfs_is_zoned(fs_info);
1664
1665	/ Implicit trim during transaction commit. /
1666	if (trimming)
1667	btrfs_freeze_block_group(cache: block_group);
1668
1669	/*
1670	* Btrfs_remove_chunk will abort the transaction if things go
1671	* horribly wrong.
1672	*/
1673	ret = btrfs_remove_chunk(trans, chunk_offset: block_group->start);
1674
1675	if (ret) {
1676	if (trimming)
1677	btrfs_unfreeze_block_group(cache: block_group);
1678	goto end_trans;
1679	}
1680
1681	/*
1682	* If we're not mounted with -odiscard, we can just forget
1683	* about this block group. Otherwise we'll need to wait
1684	* until transaction commit to do the actual discard.
1685	*/
1686	if (trimming) {
1687	spin_lock(lock: &fs_info->unused_bgs_lock);
1688	/*
1689	* A concurrent scrub might have added us to the list
1690	* fs_info->unused_bgs, so use a list_move operation
1691	* to add the block group to the deleted_bgs list.
1692	*/
1693	list_move(list: &block_group->bg_list,
1694	head: &trans->transaction->deleted_bgs);
1695	spin_unlock(lock: &fs_info->unused_bgs_lock);
1696	btrfs_get_block_group(cache: block_group);
1697	}
1698	end_trans:
1699	btrfs_end_transaction(trans);
1700	next:
1701	btrfs_put_block_group(cache: block_group);
1702	spin_lock(lock: &fs_info->unused_bgs_lock);
1703	}
1704	list_splice_tail(list: &retry_list, head: &fs_info->unused_bgs);
1705	spin_unlock(lock: &fs_info->unused_bgs_lock);
1706	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
1707	return;
1708
1709	flip_async:
1710	btrfs_end_transaction(trans);
1711	spin_lock(lock: &fs_info->unused_bgs_lock);
1712	list_splice_tail(list: &retry_list, head: &fs_info->unused_bgs);
1713	spin_unlock(lock: &fs_info->unused_bgs_lock);
1714	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
1715	btrfs_put_block_group(cache: block_group);
1716	btrfs_discard_punt_unused_bgs_list(fs_info);
1717	}
1718
1719	void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1720	{
1721	struct btrfs_fs_info *fs_info = bg->fs_info;
1722
1723	spin_lock(lock: &fs_info->unused_bgs_lock);
1724	if (list_empty(head: &bg->bg_list)) {
1725	btrfs_get_block_group(cache: bg);
1726	trace_btrfs_add_unused_block_group(bg_cache: bg);
1727	list_add_tail(new: &bg->bg_list, head: &fs_info->unused_bgs);
1728	} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
1729	/ Pull out the block group from the reclaim_bgs list. /
1730	trace_btrfs_add_unused_block_group(bg_cache: bg);
1731	list_move_tail(list: &bg->bg_list, head: &fs_info->unused_bgs);
1732	}
1733	spin_unlock(lock: &fs_info->unused_bgs_lock);
1734	}
1735
1736	/*
1737	* We want block groups with a low number of used bytes to be in the beginning
1738	* of the list, so they will get reclaimed first.
1739	*/
1740	static int reclaim_bgs_cmp(void unused, const* struct list_head *a,
1741	const struct list_head *b)
1742	{
1743	const struct btrfs_block_group bg1, bg2;
1744
1745	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1746	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1747
1748	return bg1->used > bg2->used;
1749	}
1750
1751	static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
1752	{
1753	if (btrfs_is_zoned(fs_info))
1754	return btrfs_zoned_should_reclaim(fs_info);
1755	return true;
1756	}
1757
1758	static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
1759	{
1760	const struct btrfs_space_info *space_info = bg->space_info;
1761	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
1762	const u64 new_val = bg->used;
1763	const u64 old_val = new_val + bytes_freed;
1764	u64 thresh;
1765
1766	if (reclaim_thresh == `0`)
1767	return false;
1768
1769	thresh = mult_perc(num: bg->length, percent: reclaim_thresh);
1770
1771	/*
1772	* If we were below the threshold before don't reclaim, we are likely a
1773	* brand new block group and we don't want to relocate new block groups.
1774	*/
1775	if (old_val < thresh)
1776	return false;
1777	if (new_val >= thresh)
1778	return false;
1779	return true;
1780	}
1781
1782	void btrfs_reclaim_bgs_work(struct work_struct *work)
1783	{
1784	struct btrfs_fs_info *fs_info =
1785	container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1786	struct btrfs_block_group *bg;
1787	struct btrfs_space_info *space_info;
1788
1789	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1790	return;
1791
1792	if (btrfs_fs_closing(fs_info))
1793	return;
1794
1795	if (!btrfs_should_reclaim(fs_info))
1796	return;
1797
1798	sb_start_write(sb: fs_info->sb);
1799
1800	if (!btrfs_exclop_start(fs_info, type: BTRFS_EXCLOP_BALANCE)) {
1801	sb_end_write(sb: fs_info->sb);
1802	return;
1803	}
1804
1805	/*
1806	* Long running balances can keep us blocked here for eternity, so
1807	* simply skip reclaim if we're unable to get the mutex.
1808	*/
1809	if (!mutex_trylock(lock: &fs_info->reclaim_bgs_lock)) {
1810	btrfs_exclop_finish(fs_info);
1811	sb_end_write(sb: fs_info->sb);
1812	return;
1813	}
1814
1815	spin_lock(lock: &fs_info->unused_bgs_lock);
1816	/*
1817	* Sort happens under lock because we can't simply splice it and sort.
1818	* The block groups might still be in use and reachable via bg_list,
1819	* and their presence in the reclaim_bgs list must be preserved.
1820	*/
1821	list_sort(NULL, head: &fs_info->reclaim_bgs, cmp: reclaim_bgs_cmp);
1822	while (!list_empty(head: &fs_info->reclaim_bgs)) {
1823	u64 zone_unusable;
1824	int ret = `0`;
1825
1826	bg = list_first_entry(&fs_info->reclaim_bgs,
1827	struct btrfs_block_group,
1828	bg_list);
1829	list_del_init(entry: &bg->bg_list);
1830
1831	space_info = bg->space_info;
1832	spin_unlock(lock: &fs_info->unused_bgs_lock);
1833
1834	/ Don't race with allocators so take the groups_sem /
1835	down_write(sem: &space_info->groups_sem);
1836
1837	spin_lock(lock: &bg->lock);
1838	if (bg->reserved \|\| bg->pinned \|\| bg->ro) {
1839	/*
1840	* We want to bail if we made new allocations or have
1841	* outstanding allocations in this block group. We do
1842	* the ro check in case balance is currently acting on
1843	* this block group.
1844	*/
1845	spin_unlock(lock: &bg->lock);
1846	up_write(sem: &space_info->groups_sem);
1847	goto next;
1848	}
1849	if (bg->used == `0`) {
1850	/*
1851	* It is possible that we trigger relocation on a block
1852	* group as its extents are deleted and it first goes
1853	* below the threshold, then shortly after goes empty.
1854	*
1855	* In this case, relocating it does delete it, but has
1856	* some overhead in relocation specific metadata, looking
1857	* for the non-existent extents and running some extra
1858	* transactions, which we can avoid by using one of the
1859	* other mechanisms for dealing with empty block groups.
1860	*/
1861	if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1862	btrfs_mark_bg_unused(bg);
1863	spin_unlock(lock: &bg->lock);
1864	up_write(sem: &space_info->groups_sem);
1865	goto next;
1866
1867	}
1868	/*
1869	* The block group might no longer meet the reclaim condition by
1870	* the time we get around to reclaiming it, so to avoid
1871	* reclaiming overly full block_groups, skip reclaiming them.
1872	*
1873	* Since the decision making process also depends on the amount
1874	* being freed, pass in a fake giant value to skip that extra
1875	* check, which is more meaningful when adding to the list in
1876	* the first place.
1877	*/
1878	if (!should_reclaim_block_group(bg, bytes_freed: bg->length)) {
1879	spin_unlock(lock: &bg->lock);
1880	up_write(sem: &space_info->groups_sem);
1881	goto next;
1882	}
1883	spin_unlock(lock: &bg->lock);
1884
1885	/*
1886	* Get out fast, in case we're read-only or unmounting the
1887	* filesystem. It is OK to drop block groups from the list even
1888	* for the read-only case. As we did sb_start_write(),
1889	* "mount -o remount,ro" won't happen and read-only filesystem
1890	* means it is forced read-only due to a fatal error. So, it
1891	* never gets back to read-write to let us reclaim again.
1892	*/
1893	if (btrfs_need_cleaner_sleep(fs_info)) {
1894	up_write(sem: &space_info->groups_sem);
1895	goto next;
1896	}
1897
1898	/*
1899	* Cache the zone_unusable value before turning the block group
1900	* to read only. As soon as the block group is read only its
1901	* zone_unusable value gets moved to the block group's read-only
1902	* bytes and isn't available for calculations anymore.
1903	*/
1904	zone_unusable = bg->zone_unusable;
1905	ret = inc_block_group_ro(cache: bg, force: `0`);
1906	up_write(sem: &space_info->groups_sem);
1907	if (ret < `0`)
1908	goto next;
1909
1910	btrfs_info(fs_info,
1911	"reclaiming chunk %llu with %llu%% used %llu%% unusable",
1912	bg->start,
1913	div64_u64(bg->used * `100`, bg->length),
1914	div64_u64(zone_unusable * `100`, bg->length));
1915	trace_btrfs_reclaim_block_group(bg_cache: bg);
1916	ret = btrfs_relocate_chunk(fs_info, chunk_offset: bg->start);
1917	if (ret) {
1918	btrfs_dec_block_group_ro(cache: bg);
1919	btrfs_err(fs_info, "error relocating chunk %llu",
1920	bg->start);
1921	}
1922
1923	next:
1924	if (ret)
1925	btrfs_mark_bg_to_reclaim(bg);
1926	btrfs_put_block_group(cache: bg);
1927
1928	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
1929	/*
1930	* Reclaiming all the block groups in the list can take really
1931	* long. Prioritize cleaning up unused block groups.
1932	*/
1933	btrfs_delete_unused_bgs(fs_info);
1934	/*
1935	* If we are interrupted by a balance, we can just bail out. The
1936	* cleaner thread will restart again if necessary.
1937	*/
1938	if (!mutex_trylock(lock: &fs_info->reclaim_bgs_lock))
1939	goto end;
1940	spin_lock(lock: &fs_info->unused_bgs_lock);
1941	}
1942	spin_unlock(lock: &fs_info->unused_bgs_lock);
1943	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
1944	end:
1945	btrfs_exclop_finish(fs_info);
1946	sb_end_write(sb: fs_info->sb);
1947	}
1948
1949	void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1950	{
1951	spin_lock(lock: &fs_info->unused_bgs_lock);
1952	if (!list_empty(head: &fs_info->reclaim_bgs))
1953	queue_work(wq: system_unbound_wq, work: &fs_info->reclaim_bgs_work);
1954	spin_unlock(lock: &fs_info->unused_bgs_lock);
1955	}
1956
1957	void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1958	{
1959	struct btrfs_fs_info *fs_info = bg->fs_info;
1960
1961	spin_lock(lock: &fs_info->unused_bgs_lock);
1962	if (list_empty(head: &bg->bg_list)) {
1963	btrfs_get_block_group(cache: bg);
1964	trace_btrfs_add_reclaim_block_group(bg_cache: bg);
1965	list_add_tail(new: &bg->bg_list, head: &fs_info->reclaim_bgs);
1966	}
1967	spin_unlock(lock: &fs_info->unused_bgs_lock);
1968	}
1969
1970	static int read_bg_from_eb(struct btrfs_fs_info fs_info, struct* btrfs_key *key,
1971	struct btrfs_path *path)
1972	{
1973	struct btrfs_chunk_map *map;
1974	struct btrfs_block_group_item bg;
1975	struct extent_buffer *leaf;
1976	int slot;
1977	u64 flags;
1978	int ret = `0`;
1979
1980	slot = path->slots[`0`];
1981	leaf = path->nodes[`0`];
1982
1983	map = btrfs_find_chunk_map(fs_info, logical: key->objectid, length: key->offset);
1984	if (!map) {
1985	btrfs_err(fs_info,
1986	"logical %llu len %llu found bg but no related chunk",
1987	key->objectid, key->offset);
1988	return -ENOENT;
1989	}
1990
1991	if (map->start != key->objectid \|\| map->chunk_len != key->offset) {
1992	btrfs_err(fs_info,
1993	"block group %llu len %llu mismatch with chunk %llu len %llu",
1994	key->objectid, key->offset, map->start, map->chunk_len);
1995	ret = -EUCLEAN;
1996	goto out_free_map;
1997	}
1998
1999	read_extent_buffer(eb: leaf, dst: &bg, btrfs_item_ptr_offset(leaf, slot),
2000	len: sizeof(bg));
2001	flags = btrfs_stack_block_group_flags(s: &bg) &
2002	BTRFS_BLOCK_GROUP_TYPE_MASK;
2003
2004	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
2005	btrfs_err(fs_info,
2006	"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
2007	key->objectid, key->offset, flags,
2008	(BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
2009	ret = -EUCLEAN;
2010	}
2011
2012	out_free_map:
2013	btrfs_free_chunk_map(map);
2014	return ret;
2015	}
2016
2017	static int find_first_block_group(struct btrfs_fs_info *fs_info,
2018	struct btrfs_path *path,
2019	struct btrfs_key *key)
2020	{
2021	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2022	int ret;
2023	struct btrfs_key found_key;
2024
2025	btrfs_for_each_slot(root, key, &found_key, path, ret) {
2026	if (found_key.objectid >= key->objectid &&
2027	found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
2028	return read_bg_from_eb(fs_info, key: &found_key, path);
2029	}
2030	}
2031	return ret;
2032	}
2033
2034	static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2035	{
2036	u64 extra_flags = chunk_to_extended(flags) &
2037	BTRFS_EXTENDED_PROFILE_MASK;
2038
2039	write_seqlock(sl: &fs_info->profiles_lock);
2040	if (flags & BTRFS_BLOCK_GROUP_DATA)
2041	fs_info->avail_data_alloc_bits \|= extra_flags;
2042	if (flags & BTRFS_BLOCK_GROUP_METADATA)
2043	fs_info->avail_metadata_alloc_bits \|= extra_flags;
2044	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2045	fs_info->avail_system_alloc_bits \|= extra_flags;
2046	write_sequnlock(sl: &fs_info->profiles_lock);
2047	}
2048
2049	/*
2050	* Map a physical disk address to a list of logical addresses.
2051	*
2052	* @fs_info: the filesystem
2053	* @chunk_start: logical address of block group
2054	* @physical: physical address to map to logical addresses
2055	* @logical: return array of logical addresses which map to @physical
2056	* @naddrs: length of @logical
2057	* @stripe_len: size of IO stripe for the given block group
2058	*
2059	* Maps a particular @physical disk address to a list of @logical addresses.
2060	* Used primarily to exclude those portions of a block group that contain super
2061	* block copies.
2062	*/
2063	int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
2064	u64 physical, u64 *logical, int* naddrs, int* *stripe_len)
2065	{
2066	struct btrfs_chunk_map *map;
2067	u64 *buf;
2068	u64 bytenr;
2069	u64 data_stripe_length;
2070	u64 io_stripe_size;
2071	int i, nr = `0`;
2072	int ret = `0`;
2073
2074	map = btrfs_get_chunk_map(fs_info, logical: chunk_start, length: `1`);
2075	if (IS_ERR(ptr: map))
2076	return -EIO;
2077
2078	data_stripe_length = map->stripe_size;
2079	io_stripe_size = BTRFS_STRIPE_LEN;
2080	chunk_start = map->start;
2081
2082	/ For RAID5/6 adjust to a full IO stripe length /
2083	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2084	io_stripe_size = btrfs_stripe_nr_to_offset(stripe_nr: nr_data_stripes(map));
2085
2086	buf = kcalloc(n: map->num_stripes, size: sizeof(u64), GFP_NOFS);
2087	if (!buf) {
2088	ret = -ENOMEM;
2089	goto out;
2090	}
2091
2092	for (i = `0`; i < map->num_stripes; i++) {
2093	bool already_inserted = false;
2094	u32 stripe_nr;
2095	u32 offset;
2096	int j;
2097
2098	if (!in_range(physical, map->stripes[i].physical,
2099	data_stripe_length))
2100	continue;
2101
2102	stripe_nr = (physical - map->stripes[i].physical) >>
2103	BTRFS_STRIPE_LEN_SHIFT;
2104	offset = (physical - map->stripes[i].physical) &
2105	BTRFS_STRIPE_LEN_MASK;
2106
2107	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
2108	BTRFS_BLOCK_GROUP_RAID10))
2109	stripe_nr = div_u64(dividend: stripe_nr * map->num_stripes + i,
2110	divisor: map->sub_stripes);
2111	/*
2112	* The remaining case would be for RAID56, multiply by
2113	* nr_data_stripes(). Alternatively, just use rmap_len below
2114	* instead of map->stripe_len
2115	*/
2116	bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
2117
2118	/ Ensure we don't add duplicate addresses /
2119	for (j = `0`; j < nr; j++) {
2120	if (buf[j] == bytenr) {
2121	already_inserted = true;
2122	break;
2123	}
2124	}
2125
2126	if (!already_inserted)
2127	buf[nr++] = bytenr;
2128	}
2129
2130	*logical = buf;
2131	*naddrs = nr;
2132	*stripe_len = io_stripe_size;
2133	out:
2134	btrfs_free_chunk_map(map);
2135	return ret;
2136	}
2137
2138	static int exclude_super_stripes(struct btrfs_block_group *cache)
2139	{
2140	struct btrfs_fs_info *fs_info = cache->fs_info;
2141	const bool zoned = btrfs_is_zoned(fs_info);
2142	u64 bytenr;
2143	u64 *logical;
2144	int stripe_len;
2145	int i, nr, ret;
2146
2147	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2148	stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2149	cache->bytes_super += stripe_len;
2150	ret = set_extent_bit(tree: &fs_info->excluded_extents, start: cache->start,
2151	end: cache->start + stripe_len - `1`,
2152	bits: EXTENT_UPTODATE, NULL);
2153	if (ret)
2154	return ret;
2155	}
2156
2157	for (i = `0`; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2158	bytenr = btrfs_sb_offset(mirror: i);
2159	ret = btrfs_rmap_block(fs_info, chunk_start: cache->start,
2160	physical: bytenr, logical: &logical, naddrs: &nr, stripe_len: &stripe_len);
2161	if (ret)
2162	return ret;
2163
2164	/ Shouldn't have super stripes in sequential zones /
2165	if (zoned && nr) {
2166	kfree(objp: logical);
2167	btrfs_err(fs_info,
2168	"zoned: block group %llu must not contain super block",
2169	cache->start);
2170	return -EUCLEAN;
2171	}
2172
2173	while (nr--) {
2174	u64 len = min_t(u64, stripe_len,
2175	cache->start + cache->length - logical[nr]);
2176
2177	cache->bytes_super += len;
2178	ret = set_extent_bit(tree: &fs_info->excluded_extents, start: logical[nr],
2179	end: logical[nr] + len - `1`,
2180	bits: EXTENT_UPTODATE, NULL);
2181	if (ret) {
2182	kfree(objp: logical);
2183	return ret;
2184	}
2185	}
2186
2187	kfree(objp: logical);
2188	}
2189	return `0`;
2190	}
2191
2192	static struct btrfs_block_group *btrfs_create_block_group_cache(
2193	struct btrfs_fs_info *fs_info, u64 start)
2194	{
2195	struct btrfs_block_group *cache;
2196
2197	cache = kzalloc(size: sizeof(*cache), GFP_NOFS);
2198	if (!cache)
2199	return NULL;
2200
2201	cache->free_space_ctl = kzalloc(size: sizeof(*cache->free_space_ctl),
2202	GFP_NOFS);
2203	if (!cache->free_space_ctl) {
2204	kfree(objp: cache);
2205	return NULL;
2206	}
2207
2208	cache->start = start;
2209
2210	cache->fs_info = fs_info;
2211	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, logical: start);
2212
2213	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2214
2215	refcount_set(r: &cache->refs, n: `1`);
2216	spin_lock_init(&cache->lock);
2217	init_rwsem(&cache->data_rwsem);
2218	INIT_LIST_HEAD(list: &cache->list);
2219	INIT_LIST_HEAD(list: &cache->cluster_list);
2220	INIT_LIST_HEAD(list: &cache->bg_list);
2221	INIT_LIST_HEAD(list: &cache->ro_list);
2222	INIT_LIST_HEAD(list: &cache->discard_list);
2223	INIT_LIST_HEAD(list: &cache->dirty_list);
2224	INIT_LIST_HEAD(list: &cache->io_list);
2225	INIT_LIST_HEAD(list: &cache->active_bg_list);
2226	btrfs_init_free_space_ctl(block_group: cache, ctl: cache->free_space_ctl);
2227	atomic_set(v: &cache->frozen, i: `0`);
2228	mutex_init(&cache->free_space_lock);
2229
2230	return cache;
2231	}
2232
2233	/*
2234	* Iterate all chunks and verify that each of them has the corresponding block
2235	* group
2236	*/
2237	static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2238	{
2239	u64 start = `0`;
2240	int ret = `0`;
2241
2242	while (`1`) {
2243	struct btrfs_chunk_map *map;
2244	struct btrfs_block_group *bg;
2245
2246	/*
2247	* btrfs_find_chunk_map() will return the first chunk map
2248	* intersecting the range, so setting @length to 1 is enough to
2249	* get the first chunk.
2250	*/
2251	map = btrfs_find_chunk_map(fs_info, logical: start, length: `1`);
2252	if (!map)
2253	break;
2254
2255	bg = btrfs_lookup_block_group(info: fs_info, bytenr: map->start);
2256	if (!bg) {
2257	btrfs_err(fs_info,
2258	"chunk start=%llu len=%llu doesn't have corresponding block group",
2259	map->start, map->chunk_len);
2260	ret = -EUCLEAN;
2261	btrfs_free_chunk_map(map);
2262	break;
2263	}
2264	if (bg->start != map->start \|\| bg->length != map->chunk_len \|\|
2265	(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2266	(map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
2267	btrfs_err(fs_info,
2268	"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2269	map->start, map->chunk_len,
2270	map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2271	bg->start, bg->length,
2272	bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2273	ret = -EUCLEAN;
2274	btrfs_free_chunk_map(map);
2275	btrfs_put_block_group(cache: bg);
2276	break;
2277	}
2278	start = map->start + map->chunk_len;
2279	btrfs_free_chunk_map(map);
2280	btrfs_put_block_group(cache: bg);
2281	}
2282	return ret;
2283	}
2284
2285	static int read_one_block_group(struct btrfs_fs_info *info,
2286	struct btrfs_block_group_item *bgi,
2287	const struct btrfs_key *key,
2288	int need_clear)
2289	{
2290	struct btrfs_block_group *cache;
2291	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2292	int ret;
2293
2294	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2295
2296	cache = btrfs_create_block_group_cache(fs_info: info, start: key->objectid);
2297	if (!cache)
2298	return -ENOMEM;
2299
2300	cache->length = key->offset;
2301	cache->used = btrfs_stack_block_group_used(s: bgi);
2302	cache->commit_used = cache->used;
2303	cache->flags = btrfs_stack_block_group_flags(s: bgi);
2304	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(s: bgi);
2305
2306	set_free_space_tree_thresholds(cache);
2307
2308	if (need_clear) {
2309	/*
2310	* When we mount with old space cache, we need to
2311	* set BTRFS_DC_CLEAR and set dirty flag.
2312	*
2313	* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2314	* truncate the old free space cache inode and
2315	* setup a new one.
2316	* b) Setting 'dirty flag' makes sure that we flush
2317	* the new space cache info onto disk.
2318	*/
2319	if (btrfs_test_opt(info, SPACE_CACHE))
2320	cache->disk_cache_state = BTRFS_DC_CLEAR;
2321	}
2322	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2323	(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2324	btrfs_err(info,
2325	"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2326	cache->start);
2327	ret = -EINVAL;
2328	goto error;
2329	}
2330
2331	ret = btrfs_load_block_group_zone_info(cache, new: false);
2332	if (ret) {
2333	btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2334	cache->start);
2335	goto error;
2336	}
2337
2338	/*
2339	* We need to exclude the super stripes now so that the space info has
2340	* super bytes accounted for, otherwise we'll think we have more space
2341	* than we actually do.
2342	*/
2343	ret = exclude_super_stripes(cache);
2344	if (ret) {
2345	/ We may have excluded something, so call this just in case. /
2346	btrfs_free_excluded_extents(bg: cache);
2347	goto error;
2348	}
2349
2350	/*
2351	* For zoned filesystem, space after the allocation offset is the only
2352	* free space for a block group. So, we don't need any caching work.
2353	* btrfs_calc_zone_unusable() will set the amount of free space and
2354	* zone_unusable space.
2355	*
2356	* For regular filesystem, check for two cases, either we are full, and
2357	* therefore don't need to bother with the caching work since we won't
2358	* find any space, or we are empty, and we can just add all the space
2359	* in and be done with it. This saves us _a_lot_ of time, particularly
2360	* in the full case.
2361	*/
2362	if (btrfs_is_zoned(fs_info: info)) {
2363	btrfs_calc_zone_unusable(cache);
2364	/ Should not have any excluded extents. Just in case, though. /
2365	btrfs_free_excluded_extents(bg: cache);
2366	} else if (cache->length == cache->used) {
2367	cache->cached = BTRFS_CACHE_FINISHED;
2368	btrfs_free_excluded_extents(bg: cache);
2369	} else if (cache->used == `0`) {
2370	cache->cached = BTRFS_CACHE_FINISHED;
2371	ret = btrfs_add_new_free_space(block_group: cache, start: cache->start,
2372	end: cache->start + cache->length, NULL);
2373	btrfs_free_excluded_extents(bg: cache);
2374	if (ret)
2375	goto error;
2376	}
2377
2378	ret = btrfs_add_block_group_cache(info, block_group: cache);
2379	if (ret) {
2380	btrfs_remove_free_space_cache(block_group: cache);
2381	goto error;
2382	}
2383	trace_btrfs_add_block_group(fs_info: info, block_group: cache, create: `0`);
2384	btrfs_add_bg_to_space_info(info, block_group: cache);
2385
2386	set_avail_alloc_bits(fs_info: info, flags: cache->flags);
2387	if (btrfs_chunk_writeable(fs_info: info, chunk_offset: cache->start)) {
2388	if (cache->used == `0`) {
2389	ASSERT(list_empty(&cache->bg_list));
2390	if (btrfs_test_opt(info, DISCARD_ASYNC))
2391	btrfs_discard_queue_work(discard_ctl: &info->discard_ctl, block_group: cache);
2392	else
2393	btrfs_mark_bg_unused(bg: cache);
2394	}
2395	} else {
2396	inc_block_group_ro(cache, force: `1`);
2397	}
2398
2399	return `0`;
2400	error:
2401	btrfs_put_block_group(cache);
2402	return ret;
2403	}
2404
2405	static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2406	{
2407	struct rb_node *node;
2408	int ret = `0`;
2409
2410	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2411	struct btrfs_chunk_map *map;
2412	struct btrfs_block_group *bg;
2413
2414	map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2415	bg = btrfs_create_block_group_cache(fs_info, start: map->start);
2416	if (!bg) {
2417	ret = -ENOMEM;
2418	break;
2419	}
2420
2421	/ Fill dummy cache as FULL /
2422	bg->length = map->chunk_len;
2423	bg->flags = map->type;
2424	bg->cached = BTRFS_CACHE_FINISHED;
2425	bg->used = map->chunk_len;
2426	bg->flags = map->type;
2427	ret = btrfs_add_block_group_cache(info: fs_info, block_group: bg);
2428	/*
2429	* We may have some valid block group cache added already, in
2430	* that case we skip to the next one.
2431	*/
2432	if (ret == -EEXIST) {
2433	ret = `0`;
2434	btrfs_put_block_group(cache: bg);
2435	continue;
2436	}
2437
2438	if (ret) {
2439	btrfs_remove_free_space_cache(block_group: bg);
2440	btrfs_put_block_group(cache: bg);
2441	break;
2442	}
2443
2444	btrfs_add_bg_to_space_info(info: fs_info, block_group: bg);
2445
2446	set_avail_alloc_bits(fs_info, flags: bg->flags);
2447	}
2448	if (!ret)
2449	btrfs_init_global_block_rsv(fs_info);
2450	return ret;
2451	}
2452
2453	int btrfs_read_block_groups(struct btrfs_fs_info *info)
2454	{
2455	struct btrfs_root *root = btrfs_block_group_root(fs_info: info);
2456	struct btrfs_path *path;
2457	int ret;
2458	struct btrfs_block_group *cache;
2459	struct btrfs_space_info *space_info;
2460	struct btrfs_key key;
2461	int need_clear = `0`;
2462	u64 cache_gen;
2463
2464	/*
2465	* Either no extent root (with ibadroots rescue option) or we have
2466	* unsupported RO options. The fs can never be mounted read-write, so no
2467	* need to waste time searching block group items.
2468	*
2469	* This also allows new extent tree related changes to be RO compat,
2470	* no need for a full incompat flag.
2471	*/
2472	if (!root \|\| (btrfs_super_compat_ro_flags(s: info->super_copy) &
2473	~BTRFS_FEATURE_COMPAT_RO_SUPP))
2474	return fill_dummy_bgs(fs_info: info);
2475
2476	key.objectid = `0`;
2477	key.offset = `0`;
2478	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2479	path = btrfs_alloc_path();
2480	if (!path)
2481	return -ENOMEM;
2482
2483	cache_gen = btrfs_super_cache_generation(s: info->super_copy);
2484	if (btrfs_test_opt(info, SPACE_CACHE) &&
2485	btrfs_super_generation(s: info->super_copy) != cache_gen)
2486	need_clear = `1`;
2487	if (btrfs_test_opt(info, CLEAR_CACHE))
2488	need_clear = `1`;
2489
2490	while (`1`) {
2491	struct btrfs_block_group_item bgi;
2492	struct extent_buffer *leaf;
2493	int slot;
2494
2495	ret = find_first_block_group(fs_info: info, path, key: &key);
2496	if (ret > `0`)
2497	break;
2498	if (ret != `0`)
2499	goto error;
2500
2501	leaf = path->nodes[`0`];
2502	slot = path->slots[`0`];
2503
2504	read_extent_buffer(eb: leaf, dst: &bgi, btrfs_item_ptr_offset(leaf, slot),
2505	len: sizeof(bgi));
2506
2507	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
2508	btrfs_release_path(p: path);
2509	ret = read_one_block_group(info, bgi: &bgi, key: &key, need_clear);
2510	if (ret < `0`)
2511	goto error;
2512	key.objectid += key.offset;
2513	key.offset = `0`;
2514	}
2515	btrfs_release_path(p: path);
2516
2517	list_for_each_entry(space_info, &info->space_info, list) {
2518	int i;
2519
2520	for (i = `0`; i < BTRFS_NR_RAID_TYPES; i++) {
2521	if (list_empty(head: &space_info->block_groups[i]))
2522	continue;
2523	cache = list_first_entry(&space_info->block_groups[i],
2524	struct btrfs_block_group,
2525	list);
2526	btrfs_sysfs_add_block_group_type(cache);
2527	}
2528
2529	if (!(btrfs_get_alloc_profile(fs_info: info, orig_flags: space_info->flags) &
2530	(BTRFS_BLOCK_GROUP_RAID10 \|
2531	BTRFS_BLOCK_GROUP_RAID1_MASK \|
2532	BTRFS_BLOCK_GROUP_RAID56_MASK \|
2533	BTRFS_BLOCK_GROUP_DUP)))
2534	continue;
2535	/*
2536	* Avoid allocating from un-mirrored block group if there are
2537	* mirrored block groups.
2538	*/
2539	list_for_each_entry(cache,
2540	&space_info->block_groups[BTRFS_RAID_RAID0],
2541	list)
2542	inc_block_group_ro(cache, force: `1`);
2543	list_for_each_entry(cache,
2544	&space_info->block_groups[BTRFS_RAID_SINGLE],
2545	list)
2546	inc_block_group_ro(cache, force: `1`);
2547	}
2548
2549	btrfs_init_global_block_rsv(fs_info: info);
2550	ret = check_chunk_block_group_mappings(fs_info: info);
2551	error:
2552	btrfs_free_path(p: path);
2553	/*
2554	* We've hit some error while reading the extent tree, and have
2555	* rescue=ibadroots mount option.
2556	* Try to fill the tree using dummy block groups so that the user can
2557	* continue to mount and grab their data.
2558	*/
2559	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2560	ret = fill_dummy_bgs(fs_info: info);
2561	return ret;
2562	}
2563
2564	/*
2565	* This function, insert_block_group_item(), belongs to the phase 2 of chunk
2566	* allocation.
2567	*
2568	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2569	* phases.
2570	*/
2571	static int insert_block_group_item(struct btrfs_trans_handle *trans,
2572	struct btrfs_block_group *block_group)
2573	{
2574	struct btrfs_fs_info *fs_info = trans->fs_info;
2575	struct btrfs_block_group_item bgi;
2576	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2577	struct btrfs_key key;
2578	u64 old_commit_used;
2579	int ret;
2580
2581	spin_lock(lock: &block_group->lock);
2582	btrfs_set_stack_block_group_used(s: &bgi, val: block_group->used);
2583	btrfs_set_stack_block_group_chunk_objectid(s: &bgi,
2584	val: block_group->global_root_id);
2585	btrfs_set_stack_block_group_flags(s: &bgi, val: block_group->flags);
2586	old_commit_used = block_group->commit_used;
2587	block_group->commit_used = block_group->used;
2588	key.objectid = block_group->start;
2589	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2590	key.offset = block_group->length;
2591	spin_unlock(lock: &block_group->lock);
2592
2593	ret = btrfs_insert_item(trans, root, key: &key, data: &bgi, data_size: sizeof(bgi));
2594	if (ret < `0`) {
2595	spin_lock(lock: &block_group->lock);
2596	block_group->commit_used = old_commit_used;
2597	spin_unlock(lock: &block_group->lock);
2598	}
2599
2600	return ret;
2601	}
2602
2603	static int insert_dev_extent(struct btrfs_trans_handle *trans,
2604	struct btrfs_device *device, u64 chunk_offset,
2605	u64 start, u64 num_bytes)
2606	{
2607	struct btrfs_fs_info *fs_info = device->fs_info;
2608	struct btrfs_root *root = fs_info->dev_root;
2609	struct btrfs_path *path;
2610	struct btrfs_dev_extent *extent;
2611	struct extent_buffer *leaf;
2612	struct btrfs_key key;
2613	int ret;
2614
2615	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2616	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2617	path = btrfs_alloc_path();
2618	if (!path)
2619	return -ENOMEM;
2620
2621	key.objectid = device->devid;
2622	key.type = BTRFS_DEV_EXTENT_KEY;
2623	key.offset = start;
2624	ret = btrfs_insert_empty_item(trans, root, path, key: &key, data_size: sizeof(*extent));
2625	if (ret)
2626	goto out;
2627
2628	leaf = path->nodes[`0`];
2629	extent = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_dev_extent);
2630	btrfs_set_dev_extent_chunk_tree(eb: leaf, s: extent, BTRFS_CHUNK_TREE_OBJECTID);
2631	btrfs_set_dev_extent_chunk_objectid(eb: leaf, s: extent,
2632	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2633	btrfs_set_dev_extent_chunk_offset(eb: leaf, s: extent, val: chunk_offset);
2634
2635	btrfs_set_dev_extent_length(eb: leaf, s: extent, val: num_bytes);
2636	btrfs_mark_buffer_dirty(trans, buf: leaf);
2637	out:
2638	btrfs_free_path(p: path);
2639	return ret;
2640	}
2641
2642	/*
2643	* This function belongs to phase 2.
2644	*
2645	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2646	* phases.
2647	*/
2648	static int insert_dev_extents(struct btrfs_trans_handle *trans,
2649	u64 chunk_offset, u64 chunk_size)
2650	{
2651	struct btrfs_fs_info *fs_info = trans->fs_info;
2652	struct btrfs_device *device;
2653	struct btrfs_chunk_map *map;
2654	u64 dev_offset;
2655	int i;
2656	int ret = `0`;
2657
2658	map = btrfs_get_chunk_map(fs_info, logical: chunk_offset, length: chunk_size);
2659	if (IS_ERR(ptr: map))
2660	return PTR_ERR(ptr: map);
2661
2662	/*
2663	* Take the device list mutex to prevent races with the final phase of
2664	* a device replace operation that replaces the device object associated
2665	* with the map's stripes, because the device object's id can change
2666	* at any time during that final phase of the device replace operation
2667	* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2668	* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2669	* resulting in persisting a device extent item with such ID.
2670	*/
2671	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2672	for (i = `0`; i < map->num_stripes; i++) {
2673	device = map->stripes[i].dev;
2674	dev_offset = map->stripes[i].physical;
2675
2676	ret = insert_dev_extent(trans, device, chunk_offset, start: dev_offset,
2677	num_bytes: map->stripe_size);
2678	if (ret)
2679	break;
2680	}
2681	mutex_unlock(lock: &fs_info->fs_devices->device_list_mutex);
2682
2683	btrfs_free_chunk_map(map);
2684	return ret;
2685	}
2686
2687	/*
2688	* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2689	* chunk allocation.
2690	*
2691	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2692	* phases.
2693	*/
2694	void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2695	{
2696	struct btrfs_fs_info *fs_info = trans->fs_info;
2697	struct btrfs_block_group *block_group;
2698	int ret = `0`;
2699
2700	while (!list_empty(head: &trans->new_bgs)) {
2701	int index;
2702
2703	block_group = list_first_entry(&trans->new_bgs,
2704	struct btrfs_block_group,
2705	bg_list);
2706	if (ret)
2707	goto next;
2708
2709	index = btrfs_bg_flags_to_raid_index(flags: block_group->flags);
2710
2711	ret = insert_block_group_item(trans, block_group);
2712	if (ret)
2713	btrfs_abort_transaction(trans, ret);
2714	if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2715	&block_group->runtime_flags)) {
2716	mutex_lock(&fs_info->chunk_mutex);
2717	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg: block_group);
2718	mutex_unlock(lock: &fs_info->chunk_mutex);
2719	if (ret)
2720	btrfs_abort_transaction(trans, ret);
2721	}
2722	ret = insert_dev_extents(trans, chunk_offset: block_group->start,
2723	chunk_size: block_group->length);
2724	if (ret)
2725	btrfs_abort_transaction(trans, ret);
2726	add_block_group_free_space(trans, block_group);
2727
2728	/*
2729	* If we restriped during balance, we may have added a new raid
2730	* type, so now add the sysfs entries when it is safe to do so.
2731	* We don't have to worry about locking here as it's handled in
2732	* btrfs_sysfs_add_block_group_type.
2733	*/
2734	if (block_group->space_info->block_group_kobjs[index] == NULL)
2735	btrfs_sysfs_add_block_group_type(cache: block_group);
2736
2737	/ Already aborted the transaction if it failed. /
2738	next:
2739	btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2740	list_del_init(entry: &block_group->bg_list);
2741	clear_bit(nr: BLOCK_GROUP_FLAG_NEW, addr: &block_group->runtime_flags);
2742
2743	/*
2744	* If the block group is still unused, add it to the list of
2745	* unused block groups. The block group may have been created in
2746	* order to satisfy a space reservation, in which case the
2747	* extent allocation only happens later. But often we don't
2748	* actually need to allocate space that we previously reserved,
2749	* so the block group may become unused for a long time. For
2750	* example for metadata we generally reserve space for a worst
2751	* possible scenario, but then don't end up allocating all that
2752	* space or none at all (due to no need to COW, extent buffers
2753	* were already COWed in the current transaction and still
2754	* unwritten, tree heights lower than the maximum possible
2755	* height, etc). For data we generally reserve the axact amount
2756	* of space we are going to allocate later, the exception is
2757	* when using compression, as we must reserve space based on the
2758	* uncompressed data size, because the compression is only done
2759	* when writeback triggered and we don't know how much space we
2760	* are actually going to need, so we reserve the uncompressed
2761	* size because the data may be uncompressible in the worst case.
2762	*/
2763	if (ret == `0`) {
2764	bool used;
2765
2766	spin_lock(lock: &block_group->lock);
2767	used = btrfs_is_block_group_used(bg: block_group);
2768	spin_unlock(lock: &block_group->lock);
2769
2770	if (!used)
2771	btrfs_mark_bg_unused(bg: block_group);
2772	}
2773	}
2774	btrfs_trans_release_chunk_metadata(trans);
2775	}
2776
2777	/*
2778	* For extent tree v2 we use the block_group_item->chunk_offset to point at our
2779	* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2780	*/
2781	static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
2782	{
2783	u64 div = SZ_1G;
2784	u64 index;
2785
2786	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2787	return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2788
2789	/ If we have a smaller fs index based on 128MiB. /
2790	if (btrfs_super_total_bytes(s: fs_info->super_copy) <= (SZ_1G * `10ULL`))
2791	div = SZ_128M;
2792
2793	offset = div64_u64(dividend: offset, divisor: div);
2794	div64_u64_rem(dividend: offset, divisor: fs_info->nr_global_roots, remainder: &index);
2795	return index;
2796	}
2797
2798	struct btrfs_block_group btrfs_make_block_group(struct* btrfs_trans_handle *trans,
2799	u64 type,
2800	u64 chunk_offset, u64 size)
2801	{
2802	struct btrfs_fs_info *fs_info = trans->fs_info;
2803	struct btrfs_block_group *cache;
2804	int ret;
2805
2806	btrfs_set_log_full_commit(trans);
2807
2808	cache = btrfs_create_block_group_cache(fs_info, start: chunk_offset);
2809	if (!cache)
2810	return ERR_PTR(error: -ENOMEM);
2811
2812	/*
2813	* Mark it as new before adding it to the rbtree of block groups or any
2814	* list, so that no other task finds it and calls btrfs_mark_bg_unused()
2815	* before the new flag is set.
2816	*/
2817	set_bit(nr: BLOCK_GROUP_FLAG_NEW, addr: &cache->runtime_flags);
2818
2819	cache->length = size;
2820	set_free_space_tree_thresholds(cache);
2821	cache->flags = type;
2822	cache->cached = BTRFS_CACHE_FINISHED;
2823	cache->global_root_id = calculate_global_root_id(fs_info, offset: cache->start);
2824
2825	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2826	set_bit(nr: BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, addr: &cache->runtime_flags);
2827
2828	ret = btrfs_load_block_group_zone_info(cache, new: true);
2829	if (ret) {
2830	btrfs_put_block_group(cache);
2831	return ERR_PTR(error: ret);
2832	}
2833
2834	ret = exclude_super_stripes(cache);
2835	if (ret) {
2836	/ We may have excluded something, so call this just in case /
2837	btrfs_free_excluded_extents(bg: cache);
2838	btrfs_put_block_group(cache);
2839	return ERR_PTR(error: ret);
2840	}
2841
2842	ret = btrfs_add_new_free_space(block_group: cache, start: chunk_offset, end: chunk_offset + size, NULL);
2843	btrfs_free_excluded_extents(bg: cache);
2844	if (ret) {
2845	btrfs_put_block_group(cache);
2846	return ERR_PTR(error: ret);
2847	}
2848
2849	/*
2850	* Ensure the corresponding space_info object is created and
2851	* assigned to our block group. We want our bg to be added to the rbtree
2852	* with its ->space_info set.
2853	*/
2854	cache->space_info = btrfs_find_space_info(info: fs_info, flags: cache->flags);
2855	ASSERT(cache->space_info);
2856
2857	ret = btrfs_add_block_group_cache(info: fs_info, block_group: cache);
2858	if (ret) {
2859	btrfs_remove_free_space_cache(block_group: cache);
2860	btrfs_put_block_group(cache);
2861	return ERR_PTR(error: ret);
2862	}
2863
2864	/*
2865	* Now that our block group has its ->space_info set and is inserted in
2866	* the rbtree, update the space info's counters.
2867	*/
2868	trace_btrfs_add_block_group(fs_info, block_group: cache, create: `1`);
2869	btrfs_add_bg_to_space_info(info: fs_info, block_group: cache);
2870	btrfs_update_global_block_rsv(fs_info);
2871
2872	#ifdef CONFIG_BTRFS_DEBUG
2873	if (btrfs_should_fragment_free_space(block_group: cache)) {
2874	cache->space_info->bytes_used += size >> `1`;
2875	fragment_free_space(block_group: cache);
2876	}
2877	#endif
2878
2879	list_add_tail(new: &cache->bg_list, head: &trans->new_bgs);
2880	btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
2881
2882	set_avail_alloc_bits(fs_info, flags: type);
2883	return cache;
2884	}
2885
2886	/*
2887	* Mark one block group RO, can be called several times for the same block
2888	* group.
2889	*
2890	* @cache: the destination block group
2891	* @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
2892	* ensure we still have some free space after marking this
2893	* block group RO.
2894	*/
2895	int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2896	bool do_chunk_alloc)
2897	{
2898	struct btrfs_fs_info *fs_info = cache->fs_info;
2899	struct btrfs_trans_handle *trans;
2900	struct btrfs_root *root = btrfs_block_group_root(fs_info);
2901	u64 alloc_flags;
2902	int ret;
2903	bool dirty_bg_running;
2904
2905	/*
2906	* This can only happen when we are doing read-only scrub on read-only
2907	* mount.
2908	* In that case we should not start a new transaction on read-only fs.
2909	* Thus here we skip all chunk allocations.
2910	*/
2911	if (sb_rdonly(sb: fs_info->sb)) {
2912	mutex_lock(&fs_info->ro_block_group_mutex);
2913	ret = inc_block_group_ro(cache, force: `0`);
2914	mutex_unlock(lock: &fs_info->ro_block_group_mutex);
2915	return ret;
2916	}
2917
2918	do {
2919	trans = btrfs_join_transaction(root);
2920	if (IS_ERR(ptr: trans))
2921	return PTR_ERR(ptr: trans);
2922
2923	dirty_bg_running = false;
2924
2925	/*
2926	* We're not allowed to set block groups readonly after the dirty
2927	* block group cache has started writing. If it already started,
2928	* back off and let this transaction commit.
2929	*/
2930	mutex_lock(&fs_info->ro_block_group_mutex);
2931	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2932	u64 transid = trans->transid;
2933
2934	mutex_unlock(lock: &fs_info->ro_block_group_mutex);
2935	btrfs_end_transaction(trans);
2936
2937	ret = btrfs_wait_for_commit(fs_info, transid);
2938	if (ret)
2939	return ret;
2940	dirty_bg_running = true;
2941	}
2942	} while (dirty_bg_running);
2943
2944	if (do_chunk_alloc) {
2945	/*
2946	* If we are changing raid levels, try to allocate a
2947	* corresponding block group with the new raid level.
2948	*/
2949	alloc_flags = btrfs_get_alloc_profile(fs_info, orig_flags: cache->flags);
2950	if (alloc_flags != cache->flags) {
2951	ret = btrfs_chunk_alloc(trans, flags: alloc_flags,
2952	force: CHUNK_ALLOC_FORCE);
2953	/*
2954	* ENOSPC is allowed here, we may have enough space
2955	* already allocated at the new raid level to carry on
2956	*/
2957	if (ret == -ENOSPC)
2958	ret = `0`;
2959	if (ret < `0`)
2960	goto out;
2961	}
2962	}
2963
2964	ret = inc_block_group_ro(cache, force: `0`);
2965	if (!ret)
2966	goto out;
2967	if (ret == -ETXTBSY)
2968	goto unlock_out;
2969
2970	/*
2971	* Skip chunk allocation if the bg is SYSTEM, this is to avoid system
2972	* chunk allocation storm to exhaust the system chunk array. Otherwise
2973	* we still want to try our best to mark the block group read-only.
2974	*/
2975	if (!do_chunk_alloc && ret == -ENOSPC &&
2976	(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
2977	goto unlock_out;
2978
2979	alloc_flags = btrfs_get_alloc_profile(fs_info, orig_flags: cache->space_info->flags);
2980	ret = btrfs_chunk_alloc(trans, flags: alloc_flags, force: CHUNK_ALLOC_FORCE);
2981	if (ret < `0`)
2982	goto out;
2983	/*
2984	* We have allocated a new chunk. We also need to activate that chunk to
2985	* grant metadata tickets for zoned filesystem.
2986	*/
2987	ret = btrfs_zoned_activate_one_bg(fs_info, space_info: cache->space_info, do_finish: true);
2988	if (ret < `0`)
2989	goto out;
2990
2991	ret = inc_block_group_ro(cache, force: `0`);
2992	if (ret == -ETXTBSY)
2993	goto unlock_out;
2994	out:
2995	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
2996	alloc_flags = btrfs_get_alloc_profile(fs_info, orig_flags: cache->flags);
2997	mutex_lock(&fs_info->chunk_mutex);
2998	check_system_chunk(trans, type: alloc_flags);
2999	mutex_unlock(lock: &fs_info->chunk_mutex);
3000	}
3001	unlock_out:
3002	mutex_unlock(lock: &fs_info->ro_block_group_mutex);
3003
3004	btrfs_end_transaction(trans);
3005	return ret;
3006	}
3007
3008	void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
3009	{
3010	struct btrfs_space_info *sinfo = cache->space_info;
3011	u64 num_bytes;
3012
3013	BUG_ON(!cache->ro);
3014
3015	spin_lock(lock: &sinfo->lock);
3016	spin_lock(lock: &cache->lock);
3017	if (!--cache->ro) {
3018	if (btrfs_is_zoned(fs_info: cache->fs_info)) {
3019	/ Migrate zone_unusable bytes back /
3020	cache->zone_unusable =
3021	(cache->alloc_offset - cache->used) +
3022	(cache->length - cache->zone_capacity);
3023	sinfo->bytes_zone_unusable += cache->zone_unusable;
3024	sinfo->bytes_readonly -= cache->zone_unusable;
3025	}
3026	num_bytes = cache->length - cache->reserved -
3027	cache->pinned - cache->bytes_super -
3028	cache->zone_unusable - cache->used;
3029	sinfo->bytes_readonly -= num_bytes;
3030	list_del_init(entry: &cache->ro_list);
3031	}
3032	spin_unlock(lock: &cache->lock);
3033	spin_unlock(lock: &sinfo->lock);
3034	}
3035
3036	static int update_block_group_item(struct btrfs_trans_handle *trans,
3037	struct btrfs_path *path,
3038	struct btrfs_block_group *cache)
3039	{
3040	struct btrfs_fs_info *fs_info = trans->fs_info;
3041	int ret;
3042	struct btrfs_root *root = btrfs_block_group_root(fs_info);
3043	unsigned long bi;
3044	struct extent_buffer *leaf;
3045	struct btrfs_block_group_item bgi;
3046	struct btrfs_key key;
3047	u64 old_commit_used;
3048	u64 used;
3049
3050	/*
3051	* Block group items update can be triggered out of commit transaction
3052	* critical section, thus we need a consistent view of used bytes.
3053	* We cannot use cache->used directly outside of the spin lock, as it
3054	* may be changed.
3055	*/
3056	spin_lock(lock: &cache->lock);
3057	old_commit_used = cache->commit_used;
3058	used = cache->used;
3059	/ No change in used bytes, can safely skip it. /
3060	if (cache->commit_used == used) {
3061	spin_unlock(lock: &cache->lock);
3062	return `0`;
3063	}
3064	cache->commit_used = used;
3065	spin_unlock(lock: &cache->lock);
3066
3067	key.objectid = cache->start;
3068	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
3069	key.offset = cache->length;
3070
3071	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
3072	if (ret) {
3073	if (ret > `0`)
3074	ret = -ENOENT;
3075	goto fail;
3076	}
3077
3078	leaf = path->nodes[`0`];
3079	bi = btrfs_item_ptr_offset(leaf, path->slots[`0`]);
3080	btrfs_set_stack_block_group_used(s: &bgi, val: used);
3081	btrfs_set_stack_block_group_chunk_objectid(s: &bgi,
3082	val: cache->global_root_id);
3083	btrfs_set_stack_block_group_flags(s: &bgi, val: cache->flags);
3084	write_extent_buffer(eb: leaf, src: &bgi, start: bi, len: sizeof(bgi));
3085	btrfs_mark_buffer_dirty(trans, buf: leaf);
3086	fail:
3087	btrfs_release_path(p: path);
3088	/*
3089	* We didn't update the block group item, need to revert commit_used
3090	* unless the block group item didn't exist yet - this is to prevent a
3091	* race with a concurrent insertion of the block group item, with
3092	* insert_block_group_item(), that happened just after we attempted to
3093	* update. In that case we would reset commit_used to 0 just after the
3094	* insertion set it to a value greater than 0 - if the block group later
3095	* becomes with 0 used bytes, we would incorrectly skip its update.
3096	*/
3097	if (ret < `0` && ret != -ENOENT) {
3098	spin_lock(lock: &cache->lock);
3099	cache->commit_used = old_commit_used;
3100	spin_unlock(lock: &cache->lock);
3101	}
3102	return ret;
3103
3104	}
3105
3106	static int cache_save_setup(struct btrfs_block_group *block_group,
3107	struct btrfs_trans_handle *trans,
3108	struct btrfs_path *path)
3109	{
3110	struct btrfs_fs_info *fs_info = block_group->fs_info;
3111	struct inode *inode = NULL;
3112	struct extent_changeset *data_reserved = NULL;
3113	u64 alloc_hint = `0`;
3114	int dcs = BTRFS_DC_ERROR;
3115	u64 cache_size = `0`;
3116	int retries = `0`;
3117	int ret = `0`;
3118
3119	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3120	return `0`;
3121
3122	/*
3123	* If this block group is smaller than 100 megs don't bother caching the
3124	* block group.
3125	*/
3126	if (block_group->length < (`100` * SZ_1M)) {
3127	spin_lock(lock: &block_group->lock);
3128	block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3129	spin_unlock(lock: &block_group->lock);
3130	return `0`;
3131	}
3132
3133	if (TRANS_ABORTED(trans))
3134	return `0`;
3135	again:
3136	inode = lookup_free_space_inode(block_group, path);
3137	if (IS_ERR(ptr: inode) && PTR_ERR(ptr: inode) != -ENOENT) {
3138	ret = PTR_ERR(ptr: inode);
3139	btrfs_release_path(p: path);
3140	goto out;
3141	}
3142
3143	if (IS_ERR(ptr: inode)) {
3144	BUG_ON(retries);
3145	retries++;
3146
3147	if (block_group->ro)
3148	goto out_free;
3149
3150	ret = create_free_space_inode(trans, block_group, path);
3151	if (ret)
3152	goto out_free;
3153	goto again;
3154	}
3155
3156	/*
3157	* We want to set the generation to 0, that way if anything goes wrong
3158	* from here on out we know not to trust this cache when we load up next
3159	* time.
3160	*/
3161	BTRFS_I(inode)->generation = `0`;
3162	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
3163	if (ret) {
3164	/*
3165	* So theoretically we could recover from this, simply set the
3166	* super cache generation to 0 so we know to invalidate the
3167	* cache, but then we'd have to keep track of the block groups
3168	* that fail this way so we know we _have_ to reset this cache
3169	* before the next commit or risk reading stale cache. So to
3170	* limit our exposure to horrible edge cases lets just abort the
3171	* transaction, this only happens in really bad situations
3172	* anyway.
3173	*/
3174	btrfs_abort_transaction(trans, ret);
3175	goto out_put;
3176	}
3177	WARN_ON(ret);
3178
3179	/ We've already setup this transaction, go ahead and exit /
3180	if (block_group->cache_generation == trans->transid &&
3181	i_size_read(inode)) {
3182	dcs = BTRFS_DC_SETUP;
3183	goto out_put;
3184	}
3185
3186	if (i_size_read(inode) > `0`) {
3187	ret = btrfs_check_trunc_cache_free_space(fs_info,
3188	rsv: &fs_info->global_block_rsv);
3189	if (ret)
3190	goto out_put;
3191
3192	ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3193	if (ret)
3194	goto out_put;
3195	}
3196
3197	spin_lock(lock: &block_group->lock);
3198	if (block_group->cached != BTRFS_CACHE_FINISHED \|\|
3199	!btrfs_test_opt(fs_info, SPACE_CACHE)) {
3200	/*
3201	* don't bother trying to write stuff out _if_
3202	* a) we're not cached,
3203	* b) we're with nospace_cache mount option,
3204	* c) we're with v2 space_cache (FREE_SPACE_TREE).
3205	*/
3206	dcs = BTRFS_DC_WRITTEN;
3207	spin_unlock(lock: &block_group->lock);
3208	goto out_put;
3209	}
3210	spin_unlock(lock: &block_group->lock);
3211
3212	/*
3213	* We hit an ENOSPC when setting up the cache in this transaction, just
3214	* skip doing the setup, we've already cleared the cache so we're safe.
3215	*/
3216	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3217	ret = -ENOSPC;
3218	goto out_put;
3219	}
3220
3221	/*
3222	* Try to preallocate enough space based on how big the block group is.
3223	* Keep in mind this has to include any pinned space which could end up
3224	* taking up quite a bit since it's not folded into the other space
3225	* cache.
3226	*/
3227	cache_size = div_u64(dividend: block_group->length, SZ_256M);
3228	if (!cache_size)
3229	cache_size = `1`;
3230
3231	cache_size *= `16`;
3232	cache_size *= fs_info->sectorsize;
3233
3234	ret = btrfs_check_data_free_space(inode: BTRFS_I(inode), reserved: &data_reserved, start: `0`,
3235	len: cache_size, noflush: false);
3236	if (ret)
3237	goto out_put;
3238
3239	ret = btrfs_prealloc_file_range_trans(inode, trans, mode: `0`, start: `0`, num_bytes: cache_size,
3240	min_size: cache_size, actual_len: cache_size,
3241	alloc_hint: &alloc_hint);
3242	/*
3243	* Our cache requires contiguous chunks so that we don't modify a bunch
3244	* of metadata or split extents when writing the cache out, which means
3245	* we can enospc if we are heavily fragmented in addition to just normal
3246	* out of space conditions. So if we hit this just skip setting up any
3247	* other block groups for this transaction, maybe we'll unpin enough
3248	* space the next time around.
3249	*/
3250	if (!ret)
3251	dcs = BTRFS_DC_SETUP;
3252	else if (ret == -ENOSPC)
3253	set_bit(BTRFS_TRANS_CACHE_ENOSPC, addr: &trans->transaction->flags);
3254
3255	out_put:
3256	iput(inode);
3257	out_free:
3258	btrfs_release_path(p: path);
3259	out:
3260	spin_lock(lock: &block_group->lock);
3261	if (!ret && dcs == BTRFS_DC_SETUP)
3262	block_group->cache_generation = trans->transid;
3263	block_group->disk_cache_state = dcs;
3264	spin_unlock(lock: &block_group->lock);
3265
3266	extent_changeset_free(changeset: data_reserved);
3267	return ret;
3268	}
3269
3270	int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3271	{
3272	struct btrfs_fs_info *fs_info = trans->fs_info;
3273	struct btrfs_block_group cache, tmp;
3274	struct btrfs_transaction *cur_trans = trans->transaction;
3275	struct btrfs_path *path;
3276
3277	if (list_empty(head: &cur_trans->dirty_bgs) \|\|
3278	!btrfs_test_opt(fs_info, SPACE_CACHE))
3279	return `0`;
3280
3281	path = btrfs_alloc_path();
3282	if (!path)
3283	return -ENOMEM;
3284
3285	/ Could add new block groups, use _safe just in case /
3286	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3287	dirty_list) {
3288	if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3289	cache_save_setup(block_group: cache, trans, path);
3290	}
3291
3292	btrfs_free_path(p: path);
3293	return `0`;
3294	}
3295
3296	/*
3297	* Transaction commit does final block group cache writeback during a critical
3298	* section where nothing is allowed to change the FS. This is required in
3299	* order for the cache to actually match the block group, but can introduce a
3300	* lot of latency into the commit.
3301	*
3302	* So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3303	* There's a chance we'll have to redo some of it if the block group changes
3304	* again during the commit, but it greatly reduces the commit latency by
3305	* getting rid of the easy block groups while we're still allowing others to
3306	* join the commit.
3307	*/
3308	int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3309	{
3310	struct btrfs_fs_info *fs_info = trans->fs_info;
3311	struct btrfs_block_group *cache;
3312	struct btrfs_transaction *cur_trans = trans->transaction;
3313	int ret = `0`;
3314	int should_put;
3315	struct btrfs_path *path = NULL;
3316	LIST_HEAD(dirty);
3317	struct list_head *io = &cur_trans->io_bgs;
3318	int loops = `0`;
3319
3320	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3321	if (list_empty(head: &cur_trans->dirty_bgs)) {
3322	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3323	return `0`;
3324	}
3325	list_splice_init(list: &cur_trans->dirty_bgs, head: &dirty);
3326	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3327
3328	again:
3329	/ Make sure all the block groups on our dirty list actually exist /
3330	btrfs_create_pending_block_groups(trans);
3331
3332	if (!path) {
3333	path = btrfs_alloc_path();
3334	if (!path) {
3335	ret = -ENOMEM;
3336	goto out;
3337	}
3338	}
3339
3340	/*
3341	* cache_write_mutex is here only to save us from balance or automatic
3342	* removal of empty block groups deleting this block group while we are
3343	* writing out the cache
3344	*/
3345	mutex_lock(&trans->transaction->cache_write_mutex);
3346	while (!list_empty(head: &dirty)) {
3347	bool drop_reserve = true;
3348
3349	cache = list_first_entry(&dirty, struct btrfs_block_group,
3350	dirty_list);
3351	/*
3352	* This can happen if something re-dirties a block group that
3353	* is already under IO. Just wait for it to finish and then do
3354	* it all again
3355	*/
3356	if (!list_empty(head: &cache->io_list)) {
3357	list_del_init(entry: &cache->io_list);
3358	btrfs_wait_cache_io(trans, block_group: cache, path);
3359	btrfs_put_block_group(cache);
3360	}
3361
3362
3363	/*
3364	* btrfs_wait_cache_io uses the cache->dirty_list to decide if
3365	* it should update the cache_state. Don't delete until after
3366	* we wait.
3367	*
3368	* Since we're not running in the commit critical section
3369	* we need the dirty_bgs_lock to protect from update_block_group
3370	*/
3371	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3372	list_del_init(entry: &cache->dirty_list);
3373	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3374
3375	should_put = `1`;
3376
3377	cache_save_setup(block_group: cache, trans, path);
3378
3379	if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3380	cache->io_ctl.inode = NULL;
3381	ret = btrfs_write_out_cache(trans, block_group: cache, path);
3382	if (ret == `0` && cache->io_ctl.inode) {
3383	should_put = `0`;
3384
3385	/*
3386	* The cache_write_mutex is protecting the
3387	* io_list, also refer to the definition of
3388	* btrfs_transaction::io_bgs for more details
3389	*/
3390	list_add_tail(new: &cache->io_list, head: io);
3391	} else {
3392	/*
3393	* If we failed to write the cache, the
3394	* generation will be bad and life goes on
3395	*/
3396	ret = `0`;
3397	}
3398	}
3399	if (!ret) {
3400	ret = update_block_group_item(trans, path, cache);
3401	/*
3402	* Our block group might still be attached to the list
3403	* of new block groups in the transaction handle of some
3404	* other task (struct btrfs_trans_handle->new_bgs). This
3405	* means its block group item isn't yet in the extent
3406	* tree. If this happens ignore the error, as we will
3407	* try again later in the critical section of the
3408	* transaction commit.
3409	*/
3410	if (ret == -ENOENT) {
3411	ret = `0`;
3412	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3413	if (list_empty(head: &cache->dirty_list)) {
3414	list_add_tail(new: &cache->dirty_list,
3415	head: &cur_trans->dirty_bgs);
3416	btrfs_get_block_group(cache);
3417	drop_reserve = false;
3418	}
3419	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3420	} else if (ret) {
3421	btrfs_abort_transaction(trans, ret);
3422	}
3423	}
3424
3425	/ If it's not on the io list, we need to put the block group /
3426	if (should_put)
3427	btrfs_put_block_group(cache);
3428	if (drop_reserve)
3429	btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3430	/*
3431	* Avoid blocking other tasks for too long. It might even save
3432	* us from writing caches for block groups that are going to be
3433	* removed.
3434	*/
3435	mutex_unlock(lock: &trans->transaction->cache_write_mutex);
3436	if (ret)
3437	goto out;
3438	mutex_lock(&trans->transaction->cache_write_mutex);
3439	}
3440	mutex_unlock(lock: &trans->transaction->cache_write_mutex);
3441
3442	/*
3443	* Go through delayed refs for all the stuff we've just kicked off
3444	* and then loop back (just once)
3445	*/
3446	if (!ret)
3447	ret = btrfs_run_delayed_refs(trans, min_bytes: `0`);
3448	if (!ret && loops == `0`) {
3449	loops++;
3450	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3451	list_splice_init(list: &cur_trans->dirty_bgs, head: &dirty);
3452	/*
3453	* dirty_bgs_lock protects us from concurrent block group
3454	* deletes too (not just cache_write_mutex).
3455	*/
3456	if (!list_empty(head: &dirty)) {
3457	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3458	goto again;
3459	}
3460	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3461	}
3462	out:
3463	if (ret < `0`) {
3464	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3465	list_splice_init(list: &dirty, head: &cur_trans->dirty_bgs);
3466	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3467	btrfs_cleanup_dirty_bgs(trans: cur_trans, fs_info);
3468	}
3469
3470	btrfs_free_path(p: path);
3471	return ret;
3472	}
3473
3474	int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3475	{
3476	struct btrfs_fs_info *fs_info = trans->fs_info;
3477	struct btrfs_block_group *cache;
3478	struct btrfs_transaction *cur_trans = trans->transaction;
3479	int ret = `0`;
3480	int should_put;
3481	struct btrfs_path *path;
3482	struct list_head *io = &cur_trans->io_bgs;
3483
3484	path = btrfs_alloc_path();
3485	if (!path)
3486	return -ENOMEM;
3487
3488	/*
3489	* Even though we are in the critical section of the transaction commit,
3490	* we can still have concurrent tasks adding elements to this
3491	* transaction's list of dirty block groups. These tasks correspond to
3492	* endio free space workers started when writeback finishes for a
3493	* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3494	* allocate new block groups as a result of COWing nodes of the root
3495	* tree when updating the free space inode. The writeback for the space
3496	* caches is triggered by an earlier call to
3497	* btrfs_start_dirty_block_groups() and iterations of the following
3498	* loop.
3499	* Also we want to do the cache_save_setup first and then run the
3500	* delayed refs to make sure we have the best chance at doing this all
3501	* in one shot.
3502	*/
3503	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3504	while (!list_empty(head: &cur_trans->dirty_bgs)) {
3505	cache = list_first_entry(&cur_trans->dirty_bgs,
3506	struct btrfs_block_group,
3507	dirty_list);
3508
3509	/*
3510	* This can happen if cache_save_setup re-dirties a block group
3511	* that is already under IO. Just wait for it to finish and
3512	* then do it all again
3513	*/
3514	if (!list_empty(head: &cache->io_list)) {
3515	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3516	list_del_init(entry: &cache->io_list);
3517	btrfs_wait_cache_io(trans, block_group: cache, path);
3518	btrfs_put_block_group(cache);
3519	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3520	}
3521
3522	/*
3523	* Don't remove from the dirty list until after we've waited on
3524	* any pending IO
3525	*/
3526	list_del_init(entry: &cache->dirty_list);
3527	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3528	should_put = `1`;
3529
3530	cache_save_setup(block_group: cache, trans, path);
3531
3532	if (!ret)
3533	ret = btrfs_run_delayed_refs(trans, U64_MAX);
3534
3535	if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3536	cache->io_ctl.inode = NULL;
3537	ret = btrfs_write_out_cache(trans, block_group: cache, path);
3538	if (ret == `0` && cache->io_ctl.inode) {
3539	should_put = `0`;
3540	list_add_tail(new: &cache->io_list, head: io);
3541	} else {
3542	/*
3543	* If we failed to write the cache, the
3544	* generation will be bad and life goes on
3545	*/
3546	ret = `0`;
3547	}
3548	}
3549	if (!ret) {
3550	ret = update_block_group_item(trans, path, cache);
3551	/*
3552	* One of the free space endio workers might have
3553	* created a new block group while updating a free space
3554	* cache's inode (at inode.c:btrfs_finish_ordered_io())
3555	* and hasn't released its transaction handle yet, in
3556	* which case the new block group is still attached to
3557	* its transaction handle and its creation has not
3558	* finished yet (no block group item in the extent tree
3559	* yet, etc). If this is the case, wait for all free
3560	* space endio workers to finish and retry. This is a
3561	* very rare case so no need for a more efficient and
3562	* complex approach.
3563	*/
3564	if (ret == -ENOENT) {
3565	wait_event(cur_trans->writer_wait,
3566	atomic_read(&cur_trans->num_writers) == `1`);
3567	ret = update_block_group_item(trans, path, cache);
3568	}
3569	if (ret)
3570	btrfs_abort_transaction(trans, ret);
3571	}
3572
3573	/ If its not on the io list, we need to put the block group /
3574	if (should_put)
3575	btrfs_put_block_group(cache);
3576	btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3577	spin_lock(lock: &cur_trans->dirty_bgs_lock);
3578	}
3579	spin_unlock(lock: &cur_trans->dirty_bgs_lock);
3580
3581	/*
3582	* Refer to the definition of io_bgs member for details why it's safe
3583	* to use it without any locking
3584	*/
3585	while (!list_empty(head: io)) {
3586	cache = list_first_entry(io, struct btrfs_block_group,
3587	io_list);
3588	list_del_init(entry: &cache->io_list);
3589	btrfs_wait_cache_io(trans, block_group: cache, path);
3590	btrfs_put_block_group(cache);
3591	}
3592
3593	btrfs_free_path(p: path);
3594	return ret;
3595	}
3596
3597	int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3598	u64 bytenr, u64 num_bytes, bool alloc)
3599	{
3600	struct btrfs_fs_info *info = trans->fs_info;
3601	struct btrfs_space_info *space_info;
3602	struct btrfs_block_group *cache;
3603	u64 old_val;
3604	bool reclaim = false;
3605	bool bg_already_dirty = true;
3606	int factor;
3607
3608	/ Block accounting for super block /
3609	spin_lock(lock: &info->delalloc_root_lock);
3610	old_val = btrfs_super_bytes_used(s: info->super_copy);
3611	if (alloc)
3612	old_val += num_bytes;
3613	else
3614	old_val -= num_bytes;
3615	btrfs_set_super_bytes_used(s: info->super_copy, val: old_val);
3616	spin_unlock(lock: &info->delalloc_root_lock);
3617
3618	cache = btrfs_lookup_block_group(info, bytenr);
3619	if (!cache)
3620	return -ENOENT;
3621
3622	/ An extent can not span multiple block groups. /
3623	ASSERT(bytenr + num_bytes <= cache->start + cache->length);
3624
3625	space_info = cache->space_info;
3626	factor = btrfs_bg_type_to_factor(flags: cache->flags);
3627
3628	/*
3629	* If this block group has free space cache written out, we need to make
3630	* sure to load it if we are removing space. This is because we need
3631	* the unpinning stage to actually add the space back to the block group,
3632	* otherwise we will leak space.
3633	*/
3634	if (!alloc && !btrfs_block_group_done(cache))
3635	btrfs_cache_block_group(cache, wait: true);
3636
3637	spin_lock(lock: &space_info->lock);
3638	spin_lock(lock: &cache->lock);
3639
3640	if (btrfs_test_opt(info, SPACE_CACHE) &&
3641	cache->disk_cache_state < BTRFS_DC_CLEAR)
3642	cache->disk_cache_state = BTRFS_DC_CLEAR;
3643
3644	old_val = cache->used;
3645	if (alloc) {
3646	old_val += num_bytes;
3647	cache->used = old_val;
3648	cache->reserved -= num_bytes;
3649	space_info->bytes_reserved -= num_bytes;
3650	space_info->bytes_used += num_bytes;
3651	space_info->disk_used += num_bytes * factor;
3652	spin_unlock(lock: &cache->lock);
3653	spin_unlock(lock: &space_info->lock);
3654	} else {
3655	old_val -= num_bytes;
3656	cache->used = old_val;
3657	cache->pinned += num_bytes;
3658	btrfs_space_info_update_bytes_pinned(fs_info: info, sinfo: space_info, bytes: num_bytes);
3659	space_info->bytes_used -= num_bytes;
3660	space_info->disk_used -= num_bytes * factor;
3661
3662	reclaim = should_reclaim_block_group(bg: cache, bytes_freed: num_bytes);
3663
3664	spin_unlock(lock: &cache->lock);
3665	spin_unlock(lock: &space_info->lock);
3666
3667	set_extent_bit(tree: &trans->transaction->pinned_extents, start: bytenr,
3668	end: bytenr + num_bytes - `1`, bits: EXTENT_DIRTY, NULL);
3669	}
3670
3671	spin_lock(lock: &trans->transaction->dirty_bgs_lock);
3672	if (list_empty(head: &cache->dirty_list)) {
3673	list_add_tail(new: &cache->dirty_list, head: &trans->transaction->dirty_bgs);
3674	bg_already_dirty = false;
3675	btrfs_get_block_group(cache);
3676	}
3677	spin_unlock(lock: &trans->transaction->dirty_bgs_lock);
3678
3679	/*
3680	* No longer have used bytes in this block group, queue it for deletion.
3681	* We do this after adding the block group to the dirty list to avoid
3682	* races between cleaner kthread and space cache writeout.
3683	*/
3684	if (!alloc && old_val == `0`) {
3685	if (!btrfs_test_opt(info, DISCARD_ASYNC))
3686	btrfs_mark_bg_unused(bg: cache);
3687	} else if (!alloc && reclaim) {
3688	btrfs_mark_bg_to_reclaim(bg: cache);
3689	}
3690
3691	btrfs_put_block_group(cache);
3692
3693	/ Modified block groups are accounted for in the delayed_refs_rsv. /
3694	if (!bg_already_dirty)
3695	btrfs_inc_delayed_refs_rsv_bg_updates(fs_info: info);
3696
3697	return `0`;
3698	}
3699
3700	/*
3701	* Update the block_group and space info counters.
3702	*
3703	* @cache: The cache we are manipulating
3704	* @ram_bytes: The number of bytes of file content, and will be same to
3705	* @num_bytes except for the compress path.
3706	* @num_bytes: The number of bytes in question
3707	* @delalloc: The blocks are allocated for the delalloc write
3708	*
3709	* This is called by the allocator when it reserves space. If this is a
3710	* reservation and the block group has become read only we cannot make the
3711	* reservation and return -EAGAIN, otherwise this function always succeeds.
3712	*/
3713	int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3714	u64 ram_bytes, u64 num_bytes, int delalloc,
3715	bool force_wrong_size_class)
3716	{
3717	struct btrfs_space_info *space_info = cache->space_info;
3718	enum btrfs_block_group_size_class size_class;
3719	int ret = `0`;
3720
3721	spin_lock(lock: &space_info->lock);
3722	spin_lock(lock: &cache->lock);
3723	if (cache->ro) {
3724	ret = -EAGAIN;
3725	goto out;
3726	}
3727
3728	if (btrfs_block_group_should_use_size_class(bg: cache)) {
3729	size_class = btrfs_calc_block_group_size_class(size: num_bytes);
3730	ret = btrfs_use_block_group_size_class(bg: cache, size_class, force_wrong_size_class);
3731	if (ret)
3732	goto out;
3733	}
3734	cache->reserved += num_bytes;
3735	space_info->bytes_reserved += num_bytes;
3736	trace_btrfs_space_reservation(fs_info: cache->fs_info, type: "space_info",
3737	val: space_info->flags, bytes: num_bytes, reserve: `1`);
3738	btrfs_space_info_update_bytes_may_use(fs_info: cache->fs_info,
3739	sinfo: space_info, bytes: -ram_bytes);
3740	if (delalloc)
3741	cache->delalloc_bytes += num_bytes;
3742
3743	/*
3744	* Compression can use less space than we reserved, so wake tickets if
3745	* that happens.
3746	*/
3747	if (num_bytes < ram_bytes)
3748	btrfs_try_granting_tickets(fs_info: cache->fs_info, space_info);
3749	out:
3750	spin_unlock(lock: &cache->lock);
3751	spin_unlock(lock: &space_info->lock);
3752	return ret;
3753	}
3754
3755	/*
3756	* Update the block_group and space info counters.
3757	*
3758	* @cache: The cache we are manipulating
3759	* @num_bytes: The number of bytes in question
3760	* @delalloc: The blocks are allocated for the delalloc write
3761	*
3762	* This is called by somebody who is freeing space that was never actually used
3763	* on disk. For example if you reserve some space for a new leaf in transaction
3764	* A and before transaction A commits you free that leaf, you call this with
3765	* reserve set to 0 in order to clear the reservation.
3766	*/
3767	void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
3768	u64 num_bytes, int delalloc)
3769	{
3770	struct btrfs_space_info *space_info = cache->space_info;
3771
3772	spin_lock(lock: &space_info->lock);
3773	spin_lock(lock: &cache->lock);
3774	if (cache->ro)
3775	space_info->bytes_readonly += num_bytes;
3776	cache->reserved -= num_bytes;
3777	space_info->bytes_reserved -= num_bytes;
3778	space_info->max_extent_size = `0`;
3779
3780	if (delalloc)
3781	cache->delalloc_bytes -= num_bytes;
3782	spin_unlock(lock: &cache->lock);
3783
3784	btrfs_try_granting_tickets(fs_info: cache->fs_info, space_info);
3785	spin_unlock(lock: &space_info->lock);
3786	}
3787
3788	static void force_metadata_allocation(struct btrfs_fs_info *info)
3789	{
3790	struct list_head *head = &info->space_info;
3791	struct btrfs_space_info *found;
3792
3793	list_for_each_entry(found, head, list) {
3794	if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3795	found->force_alloc = CHUNK_ALLOC_FORCE;
3796	}
3797	}
3798
3799	static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3800	struct btrfs_space_info sinfo, int* force)
3801	{
3802	u64 bytes_used = btrfs_space_info_used(s_info: sinfo, may_use_included: false);
3803	u64 thresh;
3804
3805	if (force == CHUNK_ALLOC_FORCE)
3806	return `1`;
3807
3808	/*
3809	* in limited mode, we want to have some free space up to
3810	* about 1% of the FS size.
3811	*/
3812	if (force == CHUNK_ALLOC_LIMITED) {
3813	thresh = btrfs_super_total_bytes(s: fs_info->super_copy);
3814	thresh = max_t(u64, SZ_64M, mult_perc(thresh, `1`));
3815
3816	if (sinfo->total_bytes - bytes_used < thresh)
3817	return `1`;
3818	}
3819
3820	if (bytes_used + SZ_2M < mult_perc(num: sinfo->total_bytes, percent: `80`))
3821	return `0`;
3822	return `1`;
3823	}
3824
3825	int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3826	{
3827	u64 alloc_flags = btrfs_get_alloc_profile(fs_info: trans->fs_info, orig_flags: type);
3828
3829	return btrfs_chunk_alloc(trans, flags: alloc_flags, force: CHUNK_ALLOC_FORCE);
3830	}
3831
3832	static struct btrfs_block_group do_chunk_alloc(struct* btrfs_trans_handle *trans, u64 flags)
3833	{
3834	struct btrfs_block_group *bg;
3835	int ret;
3836
3837	/*
3838	* Check if we have enough space in the system space info because we
3839	* will need to update device items in the chunk btree and insert a new
3840	* chunk item in the chunk btree as well. This will allocate a new
3841	* system block group if needed.
3842	*/
3843	check_system_chunk(trans, type: flags);
3844
3845	bg = btrfs_create_chunk(trans, type: flags);
3846	if (IS_ERR(ptr: bg)) {
3847	ret = PTR_ERR(ptr: bg);
3848	goto out;
3849	}
3850
3851	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3852	/*
3853	* Normally we are not expected to fail with -ENOSPC here, since we have
3854	* previously reserved space in the system space_info and allocated one
3855	* new system chunk if necessary. However there are three exceptions:
3856	*
3857	* 1) We may have enough free space in the system space_info but all the
3858	* existing system block groups have a profile which can not be used
3859	* for extent allocation.
3860	*
3861	* This happens when mounting in degraded mode. For example we have a
3862	* RAID1 filesystem with 2 devices, lose one device and mount the fs
3863	* using the other device in degraded mode. If we then allocate a chunk,
3864	* we may have enough free space in the existing system space_info, but
3865	* none of the block groups can be used for extent allocation since they
3866	* have a RAID1 profile, and because we are in degraded mode with a
3867	* single device, we are forced to allocate a new system chunk with a
3868	* SINGLE profile. Making check_system_chunk() iterate over all system
3869	* block groups and check if they have a usable profile and enough space
3870	* can be slow on very large filesystems, so we tolerate the -ENOSPC and
3871	* try again after forcing allocation of a new system chunk. Like this
3872	* we avoid paying the cost of that search in normal circumstances, when
3873	* we were not mounted in degraded mode;
3874	*
3875	* 2) We had enough free space info the system space_info, and one suitable
3876	* block group to allocate from when we called check_system_chunk()
3877	* above. However right after we called it, the only system block group
3878	* with enough free space got turned into RO mode by a running scrub,
3879	* and in this case we have to allocate a new one and retry. We only
3880	* need do this allocate and retry once, since we have a transaction
3881	* handle and scrub uses the commit root to search for block groups;
3882	*
3883	* 3) We had one system block group with enough free space when we called
3884	* check_system_chunk(), but after that, right before we tried to
3885	* allocate the last extent buffer we needed, a discard operation came
3886	* in and it temporarily removed the last free space entry from the
3887	* block group (discard removes a free space entry, discards it, and
3888	* then adds back the entry to the block group cache).
3889	*/
3890	if (ret == -ENOSPC) {
3891	const u64 sys_flags = btrfs_system_alloc_profile(fs_info: trans->fs_info);
3892	struct btrfs_block_group *sys_bg;
3893
3894	sys_bg = btrfs_create_chunk(trans, type: sys_flags);
3895	if (IS_ERR(ptr: sys_bg)) {
3896	ret = PTR_ERR(ptr: sys_bg);
3897	btrfs_abort_transaction(trans, ret);
3898	goto out;
3899	}
3900
3901	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg: sys_bg);
3902	if (ret) {
3903	btrfs_abort_transaction(trans, ret);
3904	goto out;
3905	}
3906
3907	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3908	if (ret) {
3909	btrfs_abort_transaction(trans, ret);
3910	goto out;
3911	}
3912	} else if (ret) {
3913	btrfs_abort_transaction(trans, ret);
3914	goto out;
3915	}
3916	out:
3917	btrfs_trans_release_chunk_metadata(trans);
3918
3919	if (ret)
3920	return ERR_PTR(error: ret);
3921
3922	btrfs_get_block_group(cache: bg);
3923	return bg;
3924	}
3925
3926	/*
3927	* Chunk allocation is done in 2 phases:
3928	*
3929	* 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3930	* the chunk, the chunk mapping, create its block group and add the items
3931	* that belong in the chunk btree to it - more specifically, we need to
3932	* update device items in the chunk btree and add a new chunk item to it.
3933	*
3934	* 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3935	* group item to the extent btree and the device extent items to the devices
3936	* btree.
3937	*
3938	* This is done to prevent deadlocks. For example when COWing a node from the
3939	* extent btree we are holding a write lock on the node's parent and if we
3940	* trigger chunk allocation and attempted to insert the new block group item
3941	* in the extent btree right away, we could deadlock because the path for the
3942	* insertion can include that parent node. At first glance it seems impossible
3943	* to trigger chunk allocation after starting a transaction since tasks should
3944	* reserve enough transaction units (metadata space), however while that is true
3945	* most of the time, chunk allocation may still be triggered for several reasons:
3946	*
3947	* 1) When reserving metadata, we check if there is enough free space in the
3948	* metadata space_info and therefore don't trigger allocation of a new chunk.
3949	* However later when the task actually tries to COW an extent buffer from
3950	* the extent btree or from the device btree for example, it is forced to
3951	* allocate a new block group (chunk) because the only one that had enough
3952	* free space was just turned to RO mode by a running scrub for example (or
3953	* device replace, block group reclaim thread, etc), so we can not use it
3954	* for allocating an extent and end up being forced to allocate a new one;
3955	*
3956	* 2) Because we only check that the metadata space_info has enough free bytes,
3957	* we end up not allocating a new metadata chunk in that case. However if
3958	* the filesystem was mounted in degraded mode, none of the existing block
3959	* groups might be suitable for extent allocation due to their incompatible
3960	* profile (for e.g. mounting a 2 devices filesystem, where all block groups
3961	* use a RAID1 profile, in degraded mode using a single device). In this case
3962	* when the task attempts to COW some extent buffer of the extent btree for
3963	* example, it will trigger allocation of a new metadata block group with a
3964	* suitable profile (SINGLE profile in the example of the degraded mount of
3965	* the RAID1 filesystem);
3966	*
3967	* 3) The task has reserved enough transaction units / metadata space, but when
3968	* it attempts to COW an extent buffer from the extent or device btree for
3969	* example, it does not find any free extent in any metadata block group,
3970	* therefore forced to try to allocate a new metadata block group.
3971	* This is because some other task allocated all available extents in the
3972	* meanwhile - this typically happens with tasks that don't reserve space
3973	* properly, either intentionally or as a bug. One example where this is
3974	* done intentionally is fsync, as it does not reserve any transaction units
3975	* and ends up allocating a variable number of metadata extents for log
3976	* tree extent buffers;
3977	*
3978	* 4) The task has reserved enough transaction units / metadata space, but right
3979	* before it tries to allocate the last extent buffer it needs, a discard
3980	* operation comes in and, temporarily, removes the last free space entry from
3981	* the only metadata block group that had free space (discard starts by
3982	* removing a free space entry from a block group, then does the discard
3983	* operation and, once it's done, it adds back the free space entry to the
3984	* block group).
3985	*
3986	* We also need this 2 phases setup when adding a device to a filesystem with
3987	* a seed device - we must create new metadata and system chunks without adding
3988	* any of the block group items to the chunk, extent and device btrees. If we
3989	* did not do it this way, we would get ENOSPC when attempting to update those
3990	* btrees, since all the chunks from the seed device are read-only.
3991	*
3992	* Phase 1 does the updates and insertions to the chunk btree because if we had
3993	* it done in phase 2 and have a thundering herd of tasks allocating chunks in
3994	* parallel, we risk having too many system chunks allocated by many tasks if
3995	* many tasks reach phase 1 without the previous ones completing phase 2. In the
3996	* extreme case this leads to exhaustion of the system chunk array in the
3997	* superblock. This is easier to trigger if using a btree node/leaf size of 64K
3998	* and with RAID filesystems (so we have more device items in the chunk btree).
3999	* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
4000	* the system chunk array due to concurrent allocations") provides more details.
4001	*
4002	* Allocation of system chunks does not happen through this function. A task that
4003	* needs to update the chunk btree (the only btree that uses system chunks), must
4004	* preallocate chunk space by calling either check_system_chunk() or
4005	* btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
4006	* metadata chunk or when removing a chunk, while the latter is used before doing
4007	* a modification to the chunk btree - use cases for the latter are adding,
4008	* removing and resizing a device as well as relocation of a system chunk.
4009	* See the comment below for more details.
4010	*
4011	* The reservation of system space, done through check_system_chunk(), as well
4012	* as all the updates and insertions into the chunk btree must be done while
4013	* holding fs_info->chunk_mutex. This is important to guarantee that while COWing
4014	* an extent buffer from the chunks btree we never trigger allocation of a new
4015	* system chunk, which would result in a deadlock (trying to lock twice an
4016	* extent buffer of the chunk btree, first time before triggering the chunk
4017	* allocation and the second time during chunk allocation while attempting to
4018	* update the chunks btree). The system chunk array is also updated while holding
4019	* that mutex. The same logic applies to removing chunks - we must reserve system
4020	* space, update the chunk btree and the system chunk array in the superblock
4021	* while holding fs_info->chunk_mutex.
4022	*
4023	* This function, btrfs_chunk_alloc(), belongs to phase 1.
4024	*
4025	* If @force is CHUNK_ALLOC_FORCE:
4026	* - return 1 if it successfully allocates a chunk,
4027	* - return errors including -ENOSPC otherwise.
4028	* If @force is NOT CHUNK_ALLOC_FORCE:
4029	* - return 0 if it doesn't need to allocate a new chunk,
4030	* - return 1 if it successfully allocates a chunk,
4031	* - return errors including -ENOSPC otherwise.
4032	*/
4033	int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4034	enum btrfs_chunk_alloc_enum force)
4035	{
4036	struct btrfs_fs_info *fs_info = trans->fs_info;
4037	struct btrfs_space_info *space_info;
4038	struct btrfs_block_group *ret_bg;
4039	bool wait_for_alloc = false;
4040	bool should_alloc = false;
4041	bool from_extent_allocation = false;
4042	int ret = `0`;
4043
4044	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
4045	from_extent_allocation = true;
4046	force = CHUNK_ALLOC_FORCE;
4047	}
4048
4049	/ Don't re-enter if we're already allocating a chunk /
4050	if (trans->allocating_chunk)
4051	return -ENOSPC;
4052	/*
4053	* Allocation of system chunks can not happen through this path, as we
4054	* could end up in a deadlock if we are allocating a data or metadata
4055	* chunk and there is another task modifying the chunk btree.
4056	*
4057	* This is because while we are holding the chunk mutex, we will attempt
4058	* to add the new chunk item to the chunk btree or update an existing
4059	* device item in the chunk btree, while the other task that is modifying
4060	* the chunk btree is attempting to COW an extent buffer while holding a
4061	* lock on it and on its parent - if the COW operation triggers a system
4062	* chunk allocation, then we can deadlock because we are holding the
4063	* chunk mutex and we may need to access that extent buffer or its parent
4064	* in order to add the chunk item or update a device item.
4065	*
4066	* Tasks that want to modify the chunk tree should reserve system space
4067	* before updating the chunk btree, by calling either
4068	* btrfs_reserve_chunk_metadata() or check_system_chunk().
4069	* It's possible that after a task reserves the space, it still ends up
4070	* here - this happens in the cases described above at do_chunk_alloc().
4071	* The task will have to either retry or fail.
4072	*/
4073	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4074	return -ENOSPC;
4075
4076	space_info = btrfs_find_space_info(info: fs_info, flags);
4077	ASSERT(space_info);
4078
4079	do {
4080	spin_lock(lock: &space_info->lock);
4081	if (force < space_info->force_alloc)
4082	force = space_info->force_alloc;
4083	should_alloc = should_alloc_chunk(fs_info, sinfo: space_info, force);
4084	if (space_info->full) {
4085	/ No more free physical space /
4086	if (should_alloc)
4087	ret = -ENOSPC;
4088	else
4089	ret = `0`;
4090	spin_unlock(lock: &space_info->lock);
4091	return ret;
4092	} else if (!should_alloc) {
4093	spin_unlock(lock: &space_info->lock);
4094	return `0`;
4095	} else if (space_info->chunk_alloc) {
4096	/*
4097	* Someone is already allocating, so we need to block
4098	* until this someone is finished and then loop to
4099	* recheck if we should continue with our allocation
4100	* attempt.
4101	*/
4102	wait_for_alloc = true;
4103	force = CHUNK_ALLOC_NO_FORCE;
4104	spin_unlock(lock: &space_info->lock);
4105	mutex_lock(&fs_info->chunk_mutex);
4106	mutex_unlock(lock: &fs_info->chunk_mutex);
4107	} else {
4108	/ Proceed with allocation /
4109	space_info->chunk_alloc = `1`;
4110	wait_for_alloc = false;
4111	spin_unlock(lock: &space_info->lock);
4112	}
4113
4114	cond_resched();
4115	} while (wait_for_alloc);
4116
4117	mutex_lock(&fs_info->chunk_mutex);
4118	trans->allocating_chunk = true;
4119
4120	/*
4121	* If we have mixed data/metadata chunks we want to make sure we keep
4122	* allocating mixed chunks instead of individual chunks.
4123	*/
4124	if (btrfs_mixed_space_info(space_info))
4125	flags \|= (BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA);
4126
4127	/*
4128	* if we're doing a data chunk, go ahead and make sure that
4129	* we keep a reasonable number of metadata chunks allocated in the
4130	* FS as well.
4131	*/
4132	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4133	fs_info->data_chunk_allocations++;
4134	if (!(fs_info->data_chunk_allocations %
4135	fs_info->metadata_ratio))
4136	force_metadata_allocation(info: fs_info);
4137	}
4138
4139	ret_bg = do_chunk_alloc(trans, flags);
4140	trans->allocating_chunk = false;
4141
4142	if (IS_ERR(ptr: ret_bg)) {
4143	ret = PTR_ERR(ptr: ret_bg);
4144	} else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
4145	/*
4146	* New block group is likely to be used soon. Try to activate
4147	* it now. Failure is OK for now.
4148	*/
4149	btrfs_zone_activate(block_group: ret_bg);
4150	}
4151
4152	if (!ret)
4153	btrfs_put_block_group(cache: ret_bg);
4154
4155	spin_lock(lock: &space_info->lock);
4156	if (ret < `0`) {
4157	if (ret == -ENOSPC)
4158	space_info->full = `1`;
4159	else
4160	goto out;
4161	} else {
4162	ret = `1`;
4163	space_info->max_extent_size = `0`;
4164	}
4165
4166	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4167	out:
4168	space_info->chunk_alloc = `0`;
4169	spin_unlock(lock: &space_info->lock);
4170	mutex_unlock(lock: &fs_info->chunk_mutex);
4171
4172	return ret;
4173	}
4174
4175	static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4176	{
4177	u64 num_dev;
4178
4179	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(flags: type)].devs_max;
4180	if (!num_dev)
4181	num_dev = fs_info->fs_devices->rw_devices;
4182
4183	return num_dev;
4184	}
4185
4186	static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4187	u64 bytes,
4188	u64 type)
4189	{
4190	struct btrfs_fs_info *fs_info = trans->fs_info;
4191	struct btrfs_space_info *info;
4192	u64 left;
4193	int ret = `0`;
4194
4195	/*
4196	* Needed because we can end up allocating a system chunk and for an
4197	* atomic and race free space reservation in the chunk block reserve.
4198	*/
4199	lockdep_assert_held(&fs_info->chunk_mutex);
4200
4201	info = btrfs_find_space_info(info: fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4202	spin_lock(lock: &info->lock);
4203	left = info->total_bytes - btrfs_space_info_used(s_info: info, may_use_included: true);
4204	spin_unlock(lock: &info->lock);
4205
4206	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4207	btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4208	left, bytes, type);
4209	btrfs_dump_space_info(fs_info, info, bytes: `0`, dump_block_groups: `0`);
4210	}
4211
4212	if (left < bytes) {
4213	u64 flags = btrfs_system_alloc_profile(fs_info);
4214	struct btrfs_block_group *bg;
4215
4216	/*
4217	* Ignore failure to create system chunk. We might end up not
4218	* needing it, as we might not need to COW all nodes/leafs from
4219	* the paths we visit in the chunk tree (they were already COWed
4220	* or created in the current transaction for example).
4221	*/
4222	bg = btrfs_create_chunk(trans, type: flags);
4223	if (IS_ERR(ptr: bg)) {
4224	ret = PTR_ERR(ptr: bg);
4225	} else {
4226	/*
4227	* We have a new chunk. We also need to activate it for
4228	* zoned filesystem.
4229	*/
4230	ret = btrfs_zoned_activate_one_bg(fs_info, space_info: info, do_finish: true);
4231	if (ret < `0`)
4232	return;
4233
4234	/*
4235	* If we fail to add the chunk item here, we end up
4236	* trying again at phase 2 of chunk allocation, at
4237	* btrfs_create_pending_block_groups(). So ignore
4238	* any error here. An ENOSPC here could happen, due to
4239	* the cases described at do_chunk_alloc() - the system
4240	* block group we just created was just turned into RO
4241	* mode by a scrub for example, or a running discard
4242	* temporarily removed its free space entries, etc.
4243	*/
4244	btrfs_chunk_alloc_add_chunk_item(trans, bg);
4245	}
4246	}
4247
4248	if (!ret) {
4249	ret = btrfs_block_rsv_add(fs_info,
4250	block_rsv: &fs_info->chunk_block_rsv,
4251	num_bytes: bytes, flush: BTRFS_RESERVE_NO_FLUSH);
4252	if (!ret)
4253	trans->chunk_bytes_reserved += bytes;
4254	}
4255	}
4256
4257	/*
4258	* Reserve space in the system space for allocating or removing a chunk.
4259	* The caller must be holding fs_info->chunk_mutex.
4260	*/
4261	void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4262	{
4263	struct btrfs_fs_info *fs_info = trans->fs_info;
4264	const u64 num_devs = get_profile_num_devs(fs_info, type);
4265	u64 bytes;
4266
4267	/ num_devs device items to update and 1 chunk item to add or remove. /
4268	bytes = btrfs_calc_metadata_size(fs_info, num_items: num_devs) +
4269	btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
4270
4271	reserve_chunk_space(trans, bytes, type);
4272	}
4273
4274	/*
4275	* Reserve space in the system space, if needed, for doing a modification to the
4276	* chunk btree.
4277	*
4278	* @trans: A transaction handle.
4279	* @is_item_insertion: Indicate if the modification is for inserting a new item
4280	* in the chunk btree or if it's for the deletion or update
4281	* of an existing item.
4282	*
4283	* This is used in a context where we need to update the chunk btree outside
4284	* block group allocation and removal, to avoid a deadlock with a concurrent
4285	* task that is allocating a metadata or data block group and therefore needs to
4286	* update the chunk btree while holding the chunk mutex. After the update to the
4287	* chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4288	*
4289	*/
4290	void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4291	bool is_item_insertion)
4292	{
4293	struct btrfs_fs_info *fs_info = trans->fs_info;
4294	u64 bytes;
4295
4296	if (is_item_insertion)
4297	bytes = btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
4298	else
4299	bytes = btrfs_calc_metadata_size(fs_info, num_items: `1`);
4300
4301	mutex_lock(&fs_info->chunk_mutex);
4302	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4303	mutex_unlock(lock: &fs_info->chunk_mutex);
4304	}
4305
4306	void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4307	{
4308	struct btrfs_block_group *block_group;
4309
4310	block_group = btrfs_lookup_first_block_group(info, bytenr: `0`);
4311	while (block_group) {
4312	btrfs_wait_block_group_cache_done(cache: block_group);
4313	spin_lock(lock: &block_group->lock);
4314	if (test_and_clear_bit(nr: BLOCK_GROUP_FLAG_IREF,
4315	addr: &block_group->runtime_flags)) {
4316	struct inode *inode = block_group->inode;
4317
4318	block_group->inode = NULL;
4319	spin_unlock(lock: &block_group->lock);
4320
4321	ASSERT(block_group->io_ctl.inode == NULL);
4322	iput(inode);
4323	} else {
4324	spin_unlock(lock: &block_group->lock);
4325	}
4326	block_group = btrfs_next_block_group(cache: block_group);
4327	}
4328	}
4329
4330	/*
4331	* Must be called only after stopping all workers, since we could have block
4332	* group caching kthreads running, and therefore they could race with us if we
4333	* freed the block groups before stopping them.
4334	*/
4335	int btrfs_free_block_groups(struct btrfs_fs_info *info)
4336	{
4337	struct btrfs_block_group *block_group;
4338	struct btrfs_space_info *space_info;
4339	struct btrfs_caching_control *caching_ctl;
4340	struct rb_node *n;
4341
4342	if (btrfs_is_zoned(fs_info: info)) {
4343	if (info->active_meta_bg) {
4344	btrfs_put_block_group(cache: info->active_meta_bg);
4345	info->active_meta_bg = NULL;
4346	}
4347	if (info->active_system_bg) {
4348	btrfs_put_block_group(cache: info->active_system_bg);
4349	info->active_system_bg = NULL;
4350	}
4351	}
4352
4353	write_lock(&info->block_group_cache_lock);
4354	while (!list_empty(head: &info->caching_block_groups)) {
4355	caching_ctl = list_entry(info->caching_block_groups.next,
4356	struct btrfs_caching_control, list);
4357	list_del(entry: &caching_ctl->list);
4358	btrfs_put_caching_control(ctl: caching_ctl);
4359	}
4360	write_unlock(&info->block_group_cache_lock);
4361
4362	spin_lock(lock: &info->unused_bgs_lock);
4363	while (!list_empty(head: &info->unused_bgs)) {
4364	block_group = list_first_entry(&info->unused_bgs,
4365	struct btrfs_block_group,
4366	bg_list);
4367	list_del_init(entry: &block_group->bg_list);
4368	btrfs_put_block_group(cache: block_group);
4369	}
4370
4371	while (!list_empty(head: &info->reclaim_bgs)) {
4372	block_group = list_first_entry(&info->reclaim_bgs,
4373	struct btrfs_block_group,
4374	bg_list);
4375	list_del_init(entry: &block_group->bg_list);
4376	btrfs_put_block_group(cache: block_group);
4377	}
4378	spin_unlock(lock: &info->unused_bgs_lock);
4379
4380	spin_lock(lock: &info->zone_active_bgs_lock);
4381	while (!list_empty(head: &info->zone_active_bgs)) {
4382	block_group = list_first_entry(&info->zone_active_bgs,
4383	struct btrfs_block_group,
4384	active_bg_list);
4385	list_del_init(entry: &block_group->active_bg_list);
4386	btrfs_put_block_group(cache: block_group);
4387	}
4388	spin_unlock(lock: &info->zone_active_bgs_lock);
4389
4390	write_lock(&info->block_group_cache_lock);
4391	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4392	block_group = rb_entry(n, struct btrfs_block_group,
4393	cache_node);
4394	rb_erase_cached(node: &block_group->cache_node,
4395	root: &info->block_group_cache_tree);
4396	RB_CLEAR_NODE(&block_group->cache_node);
4397	write_unlock(&info->block_group_cache_lock);
4398
4399	down_write(sem: &block_group->space_info->groups_sem);
4400	list_del(entry: &block_group->list);
4401	up_write(sem: &block_group->space_info->groups_sem);
4402
4403	/*
4404	* We haven't cached this block group, which means we could
4405	* possibly have excluded extents on this block group.
4406	*/
4407	if (block_group->cached == BTRFS_CACHE_NO \|\|
4408	block_group->cached == BTRFS_CACHE_ERROR)
4409	btrfs_free_excluded_extents(bg: block_group);
4410
4411	btrfs_remove_free_space_cache(block_group);
4412	ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4413	ASSERT(list_empty(&block_group->dirty_list));
4414	ASSERT(list_empty(&block_group->io_list));
4415	ASSERT(list_empty(&block_group->bg_list));
4416	ASSERT(refcount_read(&block_group->refs) == `1`);
4417	ASSERT(block_group->swap_extents == `0`);
4418	btrfs_put_block_group(cache: block_group);
4419
4420	write_lock(&info->block_group_cache_lock);
4421	}
4422	write_unlock(&info->block_group_cache_lock);
4423
4424	btrfs_release_global_block_rsv(fs_info: info);
4425
4426	while (!list_empty(head: &info->space_info)) {
4427	space_info = list_entry(info->space_info.next,
4428	struct btrfs_space_info,
4429	list);
4430
4431	/*
4432	* Do not hide this behind enospc_debug, this is actually
4433	* important and indicates a real bug if this happens.
4434	*/
4435	if (WARN_ON(space_info->bytes_pinned > `0` \|\|
4436	space_info->bytes_may_use > `0`))
4437	btrfs_dump_space_info(fs_info: info, info: space_info, bytes: `0`, dump_block_groups: `0`);
4438
4439	/*
4440	* If there was a failure to cleanup a log tree, very likely due
4441	* to an IO failure on a writeback attempt of one or more of its
4442	* extent buffers, we could not do proper (and cheap) unaccounting
4443	* of their reserved space, so don't warn on bytes_reserved > 0 in
4444	* that case.
4445	*/
4446	if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) \|\|
4447	!BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4448	if (WARN_ON(space_info->bytes_reserved > `0`))
4449	btrfs_dump_space_info(fs_info: info, info: space_info, bytes: `0`, dump_block_groups: `0`);
4450	}
4451
4452	WARN_ON(space_info->reclaim_size > `0`);
4453	list_del(entry: &space_info->list);
4454	btrfs_sysfs_remove_space_info(space_info);
4455	}
4456	return `0`;
4457	}
4458
4459	void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4460	{
4461	atomic_inc(v: &cache->frozen);
4462	}
4463
4464	void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4465	{
4466	struct btrfs_fs_info *fs_info = block_group->fs_info;
4467	bool cleanup;
4468
4469	spin_lock(lock: &block_group->lock);
4470	cleanup = (atomic_dec_and_test(v: &block_group->frozen) &&
4471	test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4472	spin_unlock(lock: &block_group->lock);
4473
4474	if (cleanup) {
4475	struct btrfs_chunk_map *map;
4476
4477	map = btrfs_find_chunk_map(fs_info, logical: block_group->start, length: `1`);
4478	/ Logic error, can't happen. /
4479	ASSERT(map);
4480
4481	btrfs_remove_chunk_map(fs_info, map);
4482
4483	/ Once for our lookup reference. /
4484	btrfs_free_chunk_map(map);
4485
4486	/*
4487	* We may have left one free space entry and other possible
4488	* tasks trimming this block group have left 1 entry each one.
4489	* Free them if any.
4490	*/
4491	btrfs_remove_free_space_cache(block_group);
4492	}
4493	}
4494
4495	bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4496	{
4497	bool ret = true;
4498
4499	spin_lock(lock: &bg->lock);
4500	if (bg->ro)
4501	ret = false;
4502	else
4503	bg->swap_extents++;
4504	spin_unlock(lock: &bg->lock);
4505
4506	return ret;
4507	}
4508
4509	void btrfs_dec_block_group_swap_extents(struct btrfs_block_group bg, int* amount)
4510	{
4511	spin_lock(lock: &bg->lock);
4512	ASSERT(!bg->ro);
4513	ASSERT(bg->swap_extents >= amount);
4514	bg->swap_extents -= amount;
4515	spin_unlock(lock: &bg->lock);
4516	}
4517
4518	enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4519	{
4520	if (size <= SZ_128K)
4521	return BTRFS_BG_SZ_SMALL;
4522	if (size <= SZ_8M)
4523	return BTRFS_BG_SZ_MEDIUM;
4524	return BTRFS_BG_SZ_LARGE;
4525	}
4526
4527	/*
4528	* Handle a block group allocating an extent in a size class
4529	*
4530	* @bg: The block group we allocated in.
4531	* @size_class: The size class of the allocation.
4532	* @force_wrong_size_class: Whether we are desperate enough to allow
4533	* mismatched size classes.
4534	*
4535	* Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4536	* case of a race that leads to the wrong size class without
4537	* force_wrong_size_class set.
4538	*
4539	* find_free_extent will skip block groups with a mismatched size class until
4540	* it really needs to avoid ENOSPC. In that case it will set
4541	* force_wrong_size_class. However, if a block group is newly allocated and
4542	* doesn't yet have a size class, then it is possible for two allocations of
4543	* different sizes to race and both try to use it. The loser is caught here and
4544	* has to retry.
4545	*/
4546	int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4547	enum btrfs_block_group_size_class size_class,
4548	bool force_wrong_size_class)
4549	{
4550	ASSERT(size_class != BTRFS_BG_SZ_NONE);
4551
4552	/ The new allocation is in the right size class, do nothing /
4553	if (bg->size_class == size_class)
4554	return `0`;
4555	/*
4556	* The new allocation is in a mismatched size class.
4557	* This means one of two things:
4558	*
4559	* 1. Two tasks in find_free_extent for different size_classes raced
4560	* and hit the same empty block_group. Make the loser try again.
4561	* 2. A call to find_free_extent got desperate enough to set
4562	* 'force_wrong_slab'. Don't change the size_class, but allow the
4563	* allocation.
4564	*/
4565	if (bg->size_class != BTRFS_BG_SZ_NONE) {
4566	if (force_wrong_size_class)
4567	return `0`;
4568	return -EAGAIN;
4569	}
4570	/*
4571	* The happy new block group case: the new allocation is the first
4572	* one in the block_group so we set size_class.
4573	*/
4574	bg->size_class = size_class;
4575
4576	return `0`;
4577	}
4578
4579	bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg)
4580	{
4581	if (btrfs_is_zoned(fs_info: bg->fs_info))
4582	return false;
4583	if (!btrfs_is_block_group_data_only(block_group: bg))
4584	return false;
4585	return true;
4586	}
4587

source code of linux/fs/btrfs/block-group.c