volumes.c source code [linux/fs/btrfs/volumes.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2007 Oracle. All rights reserved.
4	*/
5
6	#include <linux/sched.h>
7	#include <linux/sched/mm.h>
8	#include <linux/slab.h>
9	#include <linux/ratelimit.h>
10	#include <linux/kthread.h>
11	#include <linux/semaphore.h>
12	#include <linux/uuid.h>
13	#include <linux/list_sort.h>
14	#include <linux/namei.h>
15	#include "misc.h"
16	#include "ctree.h"
17	#include "extent_map.h"
18	#include "disk-io.h"
19	#include "transaction.h"
20	#include "print-tree.h"
21	#include "volumes.h"
22	#include "raid56.h"
23	#include "rcu-string.h"
24	#include "dev-replace.h"
25	#include "sysfs.h"
26	#include "tree-checker.h"
27	#include "space-info.h"
28	#include "block-group.h"
29	#include "discard.h"
30	#include "zoned.h"
31	#include "fs.h"
32	#include "accessors.h"
33	#include "uuid-tree.h"
34	#include "ioctl.h"
35	#include "relocation.h"
36	#include "scrub.h"
37	#include "super.h"
38	#include "raid-stripe-tree.h"
39
/* Union of the RAID0, RAID10 and RAID5/6 block group profile bits. */
#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)
43
44	const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
45	[BTRFS_RAID_RAID10] = {
46	.sub_stripes = `2`,
47	.dev_stripes = `1`,
48	.devs_max = `0`, / 0 == as many as possible /
49	.devs_min = `2`,
50	.tolerated_failures = `1`,
51	.devs_increment = `2`,
52	.ncopies = `2`,
53	.nparity = `0`,
54	.raid_name = "raid10",
55	.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
56	.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
57	},
58	[BTRFS_RAID_RAID1] = {
59	.sub_stripes = `1`,
60	.dev_stripes = `1`,
61	.devs_max = `2`,
62	.devs_min = `2`,
63	.tolerated_failures = `1`,
64	.devs_increment = `2`,
65	.ncopies = `2`,
66	.nparity = `0`,
67	.raid_name = "raid1",
68	.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
69	.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
70	},
71	[BTRFS_RAID_RAID1C3] = {
72	.sub_stripes = `1`,
73	.dev_stripes = `1`,
74	.devs_max = `3`,
75	.devs_min = `3`,
76	.tolerated_failures = `2`,
77	.devs_increment = `3`,
78	.ncopies = `3`,
79	.nparity = `0`,
80	.raid_name = "raid1c3",
81	.bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
82	.mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
83	},
84	[BTRFS_RAID_RAID1C4] = {
85	.sub_stripes = `1`,
86	.dev_stripes = `1`,
87	.devs_max = `4`,
88	.devs_min = `4`,
89	.tolerated_failures = `3`,
90	.devs_increment = `4`,
91	.ncopies = `4`,
92	.nparity = `0`,
93	.raid_name = "raid1c4",
94	.bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
95	.mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
96	},
97	[BTRFS_RAID_DUP] = {
98	.sub_stripes = `1`,
99	.dev_stripes = `2`,
100	.devs_max = `1`,
101	.devs_min = `1`,
102	.tolerated_failures = `0`,
103	.devs_increment = `1`,
104	.ncopies = `2`,
105	.nparity = `0`,
106	.raid_name = "dup",
107	.bg_flag = BTRFS_BLOCK_GROUP_DUP,
108	.mindev_error = `0`,
109	},
110	[BTRFS_RAID_RAID0] = {
111	.sub_stripes = `1`,
112	.dev_stripes = `1`,
113	.devs_max = `0`,
114	.devs_min = `1`,
115	.tolerated_failures = `0`,
116	.devs_increment = `1`,
117	.ncopies = `1`,
118	.nparity = `0`,
119	.raid_name = "raid0",
120	.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
121	.mindev_error = `0`,
122	},
123	[BTRFS_RAID_SINGLE] = {
124	.sub_stripes = `1`,
125	.dev_stripes = `1`,
126	.devs_max = `1`,
127	.devs_min = `1`,
128	.tolerated_failures = `0`,
129	.devs_increment = `1`,
130	.ncopies = `1`,
131	.nparity = `0`,
132	.raid_name = "single",
133	.bg_flag = `0`,
134	.mindev_error = `0`,
135	},
136	[BTRFS_RAID_RAID5] = {
137	.sub_stripes = `1`,
138	.dev_stripes = `1`,
139	.devs_max = `0`,
140	.devs_min = `2`,
141	.tolerated_failures = `1`,
142	.devs_increment = `1`,
143	.ncopies = `1`,
144	.nparity = `1`,
145	.raid_name = "raid5",
146	.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
147	.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
148	},
149	[BTRFS_RAID_RAID6] = {
150	.sub_stripes = `1`,
151	.dev_stripes = `1`,
152	.devs_max = `0`,
153	.devs_min = `3`,
154	.tolerated_failures = `2`,
155	.devs_increment = `1`,
156	.ncopies = `1`,
157	.nparity = `2`,
158	.raid_name = "raid6",
159	.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
160	.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
161	},
162	};
163
164	/*
165	* Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
166	* can be used as index to access btrfs_raid_array[].
167	*/
168	enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
169	{
170	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
171
172	if (!profile)
173	return BTRFS_RAID_SINGLE;
174
175	return BTRFS_BG_FLAG_TO_INDEX(profile);
176	}
177
178	const char *btrfs_bg_type_to_raid_name(u64 flags)
179	{
180	const int index = btrfs_bg_flags_to_raid_index(flags);
181
182	if (index >= BTRFS_NR_RAID_TYPES)
183	return NULL;
184
185	return btrfs_raid_array[index].raid_name;
186	}
187
188	int btrfs_nr_parity_stripes(u64 type)
189	{
190	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(flags: type);
191
192	return btrfs_raid_array[index].nparity;
193	}
194
195	/*
196	* Fill @buf with textual description of @bg_flags, no more than @size_buf
197	* bytes including terminating null byte.
198	*/
199	void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
200	{
201	int i;
202	int ret;
203	char *bp = buf;
204	u64 flags = bg_flags;
205	u32 size_bp = size_buf;
206
207	if (!flags) {
208	strcpy(p: bp, q: "NONE");
209	return;
210	}
211
212	#define DESCRIBE_FLAG(flag, desc) \
213	do { \
214	if (flags & (flag)) { \
215	ret = snprintf(bp, size_bp, "%s\|", (desc)); \
216	if (ret < 0 \|\| ret >= size_bp) \
217	goto out_overflow; \
218	size_bp -= ret; \
219	bp += ret; \
220	flags &= ~(flag); \
221	} \
222	} while (0)
223
224	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
225	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
226	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
227
228	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
229	for (i = `0`; i < BTRFS_NR_RAID_TYPES; i++)
230	DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
231	btrfs_raid_array[i].raid_name);
232	#undef DESCRIBE_FLAG
233
234	if (flags) {
235	ret = snprintf(buf: bp, size: size_bp, fmt: "0x%llx\|", flags);
236	size_bp -= ret;
237	}
238
239	if (size_bp < size_buf)
240	buf[size_buf - size_bp - `1`] = `'\0'`; / remove last \| /
241
242	/*
243	* The text is trimmed, it's up to the caller to provide sufficiently
244	* large buffer
245	*/
246	out_overflow:;
247	}
248
249	static int init_first_rw_device(struct btrfs_trans_handle *trans);
250	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
251	static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
252
253	/*
254	* Device locking
255	* ==============
256	*
257	* There are several mutexes that protect manipulation of devices and low-level
258	* structures like chunks but not block groups, extents or files
259	*
260	* uuid_mutex (global lock)
261	* ------------------------
262	* protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
263	* the SCAN_DEV ioctl registration or from mount either implicitly (the first
264	* device) or requested by the device= mount option
265	*
266	* the mutex can be very coarse and can cover long-running operations
267	*
268	* protects: updates to fs_devices counters like missing devices, rw devices,
269	* seeding, structure cloning, opening/closing devices at mount/umount time
270	*
271	* global::fs_devs - add, remove, updates to the global list
272	*
273	* does not protect: manipulation of the fs_devices::devices list in general
274	* but in mount context it could be used to exclude list modifications by eg.
275	* scan ioctl
276	*
277	* btrfs_device::name - renames (write side), read is RCU
278	*
279	* fs_devices::device_list_mutex (per-fs, with RCU)
280	* ------------------------------------------------
281	* protects updates to fs_devices::devices, ie. adding and deleting
282	*
283	* simple list traversal with read-only actions can be done with RCU protection
284	*
285	* may be used to exclude some operations from running concurrently without any
286	* modifications to the list (see write_all_supers)
287	*
288	* Is not required at mount and close times, because our device list is
289	* protected by the uuid_mutex at that point.
290	*
291	* balance_mutex
292	* -------------
293	* protects balance structures (status, state) and context accessed from
294	* several places (internally, ioctl)
295	*
296	* chunk_mutex
297	* -----------
298	* protects chunks, adding or removing during allocation, trim or when a new
299	* device is added/removed. Additionally it also protects post_commit_list of
300	* individual devices, since they can be added to the transaction's
301	* post_commit_list only with chunk_mutex held.
302	*
303	* cleaner_mutex
304	* -------------
305	* a big lock that is held by the cleaner thread and prevents running subvolume
306	* cleaning together with relocation or delayed iputs
307	*
308	*
309	* Lock nesting
310	* ============
311	*
312	* uuid_mutex
313	* device_list_mutex
314	* chunk_mutex
315	* balance_mutex
316	*
317	*
318	* Exclusive operations
319	* ====================
320	*
321	* Maintains the exclusivity of the following operations that apply to the
322	* whole filesystem and cannot run in parallel.
323	*
324	* - Balance (*)
325	* - Device add
326	* - Device remove
327	* - Device replace (*)
328	* - Resize
329	*
330	* The device operations (as above) can be in one of the following states:
331	*
332	* - Running state
333	* - Paused state
334	* - Completed state
335	*
336	* Only device operations marked with (*) can go into the Paused state for the
337	* following reasons:
338	*
339	* - ioctl (only Balance can be Paused through ioctl)
340	* - filesystem remounted as read-only
341	* - filesystem unmounted and mounted as read-only
342	* - system power-cycle and filesystem mounted as read-only
343	* - filesystem or device errors leading to forced read-only
344	*
345	* The status of exclusive operation is set and cleared atomically.
346	* During the course of Paused state, fs_info::exclusive_operation remains set.
347	* A device operation in Paused or Running state can be canceled or resumed
348	* either by ioctl (Balance only) or when remounted as read-write.
349	* The exclusive status is cleared when the device operation is canceled or
350	* completed.
351	*/
352
353	DEFINE_MUTEX(uuid_mutex);
354	static LIST_HEAD(fs_uuids);
355	struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
356	{
357	return &fs_uuids;
358	}
359
360	/*
361	* Allocate new btrfs_fs_devices structure identified by a fsid.
362	*
363	* @fsid: if not NULL, copy the UUID to fs_devices::fsid and to
364	* fs_devices::metadata_fsid
365	*
366	* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
367	* The returned struct is not linked onto any lists and can be destroyed with
368	* kfree() right away.
369	*/
370	static struct btrfs_fs_devices alloc_fs_devices(const* u8 *fsid)
371	{
372	struct btrfs_fs_devices *fs_devs;
373
374	fs_devs = kzalloc(size: sizeof(*fs_devs), GFP_KERNEL);
375	if (!fs_devs)
376	return ERR_PTR(error: -ENOMEM);
377
378	mutex_init(&fs_devs->device_list_mutex);
379
380	INIT_LIST_HEAD(list: &fs_devs->devices);
381	INIT_LIST_HEAD(list: &fs_devs->alloc_list);
382	INIT_LIST_HEAD(list: &fs_devs->fs_list);
383	INIT_LIST_HEAD(list: &fs_devs->seed_list);
384
385	if (fsid) {
386	memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
387	memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
388	}
389
390	return fs_devs;
391	}
392
393	static void btrfs_free_device(struct btrfs_device *device)
394	{
395	WARN_ON(!list_empty(&device->post_commit_list));
396	rcu_string_free(str: device->name);
397	extent_io_tree_release(tree: &device->alloc_state);
398	btrfs_destroy_dev_zone_info(device);
399	kfree(objp: device);
400	}
401
402	static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
403	{
404	struct btrfs_device *device;
405
406	WARN_ON(fs_devices->opened);
407	while (!list_empty(head: &fs_devices->devices)) {
408	device = list_entry(fs_devices->devices.next,
409	struct btrfs_device, dev_list);
410	list_del(entry: &device->dev_list);
411	btrfs_free_device(device);
412	}
413	kfree(objp: fs_devices);
414	}
415
416	void __exit btrfs_cleanup_fs_uuids(void)
417	{
418	struct btrfs_fs_devices *fs_devices;
419
420	while (!list_empty(head: &fs_uuids)) {
421	fs_devices = list_entry(fs_uuids.next,
422	struct btrfs_fs_devices, fs_list);
423	list_del(entry: &fs_devices->fs_list);
424	free_fs_devices(fs_devices);
425	}
426	}
427
428	static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
429	const u8 fsid, const* u8 *metadata_fsid)
430	{
431	if (memcmp(p: fsid, q: fs_devices->fsid, BTRFS_FSID_SIZE) != `0`)
432	return false;
433
434	if (!metadata_fsid)
435	return true;
436
437	if (memcmp(p: metadata_fsid, q: fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != `0`)
438	return false;
439
440	return true;
441	}
442
443	static noinline struct btrfs_fs_devices *find_fsid(
444	const u8 fsid, const* u8 *metadata_fsid)
445	{
446	struct btrfs_fs_devices *fs_devices;
447
448	ASSERT(fsid);
449
450	/ Handle non-split brain cases /
451	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
452	if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
453	return fs_devices;
454	}
455	return NULL;
456	}
457
458	static int
459	btrfs_get_bdev_and_sb(const char device_path, blk_mode_t flags, void* *holder,
460	int flush, struct bdev_handle **bdev_handle,
461	struct btrfs_super_block **disk_super)
462	{
463	struct block_device *bdev;
464	int ret;
465
466	*bdev_handle = bdev_open_by_path(path: device_path, mode: flags, holder, NULL);
467
468	if (IS_ERR(ptr: *bdev_handle)) {
469	ret = PTR_ERR(ptr: *bdev_handle);
470	goto error;
471	}
472	bdev = (*bdev_handle)->bdev;
473
474	if (flush)
475	sync_blockdev(bdev);
476	ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
477	if (ret) {
478	bdev_release(handle: *bdev_handle);
479	goto error;
480	}
481	invalidate_bdev(bdev);
482	*disk_super = btrfs_read_dev_super(bdev);
483	if (IS_ERR(ptr: *disk_super)) {
484	ret = PTR_ERR(ptr: *disk_super);
485	bdev_release(handle: *bdev_handle);
486	goto error;
487	}
488
489	return `0`;
490
491	error:
492	*bdev_handle = NULL;
493	return ret;
494	}
495
496	/*
497	* Search and remove all stale devices (which are not mounted). When both
498	* inputs are NULL, it will search and release all stale devices.
499	*
500	* @devt: Optional. When provided will it release all unmounted devices
501	* matching this devt only.
502	* @skip_device: Optional. Will skip this device when searching for the stale
503	* devices.
504	*
505	* Return: 0 for success or if @devt is 0.
506	* -EBUSY if @devt is a mounted device.
507	* -ENOENT if @devt does not match any device in the list.
508	*/
509	static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
510	{
511	struct btrfs_fs_devices fs_devices, tmp_fs_devices;
512	struct btrfs_device device, tmp_device;
513	int ret;
514	bool freed = false;
515
516	lockdep_assert_held(&uuid_mutex);
517
518	/ Return good status if there is no instance of devt. /
519	ret = `0`;
520	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
521
522	mutex_lock(&fs_devices->device_list_mutex);
523	list_for_each_entry_safe(device, tmp_device,
524	&fs_devices->devices, dev_list) {
525	if (skip_device && skip_device == device)
526	continue;
527	if (devt && devt != device->devt)
528	continue;
529	if (fs_devices->opened) {
530	if (devt)
531	ret = -EBUSY;
532	break;
533	}
534
535	/ delete the stale device /
536	fs_devices->num_devices--;
537	list_del(entry: &device->dev_list);
538	btrfs_free_device(device);
539
540	freed = true;
541	}
542	mutex_unlock(lock: &fs_devices->device_list_mutex);
543
544	if (fs_devices->num_devices == `0`) {
545	btrfs_sysfs_remove_fsid(fs_devs: fs_devices);
546	list_del(entry: &fs_devices->fs_list);
547	free_fs_devices(fs_devices);
548	}
549	}
550
551	/ If there is at least one freed device return 0. /
552	if (freed)
553	return `0`;
554
555	return ret;
556	}
557
558	static struct btrfs_fs_devices *find_fsid_by_device(
559	struct btrfs_super_block *disk_super,
560	dev_t devt, bool *same_fsid_diff_dev)
561	{
562	struct btrfs_fs_devices *fsid_fs_devices;
563	struct btrfs_fs_devices *devt_fs_devices;
564	const bool has_metadata_uuid = (btrfs_super_incompat_flags(s: disk_super) &
565	BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
566	bool found_by_devt = false;
567
568	/ Find the fs_device by the usual method, if found use it. /
569	fsid_fs_devices = find_fsid(fsid: disk_super->fsid,
570	metadata_fsid: has_metadata_uuid ? disk_super->metadata_uuid : NULL);
571
572	/ The temp_fsid feature is supported only with single device filesystem. /
573	if (btrfs_super_num_devices(s: disk_super) != `1`)
574	return fsid_fs_devices;
575
576	/*
577	* A seed device is an integral component of the sprout device, which
578	* functions as a multi-device filesystem. So, temp-fsid feature is
579	* not supported.
580	*/
581	if (btrfs_super_flags(s: disk_super) & BTRFS_SUPER_FLAG_SEEDING)
582	return fsid_fs_devices;
583
584	/ Try to find a fs_devices by matching devt. /
585	list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
586	struct btrfs_device *device;
587
588	list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
589	if (device->devt == devt) {
590	found_by_devt = true;
591	break;
592	}
593	}
594	if (found_by_devt)
595	break;
596	}
597
598	if (found_by_devt) {
599	/ Existing device. /
600	if (fsid_fs_devices == NULL) {
601	if (devt_fs_devices->opened == `0`) {
602	/ Stale device. /
603	return NULL;
604	} else {
605	/ temp_fsid is mounting a subvol. /
606	return devt_fs_devices;
607	}
608	} else {
609	/ Regular or temp_fsid device mounting a subvol. /
610	return devt_fs_devices;
611	}
612	} else {
613	/ New device. /
614	if (fsid_fs_devices == NULL) {
615	return NULL;
616	} else {
617	/ sb::fsid is already used create a new temp_fsid. /
618	*same_fsid_diff_dev = true;
619	return NULL;
620	}
621	}
622
623	/ Not reached. /
624	}
625
626	/*
627	* This is only used on mount, and we are protected from competing things
628	* messing with our fs_devices by the uuid_mutex, thus we do not need the
629	* fs_devices->device_list_mutex here.
630	*/
631	static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
632	struct btrfs_device *device, blk_mode_t flags,
633	void *holder)
634	{
635	struct bdev_handle *bdev_handle;
636	struct btrfs_super_block *disk_super;
637	u64 devid;
638	int ret;
639
640	if (device->bdev)
641	return -EINVAL;
642	if (!device->name)
643	return -EINVAL;
644
645	ret = btrfs_get_bdev_and_sb(device_path: device->name->str, flags, holder, flush: `1`,
646	bdev_handle: &bdev_handle, disk_super: &disk_super);
647	if (ret)
648	return ret;
649
650	devid = btrfs_stack_device_id(s: &disk_super->dev_item);
651	if (devid != device->devid)
652	goto error_free_page;
653
654	if (memcmp(p: device->uuid, q: disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
655	goto error_free_page;
656
657	device->generation = btrfs_super_generation(s: disk_super);
658
659	if (btrfs_super_flags(s: disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
660	if (btrfs_super_incompat_flags(s: disk_super) &
661	BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
662	pr_err(
663	"BTRFS: Invalid seeding and uuid-changed device detected\n");
664	goto error_free_page;
665	}
666
667	clear_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
668	fs_devices->seeding = true;
669	} else {
670	if (bdev_read_only(bdev: bdev_handle->bdev))
671	clear_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
672	else
673	set_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
674	}
675
676	if (!bdev_nonrot(bdev: bdev_handle->bdev))
677	fs_devices->rotating = true;
678
679	if (bdev_max_discard_sectors(bdev: bdev_handle->bdev))
680	fs_devices->discardable = true;
681
682	device->bdev_handle = bdev_handle;
683	device->bdev = bdev_handle->bdev;
684	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, addr: &device->dev_state);
685
686	fs_devices->open_devices++;
687	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
688	device->devid != BTRFS_DEV_REPLACE_DEVID) {
689	fs_devices->rw_devices++;
690	list_add_tail(new: &device->dev_alloc_list, head: &fs_devices->alloc_list);
691	}
692	btrfs_release_disk_super(super: disk_super);
693
694	return `0`;
695
696	error_free_page:
697	btrfs_release_disk_super(super: disk_super);
698	bdev_release(handle: bdev_handle);
699
700	return -EINVAL;
701	}
702
703	u8 btrfs_sb_fsid_ptr(struct* btrfs_super_block *sb)
704	{
705	bool has_metadata_uuid = (btrfs_super_incompat_flags(s: sb) &
706	BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
707
708	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
709	}
710
711	/*
712	* Add new device to list of registered devices
713	*
714	* Returns:
715	* device pointer which was just added or updated when successful
716	* error pointer when failed
717	*/
718	static noinline struct btrfs_device device_list_add(const* char *path,
719	struct btrfs_super_block *disk_super,
720	bool *new_device_added)
721	{
722	struct btrfs_device *device;
723	struct btrfs_fs_devices *fs_devices = NULL;
724	struct rcu_string *name;
725	u64 found_transid = btrfs_super_generation(s: disk_super);
726	u64 devid = btrfs_stack_device_id(s: &disk_super->dev_item);
727	dev_t path_devt;
728	int error;
729	bool same_fsid_diff_dev = false;
730	bool has_metadata_uuid = (btrfs_super_incompat_flags(s: disk_super) &
731	BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
732
733	if (btrfs_super_flags(s: disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
734	btrfs_err(NULL,
735	"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
736	path);
737	return ERR_PTR(error: -EAGAIN);
738	}
739
740	error = lookup_bdev(pathname: path, dev: &path_devt);
741	if (error) {
742	btrfs_err(NULL, "failed to lookup block device for path %s: %d",
743	path, error);
744	return ERR_PTR(error);
745	}
746
747	fs_devices = find_fsid_by_device(disk_super, devt: path_devt, same_fsid_diff_dev: &same_fsid_diff_dev);
748
749	if (!fs_devices) {
750	fs_devices = alloc_fs_devices(fsid: disk_super->fsid);
751	if (has_metadata_uuid)
752	memcpy(fs_devices->metadata_uuid,
753	disk_super->metadata_uuid, BTRFS_FSID_SIZE);
754
755	if (IS_ERR(ptr: fs_devices))
756	return ERR_CAST(ptr: fs_devices);
757
758	if (same_fsid_diff_dev) {
759	generate_random_uuid(uuid: fs_devices->fsid);
760	fs_devices->temp_fsid = true;
761	pr_info("BTRFS: device %s using temp-fsid %pU\n",
762	path, fs_devices->fsid);
763	}
764
765	mutex_lock(&fs_devices->device_list_mutex);
766	list_add(new: &fs_devices->fs_list, head: &fs_uuids);
767
768	device = NULL;
769	} else {
770	struct btrfs_dev_lookup_args args = {
771	.devid = devid,
772	.uuid = disk_super->dev_item.uuid,
773	};
774
775	mutex_lock(&fs_devices->device_list_mutex);
776	device = btrfs_find_device(fs_devices, args: &args);
777
778	if (found_transid > fs_devices->latest_generation) {
779	memcpy(fs_devices->fsid, disk_super->fsid,
780	BTRFS_FSID_SIZE);
781	memcpy(fs_devices->metadata_uuid,
782	btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
783	}
784	}
785
786	if (!device) {
787	unsigned int nofs_flag;
788
789	if (fs_devices->opened) {
790	btrfs_err(NULL,
791	"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
792	path, fs_devices->fsid, current->comm,
793	task_pid_nr(current));
794	mutex_unlock(lock: &fs_devices->device_list_mutex);
795	return ERR_PTR(error: -EBUSY);
796	}
797
798	nofs_flag = memalloc_nofs_save();
799	device = btrfs_alloc_device(NULL, devid: &devid,
800	uuid: disk_super->dev_item.uuid, path);
801	memalloc_nofs_restore(flags: nofs_flag);
802	if (IS_ERR(ptr: device)) {
803	mutex_unlock(lock: &fs_devices->device_list_mutex);
804	/ we can safely leave the fs_devices entry around /
805	return device;
806	}
807
808	device->devt = path_devt;
809
810	list_add_rcu(new: &device->dev_list, head: &fs_devices->devices);
811	fs_devices->num_devices++;
812
813	device->fs_devices = fs_devices;
814	*new_device_added = true;
815
816	if (disk_super->label[`0`])
817	pr_info(
818	"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
819	disk_super->label, devid, found_transid, path,
820	current->comm, task_pid_nr(current));
821	else
822	pr_info(
823	"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
824	disk_super->fsid, devid, found_transid, path,
825	current->comm, task_pid_nr(current));
826
827	} else if (!device->name \|\| strcmp(device->name->str, path)) {
828	/*
829	* When FS is already mounted.
830	* 1. If you are here and if the device->name is NULL that
831	* means this device was missing at time of FS mount.
832	* 2. If you are here and if the device->name is different
833	* from 'path' that means either
834	* a. The same device disappeared and reappeared with
835	* different name. or
836	* b. The missing-disk-which-was-replaced, has
837	* reappeared now.
838	*
839	* We must allow 1 and 2a above. But 2b would be a spurious
840	* and unintentional.
841	*
842	* Further in case of 1 and 2a above, the disk at 'path'
843	* would have missed some transaction when it was away and
844	* in case of 2a the stale bdev has to be updated as well.
845	* 2b must not be allowed at all time.
846	*/
847
848	/*
849	* For now, we do allow update to btrfs_fs_device through the
850	* btrfs dev scan cli after FS has been mounted. We're still
851	* tracking a problem where systems fail mount by subvolume id
852	* when we reject replacement on a mounted FS.
853	*/
854	if (!fs_devices->opened && found_transid < device->generation) {
855	/*
856	* That is if the FS is _not_ mounted and if you
857	* are here, that means there is more than one
858	* disk with same uuid and devid.We keep the one
859	* with larger generation number or the last-in if
860	* generation are equal.
861	*/
862	mutex_unlock(lock: &fs_devices->device_list_mutex);
863	btrfs_err(NULL,
864	"device %s already registered with a higher generation, found %llu expect %llu",
865	path, found_transid, device->generation);
866	return ERR_PTR(error: -EEXIST);
867	}
868
869	/*
870	* We are going to replace the device path for a given devid,
871	* make sure it's the same device if the device is mounted
872	*
873	* NOTE: the device->fs_info may not be reliable here so pass
874	* in a NULL to message helpers instead. This avoids a possible
875	* use-after-free when the fs_info and fs_info->sb are already
876	* torn down.
877	*/
878	if (device->bdev) {
879	if (device->devt != path_devt) {
880	mutex_unlock(lock: &fs_devices->device_list_mutex);
881	btrfs_warn_in_rcu(NULL,
882	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
883	path, devid, found_transid,
884	current->comm,
885	task_pid_nr(current));
886	return ERR_PTR(error: -EEXIST);
887	}
888	btrfs_info_in_rcu(NULL,
889	"devid %llu device path %s changed to %s scanned by %s (%d)",
890	devid, btrfs_dev_name(device),
891	path, current->comm,
892	task_pid_nr(current));
893	}
894
895	name = rcu_string_strdup(src: path, GFP_NOFS);
896	if (!name) {
897	mutex_unlock(lock: &fs_devices->device_list_mutex);
898	return ERR_PTR(error: -ENOMEM);
899	}
900	rcu_string_free(str: device->name);
901	rcu_assign_pointer(device->name, name);
902	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
903	fs_devices->missing_devices--;
904	clear_bit(BTRFS_DEV_STATE_MISSING, addr: &device->dev_state);
905	}
906	device->devt = path_devt;
907	}
908
909	/*
910	* Unmount does not free the btrfs_device struct but would zero
911	* generation along with most of the other members. So just update
912	* it back. We need it to pick the disk with largest generation
913	* (as above).
914	*/
915	if (!fs_devices->opened) {
916	device->generation = found_transid;
917	fs_devices->latest_generation = max_t(u64, found_transid,
918	fs_devices->latest_generation);
919	}
920
921	fs_devices->total_devices = btrfs_super_num_devices(s: disk_super);
922
923	mutex_unlock(lock: &fs_devices->device_list_mutex);
924	return device;
925	}
926
927	static struct btrfs_fs_devices clone_fs_devices(struct* btrfs_fs_devices *orig)
928	{
929	struct btrfs_fs_devices *fs_devices;
930	struct btrfs_device *device;
931	struct btrfs_device *orig_dev;
932	int ret = `0`;
933
934	lockdep_assert_held(&uuid_mutex);
935
936	fs_devices = alloc_fs_devices(fsid: orig->fsid);
937	if (IS_ERR(ptr: fs_devices))
938	return fs_devices;
939
940	fs_devices->total_devices = orig->total_devices;
941
942	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
943	const char *dev_path = NULL;
944
945	/*
946	* This is ok to do without RCU read locked because we hold the
947	* uuid mutex so nothing we touch in here is going to disappear.
948	*/
949	if (orig_dev->name)
950	dev_path = orig_dev->name->str;
951
952	device = btrfs_alloc_device(NULL, devid: &orig_dev->devid,
953	uuid: orig_dev->uuid, path: dev_path);
954	if (IS_ERR(ptr: device)) {
955	ret = PTR_ERR(ptr: device);
956	goto error;
957	}
958
959	if (orig_dev->zone_info) {
960	struct btrfs_zoned_device_info *zone_info;
961
962	zone_info = btrfs_clone_dev_zone_info(orig_dev);
963	if (!zone_info) {
964	btrfs_free_device(device);
965	ret = -ENOMEM;
966	goto error;
967	}
968	device->zone_info = zone_info;
969	}
970
971	list_add(new: &device->dev_list, head: &fs_devices->devices);
972	device->fs_devices = fs_devices;
973	fs_devices->num_devices++;
974	}
975	return fs_devices;
976	error:
977	free_fs_devices(fs_devices);
978	return ERR_PTR(error: ret);
979	}
980
981	static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
982	struct btrfs_device **latest_dev)
983	{
984	struct btrfs_device device, next;
985
986	/ This is the initialized path, it is safe to release the devices. /
987	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
988	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
989	if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
990	&device->dev_state) &&
991	!test_bit(BTRFS_DEV_STATE_MISSING,
992	&device->dev_state) &&
993	(!*latest_dev \|\|
994	device->generation > (*latest_dev)->generation)) {
995	*latest_dev = device;
996	}
997	continue;
998	}
999
1000	/*
1001	* We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1002	* in btrfs_init_dev_replace() so just continue.
1003	*/
1004	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1005	continue;
1006
1007	if (device->bdev_handle) {
1008	bdev_release(handle: device->bdev_handle);
1009	device->bdev = NULL;
1010	device->bdev_handle = NULL;
1011	fs_devices->open_devices--;
1012	}
1013	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1014	list_del_init(entry: &device->dev_alloc_list);
1015	clear_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
1016	fs_devices->rw_devices--;
1017	}
1018	list_del_init(entry: &device->dev_list);
1019	fs_devices->num_devices--;
1020	btrfs_free_device(device);
1021	}
1022
1023	}
1024
1025	/*
1026	* After we have read the system tree and know devids belonging to this
1027	* filesystem, remove the device which does not belong there.
1028	*/
1029	void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1030	{
1031	struct btrfs_device *latest_dev = NULL;
1032	struct btrfs_fs_devices *seed_dev;
1033
1034	mutex_lock(&uuid_mutex);
1035	__btrfs_free_extra_devids(fs_devices, latest_dev: &latest_dev);
1036
1037	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1038	__btrfs_free_extra_devids(fs_devices: seed_dev, latest_dev: &latest_dev);
1039
1040	fs_devices->latest_dev = latest_dev;
1041
1042	mutex_unlock(lock: &uuid_mutex);
1043	}
1044
1045	static void btrfs_close_bdev(struct btrfs_device *device)
1046	{
1047	if (!device->bdev)
1048	return;
1049
1050	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1051	sync_blockdev(bdev: device->bdev);
1052	invalidate_bdev(bdev: device->bdev);
1053	}
1054
1055	bdev_release(handle: device->bdev_handle);
1056	}
1057
1058	static void btrfs_close_one_device(struct btrfs_device *device)
1059	{
1060	struct btrfs_fs_devices *fs_devices = device->fs_devices;
1061
1062	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1063	device->devid != BTRFS_DEV_REPLACE_DEVID) {
1064	list_del_init(entry: &device->dev_alloc_list);
1065	fs_devices->rw_devices--;
1066	}
1067
1068	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1069	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, addr: &device->dev_state);
1070
1071	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1072	clear_bit(BTRFS_DEV_STATE_MISSING, addr: &device->dev_state);
1073	fs_devices->missing_devices--;
1074	}
1075
1076	btrfs_close_bdev(device);
1077	if (device->bdev) {
1078	fs_devices->open_devices--;
1079	device->bdev = NULL;
1080	}
1081	clear_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
1082	btrfs_destroy_dev_zone_info(device);
1083
1084	device->fs_info = NULL;
1085	atomic_set(v: &device->dev_stats_ccnt, i: `0`);
1086	extent_io_tree_release(tree: &device->alloc_state);
1087
1088	/*
1089	* Reset the flush error record. We might have a transient flush error
1090	* in this mount, and if so we aborted the current transaction and set
1091	* the fs to an error state, guaranteeing no super blocks can be further
1092	* committed. However that error might be transient and if we unmount the
1093	* filesystem and mount it again, we should allow the mount to succeed
1094	* (btrfs_check_rw_degradable() should not fail) - if after mounting the
1095	* filesystem again we still get flush errors, then we will again abort
1096	* any transaction and set the error state, guaranteeing no commits of
1097	* unsafe super blocks.
1098	*/
1099	device->last_flush_error = `0`;
1100
1101	/ Verify the device is back in a pristine state /
1102	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1103	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1104	WARN_ON(!list_empty(&device->dev_alloc_list));
1105	WARN_ON(!list_empty(&device->post_commit_list));
1106	}
1107
1108	static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1109	{
1110	struct btrfs_device device, tmp;
1111
1112	lockdep_assert_held(&uuid_mutex);
1113
1114	if (--fs_devices->opened > `0`)
1115	return;
1116
1117	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1118	btrfs_close_one_device(device);
1119
1120	WARN_ON(fs_devices->open_devices);
1121	WARN_ON(fs_devices->rw_devices);
1122	fs_devices->opened = `0`;
1123	fs_devices->seeding = false;
1124	fs_devices->fs_info = NULL;
1125	}
1126
1127	void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1128	{
1129	LIST_HEAD(list);
1130	struct btrfs_fs_devices *tmp;
1131
1132	mutex_lock(&uuid_mutex);
1133	close_fs_devices(fs_devices);
1134	if (!fs_devices->opened) {
1135	list_splice_init(list: &fs_devices->seed_list, head: &list);
1136
1137	/*
1138	* If the struct btrfs_fs_devices is not assembled with any
1139	* other device, it can be re-initialized during the next mount
1140	* without the needing device-scan step. Therefore, it can be
1141	* fully freed.
1142	*/
1143	if (fs_devices->num_devices == `1`) {
1144	list_del(entry: &fs_devices->fs_list);
1145	free_fs_devices(fs_devices);
1146	}
1147	}
1148
1149
1150	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1151	close_fs_devices(fs_devices);
1152	list_del(entry: &fs_devices->seed_list);
1153	free_fs_devices(fs_devices);
1154	}
1155	mutex_unlock(lock: &uuid_mutex);
1156	}
1157
1158	static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1159	blk_mode_t flags, void *holder)
1160	{
1161	struct btrfs_device *device;
1162	struct btrfs_device *latest_dev = NULL;
1163	struct btrfs_device *tmp_device;
1164
1165	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1166	dev_list) {
1167	int ret;
1168
1169	ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1170	if (ret == `0` &&
1171	(!latest_dev \|\| device->generation > latest_dev->generation)) {
1172	latest_dev = device;
1173	} else if (ret == -ENODATA) {
1174	fs_devices->num_devices--;
1175	list_del(entry: &device->dev_list);
1176	btrfs_free_device(device);
1177	}
1178	}
1179	if (fs_devices->open_devices == `0`)
1180	return -EINVAL;
1181
1182	fs_devices->opened = `1`;
1183	fs_devices->latest_dev = latest_dev;
1184	fs_devices->total_rw_bytes = `0`;
1185	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1186	fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1187
1188	return `0`;
1189	}
1190
1191	static int devid_cmp(void priv, const* struct list_head *a,
1192	const struct list_head *b)
1193	{
1194	const struct btrfs_device dev1, dev2;
1195
1196	dev1 = list_entry(a, struct btrfs_device, dev_list);
1197	dev2 = list_entry(b, struct btrfs_device, dev_list);
1198
1199	if (dev1->devid < dev2->devid)
1200	return -`1`;
1201	else if (dev1->devid > dev2->devid)
1202	return `1`;
1203	return `0`;
1204	}
1205
1206	int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1207	blk_mode_t flags, void *holder)
1208	{
1209	int ret;
1210
1211	lockdep_assert_held(&uuid_mutex);
1212	/*
1213	* The device_list_mutex cannot be taken here in case opening the
1214	* underlying device takes further locks like open_mutex.
1215	*
1216	* We also don't need the lock here as this is called during mount and
1217	* exclusion is provided by uuid_mutex
1218	*/
1219
1220	if (fs_devices->opened) {
1221	fs_devices->opened++;
1222	ret = `0`;
1223	} else {
1224	list_sort(NULL, head: &fs_devices->devices, cmp: devid_cmp);
1225	ret = open_fs_devices(fs_devices, flags, holder);
1226	}
1227
1228	return ret;
1229	}
1230
/*
 * Drop the reference on the page that contains @super.
 *
 * @super may be any address inside the page: the page is looked up with
 * virt_to_page(), so both the super block pointer and the page base address
 * work.
 */
void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}
1237
1238	static struct btrfs_super_block btrfs_read_disk_super(struct* block_device *bdev,
1239	u64 bytenr, u64 bytenr_orig)
1240	{
1241	struct btrfs_super_block *disk_super;
1242	struct page *page;
1243	void *p;
1244	pgoff_t index;
1245
1246	/ make sure our super fits in the device /
1247	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
1248	return ERR_PTR(error: -EINVAL);
1249
1250	/ make sure our super fits in the page /
1251	if (sizeof(*disk_super) > PAGE_SIZE)
1252	return ERR_PTR(error: -EINVAL);
1253
1254	/ make sure our super doesn't straddle pages on disk /
1255	index = bytenr >> PAGE_SHIFT;
1256	if ((bytenr + sizeof(*disk_super) - `1`) >> PAGE_SHIFT != index)
1257	return ERR_PTR(error: -EINVAL);
1258
1259	/ pull in the page with our super /
1260	page = read_cache_page_gfp(mapping: bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1261
1262	if (IS_ERR(ptr: page))
1263	return ERR_CAST(ptr: page);
1264
1265	p = page_address(page);
1266
1267	/ align our pointer to the offset of the super block /
1268	disk_super = p + offset_in_page(bytenr);
1269
1270	if (btrfs_super_bytenr(s: disk_super) != bytenr_orig \|\|
1271	btrfs_super_magic(s: disk_super) != BTRFS_MAGIC) {
1272	btrfs_release_disk_super(super: p);
1273	return ERR_PTR(error: -EINVAL);
1274	}
1275
1276	if (disk_super->label[`0`] && disk_super->label[BTRFS_LABEL_SIZE - `1`])
1277	disk_super->label[BTRFS_LABEL_SIZE - `1`] = `0`;
1278
1279	return disk_super;
1280	}
1281
1282	int btrfs_forget_devices(dev_t devt)
1283	{
1284	int ret;
1285
1286	mutex_lock(&uuid_mutex);
1287	ret = btrfs_free_stale_devices(devt, NULL);
1288	mutex_unlock(lock: &uuid_mutex);
1289
1290	return ret;
1291	}
1292
1293	/*
1294	* Look for a btrfs signature on a device. This may be called out of the mount path
1295	* and we are not allowed to call set_blocksize during the scan. The superblock
1296	* is read via pagecache.
1297	*
1298	* With @mount_arg_dev it's a scan during mount time that will always register
1299	* the device or return an error. Multi-device and seeding devices are registered
1300	* in both cases.
1301	*/
1302	struct btrfs_device btrfs_scan_one_device(const* char *path, blk_mode_t flags,
1303	bool mount_arg_dev)
1304	{
1305	struct btrfs_super_block *disk_super;
1306	bool new_device_added = false;
1307	struct btrfs_device *device = NULL;
1308	struct bdev_handle *bdev_handle;
1309	u64 bytenr, bytenr_orig;
1310	int ret;
1311
1312	lockdep_assert_held(&uuid_mutex);
1313
1314	/*
1315	* we would like to check all the supers, but that would make
1316	* a btrfs mount succeed after a mkfs from a different FS.
1317	* So, we need to add a special mount option to scan for
1318	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
1319	*/
1320
1321	/*
1322	* Avoid an exclusive open here, as the systemd-udev may initiate the
1323	* device scan which may race with the user's mount or mkfs command,
1324	* resulting in failure.
1325	* Since the device scan is solely for reading purposes, there is no
1326	* need for an exclusive open. Additionally, the devices are read again
1327	* during the mount process. It is ok to get some inconsistent
1328	* values temporarily, as the device paths of the fsid are the only
1329	* required information for assembling the volume.
1330	*/
1331	bdev_handle = bdev_open_by_path(path, mode: flags, NULL, NULL);
1332	if (IS_ERR(ptr: bdev_handle))
1333	return ERR_CAST(ptr: bdev_handle);
1334
1335	bytenr_orig = btrfs_sb_offset(mirror: `0`);
1336	ret = btrfs_sb_log_location_bdev(bdev: bdev_handle->bdev, mirror: `0`, READ, bytenr_ret: &bytenr);
1337	if (ret) {
1338	device = ERR_PTR(error: ret);
1339	goto error_bdev_put;
1340	}
1341
1342	disk_super = btrfs_read_disk_super(bdev: bdev_handle->bdev, bytenr,
1343	bytenr_orig);
1344	if (IS_ERR(ptr: disk_super)) {
1345	device = ERR_CAST(ptr: disk_super);
1346	goto error_bdev_put;
1347	}
1348
1349	if (!mount_arg_dev && btrfs_super_num_devices(s: disk_super) == `1` &&
1350	!(btrfs_super_flags(s: disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
1351	dev_t devt;
1352
1353	ret = lookup_bdev(pathname: path, dev: &devt);
1354	if (ret)
1355	btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
1356	path, ret);
1357	else
1358	btrfs_free_stale_devices(devt, NULL);
1359
1360	pr_debug("BTRFS: skip registering single non-seed device %s\n", path);
1361	device = NULL;
1362	goto free_disk_super;
1363	}
1364
1365	device = device_list_add(path, disk_super, new_device_added: &new_device_added);
1366	if (!IS_ERR(ptr: device) && new_device_added)
1367	btrfs_free_stale_devices(devt: device->devt, skip_device: device);
1368
1369	free_disk_super:
1370	btrfs_release_disk_super(super: disk_super);
1371
1372	error_bdev_put:
1373	bdev_release(handle: bdev_handle);
1374
1375	return device;
1376	}
1377
1378	/*
1379	* Try to find a chunk that intersects [start, start + len] range and when one
1380	* such is found, record the end of it in *start
1381	*/
1382	static bool contains_pending_extent(struct btrfs_device device, u64 start,
1383	u64 len)
1384	{
1385	u64 physical_start, physical_end;
1386
1387	lockdep_assert_held(&device->fs_info->chunk_mutex);
1388
1389	if (find_first_extent_bit(tree: &device->alloc_state, start: *start,
1390	start_ret: &physical_start, end_ret: &physical_end,
1391	CHUNK_ALLOCATED, NULL)) {
1392
1393	if (in_range(physical_start, *start, len) \|\|
1394	in_range(*start, physical_start,
1395	physical_end - physical_start)) {
1396	*start = physical_end + `1`;
1397	return true;
1398	}
1399	}
1400	return false;
1401	}
1402
1403	static u64 dev_extent_search_start(struct btrfs_device *device)
1404	{
1405	switch (device->fs_devices->chunk_alloc_policy) {
1406	case BTRFS_CHUNK_ALLOC_REGULAR:
1407	return BTRFS_DEVICE_RANGE_RESERVED;
1408	case BTRFS_CHUNK_ALLOC_ZONED:
1409	/*
1410	* We don't care about the starting region like regular
1411	* allocator, because we anyway use/reserve the first two zones
1412	* for superblock logging.
1413	*/
1414	return `0`;
1415	default:
1416	BUG();
1417	}
1418	}
1419
1420	static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1421	u64 hole_start, u64 hole_size,
1422	u64 num_bytes)
1423	{
1424	u64 zone_size = device->zone_info->zone_size;
1425	u64 pos;
1426	int ret;
1427	bool changed = false;
1428
1429	ASSERT(IS_ALIGNED(*hole_start, zone_size));
1430
1431	while (*hole_size > `0`) {
1432	pos = btrfs_find_allocatable_zones(device, hole_start: *hole_start,
1433	hole_end: hole_start + hole_size,
1434	num_bytes);
1435	if (pos != *hole_start) {
1436	hole_size = hole_start + *hole_size - pos;
1437	*hole_start = pos;
1438	changed = true;
1439	if (*hole_size < num_bytes)
1440	break;
1441	}
1442
1443	ret = btrfs_ensure_empty_zones(device, start: pos, size: num_bytes);
1444
1445	/ Range is ensured to be empty /
1446	if (!ret)
1447	return changed;
1448
1449	/ Given hole range was invalid (outside of device) /
1450	if (ret == -ERANGE) {
1451	hole_start += hole_size;
1452	*hole_size = `0`;
1453	return true;
1454	}
1455
1456	*hole_start += zone_size;
1457	*hole_size -= zone_size;
1458	changed = true;
1459	}
1460
1461	return changed;
1462	}
1463
1464	/*
1465	* Check if specified hole is suitable for allocation.
1466	*
1467	* @device: the device which we have the hole
1468	* @hole_start: starting position of the hole
1469	* @hole_size: the size of the hole
1470	* @num_bytes: the size of the free space that we need
1471	*
1472	* This function may modify @hole_start and @hole_size to reflect the suitable
1473	* position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1474	*/
1475	static bool dev_extent_hole_check(struct btrfs_device device, u64 hole_start,
1476	u64 *hole_size, u64 num_bytes)
1477	{
1478	bool changed = false;
1479	u64 hole_end = hole_start + hole_size;
1480
1481	for (;;) {
1482	/*
1483	* Check before we set max_hole_start, otherwise we could end up
1484	* sending back this offset anyway.
1485	*/
1486	if (contains_pending_extent(device, start: hole_start, len: *hole_size)) {
1487	if (hole_end >= *hole_start)
1488	hole_size = hole_end - hole_start;
1489	else
1490	*hole_size = `0`;
1491	changed = true;
1492	}
1493
1494	switch (device->fs_devices->chunk_alloc_policy) {
1495	case BTRFS_CHUNK_ALLOC_REGULAR:
1496	/ No extra check /
1497	break;
1498	case BTRFS_CHUNK_ALLOC_ZONED:
1499	if (dev_extent_hole_check_zoned(device, hole_start,
1500	hole_size, num_bytes)) {
1501	changed = true;
1502	/*
1503	* The changed hole can contain pending extent.
1504	* Loop again to check that.
1505	*/
1506	continue;
1507	}
1508	break;
1509	default:
1510	BUG();
1511	}
1512
1513	break;
1514	}
1515
1516	return changed;
1517	}
1518
1519	/*
1520	* Find free space in the specified device.
1521	*
1522	* @device: the device which we search the free space in
1523	* @num_bytes: the size of the free space that we need
1524	* @search_start: the position from which to begin the search
1525	* @start: store the start of the free space.
1526	* @len: the size of the free space. that we find, or the size
1527	* of the max free space if we don't find suitable free space
1528	*
1529	* This does a pretty simple search, the expectation is that it is called very
1530	* infrequently and that a given device has a small number of extents.
1531	*
1532	* @start is used to store the start of the free space if we find. But if we
1533	* don't find suitable free space, it will be used to store the start position
1534	* of the max free space.
1535	*
1536	* @len is used to store the size of the free space that we find.
1537	* But if we don't find suitable free space, it is used to store the size of
1538	* the max free space.
1539	*
1540	* NOTE: This function will search commit root of device tree, and does extra
1541	* check to ensure dev extents are not double allocated.
1542	* This makes the function safe to allocate dev extents but may not report
1543	* correct usable device space, as device extent freed in current transaction
1544	* is not reported as available.
1545	*/
1546	static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1547	u64 start, u64 len)
1548	{
1549	struct btrfs_fs_info *fs_info = device->fs_info;
1550	struct btrfs_root *root = fs_info->dev_root;
1551	struct btrfs_key key;
1552	struct btrfs_dev_extent *dev_extent;
1553	struct btrfs_path *path;
1554	u64 search_start;
1555	u64 hole_size;
1556	u64 max_hole_start;
1557	u64 max_hole_size = `0`;
1558	u64 extent_end;
1559	u64 search_end = device->total_bytes;
1560	int ret;
1561	int slot;
1562	struct extent_buffer *l;
1563
1564	search_start = dev_extent_search_start(device);
1565	max_hole_start = search_start;
1566
1567	WARN_ON(device->zone_info &&
1568	!IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1569
1570	path = btrfs_alloc_path();
1571	if (!path) {
1572	ret = -ENOMEM;
1573	goto out;
1574	}
1575	again:
1576	if (search_start >= search_end \|\|
1577	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1578	ret = -ENOSPC;
1579	goto out;
1580	}
1581
1582	path->reada = READA_FORWARD;
1583	path->search_commit_root = `1`;
1584	path->skip_locking = `1`;
1585
1586	key.objectid = device->devid;
1587	key.offset = search_start;
1588	key.type = BTRFS_DEV_EXTENT_KEY;
1589
1590	ret = btrfs_search_backwards(root, key: &key, path);
1591	if (ret < `0`)
1592	goto out;
1593
1594	while (search_start < search_end) {
1595	l = path->nodes[`0`];
1596	slot = path->slots[`0`];
1597	if (slot >= btrfs_header_nritems(eb: l)) {
1598	ret = btrfs_next_leaf(root, path);
1599	if (ret == `0`)
1600	continue;
1601	if (ret < `0`)
1602	goto out;
1603
1604	break;
1605	}
1606	btrfs_item_key_to_cpu(eb: l, cpu_key: &key, nr: slot);
1607
1608	if (key.objectid < device->devid)
1609	goto next;
1610
1611	if (key.objectid > device->devid)
1612	break;
1613
1614	if (key.type != BTRFS_DEV_EXTENT_KEY)
1615	goto next;
1616
1617	if (key.offset > search_end)
1618	break;
1619
1620	if (key.offset > search_start) {
1621	hole_size = key.offset - search_start;
1622	dev_extent_hole_check(device, hole_start: &search_start, hole_size: &hole_size,
1623	num_bytes);
1624
1625	if (hole_size > max_hole_size) {
1626	max_hole_start = search_start;
1627	max_hole_size = hole_size;
1628	}
1629
1630	/*
1631	* If this free space is greater than which we need,
1632	* it must be the max free space that we have found
1633	* until now, so max_hole_start must point to the start
1634	* of this free space and the length of this free space
1635	* is stored in max_hole_size. Thus, we return
1636	* max_hole_start and max_hole_size and go back to the
1637	* caller.
1638	*/
1639	if (hole_size >= num_bytes) {
1640	ret = `0`;
1641	goto out;
1642	}
1643	}
1644
1645	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1646	extent_end = key.offset + btrfs_dev_extent_length(eb: l,
1647	s: dev_extent);
1648	if (extent_end > search_start)
1649	search_start = extent_end;
1650	next:
1651	path->slots[`0`]++;
1652	cond_resched();
1653	}
1654
1655	/*
1656	* At this point, search_start should be the end of
1657	* allocated dev extents, and when shrinking the device,
1658	* search_end may be smaller than search_start.
1659	*/
1660	if (search_end > search_start) {
1661	hole_size = search_end - search_start;
1662	if (dev_extent_hole_check(device, hole_start: &search_start, hole_size: &hole_size,
1663	num_bytes)) {
1664	btrfs_release_path(p: path);
1665	goto again;
1666	}
1667
1668	if (hole_size > max_hole_size) {
1669	max_hole_start = search_start;
1670	max_hole_size = hole_size;
1671	}
1672	}
1673
1674	/ See above. /
1675	if (max_hole_size < num_bytes)
1676	ret = -ENOSPC;
1677	else
1678	ret = `0`;
1679
1680	ASSERT(max_hole_start + max_hole_size <= search_end);
1681	out:
1682	btrfs_free_path(p: path);
1683	*start = max_hole_start;
1684	if (len)
1685	*len = max_hole_size;
1686	return ret;
1687	}
1688
1689	static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1690	struct btrfs_device *device,
1691	u64 start, u64 *dev_extent_len)
1692	{
1693	struct btrfs_fs_info *fs_info = device->fs_info;
1694	struct btrfs_root *root = fs_info->dev_root;
1695	int ret;
1696	struct btrfs_path *path;
1697	struct btrfs_key key;
1698	struct btrfs_key found_key;
1699	struct extent_buffer *leaf = NULL;
1700	struct btrfs_dev_extent *extent = NULL;
1701
1702	path = btrfs_alloc_path();
1703	if (!path)
1704	return -ENOMEM;
1705
1706	key.objectid = device->devid;
1707	key.offset = start;
1708	key.type = BTRFS_DEV_EXTENT_KEY;
1709	again:
1710	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
1711	if (ret > `0`) {
1712	ret = btrfs_previous_item(root, path, min_objectid: key.objectid,
1713	BTRFS_DEV_EXTENT_KEY);
1714	if (ret)
1715	goto out;
1716	leaf = path->nodes[`0`];
1717	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: path->slots[`0`]);
1718	extent = btrfs_item_ptr(leaf, path->slots[`0`],
1719	struct btrfs_dev_extent);
1720	BUG_ON(found_key.offset > start \|\| found_key.offset +
1721	btrfs_dev_extent_length(leaf, extent) < start);
1722	key = found_key;
1723	btrfs_release_path(p: path);
1724	goto again;
1725	} else if (ret == `0`) {
1726	leaf = path->nodes[`0`];
1727	extent = btrfs_item_ptr(leaf, path->slots[`0`],
1728	struct btrfs_dev_extent);
1729	} else {
1730	goto out;
1731	}
1732
1733	*dev_extent_len = btrfs_dev_extent_length(eb: leaf, s: extent);
1734
1735	ret = btrfs_del_item(trans, root, path);
1736	if (ret == `0`)
1737	set_bit(BTRFS_TRANS_HAVE_FREE_BGS, addr: &trans->transaction->flags);
1738	out:
1739	btrfs_free_path(p: path);
1740	return ret;
1741	}
1742
1743	static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1744	{
1745	struct extent_map_tree *em_tree;
1746	struct extent_map *em;
1747	struct rb_node *n;
1748	u64 ret = `0`;
1749
1750	em_tree = &fs_info->mapping_tree;
1751	read_lock(&em_tree->lock);
1752	n = rb_last(&em_tree->map.rb_root);
1753	if (n) {
1754	em = rb_entry(n, struct extent_map, rb_node);
1755	ret = em->start + em->len;
1756	}
1757	read_unlock(&em_tree->lock);
1758
1759	return ret;
1760	}
1761
1762	static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1763	u64 *devid_ret)
1764	{
1765	int ret;
1766	struct btrfs_key key;
1767	struct btrfs_key found_key;
1768	struct btrfs_path *path;
1769
1770	path = btrfs_alloc_path();
1771	if (!path)
1772	return -ENOMEM;
1773
1774	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1775	key.type = BTRFS_DEV_ITEM_KEY;
1776	key.offset = (u64)-`1`;
1777
1778	ret = btrfs_search_slot(NULL, root: fs_info->chunk_root, key: &key, p: path, ins_len: `0`, cow: `0`);
1779	if (ret < `0`)
1780	goto error;
1781
1782	if (ret == `0`) {
1783	/ Corruption /
1784	btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1785	ret = -EUCLEAN;
1786	goto error;
1787	}
1788
1789	ret = btrfs_previous_item(root: fs_info->chunk_root, path,
1790	BTRFS_DEV_ITEMS_OBJECTID,
1791	BTRFS_DEV_ITEM_KEY);
1792	if (ret) {
1793	*devid_ret = `1`;
1794	} else {
1795	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
1796	nr: path->slots[`0`]);
1797	*devid_ret = found_key.offset + `1`;
1798	}
1799	ret = `0`;
1800	error:
1801	btrfs_free_path(p: path);
1802	return ret;
1803	}
1804
1805	/*
1806	* the device information is stored in the chunk root
1807	* the btrfs_device struct should be fully filled in
1808	*/
1809	static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1810	struct btrfs_device *device)
1811	{
1812	int ret;
1813	struct btrfs_path *path;
1814	struct btrfs_dev_item *dev_item;
1815	struct extent_buffer *leaf;
1816	struct btrfs_key key;
1817	unsigned long ptr;
1818
1819	path = btrfs_alloc_path();
1820	if (!path)
1821	return -ENOMEM;
1822
1823	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1824	key.type = BTRFS_DEV_ITEM_KEY;
1825	key.offset = device->devid;
1826
1827	btrfs_reserve_chunk_metadata(trans, is_item_insertion: true);
1828	ret = btrfs_insert_empty_item(trans, root: trans->fs_info->chunk_root, path,
1829	key: &key, data_size: sizeof(*dev_item));
1830	btrfs_trans_release_chunk_metadata(trans);
1831	if (ret)
1832	goto out;
1833
1834	leaf = path->nodes[`0`];
1835	dev_item = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_dev_item);
1836
1837	btrfs_set_device_id(eb: leaf, s: dev_item, val: device->devid);
1838	btrfs_set_device_generation(eb: leaf, s: dev_item, val: `0`);
1839	btrfs_set_device_type(eb: leaf, s: dev_item, val: device->type);
1840	btrfs_set_device_io_align(eb: leaf, s: dev_item, val: device->io_align);
1841	btrfs_set_device_io_width(eb: leaf, s: dev_item, val: device->io_width);
1842	btrfs_set_device_sector_size(eb: leaf, s: dev_item, val: device->sector_size);
1843	btrfs_set_device_total_bytes(eb: leaf, s: dev_item,
1844	val: btrfs_device_get_disk_total_bytes(dev: device));
1845	btrfs_set_device_bytes_used(eb: leaf, s: dev_item,
1846	val: btrfs_device_get_bytes_used(dev: device));
1847	btrfs_set_device_group(eb: leaf, s: dev_item, val: `0`);
1848	btrfs_set_device_seek_speed(eb: leaf, s: dev_item, val: `0`);
1849	btrfs_set_device_bandwidth(eb: leaf, s: dev_item, val: `0`);
1850	btrfs_set_device_start_offset(eb: leaf, s: dev_item, val: `0`);
1851
1852	ptr = btrfs_device_uuid(d: dev_item);
1853	write_extent_buffer(eb: leaf, src: device->uuid, start: ptr, BTRFS_UUID_SIZE);
1854	ptr = btrfs_device_fsid(d: dev_item);
1855	write_extent_buffer(eb: leaf, src: trans->fs_info->fs_devices->metadata_uuid,
1856	start: ptr, BTRFS_FSID_SIZE);
1857	btrfs_mark_buffer_dirty(trans, buf: leaf);
1858
1859	ret = `0`;
1860	out:
1861	btrfs_free_path(p: path);
1862	return ret;
1863	}
1864
1865	/*
1866	* Function to update ctime/mtime for a given device path.
1867	* Mainly used for ctime/mtime based probe like libblkid.
1868	*
1869	* We don't care about errors here, this is just to be kind to userspace.
1870	*/
1871	static void update_dev_time(const char *device_path)
1872	{
1873	struct path path;
1874	int ret;
1875
1876	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1877	if (ret)
1878	return;
1879
1880	inode_update_time(inode: d_inode(dentry: path.dentry), flags: S_MTIME \| S_CTIME \| S_VERSION);
1881	path_put(&path);
1882	}
1883
1884	static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
1885	struct btrfs_device *device)
1886	{
1887	struct btrfs_root *root = device->fs_info->chunk_root;
1888	int ret;
1889	struct btrfs_path *path;
1890	struct btrfs_key key;
1891
1892	path = btrfs_alloc_path();
1893	if (!path)
1894	return -ENOMEM;
1895
1896	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1897	key.type = BTRFS_DEV_ITEM_KEY;
1898	key.offset = device->devid;
1899
1900	btrfs_reserve_chunk_metadata(trans, is_item_insertion: false);
1901	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
1902	btrfs_trans_release_chunk_metadata(trans);
1903	if (ret) {
1904	if (ret > `0`)
1905	ret = -ENOENT;
1906	goto out;
1907	}
1908
1909	ret = btrfs_del_item(trans, root, path);
1910	out:
1911	btrfs_free_path(p: path);
1912	return ret;
1913	}
1914
1915	/*
1916	* Verify that @num_devices satisfies the RAID profile constraints in the whole
1917	* filesystem. It's up to the caller to adjust that number regarding eg. device
1918	* replace.
1919	*/
1920	static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1921	u64 num_devices)
1922	{
1923	u64 all_avail;
1924	unsigned seq;
1925	int i;
1926
1927	do {
1928	seq = read_seqbegin(sl: &fs_info->profiles_lock);
1929
1930	all_avail = fs_info->avail_data_alloc_bits \|
1931	fs_info->avail_system_alloc_bits \|
1932	fs_info->avail_metadata_alloc_bits;
1933	} while (read_seqretry(sl: &fs_info->profiles_lock, start: seq));
1934
1935	for (i = `0`; i < BTRFS_NR_RAID_TYPES; i++) {
1936	if (!(all_avail & btrfs_raid_array[i].bg_flag))
1937	continue;
1938
1939	if (num_devices < btrfs_raid_array[i].devs_min)
1940	return btrfs_raid_array[i].mindev_error;
1941	}
1942
1943	return `0`;
1944	}
1945
1946	static struct btrfs_device * btrfs_find_next_active_device(
1947	struct btrfs_fs_devices fs_devs, struct* btrfs_device *device)
1948	{
1949	struct btrfs_device *next_device;
1950
1951	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1952	if (next_device != device &&
1953	!test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1954	&& next_device->bdev)
1955	return next_device;
1956	}
1957
1958	return NULL;
1959	}
1960
1961	/*
1962	* Helper function to check if the given device is part of s_bdev / latest_dev
1963	* and replace it with the provided or the next active device, in the context
1964	* where this function called, there should be always be another device (or
1965	* this_dev) which is active.
1966	*/
1967	void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1968	struct btrfs_device *next_device)
1969	{
1970	struct btrfs_fs_info *fs_info = device->fs_info;
1971
1972	if (!next_device)
1973	next_device = btrfs_find_next_active_device(fs_devs: fs_info->fs_devices,
1974	device);
1975	ASSERT(next_device);
1976
1977	if (fs_info->sb->s_bdev &&
1978	(fs_info->sb->s_bdev == device->bdev))
1979	fs_info->sb->s_bdev = next_device->bdev;
1980
1981	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
1982	fs_info->fs_devices->latest_dev = next_device;
1983	}
1984
1985	/*
1986	* Return btrfs_fs_devices::num_devices excluding the device that's being
1987	* currently replaced.
1988	*/
1989	static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1990	{
1991	u64 num_devices = fs_info->fs_devices->num_devices;
1992
1993	down_read(sem: &fs_info->dev_replace.rwsem);
1994	if (btrfs_dev_replace_is_ongoing(dev_replace: &fs_info->dev_replace)) {
1995	ASSERT(num_devices > `1`);
1996	num_devices--;
1997	}
1998	up_read(sem: &fs_info->dev_replace.rwsem);
1999
2000	return num_devices;
2001	}
2002
2003	static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
2004	struct block_device bdev, int* copy_num)
2005	{
2006	struct btrfs_super_block *disk_super;
2007	const size_t len = sizeof(disk_super->magic);
2008	const u64 bytenr = btrfs_sb_offset(mirror: copy_num);
2009	int ret;
2010
2011	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig: bytenr);
2012	if (IS_ERR(ptr: disk_super))
2013	return;
2014
2015	memset(&disk_super->magic, `0`, len);
2016	folio_mark_dirty(folio: virt_to_folio(x: disk_super));
2017	btrfs_release_disk_super(super: disk_super);
2018
2019	ret = sync_blockdev_range(bdev, lstart: bytenr, lend: bytenr + len - `1`);
2020	if (ret)
2021	btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
2022	copy_num, ret);
2023	}
2024
2025	void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2026	struct block_device *bdev,
2027	const char *device_path)
2028	{
2029	int copy_num;
2030
2031	if (!bdev)
2032	return;
2033
2034	for (copy_num = `0`; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2035	if (bdev_is_zoned(bdev))
2036	btrfs_reset_sb_log_zones(bdev, mirror: copy_num);
2037	else
2038	btrfs_scratch_superblock(fs_info, bdev, copy_num);
2039	}
2040
2041	/ Notify udev that device has changed /
2042	btrfs_kobject_uevent(bdev, action: KOBJ_CHANGE);
2043
2044	/ Update ctime/mtime for device path for libblkid /
2045	update_dev_time(device_path);
2046	}
2047
/*
 * Remove the device described by @args from a mounted filesystem.
 *
 * Steps, in the order they appear below: validate the request, take a
 * writeable device off the allocation list, shrink it to size 0, delete its
 * dev item inside a transaction, unlink it from the device list while fixing
 * the counters and super_copy, scratch its superblocks, free the
 * btrfs_device and commit the transaction.
 *
 * @fs_info:     the filesystem
 * @args:        lookup arguments identifying the device to remove
 * @bdev_handle: out parameter, set to device->bdev_handle just before the
 *               device is freed so that the caller can release it
 *
 * Return: 0 on success, otherwise a negative errno or one of the
 * BTRFS_ERROR_DEV_* codes returned below.
 */
2048	int btrfs_rm_device(struct btrfs_fs_info *fs_info,
2049	struct btrfs_dev_lookup_args *args,
2050	struct bdev_handle **bdev_handle)
2051	{
2052	struct btrfs_trans_handle *trans;
2053	struct btrfs_device *device;
2054	struct btrfs_fs_devices *cur_devices;
2055	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2056	u64 num_devices;
2057	int ret = `0`;
2058
2059	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2060	btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
2061	return -EINVAL;
2062	}
2063
2064	/*
2065	* The device list in fs_devices is accessed without locks (neither
2066	* uuid_mutex nor device_list_mutex) as it won't change on a mounted
2067	* filesystem and another device rm cannot run.
2068	*/
2069	num_devices = btrfs_num_devices(fs_info);
2070
/* Check the RAID minimum against the device count after the removal. */
2071	ret = btrfs_check_raid_min_devices(fs_info, num_devices: num_devices - `1`);
2072	if (ret)
2073	return ret;
2074
2075	device = btrfs_find_device(fs_devices: fs_info->fs_devices, args);
2076	if (!device) {
2077	if (args->missing)
2078	ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2079	else
2080	ret = -ENOENT;
2081	return ret;
2082	}
2083
2084	if (btrfs_pinned_by_swapfile(fs_info, ptr: device)) {
2085	btrfs_warn_in_rcu(fs_info,
2086	"cannot remove device %s (devid %llu) due to active swapfile",
2087	btrfs_dev_name(device), device->devid);
2088	return -ETXTBSY;
2089	}
2090
2091	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
2092	return BTRFS_ERROR_DEV_TGT_REPLACE;
2093
/* Refuse to remove the only writeable device. */
2094	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2095	fs_info->fs_devices->rw_devices == `1`)
2096	return BTRFS_ERROR_DEV_ONLY_WRITABLE;
2097
/*
 * Drop the device from the allocation list before shrinking it; the
 * error_undo label at the bottom reverses exactly this step.
 */
2098	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2099	mutex_lock(&fs_info->chunk_mutex);
2100	list_del_init(entry: &device->dev_alloc_list);
2101	device->fs_devices->rw_devices--;
2102	mutex_unlock(lock: &fs_info->chunk_mutex);
2103	}
2104
2105	ret = btrfs_shrink_device(device, new_size: `0`);
2106	if (ret)
2107	goto error_undo;
2108
2109	trans = btrfs_start_transaction(root: fs_info->chunk_root, num_items: `0`);
2110	if (IS_ERR(ptr: trans)) {
2111	ret = PTR_ERR(ptr: trans);
2112	goto error_undo;
2113	}
2114
/*
 * Note that a failure here aborts and ends the transaction and returns
 * directly; it does not go through error_undo.
 */
2115	ret = btrfs_rm_dev_item(trans, device);
2116	if (ret) {
2117	/ Any error in dev item removal is critical /
2118	btrfs_crit(fs_info,
2119	"failed to remove device item for devid %llu: %d",
2120	device->devid, ret);
2121	btrfs_abort_transaction(trans, ret);
2122	btrfs_end_transaction(trans);
2123	return ret;
2124	}
2125
2126	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, addr: &device->dev_state);
2127	btrfs_scrub_cancel_dev(dev: device);
2128
2129	/*
2130	* the device list mutex makes sure that we don't change
2131	* the device list while someone else is writing out all
2132	* the device supers. Whoever is writing all supers, should
2133	* lock the device list mutex before getting the number of
2134	* devices in the super block (super_copy). Conversely,
2135	* whoever updates the number of devices in the super block
2136	* (super_copy) should hold the device list mutex.
2137	*/
2138
2139	/*
2140	* In normal cases the cur_devices == fs_devices. But in case
2141	* of deleting a seed device, the cur_devices should point to
2142	* its own fs_devices listed under the fs_devices->seed_list.
2143	*/
2144	cur_devices = device->fs_devices;
2145	mutex_lock(&fs_devices->device_list_mutex);
2146	list_del_rcu(entry: &device->dev_list);
2147
2148	cur_devices->num_devices--;
2149	cur_devices->total_devices--;
2150	/ Update total_devices of the parent fs_devices if it's seed /
2151	if (cur_devices != fs_devices)
2152	fs_devices->total_devices--;
2153
2154	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2155	cur_devices->missing_devices--;
2156
2157	btrfs_assign_next_active_device(device, NULL);
2158
2159	if (device->bdev_handle) {
2160	cur_devices->open_devices--;
2161	/ remove sysfs entry /
2162	btrfs_sysfs_remove_device(device);
2163	}
2164
/* Update the device count in super_copy while device_list_mutex is held. */
2165	num_devices = btrfs_super_num_devices(s: fs_info->super_copy) - `1`;
2166	btrfs_set_super_num_devices(s: fs_info->super_copy, val: num_devices);
2167	mutex_unlock(lock: &fs_devices->device_list_mutex);
2168
2169	/*
2170	* At this point, the device is zero sized and detached from the
2171	* devices list. All that's left is to zero out the old supers and
2172	* free the device.
2173	*
2174	* We cannot call btrfs_close_bdev() here because we're holding the sb
2175	* write lock, and bdev_release() will pull in the ->open_mutex on
2176	* the block device and its dependencies. Instead just flush the
2177	* device and let the caller do the final bdev_release.
2178	*/
2179	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2180	btrfs_scratch_superblocks(fs_info, bdev: device->bdev,
2181	device_path: device->name->str);
2182	if (device->bdev) {
2183	sync_blockdev(bdev: device->bdev);
2184	invalidate_bdev(bdev: device->bdev);
2185	}
2186	}
2187
/* Hand the handle to the caller before the device structure is freed. */
2188	*bdev_handle = device->bdev_handle;
2189	synchronize_rcu();
2190	btrfs_free_device(device);
2191
2192	/*
2193	* This can happen if cur_devices is the private seed devices list. We
2194	* cannot call close_fs_devices() here because it expects the uuid_mutex
2195	* to be held, but in fact we don't need that for the private
2196	* seed_devices, we can simply decrement cur_devices->opened and then
2197	* remove it from our list and free the fs_devices.
2198	*/
2199	if (cur_devices->num_devices == `0`) {
2200	list_del_init(entry: &cur_devices->seed_list);
2201	ASSERT(cur_devices->opened == `1`);
2202	cur_devices->opened--;
2203	free_fs_devices(fs_devices: cur_devices);
2204	}
2205
2206	ret = btrfs_commit_transaction(trans);
2207
2208	return ret;
2209
/*
 * Reached when shrinking or starting the transaction failed: put a
 * writeable device back on the allocation list and restore rw_devices.
 */
2210	error_undo:
2211	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2212	mutex_lock(&fs_info->chunk_mutex);
2213	list_add(new: &device->dev_alloc_list,
2214	head: &fs_devices->alloc_list);
2215	device->fs_devices->rw_devices++;
2216	mutex_unlock(lock: &fs_info->chunk_mutex);
2217	}
2218	return ret;
2219	}
2220
2221	void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2222	{
2223	struct btrfs_fs_devices *fs_devices;
2224
2225	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2226
2227	/*
2228	* in case of fs with no seed, srcdev->fs_devices will point
2229	* to fs_devices of fs_info. However when the dev being replaced is
2230	* a seed dev it will point to the seed's local fs_devices. In short
2231	* srcdev will have its correct fs_devices in both the cases.
2232	*/
2233	fs_devices = srcdev->fs_devices;
2234
2235	list_del_rcu(entry: &srcdev->dev_list);
2236	list_del(entry: &srcdev->dev_alloc_list);
2237	fs_devices->num_devices--;
2238	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2239	fs_devices->missing_devices--;
2240
2241	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2242	fs_devices->rw_devices--;
2243
2244	if (srcdev->bdev)
2245	fs_devices->open_devices--;
2246	}
2247
2248	void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2249	{
2250	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2251
2252	mutex_lock(&uuid_mutex);
2253
2254	btrfs_close_bdev(device: srcdev);
2255	synchronize_rcu();
2256	btrfs_free_device(device: srcdev);
2257
2258	/ if this is no devs we rather delete the fs_devices /
2259	if (!fs_devices->num_devices) {
2260	/*
2261	* On a mounted FS, num_devices can't be zero unless it's a
2262	* seed. In case of a seed device being replaced, the replace
2263	* target added to the sprout FS, so there will be no more
2264	* device left under the seed FS.
2265	*/
2266	ASSERT(fs_devices->seeding);
2267
2268	list_del_init(entry: &fs_devices->seed_list);
2269	close_fs_devices(fs_devices);
2270	free_fs_devices(fs_devices);
2271	}
2272	mutex_unlock(lock: &uuid_mutex);
2273	}
2274
2275	void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2276	{
2277	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2278
2279	mutex_lock(&fs_devices->device_list_mutex);
2280
2281	btrfs_sysfs_remove_device(device: tgtdev);
2282
2283	if (tgtdev->bdev)
2284	fs_devices->open_devices--;
2285
2286	fs_devices->num_devices--;
2287
2288	btrfs_assign_next_active_device(device: tgtdev, NULL);
2289
2290	list_del_rcu(entry: &tgtdev->dev_list);
2291
2292	mutex_unlock(lock: &fs_devices->device_list_mutex);
2293
2294	btrfs_scratch_superblocks(fs_info: tgtdev->fs_info, bdev: tgtdev->bdev,
2295	device_path: tgtdev->name->str);
2296
2297	btrfs_close_bdev(device: tgtdev);
2298	synchronize_rcu();
2299	btrfs_free_device(device: tgtdev);
2300	}
2301
2302	/*
2303	* Populate args from device at path.
2304	*
2305	* @fs_info: the filesystem
2306	* @args: the args to populate
2307	* @path: the path to the device
2308	*
2309	* This will read the super block of the device at @path and populate @args with
2310	* the devid, fsid, and uuid. This is meant to be used for ioctls that need to
2311	* lookup a device to operate on, but need to do it before we take any locks.
2312	* This properly handles the special case of "missing" that a user may pass in,
2313	* and does some basic sanity checks. The caller must make sure that @path is
2314	* properly NUL terminated before calling in, and must call
2315	* btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
2316	* uuid buffers.
2317	*
2318	* Return: 0 for success, -errno for failure
2319	*/
2320	int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
2321	struct btrfs_dev_lookup_args *args,
2322	const char *path)
2323	{
2324	struct btrfs_super_block *disk_super;
2325	struct bdev_handle *bdev_handle;
2326	int ret;
2327
2328	if (!path \|\| !path[`0`])
2329	return -EINVAL;
2330	if (!strcmp(path, "missing")) {
2331	args->missing = true;
2332	return `0`;
2333	}
2334
2335	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
2336	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
2337	if (!args->uuid \|\| !args->fsid) {
2338	btrfs_put_dev_args_from_path(args);
2339	return -ENOMEM;
2340	}
2341
2342	ret = btrfs_get_bdev_and_sb(device_path: path, BLK_OPEN_READ, NULL, flush: `0`,
2343	bdev_handle: &bdev_handle, disk_super: &disk_super);
2344	if (ret) {
2345	btrfs_put_dev_args_from_path(args);
2346	return ret;
2347	}
2348
2349	args->devid = btrfs_stack_device_id(s: &disk_super->dev_item);
2350	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
2351	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2352	memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
2353	else
2354	memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
2355	btrfs_release_disk_super(super: disk_super);
2356	bdev_release(handle: bdev_handle);
2357	return `0`;
2358	}
2359
2360	/*
2361	* Only use this jointly with btrfs_get_dev_args_from_path() because we will
2362	* allocate our ->uuid and ->fsid pointers, everybody else uses local variables
2363	* that don't need to be freed.
2364	*/
2365	void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
2366	{
2367	kfree(objp: args->uuid);
2368	kfree(objp: args->fsid);
2369	args->uuid = NULL;
2370	args->fsid = NULL;
2371	}
2372
2373	struct btrfs_device *btrfs_find_device_by_devspec(
2374	struct btrfs_fs_info *fs_info, u64 devid,
2375	const char *device_path)
2376	{
2377	BTRFS_DEV_LOOKUP_ARGS(args);
2378	struct btrfs_device *device;
2379	int ret;
2380
2381	if (devid) {
2382	args.devid = devid;
2383	device = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
2384	if (!device)
2385	return ERR_PTR(error: -ENOENT);
2386	return device;
2387	}
2388
2389	ret = btrfs_get_dev_args_from_path(fs_info, args: &args, path: device_path);
2390	if (ret)
2391	return ERR_PTR(error: ret);
2392	device = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
2393	btrfs_put_dev_args_from_path(args: &args);
2394	if (!device)
2395	return ERR_PTR(error: -ENOENT);
2396	return device;
2397	}
2398
2399	static struct btrfs_fs_devices btrfs_init_sprout(struct* btrfs_fs_info *fs_info)
2400	{
2401	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2402	struct btrfs_fs_devices *old_devices;
2403	struct btrfs_fs_devices *seed_devices;
2404
2405	lockdep_assert_held(&uuid_mutex);
2406	if (!fs_devices->seeding)
2407	return ERR_PTR(error: -EINVAL);
2408
2409	/*
2410	* Private copy of the seed devices, anchored at
2411	* fs_info->fs_devices->seed_list
2412	*/
2413	seed_devices = alloc_fs_devices(NULL);
2414	if (IS_ERR(ptr: seed_devices))
2415	return seed_devices;
2416
2417	/*
2418	* It's necessary to retain a copy of the original seed fs_devices in
2419	* fs_uuids so that filesystems which have been seeded can successfully
2420	* reference the seed device from open_seed_devices. This also supports
2421	* multiple fs seed.
2422	*/
2423	old_devices = clone_fs_devices(orig: fs_devices);
2424	if (IS_ERR(ptr: old_devices)) {
2425	kfree(objp: seed_devices);
2426	return old_devices;
2427	}
2428
2429	list_add(new: &old_devices->fs_list, head: &fs_uuids);
2430
2431	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2432	seed_devices->opened = `1`;
2433	INIT_LIST_HEAD(list: &seed_devices->devices);
2434	INIT_LIST_HEAD(list: &seed_devices->alloc_list);
2435	mutex_init(&seed_devices->device_list_mutex);
2436
2437	return seed_devices;
2438	}
2439
2440	/*
2441	* Splice seed devices into the sprout fs_devices.
2442	* Generate a new fsid for the sprouted read-write filesystem.
2443	*/
2444	static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
2445	struct btrfs_fs_devices *seed_devices)
2446	{
2447	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2448	struct btrfs_super_block *disk_super = fs_info->super_copy;
2449	struct btrfs_device *device;
2450	u64 super_flags;
2451
2452	/*
2453	* We are updating the fsid, the thread leading to device_list_add()
2454	* could race, so uuid_mutex is needed.
2455	*/
2456	lockdep_assert_held(&uuid_mutex);
2457
2458	/*
2459	* The threads listed below may traverse dev_list but can do that without
2460	* device_list_mutex:
2461	* - All device ops and balance - as we are in btrfs_exclop_start.
2462	* - Various dev_list readers - are using RCU.
2463	* - btrfs_ioctl_fitrim() - is using RCU.
2464	*
2465	* For-read threads as below are using device_list_mutex:
2466	* - Readonly scrub btrfs_scrub_dev()
2467	* - Readonly scrub btrfs_scrub_progress()
2468	* - btrfs_get_dev_stats()
2469	*/
2470	lockdep_assert_held(&fs_devices->device_list_mutex);
2471
2472	list_splice_init_rcu(list: &fs_devices->devices, head: &seed_devices->devices,
2473	sync: synchronize_rcu);
2474	list_for_each_entry(device, &seed_devices->devices, dev_list)
2475	device->fs_devices = seed_devices;
2476
2477	fs_devices->seeding = false;
2478	fs_devices->num_devices = `0`;
2479	fs_devices->open_devices = `0`;
2480	fs_devices->missing_devices = `0`;
2481	fs_devices->rotating = false;
2482	list_add(new: &seed_devices->seed_list, head: &fs_devices->seed_list);
2483
2484	generate_random_uuid(uuid: fs_devices->fsid);
2485	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2486	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2487
2488	super_flags = btrfs_super_flags(s: disk_super) &
2489	~BTRFS_SUPER_FLAG_SEEDING;
2490	btrfs_set_super_flags(s: disk_super, val: super_flags);
2491	}
2492
2493	/*
2494	* Store the expected generation for seed devices in device items.
 *
 * Walks every BTRFS_DEV_ITEM_KEY item under BTRFS_DEV_ITEMS_OBJECTID in the
 * chunk root. Each item is matched to its in-memory btrfs_device by devid,
 * device uuid and fsid; when that device belongs to a seeding fs_devices,
 * device->generation is written into the item and the leaf is marked dirty.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, or the
 * negative error from btrfs_search_slot()/btrfs_next_leaf().
2495	*/
2496	static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2497	{
2498	BTRFS_DEV_LOOKUP_ARGS(args);
2499	struct btrfs_fs_info *fs_info = trans->fs_info;
2500	struct btrfs_root *root = fs_info->chunk_root;
2501	struct btrfs_path *path;
2502	struct extent_buffer *leaf;
2503	struct btrfs_dev_item *dev_item;
2504	struct btrfs_device *device;
2505	struct btrfs_key key;
2506	u8 fs_uuid[BTRFS_FSID_SIZE];
2507	u8 dev_uuid[BTRFS_UUID_SIZE];
2508	int ret;
2509
2510	path = btrfs_alloc_path();
2511	if (!path)
2512	return -ENOMEM;
2513
/* Start the scan at the lowest possible dev item key. */
2514	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2515	key.offset = `0`;
2516	key.type = BTRFS_DEV_ITEM_KEY;
2517
2518	while (`1`) {
/* The chunk metadata reservation only brackets the search itself. */
2519	btrfs_reserve_chunk_metadata(trans, is_item_insertion: false);
2520	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
2521	btrfs_trans_release_chunk_metadata(trans);
2522	if (ret < `0`)
2523	goto error;
2524
2525	leaf = path->nodes[`0`];
2526	next_slot:
2527	if (path->slots[`0`] >= btrfs_header_nritems(eb: leaf)) {
2528	ret = btrfs_next_leaf(root, path);
2529	if (ret > `0`)
2530	break;
2531	if (ret < `0`)
2532	goto error;
/*
 * A new leaf was reached: remember the key at the current slot, drop the
 * path and go back to the top of the loop to search for that key again.
 */
2533	leaf = path->nodes[`0`];
2534	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
2535	btrfs_release_path(p: path);
2536	continue;
2537	}
2538
/* Stop at the first key that is not a dev item. */
2539	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
2540	if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID \|\|
2541	key.type != BTRFS_DEV_ITEM_KEY)
2542	break;
2543
2544	dev_item = btrfs_item_ptr(leaf, path->slots[`0`],
2545	struct btrfs_dev_item);
2546	args.devid = btrfs_device_id(eb: leaf, s: dev_item);
2547	read_extent_buffer(eb: leaf, dst: dev_uuid, start: btrfs_device_uuid(d: dev_item),
2548	BTRFS_UUID_SIZE);
2549	read_extent_buffer(eb: leaf, dst: fs_uuid, start: btrfs_device_fsid(d: dev_item),
2550	BTRFS_FSID_SIZE);
2551	args.uuid = dev_uuid;
2552	args.fsid = fs_uuid;
2553	device = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
2554	BUG_ON(!device); / Logic error /
2555
/* Only devices that belong to a seeding fs_devices are updated. */
2556	if (device->fs_devices->seeding) {
2557	btrfs_set_device_generation(eb: leaf, s: dev_item,
2558	val: device->generation);
2559	btrfs_mark_buffer_dirty(trans, buf: leaf);
2560	}
2561
/* Advance within the same leaf without searching again. */
2562	path->slots[`0`]++;
2563	goto next_slot;
2564	}
2565	ret = `0`;
2566	error:
2567	btrfs_free_path(p: path);
2568	return ret;
2569	}
2570
/*
 * Add the block device at @device_path to the mounted filesystem.
 *
 * The device is opened for writing, checked against the devices already in
 * fs_devices, wrapped in a new btrfs_device and linked into the device and
 * allocation lists; the counters and super_copy are updated and a dev item
 * is inserted, all within one transaction. If the filesystem is currently
 * seeding, it is sprouted first: s_umount and uuid_mutex are taken, the
 * read-only flag is cleared and the existing devices are moved to a private
 * seed fs_devices.
 *
 * Return: 0 on success or a negative errno. -EROFS is returned for a
 * read-only, non-seeding filesystem and -EEXIST when the block device is
 * already part of fs_devices.
 *
 * NOTE(review): the '*' placement in the signature below looks damaged by
 * text extraction; fs_info and device_path are both used as pointers.
 */
2571	int btrfs_init_new_device(struct btrfs_fs_info fs_info, const* char *device_path)
2572	{
2573	struct btrfs_root *root = fs_info->dev_root;
2574	struct btrfs_trans_handle *trans;
2575	struct btrfs_device *device;
2576	struct bdev_handle *bdev_handle;
2577	struct super_block *sb = fs_info->sb;
2578	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2579	struct btrfs_fs_devices *seed_devices = NULL;
2580	u64 orig_super_total_bytes;
2581	u64 orig_super_num_devices;
2582	int ret = `0`;
2583	bool seeding_dev = false;
2584	bool locked = false;
2585
2586	if (sb_rdonly(sb) && !fs_devices->seeding)
2587	return -EROFS;
2588
2589	bdev_handle = bdev_open_by_path(path: device_path, BLK_OPEN_WRITE,
2590	holder: fs_info->bdev_holder, NULL);
2591	if (IS_ERR(ptr: bdev_handle))
2592	return PTR_ERR(ptr: bdev_handle);
2593
2594	if (!btrfs_check_device_zone_type(fs_info, bdev: bdev_handle->bdev)) {
2595	ret = -EINVAL;
2596	goto error;
2597	}
2598
/*
 * Sprouting: take s_umount for writing and uuid_mutex. "locked" tells the
 * error path at the bottom that both still have to be released.
 */
2599	if (fs_devices->seeding) {
2600	seeding_dev = true;
2601	down_write(sem: &sb->s_umount);
2602	mutex_lock(&uuid_mutex);
2603	locked = true;
2604	}
2605
2606	sync_blockdev(bdev: bdev_handle->bdev);
2607
/* Reject a block device that is already in this filesystem's device list. */
2608	rcu_read_lock();
2609	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2610	if (device->bdev == bdev_handle->bdev) {
2611	ret = -EEXIST;
2612	rcu_read_unlock();
2613	goto error;
2614	}
2615	}
2616	rcu_read_unlock();
2617
2618	device = btrfs_alloc_device(fs_info, NULL, NULL, path: device_path);
2619	if (IS_ERR(ptr: device)) {
2620	/ we can safely leave the fs_devices entry around /
2621	ret = PTR_ERR(ptr: device);
2622	goto error;
2623	}
2624
2625	device->fs_info = fs_info;
2626	device->bdev_handle = bdev_handle;
2627	device->bdev = bdev_handle->bdev;
2628	ret = lookup_bdev(pathname: device_path, dev: &device->devt);
2629	if (ret)
2630	goto error_free_device;
2631
2632	ret = btrfs_get_dev_zone_info(device, populate_cache: false);
2633	if (ret)
2634	goto error_free_device;
2635
2636	trans = btrfs_start_transaction(root, num_items: `0`);
2637	if (IS_ERR(ptr: trans)) {
2638	ret = PTR_ERR(ptr: trans);
2639	goto error_free_zone;
2640	}
2641
/*
 * Initialise the in-memory device: writeable, I/O geometry taken from the
 * filesystem sector size, size rounded down to a sector multiple.
 */
2642	set_bit(BTRFS_DEV_STATE_WRITEABLE, addr: &device->dev_state);
2643	device->generation = trans->transid;
2644	device->io_width = fs_info->sectorsize;
2645	device->io_align = fs_info->sectorsize;
2646	device->sector_size = fs_info->sectorsize;
2647	device->total_bytes =
2648	round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
2649	device->disk_total_bytes = device->total_bytes;
2650	device->commit_total_bytes = device->total_bytes;
2651	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, addr: &device->dev_state);
2652	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, addr: &device->dev_state);
2653	device->dev_stats_valid = `1`;
2654	set_blocksize(bdev: device->bdev, BTRFS_BDEV_BLOCKSIZE);
2655
2656	if (seeding_dev) {
2657	btrfs_clear_sb_rdonly(sb);
2658
2659	/ GFP_KERNEL allocation must not be under device_list_mutex /
2660	seed_devices = btrfs_init_sprout(fs_info);
2661	if (IS_ERR(ptr: seed_devices)) {
2662	ret = PTR_ERR(ptr: seed_devices);
2663	btrfs_abort_transaction(trans, ret);
2664	goto error_trans;
2665	}
2666	}
2667
2668	mutex_lock(&fs_devices->device_list_mutex);
2669	if (seeding_dev) {
2670	btrfs_setup_sprout(fs_info, seed_devices);
2671	btrfs_assign_next_active_device(device: fs_info->fs_devices->latest_dev,
2672	next_device: device);
2673	}
2674
2675	device->fs_devices = fs_devices;
2676
/*
 * Publish the device and account for it. The original super_copy values
 * are saved so that error_sysfs can restore them.
 */
2677	mutex_lock(&fs_info->chunk_mutex);
2678	list_add_rcu(new: &device->dev_list, head: &fs_devices->devices);
2679	list_add(new: &device->dev_alloc_list, head: &fs_devices->alloc_list);
2680	fs_devices->num_devices++;
2681	fs_devices->open_devices++;
2682	fs_devices->rw_devices++;
2683	fs_devices->total_devices++;
2684	fs_devices->total_rw_bytes += device->total_bytes;
2685
2686	atomic64_add(i: device->total_bytes, v: &fs_info->free_chunk_space);
2687
2688	if (!bdev_nonrot(bdev: device->bdev))
2689	fs_devices->rotating = true;
2690
2691	orig_super_total_bytes = btrfs_super_total_bytes(s: fs_info->super_copy);
2692	btrfs_set_super_total_bytes(s: fs_info->super_copy,
2693	round_down(orig_super_total_bytes + device->total_bytes,
2694	fs_info->sectorsize));
2695
2696	orig_super_num_devices = btrfs_super_num_devices(s: fs_info->super_copy);
2697	btrfs_set_super_num_devices(s: fs_info->super_copy,
2698	val: orig_super_num_devices + `1`);
2699
2700	/*
2701	* we've got more storage, clear any full flags on the space
2702	* infos
2703	*/
2704	btrfs_clear_space_info_full(info: fs_info);
2705
2706	mutex_unlock(lock: &fs_info->chunk_mutex);
2707
2708	/ Add sysfs device entry /
2709	btrfs_sysfs_add_device(device);
2710
2711	mutex_unlock(lock: &fs_devices->device_list_mutex);
2712
2713	if (seeding_dev) {
2714	mutex_lock(&fs_info->chunk_mutex);
2715	ret = init_first_rw_device(trans);
2716	mutex_unlock(lock: &fs_info->chunk_mutex);
2717	if (ret) {
2718	btrfs_abort_transaction(trans, ret);
2719	goto error_sysfs;
2720	}
2721	}
2722
2723	ret = btrfs_add_dev_item(trans, device);
2724	if (ret) {
2725	btrfs_abort_transaction(trans, ret);
2726	goto error_sysfs;
2727	}
2728
2729	if (seeding_dev) {
2730	ret = btrfs_finish_sprout(trans);
2731	if (ret) {
2732	btrfs_abort_transaction(trans, ret);
2733	goto error_sysfs;
2734	}
2735
2736	/*
2737	* fs_devices now represents the newly sprouted filesystem and
2738	* its fsid has been changed by btrfs_setup_sprout().
2739	*/
2740	btrfs_sysfs_update_sprout_fsid(fs_devices);
2741	}
2742
2743	ret = btrfs_commit_transaction(trans);
2744
2745	if (seeding_dev) {
2746	mutex_unlock(lock: &uuid_mutex);
2747	up_write(sem: &sb->s_umount);
2748	locked = false;
2749
/* A failed commit returns directly, without running the error labels below. */
2750	if (ret) / transaction commit /
2751	return ret;
2752
2753	ret = btrfs_relocate_sys_chunks(fs_info);
2754	if (ret < `0`)
2755	btrfs_handle_fs_error(fs_info, ret,
2756	"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
/* -ENOENT from the attach is treated as success: nothing left to commit. */
2757	trans = btrfs_attach_transaction(root);
2758	if (IS_ERR(ptr: trans)) {
2759	if (PTR_ERR(ptr: trans) == -ENOENT)
2760	return `0`;
2761	ret = PTR_ERR(ptr: trans);
2762	trans = NULL;
2763	goto error_sysfs;
2764	}
2765	ret = btrfs_commit_transaction(trans);
2766	}
2767
2768	/*
2769	* Now that we have written a new super block to this device, check all
2770	* other fs_devices list if device_path alienates any other scanned
2771	* device.
2772	* We can ignore the return value as it typically returns -EINVAL and
2773	* only succeeds if the device was an alien.
2774	*/
2775	btrfs_forget_devices(devt: device->devt);
2776
2777	/ Update ctime/mtime for blkid or udev /
2778	update_dev_time(device_path);
2779
2780	return ret;
2781
/*
 * Unwind labels, ordered from the latest failure point to the earliest.
 * error_sysfs removes the device from sysfs and the lists again and
 * restores the counters and super_copy values changed above. The later
 * labels drop the transaction (trans may be NULL), the zone info, the
 * device and finally the block device handle and any locks still held.
 */
2782	error_sysfs:
2783	btrfs_sysfs_remove_device(device);
2784	mutex_lock(&fs_info->fs_devices->device_list_mutex);
2785	mutex_lock(&fs_info->chunk_mutex);
2786	list_del_rcu(entry: &device->dev_list);
2787	list_del(entry: &device->dev_alloc_list);
2788	fs_info->fs_devices->num_devices--;
2789	fs_info->fs_devices->open_devices--;
2790	fs_info->fs_devices->rw_devices--;
2791	fs_info->fs_devices->total_devices--;
2792	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2793	atomic64_sub(i: device->total_bytes, v: &fs_info->free_chunk_space);
2794	btrfs_set_super_total_bytes(s: fs_info->super_copy,
2795	val: orig_super_total_bytes);
2796	btrfs_set_super_num_devices(s: fs_info->super_copy,
2797	val: orig_super_num_devices);
2798	mutex_unlock(lock: &fs_info->chunk_mutex);
2799	mutex_unlock(lock: &fs_info->fs_devices->device_list_mutex);
2800	error_trans:
2801	if (seeding_dev)
2802	btrfs_set_sb_rdonly(sb);
2803	if (trans)
2804	btrfs_end_transaction(trans);
2805	error_free_zone:
2806	btrfs_destroy_dev_zone_info(device);
2807	error_free_device:
2808	btrfs_free_device(device);
2809	error:
2810	bdev_release(handle: bdev_handle);
2811	if (locked) {
2812	mutex_unlock(lock: &uuid_mutex);
2813	up_write(sem: &sb->s_umount);
2814	}
2815	return ret;
2816	}
2817
2818	static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2819	struct btrfs_device *device)
2820	{
2821	int ret;
2822	struct btrfs_path *path;
2823	struct btrfs_root *root = device->fs_info->chunk_root;
2824	struct btrfs_dev_item *dev_item;
2825	struct extent_buffer *leaf;
2826	struct btrfs_key key;
2827
2828	path = btrfs_alloc_path();
2829	if (!path)
2830	return -ENOMEM;
2831
2832	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2833	key.type = BTRFS_DEV_ITEM_KEY;
2834	key.offset = device->devid;
2835
2836	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
2837	if (ret < `0`)
2838	goto out;
2839
2840	if (ret > `0`) {
2841	ret = -ENOENT;
2842	goto out;
2843	}
2844
2845	leaf = path->nodes[`0`];
2846	dev_item = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_dev_item);
2847
2848	btrfs_set_device_id(eb: leaf, s: dev_item, val: device->devid);
2849	btrfs_set_device_type(eb: leaf, s: dev_item, val: device->type);
2850	btrfs_set_device_io_align(eb: leaf, s: dev_item, val: device->io_align);
2851	btrfs_set_device_io_width(eb: leaf, s: dev_item, val: device->io_width);
2852	btrfs_set_device_sector_size(eb: leaf, s: dev_item, val: device->sector_size);
2853	btrfs_set_device_total_bytes(eb: leaf, s: dev_item,
2854	val: btrfs_device_get_disk_total_bytes(dev: device));
2855	btrfs_set_device_bytes_used(eb: leaf, s: dev_item,
2856	val: btrfs_device_get_bytes_used(dev: device));
2857	btrfs_mark_buffer_dirty(trans, buf: leaf);
2858
2859	out:
2860	btrfs_free_path(p: path);
2861	return ret;
2862	}
2863
2864	int btrfs_grow_device(struct btrfs_trans_handle *trans,
2865	struct btrfs_device *device, u64 new_size)
2866	{
2867	struct btrfs_fs_info *fs_info = device->fs_info;
2868	struct btrfs_super_block *super_copy = fs_info->super_copy;
2869	u64 old_total;
2870	u64 diff;
2871	int ret;
2872
2873	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2874	return -EACCES;
2875
2876	new_size = round_down(new_size, fs_info->sectorsize);
2877
2878	mutex_lock(&fs_info->chunk_mutex);
2879	old_total = btrfs_super_total_bytes(s: super_copy);
2880	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2881
2882	if (new_size <= device->total_bytes \|\|
2883	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2884	mutex_unlock(lock: &fs_info->chunk_mutex);
2885	return -EINVAL;
2886	}
2887
2888	btrfs_set_super_total_bytes(s: super_copy,
2889	round_down(old_total + diff, fs_info->sectorsize));
2890	device->fs_devices->total_rw_bytes += diff;
2891	atomic64_add(i: diff, v: &fs_info->free_chunk_space);
2892
2893	btrfs_device_set_total_bytes(dev: device, size: new_size);
2894	btrfs_device_set_disk_total_bytes(dev: device, size: new_size);
2895	btrfs_clear_space_info_full(info: device->fs_info);
2896	if (list_empty(head: &device->post_commit_list))
2897	list_add_tail(new: &device->post_commit_list,
2898	head: &trans->transaction->dev_update_list);
2899	mutex_unlock(lock: &fs_info->chunk_mutex);
2900
2901	btrfs_reserve_chunk_metadata(trans, is_item_insertion: false);
2902	ret = btrfs_update_device(trans, device);
2903	btrfs_trans_release_chunk_metadata(trans);
2904
2905	return ret;
2906	}
2907
2908	static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2909	{
2910	struct btrfs_fs_info *fs_info = trans->fs_info;
2911	struct btrfs_root *root = fs_info->chunk_root;
2912	int ret;
2913	struct btrfs_path *path;
2914	struct btrfs_key key;
2915
2916	path = btrfs_alloc_path();
2917	if (!path)
2918	return -ENOMEM;
2919
2920	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2921	key.offset = chunk_offset;
2922	key.type = BTRFS_CHUNK_ITEM_KEY;
2923
2924	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
2925	if (ret < `0`)
2926	goto out;
2927	else if (ret > `0`) { / Logic error or corruption /
2928	btrfs_handle_fs_error(fs_info, -ENOENT,
2929	"Failed lookup while freeing chunk.");
2930	ret = -ENOENT;
2931	goto out;
2932	}
2933
2934	ret = btrfs_del_item(trans, root, path);
2935	if (ret < `0`)
2936	btrfs_handle_fs_error(fs_info, ret,
2937	"Failed to delete chunk item.");
2938	out:
2939	btrfs_free_path(p: path);
2940	return ret;
2941	}
2942
2943	static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2944	{
2945	struct btrfs_super_block *super_copy = fs_info->super_copy;
2946	struct btrfs_disk_key *disk_key;
2947	struct btrfs_chunk *chunk;
2948	u8 *ptr;
2949	int ret = `0`;
2950	u32 num_stripes;
2951	u32 array_size;
2952	u32 len = `0`;
2953	u32 cur;
2954	struct btrfs_key key;
2955
2956	lockdep_assert_held(&fs_info->chunk_mutex);
2957	array_size = btrfs_super_sys_array_size(s: super_copy);
2958
2959	ptr = super_copy->sys_chunk_array;
2960	cur = `0`;
2961
2962	while (cur < array_size) {
2963	disk_key = (struct btrfs_disk_key *)ptr;
2964	btrfs_disk_key_to_cpu(cpu_key: &key, disk_key);
2965
2966	len = sizeof(*disk_key);
2967
2968	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2969	chunk = (struct btrfs_chunk *)(ptr + len);
2970	num_stripes = btrfs_stack_chunk_num_stripes(s: chunk);
2971	len += btrfs_chunk_item_size(num_stripes);
2972	} else {
2973	ret = -EIO;
2974	break;
2975	}
2976	if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2977	key.offset == chunk_offset) {
2978	memmove(ptr, ptr + len, array_size - (cur + len));
2979	array_size -= len;
2980	btrfs_set_super_sys_array_size(s: super_copy, val: array_size);
2981	} else {
2982	ptr += len;
2983	cur += len;
2984	}
2985	}
2986	return ret;
2987	}
2988
2989	/*
2990	* Find the mapping containing the given logical extent.
2991	*
2992	* @logical: Logical block offset in bytes.
2993	* @length: Length of extent in bytes.
2994	*
2995	* Return: Chunk mapping or ERR_PTR.
2996	*/
2997	struct extent_map btrfs_get_chunk_map(struct* btrfs_fs_info *fs_info,
2998	u64 logical, u64 length)
2999	{
3000	struct extent_map_tree *em_tree;
3001	struct extent_map *em;
3002
3003	em_tree = &fs_info->mapping_tree;
3004	read_lock(&em_tree->lock);
3005	em = lookup_extent_mapping(tree: em_tree, start: logical, len: length);
3006	read_unlock(&em_tree->lock);
3007
3008	if (!em) {
3009	btrfs_crit(fs_info, "unable to find logical %llu length %llu",
3010	logical, length);
3011	return ERR_PTR(error: -EINVAL);
3012	}
3013
3014	if (em->start > logical \|\| em->start + em->len < logical) {
3015	btrfs_crit(fs_info,
3016	"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
3017	logical, length, em->start, em->start + em->len);
3018	free_extent_map(em);
3019	return ERR_PTR(error: -EINVAL);
3020	}
3021
3022	/ callers are responsible for dropping em's ref. /
3023	return em;
3024	}
3025
3026	static int remove_chunk_item(struct btrfs_trans_handle *trans,
3027	struct map_lookup *map, u64 chunk_offset)
3028	{
3029	int i;
3030
3031	/*
3032	* Removing chunk items and updating the device items in the chunks btree
3033	* requires holding the chunk_mutex.
3034	* See the comment at btrfs_chunk_alloc() for the details.
3035	*/
3036	lockdep_assert_held(&trans->fs_info->chunk_mutex);
3037
3038	for (i = `0`; i < map->num_stripes; i++) {
3039	int ret;
3040
3041	ret = btrfs_update_device(trans, device: map->stripes[i].dev);
3042	if (ret)
3043	return ret;
3044	}
3045
3046	return btrfs_free_chunk(trans, chunk_offset);
3047	}
3048
3049	int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3050	{
3051	struct btrfs_fs_info *fs_info = trans->fs_info;
3052	struct extent_map *em;
3053	struct map_lookup *map;
3054	u64 dev_extent_len = `0`;
3055	int i, ret = `0`;
3056	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3057
3058	em = btrfs_get_chunk_map(fs_info, logical: chunk_offset, length: `1`);
3059	if (IS_ERR(ptr: em)) {
3060	/*
3061	* This is a logic error, but we don't want to just rely on the
3062	* user having built with ASSERT enabled, so if ASSERT doesn't
3063	* do anything we still error out.
3064	*/
3065	ASSERT(`0`);
3066	return PTR_ERR(ptr: em);
3067	}
3068	map = em->map_lookup;
3069
3070	/*
3071	* First delete the device extent items from the devices btree.
3072	* We take the device_list_mutex to avoid racing with the finishing phase
3073	* of a device replace operation. See the comment below before acquiring
3074	* fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3075	* because that can result in a deadlock when deleting the device extent
3076	* items from the devices btree - COWing an extent buffer from the btree
3077	* may result in allocating a new metadata chunk, which would attempt to
3078	* lock again fs_info->chunk_mutex.
3079	*/
3080	mutex_lock(&fs_devices->device_list_mutex);
3081	for (i = `0`; i < map->num_stripes; i++) {
3082	struct btrfs_device *device = map->stripes[i].dev;
3083	ret = btrfs_free_dev_extent(trans, device,
3084	start: map->stripes[i].physical,
3085	dev_extent_len: &dev_extent_len);
3086	if (ret) {
3087	mutex_unlock(lock: &fs_devices->device_list_mutex);
3088	btrfs_abort_transaction(trans, ret);
3089	goto out;
3090	}
3091
3092	if (device->bytes_used > `0`) {
3093	mutex_lock(&fs_info->chunk_mutex);
3094	btrfs_device_set_bytes_used(dev: device,
3095	size: device->bytes_used - dev_extent_len);
3096	atomic64_add(i: dev_extent_len, v: &fs_info->free_chunk_space);
3097	btrfs_clear_space_info_full(info: fs_info);
3098	mutex_unlock(lock: &fs_info->chunk_mutex);
3099	}
3100	}
3101	mutex_unlock(lock: &fs_devices->device_list_mutex);
3102
3103	/*
3104	* We acquire fs_info->chunk_mutex for 2 reasons:
3105	*
3106	* 1) Just like with the first phase of the chunk allocation, we must
3107	* reserve system space, do all chunk btree updates and deletions, and
3108	* update the system chunk array in the superblock while holding this
3109	* mutex. This is for similar reasons as explained on the comment at
3110	* the top of btrfs_chunk_alloc();
3111	*
3112	* 2) Prevent races with the final phase of a device replace operation
3113	* that replaces the device object associated with the map's stripes,
3114	* because the device object's id can change at any time during that
3115	* final phase of the device replace operation
3116	* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3117	* replaced device and then see it with an ID of
3118	* BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3119	* the device item, which does not exists on the chunk btree.
3120	* The finishing phase of device replace acquires both the
3121	* device_list_mutex and the chunk_mutex, in that order, so we are
3122	* safe by just acquiring the chunk_mutex.
3123	*/
3124	trans->removing_chunk = true;
3125	mutex_lock(&fs_info->chunk_mutex);
3126
3127	check_system_chunk(trans, type: map->type);
3128
3129	ret = remove_chunk_item(trans, map, chunk_offset);
3130	/*
3131	* Normally we should not get -ENOSPC since we reserved space before
3132	* through the call to check_system_chunk().
3133	*
3134	* Despite our system space_info having enough free space, we may not
3135	* be able to allocate extents from its block groups, because all have
3136	* an incompatible profile, which will force us to allocate a new system
3137	* block group with the right profile, or right after we called
3138	* check_system_space() above, a scrub turned the only system block group
3139	* with enough free space into RO mode.
3140	* This is explained with more detail at do_chunk_alloc().
3141	*
3142	* So if we get -ENOSPC, allocate a new system chunk and retry once.
3143	*/
3144	if (ret == -ENOSPC) {
3145	const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3146	struct btrfs_block_group *sys_bg;
3147
3148	sys_bg = btrfs_create_chunk(trans, type: sys_flags);
3149	if (IS_ERR(ptr: sys_bg)) {
3150	ret = PTR_ERR(ptr: sys_bg);
3151	btrfs_abort_transaction(trans, ret);
3152	goto out;
3153	}
3154
3155	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg: sys_bg);
3156	if (ret) {
3157	btrfs_abort_transaction(trans, ret);
3158	goto out;
3159	}
3160
3161	ret = remove_chunk_item(trans, map, chunk_offset);
3162	if (ret) {
3163	btrfs_abort_transaction(trans, ret);
3164	goto out;
3165	}
3166	} else if (ret) {
3167	btrfs_abort_transaction(trans, ret);
3168	goto out;
3169	}
3170
3171	trace_btrfs_chunk_free(fs_info, map, offset: chunk_offset, size: em->len);
3172
3173	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3174	ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3175	if (ret) {
3176	btrfs_abort_transaction(trans, ret);
3177	goto out;
3178	}
3179	}
3180
3181	mutex_unlock(lock: &fs_info->chunk_mutex);
3182	trans->removing_chunk = false;
3183
3184	/*
3185	* We are done with chunk btree updates and deletions, so release the
3186	* system space we previously reserved (with check_system_chunk()).
3187	*/
3188	btrfs_trans_release_chunk_metadata(trans);
3189
3190	ret = btrfs_remove_block_group(trans, group_start: chunk_offset, em);
3191	if (ret) {
3192	btrfs_abort_transaction(trans, ret);
3193	goto out;
3194	}
3195
3196	out:
3197	if (trans->removing_chunk) {
3198	mutex_unlock(lock: &fs_info->chunk_mutex);
3199	trans->removing_chunk = false;
3200	}
3201	/ once for us /
3202	free_extent_map(em);
3203	return ret;
3204	}
3205
3206	int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3207	{
3208	struct btrfs_root *root = fs_info->chunk_root;
3209	struct btrfs_trans_handle *trans;
3210	struct btrfs_block_group *block_group;
3211	u64 length;
3212	int ret;
3213
3214	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3215	btrfs_err(fs_info,
3216	"relocate: not supported on extent tree v2 yet");
3217	return -EINVAL;
3218	}
3219
3220	/*
3221	* Prevent races with automatic removal of unused block groups.
3222	* After we relocate and before we remove the chunk with offset
3223	* chunk_offset, automatic removal of the block group can kick in,
3224	* resulting in a failure when calling btrfs_remove_chunk() below.
3225	*
3226	* Make sure to acquire this mutex before doing a tree search (dev
3227	* or chunk trees) to find chunks. Otherwise the cleaner kthread might
3228	* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3229	* we release the path used to search the chunk/dev tree and before
3230	* the current task acquires this mutex and calls us.
3231	*/
3232	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3233
3234	/ step one, relocate all the extents inside this chunk /
3235	btrfs_scrub_pause(fs_info);
3236	ret = btrfs_relocate_block_group(fs_info, group_start: chunk_offset);
3237	btrfs_scrub_continue(fs_info);
3238	if (ret) {
3239	/*
3240	* If we had a transaction abort, stop all running scrubs.
3241	* See transaction.c:cleanup_transaction() why we do it here.
3242	*/
3243	if (BTRFS_FS_ERROR(fs_info))
3244	btrfs_scrub_cancel(info: fs_info);
3245	return ret;
3246	}
3247
3248	block_group = btrfs_lookup_block_group(info: fs_info, bytenr: chunk_offset);
3249	if (!block_group)
3250	return -ENOENT;
3251	btrfs_discard_cancel_work(discard_ctl: &fs_info->discard_ctl, block_group);
3252	length = block_group->length;
3253	btrfs_put_block_group(cache: block_group);
3254
3255	/*
3256	* On a zoned file system, discard the whole block group, this will
3257	* trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3258	* resetting the zone fails, don't treat it as a fatal problem from the
3259	* filesystem's point of view.
3260	*/
3261	if (btrfs_is_zoned(fs_info)) {
3262	ret = btrfs_discard_extent(fs_info, bytenr: chunk_offset, num_bytes: length, NULL);
3263	if (ret)
3264	btrfs_info(fs_info,
3265	"failed to reset zone %llu after relocation",
3266	chunk_offset);
3267	}
3268
3269	trans = btrfs_start_trans_remove_block_group(fs_info: root->fs_info,
3270	chunk_offset);
3271	if (IS_ERR(ptr: trans)) {
3272	ret = PTR_ERR(ptr: trans);
3273	btrfs_handle_fs_error(root->fs_info, ret, NULL);
3274	return ret;
3275	}
3276
3277	/*
3278	* step two, delete the device extents and the
3279	* chunk tree entries
3280	*/
3281	ret = btrfs_remove_chunk(trans, chunk_offset);
3282	btrfs_end_transaction(trans);
3283	return ret;
3284	}
3285
3286	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3287	{
3288	struct btrfs_root *chunk_root = fs_info->chunk_root;
3289	struct btrfs_path *path;
3290	struct extent_buffer *leaf;
3291	struct btrfs_chunk *chunk;
3292	struct btrfs_key key;
3293	struct btrfs_key found_key;
3294	u64 chunk_type;
3295	bool retried = false;
3296	int failed = `0`;
3297	int ret;
3298
3299	path = btrfs_alloc_path();
3300	if (!path)
3301	return -ENOMEM;
3302
3303	again:
3304	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3305	key.offset = (u64)-`1`;
3306	key.type = BTRFS_CHUNK_ITEM_KEY;
3307
3308	while (`1`) {
3309	mutex_lock(&fs_info->reclaim_bgs_lock);
3310	ret = btrfs_search_slot(NULL, root: chunk_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3311	if (ret < `0`) {
3312	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3313	goto error;
3314	}
3315	BUG_ON(ret == `0`); / Corruption /
3316
3317	ret = btrfs_previous_item(root: chunk_root, path, min_objectid: key.objectid,
3318	type: key.type);
3319	if (ret)
3320	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3321	if (ret < `0`)
3322	goto error;
3323	if (ret > `0`)
3324	break;
3325
3326	leaf = path->nodes[`0`];
3327	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: path->slots[`0`]);
3328
3329	chunk = btrfs_item_ptr(leaf, path->slots[`0`],
3330	struct btrfs_chunk);
3331	chunk_type = btrfs_chunk_type(eb: leaf, s: chunk);
3332	btrfs_release_path(p: path);
3333
3334	if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3335	ret = btrfs_relocate_chunk(fs_info, chunk_offset: found_key.offset);
3336	if (ret == -ENOSPC)
3337	failed++;
3338	else
3339	BUG_ON(ret);
3340	}
3341	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3342
3343	if (found_key.offset == `0`)
3344	break;
3345	key.offset = found_key.offset - `1`;
3346	}
3347	ret = `0`;
3348	if (failed && !retried) {
3349	failed = `0`;
3350	retried = true;
3351	goto again;
3352	} else if (WARN_ON(failed && retried)) {
3353	ret = -ENOSPC;
3354	}
3355	error:
3356	btrfs_free_path(p: path);
3357	return ret;
3358	}
3359
3360	/*
3361	* return 1 : allocate a data chunk successfully,
3362	* return <0: errors during allocating a data chunk,
3363	* return 0 : no need to allocate a data chunk.
3364	*/
3365	static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3366	u64 chunk_offset)
3367	{
3368	struct btrfs_block_group *cache;
3369	u64 bytes_used;
3370	u64 chunk_type;
3371
3372	cache = btrfs_lookup_block_group(info: fs_info, bytenr: chunk_offset);
3373	ASSERT(cache);
3374	chunk_type = cache->flags;
3375	btrfs_put_block_group(cache);
3376
3377	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3378	return `0`;
3379
3380	spin_lock(lock: &fs_info->data_sinfo->lock);
3381	bytes_used = fs_info->data_sinfo->bytes_used;
3382	spin_unlock(lock: &fs_info->data_sinfo->lock);
3383
3384	if (!bytes_used) {
3385	struct btrfs_trans_handle *trans;
3386	int ret;
3387
3388	trans = btrfs_join_transaction(root: fs_info->tree_root);
3389	if (IS_ERR(ptr: trans))
3390	return PTR_ERR(ptr: trans);
3391
3392	ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3393	btrfs_end_transaction(trans);
3394	if (ret < `0`)
3395	return ret;
3396	return `1`;
3397	}
3398
3399	return `0`;
3400	}
3401
3402	static int insert_balance_item(struct btrfs_fs_info *fs_info,
3403	struct btrfs_balance_control *bctl)
3404	{
3405	struct btrfs_root *root = fs_info->tree_root;
3406	struct btrfs_trans_handle *trans;
3407	struct btrfs_balance_item *item;
3408	struct btrfs_disk_balance_args disk_bargs;
3409	struct btrfs_path *path;
3410	struct extent_buffer *leaf;
3411	struct btrfs_key key;
3412	int ret, err;
3413
3414	path = btrfs_alloc_path();
3415	if (!path)
3416	return -ENOMEM;
3417
3418	trans = btrfs_start_transaction(root, num_items: `0`);
3419	if (IS_ERR(ptr: trans)) {
3420	btrfs_free_path(p: path);
3421	return PTR_ERR(ptr: trans);
3422	}
3423
3424	key.objectid = BTRFS_BALANCE_OBJECTID;
3425	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3426	key.offset = `0`;
3427
3428	ret = btrfs_insert_empty_item(trans, root, path, key: &key,
3429	data_size: sizeof(*item));
3430	if (ret)
3431	goto out;
3432
3433	leaf = path->nodes[`0`];
3434	item = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_balance_item);
3435
3436	memzero_extent_buffer(eb: leaf, start: (unsigned long)item, len: sizeof(*item));
3437
3438	btrfs_cpu_balance_args_to_disk(disk: &disk_bargs, cpu: &bctl->data);
3439	btrfs_set_balance_data(eb: leaf, bi: item, ba: &disk_bargs);
3440	btrfs_cpu_balance_args_to_disk(disk: &disk_bargs, cpu: &bctl->meta);
3441	btrfs_set_balance_meta(eb: leaf, bi: item, ba: &disk_bargs);
3442	btrfs_cpu_balance_args_to_disk(disk: &disk_bargs, cpu: &bctl->sys);
3443	btrfs_set_balance_sys(eb: leaf, bi: item, ba: &disk_bargs);
3444
3445	btrfs_set_balance_flags(eb: leaf, s: item, val: bctl->flags);
3446
3447	btrfs_mark_buffer_dirty(trans, buf: leaf);
3448	out:
3449	btrfs_free_path(p: path);
3450	err = btrfs_commit_transaction(trans);
3451	if (err && !ret)
3452	ret = err;
3453	return ret;
3454	}
3455
3456	static int del_balance_item(struct btrfs_fs_info *fs_info)
3457	{
3458	struct btrfs_root *root = fs_info->tree_root;
3459	struct btrfs_trans_handle *trans;
3460	struct btrfs_path *path;
3461	struct btrfs_key key;
3462	int ret, err;
3463
3464	path = btrfs_alloc_path();
3465	if (!path)
3466	return -ENOMEM;
3467
3468	trans = btrfs_start_transaction_fallback_global_rsv(root, num_items: `0`);
3469	if (IS_ERR(ptr: trans)) {
3470	btrfs_free_path(p: path);
3471	return PTR_ERR(ptr: trans);
3472	}
3473
3474	key.objectid = BTRFS_BALANCE_OBJECTID;
3475	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3476	key.offset = `0`;
3477
3478	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
3479	if (ret < `0`)
3480	goto out;
3481	if (ret > `0`) {
3482	ret = -ENOENT;
3483	goto out;
3484	}
3485
3486	ret = btrfs_del_item(trans, root, path);
3487	out:
3488	btrfs_free_path(p: path);
3489	err = btrfs_commit_transaction(trans);
3490	if (err && !ret)
3491	ret = err;
3492	return ret;
3493	}
3494
3495	/*
3496	* This is a heuristic used to reduce the number of chunks balanced on
3497	* resume after balance was interrupted.
3498	*/
3499	static void update_balance_args(struct btrfs_balance_control *bctl)
3500	{
3501	/*
3502	* Turn on soft mode for chunk types that were being converted.
3503	*/
3504	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3505	bctl->data.flags \|= BTRFS_BALANCE_ARGS_SOFT;
3506	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3507	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_SOFT;
3508	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3509	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_SOFT;
3510
3511	/*
3512	* Turn on usage filter if is not already used. The idea is
3513	* that chunks that we have already balanced should be
3514	* reasonably full. Don't do it for chunks that are being
3515	* converted - that will keep us from relocating unconverted
3516	* (albeit full) chunks.
3517	*/
3518	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3519	!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3520	!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3521	bctl->data.flags \|= BTRFS_BALANCE_ARGS_USAGE;
3522	bctl->data.usage = `90`;
3523	}
3524	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3525	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3526	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3527	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_USAGE;
3528	bctl->sys.usage = `90`;
3529	}
3530	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3531	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3532	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3533	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_USAGE;
3534	bctl->meta.usage = `90`;
3535	}
3536	}
3537
3538	/*
3539	* Clear the balance status in fs_info and delete the balance item from disk.
3540	*/
3541	static void reset_balance_state(struct btrfs_fs_info *fs_info)
3542	{
3543	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3544	int ret;
3545
3546	BUG_ON(!fs_info->balance_ctl);
3547
3548	spin_lock(lock: &fs_info->balance_lock);
3549	fs_info->balance_ctl = NULL;
3550	spin_unlock(lock: &fs_info->balance_lock);
3551
3552	kfree(objp: bctl);
3553	ret = del_balance_item(fs_info);
3554	if (ret)
3555	btrfs_handle_fs_error(fs_info, ret, NULL);
3556	}
3557
3558	/*
3559	* Balance filters. Return 1 if chunk should be filtered out
3560	* (should not be balanced).
3561	*/
3562	static int chunk_profiles_filter(u64 chunk_type,
3563	struct btrfs_balance_args *bargs)
3564	{
3565	chunk_type = chunk_to_extended(flags: chunk_type) &
3566	BTRFS_EXTENDED_PROFILE_MASK;
3567
3568	if (bargs->profiles & chunk_type)
3569	return `0`;
3570
3571	return `1`;
3572	}
3573
3574	static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3575	struct btrfs_balance_args *bargs)
3576	{
3577	struct btrfs_block_group *cache;
3578	u64 chunk_used;
3579	u64 user_thresh_min;
3580	u64 user_thresh_max;
3581	int ret = `1`;
3582
3583	cache = btrfs_lookup_block_group(info: fs_info, bytenr: chunk_offset);
3584	chunk_used = cache->used;
3585
3586	if (bargs->usage_min == `0`)
3587	user_thresh_min = `0`;
3588	else
3589	user_thresh_min = mult_perc(num: cache->length, percent: bargs->usage_min);
3590
3591	if (bargs->usage_max == `0`)
3592	user_thresh_max = `1`;
3593	else if (bargs->usage_max > `100`)
3594	user_thresh_max = cache->length;
3595	else
3596	user_thresh_max = mult_perc(num: cache->length, percent: bargs->usage_max);
3597
3598	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3599	ret = `0`;
3600
3601	btrfs_put_block_group(cache);
3602	return ret;
3603	}
3604
3605	static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3606	u64 chunk_offset, struct btrfs_balance_args *bargs)
3607	{
3608	struct btrfs_block_group *cache;
3609	u64 chunk_used, user_thresh;
3610	int ret = `1`;
3611
3612	cache = btrfs_lookup_block_group(info: fs_info, bytenr: chunk_offset);
3613	chunk_used = cache->used;
3614
3615	if (bargs->usage_min == `0`)
3616	user_thresh = `1`;
3617	else if (bargs->usage > `100`)
3618	user_thresh = cache->length;
3619	else
3620	user_thresh = mult_perc(num: cache->length, percent: bargs->usage);
3621
3622	if (chunk_used < user_thresh)
3623	ret = `0`;
3624
3625	btrfs_put_block_group(cache);
3626	return ret;
3627	}
3628
3629	static int chunk_devid_filter(struct extent_buffer *leaf,
3630	struct btrfs_chunk *chunk,
3631	struct btrfs_balance_args *bargs)
3632	{
3633	struct btrfs_stripe *stripe;
3634	int num_stripes = btrfs_chunk_num_stripes(eb: leaf, s: chunk);
3635	int i;
3636
3637	for (i = `0`; i < num_stripes; i++) {
3638	stripe = btrfs_stripe_nr(c: chunk, nr: i);
3639	if (btrfs_stripe_devid(eb: leaf, s: stripe) == bargs->devid)
3640	return `0`;
3641	}
3642
3643	return `1`;
3644	}
3645
3646	static u64 calc_data_stripes(u64 type, int num_stripes)
3647	{
3648	const int index = btrfs_bg_flags_to_raid_index(flags: type);
3649	const int ncopies = btrfs_raid_array[index].ncopies;
3650	const int nparity = btrfs_raid_array[index].nparity;
3651
3652	return (num_stripes - nparity) / ncopies;
3653	}
3654
3655	/ [pstart, pend) /
3656	static int chunk_drange_filter(struct extent_buffer *leaf,
3657	struct btrfs_chunk *chunk,
3658	struct btrfs_balance_args *bargs)
3659	{
3660	struct btrfs_stripe *stripe;
3661	int num_stripes = btrfs_chunk_num_stripes(eb: leaf, s: chunk);
3662	u64 stripe_offset;
3663	u64 stripe_length;
3664	u64 type;
3665	int factor;
3666	int i;
3667
3668	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3669	return `0`;
3670
3671	type = btrfs_chunk_type(eb: leaf, s: chunk);
3672	factor = calc_data_stripes(type, num_stripes);
3673
3674	for (i = `0`; i < num_stripes; i++) {
3675	stripe = btrfs_stripe_nr(c: chunk, nr: i);
3676	if (btrfs_stripe_devid(eb: leaf, s: stripe) != bargs->devid)
3677	continue;
3678
3679	stripe_offset = btrfs_stripe_offset(eb: leaf, s: stripe);
3680	stripe_length = btrfs_chunk_length(eb: leaf, s: chunk);
3681	stripe_length = div_u64(dividend: stripe_length, divisor: factor);
3682
3683	if (stripe_offset < bargs->pend &&
3684	stripe_offset + stripe_length > bargs->pstart)
3685	return `0`;
3686	}
3687
3688	return `1`;
3689	}
3690
3691	/ [vstart, vend) /
3692	static int chunk_vrange_filter(struct extent_buffer *leaf,
3693	struct btrfs_chunk *chunk,
3694	u64 chunk_offset,
3695	struct btrfs_balance_args *bargs)
3696	{
3697	if (chunk_offset < bargs->vend &&
3698	chunk_offset + btrfs_chunk_length(eb: leaf, s: chunk) > bargs->vstart)
3699	/ at least part of the chunk is inside this vrange /
3700	return `0`;
3701
3702	return `1`;
3703	}
3704
3705	static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3706	struct btrfs_chunk *chunk,
3707	struct btrfs_balance_args *bargs)
3708	{
3709	int num_stripes = btrfs_chunk_num_stripes(eb: leaf, s: chunk);
3710
3711	if (bargs->stripes_min <= num_stripes
3712	&& num_stripes <= bargs->stripes_max)
3713	return `0`;
3714
3715	return `1`;
3716	}
3717
3718	static int chunk_soft_convert_filter(u64 chunk_type,
3719	struct btrfs_balance_args *bargs)
3720	{
3721	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3722	return `0`;
3723
3724	chunk_type = chunk_to_extended(flags: chunk_type) &
3725	BTRFS_EXTENDED_PROFILE_MASK;
3726
3727	if (bargs->target == chunk_type)
3728	return `1`;
3729
3730	return `0`;
3731	}
3732
3733	static int should_balance_chunk(struct extent_buffer *leaf,
3734	struct btrfs_chunk *chunk, u64 chunk_offset)
3735	{
3736	struct btrfs_fs_info *fs_info = leaf->fs_info;
3737	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3738	struct btrfs_balance_args *bargs = NULL;
3739	u64 chunk_type = btrfs_chunk_type(eb: leaf, s: chunk);
3740
3741	/ type filter /
3742	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3743	(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3744	return `0`;
3745	}
3746
3747	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3748	bargs = &bctl->data;
3749	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3750	bargs = &bctl->sys;
3751	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3752	bargs = &bctl->meta;
3753
3754	/ profiles filter /
3755	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3756	chunk_profiles_filter(chunk_type, bargs)) {
3757	return `0`;
3758	}
3759
3760	/ usage filter /
3761	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3762	chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3763	return `0`;
3764	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3765	chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3766	return `0`;
3767	}
3768
3769	/ devid filter /
3770	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3771	chunk_devid_filter(leaf, chunk, bargs)) {
3772	return `0`;
3773	}
3774
3775	/ drange filter, makes sense only with devid filter /
3776	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3777	chunk_drange_filter(leaf, chunk, bargs)) {
3778	return `0`;
3779	}
3780
3781	/ vrange filter /
3782	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3783	chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3784	return `0`;
3785	}
3786
3787	/ stripes filter /
3788	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3789	chunk_stripes_range_filter(leaf, chunk, bargs)) {
3790	return `0`;
3791	}
3792
3793	/ soft profile changing mode /
3794	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3795	chunk_soft_convert_filter(chunk_type, bargs)) {
3796	return `0`;
3797	}
3798
3799	/*
3800	* limited by count, must be the last filter
3801	*/
3802	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3803	if (bargs->limit == `0`)
3804	return `0`;
3805	else
3806	bargs->limit--;
3807	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3808	/*
3809	* Same logic as the 'limit' filter; the minimum cannot be
3810	* determined here because we do not have the global information
3811	* about the count of all chunks that satisfy the filters.
3812	*/
3813	if (bargs->limit_max == `0`)
3814	return `0`;
3815	else
3816	bargs->limit_max--;
3817	}
3818
3819	return `1`;
3820	}
3821
3822	static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3823	{
3824	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3825	struct btrfs_root *chunk_root = fs_info->chunk_root;
3826	u64 chunk_type;
3827	struct btrfs_chunk *chunk;
3828	struct btrfs_path *path = NULL;
3829	struct btrfs_key key;
3830	struct btrfs_key found_key;
3831	struct extent_buffer *leaf;
3832	int slot;
3833	int ret;
3834	int enospc_errors = `0`;
3835	bool counting = true;
3836	/ The single value limit and min/max limits use the same bytes in the /
3837	u64 limit_data = bctl->data.limit;
3838	u64 limit_meta = bctl->meta.limit;
3839	u64 limit_sys = bctl->sys.limit;
3840	u32 count_data = `0`;
3841	u32 count_meta = `0`;
3842	u32 count_sys = `0`;
3843	int chunk_reserved = `0`;
3844
3845	path = btrfs_alloc_path();
3846	if (!path) {
3847	ret = -ENOMEM;
3848	goto error;
3849	}
3850
3851	/ zero out stat counters /
3852	spin_lock(lock: &fs_info->balance_lock);
3853	memset(&bctl->stat, `0`, sizeof(bctl->stat));
3854	spin_unlock(lock: &fs_info->balance_lock);
3855	again:
3856	if (!counting) {
3857	/*
3858	* The single value limit and min/max limits use the same bytes
3859	* in the
3860	*/
3861	bctl->data.limit = limit_data;
3862	bctl->meta.limit = limit_meta;
3863	bctl->sys.limit = limit_sys;
3864	}
3865	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3866	key.offset = (u64)-`1`;
3867	key.type = BTRFS_CHUNK_ITEM_KEY;
3868
3869	while (`1`) {
3870	if ((!counting && atomic_read(v: &fs_info->balance_pause_req)) \|\|
3871	atomic_read(v: &fs_info->balance_cancel_req)) {
3872	ret = -ECANCELED;
3873	goto error;
3874	}
3875
3876	mutex_lock(&fs_info->reclaim_bgs_lock);
3877	ret = btrfs_search_slot(NULL, root: chunk_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3878	if (ret < `0`) {
3879	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3880	goto error;
3881	}
3882
3883	/*
3884	* this shouldn't happen, it means the last relocate
3885	* failed
3886	*/
3887	if (ret == `0`)
3888	BUG(); / FIXME break ? /
3889
3890	ret = btrfs_previous_item(root: chunk_root, path, min_objectid: `0`,
3891	BTRFS_CHUNK_ITEM_KEY);
3892	if (ret) {
3893	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3894	ret = `0`;
3895	break;
3896	}
3897
3898	leaf = path->nodes[`0`];
3899	slot = path->slots[`0`];
3900	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: slot);
3901
3902	if (found_key.objectid != key.objectid) {
3903	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3904	break;
3905	}
3906
3907	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3908	chunk_type = btrfs_chunk_type(eb: leaf, s: chunk);
3909
3910	if (!counting) {
3911	spin_lock(lock: &fs_info->balance_lock);
3912	bctl->stat.considered++;
3913	spin_unlock(lock: &fs_info->balance_lock);
3914	}
3915
3916	ret = should_balance_chunk(leaf, chunk, chunk_offset: found_key.offset);
3917
3918	btrfs_release_path(p: path);
3919	if (!ret) {
3920	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3921	goto loop;
3922	}
3923
3924	if (counting) {
3925	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3926	spin_lock(lock: &fs_info->balance_lock);
3927	bctl->stat.expected++;
3928	spin_unlock(lock: &fs_info->balance_lock);
3929
3930	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3931	count_data++;
3932	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3933	count_sys++;
3934	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3935	count_meta++;
3936
3937	goto loop;
3938	}
3939
3940	/*
3941	* Apply limit_min filter, no need to check if the LIMITS
3942	* filter is used, limit_min is 0 by default
3943	*/
3944	if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3945	count_data < bctl->data.limit_min)
3946	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3947	count_meta < bctl->meta.limit_min)
3948	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3949	count_sys < bctl->sys.limit_min)) {
3950	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3951	goto loop;
3952	}
3953
3954	if (!chunk_reserved) {
3955	/*
3956	* We may be relocating the only data chunk we have,
3957	* which could potentially end up with losing data's
3958	* raid profile, so lets allocate an empty one in
3959	* advance.
3960	*/
3961	ret = btrfs_may_alloc_data_chunk(fs_info,
3962	chunk_offset: found_key.offset);
3963	if (ret < `0`) {
3964	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3965	goto error;
3966	} else if (ret == `1`) {
3967	chunk_reserved = `1`;
3968	}
3969	}
3970
3971	ret = btrfs_relocate_chunk(fs_info, chunk_offset: found_key.offset);
3972	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
3973	if (ret == -ENOSPC) {
3974	enospc_errors++;
3975	} else if (ret == -ETXTBSY) {
3976	btrfs_info(fs_info,
3977	"skipping relocation of block group %llu due to active swapfile",
3978	found_key.offset);
3979	ret = `0`;
3980	} else if (ret) {
3981	goto error;
3982	} else {
3983	spin_lock(lock: &fs_info->balance_lock);
3984	bctl->stat.completed++;
3985	spin_unlock(lock: &fs_info->balance_lock);
3986	}
3987	loop:
3988	if (found_key.offset == `0`)
3989	break;
3990	key.offset = found_key.offset - `1`;
3991	}
3992
3993	if (counting) {
3994	btrfs_release_path(p: path);
3995	counting = false;
3996	goto again;
3997	}
3998	error:
3999	btrfs_free_path(p: path);
4000	if (enospc_errors) {
4001	btrfs_info(fs_info, "%d enospc errors during balance",
4002	enospc_errors);
4003	if (!ret)
4004	ret = -ENOSPC;
4005	}
4006
4007	return ret;
4008	}
4009
4010	/*
4011	* See if a given profile is valid and reduced.
4012	*
4013	* @flags: profile to validate
4014	* @extended: if true @flags is treated as an extended profile
4015	*/
4016	static int alloc_profile_is_valid(u64 flags, int extended)
4017	{
4018	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
4019	BTRFS_BLOCK_GROUP_PROFILE_MASK);
4020
4021	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
4022
4023	/ 1) check that all other bits are zeroed /
4024	if (flags & ~mask)
4025	return `0`;
4026
4027	/ 2) see if profile is reduced /
4028	if (flags == `0`)
4029	return !extended; / "0" is valid for usual profiles /
4030
4031	return has_single_bit_set(n: flags);
4032	}
4033
4034	/*
4035	* Validate target profile against allowed profiles and return true if it's OK.
4036	* Otherwise print the error message and return false.
4037	*/
4038	static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
4039	const struct btrfs_balance_args *bargs,
4040	u64 allowed, const char *type)
4041	{
4042	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
4043	return true;
4044
4045	/ Profile is valid and does not have bits outside of the allowed set /
4046	if (alloc_profile_is_valid(flags: bargs->target, extended: `1`) &&
4047	(bargs->target & ~allowed) == `0`)
4048	return true;
4049
4050	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4051	type, btrfs_bg_type_to_raid_name(bargs->target));
4052	return false;
4053	}
4054
4055	/*
4056	* Fill @buf with textual description of balance filter flags @bargs, up to
4057	* @size_buf including the terminating null. The output may be trimmed if it
4058	* does not fit into the provided buffer.
4059	*/
4060	static void describe_balance_args(struct btrfs_balance_args bargs, char* *buf,
4061	u32 size_buf)
4062	{
4063	int ret;
4064	u32 size_bp = size_buf;
4065	char *bp = buf;
4066	u64 flags = bargs->flags;
4067	char tmp_buf[`128`] = {`'\0'`};
4068
4069	if (!flags)
4070	return;
4071
4072	#define CHECK_APPEND_NOARG(a) \
4073	do { \
4074	ret = snprintf(bp, size_bp, (a)); \
4075	if (ret < 0 \|\| ret >= size_bp) \
4076	goto out_overflow; \
4077	size_bp -= ret; \
4078	bp += ret; \
4079	} while (0)
4080
4081	#define CHECK_APPEND_1ARG(a, v1) \
4082	do { \
4083	ret = snprintf(bp, size_bp, (a), (v1)); \
4084	if (ret < 0 \|\| ret >= size_bp) \
4085	goto out_overflow; \
4086	size_bp -= ret; \
4087	bp += ret; \
4088	} while (0)
4089
4090	#define CHECK_APPEND_2ARG(a, v1, v2) \
4091	do { \
4092	ret = snprintf(bp, size_bp, (a), (v1), (v2)); \
4093	if (ret < 0 \|\| ret >= size_bp) \
4094	goto out_overflow; \
4095	size_bp -= ret; \
4096	bp += ret; \
4097	} while (0)
4098
4099	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4100	CHECK_APPEND_1ARG("convert=%s,",
4101	btrfs_bg_type_to_raid_name(bargs->target));
4102
4103	if (flags & BTRFS_BALANCE_ARGS_SOFT)
4104	CHECK_APPEND_NOARG("soft,");
4105
4106	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4107	btrfs_describe_block_groups(bg_flags: bargs->profiles, buf: tmp_buf,
4108	size_buf: sizeof(tmp_buf));
4109	CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4110	}
4111
4112	if (flags & BTRFS_BALANCE_ARGS_USAGE)
4113	CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4114
4115	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4116	CHECK_APPEND_2ARG("usage=%u..%u,",
4117	bargs->usage_min, bargs->usage_max);
4118
4119	if (flags & BTRFS_BALANCE_ARGS_DEVID)
4120	CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4121
4122	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4123	CHECK_APPEND_2ARG("drange=%llu..%llu,",
4124	bargs->pstart, bargs->pend);
4125
4126	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4127	CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4128	bargs->vstart, bargs->vend);
4129
4130	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4131	CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4132
4133	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4134	CHECK_APPEND_2ARG("limit=%u..%u,",
4135	bargs->limit_min, bargs->limit_max);
4136
4137	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4138	CHECK_APPEND_2ARG("stripes=%u..%u,",
4139	bargs->stripes_min, bargs->stripes_max);
4140
4141	#undef CHECK_APPEND_2ARG
4142	#undef CHECK_APPEND_1ARG
4143	#undef CHECK_APPEND_NOARG
4144
4145	out_overflow:
4146
4147	if (size_bp < size_buf)
4148	buf[size_buf - size_bp - `1`] = `'\0'`; / remove last , /
4149	else
4150	buf[`0`] = `'\0'`;
4151	}
4152
4153	static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4154	{
4155	u32 size_buf = `1024`;
4156	char tmp_buf[`192`] = {`'\0'`};
4157	char *buf;
4158	char *bp;
4159	u32 size_bp = size_buf;
4160	int ret;
4161	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4162
4163	buf = kzalloc(size: size_buf, GFP_KERNEL);
4164	if (!buf)
4165	return;
4166
4167	bp = buf;
4168
4169	#define CHECK_APPEND_1ARG(a, v1) \
4170	do { \
4171	ret = snprintf(bp, size_bp, (a), (v1)); \
4172	if (ret < 0 \|\| ret >= size_bp) \
4173	goto out_overflow; \
4174	size_bp -= ret; \
4175	bp += ret; \
4176	} while (0)
4177
4178	if (bctl->flags & BTRFS_BALANCE_FORCE)
4179	CHECK_APPEND_1ARG("%s", "-f ");
4180
4181	if (bctl->flags & BTRFS_BALANCE_DATA) {
4182	describe_balance_args(bargs: &bctl->data, buf: tmp_buf, size_buf: sizeof(tmp_buf));
4183	CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4184	}
4185
4186	if (bctl->flags & BTRFS_BALANCE_METADATA) {
4187	describe_balance_args(bargs: &bctl->meta, buf: tmp_buf, size_buf: sizeof(tmp_buf));
4188	CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4189	}
4190
4191	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4192	describe_balance_args(bargs: &bctl->sys, buf: tmp_buf, size_buf: sizeof(tmp_buf));
4193	CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4194	}
4195
4196	#undef CHECK_APPEND_1ARG
4197
4198	out_overflow:
4199
4200	if (size_bp < size_buf)
4201	buf[size_buf - size_bp - `1`] = `'\0'`; / remove last " " /
4202	btrfs_info(fs_info, "balance: %s %s",
4203	(bctl->flags & BTRFS_BALANCE_RESUME) ?
4204	"resume" : "start", buf);
4205
4206	kfree(objp: buf);
4207	}
4208
4209	/*
4210	* Should be called with balance mutexe held
4211	*/
4212	int btrfs_balance(struct btrfs_fs_info *fs_info,
4213	struct btrfs_balance_control *bctl,
4214	struct btrfs_ioctl_balance_args *bargs)
4215	{
4216	u64 meta_target, data_target;
4217	u64 allowed;
4218	int mixed = `0`;
4219	int ret;
4220	u64 num_devices;
4221	unsigned seq;
4222	bool reducing_redundancy;
4223	bool paused = false;
4224	int i;
4225
4226	if (btrfs_fs_closing(fs_info) \|\|
4227	atomic_read(v: &fs_info->balance_pause_req) \|\|
4228	btrfs_should_cancel_balance(fs_info)) {
4229	ret = -EINVAL;
4230	goto out;
4231	}
4232
4233	allowed = btrfs_super_incompat_flags(s: fs_info->super_copy);
4234	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4235	mixed = `1`;
4236
4237	/*
4238	* In case of mixed groups both data and meta should be picked,
4239	* and identical options should be given for both of them.
4240	*/
4241	allowed = BTRFS_BALANCE_DATA \| BTRFS_BALANCE_METADATA;
4242	if (mixed && (bctl->flags & allowed)) {
4243	if (!(bctl->flags & BTRFS_BALANCE_DATA) \|\|
4244	!(bctl->flags & BTRFS_BALANCE_METADATA) \|\|
4245	memcmp(p: &bctl->data, q: &bctl->meta, size: sizeof(bctl->data))) {
4246	btrfs_err(fs_info,
4247	"balance: mixed groups data and metadata options must be the same");
4248	ret = -EINVAL;
4249	goto out;
4250	}
4251	}
4252
4253	/*
4254	* rw_devices will not change at the moment, device add/delete/replace
4255	* are exclusive
4256	*/
4257	num_devices = fs_info->fs_devices->rw_devices;
4258
4259	/*
4260	* SINGLE profile on-disk has no profile bit, but in-memory we have a
4261	* special bit for it, to make it easier to distinguish. Thus we need
4262	* to set it manually, or balance would refuse the profile.
4263	*/
4264	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4265	for (i = `0`; i < ARRAY_SIZE(btrfs_raid_array); i++)
4266	if (num_devices >= btrfs_raid_array[i].devs_min)
4267	allowed \|= btrfs_raid_array[i].bg_flag;
4268
4269	if (!validate_convert_profile(fs_info, bargs: &bctl->data, allowed, type: "data") \|\|
4270	!validate_convert_profile(fs_info, bargs: &bctl->meta, allowed, type: "metadata") \|\|
4271	!validate_convert_profile(fs_info, bargs: &bctl->sys, allowed, type: "system")) {
4272	ret = -EINVAL;
4273	goto out;
4274	}
4275
4276	/*
4277	* Allow to reduce metadata or system integrity only if force set for
4278	* profiles with redundancy (copies, parity)
4279	*/
4280	allowed = `0`;
4281	for (i = `0`; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4282	if (btrfs_raid_array[i].ncopies >= `2` \|\|
4283	btrfs_raid_array[i].tolerated_failures >= `1`)
4284	allowed \|= btrfs_raid_array[i].bg_flag;
4285	}
4286	do {
4287	seq = read_seqbegin(sl: &fs_info->profiles_lock);
4288
4289	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4290	(fs_info->avail_system_alloc_bits & allowed) &&
4291	!(bctl->sys.target & allowed)) \|\|
4292	((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4293	(fs_info->avail_metadata_alloc_bits & allowed) &&
4294	!(bctl->meta.target & allowed)))
4295	reducing_redundancy = true;
4296	else
4297	reducing_redundancy = false;
4298
4299	/ if we're not converting, the target field is uninitialized /
4300	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4301	bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4302	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4303	bctl->data.target : fs_info->avail_data_alloc_bits;
4304	} while (read_seqretry(sl: &fs_info->profiles_lock, start: seq));
4305
4306	if (reducing_redundancy) {
4307	if (bctl->flags & BTRFS_BALANCE_FORCE) {
4308	btrfs_info(fs_info,
4309	"balance: force reducing metadata redundancy");
4310	} else {
4311	btrfs_err(fs_info,
4312	"balance: reduces metadata redundancy, use --force if you want this");
4313	ret = -EINVAL;
4314	goto out;
4315	}
4316	}
4317
4318	if (btrfs_get_num_tolerated_disk_barrier_failures(flags: meta_target) <
4319	btrfs_get_num_tolerated_disk_barrier_failures(flags: data_target)) {
4320	btrfs_warn(fs_info,
4321	"balance: metadata profile %s has lower redundancy than data profile %s",
4322	btrfs_bg_type_to_raid_name(meta_target),
4323	btrfs_bg_type_to_raid_name(data_target));
4324	}
4325
4326	ret = insert_balance_item(fs_info, bctl);
4327	if (ret && ret != -EEXIST)
4328	goto out;
4329
4330	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4331	BUG_ON(ret == -EEXIST);
4332	BUG_ON(fs_info->balance_ctl);
4333	spin_lock(lock: &fs_info->balance_lock);
4334	fs_info->balance_ctl = bctl;
4335	spin_unlock(lock: &fs_info->balance_lock);
4336	} else {
4337	BUG_ON(ret != -EEXIST);
4338	spin_lock(lock: &fs_info->balance_lock);
4339	update_balance_args(bctl);
4340	spin_unlock(lock: &fs_info->balance_lock);
4341	}
4342
4343	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4344	set_bit(nr: BTRFS_FS_BALANCE_RUNNING, addr: &fs_info->flags);
4345	describe_balance_start_or_resume(fs_info);
4346	mutex_unlock(lock: &fs_info->balance_mutex);
4347
4348	ret = __btrfs_balance(fs_info);
4349
4350	mutex_lock(&fs_info->balance_mutex);
4351	if (ret == -ECANCELED && atomic_read(v: &fs_info->balance_pause_req)) {
4352	btrfs_info(fs_info, "balance: paused");
4353	btrfs_exclop_balance(fs_info, op: BTRFS_EXCLOP_BALANCE_PAUSED);
4354	paused = true;
4355	}
4356	/*
4357	* Balance can be canceled by:
4358	*
4359	* - Regular cancel request
4360	* Then ret == -ECANCELED and balance_cancel_req > 0
4361	*
4362	* - Fatal signal to "btrfs" process
4363	* Either the signal caught by wait_reserve_ticket() and callers
4364	* got -EINTR, or caught by btrfs_should_cancel_balance() and
4365	* got -ECANCELED.
4366	* Either way, in this case balance_cancel_req = 0, and
4367	* ret == -EINTR or ret == -ECANCELED.
4368	*
4369	* So here we only check the return value to catch canceled balance.
4370	*/
4371	else if (ret == -ECANCELED \|\| ret == -EINTR)
4372	btrfs_info(fs_info, "balance: canceled");
4373	else
4374	btrfs_info(fs_info, "balance: ended with status: %d", ret);
4375
4376	clear_bit(nr: BTRFS_FS_BALANCE_RUNNING, addr: &fs_info->flags);
4377
4378	if (bargs) {
4379	memset(bargs, `0`, sizeof(*bargs));
4380	btrfs_update_ioctl_balance_args(fs_info, bargs);
4381	}
4382
4383	/ We didn't pause, we can clean everything up. /
4384	if (!paused) {
4385	reset_balance_state(fs_info);
4386	btrfs_exclop_finish(fs_info);
4387	}
4388
4389	wake_up(&fs_info->balance_wait_q);
4390
4391	return ret;
4392	out:
4393	if (bctl->flags & BTRFS_BALANCE_RESUME)
4394	reset_balance_state(fs_info);
4395	else
4396	kfree(objp: bctl);
4397	btrfs_exclop_finish(fs_info);
4398
4399	return ret;
4400	}
4401
4402	static int balance_kthread(void *data)
4403	{
4404	struct btrfs_fs_info *fs_info = data;
4405	int ret = `0`;
4406
4407	sb_start_write(sb: fs_info->sb);
4408	mutex_lock(&fs_info->balance_mutex);
4409	if (fs_info->balance_ctl)
4410	ret = btrfs_balance(fs_info, bctl: fs_info->balance_ctl, NULL);
4411	mutex_unlock(lock: &fs_info->balance_mutex);
4412	sb_end_write(sb: fs_info->sb);
4413
4414	return ret;
4415	}
4416
4417	int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4418	{
4419	struct task_struct *tsk;
4420
4421	mutex_lock(&fs_info->balance_mutex);
4422	if (!fs_info->balance_ctl) {
4423	mutex_unlock(lock: &fs_info->balance_mutex);
4424	return `0`;
4425	}
4426	mutex_unlock(lock: &fs_info->balance_mutex);
4427
4428	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4429	btrfs_info(fs_info, "balance: resume skipped");
4430	return `0`;
4431	}
4432
4433	spin_lock(lock: &fs_info->super_lock);
4434	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
4435	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
4436	spin_unlock(lock: &fs_info->super_lock);
4437	/*
4438	* A ro->rw remount sequence should continue with the paused balance
4439	* regardless of who pauses it, system or the user as of now, so set
4440	* the resume flag.
4441	*/
4442	spin_lock(lock: &fs_info->balance_lock);
4443	fs_info->balance_ctl->flags \|= BTRFS_BALANCE_RESUME;
4444	spin_unlock(lock: &fs_info->balance_lock);
4445
4446	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4447	return PTR_ERR_OR_ZERO(ptr: tsk);
4448	}
4449
4450	int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4451	{
4452	struct btrfs_balance_control *bctl;
4453	struct btrfs_balance_item *item;
4454	struct btrfs_disk_balance_args disk_bargs;
4455	struct btrfs_path *path;
4456	struct extent_buffer *leaf;
4457	struct btrfs_key key;
4458	int ret;
4459
4460	path = btrfs_alloc_path();
4461	if (!path)
4462	return -ENOMEM;
4463
4464	key.objectid = BTRFS_BALANCE_OBJECTID;
4465	key.type = BTRFS_TEMPORARY_ITEM_KEY;
4466	key.offset = `0`;
4467
4468	ret = btrfs_search_slot(NULL, root: fs_info->tree_root, key: &key, p: path, ins_len: `0`, cow: `0`);
4469	if (ret < `0`)
4470	goto out;
4471	if (ret > `0`) { / ret = -ENOENT; /
4472	ret = `0`;
4473	goto out;
4474	}
4475
4476	bctl = kzalloc(size: sizeof(*bctl), GFP_NOFS);
4477	if (!bctl) {
4478	ret = -ENOMEM;
4479	goto out;
4480	}
4481
4482	leaf = path->nodes[`0`];
4483	item = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_balance_item);
4484
4485	bctl->flags = btrfs_balance_flags(eb: leaf, s: item);
4486	bctl->flags \|= BTRFS_BALANCE_RESUME;
4487
4488	btrfs_balance_data(eb: leaf, bi: item, ba: &disk_bargs);
4489	btrfs_disk_balance_args_to_cpu(cpu: &bctl->data, disk: &disk_bargs);
4490	btrfs_balance_meta(eb: leaf, bi: item, ba: &disk_bargs);
4491	btrfs_disk_balance_args_to_cpu(cpu: &bctl->meta, disk: &disk_bargs);
4492	btrfs_balance_sys(eb: leaf, bi: item, ba: &disk_bargs);
4493	btrfs_disk_balance_args_to_cpu(cpu: &bctl->sys, disk: &disk_bargs);
4494
4495	/*
4496	* This should never happen, as the paused balance state is recovered
4497	* during mount without any chance of other exclusive ops to collide.
4498	*
4499	* This gives the exclusive op status to balance and keeps in paused
4500	* state until user intervention (cancel or umount). If the ownership
4501	* cannot be assigned, show a message but do not fail. The balance
4502	* is in a paused state and must have fs_info::balance_ctl properly
4503	* set up.
4504	*/
4505	if (!btrfs_exclop_start(fs_info, type: BTRFS_EXCLOP_BALANCE_PAUSED))
4506	btrfs_warn(fs_info,
4507	"balance: cannot set exclusive op status, resume manually");
4508
4509	btrfs_release_path(p: path);
4510
4511	mutex_lock(&fs_info->balance_mutex);
4512	BUG_ON(fs_info->balance_ctl);
4513	spin_lock(lock: &fs_info->balance_lock);
4514	fs_info->balance_ctl = bctl;
4515	spin_unlock(lock: &fs_info->balance_lock);
4516	mutex_unlock(lock: &fs_info->balance_mutex);
4517	out:
4518	btrfs_free_path(p: path);
4519	return ret;
4520	}
4521
4522	int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4523	{
4524	int ret = `0`;
4525
4526	mutex_lock(&fs_info->balance_mutex);
4527	if (!fs_info->balance_ctl) {
4528	mutex_unlock(lock: &fs_info->balance_mutex);
4529	return -ENOTCONN;
4530	}
4531
4532	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4533	atomic_inc(v: &fs_info->balance_pause_req);
4534	mutex_unlock(lock: &fs_info->balance_mutex);
4535
4536	wait_event(fs_info->balance_wait_q,
4537	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4538
4539	mutex_lock(&fs_info->balance_mutex);
4540	/ we are good with balance_ctl ripped off from under us /
4541	BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4542	atomic_dec(v: &fs_info->balance_pause_req);
4543	} else {
4544	ret = -ENOTCONN;
4545	}
4546
4547	mutex_unlock(lock: &fs_info->balance_mutex);
4548	return ret;
4549	}
4550
4551	int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4552	{
4553	mutex_lock(&fs_info->balance_mutex);
4554	if (!fs_info->balance_ctl) {
4555	mutex_unlock(lock: &fs_info->balance_mutex);
4556	return -ENOTCONN;
4557	}
4558
4559	/*
4560	* A paused balance with the item stored on disk can be resumed at
4561	* mount time if the mount is read-write. Otherwise it's still paused
4562	* and we must not allow cancelling as it deletes the item.
4563	*/
4564	if (sb_rdonly(sb: fs_info->sb)) {
4565	mutex_unlock(lock: &fs_info->balance_mutex);
4566	return -EROFS;
4567	}
4568
4569	atomic_inc(v: &fs_info->balance_cancel_req);
4570	/*
4571	* if we are running just wait and return, balance item is
4572	* deleted in btrfs_balance in this case
4573	*/
4574	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4575	mutex_unlock(lock: &fs_info->balance_mutex);
4576	wait_event(fs_info->balance_wait_q,
4577	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4578	mutex_lock(&fs_info->balance_mutex);
4579	} else {
4580	mutex_unlock(lock: &fs_info->balance_mutex);
4581	/*
4582	* Lock released to allow other waiters to continue, we'll
4583	* reexamine the status again.
4584	*/
4585	mutex_lock(&fs_info->balance_mutex);
4586
4587	if (fs_info->balance_ctl) {
4588	reset_balance_state(fs_info);
4589	btrfs_exclop_finish(fs_info);
4590	btrfs_info(fs_info, "balance: canceled");
4591	}
4592	}
4593
4594	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4595	atomic_dec(v: &fs_info->balance_cancel_req);
4596	mutex_unlock(lock: &fs_info->balance_mutex);
4597	return `0`;
4598	}
4599
4600	int btrfs_uuid_scan_kthread(void *data)
4601	{
4602	struct btrfs_fs_info *fs_info = data;
4603	struct btrfs_root *root = fs_info->tree_root;
4604	struct btrfs_key key;
4605	struct btrfs_path *path = NULL;
4606	int ret = `0`;
4607	struct extent_buffer *eb;
4608	int slot;
4609	struct btrfs_root_item root_item;
4610	u32 item_size;
4611	struct btrfs_trans_handle *trans = NULL;
4612	bool closing = false;
4613
4614	path = btrfs_alloc_path();
4615	if (!path) {
4616	ret = -ENOMEM;
4617	goto out;
4618	}
4619
4620	key.objectid = `0`;
4621	key.type = BTRFS_ROOT_ITEM_KEY;
4622	key.offset = `0`;
4623
4624	while (`1`) {
4625	if (btrfs_fs_closing(fs_info)) {
4626	closing = true;
4627	break;
4628	}
4629	ret = btrfs_search_forward(root, min_key: &key, path,
4630	BTRFS_OLDEST_GENERATION);
4631	if (ret) {
4632	if (ret > `0`)
4633	ret = `0`;
4634	break;
4635	}
4636
4637	if (key.type != BTRFS_ROOT_ITEM_KEY \|\|
4638	(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4639	key.objectid != BTRFS_FS_TREE_OBJECTID) \|\|
4640	key.objectid > BTRFS_LAST_FREE_OBJECTID)
4641	goto skip;
4642
4643	eb = path->nodes[`0`];
4644	slot = path->slots[`0`];
4645	item_size = btrfs_item_size(eb, slot);
4646	if (item_size < sizeof(root_item))
4647	goto skip;
4648
4649	read_extent_buffer(eb, dst: &root_item,
4650	btrfs_item_ptr_offset(eb, slot),
4651	len: (int)sizeof(root_item));
4652	if (btrfs_root_refs(s: &root_item) == `0`)
4653	goto skip;
4654
4655	if (!btrfs_is_empty_uuid(uuid: root_item.uuid) \|\|
4656	!btrfs_is_empty_uuid(uuid: root_item.received_uuid)) {
4657	if (trans)
4658	goto update_tree;
4659
4660	btrfs_release_path(p: path);
4661	/*
4662	* 1 - subvol uuid item
4663	* 1 - received_subvol uuid item
4664	*/
4665	trans = btrfs_start_transaction(root: fs_info->uuid_root, num_items: `2`);
4666	if (IS_ERR(ptr: trans)) {
4667	ret = PTR_ERR(ptr: trans);
4668	break;
4669	}
4670	continue;
4671	} else {
4672	goto skip;
4673	}
4674	update_tree:
4675	btrfs_release_path(p: path);
4676	if (!btrfs_is_empty_uuid(uuid: root_item.uuid)) {
4677	ret = btrfs_uuid_tree_add(trans, uuid: root_item.uuid,
4678	BTRFS_UUID_KEY_SUBVOL,
4679	subid: key.objectid);
4680	if (ret < `0`) {
4681	btrfs_warn(fs_info, "uuid_tree_add failed %d",
4682	ret);
4683	break;
4684	}
4685	}
4686
4687	if (!btrfs_is_empty_uuid(uuid: root_item.received_uuid)) {
4688	ret = btrfs_uuid_tree_add(trans,
4689	uuid: root_item.received_uuid,
4690	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4691	subid: key.objectid);
4692	if (ret < `0`) {
4693	btrfs_warn(fs_info, "uuid_tree_add failed %d",
4694	ret);
4695	break;
4696	}
4697	}
4698
4699	skip:
4700	btrfs_release_path(p: path);
4701	if (trans) {
4702	ret = btrfs_end_transaction(trans);
4703	trans = NULL;
4704	if (ret)
4705	break;
4706	}
4707
4708	if (key.offset < (u64)-`1`) {
4709	key.offset++;
4710	} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4711	key.offset = `0`;
4712	key.type = BTRFS_ROOT_ITEM_KEY;
4713	} else if (key.objectid < (u64)-`1`) {
4714	key.offset = `0`;
4715	key.type = BTRFS_ROOT_ITEM_KEY;
4716	key.objectid++;
4717	} else {
4718	break;
4719	}
4720	cond_resched();
4721	}
4722
4723	out:
4724	btrfs_free_path(p: path);
4725	if (trans && !IS_ERR(ptr: trans))
4726	btrfs_end_transaction(trans);
4727	if (ret)
4728	btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4729	else if (!closing)
4730	set_bit(nr: BTRFS_FS_UPDATE_UUID_TREE_GEN, addr: &fs_info->flags);
4731	up(sem: &fs_info->uuid_tree_rescan_sem);
4732	return `0`;
4733	}
4734
4735	int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4736	{
4737	struct btrfs_trans_handle *trans;
4738	struct btrfs_root *tree_root = fs_info->tree_root;
4739	struct btrfs_root *uuid_root;
4740	struct task_struct *task;
4741	int ret;
4742
4743	/*
4744	* 1 - root node
4745	* 1 - root item
4746	*/
4747	trans = btrfs_start_transaction(root: tree_root, num_items: `2`);
4748	if (IS_ERR(ptr: trans))
4749	return PTR_ERR(ptr: trans);
4750
4751	uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4752	if (IS_ERR(ptr: uuid_root)) {
4753	ret = PTR_ERR(ptr: uuid_root);
4754	btrfs_abort_transaction(trans, ret);
4755	btrfs_end_transaction(trans);
4756	return ret;
4757	}
4758
4759	fs_info->uuid_root = uuid_root;
4760
4761	ret = btrfs_commit_transaction(trans);
4762	if (ret)
4763	return ret;
4764
4765	down(sem: &fs_info->uuid_tree_rescan_sem);
4766	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4767	if (IS_ERR(ptr: task)) {
4768	/ fs_info->update_uuid_tree_gen remains 0 in all error case /
4769	btrfs_warn(fs_info, "failed to start uuid_scan task");
4770	up(sem: &fs_info->uuid_tree_rescan_sem);
4771	return PTR_ERR(ptr: task);
4772	}
4773
4774	return `0`;
4775	}
4776
4777	/*
4778	* shrinking a device means finding all of the device extents past
4779	* the new size, and then following the back refs to the chunks.
4780	* The chunk relocation code actually frees the device extent
4781	*/
4782	int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4783	{
4784	struct btrfs_fs_info *fs_info = device->fs_info;
4785	struct btrfs_root *root = fs_info->dev_root;
4786	struct btrfs_trans_handle *trans;
4787	struct btrfs_dev_extent *dev_extent = NULL;
4788	struct btrfs_path *path;
4789	u64 length;
4790	u64 chunk_offset;
4791	int ret;
4792	int slot;
4793	int failed = `0`;
4794	bool retried = false;
4795	struct extent_buffer *l;
4796	struct btrfs_key key;
4797	struct btrfs_super_block *super_copy = fs_info->super_copy;
4798	u64 old_total = btrfs_super_total_bytes(s: super_copy);
4799	u64 old_size = btrfs_device_get_total_bytes(dev: device);
4800	u64 diff;
4801	u64 start;
4802	u64 free_diff = `0`;
4803
4804	new_size = round_down(new_size, fs_info->sectorsize);
4805	start = new_size;
4806	diff = round_down(old_size - new_size, fs_info->sectorsize);
4807
4808	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4809	return -EINVAL;
4810
4811	path = btrfs_alloc_path();
4812	if (!path)
4813	return -ENOMEM;
4814
4815	path->reada = READA_BACK;
4816
4817	trans = btrfs_start_transaction(root, num_items: `0`);
4818	if (IS_ERR(ptr: trans)) {
4819	btrfs_free_path(p: path);
4820	return PTR_ERR(ptr: trans);
4821	}
4822
4823	mutex_lock(&fs_info->chunk_mutex);
4824
4825	btrfs_device_set_total_bytes(dev: device, size: new_size);
4826	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4827	device->fs_devices->total_rw_bytes -= diff;
4828
4829	/*
4830	* The new free_chunk_space is new_size - used, so we have to
4831	* subtract the delta of the old free_chunk_space which included
4832	* old_size - used. If used > new_size then just subtract this
4833	* entire device's free space.
4834	*/
4835	if (device->bytes_used < new_size)
4836	free_diff = (old_size - device->bytes_used) -
4837	(new_size - device->bytes_used);
4838	else
4839	free_diff = old_size - device->bytes_used;
4840	atomic64_sub(i: free_diff, v: &fs_info->free_chunk_space);
4841	}
4842
4843	/*
4844	* Once the device's size has been set to the new size, ensure all
4845	* in-memory chunks are synced to disk so that the loop below sees them
4846	* and relocates them accordingly.
4847	*/
4848	if (contains_pending_extent(device, start: &start, len: diff)) {
4849	mutex_unlock(lock: &fs_info->chunk_mutex);
4850	ret = btrfs_commit_transaction(trans);
4851	if (ret)
4852	goto done;
4853	} else {
4854	mutex_unlock(lock: &fs_info->chunk_mutex);
4855	btrfs_end_transaction(trans);
4856	}
4857
4858	again:
4859	key.objectid = device->devid;
4860	key.offset = (u64)-`1`;
4861	key.type = BTRFS_DEV_EXTENT_KEY;
4862
4863	do {
4864	mutex_lock(&fs_info->reclaim_bgs_lock);
4865	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
4866	if (ret < `0`) {
4867	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4868	goto done;
4869	}
4870
4871	ret = btrfs_previous_item(root, path, min_objectid: `0`, type: key.type);
4872	if (ret) {
4873	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4874	if (ret < `0`)
4875	goto done;
4876	ret = `0`;
4877	btrfs_release_path(p: path);
4878	break;
4879	}
4880
4881	l = path->nodes[`0`];
4882	slot = path->slots[`0`];
4883	btrfs_item_key_to_cpu(eb: l, cpu_key: &key, nr: path->slots[`0`]);
4884
4885	if (key.objectid != device->devid) {
4886	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4887	btrfs_release_path(p: path);
4888	break;
4889	}
4890
4891	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4892	length = btrfs_dev_extent_length(eb: l, s: dev_extent);
4893
4894	if (key.offset + length <= new_size) {
4895	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4896	btrfs_release_path(p: path);
4897	break;
4898	}
4899
4900	chunk_offset = btrfs_dev_extent_chunk_offset(eb: l, s: dev_extent);
4901	btrfs_release_path(p: path);
4902
4903	/*
4904	* We may be relocating the only data chunk we have,
4905	* which could potentially end up with losing data's
4906	* raid profile, so lets allocate an empty one in
4907	* advance.
4908	*/
4909	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4910	if (ret < `0`) {
4911	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4912	goto done;
4913	}
4914
4915	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4916	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
4917	if (ret == -ENOSPC) {
4918	failed++;
4919	} else if (ret) {
4920	if (ret == -ETXTBSY) {
4921	btrfs_warn(fs_info,
4922	"could not shrink block group %llu due to active swapfile",
4923	chunk_offset);
4924	}
4925	goto done;
4926	}
4927	} while (key.offset-- > `0`);
4928
4929	if (failed && !retried) {
4930	failed = `0`;
4931	retried = true;
4932	goto again;
4933	} else if (failed && retried) {
4934	ret = -ENOSPC;
4935	goto done;
4936	}
4937
4938	/ Shrinking succeeded, else we would be at "done". /
4939	trans = btrfs_start_transaction(root, num_items: `0`);
4940	if (IS_ERR(ptr: trans)) {
4941	ret = PTR_ERR(ptr: trans);
4942	goto done;
4943	}
4944
4945	mutex_lock(&fs_info->chunk_mutex);
4946	/ Clear all state bits beyond the shrunk device size /
4947	clear_extent_bits(tree: &device->alloc_state, start: new_size, end: (u64)-`1`,
4948	CHUNK_STATE_MASK);
4949
4950	btrfs_device_set_disk_total_bytes(dev: device, size: new_size);
4951	if (list_empty(head: &device->post_commit_list))
4952	list_add_tail(new: &device->post_commit_list,
4953	head: &trans->transaction->dev_update_list);
4954
4955	WARN_ON(diff > old_total);
4956	btrfs_set_super_total_bytes(s: super_copy,
4957	round_down(old_total - diff, fs_info->sectorsize));
4958	mutex_unlock(lock: &fs_info->chunk_mutex);
4959
4960	btrfs_reserve_chunk_metadata(trans, is_item_insertion: false);
4961	/ Now btrfs_update_device() will change the on-disk size. /
4962	ret = btrfs_update_device(trans, device);
4963	btrfs_trans_release_chunk_metadata(trans);
4964	if (ret < `0`) {
4965	btrfs_abort_transaction(trans, ret);
4966	btrfs_end_transaction(trans);
4967	} else {
4968	ret = btrfs_commit_transaction(trans);
4969	}
4970	done:
4971	btrfs_free_path(p: path);
4972	if (ret) {
4973	mutex_lock(&fs_info->chunk_mutex);
4974	btrfs_device_set_total_bytes(dev: device, size: old_size);
4975	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4976	device->fs_devices->total_rw_bytes += diff;
4977	atomic64_add(i: free_diff, v: &fs_info->free_chunk_space);
4978	}
4979	mutex_unlock(lock: &fs_info->chunk_mutex);
4980	}
4981	return ret;
4982	}
4983
4984	static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4985	struct btrfs_key *key,
4986	struct btrfs_chunk chunk, int* item_size)
4987	{
4988	struct btrfs_super_block *super_copy = fs_info->super_copy;
4989	struct btrfs_disk_key disk_key;
4990	u32 array_size;
4991	u8 *ptr;
4992
4993	lockdep_assert_held(&fs_info->chunk_mutex);
4994
4995	array_size = btrfs_super_sys_array_size(s: super_copy);
4996	if (array_size + item_size + sizeof(disk_key)
4997	> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4998	return -EFBIG;
4999
5000	ptr = super_copy->sys_chunk_array + array_size;
5001	btrfs_cpu_key_to_disk(disk_key: &disk_key, cpu_key: key);
5002	memcpy(ptr, &disk_key, sizeof(disk_key));
5003	ptr += sizeof(disk_key);
5004	memcpy(ptr, chunk, item_size);
5005	item_size += sizeof(disk_key);
5006	btrfs_set_super_sys_array_size(s: super_copy, val: array_size + item_size);
5007
5008	return `0`;
5009	}
5010
5011	/*
5012	* sort the devices in descending order by max_avail, total_avail
5013	*/
5014	static int btrfs_cmp_device_info(const void a, const* void *b)
5015	{
5016	const struct btrfs_device_info *di_a = a;
5017	const struct btrfs_device_info *di_b = b;
5018
5019	if (di_a->max_avail > di_b->max_avail)
5020	return -`1`;
5021	if (di_a->max_avail < di_b->max_avail)
5022	return `1`;
5023	if (di_a->total_avail > di_b->total_avail)
5024	return -`1`;
5025	if (di_a->total_avail < di_b->total_avail)
5026	return `1`;
5027	return `0`;
5028	}
5029
5030	static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
5031	{
5032	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5033	return;
5034
5035	btrfs_set_fs_incompat(info, RAID56);
5036	}
5037
5038	static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
5039	{
5040	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 \| BTRFS_BLOCK_GROUP_RAID1C4)))
5041	return;
5042
5043	btrfs_set_fs_incompat(info, RAID1C34);
5044	}
5045
5046	/*
5047	* Structure used internally for btrfs_create_chunk() function.
5048	* Wraps needed parameters.
5049	*/
5050	struct alloc_chunk_ctl {
5051	u64 start;
5052	u64 type;
5053	/ Total number of stripes to allocate /
5054	int num_stripes;
5055	/ sub_stripes info for map /
5056	int sub_stripes;
5057	/ Stripes per device /
5058	int dev_stripes;
5059	/ Maximum number of devices to use /
5060	int devs_max;
5061	/ Minimum number of devices to use /
5062	int devs_min;
5063	/ ndevs has to be a multiple of this /
5064	int devs_increment;
5065	/ Number of copies /
5066	int ncopies;
5067	/ Number of stripes worth of bytes to store parity information /
5068	int nparity;
5069	u64 max_stripe_size;
5070	u64 max_chunk_size;
5071	u64 dev_extent_min;
5072	u64 stripe_size;
5073	u64 chunk_size;
5074	int ndevs;
5075	};
5076
5077	static void init_alloc_chunk_ctl_policy_regular(
5078	struct btrfs_fs_devices *fs_devices,
5079	struct alloc_chunk_ctl *ctl)
5080	{
5081	struct btrfs_space_info *space_info;
5082
5083	space_info = btrfs_find_space_info(info: fs_devices->fs_info, flags: ctl->type);
5084	ASSERT(space_info);
5085
5086	ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
5087	ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);
5088
5089	if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
5090	ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
5091
5092	/ We don't want a chunk larger than 10% of writable space /
5093	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, `10`),
5094	ctl->max_chunk_size);
5095	ctl->dev_extent_min = btrfs_stripe_nr_to_offset(stripe_nr: ctl->dev_stripes);
5096	}
5097
5098	static void init_alloc_chunk_ctl_policy_zoned(
5099	struct btrfs_fs_devices *fs_devices,
5100	struct alloc_chunk_ctl *ctl)
5101	{
5102	u64 zone_size = fs_devices->fs_info->zone_size;
5103	u64 limit;
5104	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5105	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5106	u64 min_chunk_size = min_data_stripes * zone_size;
5107	u64 type = ctl->type;
5108
5109	ctl->max_stripe_size = zone_size;
5110	if (type & BTRFS_BLOCK_GROUP_DATA) {
5111	ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5112	zone_size);
5113	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5114	ctl->max_chunk_size = ctl->max_stripe_size;
5115	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5116	ctl->max_chunk_size = `2` * ctl->max_stripe_size;
5117	ctl->devs_max = min_t(int, ctl->devs_max,
5118	BTRFS_MAX_DEVS_SYS_CHUNK);
5119	} else {
5120	BUG();
5121	}
5122
5123	/ We don't want a chunk larger than 10% of writable space /
5124	limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, `10`),
5125	zone_size),
5126	min_chunk_size);
5127	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5128	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5129	}
5130
5131	static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5132	struct alloc_chunk_ctl *ctl)
5133	{
5134	int index = btrfs_bg_flags_to_raid_index(flags: ctl->type);
5135
5136	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5137	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5138	ctl->devs_max = btrfs_raid_array[index].devs_max;
5139	if (!ctl->devs_max)
5140	ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5141	ctl->devs_min = btrfs_raid_array[index].devs_min;
5142	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5143	ctl->ncopies = btrfs_raid_array[index].ncopies;
5144	ctl->nparity = btrfs_raid_array[index].nparity;
5145	ctl->ndevs = `0`;
5146
5147	switch (fs_devices->chunk_alloc_policy) {
5148	case BTRFS_CHUNK_ALLOC_REGULAR:
5149	init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5150	break;
5151	case BTRFS_CHUNK_ALLOC_ZONED:
5152	init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5153	break;
5154	default:
5155	BUG();
5156	}
5157	}
5158
5159	static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5160	struct alloc_chunk_ctl *ctl,
5161	struct btrfs_device_info *devices_info)
5162	{
5163	struct btrfs_fs_info *info = fs_devices->fs_info;
5164	struct btrfs_device *device;
5165	u64 total_avail;
5166	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5167	int ret;
5168	int ndevs = `0`;
5169	u64 max_avail;
5170	u64 dev_offset;
5171
5172	/*
5173	* in the first pass through the devices list, we gather information
5174	* about the available holes on each device.
5175	*/
5176	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5177	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5178	WARN(`1`, KERN_ERR
5179	"BTRFS: read-only device in alloc_list\n");
5180	continue;
5181	}
5182
5183	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5184	&device->dev_state) \|\|
5185	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5186	continue;
5187
5188	if (device->total_bytes > device->bytes_used)
5189	total_avail = device->total_bytes - device->bytes_used;
5190	else
5191	total_avail = `0`;
5192
5193	/ If there is no space on this device, skip it. /
5194	if (total_avail < ctl->dev_extent_min)
5195	continue;
5196
5197	ret = find_free_dev_extent(device, num_bytes: dev_extent_want, start: &dev_offset,
5198	len: &max_avail);
5199	if (ret && ret != -ENOSPC)
5200	return ret;
5201
5202	if (ret == `0`)
5203	max_avail = dev_extent_want;
5204
5205	if (max_avail < ctl->dev_extent_min) {
5206	if (btrfs_test_opt(info, ENOSPC_DEBUG))
5207	btrfs_debug(info,
5208	"%s: devid %llu has no free space, have=%llu want=%llu",
5209	__func__, device->devid, max_avail,
5210	ctl->dev_extent_min);
5211	continue;
5212	}
5213
5214	if (ndevs == fs_devices->rw_devices) {
5215	WARN(`1`, "%s: found more than %llu devices\n",
5216	__func__, fs_devices->rw_devices);
5217	break;
5218	}
5219	devices_info[ndevs].dev_offset = dev_offset;
5220	devices_info[ndevs].max_avail = max_avail;
5221	devices_info[ndevs].total_avail = total_avail;
5222	devices_info[ndevs].dev = device;
5223	++ndevs;
5224	}
5225	ctl->ndevs = ndevs;
5226
5227	/*
5228	* now sort the devices by hole size / available space
5229	*/
5230	sort(base: devices_info, num: ndevs, size: sizeof(struct btrfs_device_info),
5231	cmp_func: btrfs_cmp_device_info, NULL);
5232
5233	return `0`;
5234	}
5235
5236	static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5237	struct btrfs_device_info *devices_info)
5238	{
5239	/ Number of stripes that count for block group size /
5240	int data_stripes;
5241
5242	/*
5243	* The primary goal is to maximize the number of stripes, so use as
5244	* many devices as possible, even if the stripes are not maximum sized.
5245	*
5246	* The DUP profile stores more than one stripe per device, the
5247	* max_avail is the total size so we have to adjust.
5248	*/
5249	ctl->stripe_size = div_u64(dividend: devices_info[ctl->ndevs - `1`].max_avail,
5250	divisor: ctl->dev_stripes);
5251	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5252
5253	/ This will have to be fixed for RAID1 and RAID10 over more drives /
5254	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5255
5256	/*
5257	* Use the number of data stripes to figure out how big this chunk is
5258	* really going to be in terms of logical address space, and compare
5259	* that answer with the max chunk size. If it's higher, we try to
5260	* reduce stripe_size.
5261	*/
5262	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5263	/*
5264	* Reduce stripe_size, round it up to a 16MB boundary again and
5265	* then use it, unless it ends up being even bigger than the
5266	* previous value we had already.
5267	*/
5268	ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5269	data_stripes), SZ_16M),
5270	ctl->stripe_size);
5271	}
5272
5273	/ Stripe size should not go beyond 1G. /
5274	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);
5275
5276	/ Align to BTRFS_STRIPE_LEN /
5277	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5278	ctl->chunk_size = ctl->stripe_size * data_stripes;
5279
5280	return `0`;
5281	}
5282
5283	static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5284	struct btrfs_device_info *devices_info)
5285	{
5286	u64 zone_size = devices_info[`0`].dev->zone_info->zone_size;
5287	/ Number of stripes that count for block group size /
5288	int data_stripes;
5289
5290	/*
5291	* It should hold because:
5292	* dev_extent_min == dev_extent_want == zone_size * dev_stripes
5293	*/
5294	ASSERT(devices_info[ctl->ndevs - `1`].max_avail == ctl->dev_extent_min);
5295
5296	ctl->stripe_size = zone_size;
5297	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5298	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5299
5300	/ stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. /
5301	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5302	ctl->ndevs = div_u64(dividend: div_u64(dividend: ctl->max_chunk_size * ctl->ncopies,
5303	divisor: ctl->stripe_size) + ctl->nparity,
5304	divisor: ctl->dev_stripes);
5305	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5306	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5307	ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5308	}
5309
5310	ctl->chunk_size = ctl->stripe_size * data_stripes;
5311
5312	return `0`;
5313	}
5314
5315	static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5316	struct alloc_chunk_ctl *ctl,
5317	struct btrfs_device_info *devices_info)
5318	{
5319	struct btrfs_fs_info *info = fs_devices->fs_info;
5320
5321	/*
5322	* Round down to number of usable stripes, devs_increment can be any
5323	* number so we can't use round_down() that requires power of 2, while
5324	* rounddown is safe.
5325	*/
5326	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5327
5328	if (ctl->ndevs < ctl->devs_min) {
5329	if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5330	btrfs_debug(info,
5331	"%s: not enough devices with free space: have=%d minimum required=%d",
5332	__func__, ctl->ndevs, ctl->devs_min);
5333	}
5334	return -ENOSPC;
5335	}
5336
5337	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5338
5339	switch (fs_devices->chunk_alloc_policy) {
5340	case BTRFS_CHUNK_ALLOC_REGULAR:
5341	return decide_stripe_size_regular(ctl, devices_info);
5342	case BTRFS_CHUNK_ALLOC_ZONED:
5343	return decide_stripe_size_zoned(ctl, devices_info);
5344	default:
5345	BUG();
5346	}
5347	}
5348
5349	static struct btrfs_block_group create_chunk(struct* btrfs_trans_handle *trans,
5350	struct alloc_chunk_ctl *ctl,
5351	struct btrfs_device_info *devices_info)
5352	{
5353	struct btrfs_fs_info *info = trans->fs_info;
5354	struct map_lookup *map = NULL;
5355	struct extent_map_tree *em_tree;
5356	struct btrfs_block_group *block_group;
5357	struct extent_map *em;
5358	u64 start = ctl->start;
5359	u64 type = ctl->type;
5360	int ret;
5361	int i;
5362	int j;
5363
5364	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5365	if (!map)
5366	return ERR_PTR(error: -ENOMEM);
5367	map->num_stripes = ctl->num_stripes;
5368
5369	for (i = `0`; i < ctl->ndevs; ++i) {
5370	for (j = `0`; j < ctl->dev_stripes; ++j) {
5371	int s = i * ctl->dev_stripes + j;
5372	map->stripes[s].dev = devices_info[i].dev;
5373	map->stripes[s].physical = devices_info[i].dev_offset +
5374	j * ctl->stripe_size;
5375	}
5376	}
5377	map->io_align = BTRFS_STRIPE_LEN;
5378	map->io_width = BTRFS_STRIPE_LEN;
5379	map->type = type;
5380	map->sub_stripes = ctl->sub_stripes;
5381
5382	trace_btrfs_chunk_alloc(fs_info: info, map, offset: start, size: ctl->chunk_size);
5383
5384	em = alloc_extent_map();
5385	if (!em) {
5386	kfree(objp: map);
5387	return ERR_PTR(error: -ENOMEM);
5388	}
5389	set_bit(nr: EXTENT_FLAG_FS_MAPPING, addr: &em->flags);
5390	em->map_lookup = map;
5391	em->start = start;
5392	em->len = ctl->chunk_size;
5393	em->block_start = `0`;
5394	em->block_len = em->len;
5395	em->orig_block_len = ctl->stripe_size;
5396
5397	em_tree = &info->mapping_tree;
5398	write_lock(&em_tree->lock);
5399	ret = add_extent_mapping(tree: em_tree, em, modified: `0`);
5400	if (ret) {
5401	write_unlock(&em_tree->lock);
5402	free_extent_map(em);
5403	return ERR_PTR(error: ret);
5404	}
5405	write_unlock(&em_tree->lock);
5406
5407	block_group = btrfs_make_block_group(trans, type, chunk_offset: start, size: ctl->chunk_size);
5408	if (IS_ERR(ptr: block_group))
5409	goto error_del_extent;
5410
5411	for (i = `0`; i < map->num_stripes; i++) {
5412	struct btrfs_device *dev = map->stripes[i].dev;
5413
5414	btrfs_device_set_bytes_used(dev,
5415	size: dev->bytes_used + ctl->stripe_size);
5416	if (list_empty(head: &dev->post_commit_list))
5417	list_add_tail(new: &dev->post_commit_list,
5418	head: &trans->transaction->dev_update_list);
5419	}
5420
5421	atomic64_sub(i: ctl->stripe_size * map->num_stripes,
5422	v: &info->free_chunk_space);
5423
5424	free_extent_map(em);
5425	check_raid56_incompat_flag(info, type);
5426	check_raid1c34_incompat_flag(info, type);
5427
5428	return block_group;
5429
5430	error_del_extent:
5431	write_lock(&em_tree->lock);
5432	remove_extent_mapping(tree: em_tree, em);
5433	write_unlock(&em_tree->lock);
5434
5435	/ One for our allocation /
5436	free_extent_map(em);
5437	/ One for the tree reference /
5438	free_extent_map(em);
5439
5440	return block_group;
5441	}
5442
5443	struct btrfs_block_group btrfs_create_chunk(struct* btrfs_trans_handle *trans,
5444	u64 type)
5445	{
5446	struct btrfs_fs_info *info = trans->fs_info;
5447	struct btrfs_fs_devices *fs_devices = info->fs_devices;
5448	struct btrfs_device_info *devices_info = NULL;
5449	struct alloc_chunk_ctl ctl;
5450	struct btrfs_block_group *block_group;
5451	int ret;
5452
5453	lockdep_assert_held(&info->chunk_mutex);
5454
5455	if (!alloc_profile_is_valid(flags: type, extended: `0`)) {
5456	ASSERT(`0`);
5457	return ERR_PTR(error: -EINVAL);
5458	}
5459
5460	if (list_empty(head: &fs_devices->alloc_list)) {
5461	if (btrfs_test_opt(info, ENOSPC_DEBUG))
5462	btrfs_debug(info, "%s: no writable device", __func__);
5463	return ERR_PTR(error: -ENOSPC);
5464	}
5465
5466	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5467	btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5468	ASSERT(`0`);
5469	return ERR_PTR(error: -EINVAL);
5470	}
5471
5472	ctl.start = find_next_chunk(fs_info: info);
5473	ctl.type = type;
5474	init_alloc_chunk_ctl(fs_devices, ctl: &ctl);
5475
5476	devices_info = kcalloc(n: fs_devices->rw_devices, size: sizeof(*devices_info),
5477	GFP_NOFS);
5478	if (!devices_info)
5479	return ERR_PTR(error: -ENOMEM);
5480
5481	ret = gather_device_info(fs_devices, ctl: &ctl, devices_info);
5482	if (ret < `0`) {
5483	block_group = ERR_PTR(error: ret);
5484	goto out;
5485	}
5486
5487	ret = decide_stripe_size(fs_devices, ctl: &ctl, devices_info);
5488	if (ret < `0`) {
5489	block_group = ERR_PTR(error: ret);
5490	goto out;
5491	}
5492
5493	block_group = create_chunk(trans, ctl: &ctl, devices_info);
5494
5495	out:
5496	kfree(objp: devices_info);
5497	return block_group;
5498	}
5499
5500	/*
5501	* This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
5502	* phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
5503	* chunks.
5504	*
5505	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5506	* phases.
5507	*/
5508	int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5509	struct btrfs_block_group *bg)
5510	{
5511	struct btrfs_fs_info *fs_info = trans->fs_info;
5512	struct btrfs_root *chunk_root = fs_info->chunk_root;
5513	struct btrfs_key key;
5514	struct btrfs_chunk *chunk;
5515	struct btrfs_stripe *stripe;
5516	struct extent_map *em;
5517	struct map_lookup *map;
5518	size_t item_size;
5519	int i;
5520	int ret;
5521
5522	/*
5523	* We take the chunk_mutex for 2 reasons:
5524	*
5525	* 1) Updates and insertions in the chunk btree must be done while holding
5526	* the chunk_mutex, as well as updating the system chunk array in the
5527	* superblock. See the comment on top of btrfs_chunk_alloc() for the
5528	* details;
5529	*
5530	* 2) To prevent races with the final phase of a device replace operation
5531	* that replaces the device object associated with the map's stripes,
5532	* because the device object's id can change at any time during that
5533	* final phase of the device replace operation
5534	* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5535	* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5536	* which would cause a failure when updating the device item, which does
5537	* not exists, or persisting a stripe of the chunk item with such ID.
5538	* Here we can't use the device_list_mutex because our caller already
5539	* has locked the chunk_mutex, and the final phase of device replace
5540	* acquires both mutexes - first the device_list_mutex and then the
5541	* chunk_mutex. Using any of those two mutexes protects us from a
5542	* concurrent device replace.
5543	*/
5544	lockdep_assert_held(&fs_info->chunk_mutex);
5545
5546	em = btrfs_get_chunk_map(fs_info, logical: bg->start, length: bg->length);
5547	if (IS_ERR(ptr: em)) {
5548	ret = PTR_ERR(ptr: em);
5549	btrfs_abort_transaction(trans, ret);
5550	return ret;
5551	}
5552
5553	map = em->map_lookup;
5554	item_size = btrfs_chunk_item_size(num_stripes: map->num_stripes);
5555
5556	chunk = kzalloc(size: item_size, GFP_NOFS);
5557	if (!chunk) {
5558	ret = -ENOMEM;
5559	btrfs_abort_transaction(trans, ret);
5560	goto out;
5561	}
5562
5563	for (i = `0`; i < map->num_stripes; i++) {
5564	struct btrfs_device *device = map->stripes[i].dev;
5565
5566	ret = btrfs_update_device(trans, device);
5567	if (ret)
5568	goto out;
5569	}
5570
5571	stripe = &chunk->stripe;
5572	for (i = `0`; i < map->num_stripes; i++) {
5573	struct btrfs_device *device = map->stripes[i].dev;
5574	const u64 dev_offset = map->stripes[i].physical;
5575
5576	btrfs_set_stack_stripe_devid(s: stripe, val: device->devid);
5577	btrfs_set_stack_stripe_offset(s: stripe, val: dev_offset);
5578	memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5579	stripe++;
5580	}
5581
5582	btrfs_set_stack_chunk_length(s: chunk, val: bg->length);
5583	btrfs_set_stack_chunk_owner(s: chunk, BTRFS_EXTENT_TREE_OBJECTID);
5584	btrfs_set_stack_chunk_stripe_len(s: chunk, BTRFS_STRIPE_LEN);
5585	btrfs_set_stack_chunk_type(s: chunk, val: map->type);
5586	btrfs_set_stack_chunk_num_stripes(s: chunk, val: map->num_stripes);
5587	btrfs_set_stack_chunk_io_align(s: chunk, BTRFS_STRIPE_LEN);
5588	btrfs_set_stack_chunk_io_width(s: chunk, BTRFS_STRIPE_LEN);
5589	btrfs_set_stack_chunk_sector_size(s: chunk, val: fs_info->sectorsize);
5590	btrfs_set_stack_chunk_sub_stripes(s: chunk, val: map->sub_stripes);
5591
5592	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5593	key.type = BTRFS_CHUNK_ITEM_KEY;
5594	key.offset = bg->start;
5595
5596	ret = btrfs_insert_item(trans, root: chunk_root, key: &key, data: chunk, data_size: item_size);
5597	if (ret)
5598	goto out;
5599
5600	set_bit(nr: BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, addr: &bg->runtime_flags);
5601
5602	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5603	ret = btrfs_add_system_chunk(fs_info, key: &key, chunk, item_size);
5604	if (ret)
5605	goto out;
5606	}
5607
5608	out:
5609	kfree(objp: chunk);
5610	free_extent_map(em);
5611	return ret;
5612	}
5613
5614	static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5615	{
5616	struct btrfs_fs_info *fs_info = trans->fs_info;
5617	u64 alloc_profile;
5618	struct btrfs_block_group *meta_bg;
5619	struct btrfs_block_group *sys_bg;
5620
5621	/*
5622	* When adding a new device for sprouting, the seed device is read-only
5623	* so we must first allocate a metadata and a system chunk. But before
5624	* adding the block group items to the extent, device and chunk btrees,
5625	* we must first:
5626	*
5627	* 1) Create both chunks without doing any changes to the btrees, as
5628	* otherwise we would get -ENOSPC since the block groups from the
5629	* seed device are read-only;
5630	*
5631	* 2) Add the device item for the new sprout device - finishing the setup
5632	* of a new block group requires updating the device item in the chunk
5633	* btree, so it must exist when we attempt to do it. The previous step
5634	* ensures this does not fail with -ENOSPC.
5635	*
5636	* After that we can add the block group items to their btrees:
5637	* update existing device item in the chunk btree, add a new block group
5638	* item to the extent btree, add a new chunk item to the chunk btree and
5639	* finally add the new device extent items to the devices btree.
5640	*/
5641
5642	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5643	meta_bg = btrfs_create_chunk(trans, type: alloc_profile);
5644	if (IS_ERR(ptr: meta_bg))
5645	return PTR_ERR(ptr: meta_bg);
5646
5647	alloc_profile = btrfs_system_alloc_profile(fs_info);
5648	sys_bg = btrfs_create_chunk(trans, type: alloc_profile);
5649	if (IS_ERR(ptr: sys_bg))
5650	return PTR_ERR(ptr: sys_bg);
5651
5652	return `0`;
5653	}
5654
5655	static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5656	{
5657	const int index = btrfs_bg_flags_to_raid_index(flags: map->type);
5658
5659	return btrfs_raid_array[index].tolerated_failures;
5660	}
5661
5662	bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5663	{
5664	struct extent_map *em;
5665	struct map_lookup *map;
5666	int miss_ndevs = `0`;
5667	int i;
5668	bool ret = true;
5669
5670	em = btrfs_get_chunk_map(fs_info, logical: chunk_offset, length: `1`);
5671	if (IS_ERR(ptr: em))
5672	return false;
5673
5674	map = em->map_lookup;
5675	for (i = `0`; i < map->num_stripes; i++) {
5676	if (test_bit(BTRFS_DEV_STATE_MISSING,
5677	&map->stripes[i].dev->dev_state)) {
5678	miss_ndevs++;
5679	continue;
5680	}
5681	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5682	&map->stripes[i].dev->dev_state)) {
5683	ret = false;
5684	goto end;
5685	}
5686	}
5687
5688	/*
5689	* If the number of missing devices is larger than max errors, we can
5690	* not write the data into that chunk successfully.
5691	*/
5692	if (miss_ndevs > btrfs_chunk_max_errors(map))
5693	ret = false;
5694	end:
5695	free_extent_map(em);
5696	return ret;
5697	}
5698
5699	void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5700	{
5701	struct extent_map *em;
5702
5703	while (`1`) {
5704	write_lock(&tree->lock);
5705	em = lookup_extent_mapping(tree, start: `0`, len: (u64)-`1`);
5706	if (em)
5707	remove_extent_mapping(tree, em);
5708	write_unlock(&tree->lock);
5709	if (!em)
5710	break;
5711	/ once for us /
5712	free_extent_map(em);
5713	/ once for the tree /
5714	free_extent_map(em);
5715	}
5716	}
5717
5718	int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5719	{
5720	struct extent_map *em;
5721	struct map_lookup *map;
5722	enum btrfs_raid_types index;
5723	int ret = `1`;
5724
5725	em = btrfs_get_chunk_map(fs_info, logical, length: len);
5726	if (IS_ERR(ptr: em))
5727	/*
5728	* We could return errors for these cases, but that could get
5729	* ugly and we'd probably do the same thing which is just not do
5730	* anything else and exit, so return 1 so the callers don't try
5731	* to use other copies.
5732	*/
5733	return `1`;
5734
5735	map = em->map_lookup;
5736	index = btrfs_bg_flags_to_raid_index(flags: map->type);
5737
5738	/ Non-RAID56, use their ncopies from btrfs_raid_array. /
5739	if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK))
5740	ret = btrfs_raid_array[index].ncopies;
5741	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5742	ret = `2`;
5743	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5744	/*
5745	* There could be two corrupted data stripes, we need
5746	* to loop retry in order to rebuild the correct data.
5747	*
5748	* Fail a stripe at a time on every retry except the
5749	* stripe under reconstruction.
5750	*/
5751	ret = map->num_stripes;
5752	free_extent_map(em);
5753	return ret;
5754	}
5755
5756	unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5757	u64 logical)
5758	{
5759	struct extent_map *em;
5760	struct map_lookup *map;
5761	unsigned long len = fs_info->sectorsize;
5762
5763	if (!btrfs_fs_incompat(fs_info, RAID56))
5764	return len;
5765
5766	em = btrfs_get_chunk_map(fs_info, logical, length: len);
5767
5768	if (!WARN_ON(IS_ERR(em))) {
5769	map = em->map_lookup;
5770	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5771	len = btrfs_stripe_nr_to_offset(stripe_nr: nr_data_stripes(map));
5772	free_extent_map(em);
5773	}
5774	return len;
5775	}
5776
5777	int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5778	{
5779	struct extent_map *em;
5780	struct map_lookup *map;
5781	int ret = `0`;
5782
5783	if (!btrfs_fs_incompat(fs_info, RAID56))
5784	return `0`;
5785
5786	em = btrfs_get_chunk_map(fs_info, logical, length: len);
5787
5788	if(!WARN_ON(IS_ERR(em))) {
5789	map = em->map_lookup;
5790	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5791	ret = `1`;
5792	free_extent_map(em);
5793	}
5794	return ret;
5795	}
5796
5797	static int find_live_mirror(struct btrfs_fs_info *fs_info,
5798	struct map_lookup map, int* first,
5799	int dev_replace_is_ongoing)
5800	{
5801	int i;
5802	int num_stripes;
5803	int preferred_mirror;
5804	int tolerance;
5805	struct btrfs_device *srcdev;
5806
5807	ASSERT((map->type &
5808	(BTRFS_BLOCK_GROUP_RAID1_MASK \| BTRFS_BLOCK_GROUP_RAID10)));
5809
5810	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5811	num_stripes = map->sub_stripes;
5812	else
5813	num_stripes = map->num_stripes;
5814
5815	switch (fs_info->fs_devices->read_policy) {
5816	default:
5817	/ Shouldn't happen, just warn and use pid instead of failing /
5818	btrfs_warn_rl(fs_info,
5819	"unknown read_policy type %u, reset to pid",
5820	fs_info->fs_devices->read_policy);
5821	fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5822	fallthrough;
5823	case BTRFS_READ_POLICY_PID:
5824	preferred_mirror = first + (current->pid % num_stripes);
5825	break;
5826	}
5827
5828	if (dev_replace_is_ongoing &&
5829	fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5830	BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5831	srcdev = fs_info->dev_replace.srcdev;
5832	else
5833	srcdev = NULL;
5834
5835	/*
5836	* try to avoid the drive that is the source drive for a
5837	* dev-replace procedure, only choose it if no other non-missing
5838	* mirror is available
5839	*/
5840	for (tolerance = `0`; tolerance < `2`; tolerance++) {
5841	if (map->stripes[preferred_mirror].dev->bdev &&
5842	(tolerance \|\| map->stripes[preferred_mirror].dev != srcdev))
5843	return preferred_mirror;
5844	for (i = first; i < first + num_stripes; i++) {
5845	if (map->stripes[i].dev->bdev &&
5846	(tolerance \|\| map->stripes[i].dev != srcdev))
5847	return i;
5848	}
5849	}
5850
5851	/ we couldn't find one that doesn't fail. Just return something*
5852	* and the io error handling code will clean up eventually
5853	*/
5854	return preferred_mirror;
5855	}
5856
5857	static struct btrfs_io_context alloc_btrfs_io_context(struct* btrfs_fs_info *fs_info,
5858	u64 logical,
5859	u16 total_stripes)
5860	{
5861	struct btrfs_io_context *bioc;
5862
5863	bioc = kzalloc(
5864	/ The size of btrfs_io_context /
5865	size: sizeof(struct btrfs_io_context) +
5866	/ Plus the variable array for the stripes /
5867	sizeof(struct btrfs_io_stripe) * (total_stripes),
5868	GFP_NOFS);
5869
5870	if (!bioc)
5871	return NULL;
5872
5873	refcount_set(r: &bioc->refs, n: `1`);
5874
5875	bioc->fs_info = fs_info;
5876	bioc->replace_stripe_src = -`1`;
5877	bioc->full_stripe_logical = (u64)-`1`;
5878	bioc->logical = logical;
5879
5880	return bioc;
5881	}
5882
5883	void btrfs_get_bioc(struct btrfs_io_context *bioc)
5884	{
5885	WARN_ON(!refcount_read(&bioc->refs));
5886	refcount_inc(r: &bioc->refs);
5887	}
5888
5889	void btrfs_put_bioc(struct btrfs_io_context *bioc)
5890	{
5891	if (!bioc)
5892	return;
5893	if (refcount_dec_and_test(r: &bioc->refs))
5894	kfree(objp: bioc);
5895	}
5896
5897	/*
5898	* Please note that, discard won't be sent to target device of device
5899	* replace.
5900	*/
5901	struct btrfs_discard_stripe btrfs_map_discard(struct* btrfs_fs_info *fs_info,
5902	u64 logical, u64 *length_ret,
5903	u32 *num_stripes)
5904	{
5905	struct extent_map *em;
5906	struct map_lookup *map;
5907	struct btrfs_discard_stripe *stripes;
5908	u64 length = *length_ret;
5909	u64 offset;
5910	u32 stripe_nr;
5911	u32 stripe_nr_end;
5912	u32 stripe_cnt;
5913	u64 stripe_end_offset;
5914	u64 stripe_offset;
5915	u32 stripe_index;
5916	u32 factor = `0`;
5917	u32 sub_stripes = `0`;
5918	u32 stripes_per_dev = `0`;
5919	u32 remaining_stripes = `0`;
5920	u32 last_stripe = `0`;
5921	int ret;
5922	int i;
5923
5924	em = btrfs_get_chunk_map(fs_info, logical, length);
5925	if (IS_ERR(ptr: em))
5926	return ERR_CAST(ptr: em);
5927
5928	map = em->map_lookup;
5929
5930	/ we don't discard raid56 yet /
5931	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5932	ret = -EOPNOTSUPP;
5933	goto out_free_map;
5934	}
5935
5936	offset = logical - em->start;
5937	length = min_t(u64, em->start + em->len - logical, length);
5938	*length_ret = length;
5939
5940	/*
5941	* stripe_nr counts the total number of stripes we have to stride
5942	* to get to this block
5943	*/
5944	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
5945
5946	/ stripe_offset is the offset of this block in its stripe /
5947	stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);
5948
5949	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
5950	BTRFS_STRIPE_LEN_SHIFT;
5951	stripe_cnt = stripe_nr_end - stripe_nr;
5952	stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr: stripe_nr_end) -
5953	(offset + length);
5954	/*
5955	* after this, stripe_nr is the number of stripes on this
5956	* device we have to walk to find the data, and stripe_index is
5957	* the number of our device in the stripe array
5958	*/
5959	*num_stripes = `1`;
5960	stripe_index = `0`;
5961	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
5962	BTRFS_BLOCK_GROUP_RAID10)) {
5963	if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5964	sub_stripes = `1`;
5965	else
5966	sub_stripes = map->sub_stripes;
5967
5968	factor = map->num_stripes / sub_stripes;
5969	*num_stripes = min_t(u64, map->num_stripes,
5970	sub_stripes * stripe_cnt);
5971	stripe_index = stripe_nr % factor;
5972	stripe_nr /= factor;
5973	stripe_index *= sub_stripes;
5974
5975	remaining_stripes = stripe_cnt % factor;
5976	stripes_per_dev = stripe_cnt / factor;
5977	last_stripe = ((stripe_nr_end - `1`) % factor) * sub_stripes;
5978	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK \|
5979	BTRFS_BLOCK_GROUP_DUP)) {
5980	*num_stripes = map->num_stripes;
5981	} else {
5982	stripe_index = stripe_nr % map->num_stripes;
5983	stripe_nr /= map->num_stripes;
5984	}
5985
5986	stripes = kcalloc(n: num_stripes, size: sizeof(stripes), GFP_NOFS);
5987	if (!stripes) {
5988	ret = -ENOMEM;
5989	goto out_free_map;
5990	}
5991
5992	for (i = `0`; i < *num_stripes; i++) {
5993	stripes[i].physical =
5994	map->stripes[stripe_index].physical +
5995	stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
5996	stripes[i].dev = map->stripes[stripe_index].dev;
5997
5998	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
5999	BTRFS_BLOCK_GROUP_RAID10)) {
6000	stripes[i].length = btrfs_stripe_nr_to_offset(stripe_nr: stripes_per_dev);
6001
6002	if (i / sub_stripes < remaining_stripes)
6003	stripes[i].length += BTRFS_STRIPE_LEN;
6004
6005	/*
6006	* Special for the first stripe and
6007	* the last stripe:
6008	*
6009	* \|-------\|...\|-------\|
6010	* \|----------\|
6011	* off end_off
6012	*/
6013	if (i < sub_stripes)
6014	stripes[i].length -= stripe_offset;
6015
6016	if (stripe_index >= last_stripe &&
6017	stripe_index <= (last_stripe +
6018	sub_stripes - `1`))
6019	stripes[i].length -= stripe_end_offset;
6020
6021	if (i == sub_stripes - `1`)
6022	stripe_offset = `0`;
6023	} else {
6024	stripes[i].length = length;
6025	}
6026
6027	stripe_index++;
6028	if (stripe_index == map->num_stripes) {
6029	stripe_index = `0`;
6030	stripe_nr++;
6031	}
6032	}
6033
6034	free_extent_map(em);
6035	return stripes;
6036	out_free_map:
6037	free_extent_map(em);
6038	return ERR_PTR(error: ret);
6039	}
6040
6041	static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6042	{
6043	struct btrfs_block_group *cache;
6044	bool ret;
6045
6046	/ Non zoned filesystem does not use "to_copy" flag /
6047	if (!btrfs_is_zoned(fs_info))
6048	return false;
6049
6050	cache = btrfs_lookup_block_group(info: fs_info, bytenr: logical);
6051
6052	ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
6053
6054	btrfs_put_block_group(cache);
6055	return ret;
6056	}
6057
6058	static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6059	struct btrfs_io_context *bioc,
6060	struct btrfs_dev_replace *dev_replace,
6061	u64 logical,
6062	int num_stripes_ret, int* *max_errors_ret)
6063	{
6064	u64 srcdev_devid = dev_replace->srcdev->devid;
6065	/*
6066	* At this stage, num_stripes is still the real number of stripes,
6067	* excluding the duplicated stripes.
6068	*/
6069	int num_stripes = *num_stripes_ret;
6070	int nr_extra_stripes = `0`;
6071	int max_errors = *max_errors_ret;
6072	int i;
6073
6074	/*
6075	* A block group which has "to_copy" set will eventually be copied by
6076	* the dev-replace process. We can avoid cloning IO here.
6077	*/
6078	if (is_block_group_to_copy(fs_info: dev_replace->srcdev->fs_info, logical))
6079	return;
6080
6081	/*
6082	* Duplicate the write operations while the dev-replace procedure is
6083	* running. Since the copying of the old disk to the new disk takes
6084	* place at run time while the filesystem is mounted writable, the
6085	* regular write operations to the old disk have to be duplicated to go
6086	* to the new disk as well.
6087	*
6088	* Note that device->missing is handled by the caller, and that the
6089	* write to the old disk is already set up in the stripes array.
6090	*/
6091	for (i = `0`; i < num_stripes; i++) {
6092	struct btrfs_io_stripe *old = &bioc->stripes[i];
6093	struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];
6094
6095	if (old->dev->devid != srcdev_devid)
6096	continue;
6097
6098	new->physical = old->physical;
6099	new->dev = dev_replace->tgtdev;
6100	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
6101	bioc->replace_stripe_src = i;
6102	nr_extra_stripes++;
6103	}
6104
6105	/ We can only have at most 2 extra nr_stripes (for DUP). /
6106	ASSERT(nr_extra_stripes <= `2`);
6107	/*
6108	* For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
6109	* replace.
6110	* If we have 2 extra stripes, only choose the one with smaller physical.
6111	*/
6112	if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == `2`) {
6113	struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
6114	struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + `1`];
6115
6116	/ Only DUP can have two extra stripes. /
6117	ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
6118
6119	/*
6120	* Swap the last stripe stripes and reduce @nr_extra_stripes.
6121	* The extra stripe would still be there, but won't be accessed.
6122	*/
6123	if (first->physical > second->physical) {
6124	swap(second->physical, first->physical);
6125	swap(second->dev, first->dev);
6126	nr_extra_stripes--;
6127	}
6128	}
6129
6130	*num_stripes_ret = num_stripes + nr_extra_stripes;
6131	*max_errors_ret = max_errors + nr_extra_stripes;
6132	bioc->replace_nr_stripes = nr_extra_stripes;
6133	}
6134
6135	static u64 btrfs_max_io_len(struct map_lookup map, enum* btrfs_map_op op,
6136	u64 offset, u32 stripe_nr, u64 stripe_offset,
6137	u64 *full_stripe_start)
6138	{
6139	/*
6140	* Stripe_nr is the stripe where this block falls. stripe_offset is
6141	* the offset of this block in its stripe.
6142	*/
6143	*stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
6144	*stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
6145	ASSERT(*stripe_offset < U32_MAX);
6146
6147	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6148	unsigned long full_stripe_len =
6149	btrfs_stripe_nr_to_offset(stripe_nr: nr_data_stripes(map));
6150
6151	/*
6152	* For full stripe start, we use previously calculated
6153	* @stripe_nr. Align it to nr_data_stripes, then multiply with
6154	* STRIPE_LEN.
6155	*
6156	* By this we can avoid u64 division completely. And we have
6157	* to go rounddown(), not round_down(), as nr_data_stripes is
6158	* not ensured to be power of 2.
6159	*/
6160	*full_stripe_start =
6161	btrfs_stripe_nr_to_offset(
6162	rounddown(*stripe_nr, nr_data_stripes(map)));
6163
6164	ASSERT(*full_stripe_start + full_stripe_len > offset);
6165	ASSERT(*full_stripe_start <= offset);
6166	/*
6167	* For writes to RAID56, allow to write a full stripe set, but
6168	* no straddling of stripe sets.
6169	*/
6170	if (op == BTRFS_MAP_WRITE)
6171	return full_stripe_len - (offset - *full_stripe_start);
6172	}
6173
6174	/*
6175	* For other RAID types and for RAID56 reads, allow a single stripe (on
6176	* a single disk).
6177	*/
6178	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
6179	return BTRFS_STRIPE_LEN - *stripe_offset;
6180	return U64_MAX;
6181	}
6182
6183	static int set_io_stripe(struct btrfs_fs_info fs_info, enum* btrfs_map_op op,
6184	u64 logical, u64 length, struct* btrfs_io_stripe *dst,
6185	struct map_lookup *map, u32 stripe_index,
6186	u64 stripe_offset, u64 stripe_nr)
6187	{
6188	dst->dev = map->stripes[stripe_index].dev;
6189
6190	if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map_type: map->type))
6191	return btrfs_get_raid_extent_offset(fs_info, logical, length,
6192	map_type: map->type, stripe_index, stripe: dst);
6193
6194	dst->physical = map->stripes[stripe_index].physical +
6195	stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
6196	return `0`;
6197	}
6198
6199	/*
6200	* Map one logical range to one or more physical ranges.
6201	*
6202	* @length: (Mandatory) mapped length of this run.
6203	* One logical range can be split into different segments
6204	* due to factors like zones and RAID0/5/6/10 stripe
6205	* boundaries.
6206	*
6207	* @bioc_ret: (Mandatory) returned btrfs_io_context structure.
6208	* which has one or more physical ranges (btrfs_io_stripe)
6209	* recorded inside.
6210	* Caller should call btrfs_put_bioc() to free it after use.
6211	*
6212	* @smap: (Optional) single physical range optimization.
6213	* If the map request can be fulfilled by one single
6214	* physical range, and this is parameter is not NULL,
6215	* then @bioc_ret would be NULL, and @smap would be
6216	* updated.
6217	*
6218	* @mirror_num_ret: (Mandatory) returned mirror number if the original
6219	* value is 0.
6220	*
6221	* Mirror number 0 means to choose any live mirrors.
6222	*
6223	* For non-RAID56 profiles, non-zero mirror_num means
6224	* the Nth mirror. (e.g. mirror_num 1 means the first
6225	* copy).
6226	*
6227	* For RAID56 profile, mirror 1 means rebuild from P and
6228	* the remaining data stripes.
6229	*
6230	* For RAID6 profile, mirror > 2 means mark another
6231	* data/P stripe error and rebuild from the remaining
6232	* stripes..
6233	*/
6234	int btrfs_map_block(struct btrfs_fs_info fs_info, enum* btrfs_map_op op,
6235	u64 logical, u64 *length,
6236	struct btrfs_io_context **bioc_ret,
6237	struct btrfs_io_stripe smap, int* *mirror_num_ret)
6238	{
6239	struct extent_map *em;
6240	struct map_lookup *map;
6241	u64 map_offset;
6242	u64 stripe_offset;
6243	u32 stripe_nr;
6244	u32 stripe_index;
6245	int data_stripes;
6246	int i;
6247	int ret = `0`;
6248	int mirror_num = (mirror_num_ret ? *mirror_num_ret : `0`);
6249	int num_stripes;
6250	int num_copies;
6251	int max_errors = `0`;
6252	struct btrfs_io_context *bioc = NULL;
6253	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6254	int dev_replace_is_ongoing = `0`;
6255	u16 num_alloc_stripes;
6256	u64 raid56_full_stripe_start = (u64)-`1`;
6257	u64 max_len;
6258
6259	ASSERT(bioc_ret);
6260
6261	num_copies = btrfs_num_copies(fs_info, logical, len: fs_info->sectorsize);
6262	if (mirror_num > num_copies)
6263	return -EINVAL;
6264
6265	em = btrfs_get_chunk_map(fs_info, logical, length: *length);
6266	if (IS_ERR(ptr: em))
6267	return PTR_ERR(ptr: em);
6268
6269	map = em->map_lookup;
6270	data_stripes = nr_data_stripes(map);
6271
6272	map_offset = logical - em->start;
6273	max_len = btrfs_max_io_len(map, op, offset: map_offset, stripe_nr: &stripe_nr,
6274	stripe_offset: &stripe_offset, full_stripe_start: &raid56_full_stripe_start);
6275	*length = min_t(u64, em->len - map_offset, max_len);
6276
6277	down_read(sem: &dev_replace->rwsem);
6278	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6279	/*
6280	* Hold the semaphore for read during the whole operation, write is
6281	* requested at commit time but must wait.
6282	*/
6283	if (!dev_replace_is_ongoing)
6284	up_read(sem: &dev_replace->rwsem);
6285
6286	num_stripes = `1`;
6287	stripe_index = `0`;
6288	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6289	stripe_index = stripe_nr % map->num_stripes;
6290	stripe_nr /= map->num_stripes;
6291	if (op == BTRFS_MAP_READ)
6292	mirror_num = `1`;
6293	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6294	if (op != BTRFS_MAP_READ) {
6295	num_stripes = map->num_stripes;
6296	} else if (mirror_num) {
6297	stripe_index = mirror_num - `1`;
6298	} else {
6299	stripe_index = find_live_mirror(fs_info, map, first: `0`,
6300	dev_replace_is_ongoing);
6301	mirror_num = stripe_index + `1`;
6302	}
6303
6304	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6305	if (op != BTRFS_MAP_READ) {
6306	num_stripes = map->num_stripes;
6307	} else if (mirror_num) {
6308	stripe_index = mirror_num - `1`;
6309	} else {
6310	mirror_num = `1`;
6311	}
6312
6313	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6314	u32 factor = map->num_stripes / map->sub_stripes;
6315
6316	stripe_index = (stripe_nr % factor) * map->sub_stripes;
6317	stripe_nr /= factor;
6318
6319	if (op != BTRFS_MAP_READ)
6320	num_stripes = map->sub_stripes;
6321	else if (mirror_num)
6322	stripe_index += mirror_num - `1`;
6323	else {
6324	int old_stripe_index = stripe_index;
6325	stripe_index = find_live_mirror(fs_info, map,
6326	first: stripe_index,
6327	dev_replace_is_ongoing);
6328	mirror_num = stripe_index - old_stripe_index + `1`;
6329	}
6330
6331	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6332	if (op != BTRFS_MAP_READ \|\| mirror_num > `1`) {
6333	/*
6334	* Needs full stripe mapping.
6335	*
6336	* Push stripe_nr back to the start of the full stripe
6337	* For those cases needing a full stripe, @stripe_nr
6338	* is the full stripe number.
6339	*
6340	* Originally we go raid56_full_stripe_start / full_stripe_len,
6341	* but that can be expensive. Here we just divide
6342	* @stripe_nr with @data_stripes.
6343	*/
6344	stripe_nr /= data_stripes;
6345
6346	/ RAID[56] write or recovery. Return all stripes /
6347	num_stripes = map->num_stripes;
6348	max_errors = btrfs_chunk_max_errors(map);
6349
6350	/ Return the length to the full stripe end /
6351	length = min(logical + length,
6352	raid56_full_stripe_start + em->start +
6353	btrfs_stripe_nr_to_offset(data_stripes)) -
6354	logical;
6355	stripe_index = `0`;
6356	stripe_offset = `0`;
6357	} else {
6358	ASSERT(mirror_num <= `1`);
6359	/ Just grab the data stripe directly. /
6360	stripe_index = stripe_nr % data_stripes;
6361	stripe_nr /= data_stripes;
6362
6363	/ We distribute the parity blocks across stripes /
6364	stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
6365	if (op == BTRFS_MAP_READ && mirror_num < `1`)
6366	mirror_num = `1`;
6367	}
6368	} else {
6369	/*
6370	* After this, stripe_nr is the number of stripes on this
6371	* device we have to walk to find the data, and stripe_index is
6372	* the number of our device in the stripe array
6373	*/
6374	stripe_index = stripe_nr % map->num_stripes;
6375	stripe_nr /= map->num_stripes;
6376	mirror_num = stripe_index + `1`;
6377	}
6378	if (stripe_index >= map->num_stripes) {
6379	btrfs_crit(fs_info,
6380	"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6381	stripe_index, map->num_stripes);
6382	ret = -EINVAL;
6383	goto out;
6384	}
6385
6386	num_alloc_stripes = num_stripes;
6387	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6388	op != BTRFS_MAP_READ)
6389	/*
6390	* For replace case, we need to add extra stripes for extra
6391	* duplicated stripes.
6392	*
6393	* For both WRITE and GET_READ_MIRRORS, we may have at most
6394	* 2 more stripes (DUP types, otherwise 1).
6395	*/
6396	num_alloc_stripes += `2`;
6397
6398	/*
6399	* If this I/O maps to a single device, try to return the device and
6400	* physical block information on the stack instead of allocating an
6401	* I/O context structure.
6402	*/
6403	if (smap && num_alloc_stripes == `1` &&
6404	!(btrfs_need_stripe_tree_update(fs_info, map_type: map->type) &&
6405	op != BTRFS_MAP_READ) &&
6406	!((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > `1`)) {
6407	ret = set_io_stripe(fs_info, op, logical, length, dst: smap, map,
6408	stripe_index, stripe_offset, stripe_nr);
6409	if (mirror_num_ret)
6410	*mirror_num_ret = mirror_num;
6411	*bioc_ret = NULL;
6412	goto out;
6413	}
6414
6415	bioc = alloc_btrfs_io_context(fs_info, logical, total_stripes: num_alloc_stripes);
6416	if (!bioc) {
6417	ret = -ENOMEM;
6418	goto out;
6419	}
6420	bioc->map_type = map->type;
6421
6422	/*
6423	* For RAID56 full map, we need to make sure the stripes[] follows the
6424	* rule that data stripes are all ordered, then followed with P and Q
6425	* (if we have).
6426	*
6427	* It's still mostly the same as other profiles, just with extra rotation.
6428	*/
6429	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
6430	(op != BTRFS_MAP_READ \|\| mirror_num > `1`)) {
6431	/*
6432	* For RAID56 @stripe_nr is already the number of full stripes
6433	* before us, which is also the rotation value (needs to modulo
6434	* with num_stripes).
6435	*
6436	* In this case, we just add @stripe_nr with @i, then do the
6437	* modulo, to reduce one modulo call.
6438	*/
6439	bioc->full_stripe_logical = em->start +
6440	btrfs_stripe_nr_to_offset(stripe_nr: stripe_nr * data_stripes);
6441	for (int i = `0`; i < num_stripes; i++) {
6442	ret = set_io_stripe(fs_info, op, logical, length,
6443	dst: &bioc->stripes[i], map,
6444	stripe_index: (i + stripe_nr) % num_stripes,
6445	stripe_offset, stripe_nr);
6446	if (ret < `0`)
6447	break;
6448	}
6449	} else {
6450	/*
6451	* For all other non-RAID56 profiles, just copy the target
6452	* stripe into the bioc.
6453	*/
6454	for (i = `0`; i < num_stripes; i++) {
6455	ret = set_io_stripe(fs_info, op, logical, length,
6456	dst: &bioc->stripes[i], map, stripe_index,
6457	stripe_offset, stripe_nr);
6458	if (ret < `0`)
6459	break;
6460	stripe_index++;
6461	}
6462	}
6463
6464	if (ret) {
6465	*bioc_ret = NULL;
6466	btrfs_put_bioc(bioc);
6467	goto out;
6468	}
6469
6470	if (op != BTRFS_MAP_READ)
6471	max_errors = btrfs_chunk_max_errors(map);
6472
6473	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6474	op != BTRFS_MAP_READ) {
6475	handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
6476	num_stripes_ret: &num_stripes, max_errors_ret: &max_errors);
6477	}
6478
6479	*bioc_ret = bioc;
6480	bioc->num_stripes = num_stripes;
6481	bioc->max_errors = max_errors;
6482	bioc->mirror_num = mirror_num;
6483
6484	out:
6485	if (dev_replace_is_ongoing) {
6486	lockdep_assert_held(&dev_replace->rwsem);
6487	/ Unlock and let waiting writers proceed /
6488	up_read(sem: &dev_replace->rwsem);
6489	}
6490	free_extent_map(em);
6491	return ret;
6492	}
6493
6494	static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
6495	const struct btrfs_fs_devices *fs_devices)
6496	{
6497	if (args->fsid == NULL)
6498	return true;
6499	if (memcmp(p: fs_devices->metadata_uuid, q: args->fsid, BTRFS_FSID_SIZE) == `0`)
6500	return true;
6501	return false;
6502	}
6503
6504	static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
6505	const struct btrfs_device *device)
6506	{
6507	if (args->missing) {
6508	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
6509	!device->bdev)
6510	return true;
6511	return false;
6512	}
6513
6514	if (device->devid != args->devid)
6515	return false;
6516	if (args->uuid && memcmp(p: device->uuid, q: args->uuid, BTRFS_UUID_SIZE) != `0`)
6517	return false;
6518	return true;
6519	}
6520
6521	/*
6522	* Find a device specified by @devid or @uuid in the list of @fs_devices, or
6523	* return NULL.
6524	*
6525	* If devid and uuid are both specified, the match must be exact, otherwise
6526	* only devid is used.
6527	*/
6528	struct btrfs_device btrfs_find_device(const* struct btrfs_fs_devices *fs_devices,
6529	const struct btrfs_dev_lookup_args *args)
6530	{
6531	struct btrfs_device *device;
6532	struct btrfs_fs_devices *seed_devs;
6533
6534	if (dev_args_match_fs_devices(args, fs_devices)) {
6535	list_for_each_entry(device, &fs_devices->devices, dev_list) {
6536	if (dev_args_match_device(args, device))
6537	return device;
6538	}
6539	}
6540
6541	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6542	if (!dev_args_match_fs_devices(args, fs_devices: seed_devs))
6543	continue;
6544	list_for_each_entry(device, &seed_devs->devices, dev_list) {
6545	if (dev_args_match_device(args, device))
6546	return device;
6547	}
6548	}
6549
6550	return NULL;
6551	}
6552
6553	static struct btrfs_device add_missing_dev(struct* btrfs_fs_devices *fs_devices,
6554	u64 devid, u8 *dev_uuid)
6555	{
6556	struct btrfs_device *device;
6557	unsigned int nofs_flag;
6558
6559	/*
6560	* We call this under the chunk_mutex, so we want to use NOFS for this
6561	* allocation, however we don't want to change btrfs_alloc_device() to
6562	* always do NOFS because we use it in a lot of other GFP_KERNEL safe
6563	* places.
6564	*/
6565
6566	nofs_flag = memalloc_nofs_save();
6567	device = btrfs_alloc_device(NULL, devid: &devid, uuid: dev_uuid, NULL);
6568	memalloc_nofs_restore(flags: nofs_flag);
6569	if (IS_ERR(ptr: device))
6570	return device;
6571
6572	list_add(new: &device->dev_list, head: &fs_devices->devices);
6573	device->fs_devices = fs_devices;
6574	fs_devices->num_devices++;
6575
6576	set_bit(BTRFS_DEV_STATE_MISSING, addr: &device->dev_state);
6577	fs_devices->missing_devices++;
6578
6579	return device;
6580	}
6581
6582	/*
6583	* Allocate new device struct, set up devid and UUID.
6584	*
6585	* @fs_info: used only for generating a new devid, can be NULL if
6586	* devid is provided (i.e. @devid != NULL).
6587	* @devid: a pointer to devid for this device. If NULL a new devid
6588	* is generated.
6589	* @uuid: a pointer to UUID for this device. If NULL a new UUID
6590	* is generated.
6591	* @path: a pointer to device path if available, NULL otherwise.
6592	*
6593	* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6594	* on error. Returned struct is not linked onto any lists and must be
6595	* destroyed with btrfs_free_device.
6596	*/
6597	struct btrfs_device btrfs_alloc_device(struct* btrfs_fs_info *fs_info,
6598	const u64 devid, const* u8 *uuid,
6599	const char *path)
6600	{
6601	struct btrfs_device *dev;
6602	u64 tmp;
6603
6604	if (WARN_ON(!devid && !fs_info))
6605	return ERR_PTR(error: -EINVAL);
6606
6607	dev = kzalloc(size: sizeof(*dev), GFP_KERNEL);
6608	if (!dev)
6609	return ERR_PTR(error: -ENOMEM);
6610
6611	INIT_LIST_HEAD(list: &dev->dev_list);
6612	INIT_LIST_HEAD(list: &dev->dev_alloc_list);
6613	INIT_LIST_HEAD(list: &dev->post_commit_list);
6614
6615	atomic_set(v: &dev->dev_stats_ccnt, i: `0`);
6616	btrfs_device_data_ordered_init(dev);
6617	extent_io_tree_init(fs_info, tree: &dev->alloc_state, owner: IO_TREE_DEVICE_ALLOC_STATE);
6618
6619	if (devid)
6620	tmp = *devid;
6621	else {
6622	int ret;
6623
6624	ret = find_next_devid(fs_info, devid_ret: &tmp);
6625	if (ret) {
6626	btrfs_free_device(device: dev);
6627	return ERR_PTR(error: ret);
6628	}
6629	}
6630	dev->devid = tmp;
6631
6632	if (uuid)
6633	memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6634	else
6635	generate_random_uuid(uuid: dev->uuid);
6636
6637	if (path) {
6638	struct rcu_string *name;
6639
6640	name = rcu_string_strdup(src: path, GFP_KERNEL);
6641	if (!name) {
6642	btrfs_free_device(device: dev);
6643	return ERR_PTR(error: -ENOMEM);
6644	}
6645	rcu_assign_pointer(dev->name, name);
6646	}
6647
6648	return dev;
6649	}
6650
6651	static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6652	u64 devid, u8 *uuid, bool error)
6653	{
6654	if (error)
6655	btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6656	devid, uuid);
6657	else
6658	btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6659	devid, uuid);
6660	}
6661
6662	u64 btrfs_calc_stripe_length(const struct extent_map *em)
6663	{
6664	const struct map_lookup *map = em->map_lookup;
6665	const int data_stripes = calc_data_stripes(type: map->type, num_stripes: map->num_stripes);
6666
6667	return div_u64(dividend: em->len, divisor: data_stripes);
6668	}
6669
6670	#if BITS_PER_LONG == 32
6671	/*
6672	* Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6673	* can't be accessed on 32bit systems.
6674	*
6675	* This function do mount time check to reject the fs if it already has
6676	* metadata chunk beyond that limit.
6677	*/
6678	static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6679	u64 logical, u64 length, u64 type)
6680	{
6681	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6682	return `0`;
6683
6684	if (logical + length < MAX_LFS_FILESIZE)
6685	return `0`;
6686
6687	btrfs_err_32bit_limit(fs_info);
6688	return -EOVERFLOW;
6689	}
6690
6691	/*
6692	* This is to give early warning for any metadata chunk reaching
6693	* BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6694	* Although we can still access the metadata, it's not going to be possible
6695	* once the limit is reached.
6696	*/
6697	static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6698	u64 logical, u64 length, u64 type)
6699	{
6700	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6701	return;
6702
6703	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6704	return;
6705
6706	btrfs_warn_32bit_limit(fs_info);
6707	}
6708	#endif
6709
6710	static struct btrfs_device handle_missing_device(struct* btrfs_fs_info *fs_info,
6711	u64 devid, u8 *uuid)
6712	{
6713	struct btrfs_device *dev;
6714
6715	if (!btrfs_test_opt(fs_info, DEGRADED)) {
6716	btrfs_report_missing_device(fs_info, devid, uuid, error: true);
6717	return ERR_PTR(error: -ENOENT);
6718	}
6719
6720	dev = add_missing_dev(fs_devices: fs_info->fs_devices, devid, dev_uuid: uuid);
6721	if (IS_ERR(ptr: dev)) {
6722	btrfs_err(fs_info, "failed to init missing device %llu: %ld",
6723	devid, PTR_ERR(dev));
6724	return dev;
6725	}
6726	btrfs_report_missing_device(fs_info, devid, uuid, error: false);
6727
6728	return dev;
6729	}
6730
6731	static int read_one_chunk(struct btrfs_key key, struct* extent_buffer *leaf,
6732	struct btrfs_chunk *chunk)
6733	{
6734	BTRFS_DEV_LOOKUP_ARGS(args);
6735	struct btrfs_fs_info *fs_info = leaf->fs_info;
6736	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6737	struct map_lookup *map;
6738	struct extent_map *em;
6739	u64 logical;
6740	u64 length;
6741	u64 devid;
6742	u64 type;
6743	u8 uuid[BTRFS_UUID_SIZE];
6744	int index;
6745	int num_stripes;
6746	int ret;
6747	int i;
6748
6749	logical = key->offset;
6750	length = btrfs_chunk_length(eb: leaf, s: chunk);
6751	type = btrfs_chunk_type(eb: leaf, s: chunk);
6752	index = btrfs_bg_flags_to_raid_index(flags: type);
6753	num_stripes = btrfs_chunk_num_stripes(eb: leaf, s: chunk);
6754
6755	#if BITS_PER_LONG == 32
6756	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
6757	if (ret < `0`)
6758	return ret;
6759	warn_32bit_meta_chunk(fs_info, logical, length, type);
6760	#endif
6761
6762	/*
6763	* Only need to verify chunk item if we're reading from sys chunk array,
6764	* as chunk item in tree block is already verified by tree-checker.
6765	*/
6766	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6767	ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6768	if (ret)
6769	return ret;
6770	}
6771
6772	read_lock(&map_tree->lock);
6773	em = lookup_extent_mapping(tree: map_tree, start: logical, len: `1`);
6774	read_unlock(&map_tree->lock);
6775
6776	/ already mapped? /
6777	if (em && em->start <= logical && em->start + em->len > logical) {
6778	free_extent_map(em);
6779	return `0`;
6780	} else if (em) {
6781	free_extent_map(em);
6782	}
6783
6784	em = alloc_extent_map();
6785	if (!em)
6786	return -ENOMEM;
6787	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6788	if (!map) {
6789	free_extent_map(em);
6790	return -ENOMEM;
6791	}
6792
6793	set_bit(nr: EXTENT_FLAG_FS_MAPPING, addr: &em->flags);
6794	em->map_lookup = map;
6795	em->start = logical;
6796	em->len = length;
6797	em->orig_start = `0`;
6798	em->block_start = `0`;
6799	em->block_len = em->len;
6800
6801	map->num_stripes = num_stripes;
6802	map->io_width = btrfs_chunk_io_width(eb: leaf, s: chunk);
6803	map->io_align = btrfs_chunk_io_align(eb: leaf, s: chunk);
6804	map->type = type;
6805	/*
6806	* We can't use the sub_stripes value, as for profiles other than
6807	* RAID10, they may have 0 as sub_stripes for filesystems created by
6808	* older mkfs (<v5.4).
6809	* In that case, it can cause divide-by-zero errors later.
6810	* Since currently sub_stripes is fixed for each profile, let's
6811	* use the trusted value instead.
6812	*/
6813	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
6814	map->verified_stripes = `0`;
6815	em->orig_block_len = btrfs_calc_stripe_length(em);
6816	for (i = `0`; i < num_stripes; i++) {
6817	map->stripes[i].physical =
6818	btrfs_stripe_offset_nr(eb: leaf, c: chunk, nr: i);
6819	devid = btrfs_stripe_devid_nr(eb: leaf, c: chunk, nr: i);
6820	args.devid = devid;
6821	read_extent_buffer(eb: leaf, dst: uuid, start: (unsigned long)
6822	btrfs_stripe_dev_uuid_nr(c: chunk, nr: i),
6823	BTRFS_UUID_SIZE);
6824	args.uuid = uuid;
6825	map->stripes[i].dev = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
6826	if (!map->stripes[i].dev) {
6827	map->stripes[i].dev = handle_missing_device(fs_info,
6828	devid, uuid);
6829	if (IS_ERR(ptr: map->stripes[i].dev)) {
6830	ret = PTR_ERR(ptr: map->stripes[i].dev);
6831	free_extent_map(em);
6832	return ret;
6833	}
6834	}
6835
6836	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6837	addr: &(map->stripes[i].dev->dev_state));
6838	}
6839
6840	write_lock(&map_tree->lock);
6841	ret = add_extent_mapping(tree: map_tree, em, modified: `0`);
6842	write_unlock(&map_tree->lock);
6843	if (ret < `0`) {
6844	btrfs_err(fs_info,
6845	"failed to add chunk map, start=%llu len=%llu: %d",
6846	em->start, em->len, ret);
6847	}
6848	free_extent_map(em);
6849
6850	return ret;
6851	}
6852
6853	static void fill_device_from_item(struct extent_buffer *leaf,
6854	struct btrfs_dev_item *dev_item,
6855	struct btrfs_device *device)
6856	{
6857	unsigned long ptr;
6858
6859	device->devid = btrfs_device_id(eb: leaf, s: dev_item);
6860	device->disk_total_bytes = btrfs_device_total_bytes(eb: leaf, s: dev_item);
6861	device->total_bytes = device->disk_total_bytes;
6862	device->commit_total_bytes = device->disk_total_bytes;
6863	device->bytes_used = btrfs_device_bytes_used(eb: leaf, s: dev_item);
6864	device->commit_bytes_used = device->bytes_used;
6865	device->type = btrfs_device_type(eb: leaf, s: dev_item);
6866	device->io_align = btrfs_device_io_align(eb: leaf, s: dev_item);
6867	device->io_width = btrfs_device_io_width(eb: leaf, s: dev_item);
6868	device->sector_size = btrfs_device_sector_size(eb: leaf, s: dev_item);
6869	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6870	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, addr: &device->dev_state);
6871
6872	ptr = btrfs_device_uuid(d: dev_item);
6873	read_extent_buffer(eb: leaf, dst: device->uuid, start: ptr, BTRFS_UUID_SIZE);
6874	}
6875
6876	static struct btrfs_fs_devices open_seed_devices(struct* btrfs_fs_info *fs_info,
6877	u8 *fsid)
6878	{
6879	struct btrfs_fs_devices *fs_devices;
6880	int ret;
6881
6882	lockdep_assert_held(&uuid_mutex);
6883	ASSERT(fsid);
6884
6885	/ This will match only for multi-device seed fs /
6886	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6887	if (!memcmp(p: fs_devices->fsid, q: fsid, BTRFS_FSID_SIZE))
6888	return fs_devices;
6889
6890
6891	fs_devices = find_fsid(fsid, NULL);
6892	if (!fs_devices) {
6893	if (!btrfs_test_opt(fs_info, DEGRADED))
6894	return ERR_PTR(error: -ENOENT);
6895
6896	fs_devices = alloc_fs_devices(fsid);
6897	if (IS_ERR(ptr: fs_devices))
6898	return fs_devices;
6899
6900	fs_devices->seeding = true;
6901	fs_devices->opened = `1`;
6902	return fs_devices;
6903	}
6904
6905	/*
6906	* Upon first call for a seed fs fsid, just create a private copy of the
6907	* respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6908	*/
6909	fs_devices = clone_fs_devices(orig: fs_devices);
6910	if (IS_ERR(ptr: fs_devices))
6911	return fs_devices;
6912
6913	ret = open_fs_devices(fs_devices, BLK_OPEN_READ, holder: fs_info->bdev_holder);
6914	if (ret) {
6915	free_fs_devices(fs_devices);
6916	return ERR_PTR(error: ret);
6917	}
6918
6919	if (!fs_devices->seeding) {
6920	close_fs_devices(fs_devices);
6921	free_fs_devices(fs_devices);
6922	return ERR_PTR(error: -EINVAL);
6923	}
6924
6925	list_add(new: &fs_devices->seed_list, head: &fs_info->fs_devices->seed_list);
6926
6927	return fs_devices;
6928	}
6929
6930	static int read_one_dev(struct extent_buffer *leaf,
6931	struct btrfs_dev_item *dev_item)
6932	{
6933	BTRFS_DEV_LOOKUP_ARGS(args);
6934	struct btrfs_fs_info *fs_info = leaf->fs_info;
6935	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6936	struct btrfs_device *device;
6937	u64 devid;
6938	int ret;
6939	u8 fs_uuid[BTRFS_FSID_SIZE];
6940	u8 dev_uuid[BTRFS_UUID_SIZE];
6941
6942	devid = btrfs_device_id(eb: leaf, s: dev_item);
6943	args.devid = devid;
6944	read_extent_buffer(eb: leaf, dst: dev_uuid, start: btrfs_device_uuid(d: dev_item),
6945	BTRFS_UUID_SIZE);
6946	read_extent_buffer(eb: leaf, dst: fs_uuid, start: btrfs_device_fsid(d: dev_item),
6947	BTRFS_FSID_SIZE);
6948	args.uuid = dev_uuid;
6949	args.fsid = fs_uuid;
6950
6951	if (memcmp(p: fs_uuid, q: fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6952	fs_devices = open_seed_devices(fs_info, fsid: fs_uuid);
6953	if (IS_ERR(ptr: fs_devices))
6954	return PTR_ERR(ptr: fs_devices);
6955	}
6956
6957	device = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
6958	if (!device) {
6959	if (!btrfs_test_opt(fs_info, DEGRADED)) {
6960	btrfs_report_missing_device(fs_info, devid,
6961	uuid: dev_uuid, error: true);
6962	return -ENOENT;
6963	}
6964
6965	device = add_missing_dev(fs_devices, devid, dev_uuid);
6966	if (IS_ERR(ptr: device)) {
6967	btrfs_err(fs_info,
6968	"failed to add missing dev %llu: %ld",
6969	devid, PTR_ERR(device));
6970	return PTR_ERR(ptr: device);
6971	}
6972	btrfs_report_missing_device(fs_info, devid, uuid: dev_uuid, error: false);
6973	} else {
6974	if (!device->bdev) {
6975	if (!btrfs_test_opt(fs_info, DEGRADED)) {
6976	btrfs_report_missing_device(fs_info,
6977	devid, uuid: dev_uuid, error: true);
6978	return -ENOENT;
6979	}
6980	btrfs_report_missing_device(fs_info, devid,
6981	uuid: dev_uuid, error: false);
6982	}
6983
6984	if (!device->bdev &&
6985	!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6986	/*
6987	* this happens when a device that was properly setup
6988	* in the device info lists suddenly goes bad.
6989	* device->bdev is NULL, and so we have to set
6990	* device->missing to one here
6991	*/
6992	device->fs_devices->missing_devices++;
6993	set_bit(BTRFS_DEV_STATE_MISSING, addr: &device->dev_state);
6994	}
6995
6996	/ Move the device to its own fs_devices /
6997	if (device->fs_devices != fs_devices) {
6998	ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6999	&device->dev_state));
7000
7001	list_move(list: &device->dev_list, head: &fs_devices->devices);
7002	device->fs_devices->num_devices--;
7003	fs_devices->num_devices++;
7004
7005	device->fs_devices->missing_devices--;
7006	fs_devices->missing_devices++;
7007
7008	device->fs_devices = fs_devices;
7009	}
7010	}
7011
7012	if (device->fs_devices != fs_info->fs_devices) {
7013	BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7014	if (device->generation !=
7015	btrfs_device_generation(eb: leaf, s: dev_item))
7016	return -EINVAL;
7017	}
7018
7019	fill_device_from_item(leaf, dev_item, device);
7020	if (device->bdev) {
7021	u64 max_total_bytes = bdev_nr_bytes(bdev: device->bdev);
7022
7023	if (device->total_bytes > max_total_bytes) {
7024	btrfs_err(fs_info,
7025	"device total_bytes should be at most %llu but found %llu",
7026	max_total_bytes, device->total_bytes);
7027	return -EINVAL;
7028	}
7029	}
7030	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, addr: &device->dev_state);
7031	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7032	!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7033	device->fs_devices->total_rw_bytes += device->total_bytes;
7034	atomic64_add(i: device->total_bytes - device->bytes_used,
7035	v: &fs_info->free_chunk_space);
7036	}
7037	ret = `0`;
7038	return ret;
7039	}
7040
7041	int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7042	{
7043	struct btrfs_super_block *super_copy = fs_info->super_copy;
7044	struct extent_buffer *sb;
7045	struct btrfs_disk_key *disk_key;
7046	struct btrfs_chunk *chunk;
7047	u8 *array_ptr;
7048	unsigned long sb_array_offset;
7049	int ret = `0`;
7050	u32 num_stripes;
7051	u32 array_size;
7052	u32 len = `0`;
7053	u32 cur_offset;
7054	u64 type;
7055	struct btrfs_key key;
7056
7057	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7058
7059	/*
7060	* We allocated a dummy extent, just to use extent buffer accessors.
7061	* There will be unused space after BTRFS_SUPER_INFO_SIZE, but
7062	* that's fine, we will not go beyond system chunk array anyway.
7063	*/
7064	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
7065	if (!sb)
7066	return -ENOMEM;
7067	set_extent_buffer_uptodate(sb);
7068
7069	write_extent_buffer(eb: sb, src: super_copy, start: `0`, BTRFS_SUPER_INFO_SIZE);
7070	array_size = btrfs_super_sys_array_size(s: super_copy);
7071
7072	array_ptr = super_copy->sys_chunk_array;
7073	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7074	cur_offset = `0`;
7075
7076	while (cur_offset < array_size) {
7077	disk_key = (struct btrfs_disk_key *)array_ptr;
7078	len = sizeof(*disk_key);
7079	if (cur_offset + len > array_size)
7080	goto out_short_read;
7081
7082	btrfs_disk_key_to_cpu(cpu_key: &key, disk_key);
7083
7084	array_ptr += len;
7085	sb_array_offset += len;
7086	cur_offset += len;
7087
7088	if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7089	btrfs_err(fs_info,
7090	"unexpected item type %u in sys_array at offset %u",
7091	(u32)key.type, cur_offset);
7092	ret = -EIO;
7093	break;
7094	}
7095
7096	chunk = (struct btrfs_chunk *)sb_array_offset;
7097	/*
7098	* At least one btrfs_chunk with one stripe must be present,
7099	* exact stripe count check comes afterwards
7100	*/
7101	len = btrfs_chunk_item_size(num_stripes: `1`);
7102	if (cur_offset + len > array_size)
7103	goto out_short_read;
7104
7105	num_stripes = btrfs_chunk_num_stripes(eb: sb, s: chunk);
7106	if (!num_stripes) {
7107	btrfs_err(fs_info,
7108	"invalid number of stripes %u in sys_array at offset %u",
7109	num_stripes, cur_offset);
7110	ret = -EIO;
7111	break;
7112	}
7113
7114	type = btrfs_chunk_type(eb: sb, s: chunk);
7115	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == `0`) {
7116	btrfs_err(fs_info,
7117	"invalid chunk type %llu in sys_array at offset %u",
7118	type, cur_offset);
7119	ret = -EIO;
7120	break;
7121	}
7122
7123	len = btrfs_chunk_item_size(num_stripes);
7124	if (cur_offset + len > array_size)
7125	goto out_short_read;
7126
7127	ret = read_one_chunk(key: &key, leaf: sb, chunk);
7128	if (ret)
7129	break;
7130
7131	array_ptr += len;
7132	sb_array_offset += len;
7133	cur_offset += len;
7134	}
7135	clear_extent_buffer_uptodate(eb: sb);
7136	free_extent_buffer_stale(eb: sb);
7137	return ret;
7138
7139	out_short_read:
7140	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7141	len, cur_offset);
7142	clear_extent_buffer_uptodate(eb: sb);
7143	free_extent_buffer_stale(eb: sb);
7144	return -EIO;
7145	}
7146
7147	/*
7148	* Check if all chunks in the fs are OK for read-write degraded mount
7149	*
7150	* If the @failing_dev is specified, it's accounted as missing.
7151	*
7152	* Return true if all chunks meet the minimal RW mount requirements.
7153	* Return false if any chunk doesn't meet the minimal RW mount requirements.
7154	*/
7155	bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7156	struct btrfs_device *failing_dev)
7157	{
7158	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7159	struct extent_map *em;
7160	u64 next_start = `0`;
7161	bool ret = true;
7162
7163	read_lock(&map_tree->lock);
7164	em = lookup_extent_mapping(tree: map_tree, start: `0`, len: (u64)-`1`);
7165	read_unlock(&map_tree->lock);
7166	/ No chunk at all? Return false anyway /
7167	if (!em) {
7168	ret = false;
7169	goto out;
7170	}
7171	while (em) {
7172	struct map_lookup *map;
7173	int missing = `0`;
7174	int max_tolerated;
7175	int i;
7176
7177	map = em->map_lookup;
7178	max_tolerated =
7179	btrfs_get_num_tolerated_disk_barrier_failures(
7180	flags: map->type);
7181	for (i = `0`; i < map->num_stripes; i++) {
7182	struct btrfs_device *dev = map->stripes[i].dev;
7183
7184	if (!dev \|\| !dev->bdev \|\|
7185	test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) \|\|
7186	dev->last_flush_error)
7187	missing++;
7188	else if (failing_dev && failing_dev == dev)
7189	missing++;
7190	}
7191	if (missing > max_tolerated) {
7192	if (!failing_dev)
7193	btrfs_warn(fs_info,
7194	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
7195	em->start, missing, max_tolerated);
7196	free_extent_map(em);
7197	ret = false;
7198	goto out;
7199	}
7200	next_start = extent_map_end(em);
7201	free_extent_map(em);
7202
7203	read_lock(&map_tree->lock);
7204	em = lookup_extent_mapping(tree: map_tree, start: next_start,
7205	len: (u64)(-`1`) - next_start);
7206	read_unlock(&map_tree->lock);
7207	}
7208	out:
7209	return ret;
7210	}
7211
/* Trigger readahead for each of the btrfs_header_nritems() slots of @node. */
static void readahead_tree_node_children(struct extent_buffer *node)
{
	const int nr_items = btrfs_header_nritems(node);
	int slot = 0;

	while (slot < nr_items) {
		btrfs_readahead_node_child(node, slot);
		slot++;
	}
}
7220
7221	int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7222	{
7223	struct btrfs_root *root = fs_info->chunk_root;
7224	struct btrfs_path *path;
7225	struct extent_buffer *leaf;
7226	struct btrfs_key key;
7227	struct btrfs_key found_key;
7228	int ret;
7229	int slot;
7230	int iter_ret = `0`;
7231	u64 total_dev = `0`;
7232	u64 last_ra_node = `0`;
7233
7234	path = btrfs_alloc_path();
7235	if (!path)
7236	return -ENOMEM;
7237
7238	/*
7239	* uuid_mutex is needed only if we are mounting a sprout FS
7240	* otherwise we don't need it.
7241	*/
7242	mutex_lock(&uuid_mutex);
7243
7244	/*
7245	* It is possible for mount and umount to race in such a way that
7246	* we execute this code path, but open_fs_devices failed to clear
7247	* total_rw_bytes. We certainly want it cleared before reading the
7248	* device items, so clear it here.
7249	*/
7250	fs_info->fs_devices->total_rw_bytes = `0`;
7251
7252	/*
7253	* Lockdep complains about possible circular locking dependency between
7254	* a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7255	* used for freeze procection of a fs (struct super_block.s_writers),
7256	* which we take when starting a transaction, and extent buffers of the
7257	* chunk tree if we call read_one_dev() while holding a lock on an
7258	* extent buffer of the chunk tree. Since we are mounting the filesystem
7259	* and at this point there can't be any concurrent task modifying the
7260	* chunk tree, to keep it simple, just skip locking on the chunk tree.
7261	*/
7262	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7263	path->skip_locking = `1`;
7264
7265	/*
7266	* Read all device items, and then all the chunk items. All
7267	* device items are found before any chunk item (their object id
7268	* is smaller than the lowest possible object id for a chunk
7269	* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7270	*/
7271	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7272	key.offset = `0`;
7273	key.type = `0`;
7274	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
7275	struct extent_buffer *node = path->nodes[`1`];
7276
7277	leaf = path->nodes[`0`];
7278	slot = path->slots[`0`];
7279
7280	if (node) {
7281	if (last_ra_node != node->start) {
7282	readahead_tree_node_children(node);
7283	last_ra_node = node->start;
7284	}
7285	}
7286	if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7287	struct btrfs_dev_item *dev_item;
7288	dev_item = btrfs_item_ptr(leaf, slot,
7289	struct btrfs_dev_item);
7290	ret = read_one_dev(leaf, dev_item);
7291	if (ret)
7292	goto error;
7293	total_dev++;
7294	} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7295	struct btrfs_chunk *chunk;
7296
7297	/*
7298	* We are only called at mount time, so no need to take
7299	* fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7300	* we always lock first fs_info->chunk_mutex before
7301	* acquiring any locks on the chunk tree. This is a
7302	* requirement for chunk allocation, see the comment on
7303	* top of btrfs_chunk_alloc() for details.
7304	*/
7305	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7306	ret = read_one_chunk(key: &found_key, leaf, chunk);
7307	if (ret)
7308	goto error;
7309	}
7310	}
7311	/ Catch error found during iteration /
7312	if (iter_ret < `0`) {
7313	ret = iter_ret;
7314	goto error;
7315	}
7316
7317	/*
7318	* After loading chunk tree, we've got all device information,
7319	* do another round of validation checks.
7320	*/
7321	if (total_dev != fs_info->fs_devices->total_devices) {
7322	btrfs_warn(fs_info,
7323	"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7324	btrfs_super_num_devices(fs_info->super_copy),
7325	total_dev);
7326	fs_info->fs_devices->total_devices = total_dev;
7327	btrfs_set_super_num_devices(s: fs_info->super_copy, val: total_dev);
7328	}
7329	if (btrfs_super_total_bytes(s: fs_info->super_copy) <
7330	fs_info->fs_devices->total_rw_bytes) {
7331	btrfs_err(fs_info,
7332	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7333	btrfs_super_total_bytes(fs_info->super_copy),
7334	fs_info->fs_devices->total_rw_bytes);
7335	ret = -EINVAL;
7336	goto error;
7337	}
7338	ret = `0`;
7339	error:
7340	mutex_unlock(lock: &uuid_mutex);
7341
7342	btrfs_free_path(p: path);
7343	return ret;
7344	}
7345
7346	int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7347	{
7348	struct btrfs_fs_devices fs_devices = fs_info->fs_devices, seed_devs;
7349	struct btrfs_device *device;
7350	int ret = `0`;
7351
7352	fs_devices->fs_info = fs_info;
7353
7354	mutex_lock(&fs_devices->device_list_mutex);
7355	list_for_each_entry(device, &fs_devices->devices, dev_list)
7356	device->fs_info = fs_info;
7357
7358	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7359	list_for_each_entry(device, &seed_devs->devices, dev_list) {
7360	device->fs_info = fs_info;
7361	ret = btrfs_get_dev_zone_info(device, populate_cache: false);
7362	if (ret)
7363	break;
7364	}
7365
7366	seed_devs->fs_info = fs_info;
7367	}
7368	mutex_unlock(lock: &fs_devices->device_list_mutex);
7369
7370	return ret;
7371	}
7372
7373	static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7374	const struct btrfs_dev_stats_item *ptr,
7375	int index)
7376	{
7377	u64 val;
7378
7379	read_extent_buffer(eb, dst: &val,
7380	offsetof(struct btrfs_dev_stats_item, values) +
7381	((unsigned long)ptr) + (index * sizeof(u64)),
7382	len: sizeof(val));
7383	return val;
7384	}
7385
7386	static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7387	struct btrfs_dev_stats_item *ptr,
7388	int index, u64 val)
7389	{
7390	write_extent_buffer(eb, src: &val,
7391	offsetof(struct btrfs_dev_stats_item, values) +
7392	((unsigned long)ptr) + (index * sizeof(u64)),
7393	len: sizeof(val));
7394	}
7395
7396	static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7397	struct btrfs_path *path)
7398	{
7399	struct btrfs_dev_stats_item *ptr;
7400	struct extent_buffer *eb;
7401	struct btrfs_key key;
7402	int item_size;
7403	int i, ret, slot;
7404
7405	if (!device->fs_info->dev_root)
7406	return `0`;
7407
7408	key.objectid = BTRFS_DEV_STATS_OBJECTID;
7409	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7410	key.offset = device->devid;
7411	ret = btrfs_search_slot(NULL, root: device->fs_info->dev_root, key: &key, p: path, ins_len: `0`, cow: `0`);
7412	if (ret) {
7413	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7414	btrfs_dev_stat_set(dev: device, index: i, val: `0`);
7415	device->dev_stats_valid = `1`;
7416	btrfs_release_path(p: path);
7417	return ret < `0` ? ret : `0`;
7418	}
7419	slot = path->slots[`0`];
7420	eb = path->nodes[`0`];
7421	item_size = btrfs_item_size(eb, slot);
7422
7423	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7424
7425	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7426	if (item_size >= (`1` + i) * sizeof(__le64))
7427	btrfs_dev_stat_set(dev: device, index: i,
7428	val: btrfs_dev_stats_value(eb, ptr, index: i));
7429	else
7430	btrfs_dev_stat_set(dev: device, index: i, val: `0`);
7431	}
7432
7433	device->dev_stats_valid = `1`;
7434	btrfs_dev_stat_print_on_load(device);
7435	btrfs_release_path(p: path);
7436
7437	return `0`;
7438	}
7439
7440	int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7441	{
7442	struct btrfs_fs_devices fs_devices = fs_info->fs_devices, seed_devs;
7443	struct btrfs_device *device;
7444	struct btrfs_path *path = NULL;
7445	int ret = `0`;
7446
7447	path = btrfs_alloc_path();
7448	if (!path)
7449	return -ENOMEM;
7450
7451	mutex_lock(&fs_devices->device_list_mutex);
7452	list_for_each_entry(device, &fs_devices->devices, dev_list) {
7453	ret = btrfs_device_init_dev_stats(device, path);
7454	if (ret)
7455	goto out;
7456	}
7457	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7458	list_for_each_entry(device, &seed_devs->devices, dev_list) {
7459	ret = btrfs_device_init_dev_stats(device, path);
7460	if (ret)
7461	goto out;
7462	}
7463	}
7464	out:
7465	mutex_unlock(lock: &fs_devices->device_list_mutex);
7466
7467	btrfs_free_path(p: path);
7468	return ret;
7469	}
7470
7471	static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7472	struct btrfs_device *device)
7473	{
7474	struct btrfs_fs_info *fs_info = trans->fs_info;
7475	struct btrfs_root *dev_root = fs_info->dev_root;
7476	struct btrfs_path *path;
7477	struct btrfs_key key;
7478	struct extent_buffer *eb;
7479	struct btrfs_dev_stats_item *ptr;
7480	int ret;
7481	int i;
7482
7483	key.objectid = BTRFS_DEV_STATS_OBJECTID;
7484	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7485	key.offset = device->devid;
7486
7487	path = btrfs_alloc_path();
7488	if (!path)
7489	return -ENOMEM;
7490	ret = btrfs_search_slot(trans, root: dev_root, key: &key, p: path, ins_len: -`1`, cow: `1`);
7491	if (ret < `0`) {
7492	btrfs_warn_in_rcu(fs_info,
7493	"error %d while searching for dev_stats item for device %s",
7494	ret, btrfs_dev_name(device));
7495	goto out;
7496	}
7497
7498	if (ret == `0` &&
7499	btrfs_item_size(eb: path->nodes[`0`], slot: path->slots[`0`]) < sizeof(*ptr)) {
7500	/ need to delete old one and insert a new one /
7501	ret = btrfs_del_item(trans, root: dev_root, path);
7502	if (ret != `0`) {
7503	btrfs_warn_in_rcu(fs_info,
7504	"delete too small dev_stats item for device %s failed %d",
7505	btrfs_dev_name(device), ret);
7506	goto out;
7507	}
7508	ret = `1`;
7509	}
7510
7511	if (ret == `1`) {
7512	/ need to insert a new item /
7513	btrfs_release_path(p: path);
7514	ret = btrfs_insert_empty_item(trans, root: dev_root, path,
7515	key: &key, data_size: sizeof(*ptr));
7516	if (ret < `0`) {
7517	btrfs_warn_in_rcu(fs_info,
7518	"insert dev_stats item for device %s failed %d",
7519	btrfs_dev_name(device), ret);
7520	goto out;
7521	}
7522	}
7523
7524	eb = path->nodes[`0`];
7525	ptr = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_dev_stats_item);
7526	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7527	btrfs_set_dev_stats_value(eb, ptr, index: i,
7528	val: btrfs_dev_stat_read(dev: device, index: i));
7529	btrfs_mark_buffer_dirty(trans, buf: eb);
7530
7531	out:
7532	btrfs_free_path(p: path);
7533	return ret;
7534	}
7535
7536	/*
7537	* called from commit_transaction. Writes all changed device stats to disk.
7538	*/
7539	int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7540	{
7541	struct btrfs_fs_info *fs_info = trans->fs_info;
7542	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7543	struct btrfs_device *device;
7544	int stats_cnt;
7545	int ret = `0`;
7546
7547	mutex_lock(&fs_devices->device_list_mutex);
7548	list_for_each_entry(device, &fs_devices->devices, dev_list) {
7549	stats_cnt = atomic_read(v: &device->dev_stats_ccnt);
7550	if (!device->dev_stats_valid \|\| stats_cnt == `0`)
7551	continue;
7552
7553
7554	/*
7555	* There is a LOAD-LOAD control dependency between the value of
7556	* dev_stats_ccnt and updating the on-disk values which requires
7557	* reading the in-memory counters. Such control dependencies
7558	* require explicit read memory barriers.
7559	*
7560	* This memory barriers pairs with smp_mb__before_atomic in
7561	* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7562	* barrier implied by atomic_xchg in
7563	* btrfs_dev_stats_read_and_reset
7564	*/
7565	smp_rmb();
7566
7567	ret = update_dev_stat_item(trans, device);
7568	if (!ret)
7569	atomic_sub(i: stats_cnt, v: &device->dev_stats_ccnt);
7570	}
7571	mutex_unlock(lock: &fs_devices->device_list_mutex);
7572
7573	return ret;
7574	}
7575
7576	void btrfs_dev_stat_inc_and_print(struct btrfs_device dev, int* index)
7577	{
7578	btrfs_dev_stat_inc(dev, index);
7579
7580	if (!dev->dev_stats_valid)
7581	return;
7582	btrfs_err_rl_in_rcu(dev->fs_info,
7583	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7584	btrfs_dev_name(dev),
7585	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7586	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7587	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7588	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7589	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7590	}
7591
7592	static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7593	{
7594	int i;
7595
7596	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7597	if (btrfs_dev_stat_read(dev, index: i) != `0`)
7598	break;
7599	if (i == BTRFS_DEV_STAT_VALUES_MAX)
7600	return; / all values == 0, suppress message /
7601
7602	btrfs_info_in_rcu(dev->fs_info,
7603	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7604	btrfs_dev_name(dev),
7605	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7606	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7607	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7608	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7609	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7610	}
7611
7612	int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7613	struct btrfs_ioctl_get_dev_stats *stats)
7614	{
7615	BTRFS_DEV_LOOKUP_ARGS(args);
7616	struct btrfs_device *dev;
7617	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7618	int i;
7619
7620	mutex_lock(&fs_devices->device_list_mutex);
7621	args.devid = stats->devid;
7622	dev = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
7623	mutex_unlock(lock: &fs_devices->device_list_mutex);
7624
7625	if (!dev) {
7626	btrfs_warn(fs_info, "get dev_stats failed, device not found");
7627	return -ENODEV;
7628	} else if (!dev->dev_stats_valid) {
7629	btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7630	return -ENODEV;
7631	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7632	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7633	if (stats->nr_items > i)
7634	stats->values[i] =
7635	btrfs_dev_stat_read_and_reset(dev, index: i);
7636	else
7637	btrfs_dev_stat_set(dev, index: i, val: `0`);
7638	}
7639	btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7640	current->comm, task_pid_nr(current));
7641	} else {
7642	for (i = `0`; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7643	if (stats->nr_items > i)
7644	stats->values[i] = btrfs_dev_stat_read(dev, index: i);
7645	}
7646	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7647	stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7648	return `0`;
7649	}
7650
7651	/*
7652	* Update the size and bytes used for each device where it changed. This is
7653	* delayed since we would otherwise get errors while writing out the
7654	* superblocks.
7655	*
7656	* Must be invoked during transaction commit.
7657	*/
7658	void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7659	{
7660	struct btrfs_device curr, next;
7661
7662	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7663
7664	if (list_empty(head: &trans->dev_update_list))
7665	return;
7666
7667	/*
7668	* We don't need the device_list_mutex here. This list is owned by the
7669	* transaction and the transaction must complete before the device is
7670	* released.
7671	*/
7672	mutex_lock(&trans->fs_info->chunk_mutex);
7673	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7674	post_commit_list) {
7675	list_del_init(entry: &curr->post_commit_list);
7676	curr->commit_total_bytes = curr->disk_total_bytes;
7677	curr->commit_bytes_used = curr->bytes_used;
7678	}
7679	mutex_unlock(lock: &trans->fs_info->chunk_mutex);
7680	}
7681
7682	/*
7683	* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7684	*/
7685	int btrfs_bg_type_to_factor(u64 flags)
7686	{
7687	const int index = btrfs_bg_flags_to_raid_index(flags);
7688
7689	return btrfs_raid_array[index].ncopies;
7690	}
7691
7692
7693
7694	static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7695	u64 chunk_offset, u64 devid,
7696	u64 physical_offset, u64 physical_len)
7697	{
7698	struct btrfs_dev_lookup_args args = { .devid = devid };
7699	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7700	struct extent_map *em;
7701	struct map_lookup *map;
7702	struct btrfs_device *dev;
7703	u64 stripe_len;
7704	bool found = false;
7705	int ret = `0`;
7706	int i;
7707
7708	read_lock(&em_tree->lock);
7709	em = lookup_extent_mapping(tree: em_tree, start: chunk_offset, len: `1`);
7710	read_unlock(&em_tree->lock);
7711
7712	if (!em) {
7713	btrfs_err(fs_info,
7714	"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7715	physical_offset, devid);
7716	ret = -EUCLEAN;
7717	goto out;
7718	}
7719
7720	map = em->map_lookup;
7721	stripe_len = btrfs_calc_stripe_length(em);
7722	if (physical_len != stripe_len) {
7723	btrfs_err(fs_info,
7724	"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7725	physical_offset, devid, em->start, physical_len,
7726	stripe_len);
7727	ret = -EUCLEAN;
7728	goto out;
7729	}
7730
7731	/*
7732	* Very old mkfs.btrfs (before v4.1) will not respect the reserved
7733	* space. Although kernel can handle it without problem, better to warn
7734	* the users.
7735	*/
7736	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
7737	btrfs_warn(fs_info,
7738	"devid %llu physical %llu len %llu inside the reserved space",
7739	devid, physical_offset, physical_len);
7740
7741	for (i = `0`; i < map->num_stripes; i++) {
7742	if (map->stripes[i].dev->devid == devid &&
7743	map->stripes[i].physical == physical_offset) {
7744	found = true;
7745	if (map->verified_stripes >= map->num_stripes) {
7746	btrfs_err(fs_info,
7747	"too many dev extents for chunk %llu found",
7748	em->start);
7749	ret = -EUCLEAN;
7750	goto out;
7751	}
7752	map->verified_stripes++;
7753	break;
7754	}
7755	}
7756	if (!found) {
7757	btrfs_err(fs_info,
7758	"dev extent physical offset %llu devid %llu has no corresponding chunk",
7759	physical_offset, devid);
7760	ret = -EUCLEAN;
7761	}
7762
7763	/ Make sure no dev extent is beyond device boundary /
7764	dev = btrfs_find_device(fs_devices: fs_info->fs_devices, args: &args);
7765	if (!dev) {
7766	btrfs_err(fs_info, "failed to find devid %llu", devid);
7767	ret = -EUCLEAN;
7768	goto out;
7769	}
7770
7771	if (physical_offset + physical_len > dev->disk_total_bytes) {
7772	btrfs_err(fs_info,
7773	"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7774	devid, physical_offset, physical_len,
7775	dev->disk_total_bytes);
7776	ret = -EUCLEAN;
7777	goto out;
7778	}
7779
7780	if (dev->zone_info) {
7781	u64 zone_size = dev->zone_info->zone_size;
7782
7783	if (!IS_ALIGNED(physical_offset, zone_size) \|\|
7784	!IS_ALIGNED(physical_len, zone_size)) {
7785	btrfs_err(fs_info,
7786	"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
7787	devid, physical_offset, physical_len);
7788	ret = -EUCLEAN;
7789	goto out;
7790	}
7791	}
7792
7793	out:
7794	free_extent_map(em);
7795	return ret;
7796	}
7797
7798	static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7799	{
7800	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7801	struct extent_map *em;
7802	struct rb_node *node;
7803	int ret = `0`;
7804
7805	read_lock(&em_tree->lock);
7806	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7807	em = rb_entry(node, struct extent_map, rb_node);
7808	if (em->map_lookup->num_stripes !=
7809	em->map_lookup->verified_stripes) {
7810	btrfs_err(fs_info,
7811	"chunk %llu has missing dev extent, have %d expect %d",
7812	em->start, em->map_lookup->verified_stripes,
7813	em->map_lookup->num_stripes);
7814	ret = -EUCLEAN;
7815	goto out;
7816	}
7817	}
7818	out:
7819	read_unlock(&em_tree->lock);
7820	return ret;
7821	}
7822
7823	/*
7824	* Ensure that all dev extents are mapped to correct chunk, otherwise
7825	* later chunk allocation/free would cause unexpected behavior.
7826	*
7827	* NOTE: This will iterate through the whole device tree, which should be of
7828	* the same size level as the chunk tree. This slightly increases mount time.
7829	*/
7830	int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7831	{
7832	struct btrfs_path *path;
7833	struct btrfs_root *root = fs_info->dev_root;
7834	struct btrfs_key key;
7835	u64 prev_devid = `0`;
7836	u64 prev_dev_ext_end = `0`;
7837	int ret = `0`;
7838
7839	/*
7840	* We don't have a dev_root because we mounted with ignorebadroots and
7841	* failed to load the root, so we want to skip the verification in this
7842	* case for sure.
7843	*
7844	* However if the dev root is fine, but the tree itself is corrupted
7845	* we'd still fail to mount. This verification is only to make sure
7846	* writes can happen safely, so instead just bypass this check
7847	* completely in the case of IGNOREBADROOTS.
7848	*/
7849	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
7850	return `0`;
7851
7852	key.objectid = `1`;
7853	key.type = BTRFS_DEV_EXTENT_KEY;
7854	key.offset = `0`;
7855
7856	path = btrfs_alloc_path();
7857	if (!path)
7858	return -ENOMEM;
7859
7860	path->reada = READA_FORWARD;
7861	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
7862	if (ret < `0`)
7863	goto out;
7864
7865	if (path->slots[`0`] >= btrfs_header_nritems(eb: path->nodes[`0`])) {
7866	ret = btrfs_next_leaf(root, path);
7867	if (ret < `0`)
7868	goto out;
7869	/ No dev extents at all? Not good /
7870	if (ret > `0`) {
7871	ret = -EUCLEAN;
7872	goto out;
7873	}
7874	}
7875	while (`1`) {
7876	struct extent_buffer *leaf = path->nodes[`0`];
7877	struct btrfs_dev_extent *dext;
7878	int slot = path->slots[`0`];
7879	u64 chunk_offset;
7880	u64 physical_offset;
7881	u64 physical_len;
7882	u64 devid;
7883
7884	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
7885	if (key.type != BTRFS_DEV_EXTENT_KEY)
7886	break;
7887	devid = key.objectid;
7888	physical_offset = key.offset;
7889
7890	dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7891	chunk_offset = btrfs_dev_extent_chunk_offset(eb: leaf, s: dext);
7892	physical_len = btrfs_dev_extent_length(eb: leaf, s: dext);
7893
7894	/ Check if this dev extent overlaps with the previous one /
7895	if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7896	btrfs_err(fs_info,
7897	"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7898	devid, physical_offset, prev_dev_ext_end);
7899	ret = -EUCLEAN;
7900	goto out;
7901	}
7902
7903	ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7904	physical_offset, physical_len);
7905	if (ret < `0`)
7906	goto out;
7907	prev_devid = devid;
7908	prev_dev_ext_end = physical_offset + physical_len;
7909
7910	ret = btrfs_next_item(root, p: path);
7911	if (ret < `0`)
7912	goto out;
7913	if (ret > `0`) {
7914	ret = `0`;
7915	break;
7916	}
7917	}
7918
7919	/ Ensure all chunks have corresponding dev extents /
7920	ret = verify_chunk_dev_extent_mapping(fs_info);
7921	out:
7922	btrfs_free_path(p: path);
7923	return ret;
7924	}
7925
7926	/*
7927	* Check whether the given block group or device is pinned by any inode being
7928	* used as a swapfile.
7929	*/
7930	bool btrfs_pinned_by_swapfile(struct btrfs_fs_info fs_info, void* *ptr)
7931	{
7932	struct btrfs_swapfile_pin *sp;
7933	struct rb_node *node;
7934
7935	spin_lock(lock: &fs_info->swapfile_pins_lock);
7936	node = fs_info->swapfile_pins.rb_node;
7937	while (node) {
7938	sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7939	if (ptr < sp->ptr)
7940	node = node->rb_left;
7941	else if (ptr > sp->ptr)
7942	node = node->rb_right;
7943	else
7944	break;
7945	}
7946	spin_unlock(lock: &fs_info->swapfile_pins_lock);
7947	return node != NULL;
7948	}
7949
7950	static int relocating_repair_kthread(void *data)
7951	{
7952	struct btrfs_block_group *cache = data;
7953	struct btrfs_fs_info *fs_info = cache->fs_info;
7954	u64 target;
7955	int ret = `0`;
7956
7957	target = cache->start;
7958	btrfs_put_block_group(cache);
7959
7960	sb_start_write(sb: fs_info->sb);
7961	if (!btrfs_exclop_start(fs_info, type: BTRFS_EXCLOP_BALANCE)) {
7962	btrfs_info(fs_info,
7963	"zoned: skip relocating block group %llu to repair: EBUSY",
7964	target);
7965	sb_end_write(sb: fs_info->sb);
7966	return -EBUSY;
7967	}
7968
7969	mutex_lock(&fs_info->reclaim_bgs_lock);
7970
7971	/ Ensure block group still exists /
7972	cache = btrfs_lookup_block_group(info: fs_info, bytenr: target);
7973	if (!cache)
7974	goto out;
7975
7976	if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
7977	goto out;
7978
7979	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset: target);
7980	if (ret < `0`)
7981	goto out;
7982
7983	btrfs_info(fs_info,
7984	"zoned: relocating block group %llu to repair IO failure",
7985	target);
7986	ret = btrfs_relocate_chunk(fs_info, chunk_offset: target);
7987
7988	out:
7989	if (cache)
7990	btrfs_put_block_group(cache);
7991	mutex_unlock(lock: &fs_info->reclaim_bgs_lock);
7992	btrfs_exclop_finish(fs_info);
7993	sb_end_write(sb: fs_info->sb);
7994
7995	return ret;
7996	}
7997
7998	bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
7999	{
8000	struct btrfs_block_group *cache;
8001
8002	if (!btrfs_is_zoned(fs_info))
8003	return false;
8004
8005	/ Do not attempt to repair in degraded state /
8006	if (btrfs_test_opt(fs_info, DEGRADED))
8007	return true;
8008
8009	cache = btrfs_lookup_block_group(info: fs_info, bytenr: logical);
8010	if (!cache)
8011	return true;
8012
8013	if (test_and_set_bit(nr: BLOCK_GROUP_FLAG_RELOCATING_REPAIR, addr: &cache->runtime_flags)) {
8014	btrfs_put_block_group(cache);
8015	return true;
8016	}
8017
8018	kthread_run(relocating_repair_kthread, cache,
8019	"btrfs-relocating-repair");
8020
8021	return true;
8022	}
8023
8024	static void map_raid56_repair_block(struct btrfs_io_context *bioc,
8025	struct btrfs_io_stripe *smap,
8026	u64 logical)
8027	{
8028	int data_stripes = nr_bioc_data_stripes(bioc);
8029	int i;
8030
8031	for (i = `0`; i < data_stripes; i++) {
8032	u64 stripe_start = bioc->full_stripe_logical +
8033	btrfs_stripe_nr_to_offset(stripe_nr: i);
8034
8035	if (logical >= stripe_start &&
8036	logical < stripe_start + BTRFS_STRIPE_LEN)
8037	break;
8038	}
8039	ASSERT(i < data_stripes);
8040	smap->dev = bioc->stripes[i].dev;
8041	smap->physical = bioc->stripes[i].physical +
8042	((logical - bioc->full_stripe_logical) &
8043	BTRFS_STRIPE_LEN_MASK);
8044	}
8045
8046	/*
8047	* Map a repair write into a single device.
8048	*
8049	* A repair write is triggered by read time repair or scrub, which would only
8050	* update the contents of a single device.
8051	* Not update any other mirrors nor go through RMW path.
8052	*
8053	* Callers should ensure:
8054	*
8055	* - Call btrfs_bio_counter_inc_blocked() first
8056	* - The range does not cross stripe boundary
8057	* - Has a valid @mirror_num passed in.
8058	*/
8059	int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
8060	struct btrfs_io_stripe *smap, u64 logical,
8061	u32 length, int mirror_num)
8062	{
8063	struct btrfs_io_context *bioc = NULL;
8064	u64 map_length = length;
8065	int mirror_ret = mirror_num;
8066	int ret;
8067
8068	ASSERT(mirror_num > `0`);
8069
8070	ret = btrfs_map_block(fs_info, op: BTRFS_MAP_WRITE, logical, length: &map_length,
8071	bioc_ret: &bioc, smap, mirror_num_ret: &mirror_ret);
8072	if (ret < `0`)
8073	return ret;
8074
8075	/ The map range should not cross stripe boundary. /
8076	ASSERT(map_length >= length);
8077
8078	/ Already mapped to single stripe. /
8079	if (!bioc)
8080	goto out;
8081
8082	/ Map the RAID56 multi-stripe writes to a single one. /
8083	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
8084	map_raid56_repair_block(bioc, smap, logical);
8085	goto out;
8086	}
8087
8088	ASSERT(mirror_num <= bioc->num_stripes);
8089	smap->dev = bioc->stripes[mirror_num - `1`].dev;
8090	smap->physical = bioc->stripes[mirror_num - `1`].physical;
8091	out:
8092	btrfs_put_bioc(bioc);
8093	ASSERT(smap->dev);
8094	return `0`;
8095	}
8096

source code of linux/fs/btrfs/volumes.c