1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include <linux/slab.h> |
4 | #include <trace/events/btrfs.h> |
5 | #include "messages.h" |
6 | #include "ctree.h" |
7 | #include "extent-io-tree.h" |
8 | #include "btrfs_inode.h" |
9 | |
10 | static struct kmem_cache *extent_state_cache; |
11 | |
12 | static inline bool extent_state_in_tree(const struct extent_state *state) |
13 | { |
14 | return !RB_EMPTY_NODE(&state->rb_node); |
15 | } |
16 | |
17 | #ifdef CONFIG_BTRFS_DEBUG |
18 | static LIST_HEAD(states); |
19 | static DEFINE_SPINLOCK(leak_lock); |
20 | |
21 | static inline void btrfs_leak_debug_add_state(struct extent_state *state) |
22 | { |
23 | unsigned long flags; |
24 | |
25 | spin_lock_irqsave(&leak_lock, flags); |
26 | list_add(new: &state->leak_list, head: &states); |
27 | spin_unlock_irqrestore(lock: &leak_lock, flags); |
28 | } |
29 | |
30 | static inline void btrfs_leak_debug_del_state(struct extent_state *state) |
31 | { |
32 | unsigned long flags; |
33 | |
34 | spin_lock_irqsave(&leak_lock, flags); |
35 | list_del(entry: &state->leak_list); |
36 | spin_unlock_irqrestore(lock: &leak_lock, flags); |
37 | } |
38 | |
39 | static inline void btrfs_extent_state_leak_debug_check(void) |
40 | { |
41 | struct extent_state *state; |
42 | |
43 | while (!list_empty(head: &states)) { |
44 | state = list_entry(states.next, struct extent_state, leak_list); |
45 | pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n" , |
46 | state->start, state->end, state->state, |
47 | extent_state_in_tree(state), |
48 | refcount_read(&state->refs)); |
49 | list_del(entry: &state->leak_list); |
50 | WARN_ON_ONCE(1); |
51 | kmem_cache_free(s: extent_state_cache, objp: state); |
52 | } |
53 | } |
54 | |
55 | #define btrfs_debug_check_extent_io_range(tree, start, end) \ |
56 | __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) |
57 | static inline void __btrfs_debug_check_extent_io_range(const char *caller, |
58 | struct extent_io_tree *tree, |
59 | u64 start, u64 end) |
60 | { |
61 | const struct btrfs_inode *inode; |
62 | u64 isize; |
63 | |
64 | if (tree->owner != IO_TREE_INODE_IO) |
65 | return; |
66 | |
67 | inode = extent_io_tree_to_inode_const(tree); |
68 | isize = i_size_read(inode: &inode->vfs_inode); |
69 | if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { |
70 | btrfs_debug_rl(inode->root->fs_info, |
71 | "%s: ino %llu isize %llu odd range [%llu,%llu]" , |
72 | caller, btrfs_ino(inode), isize, start, end); |
73 | } |
74 | } |
75 | #else |
76 | #define btrfs_leak_debug_add_state(state) do {} while (0) |
77 | #define btrfs_leak_debug_del_state(state) do {} while (0) |
78 | #define btrfs_extent_state_leak_debug_check() do {} while (0) |
79 | #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) |
80 | #endif |
81 | |
82 | |
83 | /* |
84 | * The only tree allowed to set the inode is IO_TREE_INODE_IO. |
85 | */ |
86 | static bool is_inode_io_tree(const struct extent_io_tree *tree) |
87 | { |
88 | return tree->owner == IO_TREE_INODE_IO; |
89 | } |
90 | |
91 | /* Return the inode if it's valid for the given tree, otherwise NULL. */ |
92 | struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) |
93 | { |
94 | if (tree->owner == IO_TREE_INODE_IO) |
95 | return tree->inode; |
96 | return NULL; |
97 | } |
98 | |
99 | /* Read-only access to the inode. */ |
100 | const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) |
101 | { |
102 | if (tree->owner == IO_TREE_INODE_IO) |
103 | return tree->inode; |
104 | return NULL; |
105 | } |
106 | |
107 | /* For read-only access to fs_info. */ |
108 | const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) |
109 | { |
110 | if (tree->owner == IO_TREE_INODE_IO) |
111 | return tree->inode->root->fs_info; |
112 | return tree->fs_info; |
113 | } |
114 | |
115 | void extent_io_tree_init(struct btrfs_fs_info *fs_info, |
116 | struct extent_io_tree *tree, unsigned int owner) |
117 | { |
118 | tree->state = RB_ROOT; |
119 | spin_lock_init(&tree->lock); |
120 | tree->fs_info = fs_info; |
121 | tree->owner = owner; |
122 | } |
123 | |
124 | /* |
125 | * Empty an io tree, removing and freeing every extent state record from the |
126 | * tree. This should be called once we are sure no other task can access the |
127 | * tree anymore, so no tree updates happen after we empty the tree and there |
128 | * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never |
129 | * set on any extent state when calling this function). |
130 | */ |
131 | void extent_io_tree_release(struct extent_io_tree *tree) |
132 | { |
133 | struct rb_root root; |
134 | struct extent_state *state; |
135 | struct extent_state *tmp; |
136 | |
137 | spin_lock(lock: &tree->lock); |
138 | root = tree->state; |
139 | tree->state = RB_ROOT; |
140 | rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { |
141 | /* Clear node to keep free_extent_state() happy. */ |
142 | RB_CLEAR_NODE(&state->rb_node); |
143 | ASSERT(!(state->state & EXTENT_LOCKED)); |
144 | /* |
145 | * No need for a memory barrier here, as we are holding the tree |
146 | * lock and we only change the waitqueue while holding that lock |
147 | * (see wait_extent_bit()). |
148 | */ |
149 | ASSERT(!waitqueue_active(&state->wq)); |
150 | free_extent_state(state); |
151 | cond_resched_lock(&tree->lock); |
152 | } |
153 | /* |
154 | * Should still be empty even after a reschedule, no other task should |
155 | * be accessing the tree anymore. |
156 | */ |
157 | ASSERT(RB_EMPTY_ROOT(&tree->state)); |
158 | spin_unlock(lock: &tree->lock); |
159 | } |
160 | |
161 | static struct extent_state *alloc_extent_state(gfp_t mask) |
162 | { |
163 | struct extent_state *state; |
164 | |
165 | /* |
166 | * The given mask might be not appropriate for the slab allocator, |
167 | * drop the unsupported bits |
168 | */ |
169 | mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); |
170 | state = kmem_cache_alloc(cachep: extent_state_cache, flags: mask); |
171 | if (!state) |
172 | return state; |
173 | state->state = 0; |
174 | RB_CLEAR_NODE(&state->rb_node); |
175 | btrfs_leak_debug_add_state(state); |
176 | refcount_set(r: &state->refs, n: 1); |
177 | init_waitqueue_head(&state->wq); |
178 | trace_alloc_extent_state(state, mask, _RET_IP_); |
179 | return state; |
180 | } |
181 | |
182 | static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) |
183 | { |
184 | if (!prealloc) |
185 | prealloc = alloc_extent_state(GFP_ATOMIC); |
186 | |
187 | return prealloc; |
188 | } |
189 | |
190 | void free_extent_state(struct extent_state *state) |
191 | { |
192 | if (!state) |
193 | return; |
194 | if (refcount_dec_and_test(r: &state->refs)) { |
195 | WARN_ON(extent_state_in_tree(state)); |
196 | btrfs_leak_debug_del_state(state); |
197 | trace_free_extent_state(state, _RET_IP_); |
198 | kmem_cache_free(s: extent_state_cache, objp: state); |
199 | } |
200 | } |
201 | |
202 | static int add_extent_changeset(struct extent_state *state, u32 bits, |
203 | struct extent_changeset *changeset, |
204 | int set) |
205 | { |
206 | int ret; |
207 | |
208 | if (!changeset) |
209 | return 0; |
210 | if (set && (state->state & bits) == bits) |
211 | return 0; |
212 | if (!set && (state->state & bits) == 0) |
213 | return 0; |
214 | changeset->bytes_changed += state->end - state->start + 1; |
215 | ret = ulist_add(ulist: &changeset->range_changed, val: state->start, aux: state->end, |
216 | GFP_ATOMIC); |
217 | return ret; |
218 | } |
219 | |
220 | static inline struct extent_state *next_state(struct extent_state *state) |
221 | { |
222 | struct rb_node *next = rb_next(&state->rb_node); |
223 | |
224 | if (next) |
225 | return rb_entry(next, struct extent_state, rb_node); |
226 | else |
227 | return NULL; |
228 | } |
229 | |
230 | static inline struct extent_state *prev_state(struct extent_state *state) |
231 | { |
232 | struct rb_node *next = rb_prev(&state->rb_node); |
233 | |
234 | if (next) |
235 | return rb_entry(next, struct extent_state, rb_node); |
236 | else |
237 | return NULL; |
238 | } |
239 | |
240 | /* |
241 | * Search @tree for an entry that contains @offset. Such entry would have |
242 | * entry->start <= offset && entry->end >= offset. |
243 | * |
244 | * @tree: the tree to search |
245 | * @offset: offset that should fall within an entry in @tree |
246 | * @node_ret: pointer where new node should be anchored (used when inserting an |
247 | * entry in the tree) |
248 | * @parent_ret: points to entry which would have been the parent of the entry, |
249 | * containing @offset |
250 | * |
251 | * Return a pointer to the entry that contains @offset byte address and don't change |
252 | * @node_ret and @parent_ret. |
253 | * |
254 | * If no such entry exists, return pointer to entry that ends before @offset |
255 | * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. |
256 | */ |
257 | static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, |
258 | u64 offset, |
259 | struct rb_node ***node_ret, |
260 | struct rb_node **parent_ret) |
261 | { |
262 | struct rb_root *root = &tree->state; |
263 | struct rb_node **node = &root->rb_node; |
264 | struct rb_node *prev = NULL; |
265 | struct extent_state *entry = NULL; |
266 | |
267 | while (*node) { |
268 | prev = *node; |
269 | entry = rb_entry(prev, struct extent_state, rb_node); |
270 | |
271 | if (offset < entry->start) |
272 | node = &(*node)->rb_left; |
273 | else if (offset > entry->end) |
274 | node = &(*node)->rb_right; |
275 | else |
276 | return entry; |
277 | } |
278 | |
279 | if (node_ret) |
280 | *node_ret = node; |
281 | if (parent_ret) |
282 | *parent_ret = prev; |
283 | |
284 | /* Search neighbors until we find the first one past the end */ |
285 | while (entry && offset > entry->end) |
286 | entry = next_state(state: entry); |
287 | |
288 | return entry; |
289 | } |
290 | |
291 | /* |
292 | * Search offset in the tree or fill neighbor rbtree node pointers. |
293 | * |
294 | * @tree: the tree to search |
295 | * @offset: offset that should fall within an entry in @tree |
296 | * @next_ret: pointer to the first entry whose range ends after @offset |
297 | * @prev_ret: pointer to the first entry whose range begins before @offset |
298 | * |
299 | * Return a pointer to the entry that contains @offset byte address. If no |
300 | * such entry exists, then return NULL and fill @prev_ret and @next_ret. |
301 | * Otherwise return the found entry and other pointers are left untouched. |
302 | */ |
303 | static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, |
304 | u64 offset, |
305 | struct extent_state **prev_ret, |
306 | struct extent_state **next_ret) |
307 | { |
308 | struct rb_root *root = &tree->state; |
309 | struct rb_node **node = &root->rb_node; |
310 | struct extent_state *orig_prev; |
311 | struct extent_state *entry = NULL; |
312 | |
313 | ASSERT(prev_ret); |
314 | ASSERT(next_ret); |
315 | |
316 | while (*node) { |
317 | entry = rb_entry(*node, struct extent_state, rb_node); |
318 | |
319 | if (offset < entry->start) |
320 | node = &(*node)->rb_left; |
321 | else if (offset > entry->end) |
322 | node = &(*node)->rb_right; |
323 | else |
324 | return entry; |
325 | } |
326 | |
327 | orig_prev = entry; |
328 | while (entry && offset > entry->end) |
329 | entry = next_state(state: entry); |
330 | *next_ret = entry; |
331 | entry = orig_prev; |
332 | |
333 | while (entry && offset < entry->start) |
334 | entry = prev_state(state: entry); |
335 | *prev_ret = entry; |
336 | |
337 | return NULL; |
338 | } |
339 | |
340 | /* |
341 | * Inexact rb-tree search, return the next entry if @offset is not found |
342 | */ |
343 | static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) |
344 | { |
345 | return tree_search_for_insert(tree, offset, NULL, NULL); |
346 | } |
347 | |
348 | static void extent_io_tree_panic(const struct extent_io_tree *tree, |
349 | const struct extent_state *state, |
350 | const char *opname, |
351 | int err) |
352 | { |
353 | btrfs_panic(extent_io_tree_to_fs_info(tree), err, |
354 | "extent io tree error on %s state start %llu end %llu" , |
355 | opname, state->start, state->end); |
356 | } |
357 | |
358 | static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) |
359 | { |
360 | struct extent_state *prev; |
361 | |
362 | prev = prev_state(state); |
363 | if (prev && prev->end == state->start - 1 && prev->state == state->state) { |
364 | if (is_inode_io_tree(tree)) |
365 | btrfs_merge_delalloc_extent(inode: extent_io_tree_to_inode(tree), |
366 | new: state, other: prev); |
367 | state->start = prev->start; |
368 | rb_erase(&prev->rb_node, &tree->state); |
369 | RB_CLEAR_NODE(&prev->rb_node); |
370 | free_extent_state(state: prev); |
371 | } |
372 | } |
373 | |
374 | static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) |
375 | { |
376 | struct extent_state *next; |
377 | |
378 | next = next_state(state); |
379 | if (next && next->start == state->end + 1 && next->state == state->state) { |
380 | if (is_inode_io_tree(tree)) |
381 | btrfs_merge_delalloc_extent(inode: extent_io_tree_to_inode(tree), |
382 | new: state, other: next); |
383 | state->end = next->end; |
384 | rb_erase(&next->rb_node, &tree->state); |
385 | RB_CLEAR_NODE(&next->rb_node); |
386 | free_extent_state(state: next); |
387 | } |
388 | } |
389 | |
390 | /* |
391 | * Utility function to look for merge candidates inside a given range. Any |
392 | * extents with matching state are merged together into a single extent in the |
393 | * tree. Extents with EXTENT_IO in their state field are not merged because |
394 | * the end_io handlers need to be able to do operations on them without |
395 | * sleeping (or doing allocations/splits). |
396 | * |
397 | * This should be called with the tree lock held. |
398 | */ |
399 | static void merge_state(struct extent_io_tree *tree, struct extent_state *state) |
400 | { |
401 | if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) |
402 | return; |
403 | |
404 | merge_prev_state(tree, state); |
405 | merge_next_state(tree, state); |
406 | } |
407 | |
408 | static void set_state_bits(struct extent_io_tree *tree, |
409 | struct extent_state *state, |
410 | u32 bits, struct extent_changeset *changeset) |
411 | { |
412 | u32 bits_to_set = bits & ~EXTENT_CTLBITS; |
413 | int ret; |
414 | |
415 | if (is_inode_io_tree(tree)) |
416 | btrfs_set_delalloc_extent(inode: extent_io_tree_to_inode(tree), state, bits); |
417 | |
418 | ret = add_extent_changeset(state, bits: bits_to_set, changeset, set: 1); |
419 | BUG_ON(ret < 0); |
420 | state->state |= bits_to_set; |
421 | } |
422 | |
423 | /* |
424 | * Insert an extent_state struct into the tree. 'bits' are set on the |
425 | * struct before it is inserted. |
426 | * |
427 | * Returns a pointer to the struct extent_state record containing the range |
428 | * requested for insertion, which may be the same as the given struct or it |
429 | * may be an existing record in the tree that was expanded to accommodate the |
430 | * requested range. In case of an extent_state different from the one that was |
431 | * given, the later can be freed or reused by the caller. |
432 | * |
433 | * On error it returns an error pointer. |
434 | * |
435 | * The tree lock is not taken internally. This is a utility function and |
436 | * probably isn't what you want to call (see set/clear_extent_bit). |
437 | */ |
438 | static struct extent_state *insert_state(struct extent_io_tree *tree, |
439 | struct extent_state *state, |
440 | u32 bits, |
441 | struct extent_changeset *changeset) |
442 | { |
443 | struct rb_node **node; |
444 | struct rb_node *parent = NULL; |
445 | const u64 start = state->start - 1; |
446 | const u64 end = state->end + 1; |
447 | const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); |
448 | |
449 | set_state_bits(tree, state, bits, changeset); |
450 | |
451 | node = &tree->state.rb_node; |
452 | while (*node) { |
453 | struct extent_state *entry; |
454 | |
455 | parent = *node; |
456 | entry = rb_entry(parent, struct extent_state, rb_node); |
457 | |
458 | if (state->end < entry->start) { |
459 | if (try_merge && end == entry->start && |
460 | state->state == entry->state) { |
461 | if (is_inode_io_tree(tree)) |
462 | btrfs_merge_delalloc_extent( |
463 | inode: extent_io_tree_to_inode(tree), |
464 | new: state, other: entry); |
465 | entry->start = state->start; |
466 | merge_prev_state(tree, state: entry); |
467 | state->state = 0; |
468 | return entry; |
469 | } |
470 | node = &(*node)->rb_left; |
471 | } else if (state->end > entry->end) { |
472 | if (try_merge && entry->end == start && |
473 | state->state == entry->state) { |
474 | if (is_inode_io_tree(tree)) |
475 | btrfs_merge_delalloc_extent( |
476 | inode: extent_io_tree_to_inode(tree), |
477 | new: state, other: entry); |
478 | entry->end = state->end; |
479 | merge_next_state(tree, state: entry); |
480 | state->state = 0; |
481 | return entry; |
482 | } |
483 | node = &(*node)->rb_right; |
484 | } else { |
485 | return ERR_PTR(error: -EEXIST); |
486 | } |
487 | } |
488 | |
489 | rb_link_node(node: &state->rb_node, parent, rb_link: node); |
490 | rb_insert_color(&state->rb_node, &tree->state); |
491 | |
492 | return state; |
493 | } |
494 | |
495 | /* |
496 | * Insert state to @tree to the location given by @node and @parent. |
497 | */ |
498 | static void insert_state_fast(struct extent_io_tree *tree, |
499 | struct extent_state *state, struct rb_node **node, |
500 | struct rb_node *parent, unsigned bits, |
501 | struct extent_changeset *changeset) |
502 | { |
503 | set_state_bits(tree, state, bits, changeset); |
504 | rb_link_node(node: &state->rb_node, parent, rb_link: node); |
505 | rb_insert_color(&state->rb_node, &tree->state); |
506 | merge_state(tree, state); |
507 | } |
508 | |
509 | /* |
510 | * Split a given extent state struct in two, inserting the preallocated |
511 | * struct 'prealloc' as the newly created second half. 'split' indicates an |
512 | * offset inside 'orig' where it should be split. |
513 | * |
514 | * Before calling, |
515 | * the tree has 'orig' at [orig->start, orig->end]. After calling, there |
516 | * are two extent state structs in the tree: |
517 | * prealloc: [orig->start, split - 1] |
518 | * orig: [ split, orig->end ] |
519 | * |
520 | * The tree locks are not taken by this function. They need to be held |
521 | * by the caller. |
522 | */ |
523 | static int split_state(struct extent_io_tree *tree, struct extent_state *orig, |
524 | struct extent_state *prealloc, u64 split) |
525 | { |
526 | struct rb_node *parent = NULL; |
527 | struct rb_node **node; |
528 | |
529 | if (is_inode_io_tree(tree)) |
530 | btrfs_split_delalloc_extent(inode: extent_io_tree_to_inode(tree), orig, |
531 | split); |
532 | |
533 | prealloc->start = orig->start; |
534 | prealloc->end = split - 1; |
535 | prealloc->state = orig->state; |
536 | orig->start = split; |
537 | |
538 | parent = &orig->rb_node; |
539 | node = &parent; |
540 | while (*node) { |
541 | struct extent_state *entry; |
542 | |
543 | parent = *node; |
544 | entry = rb_entry(parent, struct extent_state, rb_node); |
545 | |
546 | if (prealloc->end < entry->start) { |
547 | node = &(*node)->rb_left; |
548 | } else if (prealloc->end > entry->end) { |
549 | node = &(*node)->rb_right; |
550 | } else { |
551 | free_extent_state(state: prealloc); |
552 | return -EEXIST; |
553 | } |
554 | } |
555 | |
556 | rb_link_node(node: &prealloc->rb_node, parent, rb_link: node); |
557 | rb_insert_color(&prealloc->rb_node, &tree->state); |
558 | |
559 | return 0; |
560 | } |
561 | |
562 | /* |
563 | * Utility function to clear some bits in an extent state struct. It will |
564 | * optionally wake up anyone waiting on this state (wake == 1). |
565 | * |
566 | * If no bits are set on the state struct after clearing things, the |
567 | * struct is freed and removed from the tree |
568 | */ |
569 | static struct extent_state *clear_state_bit(struct extent_io_tree *tree, |
570 | struct extent_state *state, |
571 | u32 bits, int wake, |
572 | struct extent_changeset *changeset) |
573 | { |
574 | struct extent_state *next; |
575 | u32 bits_to_clear = bits & ~EXTENT_CTLBITS; |
576 | int ret; |
577 | |
578 | if (is_inode_io_tree(tree)) |
579 | btrfs_clear_delalloc_extent(inode: extent_io_tree_to_inode(tree), state, |
580 | bits); |
581 | |
582 | ret = add_extent_changeset(state, bits: bits_to_clear, changeset, set: 0); |
583 | BUG_ON(ret < 0); |
584 | state->state &= ~bits_to_clear; |
585 | if (wake) |
586 | wake_up(&state->wq); |
587 | if (state->state == 0) { |
588 | next = next_state(state); |
589 | if (extent_state_in_tree(state)) { |
590 | rb_erase(&state->rb_node, &tree->state); |
591 | RB_CLEAR_NODE(&state->rb_node); |
592 | free_extent_state(state); |
593 | } else { |
594 | WARN_ON(1); |
595 | } |
596 | } else { |
597 | merge_state(tree, state); |
598 | next = next_state(state); |
599 | } |
600 | return next; |
601 | } |
602 | |
603 | /* |
604 | * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, |
605 | * unset the EXTENT_NOWAIT bit. |
606 | */ |
607 | static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) |
608 | { |
609 | *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); |
610 | *bits &= EXTENT_NOWAIT - 1; |
611 | } |
612 | |
613 | /* |
614 | * Clear some bits on a range in the tree. This may require splitting or |
615 | * inserting elements in the tree, so the gfp mask is used to indicate which |
616 | * allocations or sleeping are allowed. |
617 | * |
618 | * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given |
619 | * range from the tree regardless of state (ie for truncate). |
620 | * |
621 | * The range [start, end] is inclusive. |
622 | * |
623 | * This takes the tree lock, and returns 0 on success and < 0 on error. |
624 | */ |
625 | int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
626 | u32 bits, struct extent_state **cached_state, |
627 | struct extent_changeset *changeset) |
628 | { |
629 | struct extent_state *state; |
630 | struct extent_state *cached; |
631 | struct extent_state *prealloc = NULL; |
632 | u64 last_end; |
633 | int err; |
634 | int clear = 0; |
635 | int wake; |
636 | int delete = (bits & EXTENT_CLEAR_ALL_BITS); |
637 | gfp_t mask; |
638 | |
639 | set_gfp_mask_from_bits(bits: &bits, mask: &mask); |
640 | btrfs_debug_check_extent_io_range(tree, start, end); |
641 | trace_btrfs_clear_extent_bit(tree, start, len: end - start + 1, clear_bits: bits); |
642 | |
643 | if (delete) |
644 | bits |= ~EXTENT_CTLBITS; |
645 | |
646 | if (bits & EXTENT_DELALLOC) |
647 | bits |= EXTENT_NORESERVE; |
648 | |
649 | wake = (bits & EXTENT_LOCKED) ? 1 : 0; |
650 | if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) |
651 | clear = 1; |
652 | again: |
653 | if (!prealloc) { |
654 | /* |
655 | * Don't care for allocation failure here because we might end |
656 | * up not needing the pre-allocated extent state at all, which |
657 | * is the case if we only have in the tree extent states that |
658 | * cover our input range and don't cover too any other range. |
659 | * If we end up needing a new extent state we allocate it later. |
660 | */ |
661 | prealloc = alloc_extent_state(mask); |
662 | } |
663 | |
664 | spin_lock(lock: &tree->lock); |
665 | if (cached_state) { |
666 | cached = *cached_state; |
667 | |
668 | if (clear) { |
669 | *cached_state = NULL; |
670 | cached_state = NULL; |
671 | } |
672 | |
673 | if (cached && extent_state_in_tree(state: cached) && |
674 | cached->start <= start && cached->end > start) { |
675 | if (clear) |
676 | refcount_dec(r: &cached->refs); |
677 | state = cached; |
678 | goto hit_next; |
679 | } |
680 | if (clear) |
681 | free_extent_state(state: cached); |
682 | } |
683 | |
684 | /* This search will find the extents that end after our range starts. */ |
685 | state = tree_search(tree, offset: start); |
686 | if (!state) |
687 | goto out; |
688 | hit_next: |
689 | if (state->start > end) |
690 | goto out; |
691 | WARN_ON(state->end < start); |
692 | last_end = state->end; |
693 | |
694 | /* The state doesn't have the wanted bits, go ahead. */ |
695 | if (!(state->state & bits)) { |
696 | state = next_state(state); |
697 | goto next; |
698 | } |
699 | |
700 | /* |
701 | * | ---- desired range ---- | |
702 | * | state | or |
703 | * | ------------- state -------------- | |
704 | * |
705 | * We need to split the extent we found, and may flip bits on second |
706 | * half. |
707 | * |
708 | * If the extent we found extends past our range, we just split and |
709 | * search again. It'll get split again the next time though. |
710 | * |
711 | * If the extent we found is inside our range, we clear the desired bit |
712 | * on it. |
713 | */ |
714 | |
715 | if (state->start < start) { |
716 | prealloc = alloc_extent_state_atomic(prealloc); |
717 | if (!prealloc) |
718 | goto search_again; |
719 | err = split_state(tree, orig: state, prealloc, split: start); |
720 | if (err) |
721 | extent_io_tree_panic(tree, state, opname: "split" , err); |
722 | |
723 | prealloc = NULL; |
724 | if (err) |
725 | goto out; |
726 | if (state->end <= end) { |
727 | state = clear_state_bit(tree, state, bits, wake, changeset); |
728 | goto next; |
729 | } |
730 | goto search_again; |
731 | } |
732 | /* |
733 | * | ---- desired range ---- | |
734 | * | state | |
735 | * We need to split the extent, and clear the bit on the first half. |
736 | */ |
737 | if (state->start <= end && state->end > end) { |
738 | prealloc = alloc_extent_state_atomic(prealloc); |
739 | if (!prealloc) |
740 | goto search_again; |
741 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
742 | if (err) |
743 | extent_io_tree_panic(tree, state, opname: "split" , err); |
744 | |
745 | if (wake) |
746 | wake_up(&state->wq); |
747 | |
748 | clear_state_bit(tree, state: prealloc, bits, wake, changeset); |
749 | |
750 | prealloc = NULL; |
751 | goto out; |
752 | } |
753 | |
754 | state = clear_state_bit(tree, state, bits, wake, changeset); |
755 | next: |
756 | if (last_end == (u64)-1) |
757 | goto out; |
758 | start = last_end + 1; |
759 | if (start <= end && state && !need_resched()) |
760 | goto hit_next; |
761 | |
762 | search_again: |
763 | if (start > end) |
764 | goto out; |
765 | spin_unlock(lock: &tree->lock); |
766 | if (gfpflags_allow_blocking(gfp_flags: mask)) |
767 | cond_resched(); |
768 | goto again; |
769 | |
770 | out: |
771 | spin_unlock(lock: &tree->lock); |
772 | if (prealloc) |
773 | free_extent_state(state: prealloc); |
774 | |
775 | return 0; |
776 | |
777 | } |
778 | |
779 | /* |
780 | * Wait for one or more bits to clear on a range in the state tree. |
781 | * The range [start, end] is inclusive. |
782 | * The tree lock is taken by this function |
783 | */ |
784 | static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
785 | u32 bits, struct extent_state **cached_state) |
786 | { |
787 | struct extent_state *state; |
788 | |
789 | btrfs_debug_check_extent_io_range(tree, start, end); |
790 | |
791 | spin_lock(lock: &tree->lock); |
792 | again: |
793 | /* |
794 | * Maintain cached_state, as we may not remove it from the tree if there |
795 | * are more bits than the bits we're waiting on set on this state. |
796 | */ |
797 | if (cached_state && *cached_state) { |
798 | state = *cached_state; |
799 | if (extent_state_in_tree(state) && |
800 | state->start <= start && start < state->end) |
801 | goto process_node; |
802 | } |
803 | while (1) { |
804 | /* |
805 | * This search will find all the extents that end after our |
806 | * range starts. |
807 | */ |
808 | state = tree_search(tree, offset: start); |
809 | process_node: |
810 | if (!state) |
811 | break; |
812 | if (state->start > end) |
813 | goto out; |
814 | |
815 | if (state->state & bits) { |
816 | DEFINE_WAIT(wait); |
817 | |
818 | start = state->start; |
819 | refcount_inc(r: &state->refs); |
820 | prepare_to_wait(wq_head: &state->wq, wq_entry: &wait, TASK_UNINTERRUPTIBLE); |
821 | spin_unlock(lock: &tree->lock); |
822 | schedule(); |
823 | spin_lock(lock: &tree->lock); |
824 | finish_wait(wq_head: &state->wq, wq_entry: &wait); |
825 | free_extent_state(state); |
826 | goto again; |
827 | } |
828 | start = state->end + 1; |
829 | |
830 | if (start > end) |
831 | break; |
832 | |
833 | if (!cond_resched_lock(&tree->lock)) { |
834 | state = next_state(state); |
835 | goto process_node; |
836 | } |
837 | } |
838 | out: |
839 | /* This state is no longer useful, clear it and free it up. */ |
840 | if (cached_state && *cached_state) { |
841 | state = *cached_state; |
842 | *cached_state = NULL; |
843 | free_extent_state(state); |
844 | } |
845 | spin_unlock(lock: &tree->lock); |
846 | } |
847 | |
848 | static void cache_state_if_flags(struct extent_state *state, |
849 | struct extent_state **cached_ptr, |
850 | unsigned flags) |
851 | { |
852 | if (cached_ptr && !(*cached_ptr)) { |
853 | if (!flags || (state->state & flags)) { |
854 | *cached_ptr = state; |
855 | refcount_inc(r: &state->refs); |
856 | } |
857 | } |
858 | } |
859 | |
860 | static void cache_state(struct extent_state *state, |
861 | struct extent_state **cached_ptr) |
862 | { |
863 | return cache_state_if_flags(state, cached_ptr, |
864 | flags: EXTENT_LOCKED | EXTENT_BOUNDARY); |
865 | } |
866 | |
867 | /* |
868 | * Find the first state struct with 'bits' set after 'start', and return it. |
869 | * tree->lock must be held. NULL will returned if nothing was found after |
870 | * 'start'. |
871 | */ |
872 | static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, |
873 | u64 start, u32 bits) |
874 | { |
875 | struct extent_state *state; |
876 | |
877 | /* |
878 | * This search will find all the extents that end after our range |
879 | * starts. |
880 | */ |
881 | state = tree_search(tree, offset: start); |
882 | while (state) { |
883 | if (state->end >= start && (state->state & bits)) |
884 | return state; |
885 | state = next_state(state); |
886 | } |
887 | return NULL; |
888 | } |
889 | |
890 | /* |
891 | * Find the first offset in the io tree with one or more @bits set. |
892 | * |
893 | * Note: If there are multiple bits set in @bits, any of them will match. |
894 | * |
895 | * Return true if we find something, and update @start_ret and @end_ret. |
896 | * Return false if we found nothing. |
897 | */ |
898 | bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, |
899 | u64 *start_ret, u64 *end_ret, u32 bits, |
900 | struct extent_state **cached_state) |
901 | { |
902 | struct extent_state *state; |
903 | bool ret = false; |
904 | |
905 | spin_lock(lock: &tree->lock); |
906 | if (cached_state && *cached_state) { |
907 | state = *cached_state; |
908 | if (state->end == start - 1 && extent_state_in_tree(state)) { |
909 | while ((state = next_state(state)) != NULL) { |
910 | if (state->state & bits) |
911 | break; |
912 | } |
913 | /* |
914 | * If we found the next extent state, clear cached_state |
915 | * so that we can cache the next extent state below and |
916 | * avoid future calls going over the same extent state |
917 | * again. If we haven't found any, clear as well since |
918 | * it's now useless. |
919 | */ |
920 | free_extent_state(state: *cached_state); |
921 | *cached_state = NULL; |
922 | if (state) |
923 | goto got_it; |
924 | goto out; |
925 | } |
926 | free_extent_state(state: *cached_state); |
927 | *cached_state = NULL; |
928 | } |
929 | |
930 | state = find_first_extent_bit_state(tree, start, bits); |
931 | got_it: |
932 | if (state) { |
933 | cache_state_if_flags(state, cached_ptr: cached_state, flags: 0); |
934 | *start_ret = state->start; |
935 | *end_ret = state->end; |
936 | ret = true; |
937 | } |
938 | out: |
939 | spin_unlock(lock: &tree->lock); |
940 | return ret; |
941 | } |
942 | |
943 | /* |
944 | * Find a contiguous area of bits |
945 | * |
946 | * @tree: io tree to check |
947 | * @start: offset to start the search from |
948 | * @start_ret: the first offset we found with the bits set |
949 | * @end_ret: the final contiguous range of the bits that were set |
950 | * @bits: bits to look for |
951 | * |
952 | * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges |
953 | * to set bits appropriately, and then merge them again. During this time it |
954 | * will drop the tree->lock, so use this helper if you want to find the actual |
955 | * contiguous area for given bits. We will search to the first bit we find, and |
956 | * then walk down the tree until we find a non-contiguous area. The area |
957 | * returned will be the full contiguous area with the bits set. |
958 | */ |
959 | int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, |
960 | u64 *start_ret, u64 *end_ret, u32 bits) |
961 | { |
962 | struct extent_state *state; |
963 | int ret = 1; |
964 | |
965 | ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); |
966 | |
967 | spin_lock(lock: &tree->lock); |
968 | state = find_first_extent_bit_state(tree, start, bits); |
969 | if (state) { |
970 | *start_ret = state->start; |
971 | *end_ret = state->end; |
972 | while ((state = next_state(state)) != NULL) { |
973 | if (state->start > (*end_ret + 1)) |
974 | break; |
975 | *end_ret = state->end; |
976 | } |
977 | ret = 0; |
978 | } |
979 | spin_unlock(lock: &tree->lock); |
980 | return ret; |
981 | } |
982 | |
983 | /* |
984 | * Find a contiguous range of bytes in the file marked as delalloc, not more |
985 | * than 'max_bytes'. start and end are used to return the range, |
986 | * |
987 | * True is returned if we find something, false if nothing was in the tree. |
988 | */ |
989 | bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, |
990 | u64 *end, u64 max_bytes, |
991 | struct extent_state **cached_state) |
992 | { |
993 | struct extent_state *state; |
994 | u64 cur_start = *start; |
995 | bool found = false; |
996 | u64 total_bytes = 0; |
997 | |
998 | spin_lock(lock: &tree->lock); |
999 | |
1000 | /* |
1001 | * This search will find all the extents that end after our range |
1002 | * starts. |
1003 | */ |
1004 | state = tree_search(tree, offset: cur_start); |
1005 | if (!state) { |
1006 | *end = (u64)-1; |
1007 | goto out; |
1008 | } |
1009 | |
1010 | while (state) { |
1011 | if (found && (state->start != cur_start || |
1012 | (state->state & EXTENT_BOUNDARY))) { |
1013 | goto out; |
1014 | } |
1015 | if (!(state->state & EXTENT_DELALLOC)) { |
1016 | if (!found) |
1017 | *end = state->end; |
1018 | goto out; |
1019 | } |
1020 | if (!found) { |
1021 | *start = state->start; |
1022 | *cached_state = state; |
1023 | refcount_inc(r: &state->refs); |
1024 | } |
1025 | found = true; |
1026 | *end = state->end; |
1027 | cur_start = state->end + 1; |
1028 | total_bytes += state->end - state->start + 1; |
1029 | if (total_bytes >= max_bytes) |
1030 | break; |
1031 | state = next_state(state); |
1032 | } |
1033 | out: |
1034 | spin_unlock(lock: &tree->lock); |
1035 | return found; |
1036 | } |
1037 | |
1038 | /* |
1039 | * Set some bits on a range in the tree. This may require allocations or |
1040 | * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for |
1041 | * GFP_NOWAIT. |
1042 | * |
1043 | * If any of the exclusive bits are set, this will fail with -EEXIST if some |
1044 | * part of the range already has the desired bits set. The extent_state of the |
1045 | * existing range is returned in failed_state in this case, and the start of the |
1046 | * existing range is returned in failed_start. failed_state is used as an |
1047 | * optimization for wait_extent_bit, failed_start must be used as the source of |
1048 | * truth as failed_state may have changed since we returned. |
1049 | * |
1050 | * [start, end] is inclusive This takes the tree lock. |
1051 | */ |
1052 | static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1053 | u32 bits, u64 *failed_start, |
1054 | struct extent_state **failed_state, |
1055 | struct extent_state **cached_state, |
1056 | struct extent_changeset *changeset) |
1057 | { |
1058 | struct extent_state *state; |
1059 | struct extent_state *prealloc = NULL; |
1060 | struct rb_node **p = NULL; |
1061 | struct rb_node *parent = NULL; |
1062 | int err = 0; |
1063 | u64 last_start; |
1064 | u64 last_end; |
1065 | u32 exclusive_bits = (bits & EXTENT_LOCKED); |
1066 | gfp_t mask; |
1067 | |
1068 | set_gfp_mask_from_bits(bits: &bits, mask: &mask); |
1069 | btrfs_debug_check_extent_io_range(tree, start, end); |
1070 | trace_btrfs_set_extent_bit(tree, start, len: end - start + 1, set_bits: bits); |
1071 | |
1072 | if (exclusive_bits) |
1073 | ASSERT(failed_start); |
1074 | else |
1075 | ASSERT(failed_start == NULL && failed_state == NULL); |
1076 | again: |
1077 | if (!prealloc) { |
1078 | /* |
1079 | * Don't care for allocation failure here because we might end |
1080 | * up not needing the pre-allocated extent state at all, which |
1081 | * is the case if we only have in the tree extent states that |
1082 | * cover our input range and don't cover too any other range. |
1083 | * If we end up needing a new extent state we allocate it later. |
1084 | */ |
1085 | prealloc = alloc_extent_state(mask); |
1086 | } |
1087 | |
1088 | spin_lock(lock: &tree->lock); |
1089 | if (cached_state && *cached_state) { |
1090 | state = *cached_state; |
1091 | if (state->start <= start && state->end > start && |
1092 | extent_state_in_tree(state)) |
1093 | goto hit_next; |
1094 | } |
1095 | /* |
1096 | * This search will find all the extents that end after our range |
1097 | * starts. |
1098 | */ |
1099 | state = tree_search_for_insert(tree, offset: start, node_ret: &p, parent_ret: &parent); |
1100 | if (!state) { |
1101 | prealloc = alloc_extent_state_atomic(prealloc); |
1102 | if (!prealloc) |
1103 | goto search_again; |
1104 | prealloc->start = start; |
1105 | prealloc->end = end; |
1106 | insert_state_fast(tree, state: prealloc, node: p, parent, bits, changeset); |
1107 | cache_state(state: prealloc, cached_ptr: cached_state); |
1108 | prealloc = NULL; |
1109 | goto out; |
1110 | } |
1111 | hit_next: |
1112 | last_start = state->start; |
1113 | last_end = state->end; |
1114 | |
1115 | /* |
1116 | * | ---- desired range ---- | |
1117 | * | state | |
1118 | * |
1119 | * Just lock what we found and keep going |
1120 | */ |
1121 | if (state->start == start && state->end <= end) { |
1122 | if (state->state & exclusive_bits) { |
1123 | *failed_start = state->start; |
1124 | cache_state(state, cached_ptr: failed_state); |
1125 | err = -EEXIST; |
1126 | goto out; |
1127 | } |
1128 | |
1129 | set_state_bits(tree, state, bits, changeset); |
1130 | cache_state(state, cached_ptr: cached_state); |
1131 | merge_state(tree, state); |
1132 | if (last_end == (u64)-1) |
1133 | goto out; |
1134 | start = last_end + 1; |
1135 | state = next_state(state); |
1136 | if (start < end && state && state->start == start && |
1137 | !need_resched()) |
1138 | goto hit_next; |
1139 | goto search_again; |
1140 | } |
1141 | |
1142 | /* |
1143 | * | ---- desired range ---- | |
1144 | * | state | |
1145 | * or |
1146 | * | ------------- state -------------- | |
1147 | * |
1148 | * We need to split the extent we found, and may flip bits on second |
1149 | * half. |
1150 | * |
1151 | * If the extent we found extends past our range, we just split and |
1152 | * search again. It'll get split again the next time though. |
1153 | * |
1154 | * If the extent we found is inside our range, we set the desired bit |
1155 | * on it. |
1156 | */ |
1157 | if (state->start < start) { |
1158 | if (state->state & exclusive_bits) { |
1159 | *failed_start = start; |
1160 | cache_state(state, cached_ptr: failed_state); |
1161 | err = -EEXIST; |
1162 | goto out; |
1163 | } |
1164 | |
1165 | /* |
1166 | * If this extent already has all the bits we want set, then |
1167 | * skip it, not necessary to split it or do anything with it. |
1168 | */ |
1169 | if ((state->state & bits) == bits) { |
1170 | start = state->end + 1; |
1171 | cache_state(state, cached_ptr: cached_state); |
1172 | goto search_again; |
1173 | } |
1174 | |
1175 | prealloc = alloc_extent_state_atomic(prealloc); |
1176 | if (!prealloc) |
1177 | goto search_again; |
1178 | err = split_state(tree, orig: state, prealloc, split: start); |
1179 | if (err) |
1180 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1181 | |
1182 | prealloc = NULL; |
1183 | if (err) |
1184 | goto out; |
1185 | if (state->end <= end) { |
1186 | set_state_bits(tree, state, bits, changeset); |
1187 | cache_state(state, cached_ptr: cached_state); |
1188 | merge_state(tree, state); |
1189 | if (last_end == (u64)-1) |
1190 | goto out; |
1191 | start = last_end + 1; |
1192 | state = next_state(state); |
1193 | if (start < end && state && state->start == start && |
1194 | !need_resched()) |
1195 | goto hit_next; |
1196 | } |
1197 | goto search_again; |
1198 | } |
1199 | /* |
1200 | * | ---- desired range ---- | |
1201 | * | state | or | state | |
1202 | * |
1203 | * There's a hole, we need to insert something in it and ignore the |
1204 | * extent we found. |
1205 | */ |
1206 | if (state->start > start) { |
1207 | u64 this_end; |
1208 | struct extent_state *inserted_state; |
1209 | |
1210 | if (end < last_start) |
1211 | this_end = end; |
1212 | else |
1213 | this_end = last_start - 1; |
1214 | |
1215 | prealloc = alloc_extent_state_atomic(prealloc); |
1216 | if (!prealloc) |
1217 | goto search_again; |
1218 | |
1219 | /* |
1220 | * Avoid to free 'prealloc' if it can be merged with the later |
1221 | * extent. |
1222 | */ |
1223 | prealloc->start = start; |
1224 | prealloc->end = this_end; |
1225 | inserted_state = insert_state(tree, state: prealloc, bits, changeset); |
1226 | if (IS_ERR(ptr: inserted_state)) { |
1227 | err = PTR_ERR(ptr: inserted_state); |
1228 | extent_io_tree_panic(tree, state: prealloc, opname: "insert" , err); |
1229 | } |
1230 | |
1231 | cache_state(state: inserted_state, cached_ptr: cached_state); |
1232 | if (inserted_state == prealloc) |
1233 | prealloc = NULL; |
1234 | start = this_end + 1; |
1235 | goto search_again; |
1236 | } |
1237 | /* |
1238 | * | ---- desired range ---- | |
1239 | * | state | |
1240 | * |
1241 | * We need to split the extent, and set the bit on the first half |
1242 | */ |
1243 | if (state->start <= end && state->end > end) { |
1244 | if (state->state & exclusive_bits) { |
1245 | *failed_start = start; |
1246 | cache_state(state, cached_ptr: failed_state); |
1247 | err = -EEXIST; |
1248 | goto out; |
1249 | } |
1250 | |
1251 | prealloc = alloc_extent_state_atomic(prealloc); |
1252 | if (!prealloc) |
1253 | goto search_again; |
1254 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
1255 | if (err) |
1256 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1257 | |
1258 | set_state_bits(tree, state: prealloc, bits, changeset); |
1259 | cache_state(state: prealloc, cached_ptr: cached_state); |
1260 | merge_state(tree, state: prealloc); |
1261 | prealloc = NULL; |
1262 | goto out; |
1263 | } |
1264 | |
1265 | search_again: |
1266 | if (start > end) |
1267 | goto out; |
1268 | spin_unlock(lock: &tree->lock); |
1269 | if (gfpflags_allow_blocking(gfp_flags: mask)) |
1270 | cond_resched(); |
1271 | goto again; |
1272 | |
1273 | out: |
1274 | spin_unlock(lock: &tree->lock); |
1275 | if (prealloc) |
1276 | free_extent_state(state: prealloc); |
1277 | |
1278 | return err; |
1279 | |
1280 | } |
1281 | |
1282 | int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1283 | u32 bits, struct extent_state **cached_state) |
1284 | { |
1285 | return __set_extent_bit(tree, start, end, bits, NULL, NULL, |
1286 | cached_state, NULL); |
1287 | } |
1288 | |
1289 | /* |
1290 | * Convert all bits in a given range from one bit to another |
1291 | * |
1292 | * @tree: the io tree to search |
1293 | * @start: the start offset in bytes |
1294 | * @end: the end offset in bytes (inclusive) |
1295 | * @bits: the bits to set in this range |
1296 | * @clear_bits: the bits to clear in this range |
1297 | * @cached_state: state that we're going to cache |
1298 | * |
1299 | * This will go through and set bits for the given range. If any states exist |
1300 | * already in this range they are set with the given bit and cleared of the |
1301 | * clear_bits. This is only meant to be used by things that are mergeable, ie. |
1302 | * converting from say DELALLOC to DIRTY. This is not meant to be used with |
1303 | * boundary bits like LOCK. |
1304 | * |
1305 | * All allocations are done with GFP_NOFS. |
1306 | */ |
1307 | int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1308 | u32 bits, u32 clear_bits, |
1309 | struct extent_state **cached_state) |
1310 | { |
1311 | struct extent_state *state; |
1312 | struct extent_state *prealloc = NULL; |
1313 | struct rb_node **p = NULL; |
1314 | struct rb_node *parent = NULL; |
1315 | int err = 0; |
1316 | u64 last_start; |
1317 | u64 last_end; |
1318 | bool first_iteration = true; |
1319 | |
1320 | btrfs_debug_check_extent_io_range(tree, start, end); |
1321 | trace_btrfs_convert_extent_bit(tree, start, len: end - start + 1, set_bits: bits, |
1322 | clear_bits); |
1323 | |
1324 | again: |
1325 | if (!prealloc) { |
1326 | /* |
1327 | * Best effort, don't worry if extent state allocation fails |
1328 | * here for the first iteration. We might have a cached state |
1329 | * that matches exactly the target range, in which case no |
1330 | * extent state allocations are needed. We'll only know this |
1331 | * after locking the tree. |
1332 | */ |
1333 | prealloc = alloc_extent_state(GFP_NOFS); |
1334 | if (!prealloc && !first_iteration) |
1335 | return -ENOMEM; |
1336 | } |
1337 | |
1338 | spin_lock(lock: &tree->lock); |
1339 | if (cached_state && *cached_state) { |
1340 | state = *cached_state; |
1341 | if (state->start <= start && state->end > start && |
1342 | extent_state_in_tree(state)) |
1343 | goto hit_next; |
1344 | } |
1345 | |
1346 | /* |
1347 | * This search will find all the extents that end after our range |
1348 | * starts. |
1349 | */ |
1350 | state = tree_search_for_insert(tree, offset: start, node_ret: &p, parent_ret: &parent); |
1351 | if (!state) { |
1352 | prealloc = alloc_extent_state_atomic(prealloc); |
1353 | if (!prealloc) { |
1354 | err = -ENOMEM; |
1355 | goto out; |
1356 | } |
1357 | prealloc->start = start; |
1358 | prealloc->end = end; |
1359 | insert_state_fast(tree, state: prealloc, node: p, parent, bits, NULL); |
1360 | cache_state(state: prealloc, cached_ptr: cached_state); |
1361 | prealloc = NULL; |
1362 | goto out; |
1363 | } |
1364 | hit_next: |
1365 | last_start = state->start; |
1366 | last_end = state->end; |
1367 | |
1368 | /* |
1369 | * | ---- desired range ---- | |
1370 | * | state | |
1371 | * |
1372 | * Just lock what we found and keep going. |
1373 | */ |
1374 | if (state->start == start && state->end <= end) { |
1375 | set_state_bits(tree, state, bits, NULL); |
1376 | cache_state(state, cached_ptr: cached_state); |
1377 | state = clear_state_bit(tree, state, bits: clear_bits, wake: 0, NULL); |
1378 | if (last_end == (u64)-1) |
1379 | goto out; |
1380 | start = last_end + 1; |
1381 | if (start < end && state && state->start == start && |
1382 | !need_resched()) |
1383 | goto hit_next; |
1384 | goto search_again; |
1385 | } |
1386 | |
1387 | /* |
1388 | * | ---- desired range ---- | |
1389 | * | state | |
1390 | * or |
1391 | * | ------------- state -------------- | |
1392 | * |
1393 | * We need to split the extent we found, and may flip bits on second |
1394 | * half. |
1395 | * |
1396 | * If the extent we found extends past our range, we just split and |
1397 | * search again. It'll get split again the next time though. |
1398 | * |
1399 | * If the extent we found is inside our range, we set the desired bit |
1400 | * on it. |
1401 | */ |
1402 | if (state->start < start) { |
1403 | prealloc = alloc_extent_state_atomic(prealloc); |
1404 | if (!prealloc) { |
1405 | err = -ENOMEM; |
1406 | goto out; |
1407 | } |
1408 | err = split_state(tree, orig: state, prealloc, split: start); |
1409 | if (err) |
1410 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1411 | prealloc = NULL; |
1412 | if (err) |
1413 | goto out; |
1414 | if (state->end <= end) { |
1415 | set_state_bits(tree, state, bits, NULL); |
1416 | cache_state(state, cached_ptr: cached_state); |
1417 | state = clear_state_bit(tree, state, bits: clear_bits, wake: 0, NULL); |
1418 | if (last_end == (u64)-1) |
1419 | goto out; |
1420 | start = last_end + 1; |
1421 | if (start < end && state && state->start == start && |
1422 | !need_resched()) |
1423 | goto hit_next; |
1424 | } |
1425 | goto search_again; |
1426 | } |
1427 | /* |
1428 | * | ---- desired range ---- | |
1429 | * | state | or | state | |
1430 | * |
1431 | * There's a hole, we need to insert something in it and ignore the |
1432 | * extent we found. |
1433 | */ |
1434 | if (state->start > start) { |
1435 | u64 this_end; |
1436 | struct extent_state *inserted_state; |
1437 | |
1438 | if (end < last_start) |
1439 | this_end = end; |
1440 | else |
1441 | this_end = last_start - 1; |
1442 | |
1443 | prealloc = alloc_extent_state_atomic(prealloc); |
1444 | if (!prealloc) { |
1445 | err = -ENOMEM; |
1446 | goto out; |
1447 | } |
1448 | |
1449 | /* |
1450 | * Avoid to free 'prealloc' if it can be merged with the later |
1451 | * extent. |
1452 | */ |
1453 | prealloc->start = start; |
1454 | prealloc->end = this_end; |
1455 | inserted_state = insert_state(tree, state: prealloc, bits, NULL); |
1456 | if (IS_ERR(ptr: inserted_state)) { |
1457 | err = PTR_ERR(ptr: inserted_state); |
1458 | extent_io_tree_panic(tree, state: prealloc, opname: "insert" , err); |
1459 | } |
1460 | cache_state(state: inserted_state, cached_ptr: cached_state); |
1461 | if (inserted_state == prealloc) |
1462 | prealloc = NULL; |
1463 | start = this_end + 1; |
1464 | goto search_again; |
1465 | } |
1466 | /* |
1467 | * | ---- desired range ---- | |
1468 | * | state | |
1469 | * |
1470 | * We need to split the extent, and set the bit on the first half. |
1471 | */ |
1472 | if (state->start <= end && state->end > end) { |
1473 | prealloc = alloc_extent_state_atomic(prealloc); |
1474 | if (!prealloc) { |
1475 | err = -ENOMEM; |
1476 | goto out; |
1477 | } |
1478 | |
1479 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
1480 | if (err) |
1481 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1482 | |
1483 | set_state_bits(tree, state: prealloc, bits, NULL); |
1484 | cache_state(state: prealloc, cached_ptr: cached_state); |
1485 | clear_state_bit(tree, state: prealloc, bits: clear_bits, wake: 0, NULL); |
1486 | prealloc = NULL; |
1487 | goto out; |
1488 | } |
1489 | |
1490 | search_again: |
1491 | if (start > end) |
1492 | goto out; |
1493 | spin_unlock(lock: &tree->lock); |
1494 | cond_resched(); |
1495 | first_iteration = false; |
1496 | goto again; |
1497 | |
1498 | out: |
1499 | spin_unlock(lock: &tree->lock); |
1500 | if (prealloc) |
1501 | free_extent_state(state: prealloc); |
1502 | |
1503 | return err; |
1504 | } |
1505 | |
1506 | /* |
1507 | * Find the first range that has @bits not set. This range could start before |
1508 | * @start. |
1509 | * |
1510 | * @tree: the tree to search |
1511 | * @start: offset at/after which the found extent should start |
1512 | * @start_ret: records the beginning of the range |
1513 | * @end_ret: records the end of the range (inclusive) |
1514 | * @bits: the set of bits which must be unset |
1515 | * |
1516 | * Since unallocated range is also considered one which doesn't have the bits |
1517 | * set it's possible that @end_ret contains -1, this happens in case the range |
1518 | * spans (last_range_end, end of device]. In this case it's up to the caller to |
1519 | * trim @end_ret to the appropriate size. |
1520 | */ |
1521 | void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, |
1522 | u64 *start_ret, u64 *end_ret, u32 bits) |
1523 | { |
1524 | struct extent_state *state; |
1525 | struct extent_state *prev = NULL, *next = NULL; |
1526 | |
1527 | spin_lock(lock: &tree->lock); |
1528 | |
1529 | /* Find first extent with bits cleared */ |
1530 | while (1) { |
1531 | state = tree_search_prev_next(tree, offset: start, prev_ret: &prev, next_ret: &next); |
1532 | if (!state && !next && !prev) { |
1533 | /* |
1534 | * Tree is completely empty, send full range and let |
1535 | * caller deal with it |
1536 | */ |
1537 | *start_ret = 0; |
1538 | *end_ret = -1; |
1539 | goto out; |
1540 | } else if (!state && !next) { |
1541 | /* |
1542 | * We are past the last allocated chunk, set start at |
1543 | * the end of the last extent. |
1544 | */ |
1545 | *start_ret = prev->end + 1; |
1546 | *end_ret = -1; |
1547 | goto out; |
1548 | } else if (!state) { |
1549 | state = next; |
1550 | } |
1551 | |
1552 | /* |
1553 | * At this point 'state' either contains 'start' or start is |
1554 | * before 'state' |
1555 | */ |
1556 | if (in_range(start, state->start, state->end - state->start + 1)) { |
1557 | if (state->state & bits) { |
1558 | /* |
1559 | * |--range with bits sets--| |
1560 | * | |
1561 | * start |
1562 | */ |
1563 | start = state->end + 1; |
1564 | } else { |
1565 | /* |
1566 | * 'start' falls within a range that doesn't |
1567 | * have the bits set, so take its start as the |
1568 | * beginning of the desired range |
1569 | * |
1570 | * |--range with bits cleared----| |
1571 | * | |
1572 | * start |
1573 | */ |
1574 | *start_ret = state->start; |
1575 | break; |
1576 | } |
1577 | } else { |
1578 | /* |
1579 | * |---prev range---|---hole/unset---|---node range---| |
1580 | * | |
1581 | * start |
1582 | * |
1583 | * or |
1584 | * |
1585 | * |---hole/unset--||--first node--| |
1586 | * 0 | |
1587 | * start |
1588 | */ |
1589 | if (prev) |
1590 | *start_ret = prev->end + 1; |
1591 | else |
1592 | *start_ret = 0; |
1593 | break; |
1594 | } |
1595 | } |
1596 | |
1597 | /* |
1598 | * Find the longest stretch from start until an entry which has the |
1599 | * bits set |
1600 | */ |
1601 | while (state) { |
1602 | if (state->end >= start && !(state->state & bits)) { |
1603 | *end_ret = state->end; |
1604 | } else { |
1605 | *end_ret = state->start - 1; |
1606 | break; |
1607 | } |
1608 | state = next_state(state); |
1609 | } |
1610 | out: |
1611 | spin_unlock(lock: &tree->lock); |
1612 | } |
1613 | |
/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search
 *                stops once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speed up searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set. If the returned number of bytes is greater than zero
 * then @start is updated with the offset of the first byte with the bits set.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig,
		     struct extent_state **cached_state)
{
	struct extent_state *state = NULL;
	struct extent_state *cached;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end < cur_start))
		return 0;

	spin_lock(&tree->lock);

	if (!cached_state || !*cached_state)
		goto search;

	cached = *cached_state;

	if (!extent_state_in_tree(cached))
		goto search;

	if (cached->start <= cur_start && cur_start <= cached->end) {
		state = cached;
	} else if (cached->start > cur_start) {
		struct extent_state *prev;

		/*
		 * The cached state starts after our search range's start. Check
		 * if the previous state record starts at or before the range we
		 * are looking for, and if so, use it - this is a common case
		 * when there are holes between records in the tree. If there is
		 * no previous state record, we can start from our cached state.
		 */
		prev = prev_state(cached);
		if (!prev)
			state = cached;
		else if (prev->start <= cur_start && cur_start <= prev->end)
			state = prev;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
search:
	if (!state)
		state = tree_search(tree, cur_start);

	while (state) {
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		state = next_state(state);
	}

	if (cached_state) {
		free_extent_state(*cached_state);
		*cached_state = state;
		if (state)
			refcount_inc(&state->refs);
	}

	spin_unlock(&tree->lock);

	return total_bytes;
}

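/*
 * Illustrative example (not taken from an existing caller; variable names are
 * made up): count up to SZ_1M of delalloc bytes starting at file offset 0,
 * stopping at the first hole (contig == 1):
 *
 *	struct extent_state *cached = NULL;
 *	u64 start = 0;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(&inode->io_tree, &start, SZ_1M - 1, SZ_1M,
 *				 EXTENT_DELALLOC, 1, &cached);
 *	free_extent_state(cached);
 *
 * After the call @start points at the first delalloc byte found, if any.
 */
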
/*
 * Check if the single @bit exists in the given range.
 */
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
	struct extent_state *state = NULL;
	bool bitset = false;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > end)
			break;

		if (state->state & bit) {
			bitset = true;
			break;
		}

		/* If state->end is (u64)-1, start will overflow to 0 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}
	spin_unlock(&tree->lock);
	return bitset;
}

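/*
 * Illustrative example (not taken from an existing caller; @range_start and
 * @range_end are made-up bounds): bail out if any byte of the range is
 * currently locked:
 *
 *	if (test_range_bit_exists(&inode->io_tree, range_start, range_end,
 *				  EXTENT_LOCKED))
 *		return -EAGAIN;
 */
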
/*
 * Check if the whole range [@start, @end] has the single @bit set.
 */
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
		    struct extent_state *cached)
{
	struct extent_state *state = NULL;
	bool bitset = true;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		state = cached;
	else
		state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > start) {
			bitset = false;
			break;
		}

		if (state->start > end)
			break;

		if ((state->state & bit) == 0) {
			bitset = false;
			break;
		}

		if (state->end == (u64)-1)
			break;

		/*
		 * Last entry (if state->end is (u64)-1 and overflow happens),
		 * or next entry starts after the range.
		 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}

	/* We ran out of states and were still inside of our range. */
	if (!state)
		bitset = false;
	spin_unlock(&tree->lock);
	return bitset;
}

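/*
 * Note: unlike test_range_bit_exists() above, which returns true if @bit is
 * set anywhere in the range, test_range_bit() only returns true when every
 * byte of [start, end] is covered by states that all have @bit set - a hole
 * between states, or a state missing the bit, makes it return false.
 */
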
/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as the current changeset will
	 * record any bits changed, so for the EXTENT_LOCKED case it would
	 * either fail with -EEXIST or the changeset would record the whole
	 * range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support the EXTENT_LOCKED case, for the same reason as
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}

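/*
 * Try to lock the range [start, end] without waiting: returns 1 if the whole
 * range was locked, or 0 if some part of it was already locked (-EEXIST), in
 * which case any part locked while trying is unlocked again before returning.
 */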
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached);
		return 0;
	}
	return 1;
}

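/*
 * Illustrative example (not taken from an existing caller; 'nowait' is a
 * made-up flag): fall back to the blocking lock only when sleeping is allowed:
 *
 *	if (!try_lock_extent(&inode->io_tree, start, end, &cached)) {
 *		if (nowait)
 *			return -EAGAIN;
 *		lock_extent(&inode->io_tree, start, end, &cached);
 *	}
 */
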
/*
 * Lock the range [start, end] in the tree. If any part of the range is
 * already locked, wait for it to be unlocked and then retry until the whole
 * range is locked.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       &failed_state, cached_state, NULL);
	while (err == -EEXIST) {
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL);
	}
	return err;
}

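/*
 * Illustrative example (not taken from an existing caller): the usual pattern
 * is to pair lock_extent() with unlock_extent() (declared in extent-io-tree.h)
 * around work on a byte range, optionally keeping a cached state so the
 * unlock does not have to search the tree again:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent(&inode->io_tree, start, end, &cached);
 *	... operate on [start, end] ...
 *	unlock_extent(&inode->io_tree, start, end, &cached);
 */
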
void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0, 0,
					       NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	return 0;
}