1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include <linux/slab.h> |
4 | #include <trace/events/btrfs.h> |
5 | #include "messages.h" |
6 | #include "ctree.h" |
7 | #include "extent-io-tree.h" |
8 | #include "btrfs_inode.h" |
9 | |
10 | static struct kmem_cache *extent_state_cache; |
11 | |
12 | static inline bool extent_state_in_tree(const struct extent_state *state) |
13 | { |
14 | return !RB_EMPTY_NODE(&state->rb_node); |
15 | } |
16 | |
17 | #ifdef CONFIG_BTRFS_DEBUG |
18 | static LIST_HEAD(states); |
19 | static DEFINE_SPINLOCK(leak_lock); |
20 | |
21 | static inline void btrfs_leak_debug_add_state(struct extent_state *state) |
22 | { |
23 | unsigned long flags; |
24 | |
25 | spin_lock_irqsave(&leak_lock, flags); |
26 | list_add(new: &state->leak_list, head: &states); |
27 | spin_unlock_irqrestore(lock: &leak_lock, flags); |
28 | } |
29 | |
30 | static inline void btrfs_leak_debug_del_state(struct extent_state *state) |
31 | { |
32 | unsigned long flags; |
33 | |
34 | spin_lock_irqsave(&leak_lock, flags); |
35 | list_del(entry: &state->leak_list); |
36 | spin_unlock_irqrestore(lock: &leak_lock, flags); |
37 | } |
38 | |
39 | static inline void btrfs_extent_state_leak_debug_check(void) |
40 | { |
41 | struct extent_state *state; |
42 | |
43 | while (!list_empty(head: &states)) { |
44 | state = list_entry(states.next, struct extent_state, leak_list); |
45 | pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n" , |
46 | state->start, state->end, state->state, |
47 | extent_state_in_tree(state), |
48 | refcount_read(&state->refs)); |
49 | list_del(entry: &state->leak_list); |
50 | WARN_ON_ONCE(1); |
51 | kmem_cache_free(s: extent_state_cache, objp: state); |
52 | } |
53 | } |
54 | |
55 | #define btrfs_debug_check_extent_io_range(tree, start, end) \ |
56 | __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) |
57 | static inline void __btrfs_debug_check_extent_io_range(const char *caller, |
58 | struct extent_io_tree *tree, |
59 | u64 start, u64 end) |
60 | { |
61 | const struct btrfs_inode *inode; |
62 | u64 isize; |
63 | |
64 | if (tree->owner != IO_TREE_INODE_IO) |
65 | return; |
66 | |
67 | inode = extent_io_tree_to_inode_const(tree); |
68 | isize = i_size_read(inode: &inode->vfs_inode); |
69 | if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { |
70 | btrfs_debug_rl(inode->root->fs_info, |
71 | "%s: ino %llu isize %llu odd range [%llu,%llu]" , |
72 | caller, btrfs_ino(inode), isize, start, end); |
73 | } |
74 | } |
75 | #else |
76 | #define btrfs_leak_debug_add_state(state) do {} while (0) |
77 | #define btrfs_leak_debug_del_state(state) do {} while (0) |
78 | #define btrfs_extent_state_leak_debug_check() do {} while (0) |
79 | #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) |
80 | #endif |
81 | |
82 | |
83 | /* |
84 | * The only tree allowed to set the inode is IO_TREE_INODE_IO. |
85 | */ |
86 | static bool is_inode_io_tree(const struct extent_io_tree *tree) |
87 | { |
88 | return tree->owner == IO_TREE_INODE_IO; |
89 | } |
90 | |
91 | /* Return the inode if it's valid for the given tree, otherwise NULL. */ |
92 | struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) |
93 | { |
94 | if (tree->owner == IO_TREE_INODE_IO) |
95 | return tree->inode; |
96 | return NULL; |
97 | } |
98 | |
99 | /* Read-only access to the inode. */ |
100 | const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) |
101 | { |
102 | if (tree->owner == IO_TREE_INODE_IO) |
103 | return tree->inode; |
104 | return NULL; |
105 | } |
106 | |
107 | /* For read-only access to fs_info. */ |
108 | const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) |
109 | { |
110 | if (tree->owner == IO_TREE_INODE_IO) |
111 | return tree->inode->root->fs_info; |
112 | return tree->fs_info; |
113 | } |
114 | |
115 | void extent_io_tree_init(struct btrfs_fs_info *fs_info, |
116 | struct extent_io_tree *tree, unsigned int owner) |
117 | { |
118 | tree->state = RB_ROOT; |
119 | spin_lock_init(&tree->lock); |
120 | tree->fs_info = fs_info; |
121 | tree->owner = owner; |
122 | } |
123 | |
124 | /* |
125 | * Empty an io tree, removing and freeing every extent state record from the |
126 | * tree. This should be called once we are sure no other task can access the |
127 | * tree anymore, so no tree updates happen after we empty the tree and there |
128 | * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never |
129 | * set on any extent state when calling this function). |
130 | */ |
131 | void extent_io_tree_release(struct extent_io_tree *tree) |
132 | { |
133 | struct rb_root root; |
134 | struct extent_state *state; |
135 | struct extent_state *tmp; |
136 | |
137 | spin_lock(lock: &tree->lock); |
138 | root = tree->state; |
139 | tree->state = RB_ROOT; |
140 | rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { |
141 | /* Clear node to keep free_extent_state() happy. */ |
142 | RB_CLEAR_NODE(&state->rb_node); |
143 | ASSERT(!(state->state & EXTENT_LOCKED)); |
144 | /* |
145 | * No need for a memory barrier here, as we are holding the tree |
146 | * lock and we only change the waitqueue while holding that lock |
147 | * (see wait_extent_bit()). |
148 | */ |
149 | ASSERT(!waitqueue_active(&state->wq)); |
150 | free_extent_state(state); |
151 | cond_resched_lock(&tree->lock); |
152 | } |
153 | /* |
154 | * Should still be empty even after a reschedule, no other task should |
155 | * be accessing the tree anymore. |
156 | */ |
157 | ASSERT(RB_EMPTY_ROOT(&tree->state)); |
158 | spin_unlock(lock: &tree->lock); |
159 | } |
160 | |
161 | static struct extent_state *alloc_extent_state(gfp_t mask) |
162 | { |
163 | struct extent_state *state; |
164 | |
165 | /* |
166 | * The given mask might be not appropriate for the slab allocator, |
167 | * drop the unsupported bits |
168 | */ |
169 | mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); |
170 | state = kmem_cache_alloc(cachep: extent_state_cache, flags: mask); |
171 | if (!state) |
172 | return state; |
173 | state->state = 0; |
174 | RB_CLEAR_NODE(&state->rb_node); |
175 | btrfs_leak_debug_add_state(state); |
176 | refcount_set(r: &state->refs, n: 1); |
177 | init_waitqueue_head(&state->wq); |
178 | trace_alloc_extent_state(state, mask, _RET_IP_); |
179 | return state; |
180 | } |
181 | |
182 | static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc) |
183 | { |
184 | if (!prealloc) |
185 | prealloc = alloc_extent_state(GFP_ATOMIC); |
186 | |
187 | return prealloc; |
188 | } |
189 | |
190 | void free_extent_state(struct extent_state *state) |
191 | { |
192 | if (!state) |
193 | return; |
194 | if (refcount_dec_and_test(r: &state->refs)) { |
195 | WARN_ON(extent_state_in_tree(state)); |
196 | btrfs_leak_debug_del_state(state); |
197 | trace_free_extent_state(state, _RET_IP_); |
198 | kmem_cache_free(s: extent_state_cache, objp: state); |
199 | } |
200 | } |
201 | |
202 | static int add_extent_changeset(struct extent_state *state, u32 bits, |
203 | struct extent_changeset *changeset, |
204 | int set) |
205 | { |
206 | int ret; |
207 | |
208 | if (!changeset) |
209 | return 0; |
210 | if (set && (state->state & bits) == bits) |
211 | return 0; |
212 | if (!set && (state->state & bits) == 0) |
213 | return 0; |
214 | changeset->bytes_changed += state->end - state->start + 1; |
215 | ret = ulist_add(ulist: &changeset->range_changed, val: state->start, aux: state->end, |
216 | GFP_ATOMIC); |
217 | return ret; |
218 | } |
219 | |
220 | static inline struct extent_state *next_state(struct extent_state *state) |
221 | { |
222 | struct rb_node *next = rb_next(&state->rb_node); |
223 | |
224 | if (next) |
225 | return rb_entry(next, struct extent_state, rb_node); |
226 | else |
227 | return NULL; |
228 | } |
229 | |
230 | static inline struct extent_state *prev_state(struct extent_state *state) |
231 | { |
232 | struct rb_node *next = rb_prev(&state->rb_node); |
233 | |
234 | if (next) |
235 | return rb_entry(next, struct extent_state, rb_node); |
236 | else |
237 | return NULL; |
238 | } |
239 | |
240 | /* |
241 | * Search @tree for an entry that contains @offset. Such entry would have |
242 | * entry->start <= offset && entry->end >= offset. |
243 | * |
244 | * @tree: the tree to search |
245 | * @offset: offset that should fall within an entry in @tree |
246 | * @node_ret: pointer where new node should be anchored (used when inserting an |
247 | * entry in the tree) |
248 | * @parent_ret: points to entry which would have been the parent of the entry, |
249 | * containing @offset |
250 | * |
251 | * Return a pointer to the entry that contains @offset byte address and don't change |
252 | * @node_ret and @parent_ret. |
253 | * |
254 | * If no such entry exists, return pointer to entry that ends before @offset |
255 | * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. |
256 | */ |
257 | static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, |
258 | u64 offset, |
259 | struct rb_node ***node_ret, |
260 | struct rb_node **parent_ret) |
261 | { |
262 | struct rb_root *root = &tree->state; |
263 | struct rb_node **node = &root->rb_node; |
264 | struct rb_node *prev = NULL; |
265 | struct extent_state *entry = NULL; |
266 | |
267 | while (*node) { |
268 | prev = *node; |
269 | entry = rb_entry(prev, struct extent_state, rb_node); |
270 | |
271 | if (offset < entry->start) |
272 | node = &(*node)->rb_left; |
273 | else if (offset > entry->end) |
274 | node = &(*node)->rb_right; |
275 | else |
276 | return entry; |
277 | } |
278 | |
279 | if (node_ret) |
280 | *node_ret = node; |
281 | if (parent_ret) |
282 | *parent_ret = prev; |
283 | |
284 | /* Search neighbors until we find the first one past the end */ |
285 | while (entry && offset > entry->end) |
286 | entry = next_state(state: entry); |
287 | |
288 | return entry; |
289 | } |
290 | |
291 | /* |
292 | * Search offset in the tree or fill neighbor rbtree node pointers. |
293 | * |
294 | * @tree: the tree to search |
295 | * @offset: offset that should fall within an entry in @tree |
296 | * @next_ret: pointer to the first entry whose range ends after @offset |
297 | * @prev_ret: pointer to the first entry whose range begins before @offset |
298 | * |
299 | * Return a pointer to the entry that contains @offset byte address. If no |
300 | * such entry exists, then return NULL and fill @prev_ret and @next_ret. |
301 | * Otherwise return the found entry and other pointers are left untouched. |
302 | */ |
303 | static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree, |
304 | u64 offset, |
305 | struct extent_state **prev_ret, |
306 | struct extent_state **next_ret) |
307 | { |
308 | struct rb_root *root = &tree->state; |
309 | struct rb_node **node = &root->rb_node; |
310 | struct extent_state *orig_prev; |
311 | struct extent_state *entry = NULL; |
312 | |
313 | ASSERT(prev_ret); |
314 | ASSERT(next_ret); |
315 | |
316 | while (*node) { |
317 | entry = rb_entry(*node, struct extent_state, rb_node); |
318 | |
319 | if (offset < entry->start) |
320 | node = &(*node)->rb_left; |
321 | else if (offset > entry->end) |
322 | node = &(*node)->rb_right; |
323 | else |
324 | return entry; |
325 | } |
326 | |
327 | orig_prev = entry; |
328 | while (entry && offset > entry->end) |
329 | entry = next_state(state: entry); |
330 | *next_ret = entry; |
331 | entry = orig_prev; |
332 | |
333 | while (entry && offset < entry->start) |
334 | entry = prev_state(state: entry); |
335 | *prev_ret = entry; |
336 | |
337 | return NULL; |
338 | } |
339 | |
340 | /* |
341 | * Inexact rb-tree search, return the next entry if @offset is not found |
342 | */ |
343 | static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset) |
344 | { |
345 | return tree_search_for_insert(tree, offset, NULL, NULL); |
346 | } |
347 | |
348 | static void extent_io_tree_panic(const struct extent_io_tree *tree, |
349 | const struct extent_state *state, |
350 | const char *opname, |
351 | int err) |
352 | { |
353 | btrfs_panic(extent_io_tree_to_fs_info(tree), err, |
354 | "extent io tree error on %s state start %llu end %llu" , |
355 | opname, state->start, state->end); |
356 | } |
357 | |
358 | static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) |
359 | { |
360 | struct extent_state *prev; |
361 | |
362 | prev = prev_state(state); |
363 | if (prev && prev->end == state->start - 1 && prev->state == state->state) { |
364 | if (is_inode_io_tree(tree)) |
365 | btrfs_merge_delalloc_extent(inode: extent_io_tree_to_inode(tree), |
366 | new: state, other: prev); |
367 | state->start = prev->start; |
368 | rb_erase(&prev->rb_node, &tree->state); |
369 | RB_CLEAR_NODE(&prev->rb_node); |
370 | free_extent_state(state: prev); |
371 | } |
372 | } |
373 | |
374 | static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) |
375 | { |
376 | struct extent_state *next; |
377 | |
378 | next = next_state(state); |
379 | if (next && next->start == state->end + 1 && next->state == state->state) { |
380 | if (is_inode_io_tree(tree)) |
381 | btrfs_merge_delalloc_extent(inode: extent_io_tree_to_inode(tree), |
382 | new: state, other: next); |
383 | state->end = next->end; |
384 | rb_erase(&next->rb_node, &tree->state); |
385 | RB_CLEAR_NODE(&next->rb_node); |
386 | free_extent_state(state: next); |
387 | } |
388 | } |
389 | |
390 | /* |
391 | * Utility function to look for merge candidates inside a given range. Any |
392 | * extents with matching state are merged together into a single extent in the |
393 | * tree. Extents with EXTENT_IO in their state field are not merged because |
394 | * the end_io handlers need to be able to do operations on them without |
395 | * sleeping (or doing allocations/splits). |
396 | * |
397 | * This should be called with the tree lock held. |
398 | */ |
399 | static void merge_state(struct extent_io_tree *tree, struct extent_state *state) |
400 | { |
401 | if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) |
402 | return; |
403 | |
404 | merge_prev_state(tree, state); |
405 | merge_next_state(tree, state); |
406 | } |
407 | |
408 | static void set_state_bits(struct extent_io_tree *tree, |
409 | struct extent_state *state, |
410 | u32 bits, struct extent_changeset *changeset) |
411 | { |
412 | u32 bits_to_set = bits & ~EXTENT_CTLBITS; |
413 | int ret; |
414 | |
415 | if (is_inode_io_tree(tree)) |
416 | btrfs_set_delalloc_extent(inode: extent_io_tree_to_inode(tree), state, bits); |
417 | |
418 | ret = add_extent_changeset(state, bits: bits_to_set, changeset, set: 1); |
419 | BUG_ON(ret < 0); |
420 | state->state |= bits_to_set; |
421 | } |
422 | |
423 | /* |
424 | * Insert an extent_state struct into the tree. 'bits' are set on the |
425 | * struct before it is inserted. |
426 | * |
427 | * Returns a pointer to the struct extent_state record containing the range |
428 | * requested for insertion, which may be the same as the given struct or it |
429 | * may be an existing record in the tree that was expanded to accommodate the |
430 | * requested range. In case of an extent_state different from the one that was |
431 | * given, the later can be freed or reused by the caller. |
432 | * |
433 | * On error it returns an error pointer. |
434 | * |
435 | * The tree lock is not taken internally. This is a utility function and |
436 | * probably isn't what you want to call (see set/clear_extent_bit). |
437 | */ |
438 | static struct extent_state *insert_state(struct extent_io_tree *tree, |
439 | struct extent_state *state, |
440 | u32 bits, |
441 | struct extent_changeset *changeset) |
442 | { |
443 | struct rb_node **node; |
444 | struct rb_node *parent = NULL; |
445 | const u64 start = state->start - 1; |
446 | const u64 end = state->end + 1; |
447 | const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); |
448 | |
449 | set_state_bits(tree, state, bits, changeset); |
450 | |
451 | node = &tree->state.rb_node; |
452 | while (*node) { |
453 | struct extent_state *entry; |
454 | |
455 | parent = *node; |
456 | entry = rb_entry(parent, struct extent_state, rb_node); |
457 | |
458 | if (state->end < entry->start) { |
459 | if (try_merge && end == entry->start && |
460 | state->state == entry->state) { |
461 | if (is_inode_io_tree(tree)) |
462 | btrfs_merge_delalloc_extent( |
463 | inode: extent_io_tree_to_inode(tree), |
464 | new: state, other: entry); |
465 | entry->start = state->start; |
466 | merge_prev_state(tree, state: entry); |
467 | state->state = 0; |
468 | return entry; |
469 | } |
470 | node = &(*node)->rb_left; |
471 | } else if (state->end > entry->end) { |
472 | if (try_merge && entry->end == start && |
473 | state->state == entry->state) { |
474 | if (is_inode_io_tree(tree)) |
475 | btrfs_merge_delalloc_extent( |
476 | inode: extent_io_tree_to_inode(tree), |
477 | new: state, other: entry); |
478 | entry->end = state->end; |
479 | merge_next_state(tree, state: entry); |
480 | state->state = 0; |
481 | return entry; |
482 | } |
483 | node = &(*node)->rb_right; |
484 | } else { |
485 | return ERR_PTR(error: -EEXIST); |
486 | } |
487 | } |
488 | |
489 | rb_link_node(node: &state->rb_node, parent, rb_link: node); |
490 | rb_insert_color(&state->rb_node, &tree->state); |
491 | |
492 | return state; |
493 | } |
494 | |
495 | /* |
496 | * Insert state to @tree to the location given by @node and @parent. |
497 | */ |
498 | static void insert_state_fast(struct extent_io_tree *tree, |
499 | struct extent_state *state, struct rb_node **node, |
500 | struct rb_node *parent, unsigned bits, |
501 | struct extent_changeset *changeset) |
502 | { |
503 | set_state_bits(tree, state, bits, changeset); |
504 | rb_link_node(node: &state->rb_node, parent, rb_link: node); |
505 | rb_insert_color(&state->rb_node, &tree->state); |
506 | merge_state(tree, state); |
507 | } |
508 | |
509 | /* |
510 | * Split a given extent state struct in two, inserting the preallocated |
511 | * struct 'prealloc' as the newly created second half. 'split' indicates an |
512 | * offset inside 'orig' where it should be split. |
513 | * |
514 | * Before calling, |
515 | * the tree has 'orig' at [orig->start, orig->end]. After calling, there |
516 | * are two extent state structs in the tree: |
517 | * prealloc: [orig->start, split - 1] |
518 | * orig: [ split, orig->end ] |
519 | * |
520 | * The tree locks are not taken by this function. They need to be held |
521 | * by the caller. |
522 | */ |
523 | static int split_state(struct extent_io_tree *tree, struct extent_state *orig, |
524 | struct extent_state *prealloc, u64 split) |
525 | { |
526 | struct rb_node *parent = NULL; |
527 | struct rb_node **node; |
528 | |
529 | if (is_inode_io_tree(tree)) |
530 | btrfs_split_delalloc_extent(inode: extent_io_tree_to_inode(tree), orig, |
531 | split); |
532 | |
533 | prealloc->start = orig->start; |
534 | prealloc->end = split - 1; |
535 | prealloc->state = orig->state; |
536 | orig->start = split; |
537 | |
538 | parent = &orig->rb_node; |
539 | node = &parent; |
540 | while (*node) { |
541 | struct extent_state *entry; |
542 | |
543 | parent = *node; |
544 | entry = rb_entry(parent, struct extent_state, rb_node); |
545 | |
546 | if (prealloc->end < entry->start) { |
547 | node = &(*node)->rb_left; |
548 | } else if (prealloc->end > entry->end) { |
549 | node = &(*node)->rb_right; |
550 | } else { |
551 | free_extent_state(state: prealloc); |
552 | return -EEXIST; |
553 | } |
554 | } |
555 | |
556 | rb_link_node(node: &prealloc->rb_node, parent, rb_link: node); |
557 | rb_insert_color(&prealloc->rb_node, &tree->state); |
558 | |
559 | return 0; |
560 | } |
561 | |
562 | /* |
563 | * Utility function to clear some bits in an extent state struct. It will |
564 | * optionally wake up anyone waiting on this state (wake == 1). |
565 | * |
566 | * If no bits are set on the state struct after clearing things, the |
567 | * struct is freed and removed from the tree |
568 | */ |
569 | static struct extent_state *clear_state_bit(struct extent_io_tree *tree, |
570 | struct extent_state *state, |
571 | u32 bits, int wake, |
572 | struct extent_changeset *changeset) |
573 | { |
574 | struct extent_state *next; |
575 | u32 bits_to_clear = bits & ~EXTENT_CTLBITS; |
576 | int ret; |
577 | |
578 | if (is_inode_io_tree(tree)) |
579 | btrfs_clear_delalloc_extent(inode: extent_io_tree_to_inode(tree), state, |
580 | bits); |
581 | |
582 | ret = add_extent_changeset(state, bits: bits_to_clear, changeset, set: 0); |
583 | BUG_ON(ret < 0); |
584 | state->state &= ~bits_to_clear; |
585 | if (wake) |
586 | wake_up(&state->wq); |
587 | if (state->state == 0) { |
588 | next = next_state(state); |
589 | if (extent_state_in_tree(state)) { |
590 | rb_erase(&state->rb_node, &tree->state); |
591 | RB_CLEAR_NODE(&state->rb_node); |
592 | free_extent_state(state); |
593 | } else { |
594 | WARN_ON(1); |
595 | } |
596 | } else { |
597 | merge_state(tree, state); |
598 | next = next_state(state); |
599 | } |
600 | return next; |
601 | } |
602 | |
603 | /* |
604 | * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, |
605 | * unset the EXTENT_NOWAIT bit. |
606 | */ |
607 | static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) |
608 | { |
609 | *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); |
610 | *bits &= EXTENT_NOWAIT - 1; |
611 | } |
612 | |
613 | /* |
614 | * Clear some bits on a range in the tree. This may require splitting or |
615 | * inserting elements in the tree, so the gfp mask is used to indicate which |
616 | * allocations or sleeping are allowed. |
617 | * |
618 | * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given |
619 | * range from the tree regardless of state (ie for truncate). |
620 | * |
621 | * The range [start, end] is inclusive. |
622 | * |
623 | * This takes the tree lock, and returns 0 on success and < 0 on error. |
624 | */ |
625 | int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
626 | u32 bits, struct extent_state **cached_state, |
627 | struct extent_changeset *changeset) |
628 | { |
629 | struct extent_state *state; |
630 | struct extent_state *cached; |
631 | struct extent_state *prealloc = NULL; |
632 | u64 last_end; |
633 | int err; |
634 | int clear = 0; |
635 | int wake; |
636 | int delete = (bits & EXTENT_CLEAR_ALL_BITS); |
637 | gfp_t mask; |
638 | |
639 | set_gfp_mask_from_bits(bits: &bits, mask: &mask); |
640 | btrfs_debug_check_extent_io_range(tree, start, end); |
641 | trace_btrfs_clear_extent_bit(tree, start, len: end - start + 1, clear_bits: bits); |
642 | |
643 | if (delete) |
644 | bits |= ~EXTENT_CTLBITS; |
645 | |
646 | if (bits & EXTENT_DELALLOC) |
647 | bits |= EXTENT_NORESERVE; |
648 | |
649 | wake = (bits & EXTENT_LOCKED) ? 1 : 0; |
650 | if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) |
651 | clear = 1; |
652 | again: |
653 | if (!prealloc) { |
654 | /* |
655 | * Don't care for allocation failure here because we might end |
656 | * up not needing the pre-allocated extent state at all, which |
657 | * is the case if we only have in the tree extent states that |
658 | * cover our input range and don't cover too any other range. |
659 | * If we end up needing a new extent state we allocate it later. |
660 | */ |
661 | prealloc = alloc_extent_state(mask); |
662 | } |
663 | |
664 | spin_lock(lock: &tree->lock); |
665 | if (cached_state) { |
666 | cached = *cached_state; |
667 | |
668 | if (clear) { |
669 | *cached_state = NULL; |
670 | cached_state = NULL; |
671 | } |
672 | |
673 | if (cached && extent_state_in_tree(state: cached) && |
674 | cached->start <= start && cached->end > start) { |
675 | if (clear) |
676 | refcount_dec(r: &cached->refs); |
677 | state = cached; |
678 | goto hit_next; |
679 | } |
680 | if (clear) |
681 | free_extent_state(state: cached); |
682 | } |
683 | |
684 | /* This search will find the extents that end after our range starts. */ |
685 | state = tree_search(tree, offset: start); |
686 | if (!state) |
687 | goto out; |
688 | hit_next: |
689 | if (state->start > end) |
690 | goto out; |
691 | WARN_ON(state->end < start); |
692 | last_end = state->end; |
693 | |
694 | /* The state doesn't have the wanted bits, go ahead. */ |
695 | if (!(state->state & bits)) { |
696 | state = next_state(state); |
697 | goto next; |
698 | } |
699 | |
700 | /* |
701 | * | ---- desired range ---- | |
702 | * | state | or |
703 | * | ------------- state -------------- | |
704 | * |
705 | * We need to split the extent we found, and may flip bits on second |
706 | * half. |
707 | * |
708 | * If the extent we found extends past our range, we just split and |
709 | * search again. It'll get split again the next time though. |
710 | * |
711 | * If the extent we found is inside our range, we clear the desired bit |
712 | * on it. |
713 | */ |
714 | |
715 | if (state->start < start) { |
716 | prealloc = alloc_extent_state_atomic(prealloc); |
717 | if (!prealloc) |
718 | goto search_again; |
719 | err = split_state(tree, orig: state, prealloc, split: start); |
720 | if (err) |
721 | extent_io_tree_panic(tree, state, opname: "split" , err); |
722 | |
723 | prealloc = NULL; |
724 | if (err) |
725 | goto out; |
726 | if (state->end <= end) { |
727 | state = clear_state_bit(tree, state, bits, wake, changeset); |
728 | goto next; |
729 | } |
730 | goto search_again; |
731 | } |
732 | /* |
733 | * | ---- desired range ---- | |
734 | * | state | |
735 | * We need to split the extent, and clear the bit on the first half. |
736 | */ |
737 | if (state->start <= end && state->end > end) { |
738 | prealloc = alloc_extent_state_atomic(prealloc); |
739 | if (!prealloc) |
740 | goto search_again; |
741 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
742 | if (err) |
743 | extent_io_tree_panic(tree, state, opname: "split" , err); |
744 | |
745 | if (wake) |
746 | wake_up(&state->wq); |
747 | |
748 | clear_state_bit(tree, state: prealloc, bits, wake, changeset); |
749 | |
750 | prealloc = NULL; |
751 | goto out; |
752 | } |
753 | |
754 | state = clear_state_bit(tree, state, bits, wake, changeset); |
755 | next: |
756 | if (last_end == (u64)-1) |
757 | goto out; |
758 | start = last_end + 1; |
759 | if (start <= end && state && !need_resched()) |
760 | goto hit_next; |
761 | |
762 | search_again: |
763 | if (start > end) |
764 | goto out; |
765 | spin_unlock(lock: &tree->lock); |
766 | if (gfpflags_allow_blocking(gfp_flags: mask)) |
767 | cond_resched(); |
768 | goto again; |
769 | |
770 | out: |
771 | spin_unlock(lock: &tree->lock); |
772 | if (prealloc) |
773 | free_extent_state(state: prealloc); |
774 | |
775 | return 0; |
776 | |
777 | } |
778 | |
779 | /* |
780 | * Wait for one or more bits to clear on a range in the state tree. |
781 | * The range [start, end] is inclusive. |
782 | * The tree lock is taken by this function |
783 | */ |
784 | static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
785 | u32 bits, struct extent_state **cached_state) |
786 | { |
787 | struct extent_state *state; |
788 | |
789 | btrfs_debug_check_extent_io_range(tree, start, end); |
790 | |
791 | spin_lock(lock: &tree->lock); |
792 | again: |
793 | /* |
794 | * Maintain cached_state, as we may not remove it from the tree if there |
795 | * are more bits than the bits we're waiting on set on this state. |
796 | */ |
797 | if (cached_state && *cached_state) { |
798 | state = *cached_state; |
799 | if (extent_state_in_tree(state) && |
800 | state->start <= start && start < state->end) |
801 | goto process_node; |
802 | } |
803 | while (1) { |
804 | /* |
805 | * This search will find all the extents that end after our |
806 | * range starts. |
807 | */ |
808 | state = tree_search(tree, offset: start); |
809 | process_node: |
810 | if (!state) |
811 | break; |
812 | if (state->start > end) |
813 | goto out; |
814 | |
815 | if (state->state & bits) { |
816 | DEFINE_WAIT(wait); |
817 | |
818 | start = state->start; |
819 | refcount_inc(r: &state->refs); |
820 | prepare_to_wait(wq_head: &state->wq, wq_entry: &wait, TASK_UNINTERRUPTIBLE); |
821 | spin_unlock(lock: &tree->lock); |
822 | schedule(); |
823 | spin_lock(lock: &tree->lock); |
824 | finish_wait(wq_head: &state->wq, wq_entry: &wait); |
825 | free_extent_state(state); |
826 | goto again; |
827 | } |
828 | start = state->end + 1; |
829 | |
830 | if (start > end) |
831 | break; |
832 | |
833 | if (!cond_resched_lock(&tree->lock)) { |
834 | state = next_state(state); |
835 | goto process_node; |
836 | } |
837 | } |
838 | out: |
839 | /* This state is no longer useful, clear it and free it up. */ |
840 | if (cached_state && *cached_state) { |
841 | state = *cached_state; |
842 | *cached_state = NULL; |
843 | free_extent_state(state); |
844 | } |
845 | spin_unlock(lock: &tree->lock); |
846 | } |
847 | |
848 | static void cache_state_if_flags(struct extent_state *state, |
849 | struct extent_state **cached_ptr, |
850 | unsigned flags) |
851 | { |
852 | if (cached_ptr && !(*cached_ptr)) { |
853 | if (!flags || (state->state & flags)) { |
854 | *cached_ptr = state; |
855 | refcount_inc(r: &state->refs); |
856 | } |
857 | } |
858 | } |
859 | |
860 | static void cache_state(struct extent_state *state, |
861 | struct extent_state **cached_ptr) |
862 | { |
863 | return cache_state_if_flags(state, cached_ptr, |
864 | flags: EXTENT_LOCKED | EXTENT_BOUNDARY); |
865 | } |
866 | |
867 | /* |
868 | * Find the first state struct with 'bits' set after 'start', and return it. |
869 | * tree->lock must be held. NULL will returned if nothing was found after |
870 | * 'start'. |
871 | */ |
872 | static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, |
873 | u64 start, u32 bits) |
874 | { |
875 | struct extent_state *state; |
876 | |
877 | /* |
878 | * This search will find all the extents that end after our range |
879 | * starts. |
880 | */ |
881 | state = tree_search(tree, offset: start); |
882 | while (state) { |
883 | if (state->end >= start && (state->state & bits)) |
884 | return state; |
885 | state = next_state(state); |
886 | } |
887 | return NULL; |
888 | } |
889 | |
890 | /* |
891 | * Find the first offset in the io tree with one or more @bits set. |
892 | * |
893 | * Note: If there are multiple bits set in @bits, any of them will match. |
894 | * |
895 | * Return true if we find something, and update @start_ret and @end_ret. |
896 | * Return false if we found nothing. |
897 | */ |
898 | bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, |
899 | u64 *start_ret, u64 *end_ret, u32 bits, |
900 | struct extent_state **cached_state) |
901 | { |
902 | struct extent_state *state; |
903 | bool ret = false; |
904 | |
905 | spin_lock(lock: &tree->lock); |
906 | if (cached_state && *cached_state) { |
907 | state = *cached_state; |
908 | if (state->end == start - 1 && extent_state_in_tree(state)) { |
909 | while ((state = next_state(state)) != NULL) { |
910 | if (state->state & bits) |
911 | break; |
912 | } |
913 | /* |
914 | * If we found the next extent state, clear cached_state |
915 | * so that we can cache the next extent state below and |
916 | * avoid future calls going over the same extent state |
917 | * again. If we haven't found any, clear as well since |
918 | * it's now useless. |
919 | */ |
920 | free_extent_state(state: *cached_state); |
921 | *cached_state = NULL; |
922 | if (state) |
923 | goto got_it; |
924 | goto out; |
925 | } |
926 | free_extent_state(state: *cached_state); |
927 | *cached_state = NULL; |
928 | } |
929 | |
930 | state = find_first_extent_bit_state(tree, start, bits); |
931 | got_it: |
932 | if (state) { |
933 | cache_state_if_flags(state, cached_ptr: cached_state, flags: 0); |
934 | *start_ret = state->start; |
935 | *end_ret = state->end; |
936 | ret = true; |
937 | } |
938 | out: |
939 | spin_unlock(lock: &tree->lock); |
940 | return ret; |
941 | } |
942 | |
943 | /* |
944 | * Find a contiguous area of bits |
945 | * |
946 | * @tree: io tree to check |
947 | * @start: offset to start the search from |
948 | * @start_ret: the first offset we found with the bits set |
949 | * @end_ret: the final contiguous range of the bits that were set |
950 | * @bits: bits to look for |
951 | * |
952 | * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges |
953 | * to set bits appropriately, and then merge them again. During this time it |
954 | * will drop the tree->lock, so use this helper if you want to find the actual |
955 | * contiguous area for given bits. We will search to the first bit we find, and |
956 | * then walk down the tree until we find a non-contiguous area. The area |
957 | * returned will be the full contiguous area with the bits set. |
958 | */ |
959 | int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, |
960 | u64 *start_ret, u64 *end_ret, u32 bits) |
961 | { |
962 | struct extent_state *state; |
963 | int ret = 1; |
964 | |
965 | ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); |
966 | |
967 | spin_lock(lock: &tree->lock); |
968 | state = find_first_extent_bit_state(tree, start, bits); |
969 | if (state) { |
970 | *start_ret = state->start; |
971 | *end_ret = state->end; |
972 | while ((state = next_state(state)) != NULL) { |
973 | if (state->start > (*end_ret + 1)) |
974 | break; |
975 | *end_ret = state->end; |
976 | } |
977 | ret = 0; |
978 | } |
979 | spin_unlock(lock: &tree->lock); |
980 | return ret; |
981 | } |
982 | |
983 | /* |
984 | * Find a contiguous range of bytes in the file marked as delalloc, not more |
985 | * than 'max_bytes'. start and end are used to return the range, |
986 | * |
987 | * True is returned if we find something, false if nothing was in the tree. |
988 | */ |
989 | bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, |
990 | u64 *end, u64 max_bytes, |
991 | struct extent_state **cached_state) |
992 | { |
993 | struct extent_state *state; |
994 | u64 cur_start = *start; |
995 | bool found = false; |
996 | u64 total_bytes = 0; |
997 | |
998 | spin_lock(lock: &tree->lock); |
999 | |
1000 | /* |
1001 | * This search will find all the extents that end after our range |
1002 | * starts. |
1003 | */ |
1004 | state = tree_search(tree, offset: cur_start); |
1005 | if (!state) { |
1006 | *end = (u64)-1; |
1007 | goto out; |
1008 | } |
1009 | |
1010 | while (state) { |
1011 | if (found && (state->start != cur_start || |
1012 | (state->state & EXTENT_BOUNDARY))) { |
1013 | goto out; |
1014 | } |
1015 | if (!(state->state & EXTENT_DELALLOC)) { |
1016 | if (!found) |
1017 | *end = state->end; |
1018 | goto out; |
1019 | } |
1020 | if (!found) { |
1021 | *start = state->start; |
1022 | *cached_state = state; |
1023 | refcount_inc(r: &state->refs); |
1024 | } |
1025 | found = true; |
1026 | *end = state->end; |
1027 | cur_start = state->end + 1; |
1028 | total_bytes += state->end - state->start + 1; |
1029 | if (total_bytes >= max_bytes) |
1030 | break; |
1031 | state = next_state(state); |
1032 | } |
1033 | out: |
1034 | spin_unlock(lock: &tree->lock); |
1035 | return found; |
1036 | } |
1037 | |
1038 | /* |
1039 | * Set some bits on a range in the tree. This may require allocations or |
1040 | * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for |
1041 | * GFP_NOWAIT. |
1042 | * |
1043 | * If any of the exclusive bits are set, this will fail with -EEXIST if some |
1044 | * part of the range already has the desired bits set. The extent_state of the |
1045 | * existing range is returned in failed_state in this case, and the start of the |
1046 | * existing range is returned in failed_start. failed_state is used as an |
1047 | * optimization for wait_extent_bit, failed_start must be used as the source of |
1048 | * truth as failed_state may have changed since we returned. |
1049 | * |
1050 | * [start, end] is inclusive This takes the tree lock. |
1051 | */ |
1052 | static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1053 | u32 bits, u64 *failed_start, |
1054 | struct extent_state **failed_state, |
1055 | struct extent_state **cached_state, |
1056 | struct extent_changeset *changeset) |
1057 | { |
1058 | struct extent_state *state; |
1059 | struct extent_state *prealloc = NULL; |
1060 | struct rb_node **p = NULL; |
1061 | struct rb_node *parent = NULL; |
1062 | int err = 0; |
1063 | u64 last_start; |
1064 | u64 last_end; |
1065 | u32 exclusive_bits = (bits & EXTENT_LOCKED); |
1066 | gfp_t mask; |
1067 | |
1068 | set_gfp_mask_from_bits(bits: &bits, mask: &mask); |
1069 | btrfs_debug_check_extent_io_range(tree, start, end); |
1070 | trace_btrfs_set_extent_bit(tree, start, len: end - start + 1, set_bits: bits); |
1071 | |
1072 | if (exclusive_bits) |
1073 | ASSERT(failed_start); |
1074 | else |
1075 | ASSERT(failed_start == NULL && failed_state == NULL); |
1076 | again: |
1077 | if (!prealloc) { |
1078 | /* |
1079 | * Don't care for allocation failure here because we might end |
1080 | * up not needing the pre-allocated extent state at all, which |
1081 | * is the case if we only have in the tree extent states that |
1082 | * cover our input range and don't cover too any other range. |
1083 | * If we end up needing a new extent state we allocate it later. |
1084 | */ |
1085 | prealloc = alloc_extent_state(mask); |
1086 | } |
1087 | |
1088 | spin_lock(lock: &tree->lock); |
1089 | if (cached_state && *cached_state) { |
1090 | state = *cached_state; |
1091 | if (state->start <= start && state->end > start && |
1092 | extent_state_in_tree(state)) |
1093 | goto hit_next; |
1094 | } |
1095 | /* |
1096 | * This search will find all the extents that end after our range |
1097 | * starts. |
1098 | */ |
1099 | state = tree_search_for_insert(tree, offset: start, node_ret: &p, parent_ret: &parent); |
1100 | if (!state) { |
1101 | prealloc = alloc_extent_state_atomic(prealloc); |
1102 | if (!prealloc) |
1103 | goto search_again; |
1104 | prealloc->start = start; |
1105 | prealloc->end = end; |
1106 | insert_state_fast(tree, state: prealloc, node: p, parent, bits, changeset); |
1107 | cache_state(state: prealloc, cached_ptr: cached_state); |
1108 | prealloc = NULL; |
1109 | goto out; |
1110 | } |
1111 | hit_next: |
1112 | last_start = state->start; |
1113 | last_end = state->end; |
1114 | |
1115 | /* |
1116 | * | ---- desired range ---- | |
1117 | * | state | |
1118 | * |
1119 | * Just lock what we found and keep going |
1120 | */ |
1121 | if (state->start == start && state->end <= end) { |
1122 | if (state->state & exclusive_bits) { |
1123 | *failed_start = state->start; |
1124 | cache_state(state, cached_ptr: failed_state); |
1125 | err = -EEXIST; |
1126 | goto out; |
1127 | } |
1128 | |
1129 | set_state_bits(tree, state, bits, changeset); |
1130 | cache_state(state, cached_ptr: cached_state); |
1131 | merge_state(tree, state); |
1132 | if (last_end == (u64)-1) |
1133 | goto out; |
1134 | start = last_end + 1; |
1135 | state = next_state(state); |
1136 | if (start < end && state && state->start == start && |
1137 | !need_resched()) |
1138 | goto hit_next; |
1139 | goto search_again; |
1140 | } |
1141 | |
1142 | /* |
1143 | * | ---- desired range ---- | |
1144 | * | state | |
1145 | * or |
1146 | * | ------------- state -------------- | |
1147 | * |
1148 | * We need to split the extent we found, and may flip bits on second |
1149 | * half. |
1150 | * |
1151 | * If the extent we found extends past our range, we just split and |
1152 | * search again. It'll get split again the next time though. |
1153 | * |
1154 | * If the extent we found is inside our range, we set the desired bit |
1155 | * on it. |
1156 | */ |
1157 | if (state->start < start) { |
1158 | if (state->state & exclusive_bits) { |
1159 | *failed_start = start; |
1160 | cache_state(state, cached_ptr: failed_state); |
1161 | err = -EEXIST; |
1162 | goto out; |
1163 | } |
1164 | |
1165 | /* |
1166 | * If this extent already has all the bits we want set, then |
1167 | * skip it, not necessary to split it or do anything with it. |
1168 | */ |
1169 | if ((state->state & bits) == bits) { |
1170 | start = state->end + 1; |
1171 | cache_state(state, cached_ptr: cached_state); |
1172 | goto search_again; |
1173 | } |
1174 | |
1175 | prealloc = alloc_extent_state_atomic(prealloc); |
1176 | if (!prealloc) |
1177 | goto search_again; |
1178 | err = split_state(tree, orig: state, prealloc, split: start); |
1179 | if (err) |
1180 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1181 | |
1182 | prealloc = NULL; |
1183 | if (err) |
1184 | goto out; |
1185 | if (state->end <= end) { |
1186 | set_state_bits(tree, state, bits, changeset); |
1187 | cache_state(state, cached_ptr: cached_state); |
1188 | merge_state(tree, state); |
1189 | if (last_end == (u64)-1) |
1190 | goto out; |
1191 | start = last_end + 1; |
1192 | state = next_state(state); |
1193 | if (start < end && state && state->start == start && |
1194 | !need_resched()) |
1195 | goto hit_next; |
1196 | } |
1197 | goto search_again; |
1198 | } |
1199 | /* |
1200 | * | ---- desired range ---- | |
1201 | * | state | or | state | |
1202 | * |
1203 | * There's a hole, we need to insert something in it and ignore the |
1204 | * extent we found. |
1205 | */ |
1206 | if (state->start > start) { |
1207 | u64 this_end; |
1208 | struct extent_state *inserted_state; |
1209 | |
1210 | if (end < last_start) |
1211 | this_end = end; |
1212 | else |
1213 | this_end = last_start - 1; |
1214 | |
1215 | prealloc = alloc_extent_state_atomic(prealloc); |
1216 | if (!prealloc) |
1217 | goto search_again; |
1218 | |
1219 | /* |
1220 | * Avoid to free 'prealloc' if it can be merged with the later |
1221 | * extent. |
1222 | */ |
1223 | prealloc->start = start; |
1224 | prealloc->end = this_end; |
1225 | inserted_state = insert_state(tree, state: prealloc, bits, changeset); |
1226 | if (IS_ERR(ptr: inserted_state)) { |
1227 | err = PTR_ERR(ptr: inserted_state); |
1228 | extent_io_tree_panic(tree, state: prealloc, opname: "insert" , err); |
1229 | } |
1230 | |
1231 | cache_state(state: inserted_state, cached_ptr: cached_state); |
1232 | if (inserted_state == prealloc) |
1233 | prealloc = NULL; |
1234 | start = this_end + 1; |
1235 | goto search_again; |
1236 | } |
1237 | /* |
1238 | * | ---- desired range ---- | |
1239 | * | state | |
1240 | * |
1241 | * We need to split the extent, and set the bit on the first half |
1242 | */ |
1243 | if (state->start <= end && state->end > end) { |
1244 | if (state->state & exclusive_bits) { |
1245 | *failed_start = start; |
1246 | cache_state(state, cached_ptr: failed_state); |
1247 | err = -EEXIST; |
1248 | goto out; |
1249 | } |
1250 | |
1251 | prealloc = alloc_extent_state_atomic(prealloc); |
1252 | if (!prealloc) |
1253 | goto search_again; |
1254 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
1255 | if (err) |
1256 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1257 | |
1258 | set_state_bits(tree, state: prealloc, bits, changeset); |
1259 | cache_state(state: prealloc, cached_ptr: cached_state); |
1260 | merge_state(tree, state: prealloc); |
1261 | prealloc = NULL; |
1262 | goto out; |
1263 | } |
1264 | |
1265 | search_again: |
1266 | if (start > end) |
1267 | goto out; |
1268 | spin_unlock(lock: &tree->lock); |
1269 | if (gfpflags_allow_blocking(gfp_flags: mask)) |
1270 | cond_resched(); |
1271 | goto again; |
1272 | |
1273 | out: |
1274 | spin_unlock(lock: &tree->lock); |
1275 | if (prealloc) |
1276 | free_extent_state(state: prealloc); |
1277 | |
1278 | return err; |
1279 | |
1280 | } |
1281 | |
1282 | int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1283 | u32 bits, struct extent_state **cached_state) |
1284 | { |
1285 | return __set_extent_bit(tree, start, end, bits, NULL, NULL, |
1286 | cached_state, NULL); |
1287 | } |
1288 | |
1289 | /* |
1290 | * Convert all bits in a given range from one bit to another |
1291 | * |
1292 | * @tree: the io tree to search |
1293 | * @start: the start offset in bytes |
1294 | * @end: the end offset in bytes (inclusive) |
1295 | * @bits: the bits to set in this range |
1296 | * @clear_bits: the bits to clear in this range |
1297 | * @cached_state: state that we're going to cache |
1298 | * |
1299 | * This will go through and set bits for the given range. If any states exist |
1300 | * already in this range they are set with the given bit and cleared of the |
1301 | * clear_bits. This is only meant to be used by things that are mergeable, ie. |
1302 | * converting from say DELALLOC to DIRTY. This is not meant to be used with |
1303 | * boundary bits like LOCK. |
1304 | * |
1305 | * All allocations are done with GFP_NOFS. |
1306 | */ |
1307 | int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, |
1308 | u32 bits, u32 clear_bits, |
1309 | struct extent_state **cached_state) |
1310 | { |
1311 | struct extent_state *state; |
1312 | struct extent_state *prealloc = NULL; |
1313 | struct rb_node **p = NULL; |
1314 | struct rb_node *parent = NULL; |
1315 | int err = 0; |
1316 | u64 last_start; |
1317 | u64 last_end; |
1318 | bool first_iteration = true; |
1319 | |
1320 | btrfs_debug_check_extent_io_range(tree, start, end); |
1321 | trace_btrfs_convert_extent_bit(tree, start, len: end - start + 1, set_bits: bits, |
1322 | clear_bits); |
1323 | |
1324 | again: |
1325 | if (!prealloc) { |
1326 | /* |
1327 | * Best effort, don't worry if extent state allocation fails |
1328 | * here for the first iteration. We might have a cached state |
1329 | * that matches exactly the target range, in which case no |
1330 | * extent state allocations are needed. We'll only know this |
1331 | * after locking the tree. |
1332 | */ |
1333 | prealloc = alloc_extent_state(GFP_NOFS); |
1334 | if (!prealloc && !first_iteration) |
1335 | return -ENOMEM; |
1336 | } |
1337 | |
1338 | spin_lock(lock: &tree->lock); |
1339 | if (cached_state && *cached_state) { |
1340 | state = *cached_state; |
1341 | if (state->start <= start && state->end > start && |
1342 | extent_state_in_tree(state)) |
1343 | goto hit_next; |
1344 | } |
1345 | |
1346 | /* |
1347 | * This search will find all the extents that end after our range |
1348 | * starts. |
1349 | */ |
1350 | state = tree_search_for_insert(tree, offset: start, node_ret: &p, parent_ret: &parent); |
1351 | if (!state) { |
1352 | prealloc = alloc_extent_state_atomic(prealloc); |
1353 | if (!prealloc) { |
1354 | err = -ENOMEM; |
1355 | goto out; |
1356 | } |
1357 | prealloc->start = start; |
1358 | prealloc->end = end; |
1359 | insert_state_fast(tree, state: prealloc, node: p, parent, bits, NULL); |
1360 | cache_state(state: prealloc, cached_ptr: cached_state); |
1361 | prealloc = NULL; |
1362 | goto out; |
1363 | } |
1364 | hit_next: |
1365 | last_start = state->start; |
1366 | last_end = state->end; |
1367 | |
1368 | /* |
1369 | * | ---- desired range ---- | |
1370 | * | state | |
1371 | * |
1372 | * Just lock what we found and keep going. |
1373 | */ |
1374 | if (state->start == start && state->end <= end) { |
1375 | set_state_bits(tree, state, bits, NULL); |
1376 | cache_state(state, cached_ptr: cached_state); |
1377 | state = clear_state_bit(tree, state, bits: clear_bits, wake: 0, NULL); |
1378 | if (last_end == (u64)-1) |
1379 | goto out; |
1380 | start = last_end + 1; |
1381 | if (start < end && state && state->start == start && |
1382 | !need_resched()) |
1383 | goto hit_next; |
1384 | goto search_again; |
1385 | } |
1386 | |
1387 | /* |
1388 | * | ---- desired range ---- | |
1389 | * | state | |
1390 | * or |
1391 | * | ------------- state -------------- | |
1392 | * |
1393 | * We need to split the extent we found, and may flip bits on second |
1394 | * half. |
1395 | * |
1396 | * If the extent we found extends past our range, we just split and |
1397 | * search again. It'll get split again the next time though. |
1398 | * |
1399 | * If the extent we found is inside our range, we set the desired bit |
1400 | * on it. |
1401 | */ |
1402 | if (state->start < start) { |
1403 | prealloc = alloc_extent_state_atomic(prealloc); |
1404 | if (!prealloc) { |
1405 | err = -ENOMEM; |
1406 | goto out; |
1407 | } |
1408 | err = split_state(tree, orig: state, prealloc, split: start); |
1409 | if (err) |
1410 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1411 | prealloc = NULL; |
1412 | if (err) |
1413 | goto out; |
1414 | if (state->end <= end) { |
1415 | set_state_bits(tree, state, bits, NULL); |
1416 | cache_state(state, cached_ptr: cached_state); |
1417 | state = clear_state_bit(tree, state, bits: clear_bits, wake: 0, NULL); |
1418 | if (last_end == (u64)-1) |
1419 | goto out; |
1420 | start = last_end + 1; |
1421 | if (start < end && state && state->start == start && |
1422 | !need_resched()) |
1423 | goto hit_next; |
1424 | } |
1425 | goto search_again; |
1426 | } |
1427 | /* |
1428 | * | ---- desired range ---- | |
1429 | * | state | or | state | |
1430 | * |
1431 | * There's a hole, we need to insert something in it and ignore the |
1432 | * extent we found. |
1433 | */ |
1434 | if (state->start > start) { |
1435 | u64 this_end; |
1436 | struct extent_state *inserted_state; |
1437 | |
1438 | if (end < last_start) |
1439 | this_end = end; |
1440 | else |
1441 | this_end = last_start - 1; |
1442 | |
1443 | prealloc = alloc_extent_state_atomic(prealloc); |
1444 | if (!prealloc) { |
1445 | err = -ENOMEM; |
1446 | goto out; |
1447 | } |
1448 | |
1449 | /* |
1450 | * Avoid to free 'prealloc' if it can be merged with the later |
1451 | * extent. |
1452 | */ |
1453 | prealloc->start = start; |
1454 | prealloc->end = this_end; |
1455 | inserted_state = insert_state(tree, state: prealloc, bits, NULL); |
1456 | if (IS_ERR(ptr: inserted_state)) { |
1457 | err = PTR_ERR(ptr: inserted_state); |
1458 | extent_io_tree_panic(tree, state: prealloc, opname: "insert" , err); |
1459 | } |
1460 | cache_state(state: inserted_state, cached_ptr: cached_state); |
1461 | if (inserted_state == prealloc) |
1462 | prealloc = NULL; |
1463 | start = this_end + 1; |
1464 | goto search_again; |
1465 | } |
1466 | /* |
1467 | * | ---- desired range ---- | |
1468 | * | state | |
1469 | * |
1470 | * We need to split the extent, and set the bit on the first half. |
1471 | */ |
1472 | if (state->start <= end && state->end > end) { |
1473 | prealloc = alloc_extent_state_atomic(prealloc); |
1474 | if (!prealloc) { |
1475 | err = -ENOMEM; |
1476 | goto out; |
1477 | } |
1478 | |
1479 | err = split_state(tree, orig: state, prealloc, split: end + 1); |
1480 | if (err) |
1481 | extent_io_tree_panic(tree, state, opname: "split" , err); |
1482 | |
1483 | set_state_bits(tree, state: prealloc, bits, NULL); |
1484 | cache_state(state: prealloc, cached_ptr: cached_state); |
1485 | clear_state_bit(tree, state: prealloc, bits: clear_bits, wake: 0, NULL); |
1486 | prealloc = NULL; |
1487 | goto out; |
1488 | } |
1489 | |
1490 | search_again: |
1491 | if (start > end) |
1492 | goto out; |
1493 | spin_unlock(lock: &tree->lock); |
1494 | cond_resched(); |
1495 | first_iteration = false; |
1496 | goto again; |
1497 | |
1498 | out: |
1499 | spin_unlock(lock: &tree->lock); |
1500 | if (prealloc) |
1501 | free_extent_state(state: prealloc); |
1502 | |
1503 | return err; |
1504 | } |
1505 | |
1506 | /* |
1507 | * Find the first range that has @bits not set. This range could start before |
1508 | * @start. |
1509 | * |
1510 | * @tree: the tree to search |
1511 | * @start: offset at/after which the found extent should start |
1512 | * @start_ret: records the beginning of the range |
1513 | * @end_ret: records the end of the range (inclusive) |
1514 | * @bits: the set of bits which must be unset |
1515 | * |
1516 | * Since unallocated range is also considered one which doesn't have the bits |
1517 | * set it's possible that @end_ret contains -1, this happens in case the range |
1518 | * spans (last_range_end, end of device]. In this case it's up to the caller to |
1519 | * trim @end_ret to the appropriate size. |
1520 | */ |
1521 | void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, |
1522 | u64 *start_ret, u64 *end_ret, u32 bits) |
1523 | { |
1524 | struct extent_state *state; |
1525 | struct extent_state *prev = NULL, *next = NULL; |
1526 | |
1527 | spin_lock(lock: &tree->lock); |
1528 | |
1529 | /* Find first extent with bits cleared */ |
1530 | while (1) { |
1531 | state = tree_search_prev_next(tree, offset: start, prev_ret: &prev, next_ret: &next); |
1532 | if (!state && !next && !prev) { |
1533 | /* |
1534 | * Tree is completely empty, send full range and let |
1535 | * caller deal with it |
1536 | */ |
1537 | *start_ret = 0; |
1538 | *end_ret = -1; |
1539 | goto out; |
1540 | } else if (!state && !next) { |
1541 | /* |
1542 | * We are past the last allocated chunk, set start at |
1543 | * the end of the last extent. |
1544 | */ |
1545 | *start_ret = prev->end + 1; |
1546 | *end_ret = -1; |
1547 | goto out; |
1548 | } else if (!state) { |
1549 | state = next; |
1550 | } |
1551 | |
1552 | /* |
1553 | * At this point 'state' either contains 'start' or start is |
1554 | * before 'state' |
1555 | */ |
1556 | if (in_range(start, state->start, state->end - state->start + 1)) { |
1557 | if (state->state & bits) { |
1558 | /* |
1559 | * |--range with bits sets--| |
1560 | * | |
1561 | * start |
1562 | */ |
1563 | start = state->end + 1; |
1564 | } else { |
1565 | /* |
1566 | * 'start' falls within a range that doesn't |
1567 | * have the bits set, so take its start as the |
1568 | * beginning of the desired range |
1569 | * |
1570 | * |--range with bits cleared----| |
1571 | * | |
1572 | * start |
1573 | */ |
1574 | *start_ret = state->start; |
1575 | break; |
1576 | } |
1577 | } else { |
1578 | /* |
1579 | * |---prev range---|---hole/unset---|---node range---| |
1580 | * | |
1581 | * start |
1582 | * |
1583 | * or |
1584 | * |
1585 | * |---hole/unset--||--first node--| |
1586 | * 0 | |
1587 | * start |
1588 | */ |
1589 | if (prev) |
1590 | *start_ret = prev->end + 1; |
1591 | else |
1592 | *start_ret = 0; |
1593 | break; |
1594 | } |
1595 | } |
1596 | |
1597 | /* |
1598 | * Find the longest stretch from start until an entry which has the |
1599 | * bits set |
1600 | */ |
1601 | while (state) { |
1602 | if (state->end >= start && !(state->state & bits)) { |
1603 | *end_ret = state->end; |
1604 | } else { |
1605 | *end_ret = state->start - 1; |
1606 | break; |
1607 | } |
1608 | state = next_state(state); |
1609 | } |
1610 | out: |
1611 | spin_unlock(lock: &tree->lock); |
1612 | } |
1613 | |
/*
 * Count the number of bytes in the tree that have a given bit(s) set for a
 * given range.
 *
 * @tree:         The io tree to search.
 * @start:        The start offset of the range. This value is updated to the
 *                offset of the first byte found with the given bit(s), so it
 *                can end up being bigger than the initial value.
 * @search_end:   The end offset (inclusive value) of the search range.
 * @max_bytes:    The maximum byte count we are interested in. The search
 *                stops once it reaches this count.
 * @bits:         The bits the range must have in order to be accounted for.
 *                If multiple bits are set, then only subranges that have all
 *                the bits set are accounted for.
 * @contig:       Indicate if we should ignore holes in the range or not. If
 *                this is true, then stop once we find a hole.
 * @cached_state: A cached state to be used across multiple calls to this
 *                function in order to speed up searches. Use NULL if this is
 *                called only once or if each call does not start where the
 *                previous one ended.
 *
 * Returns the total number of bytes found within the given range that have
 * all given bits set. If the returned number of bytes is greater than zero
 * then @start is updated with the offset of the first byte with the bits set.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     u32 bits, int contig,
		     struct extent_state **cached_state)
{
	struct extent_state *state = NULL;
	struct extent_state *cached;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end < cur_start))
		return 0;

	spin_lock(&tree->lock);

	if (!cached_state || !*cached_state)
		goto search;

	cached = *cached_state;

	if (!extent_state_in_tree(cached))
		goto search;

	if (cached->start <= cur_start && cur_start <= cached->end) {
		state = cached;
	} else if (cached->start > cur_start) {
		struct extent_state *prev;

		/*
		 * The cached state starts after our search range's start. Check
		 * if the previous state record starts at or before the range we
		 * are looking for, and if so, use it - this is a common case
		 * when there are holes between records in the tree. If there is
		 * no previous state record, we can start from our cached state.
		 */
		prev = prev_state(cached);
		if (!prev)
			state = cached;
		else if (prev->start <= cur_start && cur_start <= prev->end)
			state = prev;
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
search:
	if (!state)
		state = tree_search(tree, cur_start);

	while (state) {
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		state = next_state(state);
	}

	if (cached_state) {
		free_extent_state(*cached_state);
		*cached_state = state;
		if (state)
			refcount_inc(&state->refs);
	}

	spin_unlock(&tree->lock);

	return total_bytes;
}

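/*
 * Illustrative example (not taken from an existing caller; variable names are
 * made up): count up to SZ_1M of delalloc bytes starting at file offset 0,
 * stopping at the first hole (contig == 1):
 *
 *	struct extent_state *cached = NULL;
 *	u64 start = 0;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(&inode->io_tree, &start, SZ_1M - 1, SZ_1M,
 *				 EXTENT_DELALLOC, 1, &cached);
 *	free_extent_state(cached);
 *
 * After the call @start points at the first delalloc byte found, if any.
 */
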
/*
 * Check if the single @bit exists in the given range.
 */
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
	struct extent_state *state = NULL;
	bool bitset = false;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > end)
			break;

		if (state->state & bit) {
			bitset = true;
			break;
		}

		/* If state->end is (u64)-1, start will overflow to 0 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}
	spin_unlock(&tree->lock);
	return bitset;
}

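/*
 * Illustrative example (not taken from an existing caller; @range_start and
 * @range_end are made-up bounds): bail out if any byte of the range is
 * currently locked:
 *
 *	if (test_range_bit_exists(&inode->io_tree, range_start, range_end,
 *				  EXTENT_LOCKED))
 *		return -EAGAIN;
 */
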
/*
 * Check if the whole range [@start, @end] has the single @bit set.
 */
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
		    struct extent_state *cached)
{
	struct extent_state *state = NULL;
	bool bitset = true;

	ASSERT(is_power_of_2(bit));

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		state = cached;
	else
		state = tree_search(tree, start);
	while (state && start <= end) {
		if (state->start > start) {
			bitset = false;
			break;
		}

		if (state->start > end)
			break;

		if ((state->state & bit) == 0) {
			bitset = false;
			break;
		}

		if (state->end == (u64)-1)
			break;

		/*
		 * Last entry (if state->end is (u64)-1 and overflow happens),
		 * or next entry starts after the range.
		 */
		start = state->end + 1;
		if (start > end || start == 0)
			break;
		state = next_state(state);
	}

	/* We ran out of states and were still inside of our range. */
	if (!state)
		bitset = false;
	spin_unlock(&tree->lock);
	return bitset;
}

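/*
 * Note: unlike test_range_bit_exists() above, which returns true if @bit is
 * set anywhere in the range, test_range_bit() only returns true when every
 * byte of [start, end] is covered by states that all have @bit set - a hole
 * between states, or a state missing the bit, makes it return false.
 */
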
/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as the current changeset will
	 * record any bits changed, so for the EXTENT_LOCKED case it would
	 * either fail with -EEXIST or the changeset would record the whole
	 * range.
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support the EXTENT_LOCKED case, for the same reason as
	 * set_record_extent_bits().
	 */
	ASSERT(!(bits & EXTENT_LOCKED));

	return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
}

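/*
 * Try to lock the range [start, end] without waiting: returns 1 if the whole
 * range was locked, or 0 if some part of it was already locked (-EEXIST), in
 * which case any part locked while trying is unlocked again before returning.
 */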
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached);
		return 0;
	}
	return 1;
}

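/*
 * Illustrative example (not taken from an existing caller; 'nowait' is a
 * made-up flag): fall back to the blocking lock only when sleeping is allowed:
 *
 *	if (!try_lock_extent(&inode->io_tree, start, end, &cached)) {
 *		if (nowait)
 *			return -EAGAIN;
 *		lock_extent(&inode->io_tree, start, end, &cached);
 *	}
 */
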
/*
 * Lock the range [start, end] in the tree. If any part of the range is
 * already locked, wait for it to be unlocked and then retry until the whole
 * range is locked.
 */
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		struct extent_state **cached_state)
{
	struct extent_state *failed_state = NULL;
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       &failed_state, cached_state, NULL);
	while (err == -EEXIST) {
		if (failed_start != start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, cached_state);

		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
				&failed_state);
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       &failed_start, &failed_state,
				       cached_state, NULL);
	}
	return err;
}

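/*
 * Illustrative example (not taken from an existing caller): the usual pattern
 * is to pair lock_extent() with unlock_extent() (declared in extent-io-tree.h)
 * around work on a byte range, optionally keeping a cached state so the
 * unlock does not have to search the tree again:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent(&inode->io_tree, start, end, &cached);
 *	... operate on [start, end] ...
 *	unlock_extent(&inode->io_tree, start, end, &cached);
 */
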
void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0, 0,
					       NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	return 0;
}