// SPDX-License-Identifier: GPL-2.0

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
#include "compression.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_map_cache;

int __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("btrfs_extent_map",
					     sizeof(struct extent_map), 0,
					     SLAB_MEM_SPREAD, NULL);
	if (!extent_map_cache)
		return -ENOMEM;
	return 0;
}

void __cold extent_map_exit(void)
{
	kmem_cache_destroy(extent_map_cache);
}

/*
 * Initialize the extent tree @tree. Should be called for each new inode or
 * other user of the extent_map interface.
 */
void extent_map_tree_init(struct extent_map_tree *tree)
{
	tree->map = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&tree->modified_extents);
	rwlock_init(&tree->lock);
}

/*
 * Allocate a new extent_map structure. The new structure is returned with a
 * reference count of one and needs to be freed using free_extent_map().
 */
struct extent_map *alloc_extent_map(void)
{
	struct extent_map *em;

	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
	if (!em)
		return NULL;
	RB_CLEAR_NODE(&em->rb_node);
	em->compress_type = BTRFS_COMPRESS_NONE;
	refcount_set(&em->refs, 1);
	INIT_LIST_HEAD(&em->list);
	return em;
}
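
/*
 * Rough usage sketch for the allocation/refcount contract (illustrative only,
 * not code from this file):
 *
 *	struct extent_map *em = alloc_extent_map();
 *
 *	if (!em)
 *		return -ENOMEM;
 *	... fill in em->start, em->len, em->block_start, etc. ...
 *	... optionally insert it into a tree, which takes its own reference ...
 *	free_extent_map(em);	(drops only our reference)
 */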

/*
 * Drop the reference count on @em by one and free the structure if the
 * reference count hits zero.
 */
void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (refcount_dec_and_test(&em->refs)) {
		WARN_ON(extent_map_in_tree(em));
		WARN_ON(!list_empty(&em->list));
		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
			kfree(em->map_lookup);
		kmem_cache_free(extent_map_cache, em);
	}
}

/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
	if (start + len < start)
		return (u64)-1;
	return start + len;
}
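
/*
 * For example (illustrative): range_end(0, 4096) == 4096, while a length that
 * would wrap past the end of the u64 space saturates instead:
 * range_end((u64)-1 - 10, 100) == (u64)-1.
 */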

static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_map *entry = NULL;
	struct rb_node *orig_parent = NULL;
	u64 end = range_end(em->start, em->len);
	bool leftmost = true;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct extent_map, rb_node);

		if (em->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (em->start >= extent_map_end(entry)) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return -EEXIST;
		}
	}

	orig_parent = parent;
	while (parent && em->start >= extent_map_end(entry)) {
		parent = rb_next(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	parent = orig_parent;
	entry = rb_entry(parent, struct extent_map, rb_node);
	while (parent && em->start < entry->start) {
		parent = rb_prev(parent);
		entry = rb_entry(parent, struct extent_map, rb_node);
	}
	if (parent)
		if (end > entry->start && em->start < extent_map_end(entry))
			return -EEXIST;

	rb_link_node(&em->rb_node, orig_parent, p);
	rb_insert_color_cached(&em->rb_node, root, leftmost);
	return 0;
}

/*
 * Search through the tree for an extent_map with a given offset. If it can't
 * be found, try to find some neighboring extents.
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_or_next_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct extent_map *entry;
	struct extent_map *prev_entry = NULL;

	ASSERT(prev_or_next_ret);

	while (n) {
		entry = rb_entry(n, struct extent_map, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset >= extent_map_end(entry))
			n = n->rb_right;
		else
			return n;
	}

	orig_prev = prev;
	while (prev && offset >= extent_map_end(prev_entry)) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}

	/*
	 * A next extent map was found, return it, as in this case the caller
	 * does not care about finding a previous one.
	 */
	if (prev) {
		*prev_or_next_ret = prev;
		return NULL;
	}

	prev = orig_prev;
	prev_entry = rb_entry(prev, struct extent_map, rb_node);
	while (prev && offset < prev_entry->start) {
		prev = rb_prev(prev);
		prev_entry = rb_entry(prev, struct extent_map, rb_node);
	}
	*prev_or_next_ret = prev;

	return NULL;
}
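
/*
 * Behavior sketch (illustrative): with maps [0, 16K) and [32K, 48K) in the
 * tree, __tree_search(root, 8K, &p) returns the node for [0, 16K) directly;
 * __tree_search(root, 20K, &p) returns NULL and sets p to [32K, 48K) (the
 * next map); __tree_search(root, 64K, &p) returns NULL and sets p to
 * [32K, 48K) (the previous map).
 */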

/* Check to see if two extent_map structs are adjacent and safe to merge. */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
		return 0;

	/*
	 * Don't merge compressed extents, as we need to know their actual
	 * size.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
		return 0;

	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
		return 0;

	/*
	 * We don't want to merge stuff that hasn't been written to the log yet
	 * since it may not reflect exactly what is on disk, and that would be
	 * bad.
	 */
	if (!list_empty(&prev->list) || !list_empty(&next->list))
		return 0;

	ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
	       prev->block_start != EXTENT_MAP_DELALLOC);

	if (prev->map_lookup || next->map_lookup)
		ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
		       test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));

	if (extent_map_end(prev) == next->start &&
	    prev->flags == next->flags &&
	    prev->map_lookup == next->map_lookup &&
	    ((next->block_start == EXTENT_MAP_HOLE &&
	      prev->block_start == EXTENT_MAP_HOLE) ||
	     (next->block_start == EXTENT_MAP_INLINE &&
	      prev->block_start == EXTENT_MAP_INLINE) ||
	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
	      next->block_start == extent_map_block_end(prev)))) {
		return 1;
	}
	return 0;
}
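
/*
 * Merge example (illustrative): a map covering file range [0, 4K) backed by
 * disk blocks at 1M and a map covering [4K, 8K) backed by blocks at 1M + 4K
 * pass all of the checks above (adjacent file ranges, equal flags, contiguous
 * block ranges) and may collapse into a single map [0, 8K) at 1M.
 */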

static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
{
	struct extent_map *merge = NULL;
	struct rb_node *rb;

	/*
	 * We can't modify an extent map that is in the tree and that is being
	 * used by another task, as it can cause that other task to see it in
	 * inconsistent state during the merging. We always have 1 reference for
	 * the tree and 1 for this task (which is unpinning the extent map or
	 * clearing the logging flag), so anything > 2 means it's being used by
	 * other tasks too.
	 */
	if (refcount_read(&em->refs) > 2)
		return;

	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			merge = rb_entry(rb, struct extent_map, rb_node);
		if (rb && mergable_maps(merge, em)) {
			em->start = merge->start;
			em->orig_start = merge->orig_start;
			em->len += merge->len;
			em->block_len += merge->block_len;
			em->block_start = merge->block_start;
			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
			em->mod_start = merge->mod_start;
			em->generation = max(em->generation, merge->generation);
			set_bit(EXTENT_FLAG_MERGED, &em->flags);

			rb_erase_cached(&merge->rb_node, &tree->map);
			RB_CLEAR_NODE(&merge->rb_node);
			free_extent_map(merge);
		}
	}

	rb = rb_next(&em->rb_node);
	if (rb)
		merge = rb_entry(rb, struct extent_map, rb_node);
	if (rb && mergable_maps(em, merge)) {
		em->len += merge->len;
		em->block_len += merge->block_len;
		rb_erase_cached(&merge->rb_node, &tree->map);
		RB_CLEAR_NODE(&merge->rb_node);
		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
		em->generation = max(em->generation, merge->generation);
		set_bit(EXTENT_FLAG_MERGED, &em->flags);
		free_extent_map(merge);
	}
}

/*
 * Unpin an extent from the cache.
 *
 * @tree: tree to unpin the extent in
 * @start: logical offset in the file
 * @len: length of the extent
 * @gen: generation that this extent has been modified in
 *
 * Called after an extent has been written to disk properly. Set the generation
 * to the generation that actually added the file item to the inode so we know
 * we need to sync this extent when we call fsync().
 */
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
		       u64 gen)
{
	int ret = 0;
	struct extent_map *em;
	bool prealloc = false;

	write_lock(&tree->lock);
	em = lookup_extent_mapping(tree, start, len);

	WARN_ON(!em || em->start != start);

	if (!em)
		goto out;

	em->generation = gen;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
		prealloc = true;
		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
	}

	try_merge_map(tree, em);

	if (prealloc) {
		em->mod_start = em->start;
		em->mod_len = em->len;
	}

	free_extent_map(em);
out:
	write_unlock(&tree->lock);
	return ret;
}
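
/*
 * Typical call site, as a rough sketch (illustrative only): after writeback
 * of an ordered extent covering [start, start + len) completes, the finishing
 * path unpins the cached mapping with the transaction id that committed the
 * file extent item:
 *
 *	unpin_extent_cache(&inode->extent_tree, start, len, trans->transid);
 */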

void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
	if (extent_map_in_tree(em))
		try_merge_map(tree, em);
}

static inline void setup_extent_mapping(struct extent_map_tree *tree,
					struct extent_map *em,
					int modified)
{
	refcount_inc(&em->refs);
	em->mod_start = em->start;
	em->mod_len = em->len;

	if (modified)
		list_move(&em->list, &tree->modified_extents);
	else
		try_merge_map(tree, em);
}

static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		set_extent_bit(&device->alloc_state, stripe->physical,
			       stripe->physical + stripe_size - 1,
			       bits | EXTENT_NOWAIT, NULL);
	}
}

static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
{
	struct map_lookup *map = em->map_lookup;
	u64 stripe_size = em->orig_block_len;
	int i;

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		__clear_extent_bit(&device->alloc_state, stripe->physical,
				   stripe->physical + stripe_size - 1,
				   bits | EXTENT_NOWAIT,
				   NULL, NULL);
	}
}

/*
 * Add new extent map to the extent tree
 *
 * @tree: tree to insert new map in
 * @em: map to insert
 * @modified: indicate whether the given @em should be added to the
 *	      modified list, which indicates the extent needs to be logged
 *
 * Insert @em into @tree or perform a simple forward/backward merge with
 * existing mappings. The extent_map struct passed in will be inserted
 * into the tree directly, with an additional reference taken, or a
 * reference dropped if the merge attempt was successful.
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em, int modified)
{
	int ret = 0;

	lockdep_assert_held_write(&tree->lock);

	ret = tree_insert(&tree->map, em);
	if (ret)
		goto out;

	setup_extent_mapping(tree, em, modified);
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
		extent_map_device_set_bits(em, CHUNK_ALLOCATED);
		extent_map_device_clear_bits(em, CHUNK_TRIMMED);
	}
out:
	return ret;
}
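
/*
 * Reference flow sketch (illustrative): on success the tree takes its own
 * reference, so a caller that is done with @em after insertion still drops
 * only its own:
 *
 *	write_lock(&tree->lock);
 *	ret = add_extent_mapping(tree, em, 0);
 *	write_unlock(&tree->lock);
 *	if (ret == 0)
 *		free_extent_map(em);	(the tree reference keeps it alive)
 */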

static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
{
	struct extent_map *em;
	struct rb_node *rb_node;
	struct rb_node *prev_or_next = NULL;
	u64 end = range_end(start, len);

	rb_node = __tree_search(&tree->map.rb_root, start, &prev_or_next);
	if (!rb_node) {
		if (prev_or_next)
			rb_node = prev_or_next;
		else
			return NULL;
	}

	em = rb_entry(rb_node, struct extent_map, rb_node);

	if (strict && !(end > em->start && start < extent_map_end(em)))
		return NULL;

	refcount_inc(&em->refs);
	return em;
}

/*
 * Lookup extent_map that intersects @start + @len range.
 *
 * @tree: tree to lookup in
 * @start: byte offset to start the search
 * @len: length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range. There may be additional objects in the tree that
 * intersect, so check the object returned carefully to make sure that no
 * additional lookups are needed.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 1);
}
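
/*
 * The tree lock is not taken here; a rough reader-side pattern (illustrative):
 *
 *	read_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	read_unlock(&tree->lock);
 *	if (em) {
 *		... use em ...
 *		free_extent_map(em);	(drop the lookup reference)
 *	}
 */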

/*
 * Find a nearby extent map intersecting @start + @len (not an exact search).
 *
 * @tree: tree to lookup in
 * @start: byte offset to start the search
 * @len: length of the lookup range
 *
 * Find and return the first extent_map struct in @tree that intersects the
 * [start, start + len) range.
 *
 * If one can't be found, any nearby extent may be returned.
 */
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len)
{
	return __lookup_extent_mapping(tree, start, len, 0);
}

/*
 * Remove an extent_map from the extent tree.
 *
 * @tree: extent tree to remove from
 * @em: extent map being removed
 *
 * Remove @em from @tree. No reference counts are dropped, and no checks
 * are done to see if the range is in use.
 */
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	rb_erase_cached(&em->rb_node, &tree->map);
	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		list_del_init(&em->list);
	if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
		extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
	RB_CLEAR_NODE(&em->rb_node);
}

static void replace_extent_mapping(struct extent_map_tree *tree,
				   struct extent_map *cur,
				   struct extent_map *new,
				   int modified)
{
	lockdep_assert_held_write(&tree->lock);

	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
	ASSERT(extent_map_in_tree(cur));
	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
		list_del_init(&cur->list);
	rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
	RB_CLEAR_NODE(&cur->rb_node);

	setup_extent_mapping(tree, new, modified);
}

static struct extent_map *next_extent_map(const struct extent_map *em)
{
	struct rb_node *next;

	next = rb_next(&em->rb_node);
	if (!next)
		return NULL;
	return container_of(next, struct extent_map, rb_node);
}

static struct extent_map *prev_extent_map(struct extent_map *em)
{
	struct rb_node *prev;

	prev = rb_prev(&em->rb_node);
	if (!prev)
		return NULL;
	return container_of(prev, struct extent_map, rb_node);
}

/*
 * Helper for btrfs_get_extent(). Given an existing extent in the tree that is
 * the nearest extent to map_start, and a new extent that you want to insert,
 * deal with the overlap and insert the best fitted new extent into the tree.
 */
static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
					 struct extent_map *existing,
					 struct extent_map *em,
					 u64 map_start)
{
	struct extent_map *prev;
	struct extent_map *next;
	u64 start;
	u64 end;
	u64 start_diff;

	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));

	if (existing->start > map_start) {
		next = existing;
		prev = prev_extent_map(next);
	} else {
		prev = existing;
		next = next_extent_map(prev);
	}

	start = prev ? extent_map_end(prev) : em->start;
	start = max_t(u64, start, em->start);
	end = next ? next->start : extent_map_end(em);
	end = min_t(u64, end, extent_map_end(em));
	start_diff = start - em->start;
	em->start = start;
	em->len = end - start;
	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		em->block_start += start_diff;
		em->block_len = em->len;
	}
	return add_extent_mapping(em_tree, em, 0);
}
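
/*
 * Worked example (illustrative): if @em spans [0, 16K) and the tree already
 * has @existing at [0, 4K) with no other neighbor, then prev = existing and
 * next = NULL, so @em is trimmed to [4K, 16K) (start_diff = 4K) and its
 * block_start is advanced by 4K before insertion.
 */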

/*
 * Add extent mapping into em_tree.
 *
 * @fs_info: the filesystem
 * @em_tree: extent tree into which we want to insert the extent mapping
 * @em_in: extent we are inserting
 * @start: start of the logical range btrfs_get_extent() is requesting
 * @len: length of the logical range btrfs_get_extent() is requesting
 *
 * Note that @em_in's range may be different from [start, start+len),
 * but they must overlap.
 *
 * Insert @em_in into @em_tree. In case there is an overlapping range, handle
 * the -EEXIST by either:
 * a) Returning the existing extent in @em_in if @start is within the
 *    existing em.
 * b) Merging the existing extent with @em_in passed in.
 *
 * Return 0 on success, otherwise -EEXIST.
 */
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
			     struct extent_map_tree *em_tree,
			     struct extent_map **em_in, u64 start, u64 len)
{
	int ret;
	struct extent_map *em = *em_in;

	/*
	 * Tree-checker should have rejected any inline extent with non-zero
	 * file offset. Here just do a sanity check.
	 */
	if (em->block_start == EXTENT_MAP_INLINE)
		ASSERT(em->start == 0);

	ret = add_extent_mapping(em_tree, em, 0);
	/*
	 * It is possible that someone inserted the extent into the tree
	 * while we had the lock dropped. It is also possible that
	 * an overlapping map exists in the tree.
	 */
	if (ret == -EEXIST) {
		struct extent_map *existing;

		ret = 0;

		existing = search_extent_mapping(em_tree, start, len);

		trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);

		/*
		 * existing will always be non-NULL, since there must be an
		 * extent map causing the -EEXIST.
		 */
		if (start >= existing->start &&
		    start < extent_map_end(existing)) {
			free_extent_map(em);
			*em_in = existing;
			ret = 0;
		} else {
			u64 orig_start = em->start;
			u64 orig_len = em->len;

			/*
			 * The existing extent map is the one nearest to
			 * the [start, start + len) range which overlaps.
			 */
			ret = merge_extent_mapping(em_tree, existing,
						   em, start);
			if (ret) {
				free_extent_map(em);
				*em_in = NULL;
				WARN_ONCE(ret,
"unexpected error %d: merge existing(start %llu len %llu) with em(start %llu len %llu)\n",
					  ret, existing->start, existing->len,
					  orig_start, orig_len);
			}
			free_extent_map(existing);
		}
	}

	ASSERT(ret == 0 || ret == -EEXIST);
	return ret;
}

/*
 * Drop all extent maps from a tree in the fastest possible way, rescheduling
 * if needed. This avoids searching the tree, from the root down to the first
 * extent map, before each deletion.
 */
static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
{
	write_lock(&tree->lock);
	while (!RB_EMPTY_ROOT(&tree->map.rb_root)) {
		struct extent_map *em;
		struct rb_node *node;

		node = rb_first_cached(&tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(tree, em);
		free_extent_map(em);
		cond_resched_rwlock_write(&tree->lock);
	}
	write_unlock(&tree->lock);
}

/*
 * Drop all extent maps in a given range.
 *
 * @inode: The target inode.
 * @start: Start offset of the range.
 * @end: End offset of the range (inclusive value).
 * @skip_pinned: Indicate if pinned extent maps should be ignored or not.
 *
 * This drops all the extent maps that intersect the given range [@start, @end].
 * Extent maps that partially overlap the range and extend before or beyond it
 * are split.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
				 bool skip_pinned)
{
	struct extent_map *split;
	struct extent_map *split2;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		if (start == 0 && !skip_pinned) {
			drop_all_extent_maps_fast(em_tree);
			return;
		}
		len = (u64)-1;
	} else {
		/* Make end offset exclusive for use in the loop below. */
		end++;
	}

	/*
	 * It's ok if we fail to allocate the extent maps, see the comment near
	 * the bottom of the loop below. We only need two spare extent maps in
	 * the worst case, where the first extent map that intersects our range
	 * starts before the range and the last extent map that intersects our
	 * range ends after our range (and they might be the same extent map),
	 * because we need to split those two extent maps at the boundaries.
	 */
	split = alloc_extent_map();
	split2 = alloc_extent_map();

	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);

	while (em) {
		/* extent_map_end() returns exclusive value (last byte + 1). */
		const u64 em_end = extent_map_end(em);
		struct extent_map *next_em = NULL;
		u64 gen;
		unsigned long flags;
		bool modified;
		bool compressed;

		if (em_end < end) {
			next_em = next_extent_map(em);
			if (next_em) {
				if (next_em->start < end)
					refcount_inc(&next_em->refs);
				else
					next_em = NULL;
			}
		}

		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			start = em_end;
			goto next;
		}

		flags = em->flags;
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		/*
		 * In case we split the extent map, we want to preserve the
		 * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
		 * it on the new extent maps.
		 */
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);

		/*
		 * The extent map does not cross our target range, so no need to
		 * split it, we can remove it directly.
		 */
		if (em->start >= start && em_end <= end)
			goto remove_em;

		gen = em->generation;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);

		if (em->start < start) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
							    em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em_end > end) {
			if (!split) {
				split = split2;
				split2 = NULL;
				if (!split)
					goto remove_em;
			}
			split->start = end;
			split->len = em_end - end;
			split->block_start = em->block_start;
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
							    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->orig_start = em->orig_start;
				} else {
					const u64 diff = start + len - em->start;

					split->block_len = split->len;
					split->block_start += diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				int ret;

				ret = add_extent_mapping(em_tree, split,
							 modified);
				/* Logic error, shouldn't happen. */
				ASSERT(ret == 0);
				if (WARN_ON(ret != 0) && modified)
					btrfs_set_inode_full_sync(inode);
			}
			free_extent_map(split);
			split = NULL;
		}
remove_em:
		if (extent_map_in_tree(em)) {
			/*
			 * If the extent map is still in the tree it means that
			 * either of the following is true:
			 *
			 * 1) It fits entirely in our range (doesn't end beyond
			 *    it or starts before it);
			 *
			 * 2) It starts before our range and/or ends after our
			 *    range, and we were not able to allocate the extent
			 *    maps for split operations, @split and @split2.
			 *
			 * If we are at case 2) then we just remove the entire
			 * extent map - this is fine, since anyone who needs to
			 * access the subranges outside our range will just
			 * load it again from the subvolume tree's file extent
			 * item. However, if the extent map was in the list of
			 * modified extents, then we must mark the inode for a
			 * full fsync, otherwise a fast fsync will miss this
			 * extent if it's new and needs to be logged.
			 */
			if ((em->start < start || em_end > end) && modified) {
				ASSERT(!split);
				btrfs_set_inode_full_sync(inode);
			}
			remove_extent_mapping(em_tree, em);
		}

		/*
		 * Once for the tree reference (we replaced or removed the
		 * extent map from the tree).
		 */
		free_extent_map(em);
next:
		/* Once for us (for our lookup reference). */
		free_extent_map(em);

		em = next_em;
	}

	write_unlock(&em_tree->lock);

	free_extent_map(split);
	free_extent_map(split2);
}
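
/*
 * Split behavior example (illustrative): with one extent map covering
 * [0, 16K) and a call with start = 4K, end = 8K - 1, the map is replaced by
 * a front piece [0, 4K), a tail piece [8K, 16K) is inserted, and the middle
 * [4K, 8K) is left with no extent map at all.
 */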

/*
 * Replace a range in the inode's extent map tree with a new extent map.
 *
 * @inode: The target inode.
 * @new_em: The new extent map to add to the inode's extent map tree.
 * @modified: Indicate if the new extent map should be added to the list of
 *	      modified extents (for fast fsync tracking).
 *
 * Drops all the extent maps in the inode's extent map tree that intersect the
 * range of the new extent map and adds the new extent map to the tree.
 * The caller should have locked an appropriate file range in the inode's io
 * tree before calling this function.
 */
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				   struct extent_map *new_em,
				   bool modified)
{
	const u64 end = new_em->start + new_em->len - 1;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret;

	ASSERT(!extent_map_in_tree(new_em));

	/*
	 * The caller has locked an appropriate file range in the inode's io
	 * tree, but getting -EEXIST when adding the new extent map can still
	 * happen in case there are extents that partially cover the range, and
	 * this is due to two tasks operating on different parts of the extent.
	 * See commit 18e83ac75bfe67 ("Btrfs: fix unexpected EEXIST from
	 * btrfs_get_extent") for an example and details.
	 */
	do {
		btrfs_drop_extent_map_range(inode, new_em->start, end, false);
		write_lock(&tree->lock);
		ret = add_extent_mapping(tree, new_em, modified);
		write_unlock(&tree->lock);
	} while (ret == -EEXIST);

	return ret;
}

/*
 * Split off the first @pre bytes from the extent_map at [start, start + len),
 * and set the block_start for it to @new_logical.
 *
 * This function is used when an ordered_extent needs to be split.
 */
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
		     u64 new_logical)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	struct extent_map *split_pre = NULL;
	struct extent_map *split_mid = NULL;
	int ret = 0;
	unsigned long flags;

	ASSERT(pre != 0);
	ASSERT(pre < len);

	split_pre = alloc_extent_map();
	if (!split_pre)
		return -ENOMEM;
	split_mid = alloc_extent_map();
	if (!split_mid) {
		ret = -ENOMEM;
		goto out_free_pre;
	}

	lock_extent(&inode->io_tree, start, start + len - 1, NULL);
	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (!em) {
		ret = -EIO;
		goto out_unlock;
	}

	ASSERT(em->len == len);
	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
	ASSERT(!list_empty(&em->list));

	flags = em->flags;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);

	/* First, replace the em with a new extent_map starting from em->start. */
	split_pre->start = em->start;
	split_pre->len = pre;
	split_pre->orig_start = split_pre->start;
	split_pre->block_start = new_logical;
	split_pre->block_len = split_pre->len;
	split_pre->orig_block_len = split_pre->block_len;
	split_pre->ram_bytes = split_pre->len;
	split_pre->flags = flags;
	split_pre->compress_type = em->compress_type;
	split_pre->generation = em->generation;

	replace_extent_mapping(em_tree, em, split_pre, 1);

	/*
	 * Now we only have an extent_map at:
	 *     [em->start, em->start + pre)
	 */

	/* Insert the middle extent_map. */
	split_mid->start = em->start + pre;
	split_mid->len = em->len - pre;
	split_mid->orig_start = split_mid->start;
	split_mid->block_start = em->block_start + pre;
	split_mid->block_len = split_mid->len;
	split_mid->orig_block_len = split_mid->block_len;
	split_mid->ram_bytes = split_mid->len;
	split_mid->flags = flags;
	split_mid->compress_type = em->compress_type;
	split_mid->generation = em->generation;
	add_extent_mapping(em_tree, split_mid, 1);

	/* Once for us */
	free_extent_map(em);
	/* Once for the tree */
	free_extent_map(em);

out_unlock:
	write_unlock(&em_tree->lock);
	unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
	free_extent_map(split_mid);
out_free_pre:
	free_extent_map(split_pre);
	return ret;
}
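
/*
 * Example (illustrative): for a pinned mapping at file range
 * [start, start + 16K) backed by disk logical L, split_extent_map(inode,
 * start, 16K, 4K, L2) leaves two maps: [start, start + 4K) pointing at L2,
 * and [start + 4K, start + 16K) pointing at L + 4K.
 */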