// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed from the higher layer (inode_should_defrag()).
	 */
	u32 extent_thresh;
};

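/*
 * Compare two defrag records: order by root objectid first, then by inode
 * number. This defines the key order of the fs_info->defrag_inodes rbtree.
 */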
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

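/*
 * Check whether autodefrag should run: the autodefrag mount option must be
 * set and the filesystem must not be in the process of closing.
 */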
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then evict the inode from
		 * memory, a re-read of that inode won't have the IN_DEFRAG
		 * flag set. In that case we may find an existing defrag
		 * record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

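/* Drop and free every queued autodefrag record, used when tearing down the fs. */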
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

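/*
 * Defrag one inode from the autodefrag tree: look up its root and inode,
 * then defrag it in chunks of BTRFS_DEFRAG_BATCH sectors until EOF or an
 * error. The defrag record is always freed before returning.
 */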
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get their disk order closer to key order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * deadlock later (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

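/*
 * Look up the extent map covering @start. Try the cached extent map tree
 * first; fall back to a btree search via defrag_get_extent() when the cached
 * map is missing or is a merged map we cannot trust for defrag.
 */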
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

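/*
 * Maximum size a single extent can reach: BTRFS_MAX_COMPRESSED (128K) for
 * compressed extents, fs_info->max_extent_size otherwise.
 */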
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (extent_map_is_compressed(em))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

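/*
 * Check whether defragging the current extent is worthwhile by looking at
 * the next extent: if the next one is a hole, preallocated, too old, already
 * large enough or at max capacity, merging with it would not help.
 */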
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag, but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging current extent
	 * makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, we don't wait for writeback on each page here.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need to check if it was
		 * released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (!folio_test_uptodate(folio)) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}

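/* A contiguous file range selected as a defrag target. */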
struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular
		 * extent. This is for users who want to convert inline extents
		 * to regular ones through the max_inline= mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
					  EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the extent would
		 * not be defragged. So if an inline extent passed all the above
		 * checks, just add it for defrag, so it can be converted to a
		 * regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

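/*
 * Each defrag cluster covers up to 256K of file data, processed as one batch
 * of locked folios.
 */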
#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @folios:	locked folios covering the defrag range
 * @nr_pages:	number of locked folios
 *
 * Caller should ensure:
 *
 * - Folios are prepared
 *   Folios should be locked, no ordered extent in the folios range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = folios[0]->index;
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		folio_clear_checked(folios[i]);
		btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

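/*
 * Defrag a sector-aligned range: lock the folios and the extent range,
 * re-collect the defrag targets under the lock, then mark each target
 * delalloc + defrag so that writeback rewrites it as larger extents.
 */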
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, start_index + i);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			nr_pages = i;
			goto free_folios;
		}
	}
	for (i = 0; i < nr_pages; i++)
		folio_wait_writeback(folios[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view about the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_folios:
	for (i = 0; i < nr_pages; i++) {
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}

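/*
 * Walk one cluster worth of file range: collect defrag targets without the
 * extent lock, then defrag each target with defrag_one_range(), honouring
 * the @max_sectors limit and updating @sectors_defragged.
 */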
static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where the next defrag should be started at.
 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early
 * without defragging the whole range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at a page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
				(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}