1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "messages.h" |
4 | #include "tree-mod-log.h" |
5 | #include "disk-io.h" |
6 | #include "fs.h" |
7 | #include "accessors.h" |
8 | #include "tree-checker.h" |
9 | |
/* Describes the old root recorded by a BTRFS_MOD_LOG_ROOT_REPLACE element. */
struct tree_mod_root {
	/* Logical address of the old root node. */
	u64 logical;
	/* Level of the old root node. */
	u8 level;
};
14 | |
/*
 * One entry of the tree mod log rb tree. Each entry records a single
 * modification to a tree block and is keyed by (logical, seq) - see
 * tree_mod_log_insert() for the exact ordering.
 */
struct tree_mod_elem {
	struct rb_node node;
	/* Logical address of the tree block this operation applies to. */
	u64 logical;
	/* Sequence number, assigned under the log lock at insertion time. */
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};
43 | |
44 | /* |
45 | * Pull a new tree mod seq number for our operation. |
46 | */ |
47 | static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) |
48 | { |
49 | return atomic64_inc_return(v: &fs_info->tree_mod_seq); |
50 | } |
51 | |
52 | /* |
53 | * This adds a new blocker to the tree mod log's blocker list if the @elem |
54 | * passed does not already have a sequence number set. So when a caller expects |
55 | * to record tree modifications, it should ensure to set elem->seq to zero |
56 | * before calling btrfs_get_tree_mod_seq. |
57 | * Returns a fresh, unused tree log modification sequence number, even if no new |
58 | * blocker was added. |
59 | */ |
60 | u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, |
61 | struct btrfs_seq_list *elem) |
62 | { |
63 | write_lock(&fs_info->tree_mod_log_lock); |
64 | if (!elem->seq) { |
65 | elem->seq = btrfs_inc_tree_mod_seq(fs_info); |
66 | list_add_tail(new: &elem->list, head: &fs_info->tree_mod_seq_list); |
67 | set_bit(nr: BTRFS_FS_TREE_MOD_LOG_USERS, addr: &fs_info->flags); |
68 | } |
69 | write_unlock(&fs_info->tree_mod_log_lock); |
70 | |
71 | return elem->seq; |
72 | } |
73 | |
/*
 * Drop the blocker @elem: remove it from the blocker list and free all log
 * elements whose sequence number is lower than the lowest sequence number of
 * the remaining blockers (no user can need those entries anymore). A no-op
 * if @elem has no sequence number assigned.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(entry: &elem->list);
	elem->seq = 0;

	if (list_empty(head: &fs_info->tree_mod_seq_list)) {
		/* Last blocker gone: logging can be skipped from now on. */
		clear_bit(nr: BTRFS_FS_TREE_MOD_LOG_USERS, addr: &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Grab the successor first, rb_erase() invalidates @node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(objp: tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
124 | |
125 | /* |
126 | * Key order of the log: |
127 | * node/leaf start address -> sequence |
128 | * |
129 | * The 'start address' is the logical address of the *new* root node for root |
130 | * replace operations, or the logical address of the affected block for all |
131 | * other operations. |
132 | */ |
133 | static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, |
134 | struct tree_mod_elem *tm) |
135 | { |
136 | struct rb_root *tm_root; |
137 | struct rb_node **new; |
138 | struct rb_node *parent = NULL; |
139 | struct tree_mod_elem *cur; |
140 | |
141 | lockdep_assert_held_write(&fs_info->tree_mod_log_lock); |
142 | |
143 | tm->seq = btrfs_inc_tree_mod_seq(fs_info); |
144 | |
145 | tm_root = &fs_info->tree_mod_log; |
146 | new = &tm_root->rb_node; |
147 | while (*new) { |
148 | cur = rb_entry(*new, struct tree_mod_elem, node); |
149 | parent = *new; |
150 | if (cur->logical < tm->logical) |
151 | new = &((*new)->rb_left); |
152 | else if (cur->logical > tm->logical) |
153 | new = &((*new)->rb_right); |
154 | else if (cur->seq < tm->seq) |
155 | new = &((*new)->rb_left); |
156 | else if (cur->seq > tm->seq) |
157 | new = &((*new)->rb_right); |
158 | else |
159 | return -EEXIST; |
160 | } |
161 | |
162 | rb_link_node(node: &tm->node, parent, rb_link: new); |
163 | rb_insert_color(&tm->node, tm_root); |
164 | return 0; |
165 | } |
166 | |
167 | /* |
168 | * Determines if logging can be omitted. Returns true if it can. Otherwise, it |
169 | * returns false with the tree_mod_log_lock acquired. The caller must hold |
170 | * this until all tree mod log insertions are recorded in the rb tree and then |
171 | * write unlock fs_info::tree_mod_log_lock. |
172 | */ |
173 | static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) |
174 | { |
175 | if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) |
176 | return true; |
177 | if (eb && btrfs_header_level(eb) == 0) |
178 | return true; |
179 | |
180 | write_lock(&fs_info->tree_mod_log_lock); |
181 | if (list_empty(head: &(fs_info)->tree_mod_seq_list)) { |
182 | write_unlock(&fs_info->tree_mod_log_lock); |
183 | return true; |
184 | } |
185 | |
186 | return false; |
187 | } |
188 | |
189 | /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ |
190 | static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, |
191 | struct extent_buffer *eb) |
192 | { |
193 | if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) |
194 | return false; |
195 | if (eb && btrfs_header_level(eb) == 0) |
196 | return false; |
197 | |
198 | return true; |
199 | } |
200 | |
/*
 * Allocate and fill a log element describing a single-key operation @op at
 * @slot of node @eb. The element is not yet inserted into the log and has no
 * sequence number assigned. Returns NULL on allocation failure.
 */
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(size: sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		/* Snapshot the key pointer so a rewind can restore it. */
		btrfs_node_key(eb, disk_key: &tm->key, nr: slot);
		tm->blockptr = btrfs_node_blockptr(eb, nr: slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, nr: slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
223 | |
/*
 * Record a single key operation @op at @slot of node @eb in the tree mod log.
 * The element is allocated before taking the log lock; if logging turns out
 * to be unnecessary once the lock is held, the allocation (or its failure)
 * is simply discarded. Returns 0 on success or when no logging is needed,
 * -ENOMEM if allocation failed while logging was required.
 */
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(fs_info: eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(fs_info: eb->fs_info, eb)) {
		kfree(objp: tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(fs_info: eb->fs_info, tm);
out_unlock:
	/* tree_mod_dont_log() returned false with the lock held. */
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(objp: tm);

	return ret;
}
260 | |
/*
 * Allocate and fill a BTRFS_MOD_LOG_MOVE_KEYS element describing a move of
 * @nr_items key pointers from @src_slot to @dst_slot within node @eb. The
 * element is not yet inserted into the log. Returns an ERR_PTR on allocation
 * failure.
 */
static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(size: sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(error: -ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
280 | |
/*
 * Log a move of @nr_items key pointers from @src_slot to @dst_slot within
 * node @eb. Key pointers that get overwritten by the move (possible only when
 * moving towards the start of the node, dst_slot < src_slot) are logged as
 * removals first so a rewind can restore them. All allocations happen before
 * taking the log lock; an allocation failure is only reported if, under the
 * lock, logging turns out to be required. Returns 0 on success or when no
 * logging is needed, a negative errno otherwise.
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info: eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(n: nr_items, size: sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(ptr: tm)) {
		ret = PTR_ERR(ptr: tm);
		tm = NULL;
		goto lock;
	}

	/* Pre-allocate removal records for the slots the move overwrites. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, slot: i + dst_slot,
				op: BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info: eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info: eb->fs_info, tm: tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(fs_info: eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(objp: tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			/* Unlink elements already inserted into the rb tree. */
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(objp: tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(objp: tm_list);
	kfree(objp: tm);

	return ret;
}
368 | |
/*
 * Insert all @nritems pre-allocated elements of @tm_list into the log, in
 * reverse slot order. On failure, the elements inserted so far are unlinked
 * from the rb tree again (freeing them is left to the caller) and the error
 * is returned. Must be called with tree_mod_log_lock held for writing.
 */
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				struct tree_mod_elem **tm_list,
				int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm: tm_list[i]);
		if (ret) {
			/* Roll back the insertions done before the failure. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
388 | |
/*
 * Log the replacement of the tree root @old_root by @new_root. If
 * @log_removal is true and the old root is not a leaf, every key pointer of
 * the old root is first logged as removed-while-freeing, so a rewind can
 * reconstruct the old root's content. Allocations happen before taking the
 * log lock; an allocation failure is only reported if logging turns out to
 * be required. Returns 0 on success or when no logging is needed, a negative
 * errno otherwise.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(eb: old_root) > 0) {
		nritems = btrfs_header_nritems(eb: old_root);
		tm_list = kcalloc(n: nritems, size: sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(eb: old_root, slot: i,
			    op: BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(size: sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	/* Root replace entries are keyed by the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(eb: old_root);
	tm->generation = btrfs_header_generation(eb: old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(objp: tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(objp: tm_list[i]);
		kfree(objp: tm_list);
	}
	kfree(objp: tm);

	return ret;
}
472 | |
/*
 * Search the log for an operation on the block at logical address @start with
 * a sequence number >= @min_seq. If @smallest is true, return the matching
 * element with the lowest sequence number, otherwise the one with the
 * highest. Returns NULL if no entry matches.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* Too old for this search, ignore. */
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on min_seq, cannot do better. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
514 | |
515 | /* |
516 | * This returns the element from the log with the smallest time sequence |
517 | * value that's in the log (the oldest log item). Any element with a time |
518 | * sequence lower than min_seq will be ignored. |
519 | */ |
520 | static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, |
521 | u64 start, u64 min_seq) |
522 | { |
523 | return __tree_mod_log_search(fs_info, start, min_seq, smallest: true); |
524 | } |
525 | |
526 | /* |
527 | * This returns the element from the log with the largest time sequence |
528 | * value that's in the log (the most recent log item). Any element with |
529 | * a time sequence lower than min_seq will be ignored. |
530 | */ |
531 | static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, |
532 | u64 start, u64 min_seq) |
533 | { |
534 | return __tree_mod_log_search(fs_info, start, min_seq, smallest: false); |
535 | } |
536 | |
/*
 * Log the copy of @nr_items key pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset). Each copied slot is
 * recorded as a remove from @src plus an add to @dst; additionally a move is
 * logged for the tail items of @dst (at/after @dst_offset) and for the tail
 * items of @src (after the copied range). Allocations happen before taking
 * the log lock; an allocation failure is only reported if logging turns out
 * to be required. Returns 0 on success or when no logging is needed, a
 * negative errno otherwise.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	/* Number of items of @dst sitting at or after the insertion point. */
	u32 dst_move_nr_items = btrfs_header_nritems(eb: dst) - dst_offset;
	/* Number of items of @src following the copied range. */
	u32 src_move_nr_items = btrfs_header_nritems(eb: src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are never logged. */
	if (btrfs_header_level(eb: dst) == 0 && btrfs_header_level(eb: src) == 0)
		return 0;

	/* One removal and one addition record per copied item. */
	tm_list = kcalloc(n: nr_items * 2, size: sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(eb: dst, dst_slot: dst_offset + nr_items,
						      src_slot: dst_offset, nr_items: dst_move_nr_items);
		if (IS_ERR(ptr: dst_move_tm)) {
			ret = PTR_ERR(ptr: dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(eb: src, dst_slot: src_offset,
						      src_slot: src_offset + nr_items,
						      nr_items: src_move_nr_items);
		if (IS_ERR(ptr: src_move_tm)) {
			ret = PTR_ERR(ptr: src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	/* First half of tm_list holds adds, second half removals. */
	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(eb: src, slot: i + src_offset,
						     op: BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(eb: dst, slot: i + dst_offset,
						     op: BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, tm: dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm: tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm: tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, tm: src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(objp: tm_list);

	return 0;

free_tms:
	/* Unlink anything that already made it into the rb tree, then free. */
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(objp: dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(objp: src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(objp: tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(objp: tm_list);

	return ret;
}
668 | |
/*
 * Log the freeing of node @eb by recording every one of its key pointers as
 * removed-while-freeing, so a rewind can reconstruct the node's content.
 * Allocations happen before taking the log lock; an allocation failure is
 * only reported if logging turns out to be required. Returns 0 on success or
 * when no logging is needed, a negative errno otherwise.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(fs_info: eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(n: nritems, size: sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, slot: i,
		    op: BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info: eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(fs_info: eb->fs_info, tm_list, nritems);
out_unlock:
	/* tree_mod_dont_log() returned false with the lock held. */
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(objp: tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(objp: tm_list[i]);
		kfree(objp: tm_list);
	}

	return ret;
}
729 | |
730 | /* |
731 | * Returns the logical address of the oldest predecessor of the given root. |
732 | * Entries older than time_seq are ignored. |
733 | */ |
734 | static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, |
735 | u64 time_seq) |
736 | { |
737 | struct tree_mod_elem *tm; |
738 | struct tree_mod_elem *found = NULL; |
739 | u64 root_logical = eb_root->start; |
740 | bool looped = false; |
741 | |
742 | if (!time_seq) |
743 | return NULL; |
744 | |
745 | /* |
746 | * The very last operation that's logged for a root is the replacement |
747 | * operation (if it is replaced at all). This has the logical address |
748 | * of the *new* root, making it the very first operation that's logged |
749 | * for this root. |
750 | */ |
751 | while (1) { |
752 | tm = tree_mod_log_search_oldest(fs_info: eb_root->fs_info, start: root_logical, |
753 | min_seq: time_seq); |
754 | if (!looped && !tm) |
755 | return NULL; |
756 | /* |
757 | * If there are no tree operation for the oldest root, we simply |
758 | * return it. This should only happen if that (old) root is at |
759 | * level 0. |
760 | */ |
761 | if (!tm) |
762 | break; |
763 | |
764 | /* |
765 | * If there's an operation that's not a root replacement, we |
766 | * found the oldest version of our root. Normally, we'll find a |
767 | * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. |
768 | */ |
769 | if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) |
770 | break; |
771 | |
772 | found = tm; |
773 | root_logical = tm->old_root.logical; |
774 | looped = true; |
775 | } |
776 | |
777 | /* If there's no old root to return, return what we found instead */ |
778 | if (!found) |
779 | found = tm; |
780 | |
781 | return found; |
782 | } |
783 | |
784 | |
785 | /* |
786 | * tm is a pointer to the first operation to rewind within eb. Then, all |
787 | * previous operations will be rewound (until we reach something older than |
788 | * time_seq). |
789 | */ |
790 | static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, |
791 | struct extent_buffer *eb, |
792 | u64 time_seq, |
793 | struct tree_mod_elem *first_tm) |
794 | { |
795 | u32 n; |
796 | struct rb_node *next; |
797 | struct tree_mod_elem *tm = first_tm; |
798 | unsigned long o_dst; |
799 | unsigned long o_src; |
800 | unsigned long p_size = sizeof(struct btrfs_key_ptr); |
801 | /* |
802 | * max_slot tracks the maximum valid slot of the rewind eb at every |
803 | * step of the rewind. This is in contrast with 'n' which eventually |
804 | * matches the number of items, but can be wrong during moves or if |
805 | * removes overlap on already valid slots (which is probably separately |
806 | * a bug). We do this to validate the offsets of memmoves for rewinding |
807 | * moves and detect invalid memmoves. |
808 | * |
809 | * Since a rewind eb can start empty, max_slot is a signed integer with |
810 | * a special meaning for -1, which is that no slot is valid to move out |
811 | * of. Any other negative value is invalid. |
812 | */ |
813 | int max_slot; |
814 | int move_src_end_slot; |
815 | int move_dst_end_slot; |
816 | |
817 | n = btrfs_header_nritems(eb); |
818 | max_slot = n - 1; |
819 | read_lock(&fs_info->tree_mod_log_lock); |
820 | while (tm && tm->seq >= time_seq) { |
821 | ASSERT(max_slot >= -1); |
822 | /* |
823 | * All the operations are recorded with the operator used for |
824 | * the modification. As we're going backwards, we do the |
825 | * opposite of each operation here. |
826 | */ |
827 | switch (tm->op) { |
828 | case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: |
829 | BUG_ON(tm->slot < n); |
830 | fallthrough; |
831 | case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: |
832 | case BTRFS_MOD_LOG_KEY_REMOVE: |
833 | btrfs_set_node_key(eb, disk_key: &tm->key, nr: tm->slot); |
834 | btrfs_set_node_blockptr(eb, nr: tm->slot, val: tm->blockptr); |
835 | btrfs_set_node_ptr_generation(eb, nr: tm->slot, |
836 | val: tm->generation); |
837 | n++; |
838 | if (tm->slot > max_slot) |
839 | max_slot = tm->slot; |
840 | break; |
841 | case BTRFS_MOD_LOG_KEY_REPLACE: |
842 | BUG_ON(tm->slot >= n); |
843 | btrfs_set_node_key(eb, disk_key: &tm->key, nr: tm->slot); |
844 | btrfs_set_node_blockptr(eb, nr: tm->slot, val: tm->blockptr); |
845 | btrfs_set_node_ptr_generation(eb, nr: tm->slot, |
846 | val: tm->generation); |
847 | break; |
848 | case BTRFS_MOD_LOG_KEY_ADD: |
849 | /* |
850 | * It is possible we could have already removed keys |
851 | * behind the known max slot, so this will be an |
852 | * overestimate. In practice, the copy operation |
853 | * inserts them in increasing order, and overestimating |
854 | * just means we miss some warnings, so it's OK. It |
855 | * isn't worth carefully tracking the full array of |
856 | * valid slots to check against when moving. |
857 | */ |
858 | if (tm->slot == max_slot) |
859 | max_slot--; |
860 | /* if a move operation is needed it's in the log */ |
861 | n--; |
862 | break; |
863 | case BTRFS_MOD_LOG_MOVE_KEYS: |
864 | ASSERT(tm->move.nr_items > 0); |
865 | move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; |
866 | move_dst_end_slot = tm->slot + tm->move.nr_items - 1; |
867 | o_dst = btrfs_node_key_ptr_offset(eb, nr: tm->slot); |
868 | o_src = btrfs_node_key_ptr_offset(eb, nr: tm->move.dst_slot); |
869 | if (WARN_ON(move_src_end_slot > max_slot || |
870 | tm->move.nr_items <= 0)) { |
871 | btrfs_warn(fs_info, |
872 | "move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d" , |
873 | eb->start, tm->slot, |
874 | tm->move.dst_slot, tm->move.nr_items, |
875 | tm->seq, n, max_slot); |
876 | } |
877 | memmove_extent_buffer(dst: eb, dst_offset: o_dst, src_offset: o_src, |
878 | len: tm->move.nr_items * p_size); |
879 | max_slot = move_dst_end_slot; |
880 | break; |
881 | case BTRFS_MOD_LOG_ROOT_REPLACE: |
882 | /* |
883 | * This operation is special. For roots, this must be |
884 | * handled explicitly before rewinding. |
885 | * For non-roots, this operation may exist if the node |
886 | * was a root: root A -> child B; then A gets empty and |
887 | * B is promoted to the new root. In the mod log, we'll |
888 | * have a root-replace operation for B, a tree block |
889 | * that is no root. We simply ignore that operation. |
890 | */ |
891 | break; |
892 | } |
893 | next = rb_next(&tm->node); |
894 | if (!next) |
895 | break; |
896 | tm = rb_entry(next, struct tree_mod_elem, node); |
897 | if (tm->logical != first_tm->logical) |
898 | break; |
899 | } |
900 | read_unlock(&fs_info->tree_mod_log_lock); |
901 | btrfs_set_header_nritems(eb, val: n); |
902 | } |
903 | |
904 | /* |
905 | * Called with eb read locked. If the buffer cannot be rewound, the same buffer |
906 | * is returned. If rewind operations happen, a fresh buffer is returned. The |
907 | * returned buffer is always read-locked. If the returned buffer is not the |
908 | * input buffer, the lock on the input buffer is released and the input buffer |
909 | * is freed (its refcount is decremented). |
910 | */ |
911 | struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, |
912 | struct btrfs_path *path, |
913 | struct extent_buffer *eb, |
914 | u64 time_seq) |
915 | { |
916 | struct extent_buffer *eb_rewin; |
917 | struct tree_mod_elem *tm; |
918 | |
919 | if (!time_seq) |
920 | return eb; |
921 | |
922 | if (btrfs_header_level(eb) == 0) |
923 | return eb; |
924 | |
925 | tm = tree_mod_log_search(fs_info, start: eb->start, min_seq: time_seq); |
926 | if (!tm) |
927 | return eb; |
928 | |
929 | if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { |
930 | BUG_ON(tm->slot != 0); |
931 | eb_rewin = alloc_dummy_extent_buffer(fs_info, start: eb->start); |
932 | if (!eb_rewin) { |
933 | btrfs_tree_read_unlock(eb); |
934 | free_extent_buffer(eb); |
935 | return NULL; |
936 | } |
937 | btrfs_set_header_bytenr(eb: eb_rewin, val: eb->start); |
938 | btrfs_set_header_backref_rev(eb: eb_rewin, |
939 | rev: btrfs_header_backref_rev(eb)); |
940 | btrfs_set_header_owner(eb: eb_rewin, val: btrfs_header_owner(eb)); |
941 | btrfs_set_header_level(eb: eb_rewin, val: btrfs_header_level(eb)); |
942 | } else { |
943 | eb_rewin = btrfs_clone_extent_buffer(src: eb); |
944 | if (!eb_rewin) { |
945 | btrfs_tree_read_unlock(eb); |
946 | free_extent_buffer(eb); |
947 | return NULL; |
948 | } |
949 | } |
950 | |
951 | btrfs_tree_read_unlock(eb); |
952 | free_extent_buffer(eb); |
953 | |
954 | btrfs_set_buffer_lockdep_class(objectid: btrfs_header_owner(eb: eb_rewin), |
955 | eb: eb_rewin, level: btrfs_header_level(eb: eb_rewin)); |
956 | btrfs_tree_read_lock(eb: eb_rewin); |
957 | tree_mod_log_rewind(fs_info, eb: eb_rewin, time_seq, first_tm: tm); |
958 | WARN_ON(btrfs_header_nritems(eb_rewin) > |
959 | BTRFS_NODEPTRS_PER_BLOCK(fs_info)); |
960 | |
961 | return eb_rewin; |
962 | } |
963 | |
964 | /* |
965 | * Rewind the state of @root's root node to the given @time_seq value. |
966 | * If there are no changes, the current root->root_node is returned. If anything |
967 | * changed in between, there's a fresh buffer allocated on which the rewind |
968 | * operations are done. In any case, the returned buffer is read locked. |
969 | * Returns NULL on error (with no locks held). |
970 | */ |
971 | struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) |
972 | { |
973 | struct btrfs_fs_info *fs_info = root->fs_info; |
974 | struct tree_mod_elem *tm; |
975 | struct extent_buffer *eb = NULL; |
976 | struct extent_buffer *eb_root; |
977 | u64 eb_root_owner = 0; |
978 | struct extent_buffer *old; |
979 | struct tree_mod_root *old_root = NULL; |
980 | u64 old_generation = 0; |
981 | u64 logical; |
982 | int level; |
983 | |
984 | eb_root = btrfs_read_lock_root_node(root); |
985 | tm = tree_mod_log_oldest_root(eb_root, time_seq); |
986 | if (!tm) |
987 | return eb_root; |
988 | |
989 | if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { |
990 | old_root = &tm->old_root; |
991 | old_generation = tm->generation; |
992 | logical = old_root->logical; |
993 | level = old_root->level; |
994 | } else { |
995 | logical = eb_root->start; |
996 | level = btrfs_header_level(eb: eb_root); |
997 | } |
998 | |
999 | tm = tree_mod_log_search(fs_info, start: logical, min_seq: time_seq); |
1000 | if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { |
1001 | struct btrfs_tree_parent_check check = { 0 }; |
1002 | |
1003 | btrfs_tree_read_unlock(eb: eb_root); |
1004 | free_extent_buffer(eb: eb_root); |
1005 | |
1006 | check.level = level; |
1007 | check.owner_root = root->root_key.objectid; |
1008 | |
1009 | old = read_tree_block(fs_info, bytenr: logical, check: &check); |
1010 | if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { |
1011 | if (!IS_ERR(ptr: old)) |
1012 | free_extent_buffer(eb: old); |
1013 | btrfs_warn(fs_info, |
1014 | "failed to read tree block %llu from get_old_root" , |
1015 | logical); |
1016 | } else { |
1017 | struct tree_mod_elem *tm2; |
1018 | |
1019 | btrfs_tree_read_lock(eb: old); |
1020 | eb = btrfs_clone_extent_buffer(src: old); |
1021 | /* |
1022 | * After the lookup for the most recent tree mod operation |
1023 | * above and before we locked and cloned the extent buffer |
1024 | * 'old', a new tree mod log operation may have been added. |
1025 | * So lookup for a more recent one to make sure the number |
1026 | * of mod log operations we replay is consistent with the |
1027 | * number of items we have in the cloned extent buffer, |
1028 | * otherwise we can hit a BUG_ON when rewinding the extent |
1029 | * buffer. |
1030 | */ |
1031 | tm2 = tree_mod_log_search(fs_info, start: logical, min_seq: time_seq); |
1032 | btrfs_tree_read_unlock(eb: old); |
1033 | free_extent_buffer(eb: old); |
1034 | ASSERT(tm2); |
1035 | ASSERT(tm2 == tm || tm2->seq > tm->seq); |
1036 | if (!tm2 || tm2->seq < tm->seq) { |
1037 | free_extent_buffer(eb); |
1038 | return NULL; |
1039 | } |
1040 | tm = tm2; |
1041 | } |
1042 | } else if (old_root) { |
1043 | eb_root_owner = btrfs_header_owner(eb: eb_root); |
1044 | btrfs_tree_read_unlock(eb: eb_root); |
1045 | free_extent_buffer(eb: eb_root); |
1046 | eb = alloc_dummy_extent_buffer(fs_info, start: logical); |
1047 | } else { |
1048 | eb = btrfs_clone_extent_buffer(src: eb_root); |
1049 | btrfs_tree_read_unlock(eb: eb_root); |
1050 | free_extent_buffer(eb: eb_root); |
1051 | } |
1052 | |
1053 | if (!eb) |
1054 | return NULL; |
1055 | if (old_root) { |
1056 | btrfs_set_header_bytenr(eb, val: eb->start); |
1057 | btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); |
1058 | btrfs_set_header_owner(eb, val: eb_root_owner); |
1059 | btrfs_set_header_level(eb, val: old_root->level); |
1060 | btrfs_set_header_generation(eb, val: old_generation); |
1061 | } |
1062 | btrfs_set_buffer_lockdep_class(objectid: btrfs_header_owner(eb), eb, |
1063 | level: btrfs_header_level(eb)); |
1064 | btrfs_tree_read_lock(eb); |
1065 | if (tm) |
1066 | tree_mod_log_rewind(fs_info, eb, time_seq, first_tm: tm); |
1067 | else |
1068 | WARN_ON(btrfs_header_level(eb) != 0); |
1069 | WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); |
1070 | |
1071 | return eb; |
1072 | } |
1073 | |
1074 | int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) |
1075 | { |
1076 | struct tree_mod_elem *tm; |
1077 | int level; |
1078 | struct extent_buffer *eb_root = btrfs_root_node(root); |
1079 | |
1080 | tm = tree_mod_log_oldest_root(eb_root, time_seq); |
1081 | if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) |
1082 | level = tm->old_root.level; |
1083 | else |
1084 | level = btrfs_header_level(eb: eb_root); |
1085 | |
1086 | free_extent_buffer(eb: eb_root); |
1087 | |
1088 | return level; |
1089 | } |
1090 | |
1091 | /* |
1092 | * Return the lowest sequence number in the tree modification log. |
1093 | * |
1094 | * Return the sequence number of the oldest tree modification log user, which |
1095 | * corresponds to the lowest sequence number of all existing users. If there are |
1096 | * no users it returns 0. |
1097 | */ |
1098 | u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) |
1099 | { |
1100 | u64 ret = 0; |
1101 | |
1102 | read_lock(&fs_info->tree_mod_log_lock); |
1103 | if (!list_empty(head: &fs_info->tree_mod_seq_list)) { |
1104 | struct btrfs_seq_list *elem; |
1105 | |
1106 | elem = list_first_entry(&fs_info->tree_mod_seq_list, |
1107 | struct btrfs_seq_list, list); |
1108 | ret = elem->seq; |
1109 | } |
1110 | read_unlock(&fs_info->tree_mod_log_lock); |
1111 | |
1112 | return ret; |
1113 | } |
1114 | |