1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "bkey_buf.h" |
5 | #include "btree_key_cache.h" |
6 | #include "btree_update.h" |
7 | #include "buckets.h" |
8 | #include "errcode.h" |
9 | #include "error.h" |
10 | #include "fs.h" |
11 | #include "recovery_passes.h" |
12 | #include "snapshot.h" |
13 | |
14 | #include <linux/random.h> |
15 | |
16 | /* |
17 | * Snapshot trees: |
18 | * |
19 | * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they |
20 | * exist to provide a stable identifier for the whole lifetime of a snapshot |
21 | * tree. |
22 | */ |
23 | |
24 | void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, |
25 | struct bkey_s_c k) |
26 | { |
27 | struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); |
28 | |
29 | prt_printf(out, "subvol %u root snapshot %u" , |
30 | le32_to_cpu(t.v->master_subvol), |
31 | le32_to_cpu(t.v->root_snapshot)); |
32 | } |
33 | |
/*
 * Validate a snapshot_tree key: the only constraint checked is that the key's
 * position is a nonzero 32-bit id (inode 0, offset in [1, U32_MAX]).
 * Returns 0 if valid; bkey_fsck_err_on() jumps to fsck_err on failure.
 */
int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
			       enum bkey_invalid_flags flags,
			       struct printbuf *err)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
			 bkey_lt(k.k->p, POS(0, 1)), c, err,
			 snapshot_tree_pos_bad,
			 "bad pos" );
fsck_err:
	return ret;
}
47 | |
48 | int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, |
49 | struct bch_snapshot_tree *s) |
50 | { |
51 | int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), |
52 | BTREE_ITER_WITH_UPDATES, snapshot_tree, s); |
53 | |
54 | if (bch2_err_matches(ret, ENOENT)) |
55 | ret = -BCH_ERR_ENOENT_snapshot_tree; |
56 | return ret; |
57 | } |
58 | |
59 | struct bkey_i_snapshot_tree * |
60 | __bch2_snapshot_tree_create(struct btree_trans *trans) |
61 | { |
62 | struct btree_iter iter; |
63 | int ret = bch2_bkey_get_empty_slot(trans, &iter, |
64 | BTREE_ID_snapshot_trees, POS(0, U32_MAX)); |
65 | struct bkey_i_snapshot_tree *s_t; |
66 | |
67 | if (ret == -BCH_ERR_ENOSPC_btree_slot) |
68 | ret = -BCH_ERR_ENOSPC_snapshot_tree; |
69 | if (ret) |
70 | return ERR_PTR(error: ret); |
71 | |
72 | s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); |
73 | ret = PTR_ERR_OR_ZERO(ptr: s_t); |
74 | bch2_trans_iter_exit(trans, &iter); |
75 | return ret ? ERR_PTR(error: ret) : s_t; |
76 | } |
77 | |
78 | static int bch2_snapshot_tree_create(struct btree_trans *trans, |
79 | u32 root_id, u32 subvol_id, u32 *tree_id) |
80 | { |
81 | struct bkey_i_snapshot_tree *n_tree = |
82 | __bch2_snapshot_tree_create(trans); |
83 | |
84 | if (IS_ERR(ptr: n_tree)) |
85 | return PTR_ERR(ptr: n_tree); |
86 | |
87 | n_tree->v.master_subvol = cpu_to_le32(subvol_id); |
88 | n_tree->v.root_snapshot = cpu_to_le32(root_id); |
89 | *tree_id = n_tree->k.p.offset; |
90 | return 0; |
91 | } |
92 | |
93 | /* Snapshot nodes: */ |
94 | |
95 | static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor) |
96 | { |
97 | while (id && id < ancestor) { |
98 | const struct snapshot_t *s = __snapshot_t(t, id); |
99 | id = s ? s->parent : 0; |
100 | } |
101 | return id == ancestor; |
102 | } |
103 | |
104 | static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) |
105 | { |
106 | rcu_read_lock(); |
107 | bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); |
108 | rcu_read_unlock(); |
109 | |
110 | return ret; |
111 | } |
112 | |
113 | static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) |
114 | { |
115 | const struct snapshot_t *s = __snapshot_t(t, id); |
116 | if (!s) |
117 | return 0; |
118 | |
119 | if (s->skip[2] <= ancestor) |
120 | return s->skip[2]; |
121 | if (s->skip[1] <= ancestor) |
122 | return s->skip[1]; |
123 | if (s->skip[0] <= ancestor) |
124 | return s->skip[0]; |
125 | return s->parent; |
126 | } |
127 | |
128 | static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) |
129 | { |
130 | const struct snapshot_t *s = __snapshot_t(t, id); |
131 | if (!s) |
132 | return false; |
133 | |
134 | return test_bit(ancestor - id - 1, s->is_ancestor); |
135 | } |
136 | |
/*
 * Returns true if @ancestor is an ancestor of @id (or equal to it).
 *
 * Before the check_snapshots recovery pass has run, the skiplist and
 * is_ancestor bitmap haven't been verified, so fall back to walking parent
 * pointers one at a time.
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	bool ret;

	rcu_read_lock();
	struct snapshot_table *t = rcu_dereference(c->snapshots);

	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
		goto out;
	}

	/* Use skiplists to get within IS_ANCESTOR_BITMAP of @ancestor: */
	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	/* Final IS_ANCESTOR_BITMAP-sized window is answered by the bitmap: */
	ret = id && id < ancestor
		? test_ancestor_bitmap(t, id, ancestor)
		: id == ancestor;

	/* Debug builds: cross-check the fast path against the slow walk: */
	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
	rcu_read_unlock();

	return ret;
}
162 | |
163 | static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) |
164 | { |
165 | size_t idx = U32_MAX - id; |
166 | struct snapshot_table *new, *old; |
167 | |
168 | size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1)); |
169 | size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]); |
170 | |
171 | new = kvzalloc(size: new_bytes, GFP_KERNEL); |
172 | if (!new) |
173 | return NULL; |
174 | |
175 | new->nr = new_size; |
176 | |
177 | old = rcu_dereference_protected(c->snapshots, true); |
178 | if (old) |
179 | memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr); |
180 | |
181 | rcu_assign_pointer(c->snapshots, new); |
182 | kvfree_rcu(old, rcu); |
183 | |
184 | return &rcu_dereference_protected(c->snapshots, |
185 | lockdep_is_held(&c->snapshot_table_lock))->s[idx]; |
186 | } |
187 | |
/*
 * Return a mutable pointer to the in-memory snapshot table entry for @id,
 * growing the table if @id isn't covered yet. Must be called with
 * c->snapshot_table_lock held; may return NULL if the table couldn't be
 * reallocated.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;
	struct snapshot_table *table =
		rcu_dereference_protected(c->snapshots,
				lockdep_is_held(&c->snapshot_table_lock));

	lockdep_assert_held(&c->snapshot_table_lock);

	/* Fast path: table already big enough: */
	if (likely(table && idx < table->nr))
		return &table->s[idx];

	return __snapshot_t_mut(c, id);
}
202 | |
203 | void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, |
204 | struct bkey_s_c k) |
205 | { |
206 | struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); |
207 | |
208 | prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u" , |
209 | BCH_SNAPSHOT_SUBVOL(s.v), |
210 | BCH_SNAPSHOT_DELETED(s.v), |
211 | le32_to_cpu(s.v->parent), |
212 | le32_to_cpu(s.v->children[0]), |
213 | le32_to_cpu(s.v->children[1]), |
214 | le32_to_cpu(s.v->subvol), |
215 | le32_to_cpu(s.v->tree)); |
216 | |
217 | if (bkey_val_bytes(k: k.k) > offsetof(struct bch_snapshot, depth)) |
218 | prt_printf(out, " depth %u skiplist %u %u %u" , |
219 | le32_to_cpu(s.v->depth), |
220 | le32_to_cpu(s.v->skip[0]), |
221 | le32_to_cpu(s.v->skip[1]), |
222 | le32_to_cpu(s.v->skip[2])); |
223 | } |
224 | |
225 | int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, |
226 | enum bkey_invalid_flags flags, |
227 | struct printbuf *err) |
228 | { |
229 | struct bkey_s_c_snapshot s; |
230 | u32 i, id; |
231 | int ret = 0; |
232 | |
233 | bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || |
234 | bkey_lt(k.k->p, POS(0, 1)), c, err, |
235 | snapshot_pos_bad, |
236 | "bad pos" ); |
237 | |
238 | s = bkey_s_c_to_snapshot(k); |
239 | |
240 | id = le32_to_cpu(s.v->parent); |
241 | bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, |
242 | snapshot_parent_bad, |
243 | "bad parent node (%u <= %llu)" , |
244 | id, k.k->p.offset); |
245 | |
246 | bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, |
247 | snapshot_children_not_normalized, |
248 | "children not normalized" ); |
249 | |
250 | bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, |
251 | snapshot_child_duplicate, |
252 | "duplicate child nodes" ); |
253 | |
254 | for (i = 0; i < 2; i++) { |
255 | id = le32_to_cpu(s.v->children[i]); |
256 | |
257 | bkey_fsck_err_on(id >= k.k->p.offset, c, err, |
258 | snapshot_child_bad, |
259 | "bad child node (%u >= %llu)" , |
260 | id, k.k->p.offset); |
261 | } |
262 | |
263 | if (bkey_val_bytes(k: k.k) > offsetof(struct bch_snapshot, skip)) { |
264 | bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || |
265 | le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, |
266 | snapshot_skiplist_not_normalized, |
267 | "skiplist not normalized" ); |
268 | |
269 | for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { |
270 | id = le32_to_cpu(s.v->skip[i]); |
271 | |
272 | bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, |
273 | snapshot_skiplist_bad, |
274 | "bad skiplist node %u" , id); |
275 | } |
276 | } |
277 | fsck_err: |
278 | return ret; |
279 | } |
280 | |
281 | static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) |
282 | { |
283 | struct snapshot_t *t = snapshot_t_mut(c, id); |
284 | u32 parent = id; |
285 | |
286 | while ((parent = bch2_snapshot_parent_early(c, id: parent)) && |
287 | parent - id - 1 < IS_ANCESTOR_BITMAP) |
288 | __set_bit(parent - id - 1, t->is_ancestor); |
289 | } |
290 | |
291 | static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) |
292 | { |
293 | mutex_lock(&c->snapshot_table_lock); |
294 | __set_is_ancestor_bitmap(c, id); |
295 | mutex_unlock(lock: &c->snapshot_table_lock); |
296 | } |
297 | |
298 | static int __bch2_mark_snapshot(struct btree_trans *trans, |
299 | enum btree_id btree, unsigned level, |
300 | struct bkey_s_c old, struct bkey_s_c new, |
301 | unsigned flags) |
302 | { |
303 | struct bch_fs *c = trans->c; |
304 | struct snapshot_t *t; |
305 | u32 id = new.k->p.offset; |
306 | int ret = 0; |
307 | |
308 | mutex_lock(&c->snapshot_table_lock); |
309 | |
310 | t = snapshot_t_mut(c, id); |
311 | if (!t) { |
312 | ret = -BCH_ERR_ENOMEM_mark_snapshot; |
313 | goto err; |
314 | } |
315 | |
316 | if (new.k->type == KEY_TYPE_snapshot) { |
317 | struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k: new); |
318 | |
319 | t->parent = le32_to_cpu(s.v->parent); |
320 | t->children[0] = le32_to_cpu(s.v->children[0]); |
321 | t->children[1] = le32_to_cpu(s.v->children[1]); |
322 | t->subvol = BCH_SNAPSHOT_SUBVOL(k: s.v) ? le32_to_cpu(s.v->subvol) : 0; |
323 | t->tree = le32_to_cpu(s.v->tree); |
324 | |
325 | if (bkey_val_bytes(k: s.k) > offsetof(struct bch_snapshot, depth)) { |
326 | t->depth = le32_to_cpu(s.v->depth); |
327 | t->skip[0] = le32_to_cpu(s.v->skip[0]); |
328 | t->skip[1] = le32_to_cpu(s.v->skip[1]); |
329 | t->skip[2] = le32_to_cpu(s.v->skip[2]); |
330 | } else { |
331 | t->depth = 0; |
332 | t->skip[0] = 0; |
333 | t->skip[1] = 0; |
334 | t->skip[2] = 0; |
335 | } |
336 | |
337 | __set_is_ancestor_bitmap(c, id); |
338 | |
339 | if (BCH_SNAPSHOT_DELETED(k: s.v)) { |
340 | set_bit(nr: BCH_FS_need_delete_dead_snapshots, addr: &c->flags); |
341 | if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) |
342 | bch2_delete_dead_snapshots_async(c); |
343 | } |
344 | } else { |
345 | memset(t, 0, sizeof(*t)); |
346 | } |
347 | err: |
348 | mutex_unlock(lock: &c->snapshot_table_lock); |
349 | return ret; |
350 | } |
351 | |
352 | int bch2_mark_snapshot(struct btree_trans *trans, |
353 | enum btree_id btree, unsigned level, |
354 | struct bkey_s_c old, struct bkey_s new, |
355 | unsigned flags) |
356 | { |
357 | return __bch2_mark_snapshot(trans, btree, level, old, new: new.s_c, flags); |
358 | } |
359 | |
/*
 * Look up the on-disk snapshot key for @id, filling in *@s; errors from the
 * btree lookup (including ENOENT-class errors) are returned unchanged.
 */
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
			 struct bch_snapshot *s)
{
	return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
				       BTREE_ITER_WITH_UPDATES, snapshot, s);
}
366 | |
367 | static int bch2_snapshot_live(struct btree_trans *trans, u32 id) |
368 | { |
369 | struct bch_snapshot v; |
370 | int ret; |
371 | |
372 | if (!id) |
373 | return 0; |
374 | |
375 | ret = bch2_snapshot_lookup(trans, id, s: &v); |
376 | if (bch2_err_matches(ret, ENOENT)) |
377 | bch_err(trans->c, "snapshot node %u not found" , id); |
378 | if (ret) |
379 | return ret; |
380 | |
381 | return !BCH_SNAPSHOT_DELETED(k: &v); |
382 | } |
383 | |
384 | /* |
385 | * If @k is a snapshot with just one live child, it's part of a linear chain, |
386 | * which we consider to be an equivalence class: and then after snapshot |
387 | * deletion cleanup, there should only be a single key at a given position in |
388 | * this equivalence class. |
389 | * |
390 | * This sets the equivalence class of @k to be the child's equivalence class, if |
391 | * it's part of such a linear chain: this correctly sets equivalence classes on |
392 | * startup if we run leaf to root (i.e. in natural key order). |
393 | */ |
394 | static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) |
395 | { |
396 | struct bch_fs *c = trans->c; |
397 | unsigned i, nr_live = 0, live_idx = 0; |
398 | struct bkey_s_c_snapshot snap; |
399 | u32 id = k.k->p.offset, child[2]; |
400 | |
401 | if (k.k->type != KEY_TYPE_snapshot) |
402 | return 0; |
403 | |
404 | snap = bkey_s_c_to_snapshot(k); |
405 | |
406 | child[0] = le32_to_cpu(snap.v->children[0]); |
407 | child[1] = le32_to_cpu(snap.v->children[1]); |
408 | |
409 | for (i = 0; i < 2; i++) { |
410 | int ret = bch2_snapshot_live(trans, id: child[i]); |
411 | |
412 | if (ret < 0) |
413 | return ret; |
414 | |
415 | if (ret) |
416 | live_idx = i; |
417 | nr_live += ret; |
418 | } |
419 | |
420 | mutex_lock(&c->snapshot_table_lock); |
421 | |
422 | snapshot_t_mut(c, id)->equiv = nr_live == 1 |
423 | ? snapshot_t_mut(c, id: child[live_idx])->equiv |
424 | : id; |
425 | |
426 | mutex_unlock(lock: &c->snapshot_table_lock); |
427 | |
428 | return 0; |
429 | } |
430 | |
431 | /* fsck: */ |
432 | |
/* Read child pointer @child (0 = left, 1 = right) from the in-memory table. */
static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
{
	return snapshot_t(c, id)->children[child];
}
437 | |
438 | static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) |
439 | { |
440 | return bch2_snapshot_child(c, id, child: 0); |
441 | } |
442 | |
443 | static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) |
444 | { |
445 | return bch2_snapshot_child(c, id, child: 1); |
446 | } |
447 | |
448 | static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) |
449 | { |
450 | u32 n, parent; |
451 | |
452 | n = bch2_snapshot_left_child(c, id); |
453 | if (n) |
454 | return n; |
455 | |
456 | while ((parent = bch2_snapshot_parent(c, id))) { |
457 | n = bch2_snapshot_right_child(c, id: parent); |
458 | if (n && n != id) |
459 | return n; |
460 | id = parent; |
461 | } |
462 | |
463 | return 0; |
464 | } |
465 | |
466 | static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) |
467 | { |
468 | u32 id = snapshot_root; |
469 | u32 subvol = 0, s; |
470 | |
471 | while (id) { |
472 | s = snapshot_t(c, id)->subvol; |
473 | |
474 | if (s && (!subvol || s < subvol)) |
475 | subvol = s; |
476 | |
477 | id = bch2_snapshot_tree_next(c, id); |
478 | } |
479 | |
480 | return subvol; |
481 | } |
482 | |
483 | static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, |
484 | u32 snapshot_root, u32 *subvol_id) |
485 | { |
486 | struct bch_fs *c = trans->c; |
487 | struct btree_iter iter; |
488 | struct bkey_s_c k; |
489 | bool found = false; |
490 | int ret; |
491 | |
492 | for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, |
493 | 0, k, ret) { |
494 | if (k.k->type != KEY_TYPE_subvolume) |
495 | continue; |
496 | |
497 | struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); |
498 | if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), ancestor: snapshot_root)) |
499 | continue; |
500 | if (!BCH_SUBVOLUME_SNAP(k: s.v)) { |
501 | *subvol_id = s.k->p.offset; |
502 | found = true; |
503 | break; |
504 | } |
505 | } |
506 | |
507 | bch2_trans_iter_exit(trans, &iter); |
508 | |
509 | if (!ret && !found) { |
510 | struct bkey_i_subvolume *u; |
511 | |
512 | *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); |
513 | |
514 | u = bch2_bkey_get_mut_typed(trans, &iter, |
515 | BTREE_ID_subvolumes, POS(0, *subvol_id), |
516 | 0, subvolume); |
517 | ret = PTR_ERR_OR_ZERO(ptr: u); |
518 | if (ret) |
519 | return ret; |
520 | |
521 | SET_BCH_SUBVOLUME_SNAP(k: &u->v, v: false); |
522 | } |
523 | |
524 | return ret; |
525 | } |
526 | |
527 | static int check_snapshot_tree(struct btree_trans *trans, |
528 | struct btree_iter *iter, |
529 | struct bkey_s_c k) |
530 | { |
531 | struct bch_fs *c = trans->c; |
532 | struct bkey_s_c_snapshot_tree st; |
533 | struct bch_snapshot s; |
534 | struct bch_subvolume subvol; |
535 | struct printbuf buf = PRINTBUF; |
536 | u32 root_id; |
537 | int ret; |
538 | |
539 | if (k.k->type != KEY_TYPE_snapshot_tree) |
540 | return 0; |
541 | |
542 | st = bkey_s_c_to_snapshot_tree(k); |
543 | root_id = le32_to_cpu(st.v->root_snapshot); |
544 | |
545 | ret = bch2_snapshot_lookup(trans, id: root_id, s: &s); |
546 | if (ret && !bch2_err_matches(ret, ENOENT)) |
547 | goto err; |
548 | |
549 | if (fsck_err_on(ret || |
550 | root_id != bch2_snapshot_root(c, root_id) || |
551 | st.k->p.offset != le32_to_cpu(s.tree), |
552 | c, snapshot_tree_to_missing_snapshot, |
553 | "snapshot tree points to missing/incorrect snapshot:\n %s" , |
554 | (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { |
555 | ret = bch2_btree_delete_at(trans, iter, 0); |
556 | goto err; |
557 | } |
558 | |
559 | ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), |
560 | false, 0, &subvol); |
561 | if (ret && !bch2_err_matches(ret, ENOENT)) |
562 | goto err; |
563 | |
564 | if (fsck_err_on(ret, |
565 | c, snapshot_tree_to_missing_subvol, |
566 | "snapshot tree points to missing subvolume:\n %s" , |
567 | (printbuf_reset(&buf), |
568 | bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || |
569 | fsck_err_on(!bch2_snapshot_is_ancestor(c, |
570 | le32_to_cpu(subvol.snapshot), |
571 | root_id), |
572 | c, snapshot_tree_to_wrong_subvol, |
573 | "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s" , |
574 | (printbuf_reset(&buf), |
575 | bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || |
576 | fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), |
577 | c, snapshot_tree_to_snapshot_subvol, |
578 | "snapshot tree points to snapshot subvolume:\n %s" , |
579 | (printbuf_reset(&buf), |
580 | bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { |
581 | struct bkey_i_snapshot_tree *u; |
582 | u32 subvol_id; |
583 | |
584 | ret = bch2_snapshot_tree_master_subvol(trans, snapshot_root: root_id, subvol_id: &subvol_id); |
585 | bch_err_fn(c, ret); |
586 | |
587 | if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ |
588 | ret = 0; |
589 | goto err; |
590 | } |
591 | |
592 | if (ret) |
593 | goto err; |
594 | |
595 | u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); |
596 | ret = PTR_ERR_OR_ZERO(ptr: u); |
597 | if (ret) |
598 | goto err; |
599 | |
600 | u->v.master_subvol = cpu_to_le32(subvol_id); |
601 | st = snapshot_tree_i_to_s_c(k: u); |
602 | } |
603 | err: |
604 | fsck_err: |
605 | printbuf_exit(&buf); |
606 | return ret; |
607 | } |
608 | |
609 | /* |
610 | * For each snapshot_tree, make sure it points to the root of a snapshot tree |
611 | * and that snapshot entry points back to it, or delete it. |
612 | * |
613 | * And, make sure it points to a subvolume within that snapshot tree, or correct |
614 | * it to point to the oldest subvolume within that snapshot tree. |
615 | */ |
/* fsck entry point: run check_snapshot_tree() on every snapshot_tree key. */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot_tree(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
627 | |
628 | /* |
629 | * Look up snapshot tree for @tree_id and find root, |
630 | * make sure @snap_id is a descendent: |
631 | */ |
632 | static int snapshot_tree_ptr_good(struct btree_trans *trans, |
633 | u32 snap_id, u32 tree_id) |
634 | { |
635 | struct bch_snapshot_tree s_t; |
636 | int ret = bch2_snapshot_tree_lookup(trans, id: tree_id, s: &s_t); |
637 | |
638 | if (bch2_err_matches(ret, ENOENT)) |
639 | return 0; |
640 | if (ret) |
641 | return ret; |
642 | |
643 | return bch2_snapshot_is_ancestor_early(c: trans->c, id: snap_id, le32_to_cpu(s_t.root_snapshot)); |
644 | } |
645 | |
646 | u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) |
647 | { |
648 | const struct snapshot_t *s; |
649 | |
650 | if (!id) |
651 | return 0; |
652 | |
653 | rcu_read_lock(); |
654 | s = snapshot_t(c, id); |
655 | if (s->parent) |
656 | id = bch2_snapshot_nth_parent(c, id, n: get_random_u32_below(ceil: s->depth)); |
657 | rcu_read_unlock(); |
658 | |
659 | return id; |
660 | } |
661 | |
662 | static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) |
663 | { |
664 | unsigned i; |
665 | |
666 | for (i = 0; i < 3; i++) |
667 | if (!s.parent) { |
668 | if (s.skip[i]) |
669 | return false; |
670 | } else { |
671 | if (!bch2_snapshot_is_ancestor_early(c: trans->c, id, le32_to_cpu(s.skip[i]))) |
672 | return false; |
673 | } |
674 | |
675 | return true; |
676 | } |
677 | |
678 | /* |
679 | * snapshot_tree pointer was incorrect: look up root snapshot node, make sure |
680 | * its snapshot_tree pointer is correct (allocate new one if necessary), then |
681 | * update this node's pointer to root node's pointer: |
682 | */ |
683 | static int snapshot_tree_ptr_repair(struct btree_trans *trans, |
684 | struct btree_iter *iter, |
685 | struct bkey_s_c k, |
686 | struct bch_snapshot *s) |
687 | { |
688 | struct bch_fs *c = trans->c; |
689 | struct btree_iter root_iter; |
690 | struct bch_snapshot_tree s_t; |
691 | struct bkey_s_c_snapshot root; |
692 | struct bkey_i_snapshot *u; |
693 | u32 root_id = bch2_snapshot_root(c, id: k.k->p.offset), tree_id; |
694 | int ret; |
695 | |
696 | root = bch2_bkey_get_iter_typed(trans, &root_iter, |
697 | BTREE_ID_snapshots, POS(0, root_id), |
698 | BTREE_ITER_WITH_UPDATES, snapshot); |
699 | ret = bkey_err(root); |
700 | if (ret) |
701 | goto err; |
702 | |
703 | tree_id = le32_to_cpu(root.v->tree); |
704 | |
705 | ret = bch2_snapshot_tree_lookup(trans, id: tree_id, s: &s_t); |
706 | if (ret && !bch2_err_matches(ret, ENOENT)) |
707 | return ret; |
708 | |
709 | if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { |
710 | u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); |
711 | ret = PTR_ERR_OR_ZERO(ptr: u) ?: |
712 | bch2_snapshot_tree_create(trans, root_id, |
713 | subvol_id: bch2_snapshot_tree_oldest_subvol(c, snapshot_root: root_id), |
714 | tree_id: &tree_id); |
715 | if (ret) |
716 | goto err; |
717 | |
718 | u->v.tree = cpu_to_le32(tree_id); |
719 | if (k.k->p.offset == root_id) |
720 | *s = u->v; |
721 | } |
722 | |
723 | if (k.k->p.offset != root_id) { |
724 | u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); |
725 | ret = PTR_ERR_OR_ZERO(ptr: u); |
726 | if (ret) |
727 | goto err; |
728 | |
729 | u->v.tree = cpu_to_le32(tree_id); |
730 | *s = u->v; |
731 | } |
732 | err: |
733 | bch2_trans_iter_exit(trans, &root_iter); |
734 | return ret; |
735 | } |
736 | |
737 | static int check_snapshot(struct btree_trans *trans, |
738 | struct btree_iter *iter, |
739 | struct bkey_s_c k) |
740 | { |
741 | struct bch_fs *c = trans->c; |
742 | struct bch_snapshot s; |
743 | struct bch_subvolume subvol; |
744 | struct bch_snapshot v; |
745 | struct bkey_i_snapshot *u; |
746 | u32 parent_id = bch2_snapshot_parent_early(c, id: k.k->p.offset); |
747 | u32 real_depth; |
748 | struct printbuf buf = PRINTBUF; |
749 | u32 i, id; |
750 | int ret = 0; |
751 | |
752 | if (k.k->type != KEY_TYPE_snapshot) |
753 | return 0; |
754 | |
755 | memset(&s, 0, sizeof(s)); |
756 | memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); |
757 | |
758 | id = le32_to_cpu(s.parent); |
759 | if (id) { |
760 | ret = bch2_snapshot_lookup(trans, id, s: &v); |
761 | if (bch2_err_matches(ret, ENOENT)) |
762 | bch_err(c, "snapshot with nonexistent parent:\n %s" , |
763 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); |
764 | if (ret) |
765 | goto err; |
766 | |
767 | if (le32_to_cpu(v.children[0]) != k.k->p.offset && |
768 | le32_to_cpu(v.children[1]) != k.k->p.offset) { |
769 | bch_err(c, "snapshot parent %u missing pointer to child %llu" , |
770 | id, k.k->p.offset); |
771 | ret = -EINVAL; |
772 | goto err; |
773 | } |
774 | } |
775 | |
776 | for (i = 0; i < 2 && s.children[i]; i++) { |
777 | id = le32_to_cpu(s.children[i]); |
778 | |
779 | ret = bch2_snapshot_lookup(trans, id, s: &v); |
780 | if (bch2_err_matches(ret, ENOENT)) |
781 | bch_err(c, "snapshot node %llu has nonexistent child %u" , |
782 | k.k->p.offset, id); |
783 | if (ret) |
784 | goto err; |
785 | |
786 | if (le32_to_cpu(v.parent) != k.k->p.offset) { |
787 | bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)" , |
788 | id, le32_to_cpu(v.parent), k.k->p.offset); |
789 | ret = -EINVAL; |
790 | goto err; |
791 | } |
792 | } |
793 | |
794 | bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(k: &s) && |
795 | !BCH_SNAPSHOT_DELETED(k: &s); |
796 | |
797 | if (should_have_subvol) { |
798 | id = le32_to_cpu(s.subvol); |
799 | ret = bch2_subvolume_get(trans, id, 0, false, &subvol); |
800 | if (bch2_err_matches(ret, ENOENT)) |
801 | bch_err(c, "snapshot points to nonexistent subvolume:\n %s" , |
802 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); |
803 | if (ret) |
804 | goto err; |
805 | |
806 | if (BCH_SNAPSHOT_SUBVOL(k: &s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { |
807 | bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL" , |
808 | k.k->p.offset); |
809 | ret = -EINVAL; |
810 | goto err; |
811 | } |
812 | } else { |
813 | if (fsck_err_on(s.subvol, |
814 | c, snapshot_should_not_have_subvol, |
815 | "snapshot should not point to subvol:\n %s" , |
816 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { |
817 | u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); |
818 | ret = PTR_ERR_OR_ZERO(ptr: u); |
819 | if (ret) |
820 | goto err; |
821 | |
822 | u->v.subvol = 0; |
823 | s = u->v; |
824 | } |
825 | } |
826 | |
827 | ret = snapshot_tree_ptr_good(trans, snap_id: k.k->p.offset, le32_to_cpu(s.tree)); |
828 | if (ret < 0) |
829 | goto err; |
830 | |
831 | if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, |
832 | "snapshot points to missing/incorrect tree:\n %s" , |
833 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { |
834 | ret = snapshot_tree_ptr_repair(trans, iter, k, s: &s); |
835 | if (ret) |
836 | goto err; |
837 | } |
838 | ret = 0; |
839 | |
840 | real_depth = bch2_snapshot_depth(c, parent: parent_id); |
841 | |
842 | if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, |
843 | c, snapshot_bad_depth, |
844 | "snapshot with incorrect depth field, should be %u:\n %s" , |
845 | real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { |
846 | u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); |
847 | ret = PTR_ERR_OR_ZERO(ptr: u); |
848 | if (ret) |
849 | goto err; |
850 | |
851 | u->v.depth = cpu_to_le32(real_depth); |
852 | s = u->v; |
853 | } |
854 | |
855 | ret = snapshot_skiplist_good(trans, id: k.k->p.offset, s); |
856 | if (ret < 0) |
857 | goto err; |
858 | |
859 | if (fsck_err_on(!ret, c, snapshot_bad_skiplist, |
860 | "snapshot with bad skiplist field:\n %s" , |
861 | (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { |
862 | u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); |
863 | ret = PTR_ERR_OR_ZERO(ptr: u); |
864 | if (ret) |
865 | goto err; |
866 | |
867 | for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) |
868 | u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); |
869 | |
870 | bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); |
871 | s = u->v; |
872 | } |
873 | ret = 0; |
874 | err: |
875 | fsck_err: |
876 | printbuf_exit(&buf); |
877 | return ret; |
878 | } |
879 | |
/* fsck entry point: run check_snapshot() on every snapshot key. */
int bch2_check_snapshots(struct bch_fs *c)
{
	/*
	 * We iterate backwards as checking/fixing the depth field requires that
	 * the parent's depth already be correct:
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key_reverse_commit(trans, iter,
			BTREE_ID_snapshots, POS_MAX,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		check_snapshot(trans, &iter, k)));
	bch_err_fn(c, ret);
	return ret;
}
895 | |
896 | static int check_snapshot_exists(struct btree_trans *trans, u32 id) |
897 | { |
898 | struct bch_fs *c = trans->c; |
899 | |
900 | if (bch2_snapshot_equiv(c, id)) |
901 | return 0; |
902 | |
903 | u32 tree_id; |
904 | int ret = bch2_snapshot_tree_create(trans, root_id: id, subvol_id: 0, tree_id: &tree_id); |
905 | if (ret) |
906 | return ret; |
907 | |
908 | struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, size: sizeof(*snapshot)); |
909 | ret = PTR_ERR_OR_ZERO(ptr: snapshot); |
910 | if (ret) |
911 | return ret; |
912 | |
913 | bkey_snapshot_init(k: &snapshot->k_i); |
914 | snapshot->k.p = POS(0, id); |
915 | snapshot->v.tree = cpu_to_le32(tree_id); |
916 | snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c)); |
917 | |
918 | return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: |
919 | bch2_mark_snapshot(trans, btree: BTREE_ID_snapshots, level: 0, |
920 | bkey_s_c_null, new: bkey_i_to_s(k: &snapshot->k_i), flags: 0) ?: |
921 | bch2_snapshot_set_equiv(trans, k: bkey_i_to_s_c(k: &snapshot->k_i)); |
922 | } |
923 | |
924 | /* Figure out which snapshot nodes belong in the same tree: */ |
struct snapshot_tree_reconstruct {
	enum btree_id		btree;		/* btree currently being scanned */
	struct bpos		cur_pos;	/* position of the last key seen */
	snapshot_id_list	cur_ids;	/* snapshot ids seen at cur_pos */
	DARRAY(snapshot_id_list) trees;		/* disjoint sets of related snapshot ids */
};
931 | |
/* Free every snapshot id list accumulated during reconstruction. */
static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
{
	darray_for_each(r->trees, i)
		darray_exit(i);
	darray_exit(&r->trees);
	darray_exit(&r->cur_ids);
}
939 | |
940 | static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos) |
941 | { |
942 | return r->btree == BTREE_ID_inodes |
943 | ? r->cur_pos.offset == pos.offset |
944 | : r->cur_pos.inode == pos.inode; |
945 | } |
946 | |
947 | static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) |
948 | { |
949 | darray_for_each(*l, i) |
950 | if (snapshot_list_has_id(s: r, id: *i)) |
951 | return true; |
952 | return false; |
953 | } |
954 | |
955 | static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) |
956 | { |
957 | bool first = true; |
958 | darray_for_each(*s, i) { |
959 | if (!first) |
960 | prt_char(out, c: ' '); |
961 | first = false; |
962 | prt_printf(out, "%u" , *i); |
963 | } |
964 | } |
965 | |
966 | static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r) |
967 | { |
968 | if (r->cur_ids.nr) { |
969 | darray_for_each(r->trees, i) |
970 | if (snapshot_id_lists_have_common(l: i, r: &r->cur_ids)) { |
971 | int ret = snapshot_list_merge(c, dst: i, src: &r->cur_ids); |
972 | if (ret) |
973 | return ret; |
974 | goto out; |
975 | } |
976 | darray_push(&r->trees, r->cur_ids); |
977 | darray_init(&r->cur_ids); |
978 | } |
979 | out: |
980 | r->cur_ids.nr = 0; |
981 | return 0; |
982 | } |
983 | |
984 | static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos) |
985 | { |
986 | if (!same_snapshot(r, pos)) |
987 | snapshot_tree_reconstruct_next(c, r); |
988 | r->cur_pos = pos; |
989 | return snapshot_list_add_nodup(c, s: &r->cur_ids, id: pos.snapshot); |
990 | } |
991 | |
992 | int bch2_reconstruct_snapshots(struct bch_fs *c) |
993 | { |
994 | struct btree_trans *trans = bch2_trans_get(c); |
995 | struct printbuf buf = PRINTBUF; |
996 | struct snapshot_tree_reconstruct r = {}; |
997 | int ret = 0; |
998 | |
999 | for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { |
1000 | if (btree_type_has_snapshots(id: btree)) { |
1001 | r.btree = btree; |
1002 | |
1003 | ret = for_each_btree_key(trans, iter, btree, POS_MIN, |
1004 | BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({ |
1005 | get_snapshot_trees(c, &r, k.k->p); |
1006 | })); |
1007 | if (ret) |
1008 | goto err; |
1009 | |
1010 | snapshot_tree_reconstruct_next(c, r: &r); |
1011 | } |
1012 | } |
1013 | |
1014 | darray_for_each(r.trees, t) { |
1015 | printbuf_reset(buf: &buf); |
1016 | snapshot_id_list_to_text(out: &buf, s: t); |
1017 | |
1018 | darray_for_each(*t, id) { |
1019 | if (fsck_err_on(!bch2_snapshot_equiv(c, *id), |
1020 | c, snapshot_node_missing, |
1021 | "snapshot node %u from tree %s missing" , *id, buf.buf)) { |
1022 | if (t->nr > 1) { |
1023 | bch_err(c, "cannot reconstruct snapshot trees with multiple nodes" ); |
1024 | ret = -BCH_ERR_fsck_repair_unimplemented; |
1025 | goto err; |
1026 | } |
1027 | |
1028 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
1029 | check_snapshot_exists(trans, *id)); |
1030 | if (ret) |
1031 | goto err; |
1032 | } |
1033 | } |
1034 | } |
1035 | fsck_err: |
1036 | err: |
1037 | bch2_trans_put(trans); |
1038 | snapshot_tree_reconstruct_exit(r: &r); |
1039 | printbuf_exit(&buf); |
1040 | bch_err_fn(c, ret); |
1041 | return ret; |
1042 | } |
1043 | |
1044 | /* |
1045 | * Mark a snapshot as deleted, for future cleanup: |
1046 | */ |
1047 | int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) |
1048 | { |
1049 | struct btree_iter iter; |
1050 | struct bkey_i_snapshot *s; |
1051 | int ret = 0; |
1052 | |
1053 | s = bch2_bkey_get_mut_typed(trans, &iter, |
1054 | BTREE_ID_snapshots, POS(0, id), |
1055 | 0, snapshot); |
1056 | ret = PTR_ERR_OR_ZERO(ptr: s); |
1057 | if (unlikely(ret)) { |
1058 | bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), |
1059 | trans->c, "missing snapshot %u" , id); |
1060 | return ret; |
1061 | } |
1062 | |
1063 | /* already deleted? */ |
1064 | if (BCH_SNAPSHOT_DELETED(k: &s->v)) |
1065 | goto err; |
1066 | |
1067 | SET_BCH_SNAPSHOT_DELETED(k: &s->v, v: true); |
1068 | SET_BCH_SNAPSHOT_SUBVOL(k: &s->v, v: false); |
1069 | s->v.subvol = 0; |
1070 | err: |
1071 | bch2_trans_iter_exit(trans, &iter); |
1072 | return ret; |
1073 | } |
1074 | |
1075 | static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) |
1076 | { |
1077 | if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) |
1078 | swap(s->children[0], s->children[1]); |
1079 | } |
1080 | |
1081 | static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) |
1082 | { |
1083 | struct bch_fs *c = trans->c; |
1084 | struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; |
1085 | struct btree_iter c_iter = (struct btree_iter) { NULL }; |
1086 | struct btree_iter tree_iter = (struct btree_iter) { NULL }; |
1087 | struct bkey_s_c_snapshot s; |
1088 | u32 parent_id, child_id; |
1089 | unsigned i; |
1090 | int ret = 0; |
1091 | |
1092 | s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), |
1093 | BTREE_ITER_INTENT, snapshot); |
1094 | ret = bkey_err(s); |
1095 | bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, |
1096 | "missing snapshot %u" , id); |
1097 | |
1098 | if (ret) |
1099 | goto err; |
1100 | |
1101 | BUG_ON(s.v->children[1]); |
1102 | |
1103 | parent_id = le32_to_cpu(s.v->parent); |
1104 | child_id = le32_to_cpu(s.v->children[0]); |
1105 | |
1106 | if (parent_id) { |
1107 | struct bkey_i_snapshot *parent; |
1108 | |
1109 | parent = bch2_bkey_get_mut_typed(trans, &p_iter, |
1110 | BTREE_ID_snapshots, POS(0, parent_id), |
1111 | 0, snapshot); |
1112 | ret = PTR_ERR_OR_ZERO(ptr: parent); |
1113 | bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, |
1114 | "missing snapshot %u" , parent_id); |
1115 | if (unlikely(ret)) |
1116 | goto err; |
1117 | |
1118 | /* find entry in parent->children for node being deleted */ |
1119 | for (i = 0; i < 2; i++) |
1120 | if (le32_to_cpu(parent->v.children[i]) == id) |
1121 | break; |
1122 | |
1123 | if (bch2_fs_inconsistent_on(i == 2, c, |
1124 | "snapshot %u missing child pointer to %u" , |
1125 | parent_id, id)) |
1126 | goto err; |
1127 | |
1128 | parent->v.children[i] = cpu_to_le32(child_id); |
1129 | |
1130 | normalize_snapshot_child_pointers(s: &parent->v); |
1131 | } |
1132 | |
1133 | if (child_id) { |
1134 | struct bkey_i_snapshot *child; |
1135 | |
1136 | child = bch2_bkey_get_mut_typed(trans, &c_iter, |
1137 | BTREE_ID_snapshots, POS(0, child_id), |
1138 | 0, snapshot); |
1139 | ret = PTR_ERR_OR_ZERO(ptr: child); |
1140 | bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, |
1141 | "missing snapshot %u" , child_id); |
1142 | if (unlikely(ret)) |
1143 | goto err; |
1144 | |
1145 | child->v.parent = cpu_to_le32(parent_id); |
1146 | |
1147 | if (!child->v.parent) { |
1148 | child->v.skip[0] = 0; |
1149 | child->v.skip[1] = 0; |
1150 | child->v.skip[2] = 0; |
1151 | } |
1152 | } |
1153 | |
1154 | if (!parent_id) { |
1155 | /* |
1156 | * We're deleting the root of a snapshot tree: update the |
1157 | * snapshot_tree entry to point to the new root, or delete it if |
1158 | * this is the last snapshot ID in this tree: |
1159 | */ |
1160 | struct bkey_i_snapshot_tree *s_t; |
1161 | |
1162 | BUG_ON(s.v->children[1]); |
1163 | |
1164 | s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, |
1165 | BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), |
1166 | 0, snapshot_tree); |
1167 | ret = PTR_ERR_OR_ZERO(ptr: s_t); |
1168 | if (ret) |
1169 | goto err; |
1170 | |
1171 | if (s.v->children[0]) { |
1172 | s_t->v.root_snapshot = s.v->children[0]; |
1173 | } else { |
1174 | s_t->k.type = KEY_TYPE_deleted; |
1175 | set_bkey_val_u64s(k: &s_t->k, val_u64s: 0); |
1176 | } |
1177 | } |
1178 | |
1179 | ret = bch2_btree_delete_at(trans, &iter, 0); |
1180 | err: |
1181 | bch2_trans_iter_exit(trans, &tree_iter); |
1182 | bch2_trans_iter_exit(trans, &p_iter); |
1183 | bch2_trans_iter_exit(trans, &c_iter); |
1184 | bch2_trans_iter_exit(trans, &iter); |
1185 | return ret; |
1186 | } |
1187 | |
1188 | static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, |
1189 | u32 *new_snapids, |
1190 | u32 *snapshot_subvols, |
1191 | unsigned nr_snapids) |
1192 | { |
1193 | struct bch_fs *c = trans->c; |
1194 | struct btree_iter iter; |
1195 | struct bkey_i_snapshot *n; |
1196 | struct bkey_s_c k; |
1197 | unsigned i, j; |
1198 | u32 depth = bch2_snapshot_depth(c, parent); |
1199 | int ret; |
1200 | |
1201 | bch2_trans_iter_init(trans, iter: &iter, btree_id: BTREE_ID_snapshots, |
1202 | POS_MIN, flags: BTREE_ITER_INTENT); |
1203 | k = bch2_btree_iter_peek(iter: &iter); |
1204 | ret = bkey_err(k); |
1205 | if (ret) |
1206 | goto err; |
1207 | |
1208 | for (i = 0; i < nr_snapids; i++) { |
1209 | k = bch2_btree_iter_prev_slot(&iter); |
1210 | ret = bkey_err(k); |
1211 | if (ret) |
1212 | goto err; |
1213 | |
1214 | if (!k.k || !k.k->p.offset) { |
1215 | ret = -BCH_ERR_ENOSPC_snapshot_create; |
1216 | goto err; |
1217 | } |
1218 | |
1219 | n = bch2_bkey_alloc(trans, &iter, 0, snapshot); |
1220 | ret = PTR_ERR_OR_ZERO(ptr: n); |
1221 | if (ret) |
1222 | goto err; |
1223 | |
1224 | n->v.flags = 0; |
1225 | n->v.parent = cpu_to_le32(parent); |
1226 | n->v.subvol = cpu_to_le32(snapshot_subvols[i]); |
1227 | n->v.tree = cpu_to_le32(tree); |
1228 | n->v.depth = cpu_to_le32(depth); |
1229 | n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); |
1230 | n->v.btime.hi = 0; |
1231 | |
1232 | for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) |
1233 | n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); |
1234 | |
1235 | bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); |
1236 | SET_BCH_SNAPSHOT_SUBVOL(k: &n->v, v: true); |
1237 | |
1238 | ret = __bch2_mark_snapshot(trans, btree: BTREE_ID_snapshots, level: 0, |
1239 | bkey_s_c_null, new: bkey_i_to_s_c(k: &n->k_i), flags: 0); |
1240 | if (ret) |
1241 | goto err; |
1242 | |
1243 | new_snapids[i] = iter.pos.offset; |
1244 | |
1245 | mutex_lock(&c->snapshot_table_lock); |
1246 | snapshot_t_mut(c, id: new_snapids[i])->equiv = new_snapids[i]; |
1247 | mutex_unlock(lock: &c->snapshot_table_lock); |
1248 | } |
1249 | err: |
1250 | bch2_trans_iter_exit(trans, &iter); |
1251 | return ret; |
1252 | } |
1253 | |
1254 | /* |
1255 | * Create new snapshot IDs as children of an existing snapshot ID: |
1256 | */ |
1257 | static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, |
1258 | u32 *new_snapids, |
1259 | u32 *snapshot_subvols, |
1260 | unsigned nr_snapids) |
1261 | { |
1262 | struct btree_iter iter; |
1263 | struct bkey_i_snapshot *n_parent; |
1264 | int ret = 0; |
1265 | |
1266 | n_parent = bch2_bkey_get_mut_typed(trans, &iter, |
1267 | BTREE_ID_snapshots, POS(0, parent), |
1268 | 0, snapshot); |
1269 | ret = PTR_ERR_OR_ZERO(ptr: n_parent); |
1270 | if (unlikely(ret)) { |
1271 | if (bch2_err_matches(ret, ENOENT)) |
1272 | bch_err(trans->c, "snapshot %u not found" , parent); |
1273 | return ret; |
1274 | } |
1275 | |
1276 | if (n_parent->v.children[0] || n_parent->v.children[1]) { |
1277 | bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children" ); |
1278 | ret = -EINVAL; |
1279 | goto err; |
1280 | } |
1281 | |
1282 | ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), |
1283 | new_snapids, snapshot_subvols, nr_snapids); |
1284 | if (ret) |
1285 | goto err; |
1286 | |
1287 | n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); |
1288 | n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); |
1289 | n_parent->v.subvol = 0; |
1290 | SET_BCH_SNAPSHOT_SUBVOL(k: &n_parent->v, v: false); |
1291 | err: |
1292 | bch2_trans_iter_exit(trans, &iter); |
1293 | return ret; |
1294 | } |
1295 | |
1296 | /* |
1297 | * Create a snapshot node that is the root of a new tree: |
1298 | */ |
1299 | static int bch2_snapshot_node_create_tree(struct btree_trans *trans, |
1300 | u32 *new_snapids, |
1301 | u32 *snapshot_subvols, |
1302 | unsigned nr_snapids) |
1303 | { |
1304 | struct bkey_i_snapshot_tree *n_tree; |
1305 | int ret; |
1306 | |
1307 | n_tree = __bch2_snapshot_tree_create(trans); |
1308 | ret = PTR_ERR_OR_ZERO(ptr: n_tree) ?: |
1309 | create_snapids(trans, parent: 0, tree: n_tree->k.p.offset, |
1310 | new_snapids, snapshot_subvols, nr_snapids); |
1311 | if (ret) |
1312 | return ret; |
1313 | |
1314 | n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); |
1315 | n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); |
1316 | return 0; |
1317 | } |
1318 | |
1319 | int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, |
1320 | u32 *new_snapids, |
1321 | u32 *snapshot_subvols, |
1322 | unsigned nr_snapids) |
1323 | { |
1324 | BUG_ON((parent == 0) != (nr_snapids == 1)); |
1325 | BUG_ON((parent != 0) != (nr_snapids == 2)); |
1326 | |
1327 | return parent |
1328 | ? bch2_snapshot_node_create_children(trans, parent, |
1329 | new_snapids, snapshot_subvols, nr_snapids) |
1330 | : bch2_snapshot_node_create_tree(trans, |
1331 | new_snapids, snapshot_subvols, nr_snapids); |
1332 | |
1333 | } |
1334 | |
1335 | /* |
1336 | * If we have an unlinked inode in an internal snapshot node, and the inode |
1337 | * really has been deleted in all child snapshots, how does this get cleaned up? |
1338 | * |
1339 | * first there is the problem of how keys that have been overwritten in all |
1340 | * child snapshots get deleted (unimplemented?), but inodes may perhaps be |
1341 | * special? |
1342 | * |
1343 | * also: unlinked inode in internal snapshot appears to not be getting deleted |
1344 | * correctly if inode doesn't exist in leaf snapshots |
1345 | * |
1346 | * solution: |
1347 | * |
1348 | * for a key in an interior snapshot node that needs work to be done that |
1349 | * requires it to be mutated: iterate over all descendent leaf nodes and copy |
1350 | * that key to snapshot leaf nodes, where we can mutate it |
1351 | */ |
1352 | |
1353 | static int snapshot_delete_key(struct btree_trans *trans, |
1354 | struct btree_iter *iter, |
1355 | struct bkey_s_c k, |
1356 | snapshot_id_list *deleted, |
1357 | snapshot_id_list *equiv_seen, |
1358 | struct bpos *last_pos) |
1359 | { |
1360 | struct bch_fs *c = trans->c; |
1361 | u32 equiv = bch2_snapshot_equiv(c, id: k.k->p.snapshot); |
1362 | |
1363 | if (!bkey_eq(l: k.k->p, r: *last_pos)) |
1364 | equiv_seen->nr = 0; |
1365 | *last_pos = k.k->p; |
1366 | |
1367 | if (snapshot_list_has_id(s: deleted, id: k.k->p.snapshot) || |
1368 | snapshot_list_has_id(s: equiv_seen, id: equiv)) { |
1369 | return bch2_btree_delete_at(trans, iter, |
1370 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); |
1371 | } else { |
1372 | return snapshot_list_add(c, s: equiv_seen, id: equiv); |
1373 | } |
1374 | } |
1375 | |
1376 | static int move_key_to_correct_snapshot(struct btree_trans *trans, |
1377 | struct btree_iter *iter, |
1378 | struct bkey_s_c k) |
1379 | { |
1380 | struct bch_fs *c = trans->c; |
1381 | u32 equiv = bch2_snapshot_equiv(c, id: k.k->p.snapshot); |
1382 | |
1383 | /* |
1384 | * When we have a linear chain of snapshot nodes, we consider |
1385 | * those to form an equivalence class: we're going to collapse |
1386 | * them all down to a single node, and keep the leaf-most node - |
1387 | * which has the same id as the equivalence class id. |
1388 | * |
1389 | * If there are multiple keys in different snapshots at the same |
1390 | * position, we're only going to keep the one in the newest |
1391 | * snapshot - the rest have been overwritten and are redundant, |
1392 | * and for the key we're going to keep we need to move it to the |
1393 | * equivalance class ID if it's not there already. |
1394 | */ |
1395 | if (equiv != k.k->p.snapshot) { |
1396 | struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); |
1397 | struct btree_iter new_iter; |
1398 | int ret; |
1399 | |
1400 | ret = PTR_ERR_OR_ZERO(ptr: new); |
1401 | if (ret) |
1402 | return ret; |
1403 | |
1404 | new->k.p.snapshot = equiv; |
1405 | |
1406 | bch2_trans_iter_init(trans, iter: &new_iter, btree_id: iter->btree_id, pos: new->k.p, |
1407 | flags: BTREE_ITER_ALL_SNAPSHOTS| |
1408 | BTREE_ITER_CACHED| |
1409 | BTREE_ITER_INTENT); |
1410 | |
1411 | ret = bch2_btree_iter_traverse(&new_iter) ?: |
1412 | bch2_trans_update(trans, &new_iter, new, |
1413 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: |
1414 | bch2_btree_delete_at(trans, iter, |
1415 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); |
1416 | bch2_trans_iter_exit(trans, &new_iter); |
1417 | if (ret) |
1418 | return ret; |
1419 | } |
1420 | |
1421 | return 0; |
1422 | } |
1423 | |
1424 | static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) |
1425 | { |
1426 | struct bkey_s_c_snapshot snap; |
1427 | u32 children[2]; |
1428 | int ret; |
1429 | |
1430 | if (k.k->type != KEY_TYPE_snapshot) |
1431 | return 0; |
1432 | |
1433 | snap = bkey_s_c_to_snapshot(k); |
1434 | if (BCH_SNAPSHOT_DELETED(k: snap.v) || |
1435 | BCH_SNAPSHOT_SUBVOL(k: snap.v)) |
1436 | return 0; |
1437 | |
1438 | children[0] = le32_to_cpu(snap.v->children[0]); |
1439 | children[1] = le32_to_cpu(snap.v->children[1]); |
1440 | |
1441 | ret = bch2_snapshot_live(trans, id: children[0]) ?: |
1442 | bch2_snapshot_live(trans, id: children[1]); |
1443 | if (ret < 0) |
1444 | return ret; |
1445 | return !ret; |
1446 | } |
1447 | |
1448 | /* |
1449 | * For a given snapshot, if it doesn't have a subvolume that points to it, and |
1450 | * it doesn't have child snapshot nodes - it's now redundant and we can mark it |
1451 | * as deleted. |
1452 | */ |
1453 | static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) |
1454 | { |
1455 | int ret = bch2_snapshot_needs_delete(trans, k); |
1456 | |
1457 | return ret <= 0 |
1458 | ? ret |
1459 | : bch2_snapshot_node_set_deleted(trans, id: k.k->p.offset); |
1460 | } |
1461 | |
1462 | static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, |
1463 | snapshot_id_list *skip) |
1464 | { |
1465 | rcu_read_lock(); |
1466 | while (snapshot_list_has_id(s: skip, id)) |
1467 | id = __bch2_snapshot_parent(c, id); |
1468 | |
1469 | while (n--) { |
1470 | do { |
1471 | id = __bch2_snapshot_parent(c, id); |
1472 | } while (snapshot_list_has_id(s: skip, id)); |
1473 | } |
1474 | rcu_read_unlock(); |
1475 | |
1476 | return id; |
1477 | } |
1478 | |
1479 | static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, |
1480 | struct btree_iter *iter, struct bkey_s_c k, |
1481 | snapshot_id_list *deleted) |
1482 | { |
1483 | struct bch_fs *c = trans->c; |
1484 | u32 nr_deleted_ancestors = 0; |
1485 | struct bkey_i_snapshot *s; |
1486 | int ret; |
1487 | |
1488 | if (k.k->type != KEY_TYPE_snapshot) |
1489 | return 0; |
1490 | |
1491 | if (snapshot_list_has_id(s: deleted, id: k.k->p.offset)) |
1492 | return 0; |
1493 | |
1494 | s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); |
1495 | ret = PTR_ERR_OR_ZERO(ptr: s); |
1496 | if (ret) |
1497 | return ret; |
1498 | |
1499 | darray_for_each(*deleted, i) |
1500 | nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, id: s->k.p.offset, ancestor: *i); |
1501 | |
1502 | if (!nr_deleted_ancestors) |
1503 | return 0; |
1504 | |
1505 | le32_add_cpu(var: &s->v.depth, val: -nr_deleted_ancestors); |
1506 | |
1507 | if (!s->v.depth) { |
1508 | s->v.skip[0] = 0; |
1509 | s->v.skip[1] = 0; |
1510 | s->v.skip[2] = 0; |
1511 | } else { |
1512 | u32 depth = le32_to_cpu(s->v.depth); |
1513 | u32 parent = bch2_snapshot_parent(c, id: s->k.p.offset); |
1514 | |
1515 | for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { |
1516 | u32 id = le32_to_cpu(s->v.skip[j]); |
1517 | |
1518 | if (snapshot_list_has_id(s: deleted, id)) { |
1519 | id = bch2_snapshot_nth_parent_skip(c, |
1520 | id: parent, |
1521 | n: depth > 1 |
1522 | ? get_random_u32_below(ceil: depth - 1) |
1523 | : 0, |
1524 | skip: deleted); |
1525 | s->v.skip[j] = cpu_to_le32(id); |
1526 | } |
1527 | } |
1528 | |
1529 | bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); |
1530 | } |
1531 | |
1532 | return bch2_trans_update(trans, iter, &s->k_i, 0); |
1533 | } |
1534 | |
1535 | int bch2_delete_dead_snapshots(struct bch_fs *c) |
1536 | { |
1537 | struct btree_trans *trans; |
1538 | snapshot_id_list deleted = { 0 }; |
1539 | snapshot_id_list deleted_interior = { 0 }; |
1540 | u32 id; |
1541 | int ret = 0; |
1542 | |
1543 | if (!test_and_clear_bit(nr: BCH_FS_need_delete_dead_snapshots, addr: &c->flags)) |
1544 | return 0; |
1545 | |
1546 | if (!test_bit(BCH_FS_started, &c->flags)) { |
1547 | ret = bch2_fs_read_write_early(c); |
1548 | bch_err_msg(c, ret, "deleting dead snapshots: error going rw" ); |
1549 | if (ret) |
1550 | return ret; |
1551 | } |
1552 | |
1553 | trans = bch2_trans_get(c); |
1554 | |
1555 | /* |
1556 | * For every snapshot node: If we have no live children and it's not |
1557 | * pointed to by a subvolume, delete it: |
1558 | */ |
1559 | ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, |
1560 | POS_MIN, 0, k, |
1561 | NULL, NULL, 0, |
1562 | bch2_delete_redundant_snapshot(trans, k)); |
1563 | bch_err_msg(c, ret, "deleting redundant snapshots" ); |
1564 | if (ret) |
1565 | goto err; |
1566 | |
1567 | ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, |
1568 | POS_MIN, 0, k, |
1569 | bch2_snapshot_set_equiv(trans, k)); |
1570 | bch_err_msg(c, ret, "in bch2_snapshots_set_equiv" ); |
1571 | if (ret) |
1572 | goto err; |
1573 | |
1574 | ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, |
1575 | POS_MIN, 0, k, ({ |
1576 | if (k.k->type != KEY_TYPE_snapshot) |
1577 | continue; |
1578 | |
1579 | BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) |
1580 | ? snapshot_list_add(c, &deleted, k.k->p.offset) |
1581 | : 0; |
1582 | })); |
1583 | bch_err_msg(c, ret, "walking snapshots" ); |
1584 | if (ret) |
1585 | goto err; |
1586 | |
1587 | for (id = 0; id < BTREE_ID_NR; id++) { |
1588 | struct bpos last_pos = POS_MIN; |
1589 | snapshot_id_list equiv_seen = { 0 }; |
1590 | struct disk_reservation res = { 0 }; |
1591 | |
1592 | if (!btree_type_has_snapshots(id)) |
1593 | continue; |
1594 | |
1595 | /* |
1596 | * deleted inodes btree is maintained by a trigger on the inodes |
1597 | * btree - no work for us to do here, and it's not safe to scan |
1598 | * it because we'll see out of date keys due to the btree write |
1599 | * buffer: |
1600 | */ |
1601 | if (id == BTREE_ID_deleted_inodes) |
1602 | continue; |
1603 | |
1604 | ret = for_each_btree_key_commit(trans, iter, |
1605 | id, POS_MIN, |
1606 | BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, |
1607 | &res, NULL, BCH_TRANS_COMMIT_no_enospc, |
1608 | snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: |
1609 | for_each_btree_key_commit(trans, iter, |
1610 | id, POS_MIN, |
1611 | BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, |
1612 | &res, NULL, BCH_TRANS_COMMIT_no_enospc, |
1613 | move_key_to_correct_snapshot(trans, &iter, k)); |
1614 | |
1615 | bch2_disk_reservation_put(c, res: &res); |
1616 | darray_exit(&equiv_seen); |
1617 | |
1618 | bch_err_msg(c, ret, "deleting keys from dying snapshots" ); |
1619 | if (ret) |
1620 | goto err; |
1621 | } |
1622 | |
1623 | bch2_trans_unlock(trans); |
1624 | down_write(sem: &c->snapshot_create_lock); |
1625 | |
1626 | ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, |
1627 | POS_MIN, 0, k, ({ |
1628 | u32 snapshot = k.k->p.offset; |
1629 | u32 equiv = bch2_snapshot_equiv(c, snapshot); |
1630 | |
1631 | equiv != snapshot |
1632 | ? snapshot_list_add(c, &deleted_interior, snapshot) |
1633 | : 0; |
1634 | })); |
1635 | |
1636 | bch_err_msg(c, ret, "walking snapshots" ); |
1637 | if (ret) |
1638 | goto err_create_lock; |
1639 | |
1640 | /* |
1641 | * Fixing children of deleted snapshots can't be done completely |
1642 | * atomically, if we crash between here and when we delete the interior |
1643 | * nodes some depth fields will be off: |
1644 | */ |
1645 | ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, |
1646 | BTREE_ITER_INTENT, k, |
1647 | NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
1648 | bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); |
1649 | if (ret) |
1650 | goto err_create_lock; |
1651 | |
1652 | darray_for_each(deleted, i) { |
1653 | ret = commit_do(trans, NULL, NULL, 0, |
1654 | bch2_snapshot_node_delete(trans, *i)); |
1655 | bch_err_msg(c, ret, "deleting snapshot %u" , *i); |
1656 | if (ret) |
1657 | goto err_create_lock; |
1658 | } |
1659 | |
1660 | darray_for_each(deleted_interior, i) { |
1661 | ret = commit_do(trans, NULL, NULL, 0, |
1662 | bch2_snapshot_node_delete(trans, *i)); |
1663 | bch_err_msg(c, ret, "deleting snapshot %u" , *i); |
1664 | if (ret) |
1665 | goto err_create_lock; |
1666 | } |
1667 | err_create_lock: |
1668 | up_write(sem: &c->snapshot_create_lock); |
1669 | err: |
1670 | darray_exit(&deleted_interior); |
1671 | darray_exit(&deleted); |
1672 | bch2_trans_put(trans); |
1673 | bch_err_fn(c, ret); |
1674 | return ret; |
1675 | } |
1676 | |
1677 | void bch2_delete_dead_snapshots_work(struct work_struct *work) |
1678 | { |
1679 | struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); |
1680 | |
1681 | bch2_delete_dead_snapshots(c); |
1682 | bch2_write_ref_put(c, ref: BCH_WRITE_REF_delete_dead_snapshots); |
1683 | } |
1684 | |
1685 | void bch2_delete_dead_snapshots_async(struct bch_fs *c) |
1686 | { |
1687 | if (bch2_write_ref_tryget(c, ref: BCH_WRITE_REF_delete_dead_snapshots) && |
1688 | !queue_work(wq: c->write_ref_wq, work: &c->snapshot_delete_work)) |
1689 | bch2_write_ref_put(c, ref: BCH_WRITE_REF_delete_dead_snapshots); |
1690 | } |
1691 | |
1692 | int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, |
1693 | enum btree_id id, |
1694 | struct bpos pos) |
1695 | { |
1696 | struct bch_fs *c = trans->c; |
1697 | struct btree_iter iter; |
1698 | struct bkey_s_c k; |
1699 | int ret; |
1700 | |
1701 | bch2_trans_iter_init(trans, iter: &iter, btree_id: id, pos, |
1702 | flags: BTREE_ITER_NOT_EXTENTS| |
1703 | BTREE_ITER_ALL_SNAPSHOTS); |
1704 | while (1) { |
1705 | k = bch2_btree_iter_prev(&iter); |
1706 | ret = bkey_err(k); |
1707 | if (ret) |
1708 | break; |
1709 | |
1710 | if (!k.k) |
1711 | break; |
1712 | |
1713 | if (!bkey_eq(l: pos, r: k.k->p)) |
1714 | break; |
1715 | |
1716 | if (bch2_snapshot_is_ancestor(c, id: k.k->p.snapshot, ancestor: pos.snapshot)) { |
1717 | ret = 1; |
1718 | break; |
1719 | } |
1720 | } |
1721 | bch2_trans_iter_exit(trans, &iter); |
1722 | |
1723 | return ret; |
1724 | } |
1725 | |
1726 | static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) |
1727 | { |
1728 | const struct snapshot_t *s = snapshot_t(c, id); |
1729 | |
1730 | return s->children[1] ?: s->children[0]; |
1731 | } |
1732 | |
1733 | static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) |
1734 | { |
1735 | u32 child; |
1736 | |
1737 | while ((child = bch2_snapshot_smallest_child(c, id))) |
1738 | id = child; |
1739 | return id; |
1740 | } |
1741 | |
1742 | static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, |
1743 | enum btree_id btree, |
1744 | struct bkey_s_c interior_k, |
1745 | u32 leaf_id, struct bpos *new_min_pos) |
1746 | { |
1747 | struct btree_iter iter; |
1748 | struct bpos pos = interior_k.k->p; |
1749 | struct bkey_s_c k; |
1750 | struct bkey_i *new; |
1751 | int ret; |
1752 | |
1753 | pos.snapshot = leaf_id; |
1754 | |
1755 | bch2_trans_iter_init(trans, iter: &iter, btree_id: btree, pos, flags: BTREE_ITER_INTENT); |
1756 | k = bch2_btree_iter_peek_slot(&iter); |
1757 | ret = bkey_err(k); |
1758 | if (ret) |
1759 | goto out; |
1760 | |
1761 | /* key already overwritten in this snapshot? */ |
1762 | if (k.k->p.snapshot != interior_k.k->p.snapshot) |
1763 | goto out; |
1764 | |
1765 | if (bpos_eq(l: *new_min_pos, POS_MIN)) { |
1766 | *new_min_pos = k.k->p; |
1767 | new_min_pos->snapshot = leaf_id; |
1768 | } |
1769 | |
1770 | new = bch2_bkey_make_mut_noupdate(trans, k: interior_k); |
1771 | ret = PTR_ERR_OR_ZERO(ptr: new); |
1772 | if (ret) |
1773 | goto out; |
1774 | |
1775 | new->k.p.snapshot = leaf_id; |
1776 | ret = bch2_trans_update(trans, &iter, new, 0); |
1777 | out: |
1778 | bch2_trans_iter_exit(trans, &iter); |
1779 | return ret; |
1780 | } |
1781 | |
1782 | int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, |
1783 | enum btree_id btree, |
1784 | struct bkey_s_c k, |
1785 | struct bpos *new_min_pos) |
1786 | { |
1787 | struct bch_fs *c = trans->c; |
1788 | struct bkey_buf sk; |
1789 | u32 restart_count = trans->restart_count; |
1790 | int ret = 0; |
1791 | |
1792 | bch2_bkey_buf_init(s: &sk); |
1793 | bch2_bkey_buf_reassemble(s: &sk, c, k); |
1794 | k = bkey_i_to_s_c(k: sk.k); |
1795 | |
1796 | *new_min_pos = POS_MIN; |
1797 | |
1798 | for (u32 id = bch2_snapshot_smallest_descendent(c, id: k.k->p.snapshot); |
1799 | id < k.k->p.snapshot; |
1800 | id++) { |
1801 | if (!bch2_snapshot_is_ancestor(c, id, ancestor: k.k->p.snapshot) || |
1802 | !bch2_snapshot_is_leaf(c, id)) |
1803 | continue; |
1804 | again: |
1805 | ret = btree_trans_too_many_iters(trans) ?: |
1806 | bch2_propagate_key_to_snapshot_leaf(trans, btree, interior_k: k, leaf_id: id, new_min_pos) ?: |
1807 | bch2_trans_commit(trans, NULL, NULL, flags: 0); |
1808 | if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { |
1809 | bch2_trans_begin(trans); |
1810 | goto again; |
1811 | } |
1812 | |
1813 | if (ret) |
1814 | break; |
1815 | } |
1816 | |
1817 | bch2_bkey_buf_exit(s: &sk, c); |
1818 | |
1819 | return ret ?: trans_was_restarted(trans, restart_count); |
1820 | } |
1821 | |
1822 | static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) |
1823 | { |
1824 | struct bch_fs *c = trans->c; |
1825 | struct bkey_s_c_snapshot snap; |
1826 | int ret = 0; |
1827 | |
1828 | if (k.k->type != KEY_TYPE_snapshot) |
1829 | return 0; |
1830 | |
1831 | snap = bkey_s_c_to_snapshot(k); |
1832 | if (BCH_SNAPSHOT_DELETED(k: snap.v) || |
1833 | bch2_snapshot_equiv(c, id: k.k->p.offset) != k.k->p.offset || |
1834 | (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { |
1835 | set_bit(nr: BCH_FS_need_delete_dead_snapshots, addr: &c->flags); |
1836 | return 0; |
1837 | } |
1838 | |
1839 | return ret; |
1840 | } |
1841 | |
int bch2_snapshots_read(struct bch_fs *c)
{
	/*
	 * Two passes over the snapshots btree: first populate the in-memory
	 * snapshot table (__bch2_mark_snapshot), compute equivalence classes
	 * and flag nodes needing deletion; then build the is_ancestor bitmaps,
	 * which need the whole table populated first.
	 */
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_snapshots,
				   POS_MIN, 0, k,
			__bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_snapshot_set_equiv(trans, k) ?:
			bch2_check_snapshot_needs_deletion(trans, k)) ?:
		for_each_btree_key(trans, iter, BTREE_ID_snapshots,
				   POS_MIN, 0, k,
			(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
	bch_err_fn(c, ret);

	/*
	 * It's important that we check if we need to reconstruct snapshots
	 * before going RW, so we mark that pass as required in the superblock -
	 * otherwise, we could end up deleting keys with missing snapshot nodes
	 * instead
	 */
	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
	       test_bit(BCH_FS_may_go_rw, &c->flags));

	/* snapshots btree unreadable or known lost: schedule reconstruction */
	if (bch2_err_matches(ret, EIO) ||
	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);

	return ret;
}
1870 | |
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	/*
	 * Teardown: free the RCU-protected snapshot table. The constant-true
	 * rcu_dereference_protected() condition asserts the caller guarantees
	 * no concurrent readers remain at this point.
	 */
	kvfree(rcu_dereference_protected(c->snapshots, true));
}
1875 | |