1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "alloc_foreground.h" |
5 | #include "bkey_buf.h" |
6 | #include "btree_update.h" |
7 | #include "buckets.h" |
8 | #include "data_update.h" |
9 | #include "ec.h" |
10 | #include "error.h" |
11 | #include "extents.h" |
12 | #include "io_write.h" |
13 | #include "keylist.h" |
14 | #include "move.h" |
15 | #include "nocow_locking.h" |
16 | #include "rebalance.h" |
17 | #include "snapshot.h" |
18 | #include "subvolume.h" |
19 | #include "trace.h" |
20 | |
21 | static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) |
22 | { |
23 | if (trace_move_extent_finish_enabled()) { |
24 | struct printbuf buf = PRINTBUF; |
25 | |
26 | bch2_bkey_val_to_text(&buf, c, k); |
27 | trace_move_extent_finish(c, str: buf.buf); |
28 | printbuf_exit(&buf); |
29 | } |
30 | } |
31 | |
32 | static void trace_move_extent_fail2(struct data_update *m, |
33 | struct bkey_s_c new, |
34 | struct bkey_s_c wrote, |
35 | struct bkey_i *insert, |
36 | const char *msg) |
37 | { |
38 | struct bch_fs *c = m->op.c; |
39 | struct bkey_s_c old = bkey_i_to_s_c(k: m->k.k); |
40 | const union bch_extent_entry *entry; |
41 | struct bch_extent_ptr *ptr; |
42 | struct extent_ptr_decoded p; |
43 | struct printbuf buf = PRINTBUF; |
44 | unsigned i, rewrites_found = 0; |
45 | |
46 | if (!trace_move_extent_fail_enabled()) |
47 | return; |
48 | |
49 | prt_str(out: &buf, str: msg); |
50 | |
51 | if (insert) { |
52 | i = 0; |
53 | bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { |
54 | if (((1U << i) & m->data_opts.rewrite_ptrs) && |
55 | (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(k: insert))) && |
56 | !ptr->cached) |
57 | rewrites_found |= 1U << i; |
58 | i++; |
59 | } |
60 | } |
61 | |
62 | prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u" , |
63 | (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, |
64 | (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, |
65 | (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, |
66 | (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); |
67 | |
68 | prt_printf(&buf, "\nrewrites found: %u%u%u%u" , |
69 | (rewrites_found & (1 << 0)) != 0, |
70 | (rewrites_found & (1 << 1)) != 0, |
71 | (rewrites_found & (1 << 2)) != 0, |
72 | (rewrites_found & (1 << 3)) != 0); |
73 | |
74 | prt_str(out: &buf, str: "\nold: " ); |
75 | bch2_bkey_val_to_text(&buf, c, old); |
76 | |
77 | prt_str(out: &buf, str: "\nnew: " ); |
78 | bch2_bkey_val_to_text(&buf, c, new); |
79 | |
80 | prt_str(out: &buf, str: "\nwrote: " ); |
81 | bch2_bkey_val_to_text(&buf, c, wrote); |
82 | |
83 | if (insert) { |
84 | prt_str(out: &buf, str: "\ninsert: " ); |
85 | bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k: insert)); |
86 | } |
87 | |
88 | trace_move_extent_fail(c, str: buf.buf); |
89 | printbuf_exit(&buf); |
90 | } |
91 | |
92 | static int __bch2_data_update_index_update(struct btree_trans *trans, |
93 | struct bch_write_op *op) |
94 | { |
95 | struct bch_fs *c = op->c; |
96 | struct btree_iter iter; |
97 | struct data_update *m = |
98 | container_of(op, struct data_update, op); |
99 | struct keylist *keys = &op->insert_keys; |
100 | struct bkey_buf _new, _insert; |
101 | int ret = 0; |
102 | |
103 | bch2_bkey_buf_init(s: &_new); |
104 | bch2_bkey_buf_init(s: &_insert); |
105 | bch2_bkey_buf_realloc(s: &_insert, c, U8_MAX); |
106 | |
107 | bch2_trans_iter_init(trans, iter: &iter, btree_id: m->btree_id, |
108 | pos: bkey_start_pos(k: &bch2_keylist_front(l: keys)->k), |
109 | flags: BTREE_ITER_SLOTS|BTREE_ITER_INTENT); |
110 | |
111 | while (1) { |
112 | struct bkey_s_c k; |
113 | struct bkey_s_c old = bkey_i_to_s_c(k: m->k.k); |
114 | struct bkey_i *insert = NULL; |
115 | struct bkey_i_extent *new; |
116 | const union bch_extent_entry *entry_c; |
117 | union bch_extent_entry *entry; |
118 | struct extent_ptr_decoded p; |
119 | struct bch_extent_ptr *ptr; |
120 | const struct bch_extent_ptr *ptr_c; |
121 | struct bpos next_pos; |
122 | bool should_check_enospc; |
123 | s64 i_sectors_delta = 0, disk_sectors_delta = 0; |
124 | unsigned rewrites_found = 0, durability, i; |
125 | |
126 | bch2_trans_begin(trans); |
127 | |
128 | k = bch2_btree_iter_peek_slot(&iter); |
129 | ret = bkey_err(k); |
130 | if (ret) |
131 | goto err; |
132 | |
133 | new = bkey_i_to_extent(k: bch2_keylist_front(l: keys)); |
134 | |
135 | if (!bch2_extents_match(k, old)) { |
136 | trace_move_extent_fail2(m, new: k, wrote: bkey_i_to_s_c(k: &new->k_i), |
137 | NULL, msg: "no match:" ); |
138 | goto nowork; |
139 | } |
140 | |
141 | bkey_reassemble(dst: _insert.k, src: k); |
142 | insert = _insert.k; |
143 | |
144 | bch2_bkey_buf_copy(s: &_new, c, src: bch2_keylist_front(l: keys)); |
145 | new = bkey_i_to_extent(k: _new.k); |
146 | bch2_cut_front(where: iter.pos, k: &new->k_i); |
147 | |
148 | bch2_cut_front(where: iter.pos, k: insert); |
149 | bch2_cut_back(where: new->k.p, k: insert); |
150 | bch2_cut_back(where: insert->k.p, k: &new->k_i); |
151 | |
152 | /* |
153 | * @old: extent that we read from |
154 | * @insert: key that we're going to update, initialized from |
155 | * extent currently in btree - same as @old unless we raced with |
156 | * other updates |
157 | * @new: extent with new pointers that we'll be adding to @insert |
158 | * |
159 | * Fist, drop rewrite_ptrs from @new: |
160 | */ |
161 | i = 0; |
162 | bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { |
163 | if (((1U << i) & m->data_opts.rewrite_ptrs) && |
164 | (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(k: insert))) && |
165 | !ptr->cached) { |
166 | bch2_extent_ptr_set_cached(bkey_i_to_s(k: insert), ptr); |
167 | rewrites_found |= 1U << i; |
168 | } |
169 | i++; |
170 | } |
171 | |
172 | if (m->data_opts.rewrite_ptrs && |
173 | !rewrites_found && |
174 | bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { |
175 | trace_move_extent_fail2(m, new: k, wrote: bkey_i_to_s_c(k: &new->k_i), insert, msg: "no rewrites found:" ); |
176 | goto nowork; |
177 | } |
178 | |
179 | /* |
180 | * A replica that we just wrote might conflict with a replica |
181 | * that we want to keep, due to racing with another move: |
182 | */ |
183 | restart_drop_conflicting_replicas: |
184 | extent_for_each_ptr(extent_i_to_s(new), ptr) |
185 | if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(k: insert), ptr->dev)) && |
186 | !ptr_c->cached) { |
187 | bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k: &new->k_i), ptr); |
188 | goto restart_drop_conflicting_replicas; |
189 | } |
190 | |
191 | if (!bkey_val_u64s(&new->k)) { |
192 | trace_move_extent_fail2(m, new: k, wrote: bkey_i_to_s_c(k: &new->k_i), insert, msg: "new replicas conflicted:" ); |
193 | goto nowork; |
194 | } |
195 | |
196 | /* Now, drop pointers that conflict with what we just wrote: */ |
197 | extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) |
198 | if ((ptr = bch2_bkey_has_device(k: bkey_i_to_s(k: insert), dev: p.ptr.dev))) |
199 | bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k: insert), ptr); |
200 | |
201 | durability = bch2_bkey_durability(c, bkey_i_to_s_c(k: insert)) + |
202 | bch2_bkey_durability(c, bkey_i_to_s_c(k: &new->k_i)); |
203 | |
204 | /* Now, drop excess replicas: */ |
205 | : |
206 | bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { |
207 | unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); |
208 | |
209 | if (!p.ptr.cached && |
210 | durability - ptr_durability >= m->op.opts.data_replicas) { |
211 | durability -= ptr_durability; |
212 | |
213 | bch2_extent_ptr_set_cached(bkey_i_to_s(k: insert), &entry->ptr); |
214 | goto restart_drop_extra_replicas; |
215 | } |
216 | } |
217 | |
218 | /* Finally, add the pointers we just wrote: */ |
219 | extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) |
220 | bch2_extent_ptr_decoded_append(insert, &p); |
221 | |
222 | bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); |
223 | bch2_extent_normalize(c, bkey_i_to_s(insert)); |
224 | |
225 | ret = bch2_sum_sector_overwrites(trans, &iter, insert, |
226 | &should_check_enospc, |
227 | &i_sectors_delta, |
228 | &disk_sectors_delta); |
229 | if (ret) |
230 | goto err; |
231 | |
232 | if (disk_sectors_delta > (s64) op->res.sectors) { |
233 | ret = bch2_disk_reservation_add(c, &op->res, |
234 | disk_sectors_delta - op->res.sectors, |
235 | !should_check_enospc |
236 | ? BCH_DISK_RESERVATION_NOFAIL : 0); |
237 | if (ret) |
238 | goto out; |
239 | } |
240 | |
241 | next_pos = insert->k.p; |
242 | |
243 | /* |
244 | * Check for nonce offset inconsistency: |
245 | * This is debug code - we've been seeing this bug rarely, and |
246 | * it's been hard to reproduce, so this should give us some more |
247 | * information when it does occur: |
248 | */ |
249 | struct printbuf err = PRINTBUF; |
250 | int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); |
251 | printbuf_exit(&err); |
252 | |
253 | if (invalid) { |
254 | struct printbuf buf = PRINTBUF; |
255 | |
256 | prt_str(&buf, "about to insert invalid key in data update path" ); |
257 | prt_str(&buf, "\nold: " ); |
258 | bch2_bkey_val_to_text(&buf, c, old); |
259 | prt_str(&buf, "\nk: " ); |
260 | bch2_bkey_val_to_text(&buf, c, k); |
261 | prt_str(&buf, "\nnew: " ); |
262 | bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); |
263 | |
264 | bch2_print_string_as_lines(KERN_ERR, buf.buf); |
265 | printbuf_exit(&buf); |
266 | |
267 | bch2_fatal_error(c); |
268 | goto out; |
269 | } |
270 | |
271 | if (trace_data_update_enabled()) { |
272 | struct printbuf buf = PRINTBUF; |
273 | |
274 | prt_str(&buf, "\nold: " ); |
275 | bch2_bkey_val_to_text(&buf, c, old); |
276 | prt_str(&buf, "\nk: " ); |
277 | bch2_bkey_val_to_text(&buf, c, k); |
278 | prt_str(&buf, "\nnew: " ); |
279 | bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); |
280 | |
281 | trace_data_update(c, buf.buf); |
282 | printbuf_exit(&buf); |
283 | } |
284 | |
285 | ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, |
286 | k.k->p, bkey_start_pos(&insert->k)) ?: |
287 | bch2_insert_snapshot_whiteouts(trans, m->btree_id, |
288 | k.k->p, insert->k.p) ?: |
289 | bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: |
290 | bch2_trans_update(trans, &iter, insert, |
291 | BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: |
292 | bch2_trans_commit(trans, &op->res, |
293 | NULL, |
294 | BCH_TRANS_COMMIT_no_check_rw| |
295 | BCH_TRANS_COMMIT_no_enospc| |
296 | m->data_opts.btree_insert_flags); |
297 | if (!ret) { |
298 | bch2_btree_iter_set_pos(&iter, next_pos); |
299 | |
300 | this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); |
301 | trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); |
302 | } |
303 | err: |
304 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
305 | ret = 0; |
306 | if (ret) |
307 | break; |
308 | next: |
309 | while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { |
310 | bch2_keylist_pop_front(keys); |
311 | if (bch2_keylist_empty(keys)) |
312 | goto out; |
313 | } |
314 | continue; |
315 | nowork: |
316 | if (m->stats) { |
317 | BUG_ON(k.k->p.offset <= iter.pos.offset); |
318 | atomic64_inc(&m->stats->keys_raced); |
319 | atomic64_add(k.k->p.offset - iter.pos.offset, |
320 | &m->stats->sectors_raced); |
321 | } |
322 | |
323 | count_event(c, move_extent_fail); |
324 | |
325 | bch2_btree_iter_advance(&iter); |
326 | goto next; |
327 | } |
328 | out: |
329 | bch2_trans_iter_exit(trans, &iter); |
330 | bch2_bkey_buf_exit(&_insert, c); |
331 | bch2_bkey_buf_exit(&_new, c); |
332 | BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); |
333 | return ret; |
334 | } |
335 | |
336 | int bch2_data_update_index_update(struct bch_write_op *op) |
337 | { |
338 | return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); |
339 | } |
340 | |
341 | void bch2_data_update_read_done(struct data_update *m, |
342 | struct bch_extent_crc_unpacked crc) |
343 | { |
344 | /* write bio must own pages: */ |
345 | BUG_ON(!m->op.wbio.bio.bi_vcnt); |
346 | |
347 | m->op.crc = crc; |
348 | m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; |
349 | |
350 | closure_call(cl: &m->op.cl, fn: bch2_write, NULL, NULL); |
351 | } |
352 | |
353 | void bch2_data_update_exit(struct data_update *update) |
354 | { |
355 | struct bch_fs *c = update->op.c; |
356 | struct bkey_ptrs_c ptrs = |
357 | bch2_bkey_ptrs_c(k: bkey_i_to_s_c(k: update->k.k)); |
358 | |
359 | bkey_for_each_ptr(ptrs, ptr) { |
360 | if (c->opts.nocow_enabled) |
361 | bch2_bucket_nocow_unlock(&c->nocow_locks, |
362 | PTR_BUCKET_POS(c, ptr), 0); |
363 | percpu_ref_put(ref: &bch_dev_bkey_exists(c, idx: ptr->dev)->ref); |
364 | } |
365 | |
366 | bch2_bkey_buf_exit(s: &update->k, c); |
367 | bch2_disk_reservation_put(c, res: &update->op.res); |
368 | bch2_bio_free_pages_pool(c, &update->op.wbio.bio); |
369 | } |
370 | |
371 | static void bch2_update_unwritten_extent(struct btree_trans *trans, |
372 | struct data_update *update) |
373 | { |
374 | struct bch_fs *c = update->op.c; |
375 | struct bio *bio = &update->op.wbio.bio; |
376 | struct bkey_i_extent *e; |
377 | struct write_point *wp; |
378 | struct closure cl; |
379 | struct btree_iter iter; |
380 | struct bkey_s_c k; |
381 | int ret; |
382 | |
383 | closure_init_stack(cl: &cl); |
384 | bch2_keylist_init(l: &update->op.insert_keys, inline_keys: update->op.inline_keys); |
385 | |
386 | while (bio_sectors(bio)) { |
387 | unsigned sectors = bio_sectors(bio); |
388 | |
389 | bch2_trans_iter_init(trans, iter: &iter, btree_id: update->btree_id, pos: update->op.pos, |
390 | flags: BTREE_ITER_SLOTS); |
391 | ret = lockrestart_do(trans, ({ |
392 | k = bch2_btree_iter_peek_slot(&iter); |
393 | bkey_err(k); |
394 | })); |
395 | bch2_trans_iter_exit(trans, &iter); |
396 | |
397 | if (ret || !bch2_extents_match(k, bkey_i_to_s_c(k: update->k.k))) |
398 | break; |
399 | |
400 | e = bkey_extent_init(k: update->op.insert_keys.top); |
401 | e->k.p = update->op.pos; |
402 | |
403 | ret = bch2_alloc_sectors_start_trans(trans, |
404 | update->op.target, |
405 | false, |
406 | update->op.write_point, |
407 | &update->op.devs_have, |
408 | update->op.nr_replicas, |
409 | update->op.nr_replicas, |
410 | update->op.watermark, |
411 | 0, &cl, &wp); |
412 | if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { |
413 | bch2_trans_unlock(trans); |
414 | closure_sync(cl: &cl); |
415 | continue; |
416 | } |
417 | |
418 | bch_err_fn_ratelimited(c, ret); |
419 | |
420 | if (ret) |
421 | return; |
422 | |
423 | sectors = min(sectors, wp->sectors_free); |
424 | |
425 | bch2_key_resize(k: &e->k, new_size: sectors); |
426 | |
427 | bch2_open_bucket_get(c, wp, ptrs: &update->op.open_buckets); |
428 | bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); |
429 | bch2_alloc_sectors_done(c, wp); |
430 | |
431 | bio_advance(bio, nbytes: sectors << 9); |
432 | update->op.pos.offset += sectors; |
433 | |
434 | extent_for_each_ptr(extent_i_to_s(e), ptr) |
435 | ptr->unwritten = true; |
436 | bch2_keylist_push(l: &update->op.insert_keys); |
437 | |
438 | ret = __bch2_data_update_index_update(trans, op: &update->op); |
439 | |
440 | bch2_open_buckets_put(c, ptrs: &update->op.open_buckets); |
441 | |
442 | if (ret) |
443 | break; |
444 | } |
445 | |
446 | if (closure_nr_remaining(cl: &cl) != 1) { |
447 | bch2_trans_unlock(trans); |
448 | closure_sync(cl: &cl); |
449 | } |
450 | } |
451 | |
452 | int bch2_extent_drop_ptrs(struct btree_trans *trans, |
453 | struct btree_iter *iter, |
454 | struct bkey_s_c k, |
455 | struct data_update_opts data_opts) |
456 | { |
457 | struct bch_fs *c = trans->c; |
458 | struct bkey_i *n; |
459 | int ret; |
460 | |
461 | n = bch2_bkey_make_mut_noupdate(trans, k); |
462 | ret = PTR_ERR_OR_ZERO(ptr: n); |
463 | if (ret) |
464 | return ret; |
465 | |
466 | while (data_opts.kill_ptrs) { |
467 | unsigned i = 0, drop = __fls(word: data_opts.kill_ptrs); |
468 | struct bch_extent_ptr *ptr; |
469 | |
470 | bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); |
471 | data_opts.kill_ptrs ^= 1U << drop; |
472 | } |
473 | |
474 | /* |
475 | * If the new extent no longer has any pointers, bch2_extent_normalize() |
476 | * will do the appropriate thing with it (turning it into a |
477 | * KEY_TYPE_error key, or just a discard if it was a cached extent) |
478 | */ |
479 | bch2_extent_normalize(c, bkey_i_to_s(k: n)); |
480 | |
481 | /* |
482 | * Since we're not inserting through an extent iterator |
483 | * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), |
484 | * we aren't using the extent overwrite path to delete, we're |
485 | * just using the normal key deletion path: |
486 | */ |
487 | if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) |
488 | n->k.size = 0; |
489 | |
490 | return bch2_trans_relock(trans) ?: |
491 | bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: |
492 | bch2_trans_commit(trans, NULL, NULL, flags: BCH_TRANS_COMMIT_no_enospc); |
493 | } |
494 | |
495 | int bch2_data_update_init(struct btree_trans *trans, |
496 | struct btree_iter *iter, |
497 | struct moving_context *ctxt, |
498 | struct data_update *m, |
499 | struct write_point_specifier wp, |
500 | struct bch_io_opts io_opts, |
501 | struct data_update_opts data_opts, |
502 | enum btree_id btree_id, |
503 | struct bkey_s_c k) |
504 | { |
505 | struct bch_fs *c = trans->c; |
506 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
507 | const union bch_extent_entry *entry; |
508 | struct extent_ptr_decoded p; |
509 | unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; |
510 | unsigned ptrs_locked = 0; |
511 | int ret = 0; |
512 | |
513 | /* |
514 | * fs is corrupt we have a key for a snapshot node that doesn't exist, |
515 | * and we have to check for this because we go rw before repairing the |
516 | * snapshots table - just skip it, we can move it later. |
517 | */ |
518 | if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) |
519 | return -BCH_ERR_data_update_done; |
520 | |
521 | bch2_bkey_buf_init(s: &m->k); |
522 | bch2_bkey_buf_reassemble(s: &m->k, c, k); |
523 | m->btree_id = btree_id; |
524 | m->data_opts = data_opts; |
525 | m->ctxt = ctxt; |
526 | m->stats = ctxt ? ctxt->stats : NULL; |
527 | |
528 | bch2_write_op_init(op: &m->op, c, opts: io_opts); |
529 | m->op.pos = bkey_start_pos(k: k.k); |
530 | m->op.version = k.k->version; |
531 | m->op.target = data_opts.target; |
532 | m->op.write_point = wp; |
533 | m->op.nr_replicas = 0; |
534 | m->op.flags |= BCH_WRITE_PAGES_STABLE| |
535 | BCH_WRITE_PAGES_OWNED| |
536 | BCH_WRITE_DATA_ENCODED| |
537 | BCH_WRITE_MOVE| |
538 | m->data_opts.write_flags; |
539 | m->op.compression_opt = background_compression(opts: io_opts); |
540 | m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; |
541 | |
542 | bkey_for_each_ptr(ptrs, ptr) |
543 | percpu_ref_get(ref: &bch_dev_bkey_exists(c, idx: ptr->dev)->ref); |
544 | |
545 | unsigned durability_have = 0, durability_removing = 0; |
546 | |
547 | i = 0; |
548 | bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { |
549 | bool locked; |
550 | |
551 | if (((1U << i) & m->data_opts.rewrite_ptrs)) { |
552 | BUG_ON(p.ptr.cached); |
553 | |
554 | if (crc_is_compressed(crc: p.crc)) |
555 | reserve_sectors += k.k->size; |
556 | |
557 | m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); |
558 | durability_removing += bch2_extent_ptr_desired_durability(c, &p); |
559 | } else if (!p.ptr.cached && |
560 | !((1U << i) & m->data_opts.kill_ptrs)) { |
561 | bch2_dev_list_add_dev(devs: &m->op.devs_have, dev: p.ptr.dev); |
562 | durability_have += bch2_extent_ptr_durability(c, &p); |
563 | } |
564 | |
565 | /* |
566 | * op->csum_type is normally initialized from the fs/file's |
567 | * current options - but if an extent is encrypted, we require |
568 | * that it stays encrypted: |
569 | */ |
570 | if (bch2_csum_type_is_encryption(type: p.crc.csum_type)) { |
571 | m->op.nonce = p.crc.nonce + p.crc.offset; |
572 | m->op.csum_type = p.crc.csum_type; |
573 | } |
574 | |
575 | if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) |
576 | m->op.incompressible = true; |
577 | |
578 | if (c->opts.nocow_enabled) { |
579 | if (ctxt) { |
580 | move_ctxt_wait_event(ctxt, |
581 | (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, |
582 | PTR_BUCKET_POS(c, &p.ptr), 0)) || |
583 | list_empty(&ctxt->ios)); |
584 | |
585 | if (!locked) |
586 | bch2_bucket_nocow_lock(t: &c->nocow_locks, |
587 | bucket: PTR_BUCKET_POS(c, ptr: &p.ptr), flags: 0); |
588 | } else { |
589 | if (!bch2_bucket_nocow_trylock(t: &c->nocow_locks, |
590 | bucket: PTR_BUCKET_POS(c, ptr: &p.ptr), flags: 0)) { |
591 | ret = -BCH_ERR_nocow_lock_blocked; |
592 | goto err; |
593 | } |
594 | } |
595 | ptrs_locked |= (1U << i); |
596 | } |
597 | |
598 | i++; |
599 | } |
600 | |
601 | unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); |
602 | |
603 | /* |
604 | * If current extent durability is less than io_opts.data_replicas, |
605 | * we're not trying to rereplicate the extent up to data_replicas here - |
606 | * unless extra_replicas was specified |
607 | * |
608 | * Increasing replication is an explicit operation triggered by |
609 | * rereplicate, currently, so that users don't get an unexpected -ENOSPC |
610 | */ |
611 | if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && |
612 | !durability_required) { |
613 | m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; |
614 | m->data_opts.rewrite_ptrs = 0; |
615 | /* if iter == NULL, it's just a promote */ |
616 | if (iter) |
617 | ret = bch2_extent_drop_ptrs(trans, iter, k, data_opts: m->data_opts); |
618 | goto done; |
619 | } |
620 | |
621 | m->op.nr_replicas = min(durability_removing, durability_required) + |
622 | m->data_opts.extra_replicas; |
623 | |
624 | /* |
625 | * If device(s) were set to durability=0 after data was written to them |
626 | * we can end up with a duribilty=0 extent, and the normal algorithm |
627 | * that tries not to increase durability doesn't work: |
628 | */ |
629 | if (!(durability_have + durability_removing)) |
630 | m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); |
631 | |
632 | m->op.nr_replicas_required = m->op.nr_replicas; |
633 | |
634 | if (reserve_sectors) { |
635 | ret = bch2_disk_reservation_add(c, res: &m->op.res, sectors: reserve_sectors, |
636 | flags: m->data_opts.extra_replicas |
637 | ? 0 |
638 | : BCH_DISK_RESERVATION_NOFAIL); |
639 | if (ret) |
640 | goto err; |
641 | } |
642 | |
643 | if (bkey_extent_is_unwritten(k)) { |
644 | bch2_update_unwritten_extent(trans, update: m); |
645 | goto done; |
646 | } |
647 | |
648 | return 0; |
649 | err: |
650 | i = 0; |
651 | bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { |
652 | if ((1U << i) & ptrs_locked) |
653 | bch2_bucket_nocow_unlock(&c->nocow_locks, |
654 | PTR_BUCKET_POS(c, ptr: &p.ptr), 0); |
655 | percpu_ref_put(ref: &bch_dev_bkey_exists(c, idx: p.ptr.dev)->ref); |
656 | i++; |
657 | } |
658 | |
659 | bch2_bkey_buf_exit(s: &m->k, c); |
660 | bch2_bio_free_pages_pool(c, &m->op.wbio.bio); |
661 | return ret; |
662 | done: |
663 | bch2_data_update_exit(update: m); |
664 | return ret ?: -BCH_ERR_data_update_done; |
665 | } |
666 | |
667 | void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) |
668 | { |
669 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
670 | unsigned i = 0; |
671 | |
672 | bkey_for_each_ptr(ptrs, ptr) { |
673 | if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { |
674 | opts->kill_ptrs |= 1U << i; |
675 | opts->rewrite_ptrs ^= 1U << i; |
676 | } |
677 | |
678 | i++; |
679 | } |
680 | } |
681 | |