1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * io_misc.c - fallocate, fpunch, truncate: |
4 | */ |
5 | |
6 | #include "bcachefs.h" |
7 | #include "alloc_foreground.h" |
8 | #include "bkey_buf.h" |
9 | #include "btree_update.h" |
10 | #include "buckets.h" |
11 | #include "clock.h" |
12 | #include "error.h" |
13 | #include "extents.h" |
14 | #include "extent_update.h" |
15 | #include "inode.h" |
16 | #include "io_misc.h" |
17 | #include "io_write.h" |
18 | #include "logged_ops.h" |
19 | #include "rebalance.h" |
20 | #include "subvolume.h" |
21 | |
22 | /* Overwrites whatever was present with zeroes: */ |
23 | int bch2_extent_fallocate(struct btree_trans *trans, |
24 | subvol_inum inum, |
25 | struct btree_iter *iter, |
26 | u64 sectors, |
27 | struct bch_io_opts opts, |
28 | s64 *i_sectors_delta, |
29 | struct write_point_specifier write_point) |
30 | { |
31 | struct bch_fs *c = trans->c; |
32 | struct disk_reservation disk_res = { 0 }; |
33 | struct closure cl; |
34 | struct open_buckets open_buckets = { 0 }; |
35 | struct bkey_s_c k; |
36 | struct bkey_buf old, new; |
37 | unsigned sectors_allocated = 0, new_replicas; |
38 | bool unwritten = opts.nocow && |
39 | c->sb.version >= bcachefs_metadata_version_unwritten_extents; |
40 | int ret; |
41 | |
42 | bch2_bkey_buf_init(s: &old); |
43 | bch2_bkey_buf_init(s: &new); |
44 | closure_init_stack(cl: &cl); |
45 | |
46 | k = bch2_btree_iter_peek_slot(iter); |
47 | ret = bkey_err(k); |
48 | if (ret) |
49 | return ret; |
50 | |
51 | sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); |
52 | new_replicas = max(0, (int) opts.data_replicas - |
53 | (int) bch2_bkey_nr_ptrs_fully_allocated(k)); |
54 | |
55 | /* |
56 | * Get a disk reservation before (in the nocow case) calling |
57 | * into the allocator: |
58 | */ |
59 | ret = bch2_disk_reservation_get(c, res: &disk_res, sectors, nr_replicas: new_replicas, flags: 0); |
60 | if (unlikely(ret)) |
61 | goto err_noprint; |
62 | |
63 | bch2_bkey_buf_reassemble(s: &old, c, k); |
64 | |
65 | if (!unwritten) { |
66 | struct bkey_i_reservation *reservation; |
67 | |
68 | bch2_bkey_buf_realloc(s: &new, c, u64s: sizeof(*reservation) / sizeof(u64)); |
69 | reservation = bkey_reservation_init(k: new.k); |
70 | reservation->k.p = iter->pos; |
71 | bch2_key_resize(k: &reservation->k, new_size: sectors); |
72 | reservation->v.nr_replicas = opts.data_replicas; |
73 | } else { |
74 | struct bkey_i_extent *e; |
75 | struct bch_devs_list devs_have; |
76 | struct write_point *wp; |
77 | |
78 | devs_have.nr = 0; |
79 | |
80 | bch2_bkey_buf_realloc(s: &new, c, BKEY_EXTENT_U64s_MAX); |
81 | |
82 | e = bkey_extent_init(k: new.k); |
83 | e->k.p = iter->pos; |
84 | |
85 | ret = bch2_alloc_sectors_start_trans(trans, |
86 | opts.foreground_target, |
87 | false, |
88 | write_point, |
89 | &devs_have, |
90 | opts.data_replicas, |
91 | opts.data_replicas, |
92 | BCH_WATERMARK_normal, 0, &cl, &wp); |
93 | if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) |
94 | ret = -BCH_ERR_transaction_restart_nested; |
95 | if (ret) |
96 | goto err; |
97 | |
98 | sectors = min_t(u64, sectors, wp->sectors_free); |
99 | sectors_allocated = sectors; |
100 | |
101 | bch2_key_resize(k: &e->k, new_size: sectors); |
102 | |
103 | bch2_open_bucket_get(c, wp, ptrs: &open_buckets); |
104 | bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); |
105 | bch2_alloc_sectors_done(c, wp); |
106 | |
107 | extent_for_each_ptr(extent_i_to_s(e), ptr) |
108 | ptr->unwritten = true; |
109 | } |
110 | |
111 | ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, |
112 | 0, i_sectors_delta, true); |
113 | err: |
114 | if (!ret && sectors_allocated) |
115 | bch2_increment_clock(c, sectors: sectors_allocated, WRITE); |
116 | if (should_print_err(err: ret)) |
117 | bch_err_inum_offset_ratelimited(c, |
118 | inum.inum, |
119 | iter->pos.offset << 9, |
120 | "%s(): error: %s" , __func__, bch2_err_str(ret)); |
121 | err_noprint: |
122 | bch2_open_buckets_put(c, ptrs: &open_buckets); |
123 | bch2_disk_reservation_put(c, res: &disk_res); |
124 | bch2_bkey_buf_exit(s: &new, c); |
125 | bch2_bkey_buf_exit(s: &old, c); |
126 | |
127 | if (closure_nr_remaining(cl: &cl) != 1) { |
128 | bch2_trans_unlock(trans); |
129 | closure_sync(cl: &cl); |
130 | } |
131 | |
132 | return ret; |
133 | } |
134 | |
135 | /* |
136 | * Returns -BCH_ERR_transacton_restart if we had to drop locks: |
137 | */ |
138 | int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, |
139 | subvol_inum inum, u64 end, |
140 | s64 *i_sectors_delta) |
141 | { |
142 | struct bch_fs *c = trans->c; |
143 | unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); |
144 | struct bpos end_pos = POS(inum.inum, end); |
145 | struct bkey_s_c k; |
146 | int ret = 0, ret2 = 0; |
147 | u32 snapshot; |
148 | |
149 | while (!ret || |
150 | bch2_err_matches(ret, BCH_ERR_transaction_restart)) { |
151 | struct disk_reservation disk_res = |
152 | bch2_disk_reservation_init(c, nr_replicas: 0); |
153 | struct bkey_i delete; |
154 | |
155 | if (ret) |
156 | ret2 = ret; |
157 | |
158 | bch2_trans_begin(trans); |
159 | |
160 | ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); |
161 | if (ret) |
162 | continue; |
163 | |
164 | bch2_btree_iter_set_snapshot(iter, snapshot); |
165 | |
166 | /* |
167 | * peek_upto() doesn't have ideal semantics for extents: |
168 | */ |
169 | k = bch2_btree_iter_peek_upto(iter, end_pos); |
170 | if (!k.k) |
171 | break; |
172 | |
173 | ret = bkey_err(k); |
174 | if (ret) |
175 | continue; |
176 | |
177 | bkey_init(k: &delete.k); |
178 | delete.k.p = iter->pos; |
179 | |
180 | /* create the biggest key we can */ |
181 | bch2_key_resize(k: &delete.k, new_size: max_sectors); |
182 | bch2_cut_back(where: end_pos, k: &delete); |
183 | |
184 | ret = bch2_extent_update(trans, inum, iter, &delete, |
185 | &disk_res, 0, i_sectors_delta, false); |
186 | bch2_disk_reservation_put(c, res: &disk_res); |
187 | } |
188 | |
189 | return ret ?: ret2; |
190 | } |
191 | |
192 | int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, |
193 | s64 *i_sectors_delta) |
194 | { |
195 | struct btree_trans *trans = bch2_trans_get(c); |
196 | struct btree_iter iter; |
197 | int ret; |
198 | |
199 | bch2_trans_iter_init(trans, iter: &iter, btree_id: BTREE_ID_extents, |
200 | POS(inum.inum, start), |
201 | flags: BTREE_ITER_INTENT); |
202 | |
203 | ret = bch2_fpunch_at(trans, iter: &iter, inum, end, i_sectors_delta); |
204 | |
205 | bch2_trans_iter_exit(trans, &iter); |
206 | bch2_trans_put(trans); |
207 | |
208 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
209 | ret = 0; |
210 | |
211 | return ret; |
212 | } |
213 | |
214 | /* truncate: */ |
215 | |
216 | void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) |
217 | { |
218 | struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); |
219 | |
220 | prt_printf(out, "subvol=%u" , le32_to_cpu(op.v->subvol)); |
221 | prt_printf(out, " inum=%llu" , le64_to_cpu(op.v->inum)); |
222 | prt_printf(out, " new_i_size=%llu" , le64_to_cpu(op.v->new_i_size)); |
223 | } |
224 | |
225 | static int truncate_set_isize(struct btree_trans *trans, |
226 | subvol_inum inum, |
227 | u64 new_i_size) |
228 | { |
229 | struct btree_iter iter = { NULL }; |
230 | struct bch_inode_unpacked inode_u; |
231 | int ret; |
232 | |
233 | ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: |
234 | (inode_u.bi_size = new_i_size, 0) ?: |
235 | bch2_inode_write(trans, iter: &iter, inode: &inode_u); |
236 | |
237 | bch2_trans_iter_exit(trans, &iter); |
238 | return ret; |
239 | } |
240 | |
241 | static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, |
242 | struct bkey_i *op_k, |
243 | u64 *i_sectors_delta) |
244 | { |
245 | struct bch_fs *c = trans->c; |
246 | struct btree_iter fpunch_iter; |
247 | struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(k: op_k); |
248 | subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; |
249 | u64 new_i_size = le64_to_cpu(op->v.new_i_size); |
250 | int ret; |
251 | |
252 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
253 | truncate_set_isize(trans, inum, new_i_size)); |
254 | if (ret) |
255 | goto err; |
256 | |
257 | bch2_trans_iter_init(trans, iter: &fpunch_iter, btree_id: BTREE_ID_extents, |
258 | POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), |
259 | flags: BTREE_ITER_INTENT); |
260 | ret = bch2_fpunch_at(trans, iter: &fpunch_iter, inum, U64_MAX, i_sectors_delta); |
261 | bch2_trans_iter_exit(trans, &fpunch_iter); |
262 | |
263 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
264 | ret = 0; |
265 | err: |
266 | bch2_logged_op_finish(trans, op_k); |
267 | bch_err_fn(c, ret); |
268 | return ret; |
269 | } |
270 | |
271 | int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) |
272 | { |
273 | return __bch2_resume_logged_op_truncate(trans, op_k, NULL); |
274 | } |
275 | |
276 | int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) |
277 | { |
278 | struct bkey_i_logged_op_truncate op; |
279 | |
280 | bkey_logged_op_truncate_init(k: &op.k_i); |
281 | op.v.subvol = cpu_to_le32(inum.subvol); |
282 | op.v.inum = cpu_to_le64(inum.inum); |
283 | op.v.new_i_size = cpu_to_le64(new_i_size); |
284 | |
285 | /* |
286 | * Logged ops aren't atomic w.r.t. snapshot creation: creating a |
287 | * snapshot while they're in progress, then crashing, will result in the |
288 | * resume only proceeding in one of the snapshots |
289 | */ |
290 | down_read(sem: &c->snapshot_create_lock); |
291 | int ret = bch2_trans_run(c, |
292 | bch2_logged_op_start(trans, &op.k_i) ?: |
293 | __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); |
294 | up_read(sem: &c->snapshot_create_lock); |
295 | |
296 | return ret; |
297 | } |
298 | |
299 | /* finsert/fcollapse: */ |
300 | |
301 | void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) |
302 | { |
303 | struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); |
304 | |
305 | prt_printf(out, "subvol=%u" , le32_to_cpu(op.v->subvol)); |
306 | prt_printf(out, " inum=%llu" , le64_to_cpu(op.v->inum)); |
307 | prt_printf(out, " dst_offset=%lli" , le64_to_cpu(op.v->dst_offset)); |
308 | prt_printf(out, " src_offset=%llu" , le64_to_cpu(op.v->src_offset)); |
309 | } |
310 | |
311 | static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) |
312 | { |
313 | struct btree_iter iter; |
314 | struct bch_inode_unpacked inode_u; |
315 | int ret; |
316 | |
317 | offset <<= 9; |
318 | len <<= 9; |
319 | |
320 | ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); |
321 | if (ret) |
322 | return ret; |
323 | |
324 | if (len > 0) { |
325 | if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { |
326 | ret = -EFBIG; |
327 | goto err; |
328 | } |
329 | |
330 | if (offset >= inode_u.bi_size) { |
331 | ret = -EINVAL; |
332 | goto err; |
333 | } |
334 | } |
335 | |
336 | inode_u.bi_size += len; |
337 | inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(c: trans->c); |
338 | |
339 | ret = bch2_inode_write(trans, iter: &iter, inode: &inode_u); |
340 | err: |
341 | bch2_trans_iter_exit(trans, &iter); |
342 | return ret; |
343 | } |
344 | |
345 | static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, |
346 | struct bkey_i *op_k, |
347 | u64 *i_sectors_delta) |
348 | { |
349 | struct bch_fs *c = trans->c; |
350 | struct btree_iter iter; |
351 | struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(k: op_k); |
352 | subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; |
353 | struct bch_io_opts opts; |
354 | u64 dst_offset = le64_to_cpu(op->v.dst_offset); |
355 | u64 src_offset = le64_to_cpu(op->v.src_offset); |
356 | s64 shift = dst_offset - src_offset; |
357 | u64 len = abs(shift); |
358 | u64 pos = le64_to_cpu(op->v.pos); |
359 | bool insert = shift > 0; |
360 | int ret = 0; |
361 | |
362 | ret = bch2_inum_opts_get(trans, inum, &opts); |
363 | if (ret) |
364 | return ret; |
365 | |
366 | bch2_trans_iter_init(trans, iter: &iter, btree_id: BTREE_ID_extents, |
367 | POS(inum.inum, 0), |
368 | flags: BTREE_ITER_INTENT); |
369 | |
370 | switch (op->v.state) { |
371 | case LOGGED_OP_FINSERT_start: |
372 | op->v.state = LOGGED_OP_FINSERT_shift_extents; |
373 | |
374 | if (insert) { |
375 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
376 | adjust_i_size(trans, inum, src_offset, len) ?: |
377 | bch2_logged_op_update(trans, &op->k_i)); |
378 | if (ret) |
379 | goto err; |
380 | } else { |
381 | bch2_btree_iter_set_pos(iter: &iter, POS(inum.inum, src_offset)); |
382 | |
383 | ret = bch2_fpunch_at(trans, iter: &iter, inum, end: src_offset + len, i_sectors_delta); |
384 | if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
385 | goto err; |
386 | |
387 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
388 | bch2_logged_op_update(trans, &op->k_i)); |
389 | } |
390 | |
391 | fallthrough; |
392 | case LOGGED_OP_FINSERT_shift_extents: |
393 | while (1) { |
394 | struct disk_reservation disk_res = |
395 | bch2_disk_reservation_init(c, nr_replicas: 0); |
396 | struct bkey_i delete, *copy; |
397 | struct bkey_s_c k; |
398 | struct bpos src_pos = POS(inum.inum, src_offset); |
399 | u32 snapshot; |
400 | |
401 | bch2_trans_begin(trans); |
402 | |
403 | ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); |
404 | if (ret) |
405 | goto btree_err; |
406 | |
407 | bch2_btree_iter_set_snapshot(iter: &iter, snapshot); |
408 | bch2_btree_iter_set_pos(iter: &iter, new_pos: SPOS(inode: inum.inum, offset: pos, snapshot)); |
409 | |
410 | k = insert |
411 | ? bch2_btree_iter_peek_prev(&iter) |
412 | : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); |
413 | if ((ret = bkey_err(k))) |
414 | goto btree_err; |
415 | |
416 | if (!k.k || |
417 | k.k->p.inode != inum.inum || |
418 | bkey_le(l: k.k->p, POS(inum.inum, src_offset))) |
419 | break; |
420 | |
421 | copy = bch2_bkey_make_mut_noupdate(trans, k); |
422 | if ((ret = PTR_ERR_OR_ZERO(ptr: copy))) |
423 | goto btree_err; |
424 | |
425 | if (insert && |
426 | bkey_lt(l: bkey_start_pos(k: k.k), r: src_pos)) { |
427 | bch2_cut_front(where: src_pos, k: copy); |
428 | |
429 | /* Splitting compressed extent? */ |
430 | bch2_disk_reservation_add(c, res: &disk_res, |
431 | sectors: copy->k.size * |
432 | bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k: copy)), |
433 | BCH_DISK_RESERVATION_NOFAIL); |
434 | } |
435 | |
436 | bkey_init(k: &delete.k); |
437 | delete.k.p = copy->k.p; |
438 | delete.k.p.snapshot = snapshot; |
439 | delete.k.size = copy->k.size; |
440 | |
441 | copy->k.p.offset += shift; |
442 | copy->k.p.snapshot = snapshot; |
443 | |
444 | op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); |
445 | |
446 | ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: |
447 | bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: |
448 | bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: |
449 | bch2_logged_op_update(trans, op: &op->k_i) ?: |
450 | bch2_trans_commit(trans, disk_res: &disk_res, NULL, flags: BCH_TRANS_COMMIT_no_enospc); |
451 | btree_err: |
452 | bch2_disk_reservation_put(c, res: &disk_res); |
453 | |
454 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
455 | continue; |
456 | if (ret) |
457 | goto err; |
458 | |
459 | pos = le64_to_cpu(op->v.pos); |
460 | } |
461 | |
462 | op->v.state = LOGGED_OP_FINSERT_finish; |
463 | |
464 | if (!insert) { |
465 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
466 | adjust_i_size(trans, inum, src_offset, shift) ?: |
467 | bch2_logged_op_update(trans, &op->k_i)); |
468 | } else { |
469 | /* We need an inode update to update bi_journal_seq for fsync: */ |
470 | ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
471 | adjust_i_size(trans, inum, 0, 0) ?: |
472 | bch2_logged_op_update(trans, &op->k_i)); |
473 | } |
474 | |
475 | break; |
476 | case LOGGED_OP_FINSERT_finish: |
477 | break; |
478 | } |
479 | err: |
480 | bch_err_fn(c, ret); |
481 | bch2_logged_op_finish(trans, op_k); |
482 | bch2_trans_iter_exit(trans, &iter); |
483 | return ret; |
484 | } |
485 | |
486 | int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) |
487 | { |
488 | return __bch2_resume_logged_op_finsert(trans, op_k, NULL); |
489 | } |
490 | |
491 | int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, |
492 | u64 offset, u64 len, bool insert, |
493 | s64 *i_sectors_delta) |
494 | { |
495 | struct bkey_i_logged_op_finsert op; |
496 | s64 shift = insert ? len : -len; |
497 | |
498 | bkey_logged_op_finsert_init(k: &op.k_i); |
499 | op.v.subvol = cpu_to_le32(inum.subvol); |
500 | op.v.inum = cpu_to_le64(inum.inum); |
501 | op.v.dst_offset = cpu_to_le64(offset + shift); |
502 | op.v.src_offset = cpu_to_le64(offset); |
503 | op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); |
504 | |
505 | /* |
506 | * Logged ops aren't atomic w.r.t. snapshot creation: creating a |
507 | * snapshot while they're in progress, then crashing, will result in the |
508 | * resume only proceeding in one of the snapshots |
509 | */ |
510 | down_read(sem: &c->snapshot_create_lock); |
511 | int ret = bch2_trans_run(c, |
512 | bch2_logged_op_start(trans, &op.k_i) ?: |
513 | __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); |
514 | up_read(sem: &c->snapshot_create_lock); |
515 | |
516 | return ret; |
517 | } |
518 | |