// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-pagecache.h"
#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "io_misc.h"
#include "keylist.h"
#include "quota.h"
#include "reflink.h"
#include "trace.h"

#include <linux/aio.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>

#include <trace/events/writeback.h>

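/*
 * Nocow writes overwrite data in place, so the extent may not change on
 * disk and fsync can't rely on a journal flush alone: we track which
 * devices have received nocow writes for an inode (ei_devs_need_flush)
 * and issue an empty REQ_PREFLUSH bio to each of them:
 */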
struct nocow_flush {
	struct closure	*cl;
	struct bch_dev	*ca;
	struct bio	bio;
};

static void nocow_flush_endio(struct bio *_bio)
{
	struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);

	closure_put(bio->cl);
	percpu_ref_put(&bio->ca->io_ref);
	bio_put(&bio->bio);
}

void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
					 struct bch_inode_info *inode,
					 struct closure *cl)
{
	struct nocow_flush *bio;
	struct bch_dev *ca;
	struct bch_devs_mask devs;
	unsigned dev;

	dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
	if (dev == BCH_SB_MEMBERS_MAX)
		return;

	devs = inode->ei_devs_need_flush;
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
		rcu_read_lock();
		ca = rcu_dereference(c->devs[dev]);
		if (ca && !percpu_ref_tryget(&ca->io_ref))
			ca = NULL;
		rcu_read_unlock();

		if (!ca)
			continue;

		bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
						    REQ_OP_WRITE|REQ_PREFLUSH,
						    GFP_KERNEL,
						    &c->nocow_flush_bioset),
				   struct nocow_flush, bio);
		bio->cl			= cl;
		bio->ca			= ca;
		bio->bio.bi_end_io	= nocow_flush_endio;
		closure_bio_submit(&bio->bio, cl);
	}
}

static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
					 struct bch_inode_info *inode)
{
	struct closure cl;

	closure_init_stack(&cl);
	bch2_inode_flush_nocow_writes_async(c, inode, &cl);
	closure_sync(&cl);

	return 0;
}

/* i_size updates: */

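/*
 * inode_set_size() is the callback passed to bch2_write_inode(): it updates
 * the unpacked btree inode inside the transaction that commits it, so the
 * new size and the timestamps change atomically:
 */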
struct inode_new_size {
	loff_t		new_size;
	u64		now;
	unsigned	fields;
};

static int inode_set_size(struct btree_trans *trans,
			  struct bch_inode_info *inode,
			  struct bch_inode_unpacked *bi,
			  void *p)
{
	struct inode_new_size *s = p;

	bi->bi_size = s->new_size;
	if (s->fields & ATTR_ATIME)
		bi->bi_atime = s->now;
	if (s->fields & ATTR_MTIME)
		bi->bi_mtime = s->now;
	if (s->fields & ATTR_CTIME)
		bi->bi_ctime = s->now;

	return 0;
}

int __must_check bch2_write_inode_size(struct bch_fs *c,
				       struct bch_inode_info *inode,
				       loff_t new_size, unsigned fields)
{
	struct inode_new_size s = {
		.new_size	= new_size,
		.now		= bch2_current_time(c),
		.fields		= fields,
	};

	return bch2_write_inode(c, inode, inode_set_size, &s, fields);
}
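/*
 * Keep the in-memory i_blocks and quota accounting in sync with extent
 * changes: when the sectors were covered by a quota reservation, consume
 * from the reservation rather than accounting them a second time:
 */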
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
			   struct quota_res *quota_res, s64 sectors)
{
	bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
				"inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
				inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
				inode->ei_inode.bi_sectors);
	inode->v.i_blocks += sectors;

#ifdef CONFIG_BCACHEFS_QUOTA
	if (quota_res &&
	    !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
	    sectors > 0) {
		BUG_ON(sectors > quota_res->sectors);
		BUG_ON(sectors > inode->ei_quota_reserved);

		quota_res->sectors -= sectors;
		inode->ei_quota_reserved -= sectors;
	} else {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
	}
#endif
}

/* fsync: */

/*
 * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
 * insert trigger: look up the btree inode instead
 */
static int bch2_flush_inode(struct bch_fs *c,
			    struct bch_inode_info *inode)
{
	if (c->opts.journal_flush_disabled)
		return 0;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
		return -EROFS;

	struct bch_inode_unpacked u;
	int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
		  bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
		  bch2_inode_flush_nocow_writes(c, inode);
	bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
	return ret;
}

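/*
 * fsync proper: write out and wait on dirty pages, sync the inode metadata,
 * then flush the journal up to the inode's last journal sequence number and
 * flush any outstanding nocow writes:
 */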
int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	int ret;

	ret = file_write_and_wait_range(file, start, end);
	if (ret)
		goto out;
	ret = sync_inode_metadata(&inode->v, 1);
	if (ret)
		goto out;
	ret = bch2_flush_inode(c, inode);
out:
	return bch2_err_class(ret);
}

/* truncate: */

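/*
 * Returns 1 if [start, end) in the extents btree contains any written
 * (non unwritten-extent) data, 0 if it doesn't, or a negative error code:
 */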
static inline int range_has_data(struct bch_fs *c, u32 subvol,
				 struct bpos start,
				 struct bpos end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
	if (ret)
		goto err;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
		if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
			ret = 1;
			break;
		}
	start = iter.pos;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	return ret;
}

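/*
 * Zero out the part of the folio covering [start, end): the zeroed blocks
 * are marked unallocated and i_sectors is adjusted to match. Returns > 0 if
 * the folio will still be written out by writeback, 0 if not, or a negative
 * error code:
 */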
static int __bch2_truncate_folio(struct bch_inode_info *inode,
				 pgoff_t index, loff_t start, loff_t end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	struct bch_folio *s;
	unsigned start_offset;
	unsigned end_offset;
	unsigned i;
	struct folio *folio;
	s64 i_sectors_delta = 0;
	int ret = 0;
	u64 end_pos;

	folio = filemap_lock_folio(mapping, index);
	if (IS_ERR_OR_NULL(folio)) {
		/*
		 * XXX: we're doing two index lookups when we end up reading the
		 * folio
		 */
		ret = range_has_data(c, inode->ei_subvol,
				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
				POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
		if (ret <= 0)
			return ret;

		folio = __filemap_get_folio(mapping, index,
					    FGP_LOCK|FGP_CREAT, GFP_KERNEL);
		if (IS_ERR_OR_NULL(folio)) {
			ret = -ENOMEM;
			goto out;
		}
	}

	BUG_ON(start	>= folio_end_pos(folio));
	BUG_ON(end	<= folio_pos(folio));

	start_offset	= max(start, folio_pos(folio)) - folio_pos(folio);
	end_offset	= min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio);

	/* Folio boundary? Nothing to do */
	if (start_offset == 0 &&
	    end_offset == folio_size(folio)) {
		ret = 0;
		goto unlock;
	}

	s = bch2_folio_create(folio, 0);
	if (!s) {
		ret = -ENOMEM;
		goto unlock;
	}

	if (!folio_test_uptodate(folio)) {
		ret = bch2_read_single_folio(folio, mapping);
		if (ret)
			goto unlock;
	}

	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
	if (ret)
		goto unlock;

	for (i = round_up(start_offset, block_bytes(c)) >> 9;
	     i < round_down(end_offset, block_bytes(c)) >> 9;
	     i++) {
		s->s[i].nr_replicas	= 0;

		i_sectors_delta -= s->s[i].state == SECTOR_dirty;
		bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
	}

	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	/*
	 * Caller needs to know whether this folio will be written out by
	 * writeback - doing an i_size update if necessary - or whether it will
	 * be responsible for the i_size update.
	 *
	 * Note that we shouldn't ever see a folio beyond EOF, but check and
	 * warn if so. This has been observed when folios weren't cleaned up
	 * after a short write, and there's still a chance reclaim will fix
	 * things up.
	 */
	WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size);
	end_pos = folio_end_pos(folio);
	if (inode->v.i_size > folio_pos(folio))
		end_pos = min_t(u64, inode->v.i_size, end_pos);
	ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty;

	folio_zero_segment(folio, start_offset, end_offset);

	/*
	 * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
	 *
	 * XXX: because we aren't currently tracking whether the folio has
	 * actual data in it (vs. just 0s, or only partially written) this is
	 * wrong. ick.
	 */
	BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false));

	/*
	 * This removes any writeable userspace mappings; we need to force
	 * .page_mkwrite to be called again before any mmapped writes, to
	 * redirty the full page:
	 */
	folio_mkclean(folio);
	filemap_dirty_folio(mapping, folio);
unlock:
	folio_unlock(folio);
	folio_put(folio);
out:
	return ret;
}

static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from)
{
	return __bch2_truncate_folio(inode, from >> PAGE_SHIFT,
				     from, ANYSINT_MAX(loff_t));
}

static int bch2_truncate_folios(struct bch_inode_info *inode,
				loff_t start, loff_t end)
{
	int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT,
					start, end);

	if (ret >= 0 &&
	    start >> PAGE_SHIFT != end >> PAGE_SHIFT)
		ret = __bch2_truncate_folio(inode,
					    (end - 1) >> PAGE_SHIFT,
					    start, end);
	return ret;
}

static int bch2_extend(struct mnt_idmap *idmap,
		       struct bch_inode_info *inode,
		       struct bch_inode_unpacked *inode_u,
		       struct iattr *iattr)
{
	struct address_space *mapping = inode->v.i_mapping;
	int ret;

	/*
	 * sync appends:
	 *
	 * this has to be done _before_ extending i_size:
	 */
	ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
	if (ret)
		return ret;

	truncate_setsize(&inode->v, iattr->ia_size);

	return bch2_setattr_nonsize(idmap, inode, iattr);
}

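/*
 * Truncate proper: when shrinking we zero the partial folio at the new EOF
 * and truncate the page cache before deleting extents in the btree, so we
 * never expose stale data past i_size; extending is handled separately by
 * bch2_extend():
 */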
int bchfs_truncate(struct mnt_idmap *idmap,
		   struct bch_inode_info *inode, struct iattr *iattr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	struct bch_inode_unpacked inode_u;
	s64 i_sectors_delta = 0;
	int ret = 0;

	/*
	 * If the truncate call will change the size of the file, the
	 * cmtimes should be updated. If the size will not change, we
	 * do not need to update the cmtimes.
	 */
	if (iattr->ia_size != inode->v.i_size) {
		if (!(iattr->ia_valid & ATTR_MTIME))
			ktime_get_coarse_real_ts64(&iattr->ia_mtime);
		if (!(iattr->ia_valid & ATTR_CTIME))
			ktime_get_coarse_real_ts64(&iattr->ia_ctime);
		iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME;
	}

	inode_dio_wait(&inode->v);
	bch2_pagecache_block_get(inode);

	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
	if (ret)
		goto err;

	/*
	 * check this before next assertion; on filesystem error our normal
	 * invariants are a bit broken (truncate has to truncate the page cache
	 * before the inode).
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
		  inode->v.i_size < inode_u.bi_size,
		  "truncate spotted in mem i_size < btree i_size: %llu < %llu\n",
		  (u64) inode->v.i_size, inode_u.bi_size);

	if (iattr->ia_size > inode->v.i_size) {
		ret = bch2_extend(idmap, inode, &inode_u, iattr);
		goto err;
	}

	iattr->ia_valid &= ~ATTR_SIZE;

	ret = bch2_truncate_folio(inode, iattr->ia_size);
	if (unlikely(ret < 0))
		goto err;

	truncate_setsize(&inode->v, iattr->ia_size);

	/*
	 * When extending, we're going to write the new i_size to disk
	 * immediately so we need to flush anything above the current on disk
	 * i_size first:
	 *
	 * Also, when extending we need to flush the page that i_size currently
	 * straddles - if it's mapped to userspace, we need to ensure that
	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
	 * again to allocate the part of the page that was extended.
	 */
	if (iattr->ia_size > inode_u.bi_size)
		ret = filemap_write_and_wait_range(mapping,
				inode_u.bi_size,
				iattr->ia_size - 1);
	else if (iattr->ia_size & (PAGE_SIZE - 1))
		ret = filemap_write_and_wait_range(mapping,
				round_down(iattr->ia_size, PAGE_SIZE),
				iattr->ia_size - 1);
	if (ret)
		goto err;

	ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	if (unlikely(ret)) {
		/*
		 * If we error here, VFS caches are now inconsistent with btree
		 */
		set_bit(EI_INODE_ERROR, &inode->ei_flags);
		goto err;
	}

	bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
				!bch2_journal_error(&c->journal), c,
				"inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
				inode->v.i_ino, (u64) inode->v.i_blocks,
				inode->ei_inode.bi_sectors);

	ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
	bch2_pagecache_block_put(inode);
	return bch2_err_class(ret);
}

/* fallocate: */

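/* bch2_write_inode() callback that just bumps ctime and mtime: */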
static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi, void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
	return 0;
}

static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	u64 end		= offset + len;
	u64 block_start	= round_up(offset, block_bytes(c));
	u64 block_end	= round_down(end, block_bytes(c));
	bool truncated_last_page;
	int ret = 0;

	ret = bch2_truncate_folios(inode, offset, end);
	if (unlikely(ret < 0))
		goto err;

	truncated_last_page = ret;

	truncate_pagecache_range(&inode->v, offset, end - 1);

	if (block_start < block_end) {
		s64 i_sectors_delta = 0;

		ret = bch2_fpunch(c, inode_inum(inode),
				  block_start >> 9, block_end >> 9,
				  &i_sectors_delta);
		bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
	}

	mutex_lock(&inode->ei_update_lock);
	if (end >= inode->v.i_size && !truncated_last_page) {
		ret = bch2_write_inode_size(c, inode, inode->v.i_size,
					    ATTR_MTIME|ATTR_CTIME);
	} else {
		ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
				       ATTR_MTIME|ATTR_CTIME);
	}
	mutex_unlock(&inode->ei_update_lock);
err:
	return ret;
}

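/*
 * Collapse deletes the byte range [offset, offset + len) and shifts
 * everything after it down; insert shifts everything from offset onwards up
 * by len, leaving a hole. Both require block-aligned offset and len, and
 * both adjust i_size accordingly:
 */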
static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
				    loff_t offset, loff_t len,
				    bool insert)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	s64 i_sectors_delta = 0;
	int ret = 0;

	if ((offset | len) & (block_bytes(c) - 1))
		return -EINVAL;

	if (insert) {
		if (offset >= inode->v.i_size)
			return -EINVAL;
	} else {
		if (offset + len >= inode->v.i_size)
			return -EINVAL;
	}

	ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
	if (ret)
		return ret;

	if (insert)
		i_size_write(&inode->v, inode->v.i_size + len);

	ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
				     insert, &i_sectors_delta);
	if (!ret && !insert)
		i_size_write(&inode->v, inode->v.i_size - len);
	bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);

	return ret;
}

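/*
 * Walk the extents btree over the given range: extents that are already
 * data (unless we're zeroing) or sufficiently-replicated reservations are
 * skipped; for the rest we take a quota reservation and allocate or reserve
 * space with bch2_extent_fallocate():
 */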
static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
			     u64 start_sector, u64 end_sector)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bpos end_pos = POS(inode->v.i_ino, end_sector);
	struct bch_io_opts opts;
	int ret = 0;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			POS(inode->v.i_ino, start_sector),
			BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	while (!ret && bkey_lt(iter.pos, end_pos)) {
		s64 i_sectors_delta = 0;
		struct quota_res quota_res = { 0 };
		struct bkey_s_c k;
		unsigned sectors;
		bool is_allocation;
		u64 hole_start, hole_end;
		u32 snapshot;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans,
					inode->ei_subvol, &snapshot);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_slot(&iter);
		if ((ret = bkey_err(k)))
			goto bkey_err;

		hole_start	= iter.pos.offset;
		hole_end	= bpos_min(k.k->p, end_pos).offset;
		is_allocation	= bkey_extent_is_allocation(k.k);

		/* already reserved */
		if (bkey_extent_is_reservation(k) &&
		    bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		if (bkey_extent_is_data(k.k) &&
		    !(mode & FALLOC_FL_ZERO_RANGE)) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		if (!(mode & FALLOC_FL_ZERO_RANGE)) {
			/*
			 * Lock ordering - can't be holding btree locks while
			 * blocking on a folio lock:
			 */
			if (bch2_clamp_data_hole(&inode->v,
						 &hole_start,
						 &hole_end,
						 opts.data_replicas, true))
				ret = drop_locks_do(trans,
					(bch2_clamp_data_hole(&inode->v,
							      &hole_start,
							      &hole_end,
							      opts.data_replicas, false), 0));
			bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));

			if (ret)
				goto bkey_err;

			if (hole_start == hole_end)
				continue;
		}

		sectors = hole_end - hole_start;

		if (!is_allocation) {
			ret = bch2_quota_reservation_add(c, inode,
					&quota_res, sectors, true);
			if (unlikely(ret))
				goto bkey_err;
		}

		ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
					    sectors, opts, &i_sectors_delta,
					    writepoint_hashed((unsigned long) current));
		if (ret)
			goto bkey_err;

		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);

		if (bch2_mark_pagecache_reserved(inode, &hole_start,
						 iter.pos.offset, true))
			drop_locks_do(trans,
				bch2_mark_pagecache_reserved(inode, &hole_start,
							     iter.pos.offset, false));
bkey_err:
		bch2_quota_reservation_put(c, inode, &quota_res);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			ret = 0;
	}

	if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) {
		struct quota_res quota_res = { 0 };
		s64 i_sectors_delta = 0;

		bch2_fpunch_at(trans, &iter, inode_inum(inode),
			       end_sector, &i_sectors_delta);
		bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
		bch2_quota_reservation_put(c, inode, &quota_res);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	return ret;
}

static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
			    loff_t offset, loff_t len)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	u64 end		= offset + len;
	u64 block_start	= round_down(offset, block_bytes(c));
	u64 block_end	= round_up(end, block_bytes(c));
	bool truncated_last_page = false;
	int ret, ret2 = 0;

	if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
		ret = inode_newsize_ok(&inode->v, end);
		if (ret)
			return ret;
	}

	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = bch2_truncate_folios(inode, offset, end);
		if (unlikely(ret < 0))
			return ret;

		truncated_last_page = ret;

		truncate_pagecache_range(&inode->v, offset, end - 1);

		block_start	= round_up(offset, block_bytes(c));
		block_end	= round_down(end, block_bytes(c));
	}

	ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);

	/*
	 * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
	 * so that the VFS cache i_size is consistent with the btree i_size:
	 */
	if (ret &&
	    !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)))
		return ret;

	if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
		end = inode->v.i_size;

	if (end >= inode->v.i_size &&
	    (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
	     !(mode & FALLOC_FL_KEEP_SIZE))) {
		spin_lock(&inode->v.i_lock);
		i_size_write(&inode->v, end);
		spin_unlock(&inode->v.i_lock);

		mutex_lock(&inode->ei_update_lock);
		ret2 = bch2_write_inode_size(c, inode, end, 0);
		mutex_unlock(&inode->ei_update_lock);
	}

	return ret ?: ret2;
}

long bch2_fallocate_dispatch(struct file *file, int mode,
			     loff_t offset, loff_t len)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	long ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
		return -EROFS;

	inode_lock(&inode->v);
	inode_dio_wait(&inode->v);
	bch2_pagecache_block_get(inode);

	ret = file_modified(file);
	if (ret)
		goto err;

	if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
		ret = bchfs_fallocate(inode, mode, offset, len);
	else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
		ret = bchfs_fpunch(inode, offset, len);
	else if (mode == FALLOC_FL_INSERT_RANGE)
		ret = bchfs_fcollapse_finsert(inode, offset, len, true);
	else if (mode == FALLOC_FL_COLLAPSE_RANGE)
		ret = bchfs_fcollapse_finsert(inode, offset, len, false);
	else
		ret = -EOPNOTSUPP;
err:
	bch2_pagecache_block_put(inode);
	inode_unlock(&inode->v);
	bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);

	return bch2_err_class(ret);
}

/*
 * Take a quota reservation for unallocated blocks in a given file range.
 * Does not check pagecache.
 */
static int quota_reserve_range(struct bch_inode_info *inode,
			       struct quota_res *res,
			       u64 start, u64 end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 snapshot;
	u64 sectors = end - start;
	u64 pos = start;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inode->v.i_ino, pos, snapshot), 0);

	while (!(ret = btree_trans_too_many_iters(trans)) &&
	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
	       !(ret = bkey_err(k))) {
		if (bkey_extent_is_allocation(k.k)) {
			u64 s = min(end, k.k->p.offset) -
				max(start, bkey_start_offset(k.k));
			BUG_ON(s > sectors);
			sectors -= s;
		}
		bch2_btree_iter_advance(&iter);
	}
	pos = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);

	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}

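/*
 * Reflink: remap a block-aligned range from the source file to the
 * destination by making the extents shared. Destination pagecache is
 * flushed and invalidated first, and quota is reserved for blocks not yet
 * allocated in the destination:
 */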
loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
			     struct file *file_dst, loff_t pos_dst,
			     loff_t len, unsigned remap_flags)
{
	struct bch_inode_info *src = file_bch_inode(file_src);
	struct bch_inode_info *dst = file_bch_inode(file_dst);
	struct bch_fs *c = src->v.i_sb->s_fs_info;
	struct quota_res quota_res = { 0 };
	s64 i_sectors_delta = 0;
	u64 aligned_len;
	loff_t ret = 0;

	if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (remap_flags & REMAP_FILE_DEDUP)
		return -EOPNOTSUPP;

	if ((pos_src & (block_bytes(c) - 1)) ||
	    (pos_dst & (block_bytes(c) - 1)))
		return -EINVAL;

	if (src == dst &&
	    abs(pos_src - pos_dst) < len)
		return -EINVAL;

	lock_two_nondirectories(&src->v, &dst->v);
	bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);

	inode_dio_wait(&src->v);
	inode_dio_wait(&dst->v);

	ret = generic_remap_file_range_prep(file_src, pos_src,
					    file_dst, pos_dst,
					    &len, remap_flags);
	if (ret < 0 || len == 0)
		goto err;

	aligned_len = round_up((u64) len, block_bytes(c));

	ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
				pos_dst, pos_dst + len - 1);
	if (ret)
		goto err;

	ret = quota_reserve_range(dst, &quota_res, pos_dst >> 9,
				  (pos_dst + aligned_len) >> 9);
	if (ret)
		goto err;

	file_update_time(file_dst);

	bch2_mark_pagecache_unallocated(src, pos_src >> 9,
					(pos_src + aligned_len) >> 9);

	ret = bch2_remap_range(c,
			       inode_inum(dst), pos_dst >> 9,
			       inode_inum(src), pos_src >> 9,
			       aligned_len >> 9,
			       pos_dst + len, &i_sectors_delta);
	if (ret < 0)
		goto err;

	/*
	 * due to alignment, we might have remapped slightly more than requested
	 */
	ret = min((u64) ret << 9, (u64) len);

	bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);

	spin_lock(&dst->v.i_lock);
	if (pos_dst + ret > dst->v.i_size)
		i_size_write(&dst->v, pos_dst + ret);
	spin_unlock(&dst->v.i_lock);

	if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
	    IS_SYNC(file_inode(file_dst)))
		ret = bch2_flush_inode(c, dst);
err:
	bch2_quota_reservation_put(c, dst, &quota_res);
	bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
	unlock_two_nondirectories(&src->v, &dst->v);

	return bch2_err_class(ret);
}

/* fseek: */

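/*
 * SEEK_DATA/SEEK_HOLE have to consider both extents in the btree and dirty
 * data in the page cache that hasn't been written back yet, hence the
 * bch2_seek_pagecache_data()/bch2_seek_pagecache_hole() calls below:
 */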
static loff_t bch2_seek_data(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_data = MAX_LFS_FILESIZE;
	u32 snapshot;
	int ret;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
			   POS(inode->v.i_ino, U64_MAX),
			   0, k, ret) {
		if (bkey_extent_is_data(k.k)) {
			next_data = max(offset, bkey_start_offset(k.k) << 9);
			break;
		} else if (k.k->p.offset >> 9 > isize)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	if (ret)
		return ret;

	if (next_data > offset)
		next_data = bch2_seek_pagecache_data(&inode->v,
					offset, next_data, 0, false);

	if (next_data >= isize)
		return -ENXIO;

	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}

static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_hole = MAX_LFS_FILESIZE;
	u32 snapshot;
	int ret;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
			   BTREE_ITER_SLOTS, k, ret) {
		if (k.k->p.inode != inode->v.i_ino) {
			next_hole = bch2_seek_pagecache_hole(&inode->v,
					offset, MAX_LFS_FILESIZE, 0, false);
			break;
		} else if (!bkey_extent_is_data(k.k)) {
			next_hole = bch2_seek_pagecache_hole(&inode->v,
					max(offset, bkey_start_offset(k.k) << 9),
					k.k->p.offset << 9, 0, false);

			if (next_hole < k.k->p.offset << 9)
				break;
		} else {
			offset = max(offset, bkey_start_offset(k.k) << 9);
		}
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	if (ret)
		return ret;

	if (next_hole > isize)
		next_hole = isize;

	return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
}

loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t ret;

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
	case SEEK_END:
		ret = generic_file_llseek(file, offset, whence);
		break;
	case SEEK_DATA:
		ret = bch2_seek_data(file, offset);
		break;
	case SEEK_HOLE:
		ret = bch2_seek_hole(file, offset);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return bch2_err_class(ret);
}

void bch2_fs_fsio_exit(struct bch_fs *c)
{
	bioset_exit(&c->nocow_flush_bioset);
}

int bch2_fs_fsio_init(struct bch_fs *c)
{
	if (bioset_init(&c->nocow_flush_bioset,
			1, offsetof(struct nocow_flush, bio), 0))
		return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */