// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fs-io-pagecache.h"
#include "io_read.h"
#include "io_write.h"

#include <linux/backing-dev.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>

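/*
 * Returns true if @bio cannot accept another @len bytes: either every bvec
 * slot is already in use, or adding @len would overflow bi_size.
 */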
static inline bool bio_full(struct bio *bio, unsigned len)
{
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return true;
	if (bio->bi_iter.bi_size > UINT_MAX - len)
		return true;
	return false;
}

/* readpage(s): */

static void bch2_readpages_end_io(struct bio *bio)
{
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		if (!bio->bi_status) {
			folio_mark_uptodate(fi.folio);
		} else {
			folio_clear_uptodate(fi.folio);
			folio_set_error(fi.folio);
		}
		folio_unlock(fi.folio);
	}

	bio_put(bio);
}

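/*
 * State for one readahead request: the folios taken from the readahead
 * control, and how far into that batch we have gotten.
 */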
struct readpages_iter {
	struct address_space *mapping;
	unsigned idx;
	folios folios;
};

static int readpages_iter_init(struct readpages_iter *iter,
			       struct readahead_control *ractl)
{
	struct folio *folio;

	*iter = (struct readpages_iter) { ractl->mapping };

	while ((folio = __readahead_folio(ractl))) {
		if (!bch2_folio_create(folio, GFP_KERNEL) ||
		    darray_push(&iter->folios, folio)) {
			bch2_folio_release(folio);
			ractl->_nr_pages += folio_nr_pages(folio);
			ractl->_index -= folio_nr_pages(folio);
			return iter->folios.nr ? 0 : -ENOMEM;
		}

		folio_put(folio);
	}

	return 0;
}

static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
{
	if (iter->idx >= iter->folios.nr)
		return NULL;
	return iter->folios.data[iter->idx];
}

static inline void readpage_iter_advance(struct readpages_iter *iter)
{
	iter->idx++;
}

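/*
 * Checksummed or compressed extents have to be read and bounced in their
 * entirety, so a partial read costs as much as reading the whole extent -
 * used below to decide whether to extend readahead over the full extent.
 */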
static bool extent_partial_reads_expensive(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_extent_crc_unpacked crc;
	const union bch_extent_entry *i;

	bkey_for_each_crc(k.k, ptrs, crc, i)
		if (crc.csum_type || crc.compression_type)
			return true;
	return false;
}

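/*
 * Grow @bio until it covers @sectors_this_extent: first with folios from the
 * readahead batch, then - if @get_more - with freshly allocated folios added
 * to the page cache. Btree locks are dropped while allocating memory.
 */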
static int readpage_bio_extend(struct btree_trans *trans,
			       struct readpages_iter *iter,
			       struct bio *bio,
			       unsigned sectors_this_extent,
			       bool get_more)
{
	/* Don't hold btree locks while allocating memory: */
	bch2_trans_unlock(trans);

	while (bio_sectors(bio) < sectors_this_extent &&
	       bio->bi_vcnt < bio->bi_max_vecs) {
		struct folio *folio = readpage_iter_peek(iter);
		int ret;

		if (folio) {
			readpage_iter_advance(iter);
		} else {
			pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;

			if (!get_more)
				break;

			folio = xa_load(&iter->mapping->i_pages, folio_offset);
			if (folio && !xa_is_value(folio))
				break;

			folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
			if (!folio)
				break;

			if (!__bch2_folio_create(folio, GFP_KERNEL)) {
				folio_put(folio);
				break;
			}

			ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
			if (ret) {
				__bch2_folio_release(folio);
				folio_put(folio);
				break;
			}

			folio_put(folio);
		}

		BUG_ON(folio_sector(folio) != bio_end_sector(bio));

		BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
	}

	return bch2_trans_relock(trans);
}

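/*
 * Core buffered read loop: walk the extents btree from the bio's current
 * sector, resolve reflink indirection, and issue a read per extent until the
 * whole bio has been submitted; transaction restarts retry from the top.
 */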
static void bchfs_read(struct btree_trans *trans,
		       struct bch_read_bio *rbio,
		       subvol_inum inum,
		       struct readpages_iter *readpages_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_buf sk;
	int flags = BCH_READ_RETRY_IF_STALE|
		BCH_READ_MAY_PROMOTE;
	u32 snapshot;
	int ret = 0;

	rbio->c = c;
	rbio->start_time = local_clock();
	rbio->subvol = inum.subvol;

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, rbio->bio.bi_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
						&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		sectors = min(sectors, k.k->size - offset_into_extent);

		if (readpages_iter) {
			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
						  extent_partial_reads_expensive(k));
			if (ret)
				break;
		}

		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
		swap(rbio->bio.bi_iter.bi_size, bytes);

		if (rbio->bio.bi_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		bch2_bio_page_state_set(&rbio->bio, k);

		bch2_read_extent(trans, rbio, iter.pos,
				 data_btree, k, offset_into_extent, flags);

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(rbio->bio.bi_iter.bi_size, bytes);
		bio_advance(&rbio->bio, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
				iter.pos.inode,
				iter.pos.offset << 9,
				"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bio_endio(&rbio->bio);
	}

	bch2_bkey_buf_exit(&sk, c);
}

void bch2_readahead(struct readahead_control *ractl)
{
	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_io_opts opts;
	struct btree_trans *trans = bch2_trans_get(c);
	struct folio *folio;
	struct readpages_iter readpages_iter;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	int ret = readpages_iter_init(&readpages_iter, ractl);
	if (ret)
		return;

	bch2_pagecache_add_get(inode);

	while ((folio = readpage_iter_peek(&readpages_iter))) {
		unsigned n = min_t(unsigned,
				   readpages_iter.folios.nr -
				   readpages_iter.idx,
				   BIO_MAX_VECS);
		struct bch_read_bio *rbio =
			rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
						   GFP_KERNEL, &c->bio_read),
				  opts);

		readpage_iter_advance(&readpages_iter);

		rbio->bio.bi_iter.bi_sector = folio_sector(folio);
		rbio->bio.bi_end_io = bch2_readpages_end_io;
		BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

		bchfs_read(trans, rbio, inode_inum(inode),
			   &readpages_iter);
		bch2_trans_unlock(trans);
	}

	bch2_pagecache_add_put(inode);

	bch2_trans_put(trans);
	darray_exit(&readpages_iter.folios);
}

static void bch2_read_single_folio_end_io(struct bio *bio)
{
	complete(bio->bi_private);
}

int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_read_bio *rbio;
	struct bch_io_opts opts;
	int ret;
	DECLARE_COMPLETION_ONSTACK(done);

	if (!bch2_folio_create(folio, GFP_KERNEL))
		return -ENOMEM;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

	rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
			 opts);
	rbio->bio.bi_private = &done;
	rbio->bio.bi_end_io = bch2_read_single_folio_end_io;

	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
	wait_for_completion(&done);

	ret = blk_status_to_errno(rbio->bio.bi_status);
	bio_put(&rbio->bio);

	if (ret < 0)
		return ret;

	folio_mark_uptodate(folio);
	return 0;
}

int bch2_read_folio(struct file *file, struct folio *folio)
{
	int ret;

	ret = bch2_read_single_folio(folio, folio->mapping);
	folio_unlock(folio);
	return bch2_err_class(ret);
}

/* writepages: */

struct bch_writepage_io {
	struct bch_inode_info *inode;

	/* must be last: */
	struct bch_write_op op;
};

struct bch_writepage_state {
	struct bch_writepage_io *io;
	struct bch_io_opts opts;
	struct bch_folio_sector *tmp;
	unsigned tmp_sectors;
};

static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
								  struct bch_inode_info *inode)
{
	struct bch_writepage_state ret = { 0 };

	bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
	return ret;
}

/*
 * Determine when a writepage io is full. We have to limit writepage bios to a
 * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
 * what the bounce path in bch2_write_extent() can handle. In theory we could
 * loosen this restriction for non-bounce I/O, but we don't have that context
 * here. Ideally, we can up this limit and make it configurable in the future
 * when the bounce path can be enhanced to accommodate larger source bios.
 */
static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
{
	struct bio *bio = &io->op.wbio.bio;
	return bio_full(bio, len) ||
		(bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
}

static void bch2_writepage_io_done(struct bch_write_op *op)
{
	struct bch_writepage_io *io =
		container_of(op, struct bch_writepage_io, op);
	struct bch_fs *c = io->op.c;
	struct bio *bio = &io->op.wbio.bio;
	struct folio_iter fi;
	unsigned i;

	if (io->op.error) {
		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);

		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			folio_set_error(fi.folio);
			mapping_set_error(fi.folio->mapping, -EIO);

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
		bio_for_each_folio_all(fi, bio) {
			struct bch_folio *s;

			s = __bch2_folio(fi.folio);
			spin_lock(&s->lock);
			for (i = 0; i < folio_sectors(fi.folio); i++)
				s->s[i].nr_replicas = 0;
			spin_unlock(&s->lock);
		}
	}

	/*
	 * racing with fallocate can cause us to add fewer sectors than
	 * expected - but we shouldn't add more sectors than expected:
	 */
	WARN_ON_ONCE(io->op.i_sectors_delta > 0);

	/*
	 * (error (due to going RO) halfway through a page can screw that up
	 * slightly)
	 * XXX wtf?
	 BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
	 */

	/*
	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
	 * before calling end_page_writeback:
	 */
	bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);

	bio_for_each_folio_all(fi, bio) {
		struct bch_folio *s = __bch2_folio(fi.folio);

		if (atomic_dec_and_test(&s->write_count))
			folio_end_writeback(fi.folio);
	}

	bio_put(&io->op.wbio.bio);
}

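/* Submit the write I/O we've been accumulating: */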
static void bch2_writepage_do_io(struct bch_writepage_state *w)
{
	struct bch_writepage_io *io = w->io;

	w->io = NULL;
	closure_call(&io->op.cl, bch2_write, NULL, NULL);
}

/*
 * Get a bch_writepage_io and add @page to it - appending to an existing one if
 * possible, else allocating a new one:
 */
static void bch2_writepage_io_alloc(struct bch_fs *c,
				    struct writeback_control *wbc,
				    struct bch_writepage_state *w,
				    struct bch_inode_info *inode,
				    u64 sector,
				    unsigned nr_replicas)
{
	struct bch_write_op *op;

	w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
					      REQ_OP_WRITE,
					      GFP_KERNEL,
					      &c->writepage_bioset),
			     struct bch_writepage_io, op.wbio.bio);

	w->io->inode = inode;
	op = &w->io->op;
	bch2_write_op_init(op, c, w->opts);
	op->target = w->opts.foreground_target;
	op->nr_replicas = nr_replicas;
	op->res.nr_replicas = nr_replicas;
	op->write_point = writepoint_hashed(inode->ei_last_dirtied);
	op->subvol = inode->ei_subvol;
	op->pos = POS(inode->v.i_ino, sector);
	op->end_io = bch2_writepage_io_done;
	op->devs_need_flush = &inode->ei_devs_need_flush;
	op->wbio.bio.bi_iter.bi_sector = sector;
	op->wbio.bio.bi_opf = wbc_to_write_flags(wbc);
}

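/*
 * write_cache_pages() callback: writes out a single dirty folio - zeroes the
 * portion past i_size, takes a disk reservation, marks dirty sectors
 * allocated, then adds contiguous dirty ranges to the current writepage io,
 * submitting it whenever the replica count changes, the bio fills up, or the
 * range is discontiguous.
 */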
static int __bch2_writepage(struct folio *folio,
			    struct writeback_control *wbc,
			    void *data)
{
	struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_writepage_state *w = data;
	struct bch_folio *s;
	unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
	loff_t i_size = i_size_read(&inode->v);
	int ret;

	EBUG_ON(!folio_test_uptodate(folio));

	/* Is the folio fully inside i_size? */
	if (folio_end_pos(folio) <= i_size)
		goto do_io;

	/* Is the folio fully outside i_size? (truncate in progress) */
	if (folio_pos(folio) >= i_size) {
		folio_unlock(folio);
		return 0;
	}

	/*
	 * The folio straddles i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the folio size. For a file that is not a multiple of
	 * the folio size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	folio_zero_segment(folio,
			   i_size - folio_pos(folio),
			   folio_size(folio));
do_io:
	f_sectors = folio_sectors(folio);
	s = bch2_folio(folio);

	if (f_sectors > w->tmp_sectors) {
		kfree(w->tmp);
		w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
		w->tmp_sectors = f_sectors;
	}

	/*
	 * Things get really hairy with errors during writeback:
	 */
	ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
	BUG_ON(ret);

	/* Before unlocking the page, get copy of reservations: */
	spin_lock(&s->lock);
	memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);

	for (i = 0; i < f_sectors; i++) {
		if (s->s[i].state < SECTOR_dirty)
			continue;

		nr_replicas_this_write =
			min_t(unsigned, nr_replicas_this_write,
			      s->s[i].nr_replicas +
			      s->s[i].replicas_reserved);
	}

	for (i = 0; i < f_sectors; i++) {
		if (s->s[i].state < SECTOR_dirty)
			continue;

		s->s[i].nr_replicas = w->opts.compression
			? 0 : nr_replicas_this_write;

		s->s[i].replicas_reserved = 0;
		bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
	}
	spin_unlock(&s->lock);

	BUG_ON(atomic_read(&s->write_count));
	atomic_set(&s->write_count, 1);

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);

	folio_unlock(folio);

	offset = 0;
	while (1) {
		unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
		u64 sector;

		while (offset < f_sectors &&
		       w->tmp[offset].state < SECTOR_dirty)
			offset++;

		if (offset == f_sectors)
			break;

		while (offset + sectors < f_sectors &&
		       w->tmp[offset + sectors].state >= SECTOR_dirty) {
			reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
			dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
			sectors++;
		}
		BUG_ON(!sectors);

		sector = folio_sector(folio) + offset;

		if (w->io &&
		    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
		     bch_io_full(w->io, sectors << 9) ||
		     bio_end_sector(&w->io->op.wbio.bio) != sector))
			bch2_writepage_do_io(w);

		if (!w->io)
			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
						nr_replicas_this_write);

		atomic_inc(&s->write_count);

		BUG_ON(inode != w->io->inode);
		BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
				      sectors << 9, offset << 9));

		/* Check for writing past i_size: */
		WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
			  round_up(i_size, block_bytes(c)) &&
			  !test_bit(BCH_FS_emergency_ro, &c->flags),
			  "writing past i_size: %llu > %llu (unrounded %llu)\n",
			  bio_end_sector(&w->io->op.wbio.bio) << 9,
			  round_up(i_size, block_bytes(c)),
			  i_size);

		w->io->op.res.sectors += reserved_sectors;
		w->io->op.i_sectors_delta -= dirty_sectors;
		w->io->op.new_i_size = i_size;

		offset += sectors;
	}

	if (atomic_dec_and_test(&s->write_count))
		folio_end_writeback(folio);

	return 0;
}

int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
	struct bch_writepage_state w =
		bch_writepage_state_init(c, to_bch_ei(mapping->host));
	struct blk_plug plug;
	int ret;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
	if (w.io)
		bch2_writepage_do_io(&w);
	blk_finish_plug(&plug);
	kfree(w.tmp);
	return bch2_err_class(ret);
}

/* buffered writes: */

int bch2_write_begin(struct file *file, struct address_space *mapping,
		     loff_t pos, unsigned len,
		     struct page **pagep, void **fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation *res;
	struct folio *folio;
	unsigned offset;
	int ret = -ENOMEM;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return -ENOMEM;

	bch2_folio_reservation_init(c, inode, res);
	*fsdata = res;

	bch2_pagecache_add_get(inode);

	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				    FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
				    mapping_gfp_mask(mapping));
	if (IS_ERR_OR_NULL(folio))
		goto err_unlock;

	offset = pos - folio_pos(folio);
	len = min_t(size_t, len, folio_end_pos(folio) - pos);

	if (folio_test_uptodate(folio))
		goto out;

	/* If we're writing entire folio, don't need to read it in first: */
	if (!offset && len == folio_size(folio))
		goto out;

	if (!offset && pos + len >= inode->v.i_size) {
		folio_zero_segment(folio, len, folio_size(folio));
		flush_dcache_folio(folio);
		goto out;
	}

	if (folio_pos(folio) >= inode->v.i_size) {
		folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
		flush_dcache_folio(folio);
		goto out;
	}
readpage:
	ret = bch2_read_single_folio(folio, mapping);
	if (ret)
		goto err;
out:
	ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
	if (ret)
		goto err;

	ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
	if (ret) {
		if (!folio_test_uptodate(folio)) {
			/*
			 * If the folio hasn't been read in, we won't know if we
			 * actually need a reservation - we don't actually need
			 * to read here, we just need to check if the folio is
			 * fully backed by uncompressed data:
			 */
			goto readpage;
		}

		goto err;
	}

	*pagep = &folio->page;
	return 0;
err:
	folio_unlock(folio);
	folio_put(folio);
	*pagep = NULL;
err_unlock:
	bch2_pagecache_add_put(inode);
	kfree(res);
	*fsdata = NULL;
	return bch2_err_class(ret);
}

int bch2_write_end(struct file *file, struct address_space *mapping,
		   loff_t pos, unsigned len, unsigned copied,
		   struct page *page, void *fsdata)
{
	struct bch_inode_info *inode = to_bch_ei(mapping->host);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation *res = fsdata;
	struct folio *folio = page_folio(page);
	unsigned offset = pos - folio_pos(folio);

	lockdep_assert_held(&inode->v.i_rwsem);
	BUG_ON(offset + copied > folio_size(folio));

	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
		/*
		 * The folio needs to be read in, but that would destroy
		 * our partial write - simplest thing is to just force
		 * userspace to redo the write:
		 */
		folio_zero_range(folio, 0, folio_size(folio));
		flush_dcache_folio(folio);
		copied = 0;
	}

	spin_lock(&inode->v.i_lock);
	if (pos + copied > inode->v.i_size)
		i_size_write(&inode->v, pos + copied);
	spin_unlock(&inode->v.i_lock);

	if (copied) {
		if (!folio_test_uptodate(folio))
			folio_mark_uptodate(folio);

		bch2_set_folio_dirty(c, inode, folio, res, offset, copied);

		inode->ei_last_dirtied = (unsigned long) current;
	}

	folio_unlock(folio);
	folio_put(folio);
	bch2_pagecache_add_put(inode);

	bch2_folio_reservation_put(c, inode, res);
	kfree(res);

	return copied;
}

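/* Unlock and release *fi and every folio after it, shrinking the darray: */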
static noinline void folios_trunc(folios *fs, struct folio **fi)
{
	while (fs->data + fs->nr > fi) {
		struct folio *f = darray_pop(fs);

		folio_unlock(f);
		folio_put(f);
	}
}

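/*
 * One pass of a buffered write: grab and prepare the folios covering
 * [pos, pos + len), get disk reservations, copy data in from @iter and mark
 * the copied ranges dirty. Returns the number of bytes copied, or an error.
 */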
static int __bch2_buffered_write(struct bch_inode_info *inode,
				 struct address_space *mapping,
				 struct iov_iter *iter,
				 loff_t pos, unsigned len,
				 bool inode_locked)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch2_folio_reservation res;
	folios fs;
	struct folio *f;
	unsigned copied = 0, f_offset, f_copied;
	u64 end = pos + len, f_pos, f_len;
	loff_t last_folio_pos = inode->v.i_size;
	int ret = 0;

	BUG_ON(!len);

	bch2_folio_reservation_init(c, inode, &res);
	darray_init(&fs);

	ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
					       FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
					       mapping_gfp_mask(mapping),
					       &fs);
	if (ret)
		goto out;

	BUG_ON(!fs.nr);

	/*
	 * If we're not using the inode lock, we need to lock all the folios for
	 * atomicity of writes vs. other writes:
	 */
	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
		ret = -BCH_ERR_need_inode_lock;
		goto out;
	}

	f = darray_first(fs);
	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
		ret = bch2_read_single_folio(f, mapping);
		if (ret)
			goto out;
	}

	f = darray_last(fs);
	end = min(end, folio_end_pos(f));
	last_folio_pos = folio_pos(f);
	if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
		if (end >= inode->v.i_size) {
			folio_zero_range(f, 0, folio_size(f));
		} else {
			ret = bch2_read_single_folio(f, mapping);
			if (ret)
				goto out;
		}
	}

	ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
	if (ret)
		goto out;

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;

		/*
		 * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
		 * supposed to write as much as we have disk space for.
		 *
		 * On failure here we should still write out a partial page if
		 * we aren't completely out of disk space - we don't do that
		 * yet:
		 */
		ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
		if (unlikely(ret)) {
			folios_trunc(&fs, fi);
			if (!fs.nr)
				goto out;

			end = min(end, folio_end_pos(darray_last(fs)));
			break;
		}

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	if (mapping_writably_mapped(mapping))
		darray_for_each(fs, fi)
			flush_dcache_folio(*fi);

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;
		f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
		if (!f_copied) {
			folios_trunc(&fs, fi);
			break;
		}

		if (!folio_test_uptodate(f) &&
		    f_copied != folio_size(f) &&
		    pos + copied + f_copied < inode->v.i_size) {
			iov_iter_revert(iter, f_copied);
			folio_zero_range(f, 0, folio_size(f));
			folios_trunc(&fs, fi);
			break;
		}

		flush_dcache_folio(f);
		copied += f_copied;

		if (f_copied != f_len) {
			folios_trunc(&fs, fi + 1);
			break;
		}

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	if (!copied)
		goto out;

	end = pos + copied;

	spin_lock(&inode->v.i_lock);
	if (end > inode->v.i_size) {
		BUG_ON(!inode_locked);
		i_size_write(&inode->v, end);
	}
	spin_unlock(&inode->v.i_lock);

	f_pos = pos;
	f_offset = pos - folio_pos(darray_first(fs));
	darray_for_each(fs, fi) {
		f = *fi;
		f_len = min(end, folio_end_pos(f)) - f_pos;

		if (!folio_test_uptodate(f))
			folio_mark_uptodate(f);

		bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);

		f_pos = folio_end_pos(f);
		f_offset = 0;
	}

	inode->ei_last_dirtied = (unsigned long) current;
out:
	darray_for_each(fs, fi) {
		folio_unlock(*fi);
		folio_put(*fi);
	}

	/*
	 * If the last folio added to the mapping starts beyond current EOF, we
	 * performed a short write but left around at least one post-EOF folio.
	 * Clean up the mapping before we return.
	 */
	if (last_folio_pos >= inode->v.i_size)
		truncate_pagecache(&inode->v, inode->v.i_size);

	darray_exit(&fs);
	bch2_folio_reservation_put(c, inode, &res);

	return copied ?: ret;
}

static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(file);
	loff_t pos;
	bool inode_locked = false;
	ssize_t written = 0, written2 = 0, ret = 0;

	/*
	 * We don't take the inode lock unless i_size will be changing. Folio
	 * locks provide exclusion with other writes, and the pagecache add lock
	 * provides exclusion with truncate and hole punching.
	 *
	 * There is one nasty corner case where atomicity would be broken
	 * without great care: when copying data from userspace to the page
	 * cache, a page fault would recurse back into the filesystem, taking
	 * filesystem locks again, and deadlock; so the copy is done with faults
	 * disabled, and we fault in the user buffer when we aren't holding
	 * locks.
	 *
	 * If we do part of the write, but the pages backing the userspace
	 * buffer have since been evicted and are no longer resident, then we
	 * have to drop our folio locks to re-fault them in, breaking write
	 * atomicity.
	 *
	 * To fix this, we restart the write from the start, if we weren't
	 * holding the inode lock.
	 *
	 * There is another wrinkle after that; if we restart the write from the
	 * start, and then get an unrecoverable error, we _cannot_ claim to
	 * userspace that we did not write data we actually did - so we must
	 * track (written2) the most we ever wrote.
	 */

	if ((iocb->ki_flags & IOCB_APPEND) ||
	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
		inode_lock(&inode->v);
		inode_locked = true;
	}

	ret = generic_write_checks(iocb, iter);
	if (ret <= 0)
		goto unlock;

	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
	if (ret) {
		if (!inode_locked) {
			inode_lock(&inode->v);
			inode_locked = true;
			ret = file_remove_privs_flags(file, 0);
		}
		if (ret)
			goto unlock;
	}

	ret = file_update_time(file);
	if (ret)
		goto unlock;

	pos = iocb->ki_pos;

	bch2_pagecache_add_get(inode);

	if (!inode_locked &&
	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
		goto get_inode_lock;

	do {
		unsigned offset = pos & (PAGE_SIZE - 1);
		unsigned bytes = iov_iter_count(iter);
again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
			bytes = min_t(unsigned long, iov_iter_count(iter),
				      PAGE_SIZE - offset);

			if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
				ret = -EFAULT;
				break;
			}
		}

		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
			goto get_inode_lock;

		if (unlikely(fatal_signal_pending(current))) {
			ret = -EINTR;
			break;
		}

		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
		if (ret == -BCH_ERR_need_inode_lock)
			goto get_inode_lock;
		if (unlikely(ret < 0))
			break;

		cond_resched();

		if (unlikely(ret == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
				      iov_iter_single_seg_count(iter));
			goto again;
		}
		pos += ret;
		written += ret;
		written2 = max(written, written2);

		if (ret != bytes && !inode_locked)
			goto get_inode_lock;
		ret = 0;

		balance_dirty_pages_ratelimited(mapping);

		if (0) {
get_inode_lock:
			bch2_pagecache_add_put(inode);
			inode_lock(&inode->v);
			inode_locked = true;
			bch2_pagecache_add_get(inode);

			iov_iter_revert(iter, written);
			pos -= written;
			written = 0;
			ret = 0;
		}
	} while (iov_iter_count(iter));
	bch2_pagecache_add_put(inode);
unlock:
	if (inode_locked)
		inode_unlock(&inode->v);

	iocb->ki_pos += written;

	ret = max(written, written2) ?: ret;
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}

ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
		? bch2_direct_write(iocb, iter)
		: bch2_buffered_write(iocb, iter);

	return bch2_err_class(ret);
}

void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
{
	bioset_exit(&c->writepage_bioset);
}

int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
{
	if (bioset_init(&c->writepage_bioset,
			4, offsetof(struct bch_writepage_io, op.wbio.bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_writepage_bioset_init;

	return 0;
}

#endif /* NO_BCACHEFS_FS */