// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(&bbio->inode->vfs_inode);
}

static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}

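/*
 * Split off the first @map_length bytes of @orig_bbio into a new btrfs_bio
 * that can be submitted on its own.  Zone append bios are split with
 * bio_split_rw() so the front part also honors the queue limits.  The
 * remainder stays in @orig_bbio and is accounted for via pending_ios.
 */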
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length, bool use_append)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	if (use_append) {
		unsigned int nr_segs;

		bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
				   &btrfs_clone_bioset, map_length);
	} else {
		bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
				GFP_NOFS, &btrfs_clone_bioset);
	}
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

/* Free a bio that was never submitted to the underlying device. */
static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio))
		btrfs_put_ordered_extent(bbio->ordered);
	bio_put(&bbio->bio);
}

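/*
 * Call the end_io handler for @bbio.  For ordered extent writes, hold a local
 * reference to the ordered extent so it is only dropped after the handler has
 * run.
 */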
static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
{
	if (bbio_has_ordered_extent(bbio)) {
		struct btrfs_ordered_extent *ordered = bbio->ordered;

		bbio->end_io(bbio);
		btrfs_put_ordered_extent(ordered);
	} else {
		bbio->end_io(bbio);
	}
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	__btrfs_bio_end_io(bbio);
}

static void btrfs_orig_write_end_io(struct bio *bio);

static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
				       struct btrfs_bio *orig_bbio)
{
	/*
	 * For writes we tolerate nr_mirrors - 1 write failures, so we can't
	 * just blindly propagate a write failure here. Instead increment the
	 * error count in the original I/O context so that it is guaranteed to
	 * be larger than the error tolerance.
	 */
	if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
		struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
		struct btrfs_io_context *orig_bioc = orig_stripe->bioc;

		atomic_add(orig_bioc->max_errors, &orig_bioc->error);
	} else {
		orig_bbio->bio.bi_status = bbio->bio.bi_status;
	}
}

static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		if (bbio->bio.bi_status)
			btrfs_bbio_propagate_error(bbio, orig_bbio);
		btrfs_cleanup_bio(bbio);
		bbio = orig_bbio;
	}

	if (atomic_dec_and_test(&bbio->pending_ios))
		__btrfs_bio_end_io(bbio);
}

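/*
 * Return the next mirror to read from during repair, wrapping around to the
 * first copy after the last one so that all copies are eventually tried.
 */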
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

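/*
 * Drop one reference on the repair tracking structure.  When the last
 * in-flight repair read has finished, complete the original bio and free
 * @fbio.
 */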
static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_orig_bbio_end_io(fbio->bbio);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

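/*
 * Handle completion of a repair read.  If the read failed or the checksum
 * still does not match, resubmit to the next mirror; once all mirrors have
 * been tried, fail the original bio.  On success, write the good data back to
 * the mirrors tried earlier to restore redundancy.
 */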
static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				  repair_bbio->file_offset, fs_info->sectorsize,
				  repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				  bv->bv_page, bv->bv_offset, mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but also tries to write the good data back to the bad mirror(s) when a
 * read succeeded to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  struct bio_vec *bv,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	__bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bio(repair_bbio, mirror);
	return fbio;
}

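/*
 * Verify the checksums of a completed data read sector by sector and kick off
 * repair reads for every bad sector found.
 */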
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	while (iter->bi_size) {
		struct bio_vec bv = bio_iter_iovec(&bbio->bio, *iter);

		bv.bv_len = min(bv.bv_len, sectorsize);
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, &bv))
			fbio = repair_one_sector(bbio, offset, &bv, fbio);

		bio_advance_iter_single(&bbio->bio, iter, sectorsize);
		offset += sectorsize;
	}

	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_orig_bbio_end_io(bbio);
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_orig_bbio_end_io(bbio);
}

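/*
 * End I/O handler for bios mapped to a single device.  Reads are completed
 * from a workqueue, where data reads get their checksums verified and, if
 * needed, repaired in process context.
 */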
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_orig_bbio_end_io(bbio);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_orig_bbio_end_io(bbio);

	btrfs_put_bioc(bioc);
}

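/*
 * End I/O handler for the stripe that reuses the bio embedded in the
 * btrfs_bio when writing to multiple mirrors.  Per-device errors are only
 * reported to the upper layer once they exceed the tolerated maximum.
 */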
static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_orig_bbio_end_io(bbio);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

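/*
 * Submit an already mapped bio to its target device, failing it with an I/O
 * error if the device is missing or not writeable.  For zone append writes
 * the sector is rewound to the start of the target zone here.
 */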
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug_in_rcu(dev->fs_info,
		"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

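/*
 * Dispatch a fully mapped bio: either directly to a single device, to the
 * RAID56 machinery for parity writes and read recovery, or as one clone per
 * mirror for plain mirrored writes.
 */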
static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			       struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	blk_status_t ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_orig_bbio_end_io(async->bbio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	__btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	/* Submit synchronously if the checksum implementation is fast. */
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

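/*
 * Map and submit the part of @bbio that fits into a single chunk mapping.
 * Return true when no further submission is needed (the whole bio was mapped
 * or an error terminated it), false when the bio was split and the caller
 * must submit the remaining part.
 */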
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_bio *orig_bbio = bbio;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	smap.is_scrub = !bbio->inode;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				&bioc, &smap, &mirror_num);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = min(map_length, fs_info->max_zone_append_size);

	if (map_length < length) {
		bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail_put_bio;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc &&
		    btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			if (ret)
				goto fail_put_bio;
		} else if (use_append) {
			ret = btrfs_alloc_dummy_sum(bbio);
			if (ret)
				goto fail_put_bio;
		}
	}

	__btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail_put_bio:
	if (map_length < length)
		btrfs_cleanup_bio(bbio);
fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(orig_bbio, ret);
	/* Do not submit another chunk */
	return true;
}

void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
{
	/* If bbio->inode is not populated, its file_offset must be 0. */
	ASSERT(bbio->inode || bbio->file_offset == 0);

	while (!btrfs_submit_chunk(bbio, mirror_num))
		;
}
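
/*
 * A minimal sketch of a typical submitter, assuming an end_io callback named
 * my_read_end_io and a page/offset pair already looked up by the caller (both
 * hypothetical, not part of this file):
 *
 *	bbio = btrfs_bio_alloc(1, REQ_OP_READ, fs_info, my_read_end_io, NULL);
 *	bbio->inode = inode;
 *	bbio->file_offset = file_offset;
 *	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(&bbio->bio, page, fs_info->sectorsize, page_offset);
 *	btrfs_submit_bio(bbio, 0);
 */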

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct page *page,
			    unsigned int pg_offset, int mirror_num)
{
	struct btrfs_io_stripe smap = { 0 };
	struct bio_vec bvec;
	struct bio bio;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto out_counter_dec;

	if (!smap.dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
	__bio_add_page(&bio, page, length, pg_offset);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
		ino, start, btrfs_dev_name(smap.dev),
		smap.physical >> SECTOR_SHIFT);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}

/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write is submitted to the dev-replace target
 * device.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(fs_info);
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	__btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out_free_bioset;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_clone_bioset;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out_free_repair_bioset;
	return 0;

out_free_repair_bioset:
	bioset_exit(&btrfs_repair_bioset);
out_free_clone_bioset:
	bioset_exit(&btrfs_clone_bioset);
out_free_bioset:
	bioset_exit(&btrfs_bioset);
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}