// SPDX-License-Identifier: GPL-2.0
/*
 * Some low level IO code, and hacks for various block layer limitations
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
#include "io_write.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/sched/mm.h>

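/*
 * Congestion avoidance: each device carries a "congested" score (maintained
 * by the latency accounting code, see bch2_latency_acct()) that decays over
 * time - the (now - last) >> 12 below. The score is compared against a
 * random threshold, so the more congested a target's devices are, the more
 * likely a read is to fail over to a different replica.
 */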
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	const struct bch_devs_mask *devs;
	unsigned d, nr = 0, total = 0;
	u64 now = local_clock(), last;
	s64 congested;
	struct bch_dev *ca;

	if (!target)
		return false;

	rcu_read_lock();
	devs = bch2_target_to_mask(c, target) ?:
		&c->rw_devs[BCH_DATA_user];

	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
		ca = rcu_dereference(c->devs[d]);
		if (!ca)
			continue;

		congested = atomic_read(&ca->congested);
		last = READ_ONCE(ca->congested_last);
		if (time_after64(now, last))
			congested -= (now - last) >> 12;

		total += max(congested, 0LL);
		nr++;
	}
	rcu_read_unlock();

	return bch2_rand_range(nr * CONGESTED_MAX) < total;
}

#else

static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
	return false;
}

#endif

/* Cache promotion on read */

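/*
 * A promote_op tracks one in-flight promotion - a copy of just-read data
 * being written to the promote target. Ops live in c->promote_table, keyed
 * by extent position, so at most one promote per extent is in flight at a
 * time; the op is torn down from promote_done() once the data update
 * completes.
 */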
struct promote_op {
	struct rcu_head rcu;
	u64 start_time;

	struct rhash_head hash;
	struct bpos pos;

	struct data_update write;
	struct bio_vec bi_inline_vecs[]; /* must be last */
};

static const struct rhashtable_params bch_promote_params = {
	.head_offset	= offsetof(struct promote_op, hash),
	.key_offset	= offsetof(struct promote_op, pos),
	.key_len	= sizeof(struct bpos),
};

static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
				 struct bpos pos,
				 struct bch_io_opts opts,
				 unsigned flags)
{
	BUG_ON(!opts.promote_target);

	if (!(flags & BCH_READ_MAY_PROMOTE))
		return -BCH_ERR_nopromote_may_not;

	if (bch2_bkey_has_target(c, k, opts.promote_target))
		return -BCH_ERR_nopromote_already_promoted;

	if (bkey_extent_is_unwritten(k))
		return -BCH_ERR_nopromote_unwritten;

	if (bch2_target_congested(c, opts.promote_target))
		return -BCH_ERR_nopromote_congested;

	if (rhashtable_lookup_fast(&c->promote_table, &pos,
				   bch_promote_params))
		return -BCH_ERR_nopromote_in_flight;

	return 0;
}

static void promote_free(struct bch_fs *c, struct promote_op *op)
{
	int ret;

	bch2_data_update_exit(&op->write);

	ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
				     bch_promote_params);
	BUG_ON(ret);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	kfree_rcu(op, rcu);
}

static void promote_done(struct bch_write_op *wop)
{
	struct promote_op *op =
		container_of(wop, struct promote_op, write.op);
	struct bch_fs *c = op->write.op.c;

	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
			       op->start_time);
	promote_free(c, op);
}

static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
{
	struct bio *bio = &op->write.op.wbio.bio;

	trace_and_count(op->write.op.c, read_promote, &rbio->bio);

	/* we now own pages: */
	BUG_ON(!rbio->bounce);
	BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);

	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);

	bch2_data_update_read_done(&op->write, rbio->pick.crc);
}

static struct promote_op *__promote_alloc(struct btree_trans *trans,
					  enum btree_id btree_id,
					  struct bkey_s_c k,
					  struct bpos pos,
					  struct extent_ptr_decoded *pick,
					  struct bch_io_opts opts,
					  unsigned sectors,
					  struct bch_read_bio **rbio)
{
	struct bch_fs *c = trans->c;
	struct promote_op *op = NULL;
	struct bio *bio;
	unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	int ret;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
		return ERR_PTR(-BCH_ERR_nopromote_no_writes);

	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
	if (!op) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	op->start_time = local_clock();
	op->pos = pos;

	/*
	 * We don't use the mempool here because extents that aren't
	 * checksummed or compressed can be too big for the mempool:
	 */
	*rbio = kzalloc(sizeof(struct bch_read_bio) +
			sizeof(struct bio_vec) * pages,
			GFP_KERNEL);
	if (!*rbio) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	rbio_init(&(*rbio)->bio, opts);
	bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);

	if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
		ret = -BCH_ERR_nopromote_enomem;
		goto err;
	}

	(*rbio)->bounce		= true;
	(*rbio)->split		= true;
	(*rbio)->kmalloc	= true;

	if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
					  bch_promote_params)) {
		ret = -BCH_ERR_nopromote_in_flight;
		goto err;
	}

	bio = &op->write.op.wbio.bio;
	bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);

	ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
			writepoint_hashed((unsigned long) current),
			opts,
			(struct data_update_opts) {
				.target		= opts.promote_target,
				.extra_replicas	= 1,
				.write_flags	= BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
			},
			btree_id, k);
	/*
	 * possible errors: -BCH_ERR_nocow_lock_blocked,
	 * -BCH_ERR_ENOSPC_disk_reservation:
	 */
	if (ret) {
		BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
					      bch_promote_params));
		goto err;
	}

	op->write.op.end_io = promote_done;

	return op;
err:
	if (*rbio)
		bio_free_pages(&(*rbio)->bio);
	kfree(*rbio);
	*rbio = NULL;
	kfree(op);
	bch2_write_ref_put(c, BCH_WRITE_REF_promote);
	return ERR_PTR(ret);
}

noinline
static struct promote_op *promote_alloc(struct btree_trans *trans,
					struct bvec_iter iter,
					struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					struct bch_io_opts opts,
					unsigned flags,
					struct bch_read_bio **rbio,
					bool *bounce,
					bool *read_full)
{
	struct bch_fs *c = trans->c;
	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
		: bvec_iter_sectors(iter);
	struct bpos pos = promote_full
		? bkey_start_pos(k.k)
		: POS(k.k->p.inode, iter.bi_sector);
	struct promote_op *promote;
	int ret;

	ret = should_promote(c, k, pos, opts, flags);
	if (ret)
		goto nopromote;

	promote = __promote_alloc(trans,
				  k.k->type == KEY_TYPE_reflink_v
				  ? BTREE_ID_reflink
				  : BTREE_ID_extents,
				  k, pos, pick, opts, sectors, rbio);
	ret = PTR_ERR_OR_ZERO(promote);
	if (ret)
		goto nopromote;

	*bounce		= true;
	*read_full	= promote_full;
	return promote;
nopromote:
	trace_read_nopromote(c, ret);
	return NULL;
}

/* Read */

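/*
 * Retry disposition for a failed read, returned up through
 * __bch2_read_extent(): READ_RETRY_AVOID also marks the device we just
 * failed on in @failed, so the next device pick avoids it; READ_ERR gives up
 * and completes the bio with an error.
 */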
#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3

enum rbio_context {
	RBIO_CONTEXT_NULL,
	RBIO_CONTEXT_HIGHPRI,
	RBIO_CONTEXT_UNBOUND,
};

static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
	return rbio->split ? rbio->parent : rbio;
}

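/*
 * rbio_contexts are ordered by how much the work is allowed to block; we
 * only ever punt "up" to a less restrictive context. If we're already
 * running in a context at least as permissive as the one requested, @fn is
 * called directly; otherwise it's queued on @wq - e.g. as bch2_rbio_error()
 * does when punting a retry:
 *
 *	bch2_rbio_punt(rbio, bch2_rbio_retry,
 *		       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
 */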
__always_inline
static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
			   enum rbio_context context,
			   struct workqueue_struct *wq)
{
	if (context <= rbio->context) {
		fn(&rbio->work);
	} else {
		rbio->work.func		= fn;
		rbio->context		= context;
		queue_work(wq, &rbio->work);
	}
}

static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
	BUG_ON(rbio->bounce && !rbio->split);

	if (rbio->promote)
		promote_free(rbio->c, rbio->promote);
	rbio->promote = NULL;

	if (rbio->bounce)
		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);

	if (rbio->split) {
		struct bch_read_bio *parent = rbio->parent;

		if (rbio->kmalloc)
			kfree(rbio);
		else
			bio_put(&rbio->bio);

		rbio = parent;
	}

	return rbio;
}

/*
 * Only called on a top level bch_read_bio to complete an entire read request,
 * not a split:
 */
static void bch2_rbio_done(struct bch_read_bio *rbio)
{
	if (rbio->start_time)
		bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
				       rbio->start_time);
	bio_endio(&rbio->bio);
}

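/*
 * Retry a BCH_READ_NODECODE read (the data move path): re-look up the extent
 * and check that the pointer we originally read from still matches - if the
 * extent has since been overwritten or moved, flag the rbio as a hole so the
 * caller knows the data is gone rather than retrying forever.
 */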
static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
				     struct bvec_iter bvec_iter,
				     struct bch_io_failures *failed,
				     unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	int ret;

	flags &= ~BCH_READ_LAST_FRAGMENT;
	flags |= BCH_READ_MUST_CLONE;

	bch2_bkey_buf_init(&sk);

	bch2_trans_iter_init(trans, &iter, rbio->data_btree,
			     rbio->read_pos, BTREE_ITER_SLOTS);
retry:
	rbio->bio.bi_status = 0;

	k = bch2_btree_iter_peek_slot(&iter);
	if (bkey_err(k))
		goto err;

	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);
	bch2_trans_unlock(trans);

	if (!bch2_bkey_matches_ptr(c, k,
				   rbio->pick.ptr,
				   rbio->data_pos.offset -
				   rbio->pick.crc.offset)) {
		/* extent we wanted to read no longer exists: */
		rbio->hole = true;
		goto out;
	}

	ret = __bch2_read_extent(trans, rbio, bvec_iter,
				 rbio->read_pos,
				 rbio->data_btree,
				 k, 0, failed, flags);
	if (ret == READ_RETRY)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_rbio_done(rbio);
	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);
	return;
err:
	rbio->bio.bi_status = BLK_STS_IOERR;
	goto out;
}

static void bch2_rbio_retry(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bvec_iter iter	= rbio->bvec_iter;
	unsigned flags		= rbio->flags;
	subvol_inum inum = {
		.subvol = rbio->subvol,
		.inum	= rbio->read_pos.inode,
	};
	struct bch_io_failures failed = { .nr = 0 };

	trace_and_count(c, read_retry, &rbio->bio);

	if (rbio->retry == READ_RETRY_AVOID)
		bch2_mark_io_failure(&failed, &rbio->pick);

	rbio->bio.bi_status = 0;

	rbio = bch2_rbio_free(rbio);

	flags |= BCH_READ_IN_RETRY;
	flags &= ~BCH_READ_MAY_PROMOTE;

	if (flags & BCH_READ_NODECODE) {
		bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
	} else {
		flags &= ~BCH_READ_LAST_FRAGMENT;
		flags |= BCH_READ_MUST_CLONE;

		__bch2_read(c, rbio, iter, inum, &failed, flags);
	}
}

static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
			    blk_status_t error)
{
	rbio->retry = retry;

	if (rbio->flags & BCH_READ_IN_RETRY)
		return;

	if (retry == READ_ERR) {
		rbio = bch2_rbio_free(rbio);

		rbio->bio.bi_status = error;
		bch2_rbio_done(rbio);
	} else {
		bch2_rbio_punt(rbio, bch2_rbio_retry,
			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
	}
}

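/*
 * Checksum narrowing: we just read and verified an entire (checksummed)
 * extent, so if only part of it is still live we can replace the existing
 * checksum with one covering just the live portion. Future reads of this
 * extent then don't have to read and checksum the dead parts.
 */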
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
				   struct bch_read_bio *rbio)
{
	struct bch_fs *c = rbio->c;
	u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
	struct bch_extent_crc_unpacked new_crc;
	struct btree_iter iter;
	struct bkey_i *new;
	struct bkey_s_c k;
	int ret = 0;

	if (crc_is_compressed(rbio->pick.crc))
		return 0;

	k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
			       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
	if ((ret = bkey_err(k)))
		goto out;

	if (bversion_cmp(k.k->version, rbio->version) ||
	    !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
		goto out;

	/* Extent was merged? */
	if (bkey_start_offset(k.k) < data_offset ||
	    k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
		goto out;

	if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
				rbio->pick.crc, NULL, &new_crc,
				bkey_start_offset(k.k) - data_offset, k.k->size,
				rbio->pick.crc.csum_type)) {
		bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
		ret = 0;
		goto out;
	}

	/*
	 * going to be temporarily appending another checksum entry:
	 */
	new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
				 sizeof(struct bch_extent_crc128));
	if ((ret = PTR_ERR_OR_ZERO(new)))
		goto out;

	bkey_reassemble(new, k);

	if (!bch2_bkey_narrow_crcs(new, new_crc))
		goto out;

	ret = bch2_trans_update(trans, &iter, new,
				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
	bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		      __bch2_rbio_narrow_crcs(trans, rbio));
}

/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
	struct bch_read_bio *rbio =
		container_of(work, struct bch_read_bio, work);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct bio *src		= &rbio->bio;
	struct bio *dst		= &bch2_rbio_parent(rbio)->bio;
	struct bvec_iter dst_iter = rbio->bvec_iter;
	struct bch_extent_crc_unpacked crc = rbio->pick.crc;
	struct nonce nonce = extent_nonce(rbio->version, crc);
	unsigned nofs_flags;
	struct bch_csum csum;
	int ret;

	nofs_flags = memalloc_nofs_save();

	/* Reset iterator for checksumming and copying bounced data: */
	if (rbio->bounce) {
		src->bi_iter.bi_size		= crc.compressed_size << 9;
		src->bi_iter.bi_idx		= 0;
		src->bi_iter.bi_bvec_done	= 0;
	} else {
		src->bi_iter			= rbio->bvec_iter;
	}

	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
		goto csum_err;

	/*
	 * XXX
	 * We need to rework the narrow_crcs path to deliver the read completion
	 * first, and then punt to a different workqueue, otherwise we're
	 * holding up reads while doing btree updates which is bad for memory
	 * reclaim.
	 */
	if (unlikely(rbio->narrow_crcs))
		bch2_rbio_narrow_crcs(rbio);

	if (rbio->flags & BCH_READ_NODECODE)
		goto nodecode;

	/* Adjust crc to point to subset of data we want: */
	crc.offset += rbio->offset_into_extent;
	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);

	if (crc_is_compressed(crc)) {
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
		    !c->opts.no_data_io)
			goto decompression_err;
	} else {
		/* don't need to decrypt the entire bio: */
		nonce = nonce_add(nonce, crc.offset << 9);
		bio_advance(src, crc.offset << 9);

		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
		src->bi_iter.bi_size = dst_iter.bi_size;

		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		if (rbio->bounce) {
			struct bvec_iter src_iter = src->bi_iter;

			bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
		}
	}

	if (rbio->promote) {
		/*
		 * Re encrypt data we decrypted, so it's consistent with
		 * rbio->crc:
		 */
		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
		if (ret)
			goto decrypt_err;

		promote_start(rbio->promote, rbio);
		rbio->promote = NULL;
	}
nodecode:
	if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
		rbio = bch2_rbio_free(rbio);
		bch2_rbio_done(rbio);
	}
out:
	memalloc_nofs_restore(nofs_flags);
	return;
csum_err:
	/*
	 * Checksum error: if the bio wasn't bounced, we may have been
	 * reading into buffers owned by userspace (that userspace can
	 * scribble over) - retry the read, bouncing it this time:
	 */
	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
		rbio->flags |= BCH_READ_MUST_BOUNCE;
		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
		goto out;
	}

	struct printbuf buf = PRINTBUF;
	buf.atomic++;
	prt_str(&buf, "data ");
	bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);

	bch_err_inum_offset_ratelimited(ca,
		rbio->read_pos.inode,
		rbio->read_pos.offset << 9,
		"data %s", buf.buf);
	printbuf_exit(&buf);

	bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
	goto out;
decompression_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decompression error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
decrypt_err:
	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
					rbio->read_pos.offset << 9,
					"decrypt error");
	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
	goto out;
}

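/*
 * bio completion handler: may run in IRQ context, so only the cheap checks
 * happen here (IO error, stale pointer race); checksum verification,
 * decryption and decompression are punted via bch2_rbio_punt() to a context
 * that's allowed to block.
 */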
static void bch2_read_endio(struct bio *bio)
{
	struct bch_read_bio *rbio =
		container_of(bio, struct bch_read_bio, bio);
	struct bch_fs *c	= rbio->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
	struct workqueue_struct *wq = NULL;
	enum rbio_context context = RBIO_CONTEXT_NULL;

	if (rbio->have_ioref) {
		bch2_latency_acct(ca, rbio->submit_time, READ);
		percpu_ref_put(&ca->io_ref);
	}

	if (!rbio->split)
		rbio->bio.bi_end_io = rbio->end_io;

	if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
				    rbio->read_pos.inode,
				    rbio->read_pos.offset,
				    "data read error: %s",
				    bch2_blk_status_to_str(bio->bi_status))) {
		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
		return;
	}

	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
	    ptr_stale(ca, &rbio->pick.ptr)) {
		trace_and_count(c, read_reuse_race, &rbio->bio);

		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
		else
			bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
		return;
	}

	if (rbio->narrow_crcs ||
	    rbio->promote ||
	    crc_is_compressed(rbio->pick.crc) ||
	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
	else if (rbio->pick.crc.csum_type)
		context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;

	bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}

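/*
 * Resolve a reflink pointer: the reflink_p key gives an index into the
 * reflink btree, where the actual data extent (reflink_v or indirect inline
 * data) lives; a missing indirect extent means the filesystem is
 * inconsistent.
 */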
int __bch2_read_indirect_extent(struct btree_trans *trans,
				unsigned *offset_into_extent,
				struct bkey_buf *orig_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 reflink_offset;
	int ret;

	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
		*offset_into_extent;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
			       POS(0, reflink_offset), 0);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_reflink_v &&
	    k.k->type != KEY_TYPE_indirect_inline_data) {
		bch_err_inum_offset_ratelimited(trans->c,
			orig_k->k->k.p.inode,
			orig_k->k->k.p.offset << 9,
			"%llu len %u points to nonexistent indirect extent %llu",
			orig_k->k->k.p.offset,
			orig_k->k->k.size,
			reflink_offset);
		bch2_inconsistent_error(trans->c);
		ret = -EIO;
		goto err;
	}

	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
						   struct bkey_s_c k,
						   struct bch_extent_ptr ptr)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
	struct btree_iter iter;
	struct printbuf buf = PRINTBUF;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     PTR_BUCKET_POS(c, &ptr),
			     BTREE_ITER_CACHED);

	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
	printbuf_indent_add(&buf, 2);
	prt_newline(&buf);

	bch2_bkey_val_to_text(&buf, c, k);
	prt_newline(&buf);

	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));

	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	if (!ret) {
		prt_newline(&buf);
		bch2_bkey_val_to_text(&buf, c, k);
	}

	bch2_fs_inconsistent(c, "%s", buf.buf);

	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
}

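/*
 * Read a single extent: pick a replica to read from, decide whether the read
 * must be bounced (for decompression, decryption, checksum verification or
 * promotion), set up a (possibly cloned) rbio, and submit it. Returns 0, or
 * a READ_* disposition when called with BCH_READ_IN_RETRY.
 */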
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
		       struct bvec_iter iter, struct bpos read_pos,
		       enum btree_id data_btree, struct bkey_s_c k,
		       unsigned offset_into_extent,
		       struct bch_io_failures *failed, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct extent_ptr_decoded pick;
	struct bch_read_bio *rbio = NULL;
	struct bch_dev *ca = NULL;
	struct promote_op *promote = NULL;
	bool bounce = false, read_full = false, narrow_crcs = false;
	struct bpos data_pos = bkey_start_pos(k.k);
	int pick_ret;

	if (bkey_extent_is_inline_data(k.k)) {
		unsigned bytes = min_t(unsigned, iter.bi_size,
				       bkey_inline_data_bytes(k.k));

		swap(iter.bi_size, bytes);
		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
		swap(iter.bi_size, bytes);
		bio_advance_iter(&orig->bio, &iter, bytes);
		zero_fill_bio_iter(&orig->bio, iter);
		goto out_read_done;
	}
retry_pick:
	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);

	/* hole or reservation - just zero fill: */
	if (!pick_ret)
		goto hole;

	if (pick_ret < 0) {
		bch_err_inum_offset_ratelimited(c,
				read_pos.inode, read_pos.offset << 9,
				"no device to read from");
		goto err;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	/*
	 * Stale dirty pointers are treated as IO errors, but @failed isn't
	 * allocated unless we're in the retry path - so if we're not in the
	 * retry path, don't check here, it'll be caught in bch2_read_endio()
	 * and we'll end up in the retry path:
	 */
	if ((flags & BCH_READ_IN_RETRY) &&
	    !pick.ptr.cached &&
	    unlikely(ptr_stale(ca, &pick.ptr))) {
		read_from_stale_dirty_pointer(trans, k, pick.ptr);
		bch2_mark_io_failure(failed, &pick);
		goto retry_pick;
	}

	/*
	 * Unlock the iterator while the btree node's lock is still in
	 * cache, before doing the IO:
	 */
	bch2_trans_unlock(trans);

	if (flags & BCH_READ_NODECODE) {
		/*
		 * can happen if we retry, and the extent we were going to read
		 * has been merged in the meantime:
		 */
		if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
			goto hole;

		iter.bi_size	= pick.crc.compressed_size << 9;
		goto get_bio;
	}

	if (!(flags & BCH_READ_LAST_FRAGMENT) ||
	    bio_flagged(&orig->bio, BIO_CHAIN))
		flags |= BCH_READ_MUST_CLONE;

	narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
		bch2_can_narrow_extent_crcs(k, pick.crc);

	if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
		flags |= BCH_READ_MUST_BOUNCE;

	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);

	if (crc_is_compressed(pick.crc) ||
	    (pick.crc.csum_type != BCH_CSUM_none &&
	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
	       (flags & BCH_READ_USER_MAPPED)) ||
	      (flags & BCH_READ_MUST_BOUNCE)))) {
		read_full = true;
		bounce = true;
	}

	if (orig->opts.promote_target)
		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
					&rbio, &bounce, &read_full);

	if (!read_full) {
		EBUG_ON(crc_is_compressed(pick.crc));
		EBUG_ON(pick.crc.csum_type &&
			(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
			 bvec_iter_sectors(iter) != pick.crc.live_size ||
			 pick.crc.offset ||
			 offset_into_extent));

		data_pos.offset += offset_into_extent;
		pick.ptr.offset += pick.crc.offset +
			offset_into_extent;
		offset_into_extent		= 0;
		pick.crc.compressed_size	= bvec_iter_sectors(iter);
		pick.crc.uncompressed_size	= bvec_iter_sectors(iter);
		pick.crc.offset			= 0;
		pick.crc.live_size		= bvec_iter_sectors(iter);
	}
get_bio:
	if (rbio) {
		/*
		 * promote already allocated bounce rbio:
		 * promote needs to allocate a bio big enough for uncompressing
		 * data in the write path, but we're not going to use it all
		 * here:
		 */
		EBUG_ON(rbio->bio.bi_iter.bi_size <
			pick.crc.compressed_size << 9);
		rbio->bio.bi_iter.bi_size =
			pick.crc.compressed_size << 9;
	} else if (bounce) {
		unsigned sectors = pick.crc.compressed_size;

		rbio = rbio_init(bio_alloc_bioset(NULL,
						  DIV_ROUND_UP(sectors, PAGE_SECTORS),
						  0,
						  GFP_NOFS,
						  &c->bio_read_split),
				 orig->opts);

		bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
		rbio->bounce	= true;
		rbio->split	= true;
	} else if (flags & BCH_READ_MUST_CLONE) {
		/*
		 * Have to clone if there were any splits, due to error
		 * reporting issues (if a split errored, and retrying didn't
		 * work, when it reports the error to its parent (us) we don't
		 * know if the error was from our bio, and we should retry, or
		 * from the whole bio, in which case we don't want to retry and
		 * lose the error)
		 */
		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
						 &c->bio_read_split),
				 orig->opts);
		rbio->bio.bi_iter = iter;
		rbio->split	= true;
	} else {
		rbio = orig;
		rbio->bio.bi_iter = iter;
		EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
	}

	EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);

	rbio->c			= c;
	rbio->submit_time	= local_clock();
	if (rbio->split)
		rbio->parent	= orig;
	else
		rbio->end_io	= orig->bio.bi_end_io;
	rbio->bvec_iter		= iter;
	rbio->offset_into_extent = offset_into_extent;
	rbio->flags		= flags;
	rbio->have_ioref	= pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
	rbio->narrow_crcs	= narrow_crcs;
	rbio->hole		= 0;
	rbio->retry		= 0;
	rbio->context		= 0;
	/* XXX: only initialize this if needed */
	rbio->devs_have		= bch2_bkey_devs(k);
	rbio->pick		= pick;
	rbio->subvol		= orig->subvol;
	rbio->read_pos		= read_pos;
	rbio->data_btree	= data_btree;
	rbio->data_pos		= data_pos;
	rbio->version		= k.k->version;
	rbio->promote		= promote;
	INIT_WORK(&rbio->work, NULL);

	rbio->bio.bi_opf	= orig->bio.bi_opf;
	rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
	rbio->bio.bi_end_io	= bch2_read_endio;

	if (rbio->bounce)
		trace_and_count(c, read_bounce, &rbio->bio);

	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);

	/*
	 * If it's being moved internally, we don't want to flag it as a cache
	 * hit:
	 */
	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
			PTR_BUCKET_NR(ca, &pick.ptr), READ);

	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
		bio_inc_remaining(&orig->bio);
		trace_and_count(c, read_split, &orig->bio);
	}

	if (!rbio->pick.idx) {
		if (!rbio->have_ioref) {
			bch_err_inum_offset_ratelimited(c,
					read_pos.inode,
					read_pos.offset << 9,
					"no device to read from");
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
			     bio_sectors(&rbio->bio));
		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);

		if (unlikely(c->opts.no_data_io)) {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				bio_endio(&rbio->bio);
		} else {
			if (likely(!(flags & BCH_READ_IN_RETRY)))
				submit_bio(&rbio->bio);
			else
				submit_bio_wait(&rbio->bio);
		}

		/*
		 * We just submitted IO which may block, we expect relock fail
		 * events and shouldn't count them:
		 */
		trans->notrace_relock_fail = true;
	} else {
		/* Attempting reconstruct read: */
		if (bch2_ec_read_extent(trans, rbio)) {
			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
			goto out;
		}

		if (likely(!(flags & BCH_READ_IN_RETRY)))
			bio_endio(&rbio->bio);
	}
out:
	if (likely(!(flags & BCH_READ_IN_RETRY))) {
		return 0;
	} else {
		int ret;

		rbio->context = RBIO_CONTEXT_UNBOUND;
		bch2_read_endio(&rbio->bio);

		ret = rbio->retry;
		rbio = bch2_rbio_free(rbio);

		if (ret == READ_RETRY_AVOID) {
			bch2_mark_io_failure(failed, &pick);
			ret = READ_RETRY;
		}

		if (!ret)
			goto out_read_done;

		return ret;
	}

err:
	if (flags & BCH_READ_IN_RETRY)
		return READ_ERR;

	orig->bio.bi_status = BLK_STS_IOERR;
	goto out_read_done;

hole:
	/*
	 * won't normally happen in the BCH_READ_NODECODE
	 * (bch2_move_extent()) path, but if we retry and the extent we wanted
	 * to read no longer exists we have to signal that:
	 */
	if (flags & BCH_READ_NODECODE)
		orig->hole = true;

	zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
	if (flags & BCH_READ_LAST_FRAGMENT)
		bch2_rbio_done(orig);
	return 0;
}

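/*
 * Top level read path: walk the extents btree, resolving reflink pointers as
 * we go, and issue a read for each fragment of the request. A minimal usage
 * sketch, assuming the caller has set up @rbio and its bvec_iter (@failed
 * may be NULL on the initial, non-retry pass; see the stale dirty pointer
 * comment in __bch2_read_extent()):
 *
 *	__bch2_read(c, rbio, rbio->bio.bi_iter, inum, NULL,
 *		    BCH_READ_RETRY_IF_STALE|BCH_READ_MAY_PROMOTE);
 */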
void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
		 struct bvec_iter bvec_iter, subvol_inum inum,
		 struct bch_io_failures *failed, unsigned flags)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	BUG_ON(flags & BCH_READ_NODECODE);

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
			     BTREE_ITER_SLOTS);
	while (1) {
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		if (ret)
			break;

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, bvec_iter.bi_sector));

		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
		sectors = k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&sk, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;

		k = bkey_i_to_s_c(sk.k);

		/*
		 * With indirect extents, the amount of data to read is the min
		 * of the original extent and the indirect extent:
		 */
		sectors = min(sectors, k.k->size - offset_into_extent);

		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
		swap(bvec_iter.bi_size, bytes);

		if (bvec_iter.bi_size == bytes)
			flags |= BCH_READ_LAST_FRAGMENT;

		ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
					 data_btree, k,
					 offset_into_extent, failed, flags);
		if (ret)
			break;

		if (flags & BCH_READ_LAST_FRAGMENT)
			break;

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
	    ret == READ_RETRY ||
	    ret == READ_RETRY_AVOID)
		goto retry;

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	if (ret) {
		bch_err_inum_offset_ratelimited(c, inum.inum,
						bvec_iter.bi_sector << 9,
						"read error %i from btree lookup", ret);
		rbio->bio.bi_status = BLK_STS_IOERR;
		bch2_rbio_done(rbio);
	}
}

void bch2_fs_io_read_exit(struct bch_fs *c)
{
	if (c->promote_table.tbl)
		rhashtable_destroy(&c->promote_table);
	bioset_exit(&c->bio_read_split);
	bioset_exit(&c->bio_read);
}

int bch2_fs_io_read_init(struct bch_fs *c)
{
	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_init;

	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
			BIOSET_NEED_BVECS))
		return -BCH_ERR_ENOMEM_bio_read_split_init;

	if (rhashtable_init(&c->promote_table, &bch_promote_params))
		return -BCH_ERR_ENOMEM_promote_table_init;

	return 0;
}