// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};

static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
					  struct bch_io_opts *io_opts,
					  struct data_update_opts *data_opts)
{
	printbuf_tabstop_push(out, 20);
	prt_str(out, "rewrite ptrs:");
	prt_tab(out);
	bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
	prt_newline(out);

	prt_str(out, "kill ptrs: ");
	prt_tab(out);
	bch2_prt_u64_base2(out, data_opts->kill_ptrs);
	prt_newline(out);

	prt_str(out, "target: ");
	prt_tab(out);
	bch2_target_to_text(out, c, data_opts->target);
	prt_newline(out);

	prt_str(out, "compression: ");
	prt_tab(out);
	bch2_compression_opt_to_text(out, background_compression(*io_opts));
	prt_newline(out);

	prt_str(out, "extra replicas: ");
	prt_tab(out);
	prt_u64(out, data_opts->extra_replicas);
}

static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	if (trace_move_extent_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_newline(&buf);
		bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
		trace_move_extent(c, buf.buf);
		printbuf_exit(&buf);
	}
}

static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
{
	if (trace_move_extent_read_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		trace_move_extent_read(c, buf.buf);
		printbuf_exit(&buf);
	}
}

struct moving_io {
	struct list_head read_list;
	struct list_head io_list;
	struct move_bucket_in_flight *b;
	struct closure cl;
	bool read_completed;

	unsigned read_sectors;
	unsigned write_sectors;

	struct bch_read_bio rbio;

	struct data_update write;
	/* Must be last since it is variable size */
	struct bio_vec bi_inline_vecs[];
};

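/*
 * Tear down a moving_io once both halves of the move are done (or the read
 * failed): drop the in-flight bucket count, release the data update, unlink
 * it from the context's io list and wake anyone waiting on the context.
 */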
static void move_free(struct moving_io *io)
{
	struct moving_context *ctxt = io->write.ctxt;

	if (io->b)
		atomic_dec(&io->b->count);

	bch2_data_update_exit(&io->write);

	mutex_lock(&ctxt->lock);
	list_del(&io->io_list);
	wake_up(&ctxt->wait);
	mutex_unlock(&ctxt->lock);

	kfree(io);
}

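/*
 * Write completion: record any write error on the context, drop the
 * in-flight write accounting, free the moving_io and release the closure
 * ref taken in move_write().
 */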
static void move_write_done(struct bch_write_op *op)
{
	struct moving_io *io = container_of(op, struct moving_io, write.op);
	struct moving_context *ctxt = io->write.ctxt;

	if (io->write.op.error)
		ctxt->write_error = true;

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_dec(&io->write.ctxt->write_ios);
	move_free(io);
	closure_put(&ctxt->cl);
}

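/*
 * Issue the write half of a move once the read has completed; reads that
 * errored or hit a hole are simply dropped here.
 */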
static void move_write(struct moving_io *io)
{
	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		move_free(io);
		return;
	}

	if (trace_move_extent_write_enabled()) {
		struct bch_fs *c = io->write.op.c;
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
		trace_move_extent_write(c, buf.buf);
		printbuf_exit(&buf);
	}

	closure_get(&io->write.ctxt->cl);
	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	atomic_inc(&io->write.ctxt->write_ios);

	bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
}

struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list);

	return io && io->read_completed ? io : NULL;
}

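/*
 * Read completion: drop the in-flight read accounting and mark the io as
 * ready so bch2_moving_ctxt_do_pending_writes() can issue the write.
 */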
static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	atomic_dec(&ctxt->read_ios);
	io->read_completed = true;

	wake_up(&ctxt->wait);
	closure_put(&ctxt->cl);
}

void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
		bch2_trans_unlock_long(ctxt->trans);
		list_del(&io->read_list);
		move_write(io);
	}
}

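/*
 * Wait for outstanding writes to make progress: returns once
 * ctxt->write_sectors drops to zero or changes from its value at entry.
 * Callers use this to back off after a memory allocation failure.
 */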
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
	bch2_trans_unlock_long(ctxt->trans);
	closure_sync(&ctxt->cl);
}

void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;

	bch2_moving_ctxt_flush_all(ctxt);

	EBUG_ON(atomic_read(&ctxt->write_sectors));
	EBUG_ON(atomic_read(&ctxt->write_ios));
	EBUG_ON(atomic_read(&ctxt->read_sectors));
	EBUG_ON(atomic_read(&ctxt->read_ios));

	mutex_lock(&c->moving_context_lock);
	list_del(&ctxt->list);
	mutex_unlock(&c->moving_context_lock);

	bch2_trans_put(ctxt->trans);
	memset(ctxt, 0, sizeof(*ctxt));
}

void bch2_moving_ctxt_init(struct moving_context *ctxt,
			   struct bch_fs *c,
			   struct bch_ratelimit *rate,
			   struct bch_move_stats *stats,
			   struct write_point_specifier wp,
			   bool wait_on_copygc)
{
	memset(ctxt, 0, sizeof(*ctxt));

	ctxt->trans = bch2_trans_get(c);
	ctxt->fn = (void *) _RET_IP_;
	ctxt->rate = rate;
	ctxt->stats = stats;
	ctxt->wp = wp;
	ctxt->wait_on_copygc = wait_on_copygc;

	closure_init_stack(&ctxt->cl);

	mutex_init(&ctxt->lock);
	INIT_LIST_HEAD(&ctxt->reads);
	INIT_LIST_HEAD(&ctxt->ios);
	init_waitqueue_head(&ctxt->wait);

	mutex_lock(&c->moving_context_lock);
	list_add(&ctxt->list, &c->moving_context_list);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
	trace_move_data(c, stats);
}

void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
	memset(stats, 0, sizeof(*stats));
	stats->data_type = BCH_DATA_user;
	scnprintf(stats->name, sizeof(stats->name), "%s", name);
}

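/*
 * Queue a single extent for rewriting: allocate a moving_io, set up the read
 * and write bios (sized for the uncompressed extent, since the write path may
 * have to decompress), initialize the data update and issue the read. The
 * write is issued later from the read completion path via
 * bch2_moving_ctxt_do_pending_writes().
 */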
int bch2_move_extent(struct moving_context *ctxt,
		     struct move_bucket_in_flight *bucket_in_flight,
		     struct btree_iter *iter,
		     struct bkey_s_c k,
		     struct bch_io_opts io_opts,
		     struct data_update_opts data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	trace_move_extent2(c, k, &io_opts, &data_opts);

	if (ctxt->stats)
		ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);

	bch2_data_update_opts_normalize(k, &data_opts);

	if (!data_opts.rewrite_ptrs &&
	    !data_opts.extra_replicas) {
		if (data_opts.kill_ptrs)
			return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
		return 0;
	}

	/*
	 * Before memory allocations & taking nocow locks in
	 * bch2_data_update_init():
	 */
	bch2_trans_unlock(trans);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	INIT_LIST_HEAD(&io->io_list);
	io->write.ctxt = ctxt;
	io->read_sectors = k.k->size;
	io->write_sectors = k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c = c;
	io->rbio.opts = io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf = REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io = move_read_endio;

	ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
				    io_opts, data_opts, iter->btree_id, k);
	if (ret)
		goto err_free_pages;

	io->write.op.end_io = move_write_done;

	if (ctxt->rate)
		bch2_ratelimit_increment(ctxt->rate, k.k->size);

	if (ctxt->stats) {
		atomic64_inc(&ctxt->stats->keys_moved);
		atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
	}

	if (bucket_in_flight) {
		io->b = bucket_in_flight;
		atomic_inc(&io->b->count);
	}

	this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
	this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
	trace_move_extent_read2(c, k);

	mutex_lock(&ctxt->lock);
	atomic_add(io->read_sectors, &ctxt->read_sectors);
	atomic_inc(&ctxt->read_ios);

	list_add_tail(&io->read_list, &ctxt->reads);
	list_add_tail(&io->io_list, &ctxt->ios);
	mutex_unlock(&ctxt->lock);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 iter->btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	if (ret == -BCH_ERR_data_update_done)
		return 0;

	if (bch2_err_matches(ret, EROFS) ||
	    bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	count_event(c, move_extent_start_fail);

	if (trace_move_extent_start_fail_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);
		prt_str(&buf, ": ");
		prt_str(&buf, bch2_err_str(ret));
		trace_move_extent_start_fail(c, buf.buf);
		printbuf_exit(&buf);
	}
	return ret;
}

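/*
 * Look up the io options governing @extent_k, caching the per-snapshot inode
 * options for the current inode number so the inodes btree isn't re-walked
 * for every extent.
 */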
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
					  struct per_snapshot_io_opts *io_opts,
					  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	if (io_opts->cur_inum != extent_k.k->p.inode) {
		io_opts->d.nr = 0;

		ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
					 BTREE_ITER_ALL_SNAPSHOTS, k, ({
			if (k.k->p.offset != extent_k.k->p.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			BUG_ON(bch2_inode_unpack(k, &inode));

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			darray_push(&io_opts->d, e);
		}));
		io_opts->cur_inum = extent_k.k->p.inode;
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot)
		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
				return &i->io_opts;

	return &io_opts->fs_io_opts;
}

int bch2_move_get_io_opts_one(struct btree_trans *trans,
			      struct bch_io_opts *io_opts,
			      struct bkey_s_c extent_k)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* reflink btree? */
	if (!extent_k.k->p.inode) {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
		return 0;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			       SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
			       BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (!ret && bkey_is_inode(k.k)) {
		struct bch_inode_unpacked inode;
		bch2_inode_unpack(k, &inode);
		bch2_inode_opts_get(io_opts, trans->c, &inode);
	} else {
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	}

	bch2_trans_iter_exit(trans, &iter);
	return 0;
}

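/*
 * Throttle the move path: optionally wait for copygc, honour the configured
 * rate limit, and block while the in-flight read/write limits are exceeded.
 * Returns nonzero if the calling kthread should stop.
 */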
int bch2_move_ratelimit(struct moving_context *ctxt)
{
	struct bch_fs *c = ctxt->trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	u64 delay;

	if (ctxt->wait_on_copygc && c->copygc_running) {
		bch2_moving_ctxt_flush_all(ctxt);
		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    (is_kthread && kthread_should_stop()));
	}

	do {
		delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;

		if (is_kthread && kthread_should_stop())
			return 1;

		if (delay)
			move_ctxt_wait_event_timeout(ctxt,
						     freezing(current) ||
						     (is_kthread && kthread_should_stop()),
						     delay);

		if (unlikely(freezing(current))) {
			bch2_moving_ctxt_flush_all(ctxt);
			try_to_freeze();
		}
	} while (delay);

	/*
	 * XXX: these limits really ought to be per device, SSDs and hard drives
	 * will want different limits
	 */
	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
		atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
		atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight);

	return 0;
}

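/*
 * Walk one btree from @start to @end, calling @pred on each extent and
 * moving the extents it selects.
 */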
static int bch2_move_data_btree(struct moving_context *ctxt,
				struct bpos start,
				struct bpos end,
				move_pred_fn pred, void *arg,
				enum btree_id btree_id)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;
	struct bkey_buf sk;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	int ret = 0, ret2;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);
	bch2_bkey_buf_init(&sk);

	if (ctxt->stats) {
		ctxt->stats->data_type = BCH_DATA_user;
		ctxt->stats->pos = BBPOS(btree_id, start);
	}

	bch2_trans_iter_init(trans, &iter, btree_id, start,
			     BTREE_ITER_PREFETCH|
			     BTREE_ITER_ALL_SNAPSHOTS);

	if (ctxt->rate)
		bch2_ratelimit_reset(ctxt->rate);

	while (!bch2_move_ratelimit(ctxt)) {
		bch2_trans_begin(trans);

		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;

		if (bkey_ge(bkey_start_pos(k.k), end))
			break;

		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
		ret = PTR_ERR_OR_ZERO(io_opts);
		if (ret)
			continue;

		memset(&data_opts, 0, sizeof(data_opts));
		if (!pred(c, arg, k, io_opts, &data_opts))
			goto next;

		/*
		 * The iterator gets unlocked by __bch2_read_extent - need to
		 * save a copy of @k elsewhere:
		 */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);

		ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
		if (ret2) {
			if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
				continue;

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}
next:
		if (ctxt->stats)
			atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(&iter);
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_bkey_buf_exit(&sk, c);
	per_snapshot_io_opts_exit(&snapshot_io_opts);

	return ret;
}

int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
					   id == start.btree ? start.pos : POS_MIN,
					   id == end.btree ? end.pos : POS_MAX,
					   pred, arg, id);
		if (ret)
			break;
	}

	return ret;
}

int bch2_move_data(struct bch_fs *c,
		   struct bbpos start,
		   struct bbpos end,
		   struct bch_ratelimit *rate,
		   struct bch_move_stats *stats,
		   struct write_point_specifier wp,
		   bool wait_on_copygc,
		   move_pred_fn pred, void *arg)
{
	struct moving_context ctxt;
	int ret;

	bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
	ret = __bch2_move_data(&ctxt, start, end, pred, arg);
	bch2_moving_ctxt_exit(&ctxt);

	return ret;
}

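/*
 * Move all live data out of @bucket by walking its backpointers: extents are
 * rewritten via bch2_move_extent(), btree nodes via bch2_btree_node_rewrite().
 */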
int bch2_evacuate_bucket(struct moving_context *ctxt,
			 struct move_bucket_in_flight *bucket_in_flight,
			 struct bpos bucket, int gen,
			 struct data_update_opts _data_opts)
{
	struct btree_trans *trans = ctxt->trans;
	struct bch_fs *c = trans->c;
	bool is_kthread = current->flags & PF_KTHREAD;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_iter iter;
	struct bkey_buf sk;
	struct bch_backpointer bp;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c k;
	struct data_update_opts data_opts;
	unsigned dirty_sectors, bucket_size;
	u64 fragmentation;
	struct bpos bp_pos = POS_MIN;
	int ret = 0;

	trace_bucket_evacuate(c, &bucket);

	bch2_bkey_buf_init(&sk);

	/*
	 * We're not run in a context that handles transaction restarts:
	 */
	bch2_trans_begin(trans);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     bucket, BTREE_ITER_CACHED);
	ret = lockrestart_do(trans,
			bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
	bch2_trans_iter_exit(trans, &iter);

	bch_err_msg(c, ret, "looking up alloc key");
	if (ret)
		goto err;

	a = bch2_alloc_to_v4(k, &a_convert);
	dirty_sectors = bch2_bucket_sectors_dirty(*a);
	bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
	fragmentation = a->fragmentation_lru;

	ret = bch2_btree_write_buffer_tryflush(trans);
	bch_err_msg(c, ret, "flushing btree write buffer");
	if (ret)
		goto err;

	while (!(ret = bch2_move_ratelimit(ctxt))) {
		if (is_kthread && kthread_should_stop())
			break;

		bch2_trans_begin(trans);

		ret = bch2_get_next_backpointer(trans, bucket, gen,
						&bp_pos, &bp,
						BTREE_ITER_CACHED);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			goto err;
		if (bkey_eq(bp_pos, POS_MAX))
			break;

		if (!bp.level) {
			k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
			ret = bkey_err(k);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!k.k)
				goto next;

			bch2_bkey_buf_reassemble(&sk, c, k);
			k = bkey_i_to_s_c(sk.k);

			ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
			if (ret) {
				bch2_trans_iter_exit(trans, &iter);
				continue;
			}

			data_opts = _data_opts;
			data_opts.target = io_opts.background_target;
			data_opts.rewrite_ptrs = 0;

			unsigned i = 0;
			bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
				if (ptr->dev == bucket.inode) {
					data_opts.rewrite_ptrs |= 1U << i;
					if (ptr->cached) {
						bch2_trans_iter_exit(trans, &iter);
						goto next;
					}
				}
				i++;
			}

			ret = bch2_move_extent(ctxt, bucket_in_flight,
					       &iter, k, io_opts, data_opts);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}
			if (ret)
				goto err;

			if (ctxt->stats)
				atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
		} else {
			struct btree *b;

			b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp);
			ret = PTR_ERR_OR_ZERO(b);
			if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;
			if (!b)
				goto next;

			unsigned sectors = btree_ptr_sectors_written(&b->key);

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
			bch2_trans_iter_exit(trans, &iter);

			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				goto err;

			if (ctxt->rate)
				bch2_ratelimit_increment(ctxt->rate, sectors);
			if (ctxt->stats) {
				atomic64_add(sectors, &ctxt->stats->sectors_seen);
				atomic64_add(sectors, &ctxt->stats->sectors_moved);
			}
		}
next:
		bp_pos = bpos_nosnap_successor(bp_pos);
	}

	trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret);
err:
	bch2_bkey_buf_exit(&sk, c);
	return ret;
}

typedef bool (*move_btree_pred)(struct bch_fs *, void *,
				struct btree *, struct bch_io_opts *,
				struct data_update_opts *);

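/*
 * Walk btree nodes from @start to @end and rewrite the nodes @pred selects;
 * used for rereplicating metadata, evacuating a device and rewriting
 * old-format nodes.
 */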
static int bch2_move_btree(struct bch_fs *c,
			   struct bbpos start,
			   struct bbpos end,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct moving_context ctxt;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct btree *b;
	enum btree_id btree;
	struct data_update_opts data_opts;
	int ret = 0;

	bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
			      writepoint_ptr(&c->btree_write_point),
			      true);
	trans = ctxt.trans;

	stats->data_type = BCH_DATA_btree;

	for (btree = start.btree;
	     btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     btree++) {
		stats->pos = BBPOS(btree, POS_MIN);

		if (!bch2_btree_id_root(c, btree)->b)
			continue;

		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
					  BTREE_ITER_PREFETCH);
retry:
		ret = 0;
		while (bch2_trans_begin(trans),
		       (b = bch2_btree_iter_peek_node(&iter)) &&
		       !(ret = PTR_ERR_OR_ZERO(b))) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(btree, end.btree) ?:
			     bpos_cmp(b->key.k.p, end.pos)) > 0)
				break;

			stats->pos = BBPOS(iter.btree_id, iter.pos);

			if (!pred(c, arg, b, &io_opts, &data_opts))
				goto next;

			ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				continue;
			if (ret)
				break;
next:
			bch2_btree_iter_next_node(&iter);
		}
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		bch2_trans_iter_exit(trans, &iter);

		if (kthread && kthread_should_stop())
			break;
	}

	bch_err_fn(c, ret);
	bch2_moving_ctxt_exit(&ctxt);
	bch2_btree_interior_updates_flush(c);

	return ret;
}

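/*
 * Select keys whose durability is below the configured replicas setting and
 * request the difference as extra replicas.
 */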
static bool rereplicate_pred(struct bch_fs *c, void *arg,
			     struct bkey_s_c k,
			     struct bch_io_opts *io_opts,
			     struct data_update_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;

	if (!nr_good || nr_good >= replicas)
		return false;

	data_opts->target = 0;
	data_opts->extra_replicas = replicas - nr_good;
	data_opts->btree_insert_flags = 0;
	return true;
}

static bool migrate_pred(struct bch_fs *c, void *arg,
			 struct bkey_s_c k,
			 struct bch_io_opts *io_opts,
			 struct data_update_opts *data_opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bch_ioctl_data *op = arg;
	unsigned i = 0;

	data_opts->rewrite_ptrs = 0;
	data_opts->target = 0;
	data_opts->extra_replicas = 0;
	data_opts->btree_insert_flags = 0;

	bkey_for_each_ptr(ptrs, ptr) {
		if (ptr->dev == op->migrate.dev)
			data_opts->rewrite_ptrs |= 1U << i;
		i++;
	}

	return data_opts->rewrite_ptrs != 0;
}

static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static bool migrate_btree_pred(struct bch_fs *c, void *arg,
			       struct btree *b,
			       struct bch_io_opts *io_opts,
			       struct data_update_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

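/*
 * Check whether a btree node's key format has fields wider than the current
 * unpacked format, or field offsets that can overflow when unpacking - such
 * nodes need to be rewritten (see BCH_COMPAT_bformat_overflow_done).
 */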
static bool bformat_needs_redo(struct bkey_format *f)
{
	unsigned i;

	for (i = 0; i < f->nr_fields; i++) {
		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 field_offset = le64_to_cpu(f->field_offset[i]);

		if (f->bits_per_field[i] > unpacked_bits)
			return true;

		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
			return true;

		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
		     unpacked_mask) <
		    field_offset)
			return true;
	}

	return false;
}

static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
				   struct btree *b,
				   struct bch_io_opts *io_opts,
				   struct data_update_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target = 0;
		data_opts->extra_replicas = 0;
		data_opts->btree_insert_flags = 0;
		return true;
	}

	return false;
}

int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      BBPOS_MIN,
			      BBPOS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	bch_err_fn(c, ret);
	return ret;
}

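/*
 * Select pointers to drop from keys that have more durability than the
 * configured replicas setting, killing pointers until we're back down to
 * the target.
 */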
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
				     struct bkey_s_c k,
				     struct bch_io_opts *io_opts,
				     struct data_update_opts *data_opts)
{
	unsigned durability = bch2_bkey_durability(c, k);
	unsigned replicas = bkey_is_btree_ptr(k.k)
		? c->opts.metadata_replicas
		: io_opts->data_replicas;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned i = 0;

	bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
		unsigned d = bch2_extent_ptr_durability(c, &p);

		if (d && durability - d >= replicas) {
			data_opts->kill_ptrs |= BIT(i);
			durability -= d;
		}

		i++;
	}

	return data_opts->kill_ptrs != 0;
}

static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
					   struct btree *b,
					   struct bch_io_opts *io_opts,
					   struct data_update_opts *data_opts)
{
	return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

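/*
 * Run a data job as requested via the data ioctl (struct bch_ioctl_data),
 * dispatching to rereplicate, migrate, rewrite_old_nodes or
 * drop_extra_replicas.
 */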
int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	struct bbpos start = BBPOS(op.start_btree, op.start_pos);
	struct bbpos end = BBPOS(op.end_btree, op.end_pos);
	int ret = 0;

	if (op.op >= BCH_DATA_OP_NR)
		return -EINVAL;

	bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);

	switch (op.op) {
	case BCH_DATA_OP_rereplicate:
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);
		ret = bch2_move_btree(c, start, end,
				      rereplicate_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     rereplicate_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_migrate:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
		ret = bch2_move_btree(c, start, end,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_move_data(c, start, end,
				     NULL,
				     stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     migrate_pred, &op) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_rewrite_old_nodes:
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	case BCH_DATA_OP_drop_extra_replicas:
		ret = bch2_move_btree(c, start, end,
				      drop_extra_replicas_btree_pred, c, stats) ?: ret;
		ret = bch2_move_data(c, start, end, NULL, stats,
				     writepoint_hashed((unsigned long) current),
				     true,
				     drop_extra_replicas_pred, c) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	default:
		ret = -EINVAL;
	}

	bch2_move_stats_exit(stats, c);
	return ret;
}

void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
	prt_printf(out, "%s: data type=", stats->name);
	bch2_prt_data_type(out, stats->data_type);
	prt_str(out, " pos=");
	bch2_bbpos_to_text(out, stats->pos);
	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_str(out, "keys moved: ");
	prt_u64(out, atomic64_read(&stats->keys_moved));
	prt_newline(out);

	prt_str(out, "keys raced: ");
	prt_u64(out, atomic64_read(&stats->keys_raced));
	prt_newline(out);

	prt_str(out, "bytes seen: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
	prt_newline(out);

	prt_str(out, "bytes moved: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
	prt_newline(out);

	prt_str(out, "bytes raced: ");
	prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
	prt_newline(out);

	printbuf_indent_sub(out, 2);
}

static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
	struct moving_io *io;

	bch2_move_stats_to_text(out, ctxt->stats);
	printbuf_indent_add(out, 2);

	prt_printf(out, "reads: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->read_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->read_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	prt_printf(out, "writes: ios %u/%u sectors %u/%u",
		   atomic_read(&ctxt->write_ios),
		   c->opts.move_ios_in_flight,
		   atomic_read(&ctxt->write_sectors),
		   c->opts.move_bytes_in_flight >> 9);
	prt_newline(out);

	printbuf_indent_add(out, 2);

	mutex_lock(&ctxt->lock);
	list_for_each_entry(io, &ctxt->ios, io_list)
		bch2_write_op_to_text(out, &io->write.op);
	mutex_unlock(&ctxt->lock);

	printbuf_indent_sub(out, 4);
}

void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct moving_context *ctxt;

	mutex_lock(&c->moving_context_lock);
	list_for_each_entry(ctxt, &c->moving_context_list, list)
		bch2_moving_ctxt_to_text(out, c, ctxt);
	mutex_unlock(&c->moving_context_lock);
}

void bch2_fs_move_init(struct bch_fs *c)
{
	INIT_LIST_HEAD(&c->moving_context_list);
	mutex_init(&c->moving_context_lock);
}
