1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "alloc_background.h" |
5 | #include "alloc_foreground.h" |
6 | #include "btree_iter.h" |
7 | #include "btree_update.h" |
8 | #include "btree_write_buffer.h" |
9 | #include "buckets.h" |
10 | #include "clock.h" |
11 | #include "compress.h" |
12 | #include "disk_groups.h" |
13 | #include "errcode.h" |
14 | #include "error.h" |
15 | #include "inode.h" |
16 | #include "move.h" |
17 | #include "rebalance.h" |
18 | #include "subvolume.h" |
19 | #include "super-io.h" |
20 | #include "trace.h" |
21 | |
22 | #include <linux/freezer.h> |
23 | #include <linux/kthread.h> |
24 | #include <linux/sched/cputime.h> |
25 | |
/*
 * Offset, within an inode's range of the rebalance_work btree, of the
 * "needs scan" cookie key that is set and cleared by the helpers in this file.
 */
#define REBALANCE_WORK_SCAN_OFFSET	(U64_MAX - 1)
27 | |
28 | static const char * const bch2_rebalance_state_strs[] = { |
29 | #define x(t) #t, |
30 | BCH_REBALANCE_STATES() |
31 | NULL |
32 | #undef x |
33 | }; |
34 | |
35 | static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) |
36 | { |
37 | struct btree_iter iter; |
38 | struct bkey_s_c k; |
39 | struct bkey_i_cookie *cookie; |
40 | u64 v; |
41 | int ret; |
42 | |
43 | bch2_trans_iter_init(trans, iter: &iter, btree_id: BTREE_ID_rebalance_work, |
44 | pos: SPOS(inode: inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), |
45 | flags: BTREE_ITER_INTENT); |
46 | k = bch2_btree_iter_peek_slot(&iter); |
47 | ret = bkey_err(k); |
48 | if (ret) |
49 | goto err; |
50 | |
51 | v = k.k->type == KEY_TYPE_cookie |
52 | ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) |
53 | : 0; |
54 | |
55 | cookie = bch2_trans_kmalloc(trans, size: sizeof(*cookie)); |
56 | ret = PTR_ERR_OR_ZERO(ptr: cookie); |
57 | if (ret) |
58 | goto err; |
59 | |
60 | bkey_cookie_init(k: &cookie->k_i); |
61 | cookie->k.p = iter.pos; |
62 | cookie->v.cookie = cpu_to_le64(v + 1); |
63 | |
64 | ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); |
65 | err: |
66 | bch2_trans_iter_exit(trans, &iter); |
67 | return ret; |
68 | } |
69 | |
70 | int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) |
71 | { |
72 | int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, |
73 | __bch2_set_rebalance_needs_scan(trans, inum)); |
74 | rebalance_wakeup(c); |
75 | return ret; |
76 | } |
77 | |
/*
 * Request a rebalance scan using inode number 0, which do_rebalance_scan()
 * treats as "scan everything" (BBPOS_MIN..BBPOS_MAX).
 *
 * Returns the result of bch2_set_rebalance_needs_scan().
 */
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
	int ret = bch2_set_rebalance_needs_scan(c, 0);

	return ret;
}
82 | |
83 | static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) |
84 | { |
85 | struct btree_iter iter; |
86 | struct bkey_s_c k; |
87 | u64 v; |
88 | int ret; |
89 | |
90 | bch2_trans_iter_init(trans, iter: &iter, btree_id: BTREE_ID_rebalance_work, |
91 | pos: SPOS(inode: inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), |
92 | flags: BTREE_ITER_INTENT); |
93 | k = bch2_btree_iter_peek_slot(&iter); |
94 | ret = bkey_err(k); |
95 | if (ret) |
96 | goto err; |
97 | |
98 | v = k.k->type == KEY_TYPE_cookie |
99 | ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) |
100 | : 0; |
101 | |
102 | if (v == cookie) |
103 | ret = bch2_btree_delete_at(trans, &iter, 0); |
104 | err: |
105 | bch2_trans_iter_exit(trans, &iter); |
106 | return ret; |
107 | } |
108 | |
109 | static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, |
110 | struct btree_iter *work_iter) |
111 | { |
112 | return !kthread_should_stop() |
113 | ? bch2_btree_iter_peek(iter: work_iter) |
114 | : bkey_s_c_null; |
115 | } |
116 | |
117 | static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, |
118 | struct btree_iter *iter, |
119 | struct bkey_s_c k) |
120 | { |
121 | struct bkey_i *n = bch2_bkey_make_mut(trans, iter, k: &k, flags: 0); |
122 | int ret = PTR_ERR_OR_ZERO(ptr: n); |
123 | if (ret) |
124 | return ret; |
125 | |
126 | extent_entry_drop(k: bkey_i_to_s(k: n), |
127 | entry: (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(k: n))); |
128 | return bch2_trans_commit(trans, NULL, NULL, flags: BCH_TRANS_COMMIT_no_enospc); |
129 | } |
130 | |
131 | static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, |
132 | struct bpos work_pos, |
133 | struct btree_iter *extent_iter, |
134 | struct data_update_opts *data_opts) |
135 | { |
136 | struct bch_fs *c = trans->c; |
137 | struct bkey_s_c k; |
138 | |
139 | bch2_trans_iter_exit(trans, extent_iter); |
140 | bch2_trans_iter_init(trans, iter: extent_iter, |
141 | btree_id: work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, |
142 | pos: work_pos, |
143 | flags: BTREE_ITER_ALL_SNAPSHOTS); |
144 | k = bch2_btree_iter_peek_slot(extent_iter); |
145 | if (bkey_err(k)) |
146 | return k; |
147 | |
148 | const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; |
149 | if (!r) { |
150 | /* raced due to btree write buffer, nothing to do */ |
151 | return bkey_s_c_null; |
152 | } |
153 | |
154 | memset(data_opts, 0, sizeof(*data_opts)); |
155 | |
156 | data_opts->rewrite_ptrs = |
157 | bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); |
158 | data_opts->target = r->target; |
159 | |
160 | if (!data_opts->rewrite_ptrs) { |
161 | /* |
162 | * device we would want to write to offline? devices in target |
163 | * changed? |
164 | * |
165 | * We'll now need a full scan before this extent is picked up |
166 | * again: |
167 | */ |
168 | int ret = bch2_bkey_clear_needs_rebalance(trans, iter: extent_iter, k); |
169 | if (ret) |
170 | return bkey_s_c_err(ret); |
171 | return bkey_s_c_null; |
172 | } |
173 | |
174 | if (trace_rebalance_extent_enabled()) { |
175 | struct printbuf buf = PRINTBUF; |
176 | |
177 | prt_str(out: &buf, str: "target=" ); |
178 | bch2_target_to_text(out: &buf, c, r->target); |
179 | prt_str(out: &buf, str: " compression=" ); |
180 | bch2_compression_opt_to_text(&buf, r->compression); |
181 | prt_str(out: &buf, str: " " ); |
182 | bch2_bkey_val_to_text(&buf, c, k); |
183 | |
184 | trace_rebalance_extent(c, str: buf.buf); |
185 | printbuf_exit(&buf); |
186 | } |
187 | |
188 | return k; |
189 | } |
190 | |
191 | noinline_for_stack |
192 | static int do_rebalance_extent(struct moving_context *ctxt, |
193 | struct bpos work_pos, |
194 | struct btree_iter *extent_iter) |
195 | { |
196 | struct btree_trans *trans = ctxt->trans; |
197 | struct bch_fs *c = trans->c; |
198 | struct bch_fs_rebalance *r = &trans->c->rebalance; |
199 | struct data_update_opts data_opts; |
200 | struct bch_io_opts io_opts; |
201 | struct bkey_s_c k; |
202 | struct bkey_buf sk; |
203 | int ret; |
204 | |
205 | ctxt->stats = &r->work_stats; |
206 | r->state = BCH_REBALANCE_working; |
207 | |
208 | bch2_bkey_buf_init(s: &sk); |
209 | |
210 | ret = bkey_err(k = next_rebalance_extent(trans, work_pos, |
211 | extent_iter, &data_opts)); |
212 | if (ret || !k.k) |
213 | goto out; |
214 | |
215 | ret = bch2_move_get_io_opts_one(trans, &io_opts, k); |
216 | if (ret) |
217 | goto out; |
218 | |
219 | atomic64_add(i: k.k->size, v: &ctxt->stats->sectors_seen); |
220 | |
221 | /* |
222 | * The iterator gets unlocked by __bch2_read_extent - need to |
223 | * save a copy of @k elsewhere: |
224 | */ |
225 | bch2_bkey_buf_reassemble(s: &sk, c, k); |
226 | k = bkey_i_to_s_c(k: sk.k); |
227 | |
228 | ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); |
229 | if (ret) { |
230 | if (bch2_err_matches(ret, ENOMEM)) { |
231 | /* memory allocation failure, wait for some IO to finish */ |
232 | bch2_move_ctxt_wait_for_io(ctxt); |
233 | ret = -BCH_ERR_transaction_restart_nested; |
234 | } |
235 | |
236 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
237 | goto out; |
238 | |
239 | /* skip it and continue, XXX signal failure */ |
240 | ret = 0; |
241 | } |
242 | out: |
243 | bch2_bkey_buf_exit(s: &sk, c); |
244 | return ret; |
245 | } |
246 | |
247 | static bool rebalance_pred(struct bch_fs *c, void *arg, |
248 | struct bkey_s_c k, |
249 | struct bch_io_opts *io_opts, |
250 | struct data_update_opts *data_opts) |
251 | { |
252 | unsigned target, compression; |
253 | |
254 | if (k.k->p.inode) { |
255 | target = io_opts->background_target; |
256 | compression = background_compression(opts: *io_opts); |
257 | } else { |
258 | const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); |
259 | |
260 | target = r ? r->target : io_opts->background_target; |
261 | compression = r ? r->compression : background_compression(opts: *io_opts); |
262 | } |
263 | |
264 | data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); |
265 | data_opts->target = target; |
266 | return data_opts->rewrite_ptrs != 0; |
267 | } |
268 | |
269 | static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) |
270 | { |
271 | struct btree_trans *trans = ctxt->trans; |
272 | struct bch_fs_rebalance *r = &trans->c->rebalance; |
273 | int ret; |
274 | |
275 | bch2_move_stats_init(&r->scan_stats, "rebalance_scan" ); |
276 | ctxt->stats = &r->scan_stats; |
277 | |
278 | if (!inum) { |
279 | r->scan_start = BBPOS_MIN; |
280 | r->scan_end = BBPOS_MAX; |
281 | } else { |
282 | r->scan_start = BBPOS(btree: BTREE_ID_extents, POS(inum, 0)); |
283 | r->scan_end = BBPOS(btree: BTREE_ID_extents, POS(inum, U64_MAX)); |
284 | } |
285 | |
286 | r->state = BCH_REBALANCE_scanning; |
287 | |
288 | ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: |
289 | commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, |
290 | bch2_clear_rebalance_needs_scan(trans, inum, cookie)); |
291 | |
292 | bch2_move_stats_exit(&r->scan_stats, trans->c); |
293 | return ret; |
294 | } |
295 | |
296 | static void rebalance_wait(struct bch_fs *c) |
297 | { |
298 | struct bch_fs_rebalance *r = &c->rebalance; |
299 | struct io_clock *clock = &c->io_clock[WRITE]; |
300 | u64 now = atomic64_read(v: &clock->now); |
301 | u64 min_member_capacity = bch2_min_rw_member_capacity(c); |
302 | |
303 | if (min_member_capacity == U64_MAX) |
304 | min_member_capacity = 128 * 2048; |
305 | |
306 | r->wait_iotime_end = now + (min_member_capacity >> 6); |
307 | |
308 | if (r->state != BCH_REBALANCE_waiting) { |
309 | r->wait_iotime_start = now; |
310 | r->wait_wallclock_start = ktime_get_real_ns(); |
311 | r->state = BCH_REBALANCE_waiting; |
312 | } |
313 | |
314 | bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); |
315 | } |
316 | |
317 | static int do_rebalance(struct moving_context *ctxt) |
318 | { |
319 | struct btree_trans *trans = ctxt->trans; |
320 | struct bch_fs *c = trans->c; |
321 | struct bch_fs_rebalance *r = &c->rebalance; |
322 | struct btree_iter rebalance_work_iter, extent_iter = { NULL }; |
323 | struct bkey_s_c k; |
324 | int ret = 0; |
325 | |
326 | bch2_move_stats_init(&r->work_stats, "rebalance_work" ); |
327 | bch2_move_stats_init(&r->scan_stats, "rebalance_scan" ); |
328 | |
329 | bch2_trans_iter_init(trans, iter: &rebalance_work_iter, |
330 | btree_id: BTREE_ID_rebalance_work, POS_MIN, |
331 | flags: BTREE_ITER_ALL_SNAPSHOTS); |
332 | |
333 | while (!bch2_move_ratelimit(ctxt)) { |
334 | if (!r->enabled) { |
335 | bch2_moving_ctxt_flush_all(ctxt); |
336 | kthread_wait_freezable(r->enabled || |
337 | kthread_should_stop()); |
338 | } |
339 | |
340 | if (kthread_should_stop()) |
341 | break; |
342 | |
343 | bch2_trans_begin(trans); |
344 | |
345 | ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); |
346 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
347 | continue; |
348 | if (ret || !k.k) |
349 | break; |
350 | |
351 | ret = k.k->type == KEY_TYPE_cookie |
352 | ? do_rebalance_scan(ctxt, inum: k.k->p.inode, |
353 | le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) |
354 | : do_rebalance_extent(ctxt, work_pos: k.k->p, extent_iter: &extent_iter); |
355 | |
356 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
357 | continue; |
358 | if (ret) |
359 | break; |
360 | |
361 | bch2_btree_iter_advance(&rebalance_work_iter); |
362 | } |
363 | |
364 | bch2_trans_iter_exit(trans, &extent_iter); |
365 | bch2_trans_iter_exit(trans, &rebalance_work_iter); |
366 | bch2_move_stats_exit(&r->scan_stats, c); |
367 | |
368 | if (!ret && |
369 | !kthread_should_stop() && |
370 | !atomic64_read(v: &r->work_stats.sectors_seen) && |
371 | !atomic64_read(v: &r->scan_stats.sectors_seen)) { |
372 | bch2_moving_ctxt_flush_all(ctxt); |
373 | bch2_trans_unlock_long(trans); |
374 | rebalance_wait(c); |
375 | } |
376 | |
377 | if (!bch2_err_matches(ret, EROFS)) |
378 | bch_err_fn(c, ret); |
379 | return ret; |
380 | } |
381 | |
382 | static int bch2_rebalance_thread(void *arg) |
383 | { |
384 | struct bch_fs *c = arg; |
385 | struct bch_fs_rebalance *r = &c->rebalance; |
386 | struct moving_context ctxt; |
387 | |
388 | set_freezable(); |
389 | |
390 | bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, |
391 | writepoint_ptr(wp: &c->rebalance_write_point), |
392 | true); |
393 | |
394 | while (!kthread_should_stop() && !do_rebalance(ctxt: &ctxt)) |
395 | ; |
396 | |
397 | bch2_moving_ctxt_exit(&ctxt); |
398 | |
399 | return 0; |
400 | } |
401 | |
402 | void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) |
403 | { |
404 | struct bch_fs_rebalance *r = &c->rebalance; |
405 | |
406 | prt_str(out, str: bch2_rebalance_state_strs[r->state]); |
407 | prt_newline(out); |
408 | printbuf_indent_add(out, 2); |
409 | |
410 | switch (r->state) { |
411 | case BCH_REBALANCE_waiting: { |
412 | u64 now = atomic64_read(v: &c->io_clock[WRITE].now); |
413 | |
414 | prt_str(out, str: "io wait duration: " ); |
415 | bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); |
416 | prt_newline(out); |
417 | |
418 | prt_str(out, str: "io wait remaining: " ); |
419 | bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); |
420 | prt_newline(out); |
421 | |
422 | prt_str(out, str: "duration waited: " ); |
423 | bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); |
424 | prt_newline(out); |
425 | break; |
426 | } |
427 | case BCH_REBALANCE_working: |
428 | bch2_move_stats_to_text(out, &r->work_stats); |
429 | break; |
430 | case BCH_REBALANCE_scanning: |
431 | bch2_move_stats_to_text(out, &r->scan_stats); |
432 | break; |
433 | } |
434 | prt_newline(out); |
435 | printbuf_indent_sub(out, 2); |
436 | } |
437 | |
438 | void bch2_rebalance_stop(struct bch_fs *c) |
439 | { |
440 | struct task_struct *p; |
441 | |
442 | c->rebalance.pd.rate.rate = UINT_MAX; |
443 | bch2_ratelimit_reset(d: &c->rebalance.pd.rate); |
444 | |
445 | p = rcu_dereference_protected(c->rebalance.thread, 1); |
446 | c->rebalance.thread = NULL; |
447 | |
448 | if (p) { |
449 | /* for sychronizing with rebalance_wakeup() */ |
450 | synchronize_rcu(); |
451 | |
452 | kthread_stop(k: p); |
453 | put_task_struct(t: p); |
454 | } |
455 | } |
456 | |
457 | int bch2_rebalance_start(struct bch_fs *c) |
458 | { |
459 | struct task_struct *p; |
460 | int ret; |
461 | |
462 | if (c->rebalance.thread) |
463 | return 0; |
464 | |
465 | if (c->opts.nochanges) |
466 | return 0; |
467 | |
468 | p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s" , c->name); |
469 | ret = PTR_ERR_OR_ZERO(ptr: p); |
470 | bch_err_msg(c, ret, "creating rebalance thread" ); |
471 | if (ret) |
472 | return ret; |
473 | |
474 | get_task_struct(t: p); |
475 | rcu_assign_pointer(c->rebalance.thread, p); |
476 | wake_up_process(tsk: p); |
477 | return 0; |
478 | } |
479 | |
480 | void bch2_fs_rebalance_init(struct bch_fs *c) |
481 | { |
482 | bch2_pd_controller_init(&c->rebalance.pd); |
483 | } |
484 | |