1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include "bcachefs.h" |
4 | #include "buckets.h" |
5 | #include "journal.h" |
6 | #include "replicas.h" |
7 | #include "super-io.h" |
8 | |
9 | #include <linux/sort.h> |
10 | |
/* Forward declaration; the definition appears further down in this file. */
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r);
13 | |
/*
 * Comparison callback wrapping memcmp(): some (buggy!) compilers don't allow
 * memcmp itself to be passed as a function pointer.
 *
 * @priv is never dereferenced; its pointer value carries the number of bytes
 * to compare.
 */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	return memcmp(l, r, (size_t) priv);
}
20 | |
21 | /* Replicas tracking - in memory: */ |
22 | |
/*
 * Debug-only sanity checks on a replicas entry; the body is compiled out
 * unless CONFIG_BCACHEFS_DEBUG is defined.
 *
 * Checks that the data type is in range, that the entry has at least one
 * device, that nr_required (when greater than 1) is smaller than nr_devs, and
 * that devs[] is strictly increasing.
 */
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	/* Compare each device index against its predecessor. */
	for (unsigned cur = 1; cur < e->nr_devs; cur++)
		BUG_ON(e->devs[cur - 1] >= e->devs[cur]);
#endif
}
37 | |
38 | void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) |
39 | { |
40 | bubble_sort(e->devs, e->nr_devs, u8_cmp); |
41 | } |
42 | |
43 | static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) |
44 | { |
45 | eytzinger0_sort_r(r->entries, r->nr, r->entry_size, |
46 | bch2_memcmp, NULL, (void *)(size_t)r->entry_size); |
47 | } |
48 | |
49 | static void bch2_replicas_entry_v0_to_text(struct printbuf *out, |
50 | struct bch_replicas_entry_v0 *e) |
51 | { |
52 | bch2_prt_data_type(out, e->data_type); |
53 | |
54 | prt_printf(out, ": %u [" , e->nr_devs); |
55 | for (unsigned i = 0; i < e->nr_devs; i++) |
56 | prt_printf(out, i ? " %u" : "%u" , e->devs[i]); |
57 | prt_printf(out, "]" ); |
58 | } |
59 | |
60 | void bch2_replicas_entry_to_text(struct printbuf *out, |
61 | struct bch_replicas_entry_v1 *e) |
62 | { |
63 | bch2_prt_data_type(out, e->data_type); |
64 | |
65 | prt_printf(out, ": %u/%u [" , e->nr_required, e->nr_devs); |
66 | for (unsigned i = 0; i < e->nr_devs; i++) |
67 | prt_printf(out, i ? " %u" : "%u" , e->devs[i]); |
68 | prt_printf(out, "]" ); |
69 | } |
70 | |
71 | int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, |
72 | struct bch_sb *sb, |
73 | struct printbuf *err) |
74 | { |
75 | if (!r->nr_devs) { |
76 | prt_printf(err, "no devices in entry " ); |
77 | goto bad; |
78 | } |
79 | |
80 | if (r->nr_required > 1 && |
81 | r->nr_required >= r->nr_devs) { |
82 | prt_printf(err, "bad nr_required in entry " ); |
83 | goto bad; |
84 | } |
85 | |
86 | for (unsigned i = 0; i < r->nr_devs; i++) |
87 | if (!bch2_dev_exists(sb, dev: r->devs[i])) { |
88 | prt_printf(err, "invalid device %u in entry " , r->devs[i]); |
89 | goto bad; |
90 | } |
91 | |
92 | return 0; |
93 | bad: |
94 | bch2_replicas_entry_to_text(out: err, e: r); |
95 | return -BCH_ERR_invalid_replicas_entry; |
96 | } |
97 | |
98 | void bch2_cpu_replicas_to_text(struct printbuf *out, |
99 | struct bch_replicas_cpu *r) |
100 | { |
101 | struct bch_replicas_entry_v1 *e; |
102 | bool first = true; |
103 | |
104 | for_each_cpu_replicas_entry(r, e) { |
105 | if (!first) |
106 | prt_printf(out, " " ); |
107 | first = false; |
108 | |
109 | bch2_replicas_entry_to_text(out, e); |
110 | } |
111 | } |
112 | |
113 | static void extent_to_replicas(struct bkey_s_c k, |
114 | struct bch_replicas_entry_v1 *r) |
115 | { |
116 | struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); |
117 | const union bch_extent_entry *entry; |
118 | struct extent_ptr_decoded p; |
119 | |
120 | r->nr_required = 1; |
121 | |
122 | bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { |
123 | if (p.ptr.cached) |
124 | continue; |
125 | |
126 | if (!p.has_ec) |
127 | r->devs[r->nr_devs++] = p.ptr.dev; |
128 | else |
129 | r->nr_required = 0; |
130 | } |
131 | } |
132 | |
133 | static void stripe_to_replicas(struct bkey_s_c k, |
134 | struct bch_replicas_entry_v1 *r) |
135 | { |
136 | struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); |
137 | const struct bch_extent_ptr *ptr; |
138 | |
139 | r->nr_required = s.v->nr_blocks - s.v->nr_redundant; |
140 | |
141 | for (ptr = s.v->ptrs; |
142 | ptr < s.v->ptrs + s.v->nr_blocks; |
143 | ptr++) |
144 | r->devs[r->nr_devs++] = ptr->dev; |
145 | } |
146 | |
147 | void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, |
148 | struct bkey_s_c k) |
149 | { |
150 | e->nr_devs = 0; |
151 | |
152 | switch (k.k->type) { |
153 | case KEY_TYPE_btree_ptr: |
154 | case KEY_TYPE_btree_ptr_v2: |
155 | e->data_type = BCH_DATA_btree; |
156 | extent_to_replicas(k, r: e); |
157 | break; |
158 | case KEY_TYPE_extent: |
159 | case KEY_TYPE_reflink_v: |
160 | e->data_type = BCH_DATA_user; |
161 | extent_to_replicas(k, r: e); |
162 | break; |
163 | case KEY_TYPE_stripe: |
164 | e->data_type = BCH_DATA_parity; |
165 | stripe_to_replicas(k, r: e); |
166 | break; |
167 | } |
168 | |
169 | bch2_replicas_entry_sort(e); |
170 | } |
171 | |
172 | void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, |
173 | enum bch_data_type data_type, |
174 | struct bch_devs_list devs) |
175 | { |
176 | BUG_ON(!data_type || |
177 | data_type == BCH_DATA_sb || |
178 | data_type >= BCH_DATA_NR); |
179 | |
180 | e->data_type = data_type; |
181 | e->nr_devs = 0; |
182 | e->nr_required = 1; |
183 | |
184 | darray_for_each(devs, i) |
185 | e->devs[e->nr_devs++] = *i; |
186 | |
187 | bch2_replicas_entry_sort(e); |
188 | } |
189 | |
190 | static struct bch_replicas_cpu |
191 | cpu_replicas_add_entry(struct bch_fs *c, |
192 | struct bch_replicas_cpu *old, |
193 | struct bch_replicas_entry_v1 *new_entry) |
194 | { |
195 | unsigned i; |
196 | struct bch_replicas_cpu new = { |
197 | .nr = old->nr + 1, |
198 | .entry_size = max_t(unsigned, old->entry_size, |
199 | replicas_entry_bytes(new_entry)), |
200 | }; |
201 | |
202 | for (i = 0; i < new_entry->nr_devs; i++) |
203 | BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); |
204 | |
205 | BUG_ON(!new_entry->data_type); |
206 | verify_replicas_entry(e: new_entry); |
207 | |
208 | new.entries = kcalloc(n: new.nr, size: new.entry_size, GFP_KERNEL); |
209 | if (!new.entries) |
210 | return new; |
211 | |
212 | for (i = 0; i < old->nr; i++) |
213 | memcpy(cpu_replicas_entry(&new, i), |
214 | cpu_replicas_entry(old, i), |
215 | old->entry_size); |
216 | |
217 | memcpy(cpu_replicas_entry(&new, old->nr), |
218 | new_entry, |
219 | replicas_entry_bytes(new_entry)); |
220 | |
221 | bch2_cpu_replicas_sort(r: &new); |
222 | return new; |
223 | } |
224 | |
225 | static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, |
226 | struct bch_replicas_entry_v1 *search) |
227 | { |
228 | int idx, entry_size = replicas_entry_bytes(search); |
229 | |
230 | if (unlikely(entry_size > r->entry_size)) |
231 | return -1; |
232 | |
233 | verify_replicas_entry(e: search); |
234 | |
235 | #define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) |
236 | idx = eytzinger0_find(r->entries, r->nr, r->entry_size, |
237 | entry_cmp, search); |
238 | #undef entry_cmp |
239 | |
240 | return idx < r->nr ? idx : -1; |
241 | } |
242 | |
243 | int bch2_replicas_entry_idx(struct bch_fs *c, |
244 | struct bch_replicas_entry_v1 *search) |
245 | { |
246 | bch2_replicas_entry_sort(e: search); |
247 | |
248 | return __replicas_entry_idx(r: &c->replicas, search); |
249 | } |
250 | |
/* Return true if table @r contains an entry matching @search. */
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	int idx = __replicas_entry_idx(r, search);

	return idx >= 0;
}
256 | |
257 | bool bch2_replicas_marked(struct bch_fs *c, |
258 | struct bch_replicas_entry_v1 *search) |
259 | { |
260 | bool marked; |
261 | |
262 | if (!search->nr_devs) |
263 | return true; |
264 | |
265 | verify_replicas_entry(e: search); |
266 | |
267 | percpu_down_read(sem: &c->mark_lock); |
268 | marked = __replicas_has_entry(r: &c->replicas, search) && |
269 | (likely((!c->replicas_gc.entries)) || |
270 | __replicas_has_entry(r: &c->replicas_gc, search)); |
271 | percpu_up_read(sem: &c->mark_lock); |
272 | |
273 | return marked; |
274 | } |
275 | |
276 | static void __replicas_table_update(struct bch_fs_usage *dst, |
277 | struct bch_replicas_cpu *dst_r, |
278 | struct bch_fs_usage *src, |
279 | struct bch_replicas_cpu *src_r) |
280 | { |
281 | int src_idx, dst_idx; |
282 | |
283 | *dst = *src; |
284 | |
285 | for (src_idx = 0; src_idx < src_r->nr; src_idx++) { |
286 | if (!src->replicas[src_idx]) |
287 | continue; |
288 | |
289 | dst_idx = __replicas_entry_idx(r: dst_r, |
290 | search: cpu_replicas_entry(r: src_r, i: src_idx)); |
291 | BUG_ON(dst_idx < 0); |
292 | |
293 | dst->replicas[dst_idx] = src->replicas[src_idx]; |
294 | } |
295 | } |
296 | |
297 | static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, |
298 | struct bch_replicas_cpu *dst_r, |
299 | struct bch_fs_usage __percpu *src_p, |
300 | struct bch_replicas_cpu *src_r) |
301 | { |
302 | unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; |
303 | struct bch_fs_usage *dst, *src = (void *) |
304 | bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); |
305 | |
306 | preempt_disable(); |
307 | dst = this_cpu_ptr(dst_p); |
308 | preempt_enable(); |
309 | |
310 | __replicas_table_update(dst, dst_r, src, src_r); |
311 | } |
312 | |
313 | /* |
314 | * Resize filesystem accounting: |
315 | */ |
316 | static int replicas_table_update(struct bch_fs *c, |
317 | struct bch_replicas_cpu *new_r) |
318 | { |
319 | struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; |
320 | struct bch_fs_usage_online *new_scratch = NULL; |
321 | struct bch_fs_usage __percpu *new_gc = NULL; |
322 | struct bch_fs_usage *new_base = NULL; |
323 | unsigned i, bytes = sizeof(struct bch_fs_usage) + |
324 | sizeof(u64) * new_r->nr; |
325 | unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + |
326 | sizeof(u64) * new_r->nr; |
327 | int ret = 0; |
328 | |
329 | memset(new_usage, 0, sizeof(new_usage)); |
330 | |
331 | for (i = 0; i < ARRAY_SIZE(new_usage); i++) |
332 | if (!(new_usage[i] = __alloc_percpu_gfp(size: bytes, |
333 | align: sizeof(u64), GFP_KERNEL))) |
334 | goto err; |
335 | |
336 | if (!(new_base = kzalloc(size: bytes, GFP_KERNEL)) || |
337 | !(new_scratch = kmalloc(size: scratch_bytes, GFP_KERNEL)) || |
338 | (c->usage_gc && |
339 | !(new_gc = __alloc_percpu_gfp(size: bytes, align: sizeof(u64), GFP_KERNEL)))) |
340 | goto err; |
341 | |
342 | for (i = 0; i < ARRAY_SIZE(new_usage); i++) |
343 | if (c->usage[i]) |
344 | __replicas_table_update_pcpu(dst_p: new_usage[i], dst_r: new_r, |
345 | src_p: c->usage[i], src_r: &c->replicas); |
346 | if (c->usage_base) |
347 | __replicas_table_update(dst: new_base, dst_r: new_r, |
348 | src: c->usage_base, src_r: &c->replicas); |
349 | if (c->usage_gc) |
350 | __replicas_table_update_pcpu(dst_p: new_gc, dst_r: new_r, |
351 | src_p: c->usage_gc, src_r: &c->replicas); |
352 | |
353 | for (i = 0; i < ARRAY_SIZE(new_usage); i++) |
354 | swap(c->usage[i], new_usage[i]); |
355 | swap(c->usage_base, new_base); |
356 | swap(c->usage_scratch, new_scratch); |
357 | swap(c->usage_gc, new_gc); |
358 | swap(c->replicas, *new_r); |
359 | out: |
360 | free_percpu(pdata: new_gc); |
361 | kfree(objp: new_scratch); |
362 | for (i = 0; i < ARRAY_SIZE(new_usage); i++) |
363 | free_percpu(pdata: new_usage[i]); |
364 | kfree(objp: new_base); |
365 | return ret; |
366 | err: |
367 | bch_err(c, "error updating replicas table: memory allocation failure" ); |
368 | ret = -BCH_ERR_ENOMEM_replicas_table; |
369 | goto out; |
370 | } |
371 | |
372 | static unsigned reserve_journal_replicas(struct bch_fs *c, |
373 | struct bch_replicas_cpu *r) |
374 | { |
375 | struct bch_replicas_entry_v1 *e; |
376 | unsigned journal_res_u64s = 0; |
377 | |
378 | /* nr_inodes: */ |
379 | journal_res_u64s += |
380 | DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); |
381 | |
382 | /* key_version: */ |
383 | journal_res_u64s += |
384 | DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); |
385 | |
386 | /* persistent_reserved: */ |
387 | journal_res_u64s += |
388 | DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * |
389 | BCH_REPLICAS_MAX; |
390 | |
391 | for_each_cpu_replicas_entry(r, e) |
392 | journal_res_u64s += |
393 | DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + |
394 | e->nr_devs, sizeof(u64)); |
395 | return journal_res_u64s; |
396 | } |
397 | |
398 | noinline |
399 | static int bch2_mark_replicas_slowpath(struct bch_fs *c, |
400 | struct bch_replicas_entry_v1 *new_entry) |
401 | { |
402 | struct bch_replicas_cpu new_r, new_gc; |
403 | int ret = 0; |
404 | |
405 | verify_replicas_entry(e: new_entry); |
406 | |
407 | memset(&new_r, 0, sizeof(new_r)); |
408 | memset(&new_gc, 0, sizeof(new_gc)); |
409 | |
410 | mutex_lock(&c->sb_lock); |
411 | |
412 | if (c->replicas_gc.entries && |
413 | !__replicas_has_entry(r: &c->replicas_gc, search: new_entry)) { |
414 | new_gc = cpu_replicas_add_entry(c, old: &c->replicas_gc, new_entry); |
415 | if (!new_gc.entries) { |
416 | ret = -BCH_ERR_ENOMEM_cpu_replicas; |
417 | goto err; |
418 | } |
419 | } |
420 | |
421 | if (!__replicas_has_entry(r: &c->replicas, search: new_entry)) { |
422 | new_r = cpu_replicas_add_entry(c, old: &c->replicas, new_entry); |
423 | if (!new_r.entries) { |
424 | ret = -BCH_ERR_ENOMEM_cpu_replicas; |
425 | goto err; |
426 | } |
427 | |
428 | ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); |
429 | if (ret) |
430 | goto err; |
431 | |
432 | bch2_journal_entry_res_resize(&c->journal, |
433 | &c->replicas_journal_res, |
434 | reserve_journal_replicas(c, r: &new_r)); |
435 | } |
436 | |
437 | if (!new_r.entries && |
438 | !new_gc.entries) |
439 | goto out; |
440 | |
441 | /* allocations done, now commit: */ |
442 | |
443 | if (new_r.entries) |
444 | bch2_write_super(c); |
445 | |
446 | /* don't update in memory replicas until changes are persistent */ |
447 | percpu_down_write(&c->mark_lock); |
448 | if (new_r.entries) |
449 | ret = replicas_table_update(c, new_r: &new_r); |
450 | if (new_gc.entries) |
451 | swap(new_gc, c->replicas_gc); |
452 | percpu_up_write(&c->mark_lock); |
453 | out: |
454 | mutex_unlock(lock: &c->sb_lock); |
455 | |
456 | kfree(objp: new_r.entries); |
457 | kfree(objp: new_gc.entries); |
458 | |
459 | return ret; |
460 | err: |
461 | bch_err_msg(c, ret, "adding replicas entry" ); |
462 | goto out; |
463 | } |
464 | |
/*
 * Ensure replicas entry @r is recorded. Returns 0 immediately if it is
 * already marked; otherwise takes the slow path and returns its result.
 */
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	if (likely(bch2_replicas_marked(c, r)))
		return 0;

	return bch2_mark_replicas_slowpath(c, r);
}
470 | |
471 | /* replicas delta list: */ |
472 | |
473 | int bch2_replicas_delta_list_mark(struct bch_fs *c, |
474 | struct replicas_delta_list *r) |
475 | { |
476 | struct replicas_delta *d = r->d; |
477 | struct replicas_delta *top = (void *) r->d + r->used; |
478 | int ret = 0; |
479 | |
480 | for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) |
481 | ret = bch2_mark_replicas(c, r: &d->r); |
482 | return ret; |
483 | } |
484 | |
485 | /* |
486 | * Old replicas_gc mechanism: only used for journal replicas entries now, should |
487 | * die at some point: |
488 | */ |
489 | |
490 | int bch2_replicas_gc_end(struct bch_fs *c, int ret) |
491 | { |
492 | lockdep_assert_held(&c->replicas_gc_lock); |
493 | |
494 | mutex_lock(&c->sb_lock); |
495 | percpu_down_write(&c->mark_lock); |
496 | |
497 | ret = ret ?: |
498 | bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: |
499 | replicas_table_update(c, new_r: &c->replicas_gc); |
500 | |
501 | kfree(objp: c->replicas_gc.entries); |
502 | c->replicas_gc.entries = NULL; |
503 | |
504 | percpu_up_write(&c->mark_lock); |
505 | |
506 | if (!ret) |
507 | bch2_write_super(c); |
508 | |
509 | mutex_unlock(lock: &c->sb_lock); |
510 | |
511 | return ret; |
512 | } |
513 | |
514 | int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) |
515 | { |
516 | struct bch_replicas_entry_v1 *e; |
517 | unsigned i = 0; |
518 | |
519 | lockdep_assert_held(&c->replicas_gc_lock); |
520 | |
521 | mutex_lock(&c->sb_lock); |
522 | BUG_ON(c->replicas_gc.entries); |
523 | |
524 | c->replicas_gc.nr = 0; |
525 | c->replicas_gc.entry_size = 0; |
526 | |
527 | for_each_cpu_replicas_entry(&c->replicas, e) |
528 | if (!((1 << e->data_type) & typemask)) { |
529 | c->replicas_gc.nr++; |
530 | c->replicas_gc.entry_size = |
531 | max_t(unsigned, c->replicas_gc.entry_size, |
532 | replicas_entry_bytes(e)); |
533 | } |
534 | |
535 | c->replicas_gc.entries = kcalloc(n: c->replicas_gc.nr, |
536 | size: c->replicas_gc.entry_size, |
537 | GFP_KERNEL); |
538 | if (!c->replicas_gc.entries) { |
539 | mutex_unlock(lock: &c->sb_lock); |
540 | bch_err(c, "error allocating c->replicas_gc" ); |
541 | return -BCH_ERR_ENOMEM_replicas_gc; |
542 | } |
543 | |
544 | for_each_cpu_replicas_entry(&c->replicas, e) |
545 | if (!((1 << e->data_type) & typemask)) |
546 | memcpy(cpu_replicas_entry(&c->replicas_gc, i++), |
547 | e, c->replicas_gc.entry_size); |
548 | |
549 | bch2_cpu_replicas_sort(r: &c->replicas_gc); |
550 | mutex_unlock(lock: &c->sb_lock); |
551 | |
552 | return 0; |
553 | } |
554 | |
555 | /* |
556 | * New much simpler mechanism for clearing out unneeded replicas entries - drop |
557 | * replicas entries that have 0 sectors used. |
558 | * |
559 | * However, we don't track sector counts for journal usage, so this doesn't drop |
560 | * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism |
561 | * is retained for that. |
562 | */ |
563 | int bch2_replicas_gc2(struct bch_fs *c) |
564 | { |
565 | struct bch_replicas_cpu new = { 0 }; |
566 | unsigned i, nr; |
567 | int ret = 0; |
568 | |
569 | bch2_journal_meta(&c->journal); |
570 | retry: |
571 | nr = READ_ONCE(c->replicas.nr); |
572 | new.entry_size = READ_ONCE(c->replicas.entry_size); |
573 | new.entries = kcalloc(n: nr, size: new.entry_size, GFP_KERNEL); |
574 | if (!new.entries) { |
575 | bch_err(c, "error allocating c->replicas_gc" ); |
576 | return -BCH_ERR_ENOMEM_replicas_gc; |
577 | } |
578 | |
579 | mutex_lock(&c->sb_lock); |
580 | percpu_down_write(&c->mark_lock); |
581 | |
582 | if (nr != c->replicas.nr || |
583 | new.entry_size != c->replicas.entry_size) { |
584 | percpu_up_write(&c->mark_lock); |
585 | mutex_unlock(lock: &c->sb_lock); |
586 | kfree(objp: new.entries); |
587 | goto retry; |
588 | } |
589 | |
590 | for (i = 0; i < c->replicas.nr; i++) { |
591 | struct bch_replicas_entry_v1 *e = |
592 | cpu_replicas_entry(r: &c->replicas, i); |
593 | |
594 | if (e->data_type == BCH_DATA_journal || |
595 | c->usage_base->replicas[i] || |
596 | percpu_u64_get(src: &c->usage[0]->replicas[i]) || |
597 | percpu_u64_get(src: &c->usage[1]->replicas[i]) || |
598 | percpu_u64_get(src: &c->usage[2]->replicas[i]) || |
599 | percpu_u64_get(src: &c->usage[3]->replicas[i])) |
600 | memcpy(cpu_replicas_entry(&new, new.nr++), |
601 | e, new.entry_size); |
602 | } |
603 | |
604 | bch2_cpu_replicas_sort(r: &new); |
605 | |
606 | ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: |
607 | replicas_table_update(c, new_r: &new); |
608 | |
609 | kfree(objp: new.entries); |
610 | |
611 | percpu_up_write(&c->mark_lock); |
612 | |
613 | if (!ret) |
614 | bch2_write_super(c); |
615 | |
616 | mutex_unlock(lock: &c->sb_lock); |
617 | |
618 | return ret; |
619 | } |
620 | |
621 | int bch2_replicas_set_usage(struct bch_fs *c, |
622 | struct bch_replicas_entry_v1 *r, |
623 | u64 sectors) |
624 | { |
625 | int ret, idx = bch2_replicas_entry_idx(c, search: r); |
626 | |
627 | if (idx < 0) { |
628 | struct bch_replicas_cpu n; |
629 | |
630 | n = cpu_replicas_add_entry(c, old: &c->replicas, new_entry: r); |
631 | if (!n.entries) |
632 | return -BCH_ERR_ENOMEM_cpu_replicas; |
633 | |
634 | ret = replicas_table_update(c, new_r: &n); |
635 | if (ret) |
636 | return ret; |
637 | |
638 | kfree(objp: n.entries); |
639 | |
640 | idx = bch2_replicas_entry_idx(c, search: r); |
641 | BUG_ON(ret < 0); |
642 | } |
643 | |
644 | c->usage_base->replicas[idx] = sectors; |
645 | |
646 | return 0; |
647 | } |
648 | |
649 | /* Replicas tracking - superblock: */ |
650 | |
651 | static int |
652 | __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, |
653 | struct bch_replicas_cpu *cpu_r) |
654 | { |
655 | struct bch_replicas_entry_v1 *e, *dst; |
656 | unsigned nr = 0, entry_size = 0, idx = 0; |
657 | |
658 | for_each_replicas_entry(sb_r, e) { |
659 | entry_size = max_t(unsigned, entry_size, |
660 | replicas_entry_bytes(e)); |
661 | nr++; |
662 | } |
663 | |
664 | cpu_r->entries = kcalloc(n: nr, size: entry_size, GFP_KERNEL); |
665 | if (!cpu_r->entries) |
666 | return -BCH_ERR_ENOMEM_cpu_replicas; |
667 | |
668 | cpu_r->nr = nr; |
669 | cpu_r->entry_size = entry_size; |
670 | |
671 | for_each_replicas_entry(sb_r, e) { |
672 | dst = cpu_replicas_entry(r: cpu_r, i: idx++); |
673 | memcpy(dst, e, replicas_entry_bytes(e)); |
674 | bch2_replicas_entry_sort(e: dst); |
675 | } |
676 | |
677 | return 0; |
678 | } |
679 | |
680 | static int |
681 | __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, |
682 | struct bch_replicas_cpu *cpu_r) |
683 | { |
684 | struct bch_replicas_entry_v0 *e; |
685 | unsigned nr = 0, entry_size = 0, idx = 0; |
686 | |
687 | for_each_replicas_entry(sb_r, e) { |
688 | entry_size = max_t(unsigned, entry_size, |
689 | replicas_entry_bytes(e)); |
690 | nr++; |
691 | } |
692 | |
693 | entry_size += sizeof(struct bch_replicas_entry_v1) - |
694 | sizeof(struct bch_replicas_entry_v0); |
695 | |
696 | cpu_r->entries = kcalloc(n: nr, size: entry_size, GFP_KERNEL); |
697 | if (!cpu_r->entries) |
698 | return -BCH_ERR_ENOMEM_cpu_replicas; |
699 | |
700 | cpu_r->nr = nr; |
701 | cpu_r->entry_size = entry_size; |
702 | |
703 | for_each_replicas_entry(sb_r, e) { |
704 | struct bch_replicas_entry_v1 *dst = |
705 | cpu_replicas_entry(r: cpu_r, i: idx++); |
706 | |
707 | dst->data_type = e->data_type; |
708 | dst->nr_devs = e->nr_devs; |
709 | dst->nr_required = 1; |
710 | memcpy(dst->devs, e->devs, e->nr_devs); |
711 | bch2_replicas_entry_sort(e: dst); |
712 | } |
713 | |
714 | return 0; |
715 | } |
716 | |
717 | int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) |
718 | { |
719 | struct bch_sb_field_replicas *sb_v1; |
720 | struct bch_sb_field_replicas_v0 *sb_v0; |
721 | struct bch_replicas_cpu new_r = { 0, 0, NULL }; |
722 | int ret = 0; |
723 | |
724 | if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) |
725 | ret = __bch2_sb_replicas_to_cpu_replicas(sb_r: sb_v1, cpu_r: &new_r); |
726 | else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) |
727 | ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r: sb_v0, cpu_r: &new_r); |
728 | if (ret) |
729 | return ret; |
730 | |
731 | bch2_cpu_replicas_sort(r: &new_r); |
732 | |
733 | percpu_down_write(&c->mark_lock); |
734 | |
735 | ret = replicas_table_update(c, new_r: &new_r); |
736 | percpu_up_write(&c->mark_lock); |
737 | |
738 | kfree(objp: new_r.entries); |
739 | |
740 | return 0; |
741 | } |
742 | |
743 | static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, |
744 | struct bch_replicas_cpu *r) |
745 | { |
746 | struct bch_sb_field_replicas_v0 *sb_r; |
747 | struct bch_replicas_entry_v0 *dst; |
748 | struct bch_replicas_entry_v1 *src; |
749 | size_t bytes; |
750 | |
751 | bytes = sizeof(struct bch_sb_field_replicas); |
752 | |
753 | for_each_cpu_replicas_entry(r, src) |
754 | bytes += replicas_entry_bytes(src) - 1; |
755 | |
756 | sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, |
757 | DIV_ROUND_UP(bytes, sizeof(u64))); |
758 | if (!sb_r) |
759 | return -BCH_ERR_ENOSPC_sb_replicas; |
760 | |
761 | bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); |
762 | sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); |
763 | |
764 | memset(&sb_r->entries, 0, |
765 | vstruct_end(&sb_r->field) - |
766 | (void *) &sb_r->entries); |
767 | |
768 | dst = sb_r->entries; |
769 | for_each_cpu_replicas_entry(r, src) { |
770 | dst->data_type = src->data_type; |
771 | dst->nr_devs = src->nr_devs; |
772 | memcpy(dst->devs, src->devs, src->nr_devs); |
773 | |
774 | dst = replicas_entry_next(dst); |
775 | |
776 | BUG_ON((void *) dst > vstruct_end(&sb_r->field)); |
777 | } |
778 | |
779 | return 0; |
780 | } |
781 | |
782 | static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, |
783 | struct bch_replicas_cpu *r) |
784 | { |
785 | struct bch_sb_field_replicas *sb_r; |
786 | struct bch_replicas_entry_v1 *dst, *src; |
787 | bool need_v1 = false; |
788 | size_t bytes; |
789 | |
790 | bytes = sizeof(struct bch_sb_field_replicas); |
791 | |
792 | for_each_cpu_replicas_entry(r, src) { |
793 | bytes += replicas_entry_bytes(src); |
794 | if (src->nr_required != 1) |
795 | need_v1 = true; |
796 | } |
797 | |
798 | if (!need_v1) |
799 | return bch2_cpu_replicas_to_sb_replicas_v0(c, r); |
800 | |
801 | sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, |
802 | DIV_ROUND_UP(bytes, sizeof(u64))); |
803 | if (!sb_r) |
804 | return -BCH_ERR_ENOSPC_sb_replicas; |
805 | |
806 | bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); |
807 | sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); |
808 | |
809 | memset(&sb_r->entries, 0, |
810 | vstruct_end(&sb_r->field) - |
811 | (void *) &sb_r->entries); |
812 | |
813 | dst = sb_r->entries; |
814 | for_each_cpu_replicas_entry(r, src) { |
815 | memcpy(dst, src, replicas_entry_bytes(src)); |
816 | |
817 | dst = replicas_entry_next(dst); |
818 | |
819 | BUG_ON((void *) dst > vstruct_end(&sb_r->field)); |
820 | } |
821 | |
822 | return 0; |
823 | } |
824 | |
825 | static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, |
826 | struct bch_sb *sb, |
827 | struct printbuf *err) |
828 | { |
829 | unsigned i; |
830 | |
831 | sort_r(base: cpu_r->entries, |
832 | num: cpu_r->nr, |
833 | size: cpu_r->entry_size, |
834 | cmp_func: bch2_memcmp, NULL, |
835 | priv: (void *)(size_t)cpu_r->entry_size); |
836 | |
837 | for (i = 0; i < cpu_r->nr; i++) { |
838 | struct bch_replicas_entry_v1 *e = |
839 | cpu_replicas_entry(r: cpu_r, i); |
840 | |
841 | int ret = bch2_replicas_entry_validate(r: e, sb, err); |
842 | if (ret) |
843 | return ret; |
844 | |
845 | if (i + 1 < cpu_r->nr) { |
846 | struct bch_replicas_entry_v1 *n = |
847 | cpu_replicas_entry(r: cpu_r, i: i + 1); |
848 | |
849 | BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); |
850 | |
851 | if (!memcmp(p: e, q: n, size: cpu_r->entry_size)) { |
852 | prt_printf(err, "duplicate replicas entry " ); |
853 | bch2_replicas_entry_to_text(out: err, e); |
854 | return -BCH_ERR_invalid_sb_replicas; |
855 | } |
856 | } |
857 | } |
858 | |
859 | return 0; |
860 | } |
861 | |
862 | static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, |
863 | struct printbuf *err) |
864 | { |
865 | struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); |
866 | struct bch_replicas_cpu cpu_r; |
867 | int ret; |
868 | |
869 | ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, cpu_r: &cpu_r); |
870 | if (ret) |
871 | return ret; |
872 | |
873 | ret = bch2_cpu_replicas_validate(cpu_r: &cpu_r, sb, err); |
874 | kfree(objp: cpu_r.entries); |
875 | return ret; |
876 | } |
877 | |
878 | static void bch2_sb_replicas_to_text(struct printbuf *out, |
879 | struct bch_sb *sb, |
880 | struct bch_sb_field *f) |
881 | { |
882 | struct bch_sb_field_replicas *r = field_to_type(f, replicas); |
883 | struct bch_replicas_entry_v1 *e; |
884 | bool first = true; |
885 | |
886 | for_each_replicas_entry(r, e) { |
887 | if (!first) |
888 | prt_printf(out, " " ); |
889 | first = false; |
890 | |
891 | bch2_replicas_entry_to_text(out, e); |
892 | } |
893 | prt_newline(out); |
894 | } |
895 | |
896 | const struct bch_sb_field_ops bch_sb_field_ops_replicas = { |
897 | .validate = bch2_sb_replicas_validate, |
898 | .to_text = bch2_sb_replicas_to_text, |
899 | }; |
900 | |
901 | static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, |
902 | struct printbuf *err) |
903 | { |
904 | struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); |
905 | struct bch_replicas_cpu cpu_r; |
906 | int ret; |
907 | |
908 | ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, cpu_r: &cpu_r); |
909 | if (ret) |
910 | return ret; |
911 | |
912 | ret = bch2_cpu_replicas_validate(cpu_r: &cpu_r, sb, err); |
913 | kfree(objp: cpu_r.entries); |
914 | return ret; |
915 | } |
916 | |
917 | static void bch2_sb_replicas_v0_to_text(struct printbuf *out, |
918 | struct bch_sb *sb, |
919 | struct bch_sb_field *f) |
920 | { |
921 | struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); |
922 | struct bch_replicas_entry_v0 *e; |
923 | bool first = true; |
924 | |
925 | for_each_replicas_entry(sb_r, e) { |
926 | if (!first) |
927 | prt_printf(out, " " ); |
928 | first = false; |
929 | |
930 | bch2_replicas_entry_v0_to_text(out, e); |
931 | } |
932 | prt_newline(out); |
933 | } |
934 | |
935 | const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { |
936 | .validate = bch2_sb_replicas_v0_validate, |
937 | .to_text = bch2_sb_replicas_v0_to_text, |
938 | }; |
939 | |
940 | /* Query replicas: */ |
941 | |
942 | bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, |
943 | unsigned flags, bool print) |
944 | { |
945 | struct bch_replicas_entry_v1 *e; |
946 | bool ret = true; |
947 | |
948 | percpu_down_read(sem: &c->mark_lock); |
949 | for_each_cpu_replicas_entry(&c->replicas, e) { |
950 | unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; |
951 | bool metadata = e->data_type < BCH_DATA_user; |
952 | |
953 | if (e->data_type == BCH_DATA_cached) |
954 | continue; |
955 | |
956 | for (i = 0; i < e->nr_devs; i++) { |
957 | struct bch_dev *ca = bch_dev_bkey_exists(c, idx: e->devs[i]); |
958 | |
959 | nr_online += test_bit(e->devs[i], devs.d); |
960 | nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; |
961 | } |
962 | |
963 | if (nr_failed == e->nr_devs) |
964 | continue; |
965 | |
966 | if (nr_online < e->nr_required) |
967 | dflags |= metadata |
968 | ? BCH_FORCE_IF_METADATA_LOST |
969 | : BCH_FORCE_IF_DATA_LOST; |
970 | |
971 | if (nr_online < e->nr_devs) |
972 | dflags |= metadata |
973 | ? BCH_FORCE_IF_METADATA_DEGRADED |
974 | : BCH_FORCE_IF_DATA_DEGRADED; |
975 | |
976 | if (dflags & ~flags) { |
977 | if (print) { |
978 | struct printbuf buf = PRINTBUF; |
979 | |
980 | bch2_replicas_entry_to_text(out: &buf, e); |
981 | bch_err(c, "insufficient devices online (%u) for replicas entry %s" , |
982 | nr_online, buf.buf); |
983 | printbuf_exit(&buf); |
984 | } |
985 | ret = false; |
986 | break; |
987 | } |
988 | |
989 | } |
990 | percpu_up_read(sem: &c->mark_lock); |
991 | |
992 | return ret; |
993 | } |
994 | |
995 | unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) |
996 | { |
997 | struct bch_sb_field_replicas *replicas; |
998 | struct bch_sb_field_replicas_v0 *replicas_v0; |
999 | unsigned i, data_has = 0; |
1000 | |
1001 | replicas = bch2_sb_field_get(sb, replicas); |
1002 | replicas_v0 = bch2_sb_field_get(sb, replicas_v0); |
1003 | |
1004 | if (replicas) { |
1005 | struct bch_replicas_entry_v1 *r; |
1006 | |
1007 | for_each_replicas_entry(replicas, r) |
1008 | for (i = 0; i < r->nr_devs; i++) |
1009 | if (r->devs[i] == dev) |
1010 | data_has |= 1 << r->data_type; |
1011 | } else if (replicas_v0) { |
1012 | struct bch_replicas_entry_v0 *r; |
1013 | |
1014 | for_each_replicas_entry_v0(replicas_v0, r) |
1015 | for (i = 0; i < r->nr_devs; i++) |
1016 | if (r->devs[i] == dev) |
1017 | data_has |= 1 << r->data_type; |
1018 | } |
1019 | |
1020 | |
1021 | return data_has; |
1022 | } |
1023 | |
1024 | unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) |
1025 | { |
1026 | unsigned ret; |
1027 | |
1028 | mutex_lock(&c->sb_lock); |
1029 | ret = bch2_sb_dev_has_data(sb: c->disk_sb.sb, dev: ca->dev_idx); |
1030 | mutex_unlock(lock: &c->sb_lock); |
1031 | |
1032 | return ret; |
1033 | } |
1034 | |
1035 | void bch2_fs_replicas_exit(struct bch_fs *c) |
1036 | { |
1037 | unsigned i; |
1038 | |
1039 | kfree(objp: c->usage_scratch); |
1040 | for (i = 0; i < ARRAY_SIZE(c->usage); i++) |
1041 | free_percpu(pdata: c->usage[i]); |
1042 | kfree(objp: c->usage_base); |
1043 | kfree(objp: c->replicas.entries); |
1044 | kfree(objp: c->replicas_gc.entries); |
1045 | |
1046 | mempool_exit(pool: &c->replicas_delta_pool); |
1047 | } |
1048 | |
1049 | int bch2_fs_replicas_init(struct bch_fs *c) |
1050 | { |
1051 | bch2_journal_entry_res_resize(&c->journal, |
1052 | &c->replicas_journal_res, |
1053 | reserve_journal_replicas(c, r: &c->replicas)); |
1054 | |
1055 | return mempool_init_kmalloc_pool(pool: &c->replicas_delta_pool, min_nr: 1, |
1056 | REPLICAS_DELTA_LIST_MAX) ?: |
1057 | replicas_table_update(c, new_r: &c->replicas); |
1058 | } |
1059 | |