// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	size_t size = (size_t) priv;

	return memcmp(l, r, size);
}

/* Replicas tracking - in memory: */

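/*
 * A replicas entry records, for one data type, the set of devices a given
 * piece of data is replicated across. The debug checks below document the
 * invariants every entry is expected to satisfy: a known data type, at least
 * one device, a sane nr_required, and a device list sorted in strictly
 * increasing order (i.e. no duplicate devices).
 */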
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	unsigned i;

	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	for (i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

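/*
 * The in-memory replicas table is kept sorted in an eytzinger (cache-friendly
 * breadth-first) layout so that __replicas_entry_idx() below can binary search
 * it cheaply; since entries are variable length, entry_size is passed to the
 * comparison function through @priv.
 */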
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry_v1 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_sb *sb,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!bch2_dev_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

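/*
 * Build a replicas entry from an extent's pointers: cached pointers don't
 * contribute to durability and are skipped, and a pointer into an erasure
 * coded stripe means redundancy comes from the stripe itself, so nr_required
 * is dropped to 0 for that entry.
 */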
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required = 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required = s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		e->devs[e->nr_devs++] = *i;

	bch2_replicas_entry_sort(e);
}

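/*
 * Returns a newly allocated copy of @old with @new_entry added and the result
 * re-sorted; allocation failure is signalled by a table whose .entries pointer
 * is NULL. The caller owns (and must kfree) the returned table's entries.
 */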
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	unsigned i;
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	for (i = 0; i < new_entry->nr_devs; i++)
		BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i]));

	BUG_ON(!new_entry->data_type);
	verify_replicas_entry(new_entry);

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

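/*
 * Binary search the eytzinger-ordered table for @search. Entries are compared
 * as raw bytes over the search entry's length, so a search entry longer than
 * the table's entry_size can never match and we bail out early. Returns the
 * index of the match, or -1 if not present.
 */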
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

	verify_replicas_entry(search);

#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry(search);

	percpu_down_read(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(likely((!c->replicas_gc.entries)) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read(&c->mark_lock);

	return marked;
}

static void __replicas_table_update(struct bch_fs_usage *dst,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage *src,
				    struct bch_replicas_cpu *src_r)
{
	int src_idx, dst_idx;

	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
					 struct bch_replicas_cpu *dst_r,
					 struct bch_fs_usage __percpu *src_p,
					 struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	__replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting:
 */
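/*
 * Filesystem usage counters are arrays indexed by position in the replicas
 * table, so whenever the table changes, every usage array (the base counters,
 * the percpu counters for each journal buffer, the gc copy and the scratch
 * buffer) is reallocated at the new size and live counters are remapped to
 * their entries' new indices before the new table is swapped in.
 */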
static int replicas_table_update(struct bch_fs *c,
				 struct bch_replicas_cpu *new_r)
{
	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
	struct bch_fs_usage_online *new_scratch = NULL;
	struct bch_fs_usage __percpu *new_gc = NULL;
	struct bch_fs_usage *new_base = NULL;
	unsigned i, bytes = sizeof(struct bch_fs_usage) +
		sizeof(u64) * new_r->nr;
	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
		sizeof(u64) * new_r->nr;
	int ret = 0;

	memset(new_usage, 0, sizeof(new_usage));

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
					sizeof(u64), GFP_KERNEL)))
			goto err;

	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
	    !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
	    (c->usage_gc &&
	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
		goto err;

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (c->usage[i])
			__replicas_table_update_pcpu(new_usage[i], new_r,
						     c->usage[i], &c->replicas);
	if (c->usage_base)
		__replicas_table_update(new_base, new_r,
					c->usage_base, &c->replicas);
	if (c->usage_gc)
		__replicas_table_update_pcpu(new_gc, new_r,
					     c->usage_gc, &c->replicas);

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		swap(c->usage[i], new_usage[i]);
	swap(c->usage_base,	new_base);
	swap(c->usage_scratch,	new_scratch);
	swap(c->usage_gc,	new_gc);
	swap(c->replicas,	*new_r);
out:
	free_percpu(new_gc);
	kfree(new_scratch);
	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		free_percpu(new_usage[i]);
	kfree(new_base);
	return ret;
err:
	bch_err(c, "error updating replicas table: memory allocation failure");
	ret = -BCH_ERR_ENOMEM_replicas_table;
	goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
					 struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}

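/*
 * Slow path for marking a replicas entry that isn't in the table yet: under
 * sb_lock, build new in-memory tables containing the entry, encode them into
 * the superblock and grow the journal reservation, write the superblock, and
 * only then (under mark_lock) swap the new tables in - so the on-disk replicas
 * section never lags behind what the in-memory state claims is marked.
 */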
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
				  struct replicas_delta_list *r)
{
	struct replicas_delta *d = r->d;
	struct replicas_delta *top = (void *) r->d + r->used;
	int ret = 0;

	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
		ret = bch2_mark_replicas(c, &d->r);
	return ret;
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */

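/*
 * bch2_replicas_gc_start() snapshots every entry whose data type is *not* in
 * @typemask into c->replicas_gc; while gc is running, newly marked entries are
 * added to both tables (see bch2_mark_replicas_slowpath()), and
 * bch2_replicas_gc_end() then installs the gc table as the authoritative one,
 * dropping anything that was filtered out and never re-marked.
 */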
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret =	ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
		replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned i, nr;
	int ret = 0;

	bch2_journal_meta(&c->journal);
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

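	/*
	 * The table size was sampled without any locks held; if it changed
	 * while we were allocating, the buffer may be the wrong size, so drop
	 * the locks, free it and retry at the new size.
	 */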
	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		if (e->data_type == BCH_DATA_journal ||
		    c->usage_base->replicas[i] ||
		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
		    percpu_u64_get(&c->usage[3]->replicas[i]))
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret =	bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
		replicas_table_update(c, &new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *r,
			    u64 sectors)
{
	int ret, idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0) {
		struct bch_replicas_cpu n;

		n = cpu_replicas_add_entry(c, &c->replicas, r);
		if (!n.entries)
			return -BCH_ERR_ENOMEM_cpu_replicas;

		ret = replicas_table_update(c, &n);
		if (ret)
			return ret;

		kfree(n.entries);

		idx = bch2_replicas_entry_idx(c, r);
		BUG_ON(idx < 0);
	}

	c->usage_base->replicas[idx] = sectors;

	return 0;
}

/* Replicas tracking - superblock: */

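/*
 * On disk there are two encodings of the replicas section:
 * bch_sb_field_replicas_v0, whose entries lack the nr_required field, and the
 * current bch_sb_field_replicas. The helpers below convert between either
 * on-disk format and the in-memory bch_replicas_cpu table.
 */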
static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);

	ret = replicas_table_update(c, &new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

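/*
 * When every entry has nr_required == 1, the replicas section is written in
 * the older, more compact v0 format; any entry with a different nr_required
 * forces the current format. Whichever format is written, the other field is
 * deleted so the superblock only carries one copy.
 */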
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_validate(e, sb, err);
		if (ret)
			return ret;

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */

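/*
 * Check whether the filesystem can operate with only the devices in @devs
 * online: for each replicas entry, count how many of its devices are online
 * and how many have failed. Fewer online than nr_required means that data is
 * unavailable, fewer than nr_devs means it is merely degraded, and in either
 * case the corresponding BCH_FORCE_IF_* flag (metadata or data) must have been
 * passed in @flags for the answer to still be "yes".
 */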
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		for (i = 0; i < e->nr_devs; i++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);

			nr_online += test_bit(e->devs[i], devs.d);
			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
		}

		if (nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}
	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

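/*
 * Return a bitmask of BCH_DATA_* types that the superblock's replicas section
 * records as present on device @dev, reading whichever of the replicas /
 * replicas_v0 fields the superblock carries.
 */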
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned i, data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned ret;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	unsigned i;

	kfree(c->usage_scratch);
	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		free_percpu(c->usage[i]);
	kfree(c->usage_base);
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);

	mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
	bch2_journal_entry_res_resize(&c->journal,
			&c->replicas_journal_res,
			reserve_journal_replicas(c, &c->replicas));

	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
					 REPLICAS_DELTA_LIST_MAX) ?:
		replicas_table_update(c, &c->replicas);
}
