// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "backpointers.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "ec.h"
#include "error.h"
#include "inode.h"
#include "movinggc.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/preempt.h>

static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
					      enum bch_data_type data_type,
					      s64 sectors)
{
	switch (data_type) {
	case BCH_DATA_btree:
		fs_usage->btree += sectors;
		break;
	case BCH_DATA_user:
	case BCH_DATA_parity:
		fs_usage->data += sectors;
		break;
	case BCH_DATA_cached:
		fs_usage->cached += sectors;
		break;
	default:
		break;
	}
}
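
/*
 * Illustrative note (not from the original source): user data and parity
 * both land in the "data" field above, so for an extent striped as user
 * plus parity blocks, e.g.:
 *
 *	fs_usage_data_type_to_base(&usage, BCH_DATA_user,   128);
 *	fs_usage_data_type_to_base(&usage, BCH_DATA_parity,  64);
 *
 * usage.data grows by 192 sectors, while BCH_DATA_sb/BCH_DATA_journal fall
 * through the default case - superblock and journal buckets are accounted
 * as "hidden" instead (see bch2_fs_usage_initialize() below).
 */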

void bch2_fs_usage_initialize(struct bch_fs *c)
{
	percpu_down_write(&c->mark_lock);
	struct bch_fs_usage *usage = c->usage_base;

	for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->b.reserved += usage->persistent_reserved[i];

	for (unsigned i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
	}

	for_each_member_device(c, ca) {
		struct bch_dev_usage dev = bch2_dev_usage_read(ca);

		usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
				    dev.d[BCH_DATA_journal].buckets) *
			ca->mi.bucket_size;
	}

	percpu_up_write(&c->mark_lock);
}

static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
						  unsigned journal_seq,
						  bool gc)
{
	BUG_ON(!gc && !journal_seq);

	return this_cpu_ptr(gc
			    ? ca->usage_gc
			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}

void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
	struct bch_fs *c = ca->fs;
	unsigned seq, i, u64s = dev_usage_u64s();

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		memcpy(usage, ca->usage_base, u64s * sizeof(u64));
		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
			acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
	} while (read_seqcount_retry(&c->usage_lock, seq));
}
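
/*
 * Example (a sketch, not from the original file): callers wanting a
 * consistent snapshot of device usage go through this helper, e.g.:
 *
 *	struct bch_dev_usage u;
 *
 *	bch2_dev_usage_read_fast(ca, &u);
 *	pr_info("free buckets: %llu\n", u.d[BCH_DATA_free].buckets);
 *
 * The seqcount retry loop above guarantees the base counters and the
 * percpu deltas are summed against a stable view, with no lock taken on
 * the read side.
 */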

u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
	ssize_t offset = v - (u64 *) c->usage_base;
	unsigned i, seq;
	u64 ret;

	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
	percpu_rwsem_assert_held(&c->mark_lock);

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		ret = *v;

		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}

struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage_online *ret;
	unsigned nr_replicas = READ_ONCE(c->replicas.nr);
	unsigned seq, i;
retry:
	ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read(&c->mark_lock);

	if (nr_replicas != c->replicas.nr) {
		nr_replicas = c->replicas.nr;
		percpu_up_read(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	ret->online_reserved = percpu_u64_get(c->online_reserved);

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		unsafe_memcpy(&ret->u, c->usage_base,
			      __fs_usage_u64s(nr_replicas) * sizeof(u64),
			      "embedded variable length struct");
		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
					__fs_usage_u64s(nr_replicas));
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}
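
/*
 * Illustrative usage (a sketch): the returned struct is heap allocated and
 * sized for the replicas table as of the read, so the caller owns it:
 *
 *	struct bch_fs_usage_online *u = bch2_fs_usage_read(c);
 *
 *	if (u) {
 *		bch2_fs_usage_to_text(&out, c, u);
 *		kfree(u);
 *	}
 *
 * The retry loop above handles the replicas table growing between the
 * unlocked size read and taking mark_lock.
 */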

void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
	unsigned u64s = fs_usage_u64s(c);

	BUG_ON(idx >= ARRAY_SIZE(c->usage));

	preempt_disable();
	write_seqcount_begin(&c->usage_lock);

	acc_u64s_percpu((u64 *) c->usage_base,
			(u64 __percpu *) c->usage[idx], u64s);
	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));

	rcu_read_lock();
	for_each_member_device_rcu(c, ca, NULL) {
		u64s = dev_usage_u64s();

		acc_u64s_percpu((u64 *) ca->usage_base,
				(u64 __percpu *) ca->usage[idx], u64s);
		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
	}
	rcu_read_unlock();

	write_seqcount_end(&c->usage_lock);
	preempt_enable();
}

void bch2_fs_usage_to_text(struct printbuf *out,
			   struct bch_fs *c,
			   struct bch_fs_usage_online *fs_usage)
{
	unsigned i;

	prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);

	prt_printf(out, "hidden:\t\t\t\t%llu\n",
		   fs_usage->u.b.hidden);
	prt_printf(out, "data:\t\t\t\t%llu\n",
		   fs_usage->u.b.data);
	prt_printf(out, "cached:\t\t\t\t%llu\n",
		   fs_usage->u.b.cached);
	prt_printf(out, "reserved:\t\t\t%llu\n",
		   fs_usage->u.b.reserved);
	prt_printf(out, "nr_inodes:\t\t\t%llu\n",
		   fs_usage->u.b.nr_inodes);
	prt_printf(out, "online reserved:\t\t%llu\n",
		   fs_usage->online_reserved);

	for (i = 0;
	     i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
	     i++) {
		prt_printf(out, "%u replicas:\n", i + 1);
		prt_printf(out, "\treserved:\t\t%llu\n",
			   fs_usage->u.persistent_reserved[i]);
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		prt_printf(out, "\t");
		bch2_replicas_entry_to_text(out, e);
		prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
	}
}

static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
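
/*
 * Worked example (illustrative; assumes RESERVE_FACTOR is 6, i.e. roughly
 * a 1/64 = ~1.6% cushion on top of outstanding reservations):
 *
 *	reserve_factor(1000) = 1000 + (round_up(1000, 64) >> 6)
 *	                     = 1000 + (1024 >> 6)
 *	                     = 1016
 *
 * Slightly over-counting reserved space keeps "used" pessimistic, so
 * statfs-style numbers never promise space that reservations may consume.
 */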

u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
	return min(fs_usage->u.b.hidden +
		   fs_usage->u.b.btree +
		   fs_usage->u.b.data +
		   reserve_factor(fs_usage->u.b.reserved +
				  fs_usage->online_reserved),
		   c->capacity);
}

static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;
	u64 data, reserved;

	ret.capacity = c->capacity -
		bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);

	data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
		bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
	reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
		percpu_u64_get(c->online_reserved);

	ret.used = min(ret.capacity, data + reserve_factor(reserved));
	ret.free = ret.capacity - ret.used;

	ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);

	return ret;
}

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;

	percpu_down_read(&c->mark_lock);
	ret = __bch2_fs_usage_read_short(c);
	percpu_up_read(&c->mark_lock);

	return ret;
}
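
/*
 * Example (a sketch of a statfs-style caller, not from the original file):
 *
 *	struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);
 *
 *	pr_info("capacity %llu, used %llu, free %llu, inodes %llu\n",
 *		u.capacity, u.used, u.free, u.nr_inodes);
 *
 * Returning the struct by value means callers get one coherent snapshot
 * taken under mark_lock, rather than reading fields at different times.
 */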

void bch2_dev_usage_init(struct bch_dev *ca)
{
	ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}

void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
	prt_tab(out);
	prt_str(out, "buckets");
	prt_tab_rjust(out);
	prt_str(out, "sectors");
	prt_tab_rjust(out);
	prt_str(out, "fragmented");
	prt_tab_rjust(out);
	prt_newline(out);

	for (unsigned i = 0; i < BCH_DATA_NR; i++) {
		bch2_prt_data_type(out, i);
		prt_tab(out);
		prt_u64(out, usage->d[i].buckets);
		prt_tab_rjust(out);
		prt_u64(out, usage->d[i].sectors);
		prt_tab_rjust(out);
		prt_u64(out, usage->d[i].fragmented);
		prt_tab_rjust(out);
		prt_newline(out);
	}
}

void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
			   const struct bch_alloc_v4 *old,
			   const struct bch_alloc_v4 *new,
			   u64 journal_seq, bool gc)
{
	struct bch_fs_usage *fs_usage;
	struct bch_dev_usage *u;

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);

	if (data_type_is_hidden(old->data_type))
		fs_usage->b.hidden -= ca->mi.bucket_size;
	if (data_type_is_hidden(new->data_type))
		fs_usage->b.hidden += ca->mi.bucket_size;

	u = dev_usage_ptr(ca, journal_seq, gc);

	u->d[old->data_type].buckets--;
	u->d[new->data_type].buckets++;

	u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
	u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);

	u->d[BCH_DATA_cached].sectors += new->cached_sectors;
	u->d[BCH_DATA_cached].sectors -= old->cached_sectors;

	u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
	u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);

	preempt_enable();
}

static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
	return (struct bch_alloc_v4) {
		.gen = b.gen,
		.data_type = b.data_type,
		.dirty_sectors = b.dirty_sectors,
		.cached_sectors = b.cached_sectors,
		.stripe = b.stripe,
	};
}

void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
			     struct bucket *old, struct bucket *new)
{
	struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
	struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);

	bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
}

static inline int __update_replicas(struct bch_fs *c,
				    struct bch_fs_usage *fs_usage,
				    struct bch_replicas_entry_v1 *r,
				    s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0)
		return -1;

	fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	return 0;
}

int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
			 struct bch_replicas_entry_v1 *r, s64 sectors,
			 unsigned journal_seq, bool gc)
{
	struct bch_fs_usage *fs_usage;
	int idx, ret = 0;
	struct printbuf buf = PRINTBUF;

	percpu_down_read(&c->mark_lock);

	idx = bch2_replicas_entry_idx(c, r);
	if (idx < 0 &&
	    fsck_err(c, ptr_to_missing_replicas_entry,
		     "no replicas entry\n while marking %s",
		     (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		percpu_up_read(&c->mark_lock);
		ret = bch2_mark_replicas(c, r);
		percpu_down_read(&c->mark_lock);

		if (ret)
			goto err;
		idx = bch2_replicas_entry_idx(c, r);
	}
	if (idx < 0) {
		ret = -1;
		goto err;
	}

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);
	fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	preempt_enable();
err:
fsck_err:
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

static inline int update_cached_sectors(struct bch_fs *c,
					struct bkey_s_c k,
					unsigned dev, s64 sectors,
					unsigned journal_seq, bool gc)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}

static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
				     gfp_t gfp)
{
	struct replicas_delta_list *d = trans->fs_usage_deltas;
	unsigned new_size = d ? (d->size + more) * 2 : 128;
	unsigned alloc_size = sizeof(*d) + new_size;

	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);

	if (!d || d->used + more > d->size) {
		d = krealloc(d, alloc_size, gfp|__GFP_ZERO);

		if (unlikely(!d)) {
			if (alloc_size > REPLICAS_DELTA_LIST_MAX)
				return -ENOMEM;

			d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
			if (!d)
				return -ENOMEM;

			memset(d, 0, REPLICAS_DELTA_LIST_MAX);

			if (trans->fs_usage_deltas)
				memcpy(d, trans->fs_usage_deltas,
				       trans->fs_usage_deltas->size + sizeof(*d));

			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
			kfree(trans->fs_usage_deltas);
		}

		d->size = new_size;
		trans->fs_usage_deltas = d;
	}

	return 0;
}

int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
{
	return allocate_dropping_locks_errcode(trans,
				__replicas_deltas_realloc(trans, more, _gfp));
}
int bch2_update_replicas_list(struct btree_trans *trans,
			      struct bch_replicas_entry_v1 *r,
			      s64 sectors)
{
	struct replicas_delta_list *d;
	struct replicas_delta *n;
	unsigned b;
	int ret;

	if (!sectors)
		return 0;

	b = replicas_entry_bytes(r) + 8;
	ret = bch2_replicas_deltas_realloc(trans, b);
	if (ret)
		return ret;

	d = trans->fs_usage_deltas;
	n = (void *) d->d + d->used;
	n->delta = sectors;
	unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
		      r, replicas_entry_bytes(r),
		      "flexible array member embedded in struct with padding");
	bch2_replicas_entry_sort(&n->r);
	d->used += b;
	return 0;
}
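
/*
 * Illustrative note (not from the original source): each appended entry is
 * a struct replicas_delta - an 8 byte delta followed by the variable
 * length replicas entry - which is why the reservation above is
 * replicas_entry_bytes(r) + 8. Accumulating deltas per transaction and
 * applying them at commit time (see bch2_trans_fs_usage_apply() below)
 * keeps the percpu usage counters off the hot path of every key update.
 */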

int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return bch2_update_replicas_list(trans, &r.e, sectors);
}

int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			      size_t b, enum bch_data_type data_type,
			      unsigned sectors, struct gc_pos pos,
			      unsigned flags)
{
	struct bucket old, new, *g;
	int ret = 0;

	BUG_ON(!(flags & BTREE_TRIGGER_GC));
	BUG_ON(data_type != BCH_DATA_sb &&
	       data_type != BCH_DATA_journal);

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	percpu_down_read(&c->mark_lock);
	g = gc_bucket(ca, b);

	bucket_lock(g);
	old = *g;

	if (bch2_fs_inconsistent_on(g->data_type &&
				    g->data_type != data_type, c,
				    "different types of data in same bucket: %s, %s",
				    bch2_data_type_str(g->data_type),
				    bch2_data_type_str(data_type))) {
		ret = -EIO;
		goto err;
	}

	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
				    "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
				    ca->dev_idx, b, g->gen,
				    bch2_data_type_str(g->data_type ?: data_type),
				    g->dirty_sectors, sectors)) {
		ret = -EIO;
		goto err;
	}

	g->data_type = data_type;
	g->dirty_sectors += sectors;
	new = *g;
err:
	bucket_unlock(g);
	if (!ret)
		bch2_dev_usage_update_m(c, ca, &old, &new);
	percpu_up_read(&c->mark_lock);
	return ret;
}

int bch2_check_bucket_ref(struct btree_trans *trans,
			  struct bkey_s_c k,
			  const struct bch_extent_ptr *ptr,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  u8 b_gen, u8 bucket_data_type,
			  u32 bucket_sectors)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bucket_data_type == BCH_DATA_cached)
		bucket_data_type = BCH_DATA_user;

	if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
	    (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe))
		bucket_data_type = ptr_data_type = BCH_DATA_stripe;

	if (gen_after(ptr->gen, b_gen)) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_ptr_too_stale,
			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (b_gen != ptr->gen && !ptr->cached) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_stale_dirty_ptr,
			      "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      *bucket_gen(ca, bucket_nr),
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      ptr->gen,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if (b_gen != ptr->gen) {
		ret = 1;
		goto out;
	}

	if (!data_type_is_empty(bucket_data_type) &&
	    ptr_data_type &&
	    bucket_data_type != ptr_data_type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
			      "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type),
			      bch2_data_type_str(ptr_data_type),
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}

	if ((u64) bucket_sectors + sectors > U32_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_bucket_sector_count_overflow,
			      "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
			      "while marking %s",
			      ptr->dev, bucket_nr, b_gen,
			      bch2_data_type_str(bucket_data_type ?: ptr_data_type),
			      bucket_sectors, sectors,
			      (printbuf_reset(&buf),
			       bch2_bkey_val_to_text(&buf, c, k), buf.buf));
		ret = -EIO;
		goto err;
	}
out:
	printbuf_exit(&buf);
	return ret;
err:
	bch2_dump_trans_updates(trans);
	goto out;
}
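
/*
 * Worked example of the gen checks above (illustrative, not from the
 * original source): suppose a cached pointer was created when its bucket
 * was at gen 3, and the bucket has since been invalidated and reused, so
 * b_gen is now 4. The pointer is merely stale: we return 1 and the caller
 * skips accounting it. The same mismatch on a dirty (!ptr->cached) pointer
 * means the bucket was reused while still referenced - that's data loss,
 * so it's reported via fsck_err and fails with -EIO.
 */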

void bch2_trans_fs_usage_revert(struct btree_trans *trans,
				struct replicas_delta_list *deltas)
{
	struct bch_fs *c = trans->c;
	struct bch_fs_usage *dst;
	struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
	s64 added = 0;
	unsigned i;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	dst = fs_usage_ptr(c, trans->journal_res.seq, false);

	/* revert changes: */
	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
		switch (d->r.data_type) {
		case BCH_DATA_btree:
		case BCH_DATA_user:
		case BCH_DATA_parity:
			added += d->delta;
		}
		BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
	}

	dst->b.nr_inodes -= deltas->nr_inodes;

	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
		added -= deltas->persistent_reserved[i];
		dst->b.reserved -= deltas->persistent_reserved[i];
		dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
	}

	if (added > 0) {
		trans->disk_res->sectors += added;
		this_cpu_add(*c->online_reserved, added);
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);
}
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	static int warned_disk_usage = 0;
	bool warn = false;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
	struct bch_fs_usage_base *src = &trans->fs_usage_delta;

	s64 added = src->btree + src->data + src->reserved;

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	s64 should_not_have_added = added - (s64) disk_res_sectors;
	if (unlikely(should_not_have_added > 0)) {
		u64 old, new, v = atomic64_read(&c->sectors_available);

		do {
			old = v;
			new = max_t(s64, 0, old - should_not_have_added);
		} while ((v = atomic64_cmpxchg(&c->sectors_available,
					       old, new)) != old);

		added -= should_not_have_added;
		warn = true;
	}

	if (added > 0) {
		trans->disk_res->sectors -= added;
		this_cpu_sub(*c->online_reserved, added);
	}

	dst->hidden += src->hidden;
	dst->btree += src->btree;
	dst->data += src->data;
	dst->cached += src->cached;
	dst->reserved += src->reserved;
	dst->nr_inodes += src->nr_inodes;

	preempt_enable();
	percpu_up_read(&c->mark_lock);

	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
		bch2_trans_inconsistent(trans,
					"disk usage increased %lli more than %llu sectors reserved",
					should_not_have_added, disk_res_sectors);
}

int bch2_trans_fs_usage_apply(struct btree_trans *trans,
			      struct replicas_delta_list *deltas)
{
	struct bch_fs *c = trans->c;
	struct replicas_delta *d, *d2;
	struct replicas_delta *top = (void *) deltas->d + deltas->used;
	struct bch_fs_usage *dst;
	unsigned i;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	dst = fs_usage_ptr(c, trans->journal_res.seq, false);

	for (d = deltas->d; d != top; d = replicas_delta_next(d))
		if (__update_replicas(c, dst, &d->r, d->delta))
			goto need_mark;

	dst->b.nr_inodes += deltas->nr_inodes;

	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
		dst->b.reserved += deltas->persistent_reserved[i];
		dst->persistent_reserved[i] += deltas->persistent_reserved[i];
	}

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;
need_mark:
	/* revert changes: */
	for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
		BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return -1;
}

/* KEY_TYPE_extent: */

static int __mark_pointer(struct btree_trans *trans,
			  struct bkey_s_c k,
			  const struct bch_extent_ptr *ptr,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  u8 bucket_gen, u8 *bucket_data_type,
			  u32 *dirty_sectors, u32 *cached_sectors)
{
	u32 *dst_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
					bucket_gen, *bucket_data_type, *dst_sectors);

	if (ret)
		return ret;

	*dst_sectors += sectors;

	if (!*dirty_sectors && !*cached_sectors)
		*bucket_data_type = 0;
	else if (*bucket_data_type != BCH_DATA_stripe)
		*bucket_data_type = ptr_data_type;

	return 0;
}

static int bch2_trigger_pointer(struct btree_trans *trans,
				enum btree_id btree_id, unsigned level,
				struct bkey_s_c k, struct extent_ptr_decoded p,
				const union bch_extent_entry *entry,
				s64 *sectors, unsigned flags)
{
	bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
	struct bpos bucket;
	struct bch_backpointer bp;

	bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
	*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);

	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		struct btree_iter iter;
		struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
		int ret = PTR_ERR_OR_ZERO(a);
		if (ret)
			return ret;

		ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
				     a->v.gen, &a->v.data_type,
				     &a->v.dirty_sectors, &a->v.cached_sectors) ?:
			bch2_trans_update(trans, &iter, &a->k_i, 0);
		bch2_trans_iter_exit(trans, &iter);

		if (ret)
			return ret;

		if (!p.ptr.cached) {
			ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
			if (ret)
				return ret;
		}
	}

	if (flags & BTREE_TRIGGER_GC) {
		struct bch_fs *c = trans->c;
		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
		enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);

		percpu_down_read(&c->mark_lock);
		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
		bucket_lock(g);
		struct bucket old = *g;

		u8 bucket_data_type = g->data_type;
		int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
					 data_type, g->gen,
					 &bucket_data_type,
					 &g->dirty_sectors,
					 &g->cached_sectors);
		if (ret) {
			bucket_unlock(g);
			percpu_up_read(&c->mark_lock);
			return ret;
		}

		g->data_type = bucket_data_type;
		struct bucket new = *g;
		bucket_unlock(g);
		bch2_dev_usage_update_m(c, ca, &old, &new);
		percpu_up_read(&c->mark_lock);
	}

	return 0;
}

static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
				   struct bkey_s_c k,
				   struct extent_ptr_decoded p,
				   enum bch_data_type data_type,
				   s64 sectors, unsigned flags)
{
	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		struct btree_iter iter;
		struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
				BTREE_ID_stripes, POS(0, p.ec.idx),
				BTREE_ITER_WITH_UPDATES, stripe);
		int ret = PTR_ERR_OR_ZERO(s);
		if (unlikely(ret)) {
			bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
						   "pointer to nonexistent stripe %llu",
						   (u64) p.ec.idx);
			goto err;
		}

		if (!bch2_ptr_matches_stripe(&s->v, p)) {
			bch2_trans_inconsistent(trans,
						"stripe pointer doesn't match stripe %llu",
						(u64) p.ec.idx);
			ret = -EIO;
			goto err;
		}

		stripe_blockcount_set(&s->v, p.ec.block,
				      stripe_blockcount_get(&s->v, p.ec.block) +
				      sectors);

		struct bch_replicas_padded r;
		bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
		r.e.data_type = data_type;
		ret = bch2_update_replicas_list(trans, &r.e, sectors);
err:
		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}

	if (flags & BTREE_TRIGGER_GC) {
		struct bch_fs *c = trans->c;

		BUG_ON(!(flags & BTREE_TRIGGER_GC));

		struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
		if (!m) {
			bch_err(c, "error allocating memory for gc_stripes, idx %llu",
				(u64) p.ec.idx);
			return -BCH_ERR_ENOMEM_mark_stripe_ptr;
		}

		mutex_lock(&c->ec_stripes_heap_lock);

		if (!m || !m->alive) {
			mutex_unlock(&c->ec_stripes_heap_lock);
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, k);
			bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
					    (u64) p.ec.idx, buf.buf);
			printbuf_exit(&buf);
			bch2_inconsistent_error(c);
			return -EIO;
		}

		m->block_sectors[p.ec.block] += sectors;

		struct bch_replicas_padded r = m->r;
		mutex_unlock(&c->ec_stripes_heap_lock);

		r.e.data_type = data_type;
		bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
	}

	return 0;
}

static int __trigger_extent(struct btree_trans *trans,
			    enum btree_id btree_id, unsigned level,
			    struct bkey_s_c k, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	s64 dirty_sectors = 0;
	int ret = 0;

	r.e.data_type = data_type;
	r.e.nr_devs = 0;
	r.e.nr_required = 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors;
		ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
		if (ret < 0)
			return ret;

		bool stale = ret > 0;

		if (p.ptr.cached) {
			if (!stale) {
				ret = !gc
					? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
					: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
				bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
						     bch2_err_str(ret));
				if (ret)
					return ret;
			}
		} else if (!p.has_ec) {
			dirty_sectors += disk_sectors;
			r.e.devs[r.e.nr_devs++] = p.ptr.dev;
		} else {
			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
			if (ret)
				return ret;

			/*
			 * There may be other dirty pointers in this extent, but
			 * if so they're not required for mounting if we have an
			 * erasure coded pointer in this extent:
			 */
			r.e.nr_required = 0;
		}
	}

	if (r.e.nr_devs) {
		ret = !gc
			? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
			: bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
		if (unlikely(ret && gc)) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, k);
			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
			printbuf_exit(&buf);
		}
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trigger_extent(struct btree_trans *trans,
			enum btree_id btree_id, unsigned level,
			struct bkey_s_c old, struct bkey_s new,
			unsigned flags)
{
	struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
	struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
	unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
	unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;

	/* if pointers aren't changing - nothing to do: */
	if (new_ptrs_bytes == old_ptrs_bytes &&
	    !memcmp(new_ptrs.start,
		    old_ptrs.start,
		    new_ptrs_bytes))
		return 0;

	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		struct bch_fs *c = trans->c;
		int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
			(int) bch2_bkey_needs_rebalance(c, old);

		if (mod) {
			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
							      new.k->p, mod > 0);
			if (ret)
				return ret;
		}
	}

	if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
		return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);

	return 0;
}

/* KEY_TYPE_reservation */

static int __trigger_reservation(struct btree_trans *trans,
				 enum btree_id btree_id, unsigned level,
				 struct bkey_s_c k, unsigned flags)
{
	struct bch_fs *c = trans->c;
	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
	s64 sectors = (s64) k.k->size * replicas;

	if (flags & BTREE_TRIGGER_OVERWRITE)
		sectors = -sectors;

	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		int ret = bch2_replicas_deltas_realloc(trans, 0);
		if (ret)
			return ret;

		struct replicas_delta_list *d = trans->fs_usage_deltas;
		replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));

		d->persistent_reserved[replicas - 1] += sectors;
	}

	if (flags & BTREE_TRIGGER_GC) {
		percpu_down_read(&c->mark_lock);
		preempt_disable();

		struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);

		replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
		fs_usage->b.reserved += sectors;
		fs_usage->persistent_reserved[replicas - 1] += sectors;

		preempt_enable();
		percpu_up_read(&c->mark_lock);
	}

	return 0;
}

int bch2_trigger_reservation(struct btree_trans *trans,
			     enum btree_id btree_id, unsigned level,
			     struct bkey_s_c old, struct bkey_s new,
			     unsigned flags)
{
	return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}

/* Mark superblocks: */

static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
					     struct bch_dev *ca, size_t b,
					     enum bch_data_type type,
					     unsigned sectors)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	int ret = 0;

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
	if (IS_ERR(a))
		return PTR_ERR(a);

	if (a->v.data_type && type && a->v.data_type != type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      BCH_FSCK_ERR_bucket_metadata_type_mismatch,
			      "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
			      "while marking %s",
			      iter.pos.inode, iter.pos.offset, a->v.gen,
			      bch2_data_type_str(a->v.data_type),
			      bch2_data_type_str(type),
			      bch2_data_type_str(type));
		ret = -EIO;
		goto err;
	}

	if (a->v.data_type != type ||
	    a->v.dirty_sectors != sectors) {
		a->v.data_type = type;
		a->v.dirty_sectors = sectors;
		ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, size_t b,
				    enum bch_data_type type,
				    unsigned sectors)
{
	return commit_do(trans, NULL, NULL, 0,
			 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}
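
/*
 * Illustrative use (a sketch mirroring __bch2_trans_mark_dev_sb() below):
 * marking a device's first journal bucket from an existing transaction:
 *
 *	ret = bch2_trans_mark_metadata_bucket(trans, ca,
 *					      ca->journal.buckets[0],
 *					      BCH_DATA_journal,
 *					      ca->mi.bucket_size);
 *
 * commit_do() wraps the update in its own commit, retrying on transaction
 * restarts.
 */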

static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
					    struct bch_dev *ca,
					    u64 start, u64 end,
					    enum bch_data_type type,
					    u64 *bucket, unsigned *bucket_sectors)
{
	do {
		u64 b = sector_to_bucket(ca, start);
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		if (b != *bucket && *bucket_sectors) {
			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
								  type, *bucket_sectors);
			if (ret)
				return ret;

			*bucket_sectors = 0;
		}

		*bucket = b;
		*bucket_sectors += sectors;
		start += sectors;
	} while (start < end);

	return 0;
}

static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
				    struct bch_dev *ca)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 bucket = 0;
	unsigned i, bucket_sectors = 0;
	int ret;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR) {
			ret = bch2_trans_mark_metadata_sectors(trans, ca,
							       0, BCH_SB_SECTOR,
							       BCH_DATA_sb, &bucket, &bucket_sectors);
			if (ret)
				return ret;
		}

		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
						       offset + (1 << layout->sb_max_size_bits),
						       BCH_DATA_sb, &bucket, &bucket_sectors);
		if (ret)
			return ret;
	}

	if (bucket_sectors) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
						      bucket, BCH_DATA_sb, bucket_sectors);
		if (ret)
			return ret;
	}

	for (i = 0; i < ca->journal.nr; i++) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
						      ca->journal.buckets[i],
						      BCH_DATA_journal, ca->mi.bucket_size);
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
	int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));

	bch_err_fn(c, ret);
	return ret;
}

int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
	for_each_online_member(c, ca) {
		int ret = bch2_trans_mark_dev_sb(c, ca);
		if (ret) {
			percpu_ref_put(&ca->ref);
			return ret;
		}
	}

	return 0;
}

/* Disk reservations: */

#define SECTORS_CACHE 1024

int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
				u64 sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			preempt_enable();
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available += get;

out:
	pcpu->sectors_available -= sectors;
	this_cpu_add(*c->online_reserved, sectors);
	res->sectors += sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	mutex_lock(&c->sectors_available_lock);

	percpu_u64_set(&c->pcpu->sectors_available, 0);
	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(*c->online_reserved, sectors);
		res->sectors += sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -BCH_ERR_ENOSPC_disk_reservation;
	}

	mutex_unlock(&c->sectors_available_lock);
	percpu_up_read(&c->mark_lock);

	return ret;
}
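
/*
 * Example (a sketch; assumes bch2_disk_reservation_put() from buckets.h):
 * the fast path above services most reservations from the percpu
 * sectors_available cache, refilled SECTORS_CACHE sectors at a time from
 * the global atomic, so a typical caller looks like:
 *
 *	struct disk_reservation res = { 0 };
 *
 *	ret = __bch2_disk_reservation_add(c, &res, sectors, 0);
 *	if (ret)
 *		return ret;
 *	... do the write that consumes the reservation ...
 *	bch2_disk_reservation_put(c, &res);
 *
 * Only when both the percpu cache and the global counter run dry do we
 * fall back to recalculating free space under sectors_available_lock.
 */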

/* Startup/shutdown: */

static void bucket_gens_free_rcu(struct rcu_head *rcu)
{
	struct bucket_gens *buckets =
		container_of(rcu, struct bucket_gens, rcu);

	kvfree(buckets);
}

int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
	unsigned long *buckets_nouse = NULL;
	bool resize = ca->bucket_gens != NULL;
	int ret;

	if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
				     GFP_KERNEL|__GFP_ZERO))) {
		ret = -BCH_ERR_ENOMEM_bucket_gens;
		goto err;
	}

	if ((c->opts.buckets_nouse &&
	     !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
					sizeof(unsigned long),
					GFP_KERNEL|__GFP_ZERO)))) {
		ret = -BCH_ERR_ENOMEM_buckets_nouse;
		goto err;
	}

	bucket_gens->first_bucket = ca->mi.first_bucket;
	bucket_gens->nbuckets = nbuckets;

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);

	if (resize) {
		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);

		memcpy(bucket_gens->b,
		       old_bucket_gens->b,
		       n);
		if (buckets_nouse)
			memcpy(buckets_nouse,
			       ca->buckets_nouse,
			       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
	bucket_gens = old_bucket_gens;

	swap(ca->buckets_nouse, buckets_nouse);

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		percpu_up_write(&c->mark_lock);
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	ret = 0;
err:
	kvfree(buckets_nouse);
	if (bucket_gens)
		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);

	return ret;
}
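
/*
 * Illustrative note (not from the original source): bucket_gens follows
 * the classic RCU publish/retire pattern - rcu_assign_pointer() above
 * publishes the new array, and call_rcu() frees the old one once all
 * readers are done. A lockless reader might look something like:
 *
 *	rcu_read_lock();
 *	struct bucket_gens *gens = rcu_dereference(ca->bucket_gens);
 *	u8 gen = b < gens->nbuckets ? gens->b[b] : 0;
 *	rcu_read_unlock();
 */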

void bch2_dev_buckets_free(struct bch_dev *ca)
{
	kvfree(ca->buckets_nouse);
	kvfree(rcu_dereference_protected(ca->bucket_gens, 1));

	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
		free_percpu(ca->usage[i]);
	kfree(ca->usage_base);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
	if (!ca->usage_base)
		return -BCH_ERR_ENOMEM_usage_init;

	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[i])
			return -BCH_ERR_ENOMEM_usage_init;
	}

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}