// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2012 Google, Inc.
 *
 * Foreground allocator code: allocate buckets from freelist, and allocate in
 * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"

#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	if (!mutex_trylock(lock)) {
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}
}

const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};

/*
 * Open buckets represent a bucket that's currently being allocated from.  They
 * serve two purposes:
 *
 *  - They track buckets that have been partially allocated, allowing for
 *    sub-bucket sized allocations - they're used by the sector allocator below
 *
 *  - They provide a reference to the buckets they own that mark and sweep GC
 *    can find, until the new allocation has a pointer to it inserted into the
 *    btree
 *
 * When allocating some space with the sector allocator, the allocation comes
 * with a reference to an open bucket - the caller is required to put that
 * reference _after_ doing the index update that makes its allocation reachable.
 */
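
/*
 * An illustrative (hypothetical) caller sketch of that lifecycle - allocate,
 * make the allocation reachable, then drop the open_bucket reference;
 * do_index_update() is a placeholder, not a real function:
 *
 *	struct open_bucket *ob = bch2_bucket_alloc(c, ca, watermark, cl);
 *	if (IS_ERR(ob))
 *		return PTR_ERR(ob);
 *
 *	ret = do_index_update(...);	// insert pointer into the btree
 *	bch2_open_bucket_put(c, ob);	// only after the update is visible
 */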

void bch2_reset_alloc_cursors(struct bch_fs *c)
{
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, NULL)
		ca->alloc_cursor = 0;
	rcu_read_unlock();
}

static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	ob->hash = *slot;
	*slot = idx;
}

static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	while (*slot != idx) {
		BUG_ON(!*slot);
		slot = &c->open_buckets[*slot].hash;
	}

	*slot = ob->hash;
	ob->hash = 0;
}

void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	percpu_down_read(&c->mark_lock);
	spin_lock(&ob->lock);

	ob->valid = false;
	ob->data_type = 0;

	spin_unlock(&ob->lock);
	percpu_up_read(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
}

void bch2_open_bucket_write_error(struct bch_fs *c,
				  struct open_buckets *obs,
				  unsigned dev)
{
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, obs, ob, i)
		if (ob->dev == dev && ob->ec)
			bch2_ec_bucket_cancel(c, ob);
}

static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
	struct open_bucket *ob;

	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);

	ob = c->open_buckets + c->open_buckets_freelist;
	c->open_buckets_freelist = ob->freelist;
	atomic_set(&ob->pin, 1);
	ob->data_type = 0;

	c->open_buckets_nr_free--;
	return ob;
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=
	       ARRAY_SIZE(c->open_buckets_partial));

	spin_lock(&c->freelist_lock);
	ob->on_partial_list = true;
	c->open_buckets_partial[c->open_buckets_partial_nr++] =
		ob - c->open_buckets;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
	closure_wake_up(&c->freelist_wait);
}

/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
		u64 b = ca->new_fs_bucket_idx++;

		if (!is_superblock_bucket(ca, b) &&
		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
			return b;
	}

	return -1;
}

static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {
	case BCH_WATERMARK_interior_updates:
		return 0;
	case BCH_WATERMARK_reclaim:
		return OPEN_BUCKETS_COUNT / 6;
	case BCH_WATERMARK_btree:
	case BCH_WATERMARK_btree_copygc:
		return OPEN_BUCKETS_COUNT / 4;
	case BCH_WATERMARK_copygc:
		return OPEN_BUCKETS_COUNT / 3;
	default:
		return OPEN_BUCKETS_COUNT / 2;
	}
}
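
/*
 * Worked example of those reserves, assuming OPEN_BUCKETS_COUNT is 1024 (the
 * actual value may differ; this is only for illustration). An allocation may
 * proceed only while open_buckets_nr_free exceeds the reserve for its
 * watermark, so:
 *
 *	interior_updates:	reserve   0 - may drain the array completely
 *	reclaim:		reserve 170 (1024 / 6)
 *	btree, btree_copygc:	reserve 256 (1024 / 4)
 *	copygc:			reserve 341 (1024 / 3)
 *	everything else:	reserve 512 (1024 / 2)
 *
 * i.e. ordinary writes run out of open buckets first, keeping headroom for
 * the allocators that free up space.
 */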

static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket,
					      enum bch_watermark watermark,
					      const struct bch_alloc_v4 *a,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	struct open_bucket *ob;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		s->skipped_open++;
		return NULL;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
		s->skipped_need_journal_commit++;
		return NULL;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
		s->skipped_nocow++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);

	ob->valid	= true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev		= ca->dev_idx;
	ob->gen		= a->gen;
	ob->bucket	= bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);

	spin_unlock(&c->freelist_lock);
	return ob;
}

static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark, u64 free_entry,
					    struct bucket_alloc_state *s,
					    struct bkey_s_c freespace_k,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct open_bucket *ob;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 b = free_entry & ~(~0ULL << 56);
	unsigned genbits = free_entry >> 56;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
			   "  freespace key ",
			   ca->mi.first_bucket, ca->mi.nbuckets);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	k = bch2_bkey_get_iter(trans, &iter,
			       BTREE_ID_alloc, POS(ca->dev_idx, b),
			       BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret) {
		ob = ERR_PTR(ret);
		goto err;
	}

	a = bch2_alloc_to_v4(k, &a_convert);

	if (a->data_type != BCH_DATA_free) {
		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
			ob = NULL;
			goto err;
		}

		prt_printf(&buf, "non free bucket in freespace btree\n"
			   "  freespace key ");
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
			   "  freespace key ",
			   genbits, alloc_freespace_genbits(*a) >> 56);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
		struct bch_backpointer bp;
		struct bpos bp_pos = POS_MIN;

		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
						&bp_pos, &bp,
						BTREE_ITER_NOPRESERVE);
		if (ret) {
			ob = ERR_PTR(ret);
			goto err;
		}

		if (!bkey_eq(bp_pos, POS_MAX)) {
			/*
			 * Bucket may have data in it - we don't call
			 * bch2_trans_inconsistent() because fsck hasn't
			 * finished yet
			 */
			ob = NULL;
			goto err;
		}
	}

	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
	if (!ob)
		set_btree_iter_dontneed(&iter);
err:
	if (iter.path)
		set_btree_iter_dontneed(&iter);
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ob;
}

/*
 * This path is for before the freespace btree is initialized:
 *
 * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
 * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bch_dev *ca,
			enum bch_watermark watermark,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
				     BTREE_ITER_SLOTS, k, ret) {
		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a;

		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (ca->new_fs_bucket_idx &&
		    is_superblock_bucket(ca, k.k->p.offset))
			continue;

		a = bch2_alloc_to_v4(k, &a_convert);
		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
		ret = bkey_err(ck);
		if (ret)
			break;

		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		s->buckets_seen++;

		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
		set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	alloc_cursor = iter.pos.offset;
	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	return ob;
}

static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						      struct bch_dev *ca,
						      enum bch_watermark watermark,
						      struct bucket_alloc_state *s,
						      struct closure *cl)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	BUG_ON(ca->new_fs_bucket_idx);
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
		if (k.k->p.inode != ca->dev_idx)
			break;

		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
		     alloc_cursor < k.k->p.offset;
		     alloc_cursor++) {
			ret = btree_trans_too_many_iters(trans);
			if (ret) {
				ob = ERR_PTR(ret);
				break;
			}

			s->buckets_seen++;

			ob = try_alloc_bucket(trans, ca, watermark,
					      alloc_cursor, s, k, cl);
			if (ob) {
				set_btree_iter_dontneed(&iter);
				break;
			}
		}

		if (ob || ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}

/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans:	transaction object
 * @ca:		device to allocate from
 * @watermark:	how important is this allocation?
 * @cl:		if not NULL, closure to be used to wait if buckets not available
 * @usage:	also returns the current device usage
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
						   struct bch_dev *ca,
						   enum bch_watermark watermark,
						   struct closure *cl,
						   struct bch_dev_usage *usage)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	struct bucket_alloc_state s = { 0 };
	bool waiting = false;
again:
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_do_discards(c);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_do_gc_gens(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_do_invalidates(c);

	if (!avail) {
		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);

		ob = ERR_PTR(-BCH_ERR_freelist_empty);
		goto err;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

	if (s.skipped_need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	if (!ob)
		ob = ERR_PTR(-BCH_ERR_no_buckets_found);

	if (!IS_ERR(ob))
		trace_and_count(c, bucket_alloc, ca,
				bch2_watermarks[watermark],
				ob->bucket,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				"");
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		trace_and_count(c, bucket_alloc_fail, ca,
				bch2_watermarks[watermark],
				0,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				bch2_err_str(PTR_ERR(ob)));

	return ob;
}

struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      struct closure *cl)
{
	struct bch_dev_usage usage;
	struct open_bucket *ob;

	bch2_trans_do(c, NULL, NULL, 0,
		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
								   cl, &usage)));
	return ob;
}
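
/*
 * A hedged sketch of a non-transactional caller - the closure is optional and
 * only used for blocking until buckets become available; error handling
 * beyond the IS_ERR() check is omitted for brevity:
 *
 *	struct closure cl;
 *	struct open_bucket *ob;
 *
 *	closure_init_stack(&cl);
 *
 *	ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, &cl);
 *	if (!IS_ERR(ob)) {
 *		// ... use bucket_to_sector(ca, ob->bucket) ...
 *		bch2_open_bucket_put(c, ob);
 *	}
 */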

static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
		(stripe->next_alloc[l] < stripe->next_alloc[r]));
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)

struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
					  struct dev_stripe_state *stripe,
					  struct bch_devs_mask *devs)
{
	struct dev_alloc_list ret = { .nr = 0 };
	unsigned i;

	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
		ret.devs[ret.nr++] = i;

	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
	return ret;
}
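
/*
 * The sort is ascending by next_alloc, so the device that has been charged
 * the least (i.e. has the most free space, per the weighting below) comes
 * first. Illustrative example with invented numbers:
 *
 *	next_alloc = { [0] = 900, [1] = 100, [2] = 500 }
 *	devs       = { 0, 1, 2 }
 *	result     = { 1, 2, 0 }	// dev 1 is tried first
 */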

static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
			       struct dev_stripe_state *stripe,
			       struct bch_dev_usage *usage)
{
	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(1ULL << 48, free_space)
		: 1ULL << 48;
	u64 scale = *v / 4;

	if (*v + free_space_inv >= *v)
		*v += free_space_inv;
	else
		*v = U64_MAX;

	for (v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
}
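
/*
 * Worked example of that weighting (numbers invented for illustration): a
 * device with 2^20 free buckets is charged 2^48 / 2^20 = 2^28 per allocation,
 * while one with 2^10 free buckets is charged 2^38 - so the emptier device
 * accumulates next_alloc ~1000x faster and is picked correspondingly less
 * often by the ascending sort above. The trailing loop rescales every entry
 * towards zero, so the counters track relative recent allocation pressure
 * rather than growing without bound.
 */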

void bch2_dev_stripe_increment(struct bch_dev *ca,
			       struct dev_stripe_state *stripe)
{
	struct bch_dev_usage usage;

	bch2_dev_usage_read_fast(ca, &usage);
	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
}

static int add_new_bucket(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct bch_devs_mask *devs_may_alloc,
			  unsigned nr_replicas,
			  unsigned *nr_effective,
			  bool *have_cache,
			  unsigned flags,
			  struct open_bucket *ob)
{
	unsigned durability =
		bch_dev_bkey_exists(c, ob->dev)->mi.durability;

	BUG_ON(*nr_effective >= nr_replicas);

	__clear_bit(ob->dev, devs_may_alloc->d);
	*nr_effective	+= durability;
	*have_cache	|= !durability;

	ob_push(c, ptrs, ob);

	if (*nr_effective >= nr_replicas)
		return 1;
	if (ob->ec)
		return 1;
	return 0;
}

int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
				struct open_buckets *ptrs,
				struct dev_stripe_state *stripe,
				struct bch_devs_mask *devs_may_alloc,
				unsigned nr_replicas,
				unsigned *nr_effective,
				bool *have_cache,
				unsigned flags,
				enum bch_data_type data_type,
				enum bch_watermark watermark,
				struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct dev_alloc_list devs_sorted =
		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
	unsigned dev;
	struct bch_dev *ca;
	int ret = -BCH_ERR_insufficient_devices;
	unsigned i;

	BUG_ON(*nr_effective >= nr_replicas);

	for (i = 0; i < devs_sorted.nr; i++) {
		struct bch_dev_usage usage;
		struct open_bucket *ob;

		dev = devs_sorted.devs[i];

		rcu_read_lock();
		ca = rcu_dereference(c->devs[dev]);
		if (ca)
			percpu_ref_get(&ca->ref);
		rcu_read_unlock();

		if (!ca)
			continue;

		if (!ca->mi.durability && *have_cache) {
			percpu_ref_put(&ca->ref);
			continue;
		}

		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
		if (!IS_ERR(ob))
			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
		percpu_ref_put(&ca->ref);

		if (IS_ERR(ob)) {
			ret = PTR_ERR(ob);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
				break;
			continue;
		}

		ob->data_type = data_type;

		if (add_new_bucket(c, ptrs, devs_may_alloc,
				   nr_replicas, nr_effective,
				   have_cache, flags, ob)) {
			ret = 0;
			break;
		}
	}

	return ret;
}

/* Allocate from stripes: */

/*
 * if we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want:
 */

static int bucket_alloc_from_stripe(struct btree_trans *trans,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    u16 target,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache,
				    enum bch_watermark watermark,
				    unsigned flags,
				    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct dev_alloc_list devs_sorted;
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i, ec_idx;
	int ret = 0;

	if (nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, ptrs))
		return 0;

	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);

	for (i = 0; i < devs_sorted.nr; i++)
		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			if (!h->s->blocks[ec_idx])
				continue;

			ob = c->open_buckets + h->s->blocks[ec_idx];
			if (ob->dev == devs_sorted.devs[i] &&
			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
				goto got_bucket;
		}
	goto out_put_head;
got_bucket:
	ob->ec_idx	= ec_idx;
	ob->ec		= h->s;
	ec_stripe_new_get(h->s, STRIPE_REF_io);

	ret = add_new_bucket(c, ptrs, devs_may_alloc,
			     nr_replicas, nr_effective,
			     have_cache, flags, ob);
out_put_head:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}

/* Sector allocator */

static bool want_bucket(struct bch_fs *c,
			struct write_point *wp,
			struct bch_devs_mask *devs_may_alloc,
			bool *have_cache, bool ec,
			struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	if (!test_bit(ob->dev, devs_may_alloc->d))
		return false;

	if (ob->data_type != wp->data_type)
		return false;

	if (!ca->mi.durability &&
	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
		return false;

	if (ec != (ob->ec != NULL))
		return false;

	return true;
}

static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct open_buckets *ptrs,
				       struct write_point *wp,
				       struct bch_devs_mask *devs_may_alloc,
				       unsigned nr_replicas,
				       unsigned *nr_effective,
				       bool *have_cache,
				       bool ec, unsigned flags)
{
	struct open_buckets ptrs_skip = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;
	int ret = 0;

	open_bucket_for_each(c, &wp->ptrs, ob, i) {
		if (!ret && want_bucket(c, wp, devs_may_alloc,
					have_cache, ec, ob))
			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, flags, ob);
		else
			ob_push(c, &ptrs_skip, ob);
	}
	wp->ptrs = ptrs_skip;

	return ret;
}

static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache, bool ec,
				    enum bch_watermark watermark,
				    unsigned flags)
{
	int i, ret = 0;

	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	if (!c->open_buckets_partial_nr)
		goto unlock;

	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
			struct bch_dev_usage usage;
			u64 avail;

			bch2_dev_usage_read_fast(ca, &usage);
			avail = dev_buckets_free(ca, usage, watermark);
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, flags, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}

static int __open_bucket_add_buckets(struct btree_trans *trans,
			struct open_buckets *ptrs,
			struct write_point *wp,
			struct bch_devs_list *devs_have,
			u16 target,
			bool erasure_code,
			unsigned nr_replicas,
			unsigned *nr_effective,
			bool *have_cache,
			enum bch_watermark watermark,
			unsigned flags,
			struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs;
	struct open_bucket *ob;
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	devs = target_rw_devs(c, wp->data_type, target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*devs_have, i)
		__clear_bit(*i, devs.d);

	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);

	if (erasure_code && ec_open_bucket(c, ptrs))
		return 0;

	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
					  nr_replicas, nr_effective,
					  have_cache, erasure_code, flags);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
				       nr_replicas, nr_effective,
				       have_cache, erasure_code, watermark, flags);
	if (ret)
		return ret;

	if (erasure_code) {
		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
					       target,
					       nr_replicas, nr_effective,
					       have_cache,
					       watermark, flags, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
						  nr_replicas, nr_effective, have_cache,
						  flags, wp->data_type, watermark, cl);
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}

static int open_bucket_add_buckets(struct btree_trans *trans,
			struct open_buckets *ptrs,
			struct write_point *wp,
			struct bch_devs_list *devs_have,
			u16 target,
			unsigned erasure_code,
			unsigned nr_replicas,
			unsigned *nr_effective,
			bool *have_cache,
			enum bch_watermark watermark,
			unsigned flags,
			struct closure *cl)
{
	int ret;

	if (erasure_code) {
		ret = __open_bucket_add_buckets(trans, ptrs, wp,
				devs_have, target, erasure_code,
				nr_replicas, nr_effective, have_cache,
				watermark, flags, cl);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
			return ret;
		if (*nr_effective >= nr_replicas)
			return 0;
	}

	ret = __open_bucket_add_buckets(trans, ptrs, wp,
			devs_have, target, false,
			nr_replicas, nr_effective, have_cache,
			watermark, flags, cl);
	return ret < 0 ? ret : 0;
}

/**
 * should_drop_bucket - check if this open_bucket should go away
 * @ob:		open_bucket to predicate on
 * @c:		filesystem handle
 * @ca:		if set, we're killing buckets for a particular device
 * @ec:		if true, we're shutting down erasure coding and killing all ec
 *		open_buckets
 * Returns: true if we should kill this open_bucket
 *
 * We're killing open_buckets because we're shutting down a device, erasure
 * coding, or the entire filesystem - check if this open_bucket matches:
 */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
{
	if (ec) {
		return ob->ec != NULL;
	} else if (ca) {
		bool drop = ob->dev == ca->dev_idx;
		struct open_bucket *ob2;
		unsigned i;

		if (!drop && ob->ec) {
			unsigned nr_blocks;

			mutex_lock(&ob->ec->lock);
			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;

			for (i = 0; i < nr_blocks; i++) {
				if (!ob->ec->blocks[i])
					continue;

				ob2 = c->open_buckets + ob->ec->blocks[i];
				drop |= ob2->dev == ca->dev_idx;
			}
			mutex_unlock(&ob->ec->lock);
		}

		return drop;
	} else {
		return true;
	}
}

static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
				 bool ec, struct write_point *wp)
{
	struct open_buckets ptrs = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (should_drop_bucket(ob, c, ca, ec))
			bch2_open_bucket_put(c, ob);
		else
			ob_push(c, &ptrs, ob);
	wp->ptrs = ptrs;
	mutex_unlock(&wp->lock);
}

void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
			    bool ec)
{
	unsigned i;

	/* Close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	spin_lock(&c->freelist_lock);
	i = 0;
	while (i < c->open_buckets_partial_nr) {
		struct open_bucket *ob =
			c->open_buckets + c->open_buckets_partial[i];

		if (should_drop_bucket(ob, c, ca, ec)) {
			--c->open_buckets_partial_nr;
			swap(c->open_buckets_partial[i],
			     c->open_buckets_partial[c->open_buckets_partial_nr]);
			ob->on_partial_list = false;
			spin_unlock(&c->freelist_lock);
			bch2_open_bucket_put(c, ob);
			spin_lock(&c->freelist_lock);
		} else {
			i++;
		}
	}
	spin_unlock(&c->freelist_lock);

	bch2_ec_stop_dev(c, ca);
}

static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
						 unsigned long write_point)
{
	unsigned hash =
		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));

	return &c->write_points_hash[hash];
}

static struct write_point *__writepoint_find(struct hlist_head *head,
					     unsigned long write_point)
{
	struct write_point *wp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(wp, head, node)
		if (wp->write_point == write_point)
			goto out;
	wp = NULL;
out:
	rcu_read_unlock();
	return wp;
}

static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
	u64 stranded	= c->write_points_nr * c->bucket_size_max;
	u64 free	= bch2_fs_usage_read_short(c).free;

	return stranded * factor > free;
}
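
/*
 * Illustrative check with invented numbers: 32 write points and a maximum
 * bucket size of 1024 sectors can strand up to 32 * 1024 = 32768 sectors in
 * partially written buckets. With factor 8, this reports "too many" once free
 * space drops below 8 * 32768 = 262144 sectors - so write points get shed
 * well before stranded space could dominate what's left.
 */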

static bool try_increase_writepoints(struct bch_fs *c)
{
	struct write_point *wp;

	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
	    too_many_writepoints(c, 32))
		return false;

	wp = c->write_points + c->write_points_nr++;
	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
	return true;
}

static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->write_points_hash_lock);
	if (c->write_points_nr < old_nr) {
		mutex_unlock(&c->write_points_hash_lock);
		return true;
	}

	if (c->write_points_nr == 1 ||
	    !too_many_writepoints(c, 8)) {
		mutex_unlock(&c->write_points_hash_lock);
		return false;
	}

	wp = c->write_points + --c->write_points_nr;

	hlist_del_rcu(&wp->node);
	mutex_unlock(&c->write_points_hash_lock);

	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);
	wp->ptrs.nr = 0;
	mutex_unlock(&wp->lock);
	return true;
}

static struct write_point *writepoint_find(struct btree_trans *trans,
					   unsigned long write_point)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp, *oldest;
	struct hlist_head *head;

	if (!(write_point & 1UL)) {
		wp = (struct write_point *) write_point;
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		return wp;
	}

	head = writepoint_hash(c, write_point);
restart_find:
	wp = __writepoint_find(head, write_point);
	if (wp) {
lock_wp:
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		if (wp->write_point == write_point)
			goto out;
		mutex_unlock(&wp->lock);
		goto restart_find;
	}
restart_find_oldest:
	oldest = NULL;
	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++)
		if (!oldest || time_before64(wp->last_used, oldest->last_used))
			oldest = wp;

	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
	if (oldest >= c->write_points + c->write_points_nr ||
	    try_increase_writepoints(c)) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto restart_find_oldest;
	}

	wp = __writepoint_find(head, write_point);
	if (wp && wp != oldest) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto lock_wp;
	}

	wp = oldest;
	hlist_del_rcu(&wp->node);
	wp->write_point = write_point;
	hlist_add_head_rcu(&wp->node, head);
	mutex_unlock(&c->write_points_hash_lock);
out:
	wp->last_used = local_clock();
	return wp;
}

static noinline void
deallocate_extra_replicas(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct open_buckets *ptrs_no_use,
			  unsigned extra_replicas)
{
	struct open_buckets ptrs2 = { 0 };
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, ptrs, ob, i) {
		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;

		if (d && d <= extra_replicas) {
			extra_replicas -= d;
			ob_push(c, ptrs_no_use, ob);
		} else {
			ob_push(c, &ptrs2, ob);
		}
	}

	*ptrs = ptrs2;
}
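
/*
 * Worked example (invented durabilities, for illustration): nr_replicas = 3
 * but the buckets gathered have durabilities { 2, 1, 1 }, so nr_effective = 4
 * and extra_replicas = 1. The first durability-1 bucket encountered is moved
 * to ptrs_no_use (and later freed back to the partial list by the caller),
 * leaving exactly the requested three effective replicas.
 */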

/*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
				   unsigned target,
				   unsigned erasure_code,
				   struct write_point_specifier write_point,
				   struct bch_devs_list *devs_have,
				   unsigned nr_replicas,
				   unsigned nr_replicas_required,
				   enum bch_watermark watermark,
				   unsigned flags,
				   struct closure *cl,
				   struct write_point **wp_ret)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	struct open_buckets ptrs;
	unsigned nr_effective, write_points_nr;
	bool have_cache;
	int ret;
	int i;

	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
		erasure_code = false;

	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);

	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
	ptrs.nr		= 0;
	nr_effective	= 0;
	write_points_nr	= c->write_points_nr;
	have_cache	= false;

	*wp_ret = wp = writepoint_find(trans, write_point.v);

	/* metadata may not allocate on cache devices: */
	if (wp->data_type != BCH_DATA_user)
		have_cache = true;

	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, NULL);
		if (!ret ||
		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto alloc_done;

		/* Don't retry from all devices if we're out of open buckets: */
		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
			if (!ret2 ||
			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
				ret = ret2;
				goto alloc_done;
			}
		}

		/*
		 * Only try to allocate cache (durability = 0 devices) from the
		 * specified target:
		 */
		have_cache = true;

		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      0, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	} else {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	}
alloc_done:
	BUG_ON(!ret && nr_effective < nr_replicas);

	if (erasure_code && !ec_open_bucket(c, &ptrs))
		pr_debug("failed to get ec bucket: ret %u", ret);

	if (ret == -BCH_ERR_insufficient_devices &&
	    nr_effective >= nr_replicas_required)
		ret = 0;

	if (ret)
		goto err;

	if (nr_effective > nr_replicas)
		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);

	/* Free buckets we didn't use: */
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);

	wp->ptrs = ptrs;

	wp->sectors_free = UINT_MAX;

	open_bucket_for_each(c, &wp->ptrs, ob, i)
		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);

	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);

	return 0;
err:
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
			ob_push(c, &ptrs, ob);
		else
			open_bucket_free_unused(c, ob);
	wp->ptrs = ptrs;

	mutex_unlock(&wp->lock);

	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
	    try_decrease_writepoints(trans, write_points_nr))
		goto retry;

	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
		return cl
			? -BCH_ERR_bucket_alloc_blocked
			: -BCH_ERR_ENOSPC_bucket_alloc;

	return ret;
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	return (struct bch_extent_ptr) {
		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.gen	= ob->gen,
		.dev	= ob->dev,
		.offset	= bucket_to_sector(ca, ob->bucket) +
			ca->mi.bucket_size -
			ob->sectors_free,
	};
}

/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}
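
/*
 * A hedged sketch of the overall sector-allocator flow, with error handling
 * and most arguments elided; "k" stands in for whatever extent key the caller
 * is building:
 *
 *	struct write_point *wp;
 *
 *	ret = bch2_alloc_sectors_start_trans(trans, target, erasure_code,
 *					     write_point, devs_have,
 *					     nr_replicas, nr_replicas_required,
 *					     watermark, flags, cl, &wp);
 *	if (ret)
 *		return ret;
 *
 *	sectors = min(sectors, wp->sectors_free);
 *	bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, cached);
 *	bch2_alloc_sectors_done(c, wp);		// unlocks wp
 *
 * The open_bucket references taken here are dropped only after the index
 * update that makes the new extent reachable, per the comment at the top of
 * this file.
 */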

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->data_type = type;

	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
	INIT_LIST_HEAD(&wp->writes);
	spin_lock_init(&wp->writes_lock);
}

void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
	struct open_bucket *ob;
	struct write_point *wp;

	mutex_init(&c->write_points_hash_lock);
	c->write_points_nr = ARRAY_SIZE(c->write_points);

	/* open bucket 0 is a sentinel NULL: */
	spin_lock_init(&c->open_buckets[0].lock);

	for (ob = c->open_buckets + 1;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
		spin_lock_init(&ob->lock);
		c->open_buckets_nr_free++;

		ob->freelist = c->open_buckets_freelist;
		c->open_buckets_freelist = ob - c->open_buckets;
	}

	writepoint_init(&c->btree_write_point,		BCH_DATA_btree);
	writepoint_init(&c->rebalance_write_point,	BCH_DATA_user);
	writepoint_init(&c->copygc_write_point,		BCH_DATA_user);

	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++) {
		writepoint_init(wp, BCH_DATA_user);

		wp->last_used	= local_clock();
		wp->write_point	= (unsigned long) wp;
		hlist_add_head_rcu(&wp->node,
				   writepoint_hash(c, wp->write_point));
	}
}

static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
	unsigned data_type = ob->data_type;
	barrier();	/* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
		prt_printf(out, " ec idx %llu", ob->ec->idx);
	if (ob->on_partial_list)
		prt_str(out, " partial");
	prt_newline(out);
}

void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct open_bucket *ob;

	out->atomic++;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list)
			bch2_open_bucket_to_text(out, c, ob);
		spin_unlock(&ob->lock);
	}

	--out->atomic;
}

void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned i;

	out->atomic++;
	spin_lock(&c->freelist_lock);

	for (i = 0; i < c->open_buckets_partial_nr; i++)
		bch2_open_bucket_to_text(out, c,
				c->open_buckets + c->open_buckets_partial[i]);

	spin_unlock(&c->freelist_lock);
	--out->atomic;
}

static const char * const bch2_write_point_states[] = {
#define x(n)	#n,
	WRITE_POINT_STATES()
#undef x
	NULL
};

static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
				     struct write_point *wp)
{
	struct open_bucket *ob;
	unsigned i;

	prt_printf(out, "%lu: ", wp->write_point);
	prt_human_readable_u64(out, wp->sectors_allocated);

	prt_printf(out, " last wrote: ");
	bch2_pr_time_units(out, sched_clock() - wp->last_used);

	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
		prt_printf(out, " %s: ", bch2_write_point_states[i]);
		bch2_pr_time_units(out, wp->time[i]);
	}

	prt_newline(out);

	printbuf_indent_add(out, 2);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		bch2_open_bucket_to_text(out, c, ob);
	printbuf_indent_sub(out, 2);
}

void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct write_point *wp;

	prt_str(out, "Foreground write points\n");
	for (wp = c->write_points;
	     wp < c->write_points + ARRAY_SIZE(c->write_points);
	     wp++)
		bch2_write_point_to_text(out, c, wp);

	prt_str(out, "Copygc write point\n");
	bch2_write_point_to_text(out, c, &c->copygc_write_point);

	prt_str(out, "Rebalance write point\n");
	bch2_write_point_to_text(out, c, &c->rebalance_write_point);

	prt_str(out, "Btree write point\n");
	bch2_write_point_to_text(out, c, &c->btree_write_point);
}