// SPDX-License-Identifier: GPL-2.0
/*
 * Primary bucket allocation code
 *
 * Copyright 2012 Google, Inc.
 *
 * Allocation in bcache is done in terms of buckets:
 *
 * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in
 * btree pointers - they must match for the pointer to be considered valid.
 *
 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a
 * bucket simply by incrementing its gen.
 *
 * The gens (along with the priorities; it's really the gens that are
 * important, but the code is named as if it's the priorities) are written in
 * an arbitrary list of buckets on disk, with a pointer to them in the journal
 * header.
 *
 * When we invalidate a bucket, we have to write its new gen to disk and wait
 * for that write to complete before we use it - otherwise after a crash we
 * could have pointers that appeared to be good but pointed to data that had
 * been overwritten.
 *
 * Since the gens and priorities are all stored contiguously on disk, we can
 * batch this up: We fill up the free_inc list with freshly invalidated buckets,
 * call prio_write(), and when prio_write() finishes we pull buckets off the
 * free_inc list and optionally discard them.
 *
 * free_inc isn't the only freelist - if it was, we'd often have to sleep while
 * priorities and gens were being written before we could allocate. c->free is a
 * smaller freelist, and buckets on that list are always ready to be used.
 *
 * If we've got discards enabled, that happens when a bucket moves from the
 * free_inc list to the free list.
 *
 * There is another freelist, because sometimes we have buckets that we know
 * have nothing pointing into them - these we can reuse without waiting for
 * priorities to be rewritten. These come from freed btree nodes and buckets
 * that garbage collection discovered no longer had valid keys pointing into
 * them (because they were overwritten). That's the unused list - buckets on the
 * unused list move to the free list, optionally being discarded in the process.
 *
 * It's also important to ensure that gens don't wrap around - with respect to
 * either the oldest gen in the btree or the gen on disk. This is quite
 * difficult to do in practice, but we explicitly guard against it anyway - if
 * a bucket is in danger of wrapping around we simply skip invalidating it that
 * time around, and we garbage collect or rewrite the priorities sooner than we
 * would have otherwise.
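 *
 * A sketch of the guard's arithmetic: gens are 8 bit, so if a bucket's gen
 * ever got far enough ahead of the oldest gen the btree still references, a
 * wrapped gen could compare equal to a stale pointer's gen and make that
 * pointer look valid again. bucket_gc_gen() tracks how far a bucket's gen
 * has travelled since the last gc, and once it reaches BUCKET_GC_GEN_MAX we
 * refuse to invalidate that bucket until gc has run.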
 *
 * bch_bucket_alloc() allocates a single bucket from a specific cache.
 *
 * bch_bucket_alloc_set() allocates one bucket out of a cache set.
 *
 * The allocator thread (bch_allocator_thread()) drives all the processes
 * described above. It's woken up from bch_bucket_alloc() and a few other
 * places that need to make sure free buckets are ready.
 *
 * invalidate_buckets_(lru|fifo|random)() find buckets that are available to
 * be invalidated, and then invalidate them and stick them on the free_inc
 * list - in lru, fifo or random order.
 */

#include "bcache.h"
#include "btree.h"

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_OPEN_BUCKETS 128

/* Bucket heap / gen */

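/*
 * Bump a bucket's gen, and track in need_gc the largest distance any bucket
 * has travelled since the last garbage collection (bucket_gc_gen()), so that
 * gc can be kicked off before any gen gets close to wrapping.
 */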
uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
{
	uint8_t ret = ++b->gen;

	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);

	return ret;
}

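/*
 * Priorities decay as IO happens: each call subtracts @sectors from
 * c->rescale, and once it goes negative every data bucket that isn't pinned
 * has its prio decremented by one. The decay step is 1/1024th of the cache's
 * capacity (e.g. on a 1 TiB cache, one decay pass per ~1 GiB of IO), so
 * recently used buckets keep a relatively high prio and the LRU allocator
 * invalidates them last.
 */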
void bch_rescale_priorities(struct cache_set *c, int sectors)
{
	struct cache *ca;
	struct bucket *b;
	unsigned long next = c->nbuckets * c->cache->sb.bucket_size / 1024;
	int r;

	atomic_sub(sectors, &c->rescale);

	do {
		r = atomic_read(&c->rescale);

		if (r >= 0)
			return;
	} while (atomic_cmpxchg(&c->rescale, r, r + next) != r);

	mutex_lock(&c->bucket_lock);

	c->min_prio = USHRT_MAX;

	ca = c->cache;
	for_each_bucket(b, ca)
		if (b->prio &&
		    b->prio != BTREE_PRIO &&
		    !atomic_read(&b->pin)) {
			b->prio--;
			c->min_prio = min(c->min_prio, b->prio);
		}

	mutex_unlock(&c->bucket_lock);
}

/*
 * Background allocation thread: scans for buckets to be invalidated,
 * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
 * then optionally issues discard commands to the newly free buckets, then puts
 * them on the various freelists.
 */

static inline bool can_inc_bucket_gen(struct bucket *b)
{
	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
}

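/*
 * A bucket may be invalidated only if nothing points into it (it is unmarked
 * or marked reclaimable), nobody is holding a pin on it, and its gen can
 * still be incremented without risking wraparound.
 */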
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
{
	BUG_ON(!ca->set->gc_mark_valid);

	return (!GC_MARK(b) ||
		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
		!atomic_read(&b->pin) &&
		can_inc_bucket_gen(b);
}

void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	lockdep_assert_held(&ca->set->bucket_lock);
	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);

	if (GC_SECTORS_USED(b))
		trace_bcache_invalidate(ca, b - ca->buckets);

	bch_inc_gen(ca, b);
	b->prio = INITIAL_PRIO;
	atomic_inc(&b->pin);
}

static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
	__bch_invalidate_one_bucket(ca, b);

	fifo_push(&ca->free_inc, b - ca->buckets);
}

/*
 * Determines what order we're going to reuse buckets, smallest bucket_prio()
 * first: we also take into account the number of sectors of live data in that
 * bucket, and in order for that multiply to make sense we have to scale bucket
 * priorities first.
 *
 * Thus, we scale the bucket priorities so that the bucket with the smallest
 * prio is worth 1/8th of what INITIAL_PRIO is worth.
 */

#define bucket_prio(b)							\
({									\
	unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;	\
									\
	(b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);	\
})
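
/*
 * A worked example of the scaling (assuming INITIAL_PRIO is 32768, as
 * defined in bcache.h): if the smallest prio in the cache set is 0, min_prio
 * above is 32768 / 8 = 4096, so a long-untouched bucket weighs
 * 4096 * GC_SECTORS_USED(b) while a freshly written one weighs
 * 36864 * GC_SECTORS_USED(b) - a 9:1 spread, which keeps the live-data size
 * from completely drowning out recency.
 */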

#define bucket_max_cmp(l, r)	(bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r)	(bucket_prio(l) > bucket_prio(r))

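/*
 * Keep the heap filled with the best candidates: with bucket_max_cmp the
 * worst (largest bucket_prio()) candidate sits at the root, so any better
 * bucket we come across replaces it. Once the scan is done, re-heapify with
 * bucket_min_cmp and pop candidates off smallest-prio first.
 */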
static void invalidate_buckets_lru(struct cache *ca)
{
	struct bucket *b;
	ssize_t i;

	ca->heap.used = 0;

	for_each_bucket(b, ca) {
		if (!bch_can_invalidate_bucket(ca, b))
			continue;

		if (!heap_full(&ca->heap))
			heap_add(&ca->heap, b, bucket_max_cmp);
		else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
			ca->heap.data[0] = b;
			heap_sift(&ca->heap, 0, bucket_max_cmp);
		}
	}

	for (i = ca->heap.used / 2 - 1; i >= 0; --i)
		heap_sift(&ca->heap, i, bucket_min_cmp);

	while (!fifo_full(&ca->free_inc)) {
		if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
			/*
			 * We don't want to be calling invalidate_buckets()
			 * multiple times when it can't do anything
			 */
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}

		bch_invalidate_one_bucket(ca, b);
	}
}

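/*
 * FIFO: sweep the device in bucket order, picking up where the last sweep
 * left off (fifo_last_bucket). If a full pass over the device can't fill
 * free_inc, kick gc rather than spinning.
 */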
static void invalidate_buckets_fifo(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		if (ca->fifo_last_bucket <  ca->sb.first_bucket ||
		    ca->fifo_last_bucket >= ca->sb.nbuckets)
			ca->fifo_last_bucket = ca->sb.first_bucket;

		b = ca->buckets + ca->fifo_last_bucket++;

		if (bch_can_invalidate_bucket(ca, b))
			bch_invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets_random(struct cache *ca)
{
	struct bucket *b;
	size_t checked = 0;

	while (!fifo_full(&ca->free_inc)) {
		size_t n;

		get_random_bytes(&n, sizeof(n));

		n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket);
		n += ca->sb.first_bucket;

		b = ca->buckets + n;

		if (bch_can_invalidate_bucket(ca, b))
			bch_invalidate_one_bucket(ca, b);

		if (++checked >= ca->sb.nbuckets / 2) {
			ca->invalidate_needs_gc = 1;
			wake_up_gc(ca->set);
			return;
		}
	}
}

static void invalidate_buckets(struct cache *ca)
{
	BUG_ON(ca->invalidate_needs_gc);

	switch (CACHE_REPLACEMENT(&ca->sb)) {
	case CACHE_REPLACEMENT_LRU:
		invalidate_buckets_lru(ca);
		break;
	case CACHE_REPLACEMENT_FIFO:
		invalidate_buckets_fifo(ca);
		break;
	case CACHE_REPLACEMENT_RANDOM:
		invalidate_buckets_random(ca);
		break;
	}
}

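/*
 * Sleep until @cond becomes true, releasing bucket_lock while asleep and
 * retaking it before rechecking. Bails out to the caller's out: label if the
 * kthread is being stopped or IO on the cache set has been disabled, so it
 * can only be used inside bch_allocator_thread().
 */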
#define allocator_wait(ca, cond)					\
do {									\
	while (1) {							\
		set_current_state(TASK_INTERRUPTIBLE);			\
		if (cond)						\
			break;						\
									\
		mutex_unlock(&(ca)->set->bucket_lock);			\
		if (kthread_should_stop() ||				\
		    test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)) {	\
			set_current_state(TASK_RUNNING);		\
			goto out;					\
		}							\
									\
		schedule();						\
		mutex_lock(&(ca)->set->bucket_lock);			\
	}								\
	__set_current_state(TASK_RUNNING);				\
} while (0)

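/*
 * Hand a freshly invalidated bucket to the first freelist with room, trying
 * RESERVE_PRIO first: without buckets for prio/gen writes we couldn't free
 * anything else. Returns false if every freelist is full.
 */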
static bool bch_allocator_push(struct cache *ca, long bucket)
{
	unsigned int i;

	/* Prios/gens are actually the most important reserve */
	if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
		return true;

	for (i = 0; i < RESERVE_NR; i++)
		if (fifo_push(&ca->free[i], bucket))
			return true;

	return false;
}

static int bch_allocator_thread(void *arg)
{
	struct cache *ca = arg;

	mutex_lock(&ca->set->bucket_lock);

	while (1) {
		/*
		 * First, we pull buckets off of the unused and free_inc lists,
		 * possibly issue discards to them, then we add the bucket to
		 * the free list:
		 */
		while (1) {
			long bucket;

			if (!fifo_pop(&ca->free_inc, bucket))
				break;

			if (ca->discard) {
				mutex_unlock(&ca->set->bucket_lock);
				blkdev_issue_discard(ca->bdev,
					bucket_to_sector(ca->set, bucket),
					ca->sb.bucket_size, GFP_KERNEL);
				mutex_lock(&ca->set->bucket_lock);
			}

			allocator_wait(ca, bch_allocator_push(ca, bucket));
			wake_up(&ca->set->btree_cache_wait);
			wake_up(&ca->set->bucket_wait);
		}

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */

retry_invalidate:
		allocator_wait(ca, ca->set->gc_mark_valid &&
			       !ca->invalidate_needs_gc);
		invalidate_buckets(ca);

		/*
		 * Now, we write their new gens to disk so we can start writing
		 * new stuff to them:
		 */
		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
		if (CACHE_SYNC(&ca->sb)) {
			/*
			 * This could deadlock if an allocation with a btree
			 * node locked ever blocked - having the btree node
			 * locked would block garbage collection, but here we're
			 * waiting on garbage collection before we invalidate
			 * and free anything.
			 *
			 * But this should be safe since the btree code always
			 * uses btree_check_reserve() before allocating now, and
			 * if it fails it blocks without btree nodes locked.
			 */
			if (!fifo_full(&ca->free_inc))
				goto retry_invalidate;

			if (bch_prio_write(ca, false) < 0) {
				ca->invalidate_needs_gc = 1;
				wake_up_gc(ca->set);
			}
		}
	}
out:
	wait_for_kthread_stop();
	return 0;
}

/* Allocation */

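/*
 * Allocate a single bucket. Called with bucket_lock held (it's dropped and
 * retaken if we have to sleep): pop a bucket off the general-purpose
 * freelist, or failing that the freelist for @reserve. If both are empty and
 * @wait is true, sleep on bucket_wait until the allocator thread refills
 * them; otherwise fail and return -1. The returned bucket comes back pinned,
 * marked and with its prio set according to the reserve it was taken for.
 */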
long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
{
	DEFINE_WAIT(w);
	struct bucket *b;
	long r;

	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
		return -1;

	/* fastpath */
	if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
	    fifo_pop(&ca->free[reserve], r))
		goto out;

	if (!wait) {
		trace_bcache_alloc_fail(ca, reserve);
		return -1;
	}

	do {
		prepare_to_wait(&ca->set->bucket_wait, &w,
				TASK_UNINTERRUPTIBLE);

		mutex_unlock(&ca->set->bucket_lock);
		schedule();
		mutex_lock(&ca->set->bucket_lock);
	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
		 !fifo_pop(&ca->free[reserve], r));

	finish_wait(&ca->set->bucket_wait, &w);
out:
	if (ca->alloc_thread)
		wake_up_process(ca->alloc_thread);

	trace_bcache_alloc(ca, reserve);

	if (expensive_debug_checks(ca->set)) {
		size_t iter;
		long i;
		unsigned int j;

		for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
			BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);

		for (j = 0; j < RESERVE_NR; j++)
			fifo_for_each(i, &ca->free[j], iter)
				BUG_ON(i == r);
		fifo_for_each(i, &ca->free_inc, iter)
			BUG_ON(i == r);
	}

	b = ca->buckets + r;

	BUG_ON(atomic_read(&b->pin) != 1);

	SET_GC_SECTORS_USED(b, ca->sb.bucket_size);

	if (reserve <= RESERVE_PRIO) {
		SET_GC_MARK(b, GC_MARK_METADATA);
		SET_GC_MOVE(b, 0);
		b->prio = BTREE_PRIO;
	} else {
		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
		SET_GC_MOVE(b, 0);
		b->prio = INITIAL_PRIO;
	}

	if (ca->set->avail_nbuckets > 0) {
		ca->set->avail_nbuckets--;
		bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
	}

	return r;
}

void __bch_bucket_free(struct cache *ca, struct bucket *b)
{
	SET_GC_MARK(b, 0);
	SET_GC_SECTORS_USED(b, 0);

	if (ca->set->avail_nbuckets < ca->set->nbuckets) {
		ca->set->avail_nbuckets++;
		bch_update_bucket_in_use(ca->set, &ca->set->gc_stats);
	}
}

void bch_bucket_free(struct cache_set *c, struct bkey *k)
{
	unsigned int i;

	for (i = 0; i < KEY_PTRS(k); i++)
		__bch_bucket_free(c->cache, PTR_BUCKET(c, k, i));
}

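/*
 * Allocate one bucket and package it as a single-pointer bkey in @k,
 * encoding the bucket's current gen so the pointer can later be checked for
 * staleness against it. Caller must hold bucket_lock.
 */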
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			   struct bkey *k, bool wait)
{
	struct cache *ca;
	long b;

	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
		return -1;

	lockdep_assert_held(&c->bucket_lock);

	bkey_init(k);

	ca = c->cache;
	b = bch_bucket_alloc(ca, reserve, wait);
	if (b == -1)
		goto err;

	k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
			     bucket_to_sector(c, b),
			     ca->sb.nr_this_dev);

	SET_KEY_PTRS(k, 1);

	return 0;
err:
	bch_bucket_free(c, k);
	bkey_put(c, k);
	return -1;
}

int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			 struct bkey *k, bool wait)
{
	int ret;

	mutex_lock(&c->bucket_lock);
	ret = __bch_bucket_alloc_set(c, reserve, k, wait);
	mutex_unlock(&c->bucket_lock);
	return ret;
}

/* Sector allocator */

struct open_bucket {
	struct list_head	list;
	unsigned int		last_write_point;
	unsigned int		sectors_free;
	BKEY_PADDED(key);
};

/*
 * We keep multiple buckets open for writes, and try to segregate different
 * write streams for better cache utilization: first we try to segregate flash
 * only volume write streams from cached devices, secondly we look for a bucket
 * where the last write to it was sequential with the current write, and
 * failing that we look for a bucket that was last used by the same task.
 *
 * The idea is that if you've got multiple tasks pulling data into the cache
 * at the same time, you'll get better cache utilization if you try to
 * segregate their data and preserve locality.
 *
 * For example, dirty sectors of a flash only volume are not reclaimable; if
 * they are mixed into buckets with dirty sectors from a cached device, those
 * buckets stay marked dirty and won't be reclaimed, even after the cached
 * device's dirty data has been written back to the backing device.
 *
 * And say you've started Firefox at the same time you're copying a
 * bunch of files. Firefox will likely end up being fairly hot and stay in the
 * cache awhile, but the data you copied might not be; if you wrote all that
 * data to the same buckets it'd get invalidated at the same time.
 *
 * Both of those tasks will be doing fairly random IO so we can't rely on
 * detecting sequential IO to segregate their data, but going off of the task
 * should be a sane heuristic.
 */
static struct open_bucket *pick_data_bucket(struct cache_set *c,
					    const struct bkey *search,
					    unsigned int write_point,
					    struct bkey *alloc)
{
	struct open_bucket *ret, *ret_task = NULL;

	list_for_each_entry_reverse(ret, &c->data_buckets, list)
		if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
		    UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
			continue;
		else if (!bkey_cmp(&ret->key, search))
			goto found;
		else if (ret->last_write_point == write_point)
			ret_task = ret;

	ret = ret_task ?: list_first_entry(&c->data_buckets,
					   struct open_bucket, list);
found:
	if (!ret->sectors_free && KEY_PTRS(alloc)) {
		ret->sectors_free = c->cache->sb.bucket_size;
		bkey_copy(&ret->key, alloc);
		bkey_init(alloc);
	}

	if (!ret->sectors_free)
		ret = NULL;

	return ret;
}

/*
 * Allocates some space in the cache to write to, sets k to point to the newly
 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
 * end of the newly allocated space).
 *
 * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
 * sectors were actually allocated.
 *
 * If @wait is true, will not fail.
 */
bool bch_alloc_sectors(struct cache_set *c,
		       struct bkey *k,
		       unsigned int sectors,
		       unsigned int write_point,
		       unsigned int write_prio,
		       bool wait)
{
	struct open_bucket *b;
	BKEY_PADDED(key) alloc;
	unsigned int i;

	/*
	 * We might have to allocate a new bucket, which we can't do with a
	 * spinlock held. So if we have to allocate, we drop the lock, allocate
	 * and then retry. KEY_PTRS() indicates whether alloc points to
	 * allocated bucket(s).
	 */

	bkey_init(&alloc.key);
	spin_lock(&c->data_bucket_lock);

	while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
		unsigned int watermark = write_prio
			? RESERVE_MOVINGGC
			: RESERVE_NONE;

		spin_unlock(&c->data_bucket_lock);

		if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
			return false;

		spin_lock(&c->data_bucket_lock);
	}

	/*
	 * If we had to allocate, we might race and not need to allocate the
	 * second time we call pick_data_bucket(). If we allocated a bucket but
	 * didn't use it, drop the refcount bch_bucket_alloc_set() took:
	 */
	if (KEY_PTRS(&alloc.key))
		bkey_put(c, &alloc.key);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		EBUG_ON(ptr_stale(c, &b->key, i));

	/* Set up the pointer to the space we're allocating: */

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		k->ptr[i] = b->key.ptr[i];

	sectors = min(sectors, b->sectors_free);

	SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
	SET_KEY_SIZE(k, sectors);
	SET_KEY_PTRS(k, KEY_PTRS(&b->key));

	/*
	 * Move b to the end of the lru, and keep track of what this bucket was
	 * last used for:
	 */
	list_move_tail(&b->list, &c->data_buckets);
	bkey_copy_key(&b->key, k);
	b->last_write_point = write_point;

	b->sectors_free -= sectors;

	for (i = 0; i < KEY_PTRS(&b->key); i++) {
		SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);

		atomic_long_add(sectors,
				&c->cache->sectors_written);
	}

	if (b->sectors_free < c->cache->sb.block_size)
		b->sectors_free = 0;

	/*
	 * k takes refcounts on the buckets it points to until it's inserted
	 * into the btree, but if we're done with this bucket we just transfer
	 * pick_data_bucket()'s refcount.
	 */
	if (b->sectors_free)
		for (i = 0; i < KEY_PTRS(&b->key); i++)
			atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);

	spin_unlock(&c->data_bucket_lock);
	return true;
}
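
/*
 * A sketch of how a caller typically uses this (the data insert path in
 * request.c is the real user; this is illustrative pseudocode, not an exact
 * excerpt): allocate as much of the write as one open bucket allows, emit a
 * key for that fragment, and loop until the whole IO is covered:
 *
 *	while (sectors_remaining) {
 *		set up a bkey k ending at the write's current offset;
 *
 *		if (!bch_alloc_sectors(c, k, sectors_remaining,
 *				       write_point, write_prio, wait))
 *			break;
 *
 *		sectors_remaining -= KEY_SIZE(k);
 *		submit the fragment described by k;
 *	}
 */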

/* Init */

void bch_open_buckets_free(struct cache_set *c)
{
	struct open_bucket *b;

	while (!list_empty(&c->data_buckets)) {
		b = list_first_entry(&c->data_buckets,
				     struct open_bucket, list);
		list_del(&b->list);
		kfree(b);
	}
}

int bch_open_buckets_alloc(struct cache_set *c)
{
	int i;

	spin_lock_init(&c->data_bucket_lock);

	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);

		if (!b)
			return -ENOMEM;

		list_add(&b->list, &c->data_buckets);
	}

	return 0;
}

int bch_cache_allocator_start(struct cache *ca)
{
	struct task_struct *k = kthread_run(bch_allocator_thread,
					    ca, "bcache_allocator");
	if (IS_ERR(k))
		return PTR_ERR(k);

	ca->alloc_thread = k;
	return 0;
}