// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

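/*
 * Keys in the subvolumes btree are read on virtually every filesystem
 * operation (for the subvolume -> snapshot mapping) and rarely modified,
 * so their cached entries use percpu reader locks to avoid contention on
 * the lock itself:
 */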
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

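/*
 * Lock a cached key for eviction: take both intent and write locks without
 * blocking; fails if either trylock fails or if the entry is dirty, since
 * dirty keys must be flushed before they can be evicted:
 */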
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

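/*
 * The freed lists are kept ordered by SRCU barrier sequence, so the
 * shrinker can stop scanning at the first entry whose grace period hasn't
 * elapsed yet; this helper walks backwards to reinsert an entry at its
 * correct position:
 */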
#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
		mutex_unlock(&bc->lock);
	}
}

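/*
 * Fast free path: strip the entry's key and put it on a freelist for reuse.
 * The SRCU sequence recorded here gates when the shrinker may actually
 * return the memory to the slab allocator:
 */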
static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k = NULL;
	ck->u64s = 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

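/*
 * Allocate a bkey_cached, preferring the freelists (this cpu's percpu
 * freelist first for non-pcpu-reader btrees, then the global lists) and
 * falling back to the slab allocator, which may drop and retake transaction
 * locks:
 */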
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_pcpu--;
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
				     kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

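/*
 * Fallback when allocation fails: scan the hash table for a clean entry
 * that we can lock and steal:
 */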
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

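/*
 * Create a new cached key for @path and insert it into the hash table.
 * Returns NULL if we raced with another thread inserting the same key, in
 * which case the caller should retry the lookup:
 */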
static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

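/*
 * Fill @ck with the key's current value, read from the btree; (re)allocates
 * the key buffer if the existing one is too small:
 */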
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
			     BTREE_ITER_KEY_CACHE_FILL|
			     BTREE_ITER_CACHED_NOFILL);
	iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

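/*
 * Slowpath for traversing to a cached key: creates the key cache entry if
 * it doesn't exist yet, and fills it from the btree if it isn't valid:
 */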
static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

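/*
 * Fastpath: find and lock an existing, valid cached key, punting to the
 * slowpath when it's missing or not yet filled:
 */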
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

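/*
 * Flush a dirty cached key back to the btree it belongs to: commit the
 * cached value as an update to the underlying btree so the journal pin can
 * be dropped, optionally evicting the entry afterwards:
 */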
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

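/*
 * Journal pin flush callback: journal reclaim calls this when it needs the
 * sequence number pinned by @pin freed up, which means flushing the cached
 * key that holds the pin:
 */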
int bch2_btree_key_cache_journal_flush(struct journal *j,
				       struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

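/*
 * Called from the transaction commit path, with the cached key write
 * locked: update the cached value in place, mark it dirty, and pin the
 * journal sequence it was committed at so reclaim knows to flush it:
 */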
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

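/*
 * Shrinker scan: first free entries from the freed lists whose SRCU grace
 * period has elapsed, then do a clock-style scan of the hash table, giving
 * entries with BKEY_CACHED_ACCESSED set a second chance:
 */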
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
	}

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
	}

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
				goto next;
			} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
				goto next;
			} else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks		= 0;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}