// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "sb-members.h"
#include "trace.h"

#include <linux/kthread.h>
#include <linux/sched/mm.h>

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
				   enum journal_space_from from)
{
	switch (from) {
	case journal_space_discarded:
		return ja->discard_idx;
	case journal_space_clean_ondisk:
		return ja->dirty_idx_ondisk;
	case journal_space_clean:
		return ja->dirty_idx;
	default:
		BUG();
	}
}

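/*
 * Number of buckets available for new journal writes on @ja: the distance,
 * in the ring of ja->nr buckets, from the bucket after cur_idx up to the
 * index selected by @from (e.g. with nr = 8, cur_idx = 6 and discard_idx = 2,
 * (2 - 6 - 1 + 8) % 8 = 3 buckets are free):
 */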
unsigned bch2_journal_dev_buckets_available(struct journal *j,
					    struct journal_device *ja,
					    enum journal_space_from from)
{
	unsigned available = (journal_space_from(ja, from) -
			      ja->cur_idx - 1 + ja->nr) % ja->nr;

	/*
	 * Don't use the last bucket unless writing the new last_seq
	 * will make another bucket available:
	 */
	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
		--available;

	return available;
}

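/*
 * Raise the journal watermark to BCH_WATERMARK_reclaim - reserving the
 * remaining space for journal reclaim - when clean space falls to a quarter
 * of the journal, the pin fifo is more than 3/4 full, or the btree write
 * buffer needs flushing:
 */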
void bch2_journal_set_watermark(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool low_on_space = j->space[journal_space_clean].total * 4 <=
		j->space[journal_space_total].total;
	bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
	bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
	unsigned watermark = low_on_space || low_on_pin || low_on_wb
		? BCH_WATERMARK_reclaim
		: BCH_WATERMARK_stripe;

	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
		trace_and_count(c, journal_full, c);

	mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);

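	/*
	 * Publish the new watermark; swap() leaves the old value in
	 * @watermark, so if the watermark just went down, wake anyone who may
	 * now be able to allocate journal space:
	 */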
	swap(watermark, j->watermark);
	if (watermark > j->watermark)
		journal_wake(j);
}

static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
			    enum journal_space_from from)
{
	struct journal_device *ja = &ca->journal;
	unsigned sectors, buckets, unwritten;
	u64 seq;

	if (from == journal_space_total)
		return (struct journal_space) {
			.next_entry	= ca->mi.bucket_size,
			.total		= ca->mi.bucket_size * ja->nr,
		};

	buckets = bch2_journal_dev_buckets_available(j, ja, from);
	sectors = ja->sectors_free;

	/*
	 * Note that we don't allocate the space for a journal entry
	 * until we write it out - thus, account for it here:
	 */
	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors;

		if (!unwritten)
			continue;

		/* entry won't fit on this device, skip: */
		if (unwritten > ca->mi.bucket_size)
			continue;

		if (unwritten >= sectors) {
			if (!buckets) {
				sectors = 0;
				break;
			}

			buckets--;
			sectors = ca->mi.bucket_size;
		}

		sectors -= unwritten;
	}

	if (sectors < ca->mi.bucket_size && buckets) {
		buckets--;
		sectors = ca->mi.bucket_size;
	}

	return (struct journal_space) {
		.next_entry	= sectors,
		.total		= sectors + buckets * ca->mi.bucket_size,
	};
}

static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
			    enum journal_space_from from)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned pos, nr_devs = 0;
	struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];

	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));

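	/* Insertion sort each device's space into dev_space, largest first: */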
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		if (!ca->journal.nr)
			continue;

		space = journal_dev_space_available(j, ca, from);
		if (!space.next_entry)
			continue;

		for (pos = 0; pos < nr_devs; pos++)
			if (space.total > dev_space[pos].total)
				break;

		array_insert_item(dev_space, nr_devs, pos, space);
	}
	rcu_read_unlock();

	if (nr_devs < nr_devs_want)
		return (struct journal_space) { 0, 0 };

	/*
	 * We sorted largest to smallest, and we want the smallest out of the
	 * @nr_devs_want largest devices:
	 */
	return dev_space[nr_devs_want - 1];
}

void bch2_journal_space_available(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned clean, clean_ondisk, total;
	unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
				      j->buf[1].buf_size >> 9);
	unsigned nr_online = 0, nr_devs_want;
	bool can_discard = false;
	int ret = 0;

	lockdep_assert_held(&j->lock);

	rcu_read_lock();
	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

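		/*
		 * Advance dirty_idx (and dirty_idx_ondisk) past buckets whose
		 * newest entry is older than last_seq (resp. last_seq_ondisk) -
		 * those buckets are no longer needed for recovery:
		 */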
		while (ja->dirty_idx != ja->cur_idx &&
		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

		if (ja->discard_idx != ja->dirty_idx_ondisk)
			can_discard = true;

		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
		nr_online++;
	}
	rcu_read_unlock();

	j->can_discard = can_discard;

	if (nr_online < metadata_replicas_required(c)) {
		ret = JOURNAL_ERR_insufficient_devices;
		goto out;
	}

	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

	for (unsigned i = 0; i < journal_space_nr; i++)
		j->space[i] = __journal_space_available(j, nr_devs_want, i);

	clean_ondisk	= j->space[journal_space_clean_ondisk].total;
	clean		= j->space[journal_space_clean].total;
	total		= j->space[journal_space_total].total;

	if (!j->space[journal_space_discarded].next_entry)
		ret = JOURNAL_ERR_journal_full;

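	/*
	 * Non-flush journal writes don't advance last_seq_ondisk, so only
	 * allow skipping the pre-write flush while the space a flush would
	 * reclaim (clean - clean_ondisk) is a small fraction of the journal:
	 */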
	if ((j->space[journal_space_clean_ondisk].next_entry <
	     j->space[journal_space_clean_ondisk].total) &&
	    (clean - clean_ondisk <= total / 8) &&
	    (clean_ondisk * 2 > clean))
		set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
	else
		clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);

	bch2_journal_set_watermark(j);
out:
	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
	j->cur_entry_error	= ret;

	if (!ret)
		journal_wake(j);
}

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->discard_idx != ja->dirty_idx_ondisk;
	spin_unlock(&j->lock);

	return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	mutex_lock(&j->discard_lock);

	for_each_rw_member(c, ca) {
		struct journal_device *ja = &ca->journal;

		while (should_discard_bucket(j, ja)) {
			if (!c->opts.nochanges &&
			    ca->mi.discard &&
			    bdev_max_discard_sectors(ca->disk_sb.bdev))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->discard_idx]),
					ca->mi.bucket_size, GFP_NOFS);

			spin_lock(&j->lock);
			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

			bch2_journal_space_available(j);
			spin_unlock(&j->lock);
		}
	}

	mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

void bch2_journal_reclaim_fast(struct journal *j)
{
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!fifo_empty(&j->pin) &&
	       j->pin.front <= j->seq_ondisk &&
	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
		j->pin.front++;
		popped = true;
	}

	if (popped)
		bch2_journal_space_available(j);
}

bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	return atomic_dec_and_test(&pin_list->count);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	if (__bch2_journal_pin_put(j, seq)) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}

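/*
 * Returns true if the caller should run bch2_journal_reclaim_fast() - i.e. if
 * this was the last pin on the front pin list, so last_seq can now advance:
 */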
static inline bool __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list;

	if (!journal_pin_active(pin))
		return false;

	if (j->flush_in_progress == pin)
		j->flush_in_progress_dropped = true;

	pin_list = journal_seq_pin(j, pin->seq);
	pin->seq = 0;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	return atomic_dec_and_test(&pin_list->count) &&
		pin_list == &fifo_peek_front(&j->pin);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	if (__journal_pin_drop(j, pin))
		bch2_journal_reclaim_fast(j);
	spin_unlock(&j->lock);
}

static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
{
	if (fn == bch2_btree_node_flush0 ||
	    fn == bch2_btree_node_flush1)
		return JOURNAL_PIN_btree;
	else if (fn == bch2_btree_key_cache_journal_flush)
		return JOURNAL_PIN_key_cache;
	else
		return JOURNAL_PIN_other;
}

static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
					       struct journal_entry_pin *pin,
					       journal_pin_flush_fn flush_fn,
					       enum journal_pin_type type)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	/*
	 * flush_fn is how we identify journal pins in debugfs, so must always
	 * exist, even if it doesn't do anything:
	 */
	BUG_ON(!flush_fn);

	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;
	list_add(&pin->list, &pin_list->list[type]);
}

void bch2_journal_pin_copy(struct journal *j,
			   struct journal_entry_pin *dst,
			   struct journal_entry_pin *src,
			   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	u64 seq = READ_ONCE(src->seq);

	if (seq < journal_last_seq(j)) {
		/*
		 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
		 * the src pin - with the pin dropped, the entry to pin might no
		 * longer exist, but that means there's no longer anything to
		 * copy and we can bail out here:
		 */
		spin_unlock(&j->lock);
		return;
	}

	bool reclaim = __journal_pin_drop(j, dst);

	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	if (seq == journal_last_seq(j))
		journal_wake(j);
	spin_unlock(&j->lock);
}

void bch2_journal_pin_set(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	BUG_ON(seq < journal_last_seq(j));

	bool reclaim = __journal_pin_drop(j, pin);

	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));

	if (reclaim)
		bch2_journal_reclaim_fast(j);
	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	if (seq == journal_last_seq(j))
		journal_wake(j);

	spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 * @j: journal object
 * @pin: pin to flush
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
	BUG_ON(journal_pin_active(pin));

	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

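/*
 * @allowed_below_seq and @allowed_above_seq are bitmasks of
 * enum journal_pin_type: which pin types may be flushed from entries at or
 * below @seq_to_flush, and which from entries above it:
 */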
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j,
		     u64 seq_to_flush,
		     unsigned allowed_below_seq,
		     unsigned allowed_above_seq,
		     u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;
	unsigned i;

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
		if (*seq > seq_to_flush && !allowed_above_seq)
			break;

		for (i = 0; i < JOURNAL_PIN_NR; i++)
			if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
			    ((1U << i) & allowed_above_seq)) {
				ret = list_first_entry_or_null(&pin_list->list[i],
					struct journal_entry_pin, list);
				if (ret)
					return ret;
			}
	}

	return NULL;
}

/* returns the number of pins flushed: */
static size_t journal_flush_pins(struct journal *j,
				 u64 seq_to_flush,
				 unsigned allowed_below_seq,
				 unsigned allowed_above_seq,
				 unsigned min_any,
				 unsigned min_key_cache)
{
	struct journal_entry_pin *pin;
	size_t nr_flushed = 0;
	journal_pin_flush_fn flush_fn;
	u64 seq;
	int err;

	lockdep_assert_held(&j->reclaim_lock);

	while (1) {
		unsigned allowed_above = allowed_above_seq;
		unsigned allowed_below = allowed_below_seq;

		if (min_any) {
			allowed_above |= ~0;
			allowed_below |= ~0;
		}

		if (min_key_cache) {
			allowed_above |= 1U << JOURNAL_PIN_key_cache;
			allowed_below |= 1U << JOURNAL_PIN_key_cache;
		}

		cond_resched();

		j->last_flushed = jiffies;

		spin_lock(&j->lock);
		pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
		if (pin) {
			BUG_ON(j->flush_in_progress);
			j->flush_in_progress = pin;
			j->flush_in_progress_dropped = false;
			flush_fn = pin->flush;
		}
		spin_unlock(&j->lock);

		if (!pin)
			break;

		if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
			min_key_cache--;

		if (min_any)
			min_any--;

		err = flush_fn(j, pin, seq);

		spin_lock(&j->lock);
		/* Pin might have been dropped or rearmed: */
		if (likely(!err && !j->flush_in_progress_dropped))
			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
		j->flush_in_progress = NULL;
		j->flush_in_progress_dropped = false;
		spin_unlock(&j->lock);

		wake_up(&j->pin_flush_wait);

		if (err)
			break;

		nr_flushed++;
	}

	return nr_flushed;
}

static u64 journal_seq_to_flush(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	u64 seq_to_flush = 0;

	spin_lock(&j->lock);

	for_each_rw_member(c, ca) {
		struct journal_device *ja = &ca->journal;
		unsigned nr_buckets, bucket_to_flush;

		if (!ja->nr)
			continue;

		/* Try to keep the journal at most half full: */
		nr_buckets = ja->nr / 2;

		nr_buckets = min(nr_buckets, ja->nr);

		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
		seq_to_flush = max(seq_to_flush,
				   ja->bucket_seq[bucket_to_flush]);
	}

	/* Also flush if the pin fifo is more than half full */
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	return seq_to_flush;
}

/**
 * __bch2_journal_reclaim - free up journal buckets
 * @j: journal object
 * @direct: direct or background reclaim?
 * @kicked: requested to run since we last ran?
 * Returns: 0 on success, or -EIO if the journal has been shutdown
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	u64 seq_to_flush;
	size_t min_nr, min_key_cache, nr_flushed;
	unsigned flags;
	int ret = 0;

	/*
	 * We can't invoke memory reclaim while holding the reclaim_lock -
	 * journal reclaim is required to make progress for memory reclaim
	 * (cleaning the caches), so we can't get stuck in memory reclaim while
	 * we're holding the reclaim lock:
	 */
	lockdep_assert_held(&j->reclaim_lock);
	flags = memalloc_noreclaim_save();

	do {
		if (kthread && kthread_should_stop())
			break;

		if (bch2_journal_error(j)) {
			ret = -EIO;
			break;
		}

		bch2_journal_do_discards(j);

		seq_to_flush = journal_seq_to_flush(j);
		min_nr = 0;

		/*
		 * If it's been longer than journal_reclaim_delay since we last
		 * flushed, make sure to flush at least one journal pin:
		 */
		if (time_after(jiffies, j->last_flushed +
			       msecs_to_jiffies(c->opts.journal_reclaim_delay)))
			min_nr = 1;

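		/*
		 * Similarly, flush at least one pin if the journal is under
		 * pressure (watermark raised) or more than half the btree
		 * cache is dirty:
		 */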
		if (j->watermark != BCH_WATERMARK_stripe)
			min_nr = 1;

		if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
			min_nr = 1;

		min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);

		trace_and_count(c, journal_reclaim_start, c,
				direct, kicked,
				min_nr, min_key_cache,
				atomic_read(&c->btree_cache.dirty),
				c->btree_cache.used,
				atomic_long_read(&c->btree_key_cache.nr_dirty),
				atomic_long_read(&c->btree_key_cache.nr_keys));

		nr_flushed = journal_flush_pins(j, seq_to_flush,
						~0, 0,
						min_nr, min_key_cache);

		if (direct)
			j->nr_direct_reclaim += nr_flushed;
		else
			j->nr_background_reclaim += nr_flushed;
		trace_and_count(c, journal_reclaim_finish, c, nr_flushed);

		if (nr_flushed)
			wake_up(&j->reclaim_wait);
	} while ((min_nr || min_key_cache) && nr_flushed && !direct);

	memalloc_noreclaim_restore(flags);

	return ret;
}

int bch2_journal_reclaim(struct journal *j)
{
	return __bch2_journal_reclaim(j, true, true);
}

static int bch2_journal_reclaim_thread(void *arg)
{
	struct journal *j = arg;
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned long delay, now;
	bool journal_empty;
	int ret = 0;

	set_freezable();

	j->last_flushed = jiffies;

	while (!ret && !kthread_should_stop()) {
		bool kicked = j->reclaim_kicked;

		j->reclaim_kicked = false;

		mutex_lock(&j->reclaim_lock);
		ret = __bch2_journal_reclaim(j, false, kicked);
		mutex_unlock(&j->reclaim_lock);

		now = jiffies;
		delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
		j->next_reclaim = j->last_flushed + delay;

		if (!time_in_range(j->next_reclaim, now, now + delay))
			j->next_reclaim = now + delay;

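		/*
		 * Sleep until we're kicked, until there are pins to reclaim,
		 * or until it's time for the next periodic run:
		 */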
		while (1) {
			set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
			if (kthread_should_stop())
				break;
			if (j->reclaim_kicked)
				break;

			spin_lock(&j->lock);
			journal_empty = fifo_empty(&j->pin);
			spin_unlock(&j->lock);

			if (journal_empty)
				schedule();
			else if (time_after(j->next_reclaim, jiffies))
				schedule_timeout(j->next_reclaim - jiffies);
			else
				break;
		}
		__set_current_state(TASK_RUNNING);
	}

	return 0;
}

void bch2_journal_reclaim_stop(struct journal *j)
{
	struct task_struct *p = j->reclaim_thread;

	j->reclaim_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_journal_reclaim_start(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct task_struct *p;
	int ret;

	if (j->reclaim_thread)
		return 0;

	p = kthread_create(bch2_journal_reclaim_thread, j,
			   "bch-reclaim/%s", c->name);
	ret = PTR_ERR_OR_ZERO(p);
	bch_err_msg(c, ret, "creating journal reclaim thread");
	if (ret)
		return ret;

	get_task_struct(p);
	j->reclaim_thread = p;
	wake_up_process(p);
	return 0;
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      bool *did_work)
{
	int ret;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&j->reclaim_lock);

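	/*
	 * Flush key cache and other pins first: flushing them may dirty btree
	 * nodes, which the second pass then writes out:
	 */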
	if (journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_key_cache)|
			       (1U << JOURNAL_PIN_other), 0, 0, 0) ||
	    journal_flush_pins(j, seq_to_flush,
			       (1U << JOURNAL_PIN_btree), 0, 0, 0))
		*did_work = true;

	if (seq_to_flush > journal_cur_seq(j))
		bch2_journal_entry_close(j);

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		!fifo_used(&j->pin);

	spin_unlock(&j->lock);
	mutex_unlock(&j->reclaim_lock);

	return ret;
}

bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	/* time_stats this */
	bool did_work = false;

	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return false;

	closure_wait_event(&j->async_wait,
		journal_flush_done(j, seq_to_flush, &did_work));

	return did_work;
}

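/*
 * Flush all journal entries that reference @dev_idx (or, if @dev_idx < 0, all
 * entries with fewer than metadata_replicas copies), then garbage collect the
 * journal replicas entries:
 */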
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	bch2_journal_flush_pins(j, seq);

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);

	/*
	 * Now that we've populated replicas_gc, write to the journal to mark
	 * active journal devices. This handles the case where the journal might
	 * be empty. Otherwise we could clear all journal replicas and
	 * temporarily put the fs into an unrecoverable state. Journal recovery
	 * expects to find devices marked for journal data on unclean mount.
	 */
	ret = bch2_journal_meta(&c->journal);
	if (ret)
		goto err;

	seq = 0;
	spin_lock(&j->lock);
	while (!ret) {
		struct bch_replicas_padded replicas;

		seq = max(seq, journal_last_seq(j));
		if (seq >= j->pin.back)
			break;
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 journal_seq_pin(j, seq)->devs);
		seq++;

		if (replicas.e.nr_devs) {
			spin_unlock(&j->lock);
			ret = bch2_mark_replicas(c, &replicas.e);
			spin_lock(&j->lock);
		}
	}
	spin_unlock(&j->lock);
err:
	ret = bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}