1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _BCACHEFS_JOURNAL_TYPES_H |
3 | #define _BCACHEFS_JOURNAL_TYPES_H |
4 | |
5 | #include <linux/cache.h> |
6 | #include <linux/workqueue.h> |
7 | |
8 | #include "alloc_types.h" |
9 | #include "super_types.h" |
10 | #include "fifo.h" |
11 | |
/*
 * Number of in-memory journal write buffers kept in struct journal (see
 * journal->buf[] below); buffer indices wrap modulo JOURNAL_BUF_NR via
 * JOURNAL_BUF_MASK.
 */
#define JOURNAL_BUF_BITS 2
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
15 | |
/*
 * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
 * the journal that are being staged or in flight.
 */
struct journal_buf {
	struct closure io;			/* tracks the in-flight write I/O */
	struct jset *data;			/* the journal entry being built up / written */

	__BKEY_PADDED(key, BCH_REPLICAS_MAX);	/* where this entry is/will be written */
	struct bch_devs_list devs_written;	/* devices the write actually completed on */

	struct closure_waitlist wait;		/* waiters on this write's completion */
	u64 last_seq; /* copy of data->last_seq */
	long expires;				/* when this entry should be written (jiffies-like; confirm units at use site) */
	u64 flush_time;

	unsigned buf_size; /* size in bytes of @data */
	unsigned sectors; /* maximum size for current entry */
	unsigned disk_sectors; /* maximum size entry could have been, if
				  buf_size was bigger */
	unsigned u64s_reserved;			/* u64s reserved for entries added just before write */
	bool noflush:1; /* write has already been kicked off, and was noflush */
	bool must_flush:1; /* something wants a flush */
	bool separate_flush:1;			/* issue the flush as its own operation, not with the write */
	bool need_flush_to_write_buffer:1;
	/* Write lifecycle, in order: started -> allocated -> done: */
	bool write_started:1;
	bool write_allocated:1;
	bool write_done:1;
	u8 idx;					/* index of this buf in journal->buf[] */
};
46 | |
/*
 * Something that makes a journal entry dirty - i.e. a btree node that has to be
 * flushed:
 */

enum journal_pin_type {
	JOURNAL_PIN_btree,	/* pin held by a dirty btree node */
	JOURNAL_PIN_key_cache,	/* pin held by a dirty key cache entry */
	JOURNAL_PIN_other,	/* any other pin */
	JOURNAL_PIN_NR,		/* number of pin types; sizes journal_entry_pin_list.list[] */
};
58 | |
/*
 * Per journal entry: the set of pins keeping it dirty, plus a reference
 * count; when count hits zero the entry is no longer needed (see the FIFO
 * comment on struct journal below).
 */
struct journal_entry_pin_list {
	struct list_head list[JOURNAL_PIN_NR];	/* unflushed pins, bucketed by pin type */
	struct list_head flushed;		/* pins that have been flushed (confirm in reclaim code) */
	atomic_t count;				/* refcount on this journal entry */
	struct bch_devs_list devs;		/* devices this entry was written to */
};
65 | |
struct journal;
struct journal_entry_pin;
/*
 * Callback that flushes whatever is keeping a journal entry pinned;
 * the u64 argument is a sequence number (presumably the seq to flush up to —
 * confirm against callers). Returns 0 or an error.
 */
typedef int (*journal_pin_flush_fn)(struct journal *j,
				struct journal_entry_pin *, u64);
70 | |
/*
 * A single pin on a journal entry: embedded in the object (e.g. btree node)
 * that must be flushed before the entry can be reclaimed.
 */
struct journal_entry_pin {
	struct list_head list;		/* entry on journal_entry_pin_list.list[]/flushed */
	journal_pin_flush_fn flush;	/* how to flush the pinning object */
	u64 seq;			/* seq of the journal entry this pins */
};
76 | |
/*
 * A caller's reservation of space in an open journal entry.
 */
struct journal_res {
	bool ref;	/* whether we hold a reference (cleared when the res is put) */
	u8 idx;		/* which journal->buf[] the reservation is against */
	u16 u64s;	/* size of the reservation, in u64s */
	u32 offset;	/* offset within the entry, in u64s */
	u64 seq;	/* seq of the journal entry reserved in */
};
84 | |
/*
 * The journal's fastpath reservation state, packed into a single u64 so it
 * can be updated with one atomic operation. Three aliased views: atomic
 * counter, raw value, and the decoded bitfields (which sum to exactly 64
 * bits: 20 + 2 + 2 + 4*10).
 */
union journal_res_state {
	struct {
		atomic64_t counter;
	};

	struct {
		u64 v;
	};

	struct {
		u64 cur_entry_offset:20,	/* in u64s; or a sentinel value, see below */
			idx:2,			/* index of currently open journal_buf */
			unwritten_idx:2,	/* oldest journal_buf not yet written */
			/* One outstanding-reservation count per journal_buf: */
			buf0_count:10,
			buf1_count:10,
			buf2_count:10,
			buf3_count:10;
	};
};
104 | |
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */

/*
 * We stash some journal state as sentinel values in cur_entry_offset:
 * note - cur_entry_offset is in units of u64s
 *
 * OFFSET_MAX is the maximum value of the 20-bit cur_entry_offset bitfield;
 * the top two values are reserved as the CLOSED and ERROR sentinels.
 */
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)

#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
117 | |
/*
 * One measure of available journal space; the journal tracks one of these
 * per enum journal_space_from accounting method.
 */
struct journal_space {
	/* Units of 512 bytes sectors: */
	unsigned next_entry; /* How big the next journal entry can be */
	unsigned total;
};
123 | |
/*
 * Different ways of accounting free journal space — from most conservative
 * (only discarded buckets) to least (total space); indexes journal->space[].
 */
enum journal_space_from {
	journal_space_discarded,
	journal_space_clean_ondisk,
	journal_space_clean,
	journal_space_total,
	journal_space_nr,	/* number of accounting methods; sizes journal->space[] */
};
131 | |
/* Bit numbers for journal->flags (used with the atomic bitops, presumably): */
enum journal_flags {
	JOURNAL_REPLAY_DONE,
	JOURNAL_STARTED,
	JOURNAL_MAY_SKIP_FLUSH,
	JOURNAL_NEED_FLUSH_WRITE,
	JOURNAL_SPACE_LOW,
};
139 | |
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
	x(ok) \
	x(retry) \
	x(blocked) \
	x(max_in_flight) \
	x(journal_full) \
	x(journal_pin_full) \
	x(journal_stuck) \
	x(insufficient_devices)

/* Expands the x-macro list above into JOURNAL_ERR_ok, JOURNAL_ERR_retry, ... */
enum journal_errors {
#define x(n) JOURNAL_ERR_##n,
	JOURNAL_ERRORS()
#undef x
};
156 | |
typedef DARRAY(u64) darray_u64;	/* dynamic array of u64s; see early_journal_entries */
158 | |
/*
 * Wrapper around a struct bio for journal I/O, recording which device and
 * which journal buffer the I/O belongs to.
 */
struct journal_bio {
	struct bch_dev *ca;	/* device this bio targets */
	unsigned buf_idx;	/* index into journal->buf[] */

	struct bio bio;		/* kept last (bio may have inline data following it — confirm) */
};
165 | |
/* Embedded in struct bch_fs */
struct journal {
	/* Fastpath stuff up front: */
	struct {

		union journal_res_state reservations;
		enum bch_watermark watermark;

	} __aligned(SMP_CACHE_BYTES);

	unsigned long flags;	/* bitmask of enum journal_flags */

	/* Max size of current journal entry */
	unsigned cur_entry_u64s;
	unsigned cur_entry_sectors;

	/* Reserved space in journal entry to be used just prior to write */
	unsigned entry_u64s_reserved;


	/*
	 * If we currently can't open a journal reservation, why not: one of
	 * the JOURNAL_ERR_* codes (JOURNAL_ERR_ok when reservations can
	 * proceed):
	 */
	enum journal_errors cur_entry_error;

	unsigned buf_size_want;	/* desired size for journal_buf->data allocations */
	/*
	 * We may queue up some things to be journalled (log messages) before
	 * the journal has actually started - stash them here:
	 */
	darray_u64 early_journal_entries;

	/*
	 * Protects journal_buf->data, when accessing without a journal
	 * reservation: for synchronization between the btree write buffer code
	 * and the journal write path:
	 */
	struct mutex buf_lock;
	/*
	 * JOURNAL_BUF_NR journal entries -- one is currently open for new
	 * reservations, the others are possibly being written out.
	 */
	struct journal_buf buf[JOURNAL_BUF_NR];

	spinlock_t lock;

	/* if nonzero, we may not open a new journal entry: */
	unsigned blocked;

	/* Used when waiting because the journal was full */
	wait_queue_head_t wait;
	struct closure_waitlist async_wait;

	struct delayed_work write_work;	/* delayed journal write, see journal_buf->expires */
	struct workqueue_struct *wq;

	/* Sequence number of most recent journal entry (last entry in @pin) */
	atomic64_t seq;

	/* seq, last_seq from the most recent journal entry successfully written */
	u64 seq_ondisk;
	u64 flushed_seq_ondisk;
	u64 last_seq_ondisk;
	u64 err_seq;		/* NOTE(review): presumably seq at which a write error occurred — confirm */
	u64 last_empty_seq;

	/*
	 * FIFO of journal entries whose btree updates have not yet been
	 * written out.
	 *
	 * Each entry is a reference count. The position in the FIFO is the
	 * entry's sequence number relative to @seq.
	 *
	 * The journal entry itself holds a reference count, put when the
	 * journal entry is written out. Each btree node modified by the journal
	 * entry also holds a reference count, put when the btree node is
	 * written.
	 *
	 * When a reference count reaches zero, the journal entry is no longer
	 * needed. When all journal entries in the oldest journal bucket are no
	 * longer needed, the bucket can be discarded and reused.
	 */
	struct {
		u64 front, back, size, mask;
		struct journal_entry_pin_list *data;
	} pin;

	/* Free-space estimates, one per accounting method: */
	struct journal_space space[journal_space_nr];

	/* Range of journal seqs currently being replayed: */
	u64 replay_journal_seq;
	u64 replay_journal_seq_end;

	struct write_point wp;
	spinlock_t err_lock;

	struct mutex reclaim_lock;
	/*
	 * Used for waiting until journal reclaim has freed up space in the
	 * journal:
	 */
	wait_queue_head_t reclaim_wait;
	struct task_struct *reclaim_thread;
	bool reclaim_kicked;
	unsigned long next_reclaim;
	/* Reclaim statistics: */
	u64 nr_direct_reclaim;
	u64 nr_background_reclaim;

	/* Pin-flushing state: */
	unsigned long last_flushed;
	struct journal_entry_pin *flush_in_progress;
	bool flush_in_progress_dropped;
	wait_queue_head_t pin_flush_wait;

	/* protects advancing ja->discard_idx: */
	struct mutex discard_lock;
	bool can_discard;

	unsigned long last_flush_write;

	u64 write_start_time;

	/* Write statistics: */
	u64 nr_flush_writes;
	u64 nr_noflush_writes;
	u64 entry_bytes_written;

	/* Time statistics (see bch2_time_stats): */
	struct bch2_time_stats *flush_write_time;
	struct bch2_time_stats *noflush_write_time;
	struct bch2_time_stats *flush_seq_time;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map res_map;	/* lockdep tracking for journal reservations */
#endif
} __aligned(SMP_CACHE_BYTES);
299 | |
/*
 * Embedded in struct bch_dev. First three fields refer to the array of journal
 * buckets, in bch_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	u64 *bucket_seq;

	unsigned sectors_free;	/* sectors remaining in the current bucket */

	/*
	 * Bucket indices advance in this order (all mod @nr):
	 * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
	 */
	unsigned discard_idx; /* Next bucket to discard */
	unsigned dirty_idx_ondisk;
	unsigned dirty_idx;
	unsigned cur_idx; /* Journal bucket we're currently writing to */
	unsigned nr;	/* number of journal buckets on this device */

	u64 *buckets;	/* bucket numbers, indexed by the *_idx fields above */

	/* Bio for journal reads/writes to this device */
	struct journal_bio *bio[JOURNAL_BUF_NR];

	/* for bch_journal_read_device */
	struct closure read;
};
330 | |
/*
 * journal_entry_res - reserve space in every journal entry:
 */
struct journal_entry_res {
	unsigned u64s;	/* amount reserved, in u64s */
};
337 | |
338 | #endif /* _BCACHEFS_JOURNAL_TYPES_H */ |
339 | |