1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | /* |
4 | * fs/ext4/fast_commit.c |
5 | * |
6 | * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> |
7 | * |
8 | * Ext4 fast commits routines. |
9 | */ |
10 | #include "ext4.h" |
11 | #include "ext4_jbd2.h" |
12 | #include "ext4_extents.h" |
13 | #include "mballoc.h" |
14 | |
15 | /* |
16 | * Ext4 Fast Commits |
17 | * ----------------- |
18 | * |
19 | * Ext4 fast commits implement fine grained journalling for Ext4. |
20 | * |
21 | * Fast commits are organized as a log of tag-length-value (TLV) structs. (See |
22 | * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by |
23 | * TLV during the recovery phase. For the scenarios for which we currently |
24 | * don't have replay code, fast commit falls back to full commits. |
25 | * Fast commits record delta in one of the following three categories. |
26 | * |
27 | * (A) Directory entry updates: |
28 | * |
29 | * - EXT4_FC_TAG_UNLINK - records directory entry unlink |
30 | * - EXT4_FC_TAG_LINK - records directory entry link |
31 | * - EXT4_FC_TAG_CREAT - records inode and directory entry creation |
32 | * |
33 | * (B) File specific data range updates: |
34 | * |
35 | * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode |
36 | * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode |
37 | * |
38 | * (C) Inode metadata (mtime / ctime etc): |
39 | * |
40 | * - EXT4_FC_TAG_INODE - record the inode that should be replayed |
41 | * during recovery. Note that iblocks field is |
42 | * not replayed and instead derived during |
43 | * replay. |
44 | * Commit Operation |
45 | * ---------------- |
46 | * With fast commits, we maintain all the directory entry operations in the |
47 | * order in which they are issued in an in-memory queue. This queue is flushed |
48 | * to disk during the commit operation. We also maintain a list of inodes |
49 | * that need to be committed during a fast commit in another in memory queue of |
50 | * inodes. During the commit operation, we commit in the following order: |
51 | * |
52 | * [1] Lock inodes for any further data updates by setting COMMITTING state |
53 | * [2] Submit data buffers of all the inodes |
54 | * [3] Wait for [2] to complete |
55 | * [4] Commit all the directory entry updates in the fast commit space |
56 | * [5] Commit all the changed inode structures |
57 | * [6] Write tail tag (this tag ensures the atomicity, please read the following |
58 | * section for more details). |
59 | * [7] Wait for [4], [5] and [6] to complete. |
60 | * |
61 | * All the inode updates must call ext4_fc_start_update() before starting an |
62 | * update. If such an ongoing update is present, fast commit waits for it to |
63 | * complete. The completion of such an update is marked by |
64 | * ext4_fc_stop_update(). |
65 | * |
66 | * Fast Commit Ineligibility |
67 | * ------------------------- |
68 | * |
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
73 | * |
74 | * Atomicity of commits |
75 | * -------------------- |
76 | * In order to guarantee atomicity during the commit operation, fast commit |
77 | * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail |
78 | * tag contains CRC of the contents and TID of the transaction after which |
79 | * this fast commit should be applied. Recovery code replays fast commit |
80 | * logs only if there's at least 1 valid tail present. For every fast commit |
81 | * operation, there is 1 tail. This means, we may end up with multiple tails |
82 | * in the fast commit space. Here's an example: |
83 | * |
84 | * - Create a new file A and remove existing file B |
85 | * - fsync() |
86 | * - Append contents to file A |
87 | * - Truncate file A |
88 | * - fsync() |
89 | * |
90 | * The fast commit space at the end of above operations would look like this: |
91 | * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] |
92 | * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| |
93 | * |
94 | * Replay code should thus check for all the valid tails in the FC area. |
95 | * |
96 | * Fast Commit Replay Idempotence |
97 | * ------------------------------ |
98 | * |
 * Fast commit tags are idempotent in nature provided the recovery code follows
100 | * certain rules. The guiding principle that the commit path follows while |
101 | * committing is that it stores the result of a particular operation instead of |
102 | * storing the procedure. |
103 | * |
104 | * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' |
105 | * was associated with inode 10. During fast commit, instead of storing this |
106 | * operation as a procedure "rename a to b", we store the resulting file system |
107 | * state as a "series" of outcomes: |
108 | * |
109 | * - Link dirent b to inode 10 |
110 | * - Unlink dirent a |
111 | * - Inode <10> with valid refcount |
112 | * |
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
115 | * |
116 | * Let's take an example of a procedure that is not idempotent and see how fast |
117 | * commits make it idempotent. Consider following sequence of operations: |
118 | * |
119 | * rm A; mv B A; read A |
120 | * (x) (y) (z) |
121 | * |
122 | * (x), (y) and (z) are the points at which we can crash. If we store this |
123 | * sequence of operations as is then the replay is not idempotent. Let's say |
124 | * while in replay, we crash at (z). During the second replay, file A (which was |
125 | * actually created as a result of "mv B A" operation) would get deleted. Thus, |
126 | * file named A would be absent when we try to read A. So, this sequence of |
127 | * operations is not idempotent. However, as mentioned above, instead of storing |
128 | * the procedure fast commits store the outcome of each procedure. Thus the fast |
129 | * commit log for above procedure would be as follows: |
130 | * |
131 | * (Let's assume dirent A was linked to inode 10 and dirent B was linked to |
132 | * inode 11 before the replay) |
133 | * |
134 | * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] |
135 | * (w) (x) (y) (z) |
136 | * |
137 | * If we crash at (z), we will have file A linked to inode 11. During the second |
138 | * replay, we will remove file A (inode 11). But we will create it back and make |
139 | * it point to inode 11. We won't find B, so we'll just skip that step. At this |
140 | * point, the refcount for inode 11 is not reliable, but that gets fixed by the |
141 | * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled |
142 | * similarly. Thus, by converting a non-idempotent procedure into a series of |
143 | * idempotent outcomes, fast commits ensured idempotence during the replay. |
144 | * |
145 | * TODOs |
146 | * ----- |
147 | * |
148 | * 0) Fast commit replay path hardening: Fast commit replay code should use |
149 | * journal handles to make sure all the updates it does during the replay |
150 | * path are atomic. With that if we crash during fast commit replay, after |
151 | * trying to do recovery again, we will find a file system where fast commit |
152 | * area is invalid (because new full commit would be found). In order to deal |
153 | * with that, fast commit replay code should ensure that the "FC_REPLAY" |
154 | * superblock state is persisted before starting the replay, so that after |
155 | * the crash, fast commit recovery code can look at that flag and perform |
156 | * fast commit recovery even if that area is invalidated by later full |
157 | * commits. |
158 | * |
159 | * 1) Fast commit's commit path locks the entire file system during fast |
160 | * commit. This has significant performance penalty. Instead of that, we |
161 | * should use ext4_fc_start/stop_update functions to start inode level |
162 | * updates from ext4_journal_start/stop. Once we do that we can drop file |
163 | * system locking during commit path. |
164 | * |
165 | * 2) Handle more ineligible cases. |
166 | */ |
167 | |
168 | #include <trace/events/ext4.h> |
169 | static struct kmem_cache *ext4_fc_dentry_cachep; |
170 | |
171 | static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) |
172 | { |
173 | BUFFER_TRACE(bh, "" ); |
174 | if (uptodate) { |
175 | ext4_debug("%s: Block %lld up-to-date" , |
176 | __func__, bh->b_blocknr); |
177 | set_buffer_uptodate(bh); |
178 | } else { |
179 | ext4_debug("%s: Block %lld not up-to-date" , |
180 | __func__, bh->b_blocknr); |
181 | clear_buffer_uptodate(bh); |
182 | } |
183 | |
184 | unlock_buffer(bh); |
185 | } |
186 | |
187 | static inline void ext4_fc_reset_inode(struct inode *inode) |
188 | { |
189 | struct ext4_inode_info *ei = EXT4_I(inode); |
190 | |
191 | ei->i_fc_lblk_start = 0; |
192 | ei->i_fc_lblk_len = 0; |
193 | } |
194 | |
195 | void ext4_fc_init_inode(struct inode *inode) |
196 | { |
197 | struct ext4_inode_info *ei = EXT4_I(inode); |
198 | |
199 | ext4_fc_reset_inode(inode); |
200 | ext4_clear_inode_state(inode, bit: EXT4_STATE_FC_COMMITTING); |
201 | INIT_LIST_HEAD(list: &ei->i_fc_list); |
202 | INIT_LIST_HEAD(list: &ei->i_fc_dilist); |
203 | init_waitqueue_head(&ei->i_fc_wait); |
204 | atomic_set(v: &ei->i_fc_updates, i: 0); |
205 | } |
206 | |
207 | /* This function must be called with sbi->s_fc_lock held. */ |
208 | static void ext4_fc_wait_committing_inode(struct inode *inode) |
209 | __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) |
210 | { |
211 | wait_queue_head_t *wq; |
212 | struct ext4_inode_info *ei = EXT4_I(inode); |
213 | |
214 | #if (BITS_PER_LONG < 64) |
215 | DEFINE_WAIT_BIT(wait, &ei->i_state_flags, |
216 | EXT4_STATE_FC_COMMITTING); |
217 | wq = bit_waitqueue(&ei->i_state_flags, |
218 | EXT4_STATE_FC_COMMITTING); |
219 | #else |
220 | DEFINE_WAIT_BIT(wait, &ei->i_flags, |
221 | EXT4_STATE_FC_COMMITTING); |
222 | wq = bit_waitqueue(word: &ei->i_flags, |
223 | bit: EXT4_STATE_FC_COMMITTING); |
224 | #endif |
225 | lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); |
226 | prepare_to_wait(wq_head: wq, wq_entry: &wait.wq_entry, TASK_UNINTERRUPTIBLE); |
227 | spin_unlock(lock: &EXT4_SB(sb: inode->i_sb)->s_fc_lock); |
228 | schedule(); |
229 | finish_wait(wq_head: wq, wq_entry: &wait.wq_entry); |
230 | } |
231 | |
232 | static bool ext4_fc_disabled(struct super_block *sb) |
233 | { |
234 | return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || |
235 | (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); |
236 | } |
237 | |
238 | /* |
239 | * Inform Ext4's fast about start of an inode update |
240 | * |
241 | * This function is called by the high level call VFS callbacks before |
242 | * performing any inode update. This function blocks if there's an ongoing |
243 | * fast commit on the inode in question. |
244 | */ |
245 | void ext4_fc_start_update(struct inode *inode) |
246 | { |
247 | struct ext4_inode_info *ei = EXT4_I(inode); |
248 | |
249 | if (ext4_fc_disabled(sb: inode->i_sb)) |
250 | return; |
251 | |
252 | restart: |
253 | spin_lock(lock: &EXT4_SB(sb: inode->i_sb)->s_fc_lock); |
254 | if (list_empty(head: &ei->i_fc_list)) |
255 | goto out; |
256 | |
257 | if (ext4_test_inode_state(inode, bit: EXT4_STATE_FC_COMMITTING)) { |
258 | ext4_fc_wait_committing_inode(inode); |
259 | goto restart; |
260 | } |
261 | out: |
262 | atomic_inc(v: &ei->i_fc_updates); |
263 | spin_unlock(lock: &EXT4_SB(sb: inode->i_sb)->s_fc_lock); |
264 | } |
265 | |
266 | /* |
267 | * Stop inode update and wake up waiting fast commits if any. |
268 | */ |
269 | void ext4_fc_stop_update(struct inode *inode) |
270 | { |
271 | struct ext4_inode_info *ei = EXT4_I(inode); |
272 | |
273 | if (ext4_fc_disabled(sb: inode->i_sb)) |
274 | return; |
275 | |
276 | if (atomic_dec_and_test(v: &ei->i_fc_updates)) |
277 | wake_up_all(&ei->i_fc_wait); |
278 | } |
279 | |
280 | /* |
281 | * Remove inode from fast commit list. If the inode is being committed |
282 | * we wait until inode commit is done. |
283 | */ |
284 | void ext4_fc_del(struct inode *inode) |
285 | { |
286 | struct ext4_inode_info *ei = EXT4_I(inode); |
287 | struct ext4_sb_info *sbi = EXT4_SB(sb: inode->i_sb); |
288 | struct ext4_fc_dentry_update *fc_dentry; |
289 | |
290 | if (ext4_fc_disabled(sb: inode->i_sb)) |
291 | return; |
292 | |
293 | restart: |
294 | spin_lock(lock: &EXT4_SB(sb: inode->i_sb)->s_fc_lock); |
295 | if (list_empty(head: &ei->i_fc_list) && list_empty(head: &ei->i_fc_dilist)) { |
296 | spin_unlock(lock: &EXT4_SB(sb: inode->i_sb)->s_fc_lock); |
297 | return; |
298 | } |
299 | |
300 | if (ext4_test_inode_state(inode, bit: EXT4_STATE_FC_COMMITTING)) { |
301 | ext4_fc_wait_committing_inode(inode); |
302 | goto restart; |
303 | } |
304 | |
305 | if (!list_empty(head: &ei->i_fc_list)) |
306 | list_del_init(entry: &ei->i_fc_list); |
307 | |
308 | /* |
309 | * Since this inode is getting removed, let's also remove all FC |
310 | * dentry create references, since it is not needed to log it anyways. |
311 | */ |
312 | if (list_empty(head: &ei->i_fc_dilist)) { |
313 | spin_unlock(lock: &sbi->s_fc_lock); |
314 | return; |
315 | } |
316 | |
317 | fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); |
318 | WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); |
319 | list_del_init(entry: &fc_dentry->fcd_list); |
320 | list_del_init(entry: &fc_dentry->fcd_dilist); |
321 | |
322 | WARN_ON(!list_empty(&ei->i_fc_dilist)); |
323 | spin_unlock(lock: &sbi->s_fc_lock); |
324 | |
325 | if (fc_dentry->fcd_name.name && |
326 | fc_dentry->fcd_name.len > DNAME_INLINE_LEN) |
327 | kfree(objp: fc_dentry->fcd_name.name); |
328 | kmem_cache_free(s: ext4_fc_dentry_cachep, objp: fc_dentry); |
329 | |
330 | return; |
331 | } |
332 | |
333 | /* |
334 | * Mark file system as fast commit ineligible, and record latest |
335 | * ineligible transaction tid. This means until the recorded |
336 | * transaction, commit operation would result in a full jbd2 commit. |
337 | */ |
338 | void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) |
339 | { |
340 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
341 | tid_t tid; |
342 | |
343 | if (ext4_fc_disabled(sb)) |
344 | return; |
345 | |
346 | ext4_set_mount_flag(sb, bit: EXT4_MF_FC_INELIGIBLE); |
347 | if (handle && !IS_ERR(ptr: handle)) |
348 | tid = handle->h_transaction->t_tid; |
349 | else { |
350 | read_lock(&sbi->s_journal->j_state_lock); |
351 | tid = sbi->s_journal->j_running_transaction ? |
352 | sbi->s_journal->j_running_transaction->t_tid : 0; |
353 | read_unlock(&sbi->s_journal->j_state_lock); |
354 | } |
355 | spin_lock(lock: &sbi->s_fc_lock); |
356 | if (sbi->s_fc_ineligible_tid < tid) |
357 | sbi->s_fc_ineligible_tid = tid; |
358 | spin_unlock(lock: &sbi->s_fc_lock); |
359 | WARN_ON(reason >= EXT4_FC_REASON_MAX); |
360 | sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; |
361 | } |
362 | |
363 | /* |
364 | * Generic fast commit tracking function. If this is the first time this we are |
365 | * called after a full commit, we initialize fast commit fields and then call |
366 | * __fc_track_fn() with update = 0. If we have already been called after a full |
367 | * commit, we pass update = 1. Based on that, the track function can determine |
368 | * if it needs to track a field for the first time or if it needs to just |
369 | * update the previously tracked value. |
370 | * |
371 | * If enqueue is set, this function enqueues the inode in fast commit list. |
372 | */ |
373 | static int ext4_fc_track_template( |
374 | handle_t *handle, struct inode *inode, |
375 | int (*__fc_track_fn)(struct inode *, void *, bool), |
376 | void *args, int enqueue) |
377 | { |
378 | bool update = false; |
379 | struct ext4_inode_info *ei = EXT4_I(inode); |
380 | struct ext4_sb_info *sbi = EXT4_SB(sb: inode->i_sb); |
381 | tid_t tid = 0; |
382 | int ret; |
383 | |
384 | tid = handle->h_transaction->t_tid; |
385 | mutex_lock(&ei->i_fc_lock); |
386 | if (tid == ei->i_sync_tid) { |
387 | update = true; |
388 | } else { |
389 | ext4_fc_reset_inode(inode); |
390 | ei->i_sync_tid = tid; |
391 | } |
392 | ret = __fc_track_fn(inode, args, update); |
393 | mutex_unlock(lock: &ei->i_fc_lock); |
394 | |
395 | if (!enqueue) |
396 | return ret; |
397 | |
398 | spin_lock(lock: &sbi->s_fc_lock); |
399 | if (list_empty(head: &EXT4_I(inode)->i_fc_list)) |
400 | list_add_tail(new: &EXT4_I(inode)->i_fc_list, |
401 | head: (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || |
402 | sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? |
403 | &sbi->s_fc_q[FC_Q_STAGING] : |
404 | &sbi->s_fc_q[FC_Q_MAIN]); |
405 | spin_unlock(lock: &sbi->s_fc_lock); |
406 | |
407 | return ret; |
408 | } |
409 | |
/* Argument bundle handed to __track_dentry_update() via the void *arg. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being changed */
	int op;			/* EXT4_FC_TAG_{UNLINK,LINK,CREAT} */
};
414 | |
415 | /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ |
416 | static int __track_dentry_update(struct inode *inode, void *arg, bool update) |
417 | { |
418 | struct ext4_fc_dentry_update *node; |
419 | struct ext4_inode_info *ei = EXT4_I(inode); |
420 | struct __track_dentry_update_args *dentry_update = |
421 | (struct __track_dentry_update_args *)arg; |
422 | struct dentry *dentry = dentry_update->dentry; |
423 | struct inode *dir = dentry->d_parent->d_inode; |
424 | struct super_block *sb = inode->i_sb; |
425 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
426 | |
427 | mutex_unlock(lock: &ei->i_fc_lock); |
428 | |
429 | if (IS_ENCRYPTED(dir)) { |
430 | ext4_fc_mark_ineligible(sb, reason: EXT4_FC_REASON_ENCRYPTED_FILENAME, |
431 | NULL); |
432 | mutex_lock(&ei->i_fc_lock); |
433 | return -EOPNOTSUPP; |
434 | } |
435 | |
436 | node = kmem_cache_alloc(cachep: ext4_fc_dentry_cachep, GFP_NOFS); |
437 | if (!node) { |
438 | ext4_fc_mark_ineligible(sb, reason: EXT4_FC_REASON_NOMEM, NULL); |
439 | mutex_lock(&ei->i_fc_lock); |
440 | return -ENOMEM; |
441 | } |
442 | |
443 | node->fcd_op = dentry_update->op; |
444 | node->fcd_parent = dir->i_ino; |
445 | node->fcd_ino = inode->i_ino; |
446 | if (dentry->d_name.len > DNAME_INLINE_LEN) { |
447 | node->fcd_name.name = kmalloc(size: dentry->d_name.len, GFP_NOFS); |
448 | if (!node->fcd_name.name) { |
449 | kmem_cache_free(s: ext4_fc_dentry_cachep, objp: node); |
450 | ext4_fc_mark_ineligible(sb, reason: EXT4_FC_REASON_NOMEM, NULL); |
451 | mutex_lock(&ei->i_fc_lock); |
452 | return -ENOMEM; |
453 | } |
454 | memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, |
455 | dentry->d_name.len); |
456 | } else { |
457 | memcpy(node->fcd_iname, dentry->d_name.name, |
458 | dentry->d_name.len); |
459 | node->fcd_name.name = node->fcd_iname; |
460 | } |
461 | node->fcd_name.len = dentry->d_name.len; |
462 | INIT_LIST_HEAD(list: &node->fcd_dilist); |
463 | spin_lock(lock: &sbi->s_fc_lock); |
464 | if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || |
465 | sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) |
466 | list_add_tail(new: &node->fcd_list, |
467 | head: &sbi->s_fc_dentry_q[FC_Q_STAGING]); |
468 | else |
469 | list_add_tail(new: &node->fcd_list, head: &sbi->s_fc_dentry_q[FC_Q_MAIN]); |
470 | |
471 | /* |
472 | * This helps us keep a track of all fc_dentry updates which is part of |
473 | * this ext4 inode. So in case the inode is getting unlinked, before |
474 | * even we get a chance to fsync, we could remove all fc_dentry |
475 | * references while evicting the inode in ext4_fc_del(). |
476 | * Also with this, we don't need to loop over all the inodes in |
477 | * sbi->s_fc_q to get the corresponding inode in |
478 | * ext4_fc_commit_dentry_updates(). |
479 | */ |
480 | if (dentry_update->op == EXT4_FC_TAG_CREAT) { |
481 | WARN_ON(!list_empty(&ei->i_fc_dilist)); |
482 | list_add_tail(new: &node->fcd_dilist, head: &ei->i_fc_dilist); |
483 | } |
484 | spin_unlock(lock: &sbi->s_fc_lock); |
485 | mutex_lock(&ei->i_fc_lock); |
486 | |
487 | return 0; |
488 | } |
489 | |
490 | void __ext4_fc_track_unlink(handle_t *handle, |
491 | struct inode *inode, struct dentry *dentry) |
492 | { |
493 | struct __track_dentry_update_args args; |
494 | int ret; |
495 | |
496 | args.dentry = dentry; |
497 | args.op = EXT4_FC_TAG_UNLINK; |
498 | |
499 | ret = ext4_fc_track_template(handle, inode, fc_track_fn: __track_dentry_update, |
500 | args: (void *)&args, enqueue: 0); |
501 | trace_ext4_fc_track_unlink(handle, inode, dentry, ret); |
502 | } |
503 | |
504 | void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) |
505 | { |
506 | struct inode *inode = d_inode(dentry); |
507 | |
508 | if (ext4_fc_disabled(sb: inode->i_sb)) |
509 | return; |
510 | |
511 | if (ext4_test_mount_flag(sb: inode->i_sb, bit: EXT4_MF_FC_INELIGIBLE)) |
512 | return; |
513 | |
514 | __ext4_fc_track_unlink(handle, inode, dentry); |
515 | } |
516 | |
517 | void __ext4_fc_track_link(handle_t *handle, |
518 | struct inode *inode, struct dentry *dentry) |
519 | { |
520 | struct __track_dentry_update_args args; |
521 | int ret; |
522 | |
523 | args.dentry = dentry; |
524 | args.op = EXT4_FC_TAG_LINK; |
525 | |
526 | ret = ext4_fc_track_template(handle, inode, fc_track_fn: __track_dentry_update, |
527 | args: (void *)&args, enqueue: 0); |
528 | trace_ext4_fc_track_link(handle, inode, dentry, ret); |
529 | } |
530 | |
531 | void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) |
532 | { |
533 | struct inode *inode = d_inode(dentry); |
534 | |
535 | if (ext4_fc_disabled(sb: inode->i_sb)) |
536 | return; |
537 | |
538 | if (ext4_test_mount_flag(sb: inode->i_sb, bit: EXT4_MF_FC_INELIGIBLE)) |
539 | return; |
540 | |
541 | __ext4_fc_track_link(handle, inode, dentry); |
542 | } |
543 | |
544 | void __ext4_fc_track_create(handle_t *handle, struct inode *inode, |
545 | struct dentry *dentry) |
546 | { |
547 | struct __track_dentry_update_args args; |
548 | int ret; |
549 | |
550 | args.dentry = dentry; |
551 | args.op = EXT4_FC_TAG_CREAT; |
552 | |
553 | ret = ext4_fc_track_template(handle, inode, fc_track_fn: __track_dentry_update, |
554 | args: (void *)&args, enqueue: 0); |
555 | trace_ext4_fc_track_create(handle, inode, dentry, ret); |
556 | } |
557 | |
558 | void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) |
559 | { |
560 | struct inode *inode = d_inode(dentry); |
561 | |
562 | if (ext4_fc_disabled(sb: inode->i_sb)) |
563 | return; |
564 | |
565 | if (ext4_test_mount_flag(sb: inode->i_sb, bit: EXT4_MF_FC_INELIGIBLE)) |
566 | return; |
567 | |
568 | __ext4_fc_track_create(handle, inode, dentry); |
569 | } |
570 | |
571 | /* __track_fn for inode tracking */ |
572 | static int __track_inode(struct inode *inode, void *arg, bool update) |
573 | { |
574 | if (update) |
575 | return -EEXIST; |
576 | |
577 | EXT4_I(inode)->i_fc_lblk_len = 0; |
578 | |
579 | return 0; |
580 | } |
581 | |
582 | void ext4_fc_track_inode(handle_t *handle, struct inode *inode) |
583 | { |
584 | int ret; |
585 | |
586 | if (S_ISDIR(inode->i_mode)) |
587 | return; |
588 | |
589 | if (ext4_fc_disabled(sb: inode->i_sb)) |
590 | return; |
591 | |
592 | if (ext4_should_journal_data(inode)) { |
593 | ext4_fc_mark_ineligible(sb: inode->i_sb, |
594 | reason: EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); |
595 | return; |
596 | } |
597 | |
598 | if (ext4_test_mount_flag(sb: inode->i_sb, bit: EXT4_MF_FC_INELIGIBLE)) |
599 | return; |
600 | |
601 | ret = ext4_fc_track_template(handle, inode, fc_track_fn: __track_inode, NULL, enqueue: 1); |
602 | trace_ext4_fc_track_inode(handle, inode, ret); |
603 | } |
604 | |
605 | struct __track_range_args { |
606 | ext4_lblk_t start, end; |
607 | }; |
608 | |
609 | /* __track_fn for tracking data updates */ |
610 | static int __track_range(struct inode *inode, void *arg, bool update) |
611 | { |
612 | struct ext4_inode_info *ei = EXT4_I(inode); |
613 | ext4_lblk_t oldstart; |
614 | struct __track_range_args *__arg = |
615 | (struct __track_range_args *)arg; |
616 | |
617 | if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { |
618 | ext4_debug("Special inode %ld being modified\n" , inode->i_ino); |
619 | return -ECANCELED; |
620 | } |
621 | |
622 | oldstart = ei->i_fc_lblk_start; |
623 | |
624 | if (update && ei->i_fc_lblk_len > 0) { |
625 | ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); |
626 | ei->i_fc_lblk_len = |
627 | max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - |
628 | ei->i_fc_lblk_start + 1; |
629 | } else { |
630 | ei->i_fc_lblk_start = __arg->start; |
631 | ei->i_fc_lblk_len = __arg->end - __arg->start + 1; |
632 | } |
633 | |
634 | return 0; |
635 | } |
636 | |
637 | void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, |
638 | ext4_lblk_t end) |
639 | { |
640 | struct __track_range_args args; |
641 | int ret; |
642 | |
643 | if (S_ISDIR(inode->i_mode)) |
644 | return; |
645 | |
646 | if (ext4_fc_disabled(sb: inode->i_sb)) |
647 | return; |
648 | |
649 | if (ext4_test_mount_flag(sb: inode->i_sb, bit: EXT4_MF_FC_INELIGIBLE)) |
650 | return; |
651 | |
652 | args.start = start; |
653 | args.end = end; |
654 | |
655 | ret = ext4_fc_track_template(handle, inode, fc_track_fn: __track_range, args: &args, enqueue: 1); |
656 | |
657 | trace_ext4_fc_track_range(handle, inode, start, end, ret); |
658 | } |
659 | |
660 | static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) |
661 | { |
662 | blk_opf_t write_flags = REQ_SYNC; |
663 | struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; |
664 | |
665 | /* Add REQ_FUA | REQ_PREFLUSH only its tail */ |
666 | if (test_opt(sb, BARRIER) && is_tail) |
667 | write_flags |= REQ_FUA | REQ_PREFLUSH; |
668 | lock_buffer(bh); |
669 | set_buffer_dirty(bh); |
670 | set_buffer_uptodate(bh); |
671 | bh->b_end_io = ext4_end_buffer_io_sync; |
672 | submit_bh(REQ_OP_WRITE | write_flags, bh); |
673 | EXT4_SB(sb)->s_fc_bh = NULL; |
674 | } |
675 | |
676 | /* Ext4 commit path routines */ |
677 | |
678 | /* |
679 | * Allocate len bytes on a fast commit buffer. |
680 | * |
681 | * During the commit time this function is used to manage fast commit |
682 | * block space. We don't split a fast commit log onto different |
683 | * blocks. So this function makes sure that if there's not enough space |
684 | * on the current block, the remaining space in the current block is |
685 | * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, |
686 | * new block is from jbd2 and CRC is updated to reflect the padding |
687 | * we added. |
688 | */ |
689 | static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) |
690 | { |
691 | struct ext4_fc_tl tl; |
692 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
693 | struct buffer_head *bh; |
694 | int bsize = sbi->s_journal->j_blocksize; |
695 | int ret, off = sbi->s_fc_bytes % bsize; |
696 | int remaining; |
697 | u8 *dst; |
698 | |
699 | /* |
700 | * If 'len' is too long to fit in any block alongside a PAD tlv, then we |
701 | * cannot fulfill the request. |
702 | */ |
703 | if (len > bsize - EXT4_FC_TAG_BASE_LEN) |
704 | return NULL; |
705 | |
706 | if (!sbi->s_fc_bh) { |
707 | ret = jbd2_fc_get_buf(journal: EXT4_SB(sb)->s_journal, bh_out: &bh); |
708 | if (ret) |
709 | return NULL; |
710 | sbi->s_fc_bh = bh; |
711 | } |
712 | dst = sbi->s_fc_bh->b_data + off; |
713 | |
714 | /* |
715 | * Allocate the bytes in the current block if we can do so while still |
716 | * leaving enough space for a PAD tlv. |
717 | */ |
718 | remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; |
719 | if (len <= remaining) { |
720 | sbi->s_fc_bytes += len; |
721 | return dst; |
722 | } |
723 | |
724 | /* |
725 | * Else, terminate the current block with a PAD tlv, then allocate a new |
726 | * block and allocate the bytes at the start of that new block. |
727 | */ |
728 | |
729 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); |
730 | tl.fc_len = cpu_to_le16(remaining); |
731 | memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); |
732 | memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); |
733 | *crc = ext4_chksum(sbi, crc: *crc, address: sbi->s_fc_bh->b_data, length: bsize); |
734 | |
735 | ext4_fc_submit_bh(sb, is_tail: false); |
736 | |
737 | ret = jbd2_fc_get_buf(journal: EXT4_SB(sb)->s_journal, bh_out: &bh); |
738 | if (ret) |
739 | return NULL; |
740 | sbi->s_fc_bh = bh; |
741 | sbi->s_fc_bytes += bsize - off + len; |
742 | return sbi->s_fc_bh->b_data; |
743 | } |
744 | |
745 | /* |
746 | * Complete a fast commit by writing tail tag. |
747 | * |
748 | * Writing tail tag marks the end of a fast commit. In order to guarantee |
749 | * atomicity, after writing tail tag, even if there's space remaining |
750 | * in the block, next commit shouldn't use it. That's why tail tag |
751 | * has the length as that of the remaining space on the block. |
752 | */ |
753 | static int ext4_fc_write_tail(struct super_block *sb, u32 crc) |
754 | { |
755 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
756 | struct ext4_fc_tl tl; |
757 | struct ext4_fc_tail tail; |
758 | int off, bsize = sbi->s_journal->j_blocksize; |
759 | u8 *dst; |
760 | |
761 | /* |
762 | * ext4_fc_reserve_space takes care of allocating an extra block if |
763 | * there's no enough space on this block for accommodating this tail. |
764 | */ |
765 | dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), crc: &crc); |
766 | if (!dst) |
767 | return -ENOSPC; |
768 | |
769 | off = sbi->s_fc_bytes % bsize; |
770 | |
771 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); |
772 | tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); |
773 | sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); |
774 | |
775 | memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); |
776 | dst += EXT4_FC_TAG_BASE_LEN; |
777 | tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); |
778 | memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); |
779 | dst += sizeof(tail.fc_tid); |
780 | crc = ext4_chksum(sbi, crc, address: sbi->s_fc_bh->b_data, |
781 | length: dst - (u8 *)sbi->s_fc_bh->b_data); |
782 | tail.fc_crc = cpu_to_le32(crc); |
783 | memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); |
784 | dst += sizeof(tail.fc_crc); |
785 | memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ |
786 | |
787 | ext4_fc_submit_bh(sb, is_tail: true); |
788 | |
789 | return 0; |
790 | } |
791 | |
792 | /* |
793 | * Adds tag, length, value and updates CRC. Returns true if tlv was added. |
794 | * Returns false if there's not enough space. |
795 | */ |
796 | static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, |
797 | u32 *crc) |
798 | { |
799 | struct ext4_fc_tl tl; |
800 | u8 *dst; |
801 | |
802 | dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); |
803 | if (!dst) |
804 | return false; |
805 | |
806 | tl.fc_tag = cpu_to_le16(tag); |
807 | tl.fc_len = cpu_to_le16(len); |
808 | |
809 | memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); |
810 | memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); |
811 | |
812 | return true; |
813 | } |
814 | |
815 | /* Same as above, but adds dentry tlv. */ |
816 | static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, |
817 | struct ext4_fc_dentry_update *fc_dentry) |
818 | { |
819 | struct ext4_fc_dentry_info fcd; |
820 | struct ext4_fc_tl tl; |
821 | int dlen = fc_dentry->fcd_name.len; |
822 | u8 *dst = ext4_fc_reserve_space(sb, |
823 | EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); |
824 | |
825 | if (!dst) |
826 | return false; |
827 | |
828 | fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); |
829 | fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); |
830 | tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); |
831 | tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); |
832 | memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); |
833 | dst += EXT4_FC_TAG_BASE_LEN; |
834 | memcpy(dst, &fcd, sizeof(fcd)); |
835 | dst += sizeof(fcd); |
836 | memcpy(dst, fc_dentry->fcd_name.name, dlen); |
837 | |
838 | return true; |
839 | } |
840 | |
841 | /* |
842 | * Writes inode in the fast commit space under TLV with tag @tag. |
843 | * Returns 0 on success, error on failure. |
844 | */ |
845 | static int ext4_fc_write_inode(struct inode *inode, u32 *crc) |
846 | { |
847 | struct ext4_inode_info *ei = EXT4_I(inode); |
848 | int inode_len = EXT4_GOOD_OLD_INODE_SIZE; |
849 | int ret; |
850 | struct ext4_iloc iloc; |
851 | struct ext4_fc_inode fc_inode; |
852 | struct ext4_fc_tl tl; |
853 | u8 *dst; |
854 | |
855 | ret = ext4_get_inode_loc(inode, &iloc); |
856 | if (ret) |
857 | return ret; |
858 | |
859 | if (ext4_test_inode_flag(inode, bit: EXT4_INODE_INLINE_DATA)) |
860 | inode_len = EXT4_INODE_SIZE(inode->i_sb); |
861 | else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) |
862 | inode_len += ei->i_extra_isize; |
863 | |
864 | fc_inode.fc_ino = cpu_to_le32(inode->i_ino); |
865 | tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); |
866 | tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); |
867 | |
868 | ret = -ECANCELED; |
869 | dst = ext4_fc_reserve_space(sb: inode->i_sb, |
870 | EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); |
871 | if (!dst) |
872 | goto err; |
873 | |
874 | memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); |
875 | dst += EXT4_FC_TAG_BASE_LEN; |
876 | memcpy(dst, &fc_inode, sizeof(fc_inode)); |
877 | dst += sizeof(fc_inode); |
878 | memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); |
879 | ret = 0; |
880 | err: |
881 | brelse(bh: iloc.bh); |
882 | return ret; |
883 | } |
884 | |
885 | /* |
886 | * Writes updated data ranges for the inode in question. Updates CRC. |
887 | * Returns 0 on success, error otherwise. |
888 | */ |
889 | static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) |
890 | { |
891 | ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; |
892 | struct ext4_inode_info *ei = EXT4_I(inode); |
893 | struct ext4_map_blocks map; |
894 | struct ext4_fc_add_range fc_ext; |
895 | struct ext4_fc_del_range lrange; |
896 | struct ext4_extent *ex; |
897 | int ret; |
898 | |
899 | mutex_lock(&ei->i_fc_lock); |
900 | if (ei->i_fc_lblk_len == 0) { |
901 | mutex_unlock(lock: &ei->i_fc_lock); |
902 | return 0; |
903 | } |
904 | old_blk_size = ei->i_fc_lblk_start; |
905 | new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; |
906 | ei->i_fc_lblk_len = 0; |
907 | mutex_unlock(lock: &ei->i_fc_lock); |
908 | |
909 | cur_lblk_off = old_blk_size; |
910 | ext4_debug("will try writing %d to %d for inode %ld\n" , |
911 | cur_lblk_off, new_blk_size, inode->i_ino); |
912 | |
913 | while (cur_lblk_off <= new_blk_size) { |
914 | map.m_lblk = cur_lblk_off; |
915 | map.m_len = new_blk_size - cur_lblk_off + 1; |
916 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
917 | if (ret < 0) |
918 | return -ECANCELED; |
919 | |
920 | if (map.m_len == 0) { |
921 | cur_lblk_off++; |
922 | continue; |
923 | } |
924 | |
925 | if (ret == 0) { |
926 | lrange.fc_ino = cpu_to_le32(inode->i_ino); |
927 | lrange.fc_lblk = cpu_to_le32(map.m_lblk); |
928 | lrange.fc_len = cpu_to_le32(map.m_len); |
929 | if (!ext4_fc_add_tlv(sb: inode->i_sb, EXT4_FC_TAG_DEL_RANGE, |
930 | len: sizeof(lrange), val: (u8 *)&lrange, crc)) |
931 | return -ENOSPC; |
932 | } else { |
933 | unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? |
934 | EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; |
935 | |
936 | /* Limit the number of blocks in one extent */ |
937 | map.m_len = min(max, map.m_len); |
938 | |
939 | fc_ext.fc_ino = cpu_to_le32(inode->i_ino); |
940 | ex = (struct ext4_extent *)&fc_ext.fc_ex; |
941 | ex->ee_block = cpu_to_le32(map.m_lblk); |
942 | ex->ee_len = cpu_to_le16(map.m_len); |
943 | ext4_ext_store_pblock(ex, pb: map.m_pblk); |
944 | if (map.m_flags & EXT4_MAP_UNWRITTEN) |
945 | ext4_ext_mark_unwritten(ext: ex); |
946 | else |
947 | ext4_ext_mark_initialized(ext: ex); |
948 | if (!ext4_fc_add_tlv(sb: inode->i_sb, EXT4_FC_TAG_ADD_RANGE, |
949 | len: sizeof(fc_ext), val: (u8 *)&fc_ext, crc)) |
950 | return -ENOSPC; |
951 | } |
952 | |
953 | cur_lblk_off += map.m_len; |
954 | } |
955 | |
956 | return 0; |
957 | } |
958 | |
959 | |
960 | /* Submit data for all the fast commit inodes */ |
961 | static int ext4_fc_submit_inode_data_all(journal_t *journal) |
962 | { |
963 | struct super_block *sb = journal->j_private; |
964 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
965 | struct ext4_inode_info *ei; |
966 | int ret = 0; |
967 | |
968 | spin_lock(lock: &sbi->s_fc_lock); |
969 | list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { |
970 | ext4_set_inode_state(inode: &ei->vfs_inode, bit: EXT4_STATE_FC_COMMITTING); |
971 | while (atomic_read(v: &ei->i_fc_updates)) { |
972 | DEFINE_WAIT(wait); |
973 | |
974 | prepare_to_wait(wq_head: &ei->i_fc_wait, wq_entry: &wait, |
975 | TASK_UNINTERRUPTIBLE); |
976 | if (atomic_read(v: &ei->i_fc_updates)) { |
977 | spin_unlock(lock: &sbi->s_fc_lock); |
978 | schedule(); |
979 | spin_lock(lock: &sbi->s_fc_lock); |
980 | } |
981 | finish_wait(wq_head: &ei->i_fc_wait, wq_entry: &wait); |
982 | } |
983 | spin_unlock(lock: &sbi->s_fc_lock); |
984 | ret = jbd2_submit_inode_data(journal, jinode: ei->jinode); |
985 | if (ret) |
986 | return ret; |
987 | spin_lock(lock: &sbi->s_fc_lock); |
988 | } |
989 | spin_unlock(lock: &sbi->s_fc_lock); |
990 | |
991 | return ret; |
992 | } |
993 | |
994 | /* Wait for completion of data for all the fast commit inodes */ |
995 | static int ext4_fc_wait_inode_data_all(journal_t *journal) |
996 | { |
997 | struct super_block *sb = journal->j_private; |
998 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
999 | struct ext4_inode_info *pos, *n; |
1000 | int ret = 0; |
1001 | |
1002 | spin_lock(lock: &sbi->s_fc_lock); |
1003 | list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { |
1004 | if (!ext4_test_inode_state(inode: &pos->vfs_inode, |
1005 | bit: EXT4_STATE_FC_COMMITTING)) |
1006 | continue; |
1007 | spin_unlock(lock: &sbi->s_fc_lock); |
1008 | |
1009 | ret = jbd2_wait_inode_data(journal, jinode: pos->jinode); |
1010 | if (ret) |
1011 | return ret; |
1012 | spin_lock(lock: &sbi->s_fc_lock); |
1013 | } |
1014 | spin_unlock(lock: &sbi->s_fc_lock); |
1015 | |
1016 | return 0; |
1017 | } |
1018 | |
1019 | /* Commit all the directory entry updates */ |
1020 | static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) |
1021 | __acquires(&sbi->s_fc_lock) |
1022 | __releases(&sbi->s_fc_lock) |
1023 | { |
1024 | struct super_block *sb = journal->j_private; |
1025 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1026 | struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; |
1027 | struct inode *inode; |
1028 | struct ext4_inode_info *ei; |
1029 | int ret; |
1030 | |
1031 | if (list_empty(head: &sbi->s_fc_dentry_q[FC_Q_MAIN])) |
1032 | return 0; |
1033 | list_for_each_entry_safe(fc_dentry, fc_dentry_n, |
1034 | &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { |
1035 | if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { |
1036 | spin_unlock(lock: &sbi->s_fc_lock); |
1037 | if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { |
1038 | ret = -ENOSPC; |
1039 | goto lock_and_exit; |
1040 | } |
1041 | spin_lock(lock: &sbi->s_fc_lock); |
1042 | continue; |
1043 | } |
1044 | /* |
1045 | * With fcd_dilist we need not loop in sbi->s_fc_q to get the |
1046 | * corresponding inode pointer |
1047 | */ |
1048 | WARN_ON(list_empty(&fc_dentry->fcd_dilist)); |
1049 | ei = list_first_entry(&fc_dentry->fcd_dilist, |
1050 | struct ext4_inode_info, i_fc_dilist); |
1051 | inode = &ei->vfs_inode; |
1052 | WARN_ON(inode->i_ino != fc_dentry->fcd_ino); |
1053 | |
1054 | spin_unlock(lock: &sbi->s_fc_lock); |
1055 | |
1056 | /* |
1057 | * We first write the inode and then the create dirent. This |
1058 | * allows the recovery code to create an unnamed inode first |
1059 | * and then link it to a directory entry. This allows us |
1060 | * to use namei.c routines almost as is and simplifies |
1061 | * the recovery code. |
1062 | */ |
1063 | ret = ext4_fc_write_inode(inode, crc); |
1064 | if (ret) |
1065 | goto lock_and_exit; |
1066 | |
1067 | ret = ext4_fc_write_inode_data(inode, crc); |
1068 | if (ret) |
1069 | goto lock_and_exit; |
1070 | |
1071 | if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { |
1072 | ret = -ENOSPC; |
1073 | goto lock_and_exit; |
1074 | } |
1075 | |
1076 | spin_lock(lock: &sbi->s_fc_lock); |
1077 | } |
1078 | return 0; |
1079 | lock_and_exit: |
1080 | spin_lock(lock: &sbi->s_fc_lock); |
1081 | return ret; |
1082 | } |
1083 | |
1084 | static int ext4_fc_perform_commit(journal_t *journal) |
1085 | { |
1086 | struct super_block *sb = journal->j_private; |
1087 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1088 | struct ext4_inode_info *iter; |
1089 | struct ext4_fc_head head; |
1090 | struct inode *inode; |
1091 | struct blk_plug plug; |
1092 | int ret = 0; |
1093 | u32 crc = 0; |
1094 | |
1095 | ret = ext4_fc_submit_inode_data_all(journal); |
1096 | if (ret) |
1097 | return ret; |
1098 | |
1099 | ret = ext4_fc_wait_inode_data_all(journal); |
1100 | if (ret) |
1101 | return ret; |
1102 | |
1103 | /* |
1104 | * If file system device is different from journal device, issue a cache |
1105 | * flush before we start writing fast commit blocks. |
1106 | */ |
1107 | if (journal->j_fs_dev != journal->j_dev) |
1108 | blkdev_issue_flush(bdev: journal->j_fs_dev); |
1109 | |
1110 | blk_start_plug(&plug); |
1111 | if (sbi->s_fc_bytes == 0) { |
1112 | /* |
1113 | * Add a head tag only if this is the first fast commit |
1114 | * in this TID. |
1115 | */ |
1116 | head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); |
1117 | head.fc_tid = cpu_to_le32( |
1118 | sbi->s_journal->j_running_transaction->t_tid); |
1119 | if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, len: sizeof(head), |
1120 | val: (u8 *)&head, crc: &crc)) { |
1121 | ret = -ENOSPC; |
1122 | goto out; |
1123 | } |
1124 | } |
1125 | |
1126 | spin_lock(lock: &sbi->s_fc_lock); |
1127 | ret = ext4_fc_commit_dentry_updates(journal, crc: &crc); |
1128 | if (ret) { |
1129 | spin_unlock(lock: &sbi->s_fc_lock); |
1130 | goto out; |
1131 | } |
1132 | |
1133 | list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { |
1134 | inode = &iter->vfs_inode; |
1135 | if (!ext4_test_inode_state(inode, bit: EXT4_STATE_FC_COMMITTING)) |
1136 | continue; |
1137 | |
1138 | spin_unlock(lock: &sbi->s_fc_lock); |
1139 | ret = ext4_fc_write_inode_data(inode, crc: &crc); |
1140 | if (ret) |
1141 | goto out; |
1142 | ret = ext4_fc_write_inode(inode, crc: &crc); |
1143 | if (ret) |
1144 | goto out; |
1145 | spin_lock(lock: &sbi->s_fc_lock); |
1146 | } |
1147 | spin_unlock(lock: &sbi->s_fc_lock); |
1148 | |
1149 | ret = ext4_fc_write_tail(sb, crc); |
1150 | |
1151 | out: |
1152 | blk_finish_plug(&plug); |
1153 | return ret; |
1154 | } |
1155 | |
1156 | static void ext4_fc_update_stats(struct super_block *sb, int status, |
1157 | u64 commit_time, int nblks, tid_t commit_tid) |
1158 | { |
1159 | struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; |
1160 | |
1161 | ext4_debug("Fast commit ended with status = %d for tid %u" , |
1162 | status, commit_tid); |
1163 | if (status == EXT4_FC_STATUS_OK) { |
1164 | stats->fc_num_commits++; |
1165 | stats->fc_numblks += nblks; |
1166 | if (likely(stats->s_fc_avg_commit_time)) |
1167 | stats->s_fc_avg_commit_time = |
1168 | (commit_time + |
1169 | stats->s_fc_avg_commit_time * 3) / 4; |
1170 | else |
1171 | stats->s_fc_avg_commit_time = commit_time; |
1172 | } else if (status == EXT4_FC_STATUS_FAILED || |
1173 | status == EXT4_FC_STATUS_INELIGIBLE) { |
1174 | if (status == EXT4_FC_STATUS_FAILED) |
1175 | stats->fc_failed_commits++; |
1176 | stats->fc_ineligible_commits++; |
1177 | } else { |
1178 | stats->fc_skipped_commits++; |
1179 | } |
1180 | trace_ext4_fc_commit_stop(sb, nblks, reason: status, commit_tid); |
1181 | } |
1182 | |
1183 | /* |
1184 | * The main commit entry point. Performs a fast commit for transaction |
1185 | * commit_tid if needed. If it's not possible to perform a fast commit |
1186 | * due to various reasons, we fall back to full commit. Returns 0 |
1187 | * on success, error otherwise. |
1188 | */ |
1189 | int ext4_fc_commit(journal_t *journal, tid_t commit_tid) |
1190 | { |
1191 | struct super_block *sb = journal->j_private; |
1192 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1193 | int nblks = 0, ret, bsize = journal->j_blocksize; |
1194 | int subtid = atomic_read(v: &sbi->s_fc_subtid); |
1195 | int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; |
1196 | ktime_t start_time, commit_time; |
1197 | |
1198 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) |
1199 | return jbd2_complete_transaction(journal, tid: commit_tid); |
1200 | |
1201 | trace_ext4_fc_commit_start(sb, commit_tid); |
1202 | |
1203 | start_time = ktime_get(); |
1204 | |
1205 | restart_fc: |
1206 | ret = jbd2_fc_begin_commit(journal, tid: commit_tid); |
1207 | if (ret == -EALREADY) { |
1208 | /* There was an ongoing commit, check if we need to restart */ |
1209 | if (atomic_read(v: &sbi->s_fc_subtid) <= subtid && |
1210 | commit_tid > journal->j_commit_sequence) |
1211 | goto restart_fc; |
1212 | ext4_fc_update_stats(sb, status: EXT4_FC_STATUS_SKIPPED, commit_time: 0, nblks: 0, |
1213 | commit_tid); |
1214 | return 0; |
1215 | } else if (ret) { |
1216 | /* |
1217 | * Commit couldn't start. Just update stats and perform a |
1218 | * full commit. |
1219 | */ |
1220 | ext4_fc_update_stats(sb, status: EXT4_FC_STATUS_FAILED, commit_time: 0, nblks: 0, |
1221 | commit_tid); |
1222 | return jbd2_complete_transaction(journal, tid: commit_tid); |
1223 | } |
1224 | |
1225 | /* |
1226 | * After establishing journal barrier via jbd2_fc_begin_commit(), check |
1227 | * if we are fast commit ineligible. |
1228 | */ |
1229 | if (ext4_test_mount_flag(sb, bit: EXT4_MF_FC_INELIGIBLE)) { |
1230 | status = EXT4_FC_STATUS_INELIGIBLE; |
1231 | goto fallback; |
1232 | } |
1233 | |
1234 | fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; |
1235 | ret = ext4_fc_perform_commit(journal); |
1236 | if (ret < 0) { |
1237 | status = EXT4_FC_STATUS_FAILED; |
1238 | goto fallback; |
1239 | } |
1240 | nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; |
1241 | ret = jbd2_fc_wait_bufs(journal, num_blks: nblks); |
1242 | if (ret < 0) { |
1243 | status = EXT4_FC_STATUS_FAILED; |
1244 | goto fallback; |
1245 | } |
1246 | atomic_inc(v: &sbi->s_fc_subtid); |
1247 | ret = jbd2_fc_end_commit(journal); |
1248 | /* |
1249 | * weight the commit time higher than the average time so we |
1250 | * don't react too strongly to vast changes in the commit time |
1251 | */ |
1252 | commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); |
1253 | ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); |
1254 | return ret; |
1255 | |
1256 | fallback: |
1257 | ret = jbd2_fc_end_commit_fallback(journal); |
1258 | ext4_fc_update_stats(sb, status, commit_time: 0, nblks: 0, commit_tid); |
1259 | return ret; |
1260 | } |
1261 | |
1262 | /* |
1263 | * Fast commit cleanup routine. This is called after every fast commit and |
1264 | * full commit. full is true if we are called after a full commit. |
1265 | */ |
1266 | static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) |
1267 | { |
1268 | struct super_block *sb = journal->j_private; |
1269 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1270 | struct ext4_inode_info *iter, *iter_n; |
1271 | struct ext4_fc_dentry_update *fc_dentry; |
1272 | |
1273 | if (full && sbi->s_fc_bh) |
1274 | sbi->s_fc_bh = NULL; |
1275 | |
1276 | trace_ext4_fc_cleanup(journal, full, tid); |
1277 | jbd2_fc_release_bufs(journal); |
1278 | |
1279 | spin_lock(lock: &sbi->s_fc_lock); |
1280 | list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], |
1281 | i_fc_list) { |
1282 | list_del_init(entry: &iter->i_fc_list); |
1283 | ext4_clear_inode_state(inode: &iter->vfs_inode, |
1284 | bit: EXT4_STATE_FC_COMMITTING); |
1285 | if (iter->i_sync_tid <= tid) |
1286 | ext4_fc_reset_inode(inode: &iter->vfs_inode); |
1287 | /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ |
1288 | smp_mb(); |
1289 | #if (BITS_PER_LONG < 64) |
1290 | wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); |
1291 | #else |
1292 | wake_up_bit(word: &iter->i_flags, bit: EXT4_STATE_FC_COMMITTING); |
1293 | #endif |
1294 | } |
1295 | |
1296 | while (!list_empty(head: &sbi->s_fc_dentry_q[FC_Q_MAIN])) { |
1297 | fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], |
1298 | struct ext4_fc_dentry_update, |
1299 | fcd_list); |
1300 | list_del_init(entry: &fc_dentry->fcd_list); |
1301 | list_del_init(entry: &fc_dentry->fcd_dilist); |
1302 | spin_unlock(lock: &sbi->s_fc_lock); |
1303 | |
1304 | if (fc_dentry->fcd_name.name && |
1305 | fc_dentry->fcd_name.len > DNAME_INLINE_LEN) |
1306 | kfree(objp: fc_dentry->fcd_name.name); |
1307 | kmem_cache_free(s: ext4_fc_dentry_cachep, objp: fc_dentry); |
1308 | spin_lock(lock: &sbi->s_fc_lock); |
1309 | } |
1310 | |
1311 | list_splice_init(list: &sbi->s_fc_dentry_q[FC_Q_STAGING], |
1312 | head: &sbi->s_fc_dentry_q[FC_Q_MAIN]); |
1313 | list_splice_init(list: &sbi->s_fc_q[FC_Q_STAGING], |
1314 | head: &sbi->s_fc_q[FC_Q_MAIN]); |
1315 | |
1316 | if (tid >= sbi->s_fc_ineligible_tid) { |
1317 | sbi->s_fc_ineligible_tid = 0; |
1318 | ext4_clear_mount_flag(sb, bit: EXT4_MF_FC_INELIGIBLE); |
1319 | } |
1320 | |
1321 | if (full) |
1322 | sbi->s_fc_bytes = 0; |
1323 | spin_unlock(lock: &sbi->s_fc_lock); |
1324 | trace_ext4_fc_stats(sb); |
1325 | } |
1326 | |
1327 | /* Ext4 Replay Path Routines */ |
1328 | |
/*
 * Decoded form of a dentry TLV, shared by the dentry replay routines.
 * tl_to_darg() fills in parent_ino, ino, dname and dname_len.
 */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* number of bytes at @dname */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1334 | |
/*
 * Same as struct ext4_fc_tl, but uses native endianness fields.
 * ext4_fc_get_tl() fills this in from the little-endian header bytes.
 */
struct ext4_fc_tl_mem {
	u16 fc_tag;	/* TLV tag, CPU byte order */
	u16 fc_len;	/* TLV value length, CPU byte order */
};
1340 | |
1341 | static inline void tl_to_darg(struct dentry_info_args *darg, |
1342 | struct ext4_fc_tl_mem *tl, u8 *val) |
1343 | { |
1344 | struct ext4_fc_dentry_info fcd; |
1345 | |
1346 | memcpy(&fcd, val, sizeof(fcd)); |
1347 | |
1348 | darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); |
1349 | darg->ino = le32_to_cpu(fcd.fc_ino); |
1350 | darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); |
1351 | darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); |
1352 | } |
1353 | |
1354 | static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) |
1355 | { |
1356 | struct ext4_fc_tl tl_disk; |
1357 | |
1358 | memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); |
1359 | tl->fc_len = le16_to_cpu(tl_disk.fc_len); |
1360 | tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); |
1361 | } |
1362 | |
1363 | /* Unlink replay function */ |
1364 | static int ext4_fc_replay_unlink(struct super_block *sb, |
1365 | struct ext4_fc_tl_mem *tl, u8 *val) |
1366 | { |
1367 | struct inode *inode, *old_parent; |
1368 | struct qstr entry; |
1369 | struct dentry_info_args darg; |
1370 | int ret = 0; |
1371 | |
1372 | tl_to_darg(darg: &darg, tl, val); |
1373 | |
1374 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, ino: darg.ino, |
1375 | priv1: darg.parent_ino, priv2: darg.dname_len); |
1376 | |
1377 | entry.name = darg.dname; |
1378 | entry.len = darg.dname_len; |
1379 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); |
1380 | |
1381 | if (IS_ERR(ptr: inode)) { |
1382 | ext4_debug("Inode %d not found" , darg.ino); |
1383 | return 0; |
1384 | } |
1385 | |
1386 | old_parent = ext4_iget(sb, darg.parent_ino, |
1387 | EXT4_IGET_NORMAL); |
1388 | if (IS_ERR(ptr: old_parent)) { |
1389 | ext4_debug("Dir with inode %d not found" , darg.parent_ino); |
1390 | iput(inode); |
1391 | return 0; |
1392 | } |
1393 | |
1394 | ret = __ext4_unlink(dir: old_parent, d_name: &entry, inode, NULL); |
1395 | /* -ENOENT ok coz it might not exist anymore. */ |
1396 | if (ret == -ENOENT) |
1397 | ret = 0; |
1398 | iput(old_parent); |
1399 | iput(inode); |
1400 | return ret; |
1401 | } |
1402 | |
1403 | static int ext4_fc_replay_link_internal(struct super_block *sb, |
1404 | struct dentry_info_args *darg, |
1405 | struct inode *inode) |
1406 | { |
1407 | struct inode *dir = NULL; |
1408 | struct dentry *dentry_dir = NULL, *dentry_inode = NULL; |
1409 | struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); |
1410 | int ret = 0; |
1411 | |
1412 | dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); |
1413 | if (IS_ERR(ptr: dir)) { |
1414 | ext4_debug("Dir with inode %d not found." , darg->parent_ino); |
1415 | dir = NULL; |
1416 | goto out; |
1417 | } |
1418 | |
1419 | dentry_dir = d_obtain_alias(dir); |
1420 | if (IS_ERR(ptr: dentry_dir)) { |
1421 | ext4_debug("Failed to obtain dentry" ); |
1422 | dentry_dir = NULL; |
1423 | goto out; |
1424 | } |
1425 | |
1426 | dentry_inode = d_alloc(dentry_dir, &qstr_dname); |
1427 | if (!dentry_inode) { |
1428 | ext4_debug("Inode dentry not created." ); |
1429 | ret = -ENOMEM; |
1430 | goto out; |
1431 | } |
1432 | |
1433 | ret = __ext4_link(dir, inode, dentry: dentry_inode); |
1434 | /* |
1435 | * It's possible that link already existed since data blocks |
1436 | * for the dir in question got persisted before we crashed OR |
1437 | * we replayed this tag and crashed before the entire replay |
1438 | * could complete. |
1439 | */ |
1440 | if (ret && ret != -EEXIST) { |
1441 | ext4_debug("Failed to link\n" ); |
1442 | goto out; |
1443 | } |
1444 | |
1445 | ret = 0; |
1446 | out: |
1447 | if (dentry_dir) { |
1448 | d_drop(dentry: dentry_dir); |
1449 | dput(dentry_dir); |
1450 | } else if (dir) { |
1451 | iput(dir); |
1452 | } |
1453 | if (dentry_inode) { |
1454 | d_drop(dentry: dentry_inode); |
1455 | dput(dentry_inode); |
1456 | } |
1457 | |
1458 | return ret; |
1459 | } |
1460 | |
1461 | /* Link replay function */ |
1462 | static int ext4_fc_replay_link(struct super_block *sb, |
1463 | struct ext4_fc_tl_mem *tl, u8 *val) |
1464 | { |
1465 | struct inode *inode; |
1466 | struct dentry_info_args darg; |
1467 | int ret = 0; |
1468 | |
1469 | tl_to_darg(darg: &darg, tl, val); |
1470 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, ino: darg.ino, |
1471 | priv1: darg.parent_ino, priv2: darg.dname_len); |
1472 | |
1473 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); |
1474 | if (IS_ERR(ptr: inode)) { |
1475 | ext4_debug("Inode not found." ); |
1476 | return 0; |
1477 | } |
1478 | |
1479 | ret = ext4_fc_replay_link_internal(sb, darg: &darg, inode); |
1480 | iput(inode); |
1481 | return ret; |
1482 | } |
1483 | |
1484 | /* |
1485 | * Record all the modified inodes during replay. We use this later to setup |
1486 | * block bitmaps correctly. |
1487 | */ |
1488 | static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) |
1489 | { |
1490 | struct ext4_fc_replay_state *state; |
1491 | int i; |
1492 | |
1493 | state = &EXT4_SB(sb)->s_fc_replay_state; |
1494 | for (i = 0; i < state->fc_modified_inodes_used; i++) |
1495 | if (state->fc_modified_inodes[i] == ino) |
1496 | return 0; |
1497 | if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { |
1498 | int *fc_modified_inodes; |
1499 | |
1500 | fc_modified_inodes = krealloc(objp: state->fc_modified_inodes, |
1501 | new_size: sizeof(int) * (state->fc_modified_inodes_size + |
1502 | EXT4_FC_REPLAY_REALLOC_INCREMENT), |
1503 | GFP_KERNEL); |
1504 | if (!fc_modified_inodes) |
1505 | return -ENOMEM; |
1506 | state->fc_modified_inodes = fc_modified_inodes; |
1507 | state->fc_modified_inodes_size += |
1508 | EXT4_FC_REPLAY_REALLOC_INCREMENT; |
1509 | } |
1510 | state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; |
1511 | return 0; |
1512 | } |
1513 | |
1514 | /* |
1515 | * Inode replay function |
1516 | */ |
1517 | static int ext4_fc_replay_inode(struct super_block *sb, |
1518 | struct ext4_fc_tl_mem *tl, u8 *val) |
1519 | { |
1520 | struct ext4_fc_inode fc_inode; |
1521 | struct ext4_inode *raw_inode; |
1522 | struct ext4_inode *raw_fc_inode; |
1523 | struct inode *inode = NULL; |
1524 | struct ext4_iloc iloc; |
1525 | int inode_len, ino, ret, tag = tl->fc_tag; |
1526 | struct ext4_extent_header *eh; |
1527 | size_t off_gen = offsetof(struct ext4_inode, i_generation); |
1528 | |
1529 | memcpy(&fc_inode, val, sizeof(fc_inode)); |
1530 | |
1531 | ino = le32_to_cpu(fc_inode.fc_ino); |
1532 | trace_ext4_fc_replay(sb, tag, ino, priv1: 0, priv2: 0); |
1533 | |
1534 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); |
1535 | if (!IS_ERR(ptr: inode)) { |
1536 | ext4_ext_clear_bb(inode); |
1537 | iput(inode); |
1538 | } |
1539 | inode = NULL; |
1540 | |
1541 | ret = ext4_fc_record_modified_inode(sb, ino); |
1542 | if (ret) |
1543 | goto out; |
1544 | |
1545 | raw_fc_inode = (struct ext4_inode *) |
1546 | (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); |
1547 | ret = ext4_get_fc_inode_loc(sb, ino, iloc: &iloc); |
1548 | if (ret) |
1549 | goto out; |
1550 | |
1551 | inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); |
1552 | raw_inode = ext4_raw_inode(iloc: &iloc); |
1553 | |
1554 | memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); |
1555 | memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, |
1556 | inode_len - off_gen); |
1557 | if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { |
1558 | eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); |
1559 | if (eh->eh_magic != EXT4_EXT_MAGIC) { |
1560 | memset(eh, 0, sizeof(*eh)); |
1561 | eh->eh_magic = EXT4_EXT_MAGIC; |
1562 | eh->eh_max = cpu_to_le16( |
1563 | (sizeof(raw_inode->i_block) - |
1564 | sizeof(struct ext4_extent_header)) |
1565 | / sizeof(struct ext4_extent)); |
1566 | } |
1567 | } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { |
1568 | memcpy(raw_inode->i_block, raw_fc_inode->i_block, |
1569 | sizeof(raw_inode->i_block)); |
1570 | } |
1571 | |
1572 | /* Immediately update the inode on disk. */ |
1573 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); |
1574 | if (ret) |
1575 | goto out; |
1576 | ret = sync_dirty_buffer(bh: iloc.bh); |
1577 | if (ret) |
1578 | goto out; |
1579 | ret = ext4_mark_inode_used(sb, ino); |
1580 | if (ret) |
1581 | goto out; |
1582 | |
1583 | /* Given that we just wrote the inode on disk, this SHOULD succeed. */ |
1584 | inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); |
1585 | if (IS_ERR(ptr: inode)) { |
1586 | ext4_debug("Inode not found." ); |
1587 | return -EFSCORRUPTED; |
1588 | } |
1589 | |
1590 | /* |
1591 | * Our allocator could have made different decisions than before |
1592 | * crashing. This should be fixed but until then, we calculate |
1593 | * the number of blocks the inode. |
1594 | */ |
1595 | if (!ext4_test_inode_flag(inode, bit: EXT4_INODE_INLINE_DATA)) |
1596 | ext4_ext_replay_set_iblocks(inode); |
1597 | |
1598 | inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); |
1599 | ext4_reset_inode_seed(inode); |
1600 | |
1601 | ext4_inode_csum_set(inode, raw: ext4_raw_inode(iloc: &iloc), EXT4_I(inode)); |
1602 | ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); |
1603 | sync_dirty_buffer(bh: iloc.bh); |
1604 | brelse(bh: iloc.bh); |
1605 | out: |
1606 | iput(inode); |
1607 | if (!ret) |
1608 | blkdev_issue_flush(bdev: sb->s_bdev); |
1609 | |
1610 | return 0; |
1611 | } |
1612 | |
1613 | /* |
1614 | * Dentry create replay function. |
1615 | * |
1616 | * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the |
1617 | * inode for which we are trying to create a dentry here, should already have |
1618 | * been replayed before we start here. |
1619 | */ |
1620 | static int ext4_fc_replay_create(struct super_block *sb, |
1621 | struct ext4_fc_tl_mem *tl, u8 *val) |
1622 | { |
1623 | int ret = 0; |
1624 | struct inode *inode = NULL; |
1625 | struct inode *dir = NULL; |
1626 | struct dentry_info_args darg; |
1627 | |
1628 | tl_to_darg(darg: &darg, tl, val); |
1629 | |
1630 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, ino: darg.ino, |
1631 | priv1: darg.parent_ino, priv2: darg.dname_len); |
1632 | |
1633 | /* This takes care of update group descriptor and other metadata */ |
1634 | ret = ext4_mark_inode_used(sb, ino: darg.ino); |
1635 | if (ret) |
1636 | goto out; |
1637 | |
1638 | inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); |
1639 | if (IS_ERR(ptr: inode)) { |
1640 | ext4_debug("inode %d not found." , darg.ino); |
1641 | inode = NULL; |
1642 | ret = -EINVAL; |
1643 | goto out; |
1644 | } |
1645 | |
1646 | if (S_ISDIR(inode->i_mode)) { |
1647 | /* |
1648 | * If we are creating a directory, we need to make sure that the |
1649 | * dot and dot dot dirents are setup properly. |
1650 | */ |
1651 | dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); |
1652 | if (IS_ERR(ptr: dir)) { |
1653 | ext4_debug("Dir %d not found." , darg.ino); |
1654 | goto out; |
1655 | } |
1656 | ret = ext4_init_new_dir(NULL, dir, inode); |
1657 | iput(dir); |
1658 | if (ret) { |
1659 | ret = 0; |
1660 | goto out; |
1661 | } |
1662 | } |
1663 | ret = ext4_fc_replay_link_internal(sb, darg: &darg, inode); |
1664 | if (ret) |
1665 | goto out; |
1666 | set_nlink(inode, nlink: 1); |
1667 | ext4_mark_inode_dirty(NULL, inode); |
1668 | out: |
1669 | iput(inode); |
1670 | return ret; |
1671 | } |
1672 | |
1673 | /* |
1674 | * Record physical disk regions which are in use as per fast commit area, |
1675 | * and used by inodes during replay phase. Our simple replay phase |
1676 | * allocator excludes these regions from allocation. |
1677 | */ |
1678 | int ext4_fc_record_regions(struct super_block *sb, int ino, |
1679 | ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) |
1680 | { |
1681 | struct ext4_fc_replay_state *state; |
1682 | struct ext4_fc_alloc_region *region; |
1683 | |
1684 | state = &EXT4_SB(sb)->s_fc_replay_state; |
1685 | /* |
1686 | * during replay phase, the fc_regions_valid may not same as |
1687 | * fc_regions_used, update it when do new additions. |
1688 | */ |
1689 | if (replay && state->fc_regions_used != state->fc_regions_valid) |
1690 | state->fc_regions_used = state->fc_regions_valid; |
1691 | if (state->fc_regions_used == state->fc_regions_size) { |
1692 | struct ext4_fc_alloc_region *fc_regions; |
1693 | |
1694 | fc_regions = krealloc(objp: state->fc_regions, |
1695 | new_size: sizeof(struct ext4_fc_alloc_region) * |
1696 | (state->fc_regions_size + |
1697 | EXT4_FC_REPLAY_REALLOC_INCREMENT), |
1698 | GFP_KERNEL); |
1699 | if (!fc_regions) |
1700 | return -ENOMEM; |
1701 | state->fc_regions_size += |
1702 | EXT4_FC_REPLAY_REALLOC_INCREMENT; |
1703 | state->fc_regions = fc_regions; |
1704 | } |
1705 | region = &state->fc_regions[state->fc_regions_used++]; |
1706 | region->ino = ino; |
1707 | region->lblk = lblk; |
1708 | region->pblk = pblk; |
1709 | region->len = len; |
1710 | |
1711 | if (replay) |
1712 | state->fc_regions_valid++; |
1713 | |
1714 | return 0; |
1715 | } |
1716 | |
1717 | /* Replay add range tag */ |
1718 | static int ext4_fc_replay_add_range(struct super_block *sb, |
1719 | struct ext4_fc_tl_mem *tl, u8 *val) |
1720 | { |
1721 | struct ext4_fc_add_range fc_add_ex; |
1722 | struct ext4_extent newex, *ex; |
1723 | struct inode *inode; |
1724 | ext4_lblk_t start, cur; |
1725 | int remaining, len; |
1726 | ext4_fsblk_t start_pblk; |
1727 | struct ext4_map_blocks map; |
1728 | struct ext4_ext_path *path = NULL; |
1729 | int ret; |
1730 | |
1731 | memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); |
1732 | ex = (struct ext4_extent *)&fc_add_ex.fc_ex; |
1733 | |
1734 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, |
1735 | le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), |
1736 | priv2: ext4_ext_get_actual_len(ext: ex)); |
1737 | |
1738 | inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); |
1739 | if (IS_ERR(ptr: inode)) { |
1740 | ext4_debug("Inode not found." ); |
1741 | return 0; |
1742 | } |
1743 | |
1744 | ret = ext4_fc_record_modified_inode(sb, ino: inode->i_ino); |
1745 | if (ret) |
1746 | goto out; |
1747 | |
1748 | start = le32_to_cpu(ex->ee_block); |
1749 | start_pblk = ext4_ext_pblock(ex); |
1750 | len = ext4_ext_get_actual_len(ext: ex); |
1751 | |
1752 | cur = start; |
1753 | remaining = len; |
1754 | ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n" , |
1755 | start, start_pblk, len, ext4_ext_is_unwritten(ex), |
1756 | inode->i_ino); |
1757 | |
1758 | while (remaining > 0) { |
1759 | map.m_lblk = cur; |
1760 | map.m_len = remaining; |
1761 | map.m_pblk = 0; |
1762 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
1763 | |
1764 | if (ret < 0) |
1765 | goto out; |
1766 | |
1767 | if (ret == 0) { |
1768 | /* Range is not mapped */ |
1769 | path = ext4_find_extent(inode, cur, NULL, flags: 0); |
1770 | if (IS_ERR(ptr: path)) |
1771 | goto out; |
1772 | memset(&newex, 0, sizeof(newex)); |
1773 | newex.ee_block = cpu_to_le32(cur); |
1774 | ext4_ext_store_pblock( |
1775 | ex: &newex, pb: start_pblk + cur - start); |
1776 | newex.ee_len = cpu_to_le16(map.m_len); |
1777 | if (ext4_ext_is_unwritten(ext: ex)) |
1778 | ext4_ext_mark_unwritten(ext: &newex); |
1779 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
1780 | ret = ext4_ext_insert_extent( |
1781 | NULL, inode, &path, &newex, 0); |
1782 | up_write(sem: (&EXT4_I(inode)->i_data_sem)); |
1783 | ext4_free_ext_path(path); |
1784 | if (ret) |
1785 | goto out; |
1786 | goto next; |
1787 | } |
1788 | |
1789 | if (start_pblk + cur - start != map.m_pblk) { |
1790 | /* |
1791 | * Logical to physical mapping changed. This can happen |
1792 | * if this range was removed and then reallocated to |
1793 | * map to new physical blocks during a fast commit. |
1794 | */ |
1795 | ret = ext4_ext_replay_update_ex(inode, start: cur, len: map.m_len, |
1796 | unwritten: ext4_ext_is_unwritten(ext: ex), |
1797 | pblk: start_pblk + cur - start); |
1798 | if (ret) |
1799 | goto out; |
1800 | /* |
1801 | * Mark the old blocks as free since they aren't used |
1802 | * anymore. We maintain an array of all the modified |
1803 | * inodes. In case these blocks are still used at either |
1804 | * a different logical range in the same inode or in |
1805 | * some different inode, we will mark them as allocated |
1806 | * at the end of the FC replay using our array of |
1807 | * modified inodes. |
1808 | */ |
1809 | ext4_mb_mark_bb(sb: inode->i_sb, block: map.m_pblk, len: map.m_len, state: false); |
1810 | goto next; |
1811 | } |
1812 | |
1813 | /* Range is mapped and needs a state change */ |
1814 | ext4_debug("Converting from %ld to %d %lld" , |
1815 | map.m_flags & EXT4_MAP_UNWRITTEN, |
1816 | ext4_ext_is_unwritten(ex), map.m_pblk); |
1817 | ret = ext4_ext_replay_update_ex(inode, start: cur, len: map.m_len, |
1818 | unwritten: ext4_ext_is_unwritten(ext: ex), pblk: map.m_pblk); |
1819 | if (ret) |
1820 | goto out; |
1821 | /* |
1822 | * We may have split the extent tree while toggling the state. |
1823 | * Try to shrink the extent tree now. |
1824 | */ |
1825 | ext4_ext_replay_shrink_inode(inode, end: start + len); |
1826 | next: |
1827 | cur += map.m_len; |
1828 | remaining -= map.m_len; |
1829 | } |
1830 | ext4_ext_replay_shrink_inode(inode, end: i_size_read(inode) >> |
1831 | sb->s_blocksize_bits); |
1832 | out: |
1833 | iput(inode); |
1834 | return 0; |
1835 | } |
1836 | |
1837 | /* Replay DEL_RANGE tag */ |
1838 | static int |
1839 | ext4_fc_replay_del_range(struct super_block *sb, |
1840 | struct ext4_fc_tl_mem *tl, u8 *val) |
1841 | { |
1842 | struct inode *inode; |
1843 | struct ext4_fc_del_range lrange; |
1844 | struct ext4_map_blocks map; |
1845 | ext4_lblk_t cur, remaining; |
1846 | int ret; |
1847 | |
1848 | memcpy(&lrange, val, sizeof(lrange)); |
1849 | cur = le32_to_cpu(lrange.fc_lblk); |
1850 | remaining = le32_to_cpu(lrange.fc_len); |
1851 | |
1852 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, |
1853 | le32_to_cpu(lrange.fc_ino), priv1: cur, priv2: remaining); |
1854 | |
1855 | inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); |
1856 | if (IS_ERR(ptr: inode)) { |
1857 | ext4_debug("Inode %d not found" , le32_to_cpu(lrange.fc_ino)); |
1858 | return 0; |
1859 | } |
1860 | |
1861 | ret = ext4_fc_record_modified_inode(sb, ino: inode->i_ino); |
1862 | if (ret) |
1863 | goto out; |
1864 | |
1865 | ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n" , |
1866 | inode->i_ino, le32_to_cpu(lrange.fc_lblk), |
1867 | le32_to_cpu(lrange.fc_len)); |
1868 | while (remaining > 0) { |
1869 | map.m_lblk = cur; |
1870 | map.m_len = remaining; |
1871 | |
1872 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
1873 | if (ret < 0) |
1874 | goto out; |
1875 | if (ret > 0) { |
1876 | remaining -= ret; |
1877 | cur += ret; |
1878 | ext4_mb_mark_bb(sb: inode->i_sb, block: map.m_pblk, len: map.m_len, state: false); |
1879 | } else { |
1880 | remaining -= map.m_len; |
1881 | cur += map.m_len; |
1882 | } |
1883 | } |
1884 | |
1885 | down_write(sem: &EXT4_I(inode)->i_data_sem); |
1886 | ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), |
1887 | le32_to_cpu(lrange.fc_lblk) + |
1888 | le32_to_cpu(lrange.fc_len) - 1); |
1889 | up_write(sem: &EXT4_I(inode)->i_data_sem); |
1890 | if (ret) |
1891 | goto out; |
1892 | ext4_ext_replay_shrink_inode(inode, |
1893 | end: i_size_read(inode) >> sb->s_blocksize_bits); |
1894 | ext4_mark_inode_dirty(NULL, inode); |
1895 | out: |
1896 | iput(inode); |
1897 | return 0; |
1898 | } |
1899 | |
1900 | static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) |
1901 | { |
1902 | struct ext4_fc_replay_state *state; |
1903 | struct inode *inode; |
1904 | struct ext4_ext_path *path = NULL; |
1905 | struct ext4_map_blocks map; |
1906 | int i, ret, j; |
1907 | ext4_lblk_t cur, end; |
1908 | |
1909 | state = &EXT4_SB(sb)->s_fc_replay_state; |
1910 | for (i = 0; i < state->fc_modified_inodes_used; i++) { |
1911 | inode = ext4_iget(sb, state->fc_modified_inodes[i], |
1912 | EXT4_IGET_NORMAL); |
1913 | if (IS_ERR(ptr: inode)) { |
1914 | ext4_debug("Inode %d not found." , |
1915 | state->fc_modified_inodes[i]); |
1916 | continue; |
1917 | } |
1918 | cur = 0; |
1919 | end = EXT_MAX_BLOCKS; |
1920 | if (ext4_test_inode_flag(inode, bit: EXT4_INODE_INLINE_DATA)) { |
1921 | iput(inode); |
1922 | continue; |
1923 | } |
1924 | while (cur < end) { |
1925 | map.m_lblk = cur; |
1926 | map.m_len = end - cur; |
1927 | |
1928 | ret = ext4_map_blocks(NULL, inode, map: &map, flags: 0); |
1929 | if (ret < 0) |
1930 | break; |
1931 | |
1932 | if (ret > 0) { |
1933 | path = ext4_find_extent(inode, map.m_lblk, NULL, flags: 0); |
1934 | if (!IS_ERR(ptr: path)) { |
1935 | for (j = 0; j < path->p_depth; j++) |
1936 | ext4_mb_mark_bb(sb: inode->i_sb, |
1937 | block: path[j].p_block, len: 1, state: true); |
1938 | ext4_free_ext_path(path); |
1939 | } |
1940 | cur += ret; |
1941 | ext4_mb_mark_bb(sb: inode->i_sb, block: map.m_pblk, |
1942 | len: map.m_len, state: true); |
1943 | } else { |
1944 | cur = cur + (map.m_len ? map.m_len : 1); |
1945 | } |
1946 | } |
1947 | iput(inode); |
1948 | } |
1949 | } |
1950 | |
1951 | /* |
1952 | * Check if block is in excluded regions for block allocation. The simple |
1953 | * allocator that runs during replay phase is calls this function to see |
1954 | * if it is okay to use a block. |
1955 | */ |
1956 | bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) |
1957 | { |
1958 | int i; |
1959 | struct ext4_fc_replay_state *state; |
1960 | |
1961 | state = &EXT4_SB(sb)->s_fc_replay_state; |
1962 | for (i = 0; i < state->fc_regions_valid; i++) { |
1963 | if (state->fc_regions[i].ino == 0 || |
1964 | state->fc_regions[i].len == 0) |
1965 | continue; |
1966 | if (in_range(blk, state->fc_regions[i].pblk, |
1967 | state->fc_regions[i].len)) |
1968 | return true; |
1969 | } |
1970 | return false; |
1971 | } |
1972 | |
1973 | /* Cleanup function called after replay */ |
1974 | void ext4_fc_replay_cleanup(struct super_block *sb) |
1975 | { |
1976 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
1977 | |
1978 | sbi->s_mount_state &= ~EXT4_FC_REPLAY; |
1979 | kfree(objp: sbi->s_fc_replay_state.fc_regions); |
1980 | kfree(objp: sbi->s_fc_replay_state.fc_modified_inodes); |
1981 | } |
1982 | |
1983 | static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, |
1984 | int tag, int len) |
1985 | { |
1986 | switch (tag) { |
1987 | case EXT4_FC_TAG_ADD_RANGE: |
1988 | return len == sizeof(struct ext4_fc_add_range); |
1989 | case EXT4_FC_TAG_DEL_RANGE: |
1990 | return len == sizeof(struct ext4_fc_del_range); |
1991 | case EXT4_FC_TAG_CREAT: |
1992 | case EXT4_FC_TAG_LINK: |
1993 | case EXT4_FC_TAG_UNLINK: |
1994 | len -= sizeof(struct ext4_fc_dentry_info); |
1995 | return len >= 1 && len <= EXT4_NAME_LEN; |
1996 | case EXT4_FC_TAG_INODE: |
1997 | len -= sizeof(struct ext4_fc_inode); |
1998 | return len >= EXT4_GOOD_OLD_INODE_SIZE && |
1999 | len <= sbi->s_inode_size; |
2000 | case EXT4_FC_TAG_PAD: |
2001 | return true; /* padding can have any length */ |
2002 | case EXT4_FC_TAG_TAIL: |
2003 | return len >= sizeof(struct ext4_fc_tail); |
2004 | case EXT4_FC_TAG_HEAD: |
2005 | return len == sizeof(struct ext4_fc_head); |
2006 | } |
2007 | return false; |
2008 | } |
2009 | |
2010 | /* |
2011 | * Recovery Scan phase handler |
2012 | * |
2013 | * This function is called during the scan phase and is responsible |
2014 | * for doing following things: |
2015 | * - Make sure the fast commit area has valid tags for replay |
2016 | * - Count number of tags that need to be replayed by the replay handler |
2017 | * - Verify CRC |
2018 | * - Create a list of excluded blocks for allocation during replay phase |
2019 | * |
2020 | * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is |
2021 | * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP |
2022 | * to indicate that scan has finished and JBD2 can now start replay phase. |
2023 | * It returns a negative error to indicate that there was an error. At the end |
2024 | * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set |
2025 | * to indicate the number of tags that need to replayed during the replay phase. |
2026 | */ |
2027 | static int ext4_fc_replay_scan(journal_t *journal, |
2028 | struct buffer_head *bh, int off, |
2029 | tid_t expected_tid) |
2030 | { |
2031 | struct super_block *sb = journal->j_private; |
2032 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2033 | struct ext4_fc_replay_state *state; |
2034 | int ret = JBD2_FC_REPLAY_CONTINUE; |
2035 | struct ext4_fc_add_range ext; |
2036 | struct ext4_fc_tl_mem tl; |
2037 | struct ext4_fc_tail tail; |
2038 | __u8 *start, *end, *cur, *val; |
2039 | struct ext4_fc_head head; |
2040 | struct ext4_extent *ex; |
2041 | |
2042 | state = &sbi->s_fc_replay_state; |
2043 | |
2044 | start = (u8 *)bh->b_data; |
2045 | end = start + journal->j_blocksize; |
2046 | |
2047 | if (state->fc_replay_expected_off == 0) { |
2048 | state->fc_cur_tag = 0; |
2049 | state->fc_replay_num_tags = 0; |
2050 | state->fc_crc = 0; |
2051 | state->fc_regions = NULL; |
2052 | state->fc_regions_valid = state->fc_regions_used = |
2053 | state->fc_regions_size = 0; |
2054 | /* Check if we can stop early */ |
2055 | if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) |
2056 | != EXT4_FC_TAG_HEAD) |
2057 | return 0; |
2058 | } |
2059 | |
2060 | if (off != state->fc_replay_expected_off) { |
2061 | ret = -EFSCORRUPTED; |
2062 | goto out_err; |
2063 | } |
2064 | |
2065 | state->fc_replay_expected_off++; |
2066 | for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; |
2067 | cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { |
2068 | ext4_fc_get_tl(tl: &tl, val: cur); |
2069 | val = cur + EXT4_FC_TAG_BASE_LEN; |
2070 | if (tl.fc_len > end - val || |
2071 | !ext4_fc_value_len_isvalid(sbi, tag: tl.fc_tag, len: tl.fc_len)) { |
2072 | ret = state->fc_replay_num_tags ? |
2073 | JBD2_FC_REPLAY_STOP : -ECANCELED; |
2074 | goto out_err; |
2075 | } |
2076 | ext4_debug("Scan phase, tag:%s, blk %lld\n" , |
2077 | tag2str(tl.fc_tag), bh->b_blocknr); |
2078 | switch (tl.fc_tag) { |
2079 | case EXT4_FC_TAG_ADD_RANGE: |
2080 | memcpy(&ext, val, sizeof(ext)); |
2081 | ex = (struct ext4_extent *)&ext.fc_ex; |
2082 | ret = ext4_fc_record_regions(sb, |
2083 | le32_to_cpu(ext.fc_ino), |
2084 | le32_to_cpu(ex->ee_block), pblk: ext4_ext_pblock(ex), |
2085 | len: ext4_ext_get_actual_len(ext: ex), replay: 0); |
2086 | if (ret < 0) |
2087 | break; |
2088 | ret = JBD2_FC_REPLAY_CONTINUE; |
2089 | fallthrough; |
2090 | case EXT4_FC_TAG_DEL_RANGE: |
2091 | case EXT4_FC_TAG_LINK: |
2092 | case EXT4_FC_TAG_UNLINK: |
2093 | case EXT4_FC_TAG_CREAT: |
2094 | case EXT4_FC_TAG_INODE: |
2095 | case EXT4_FC_TAG_PAD: |
2096 | state->fc_cur_tag++; |
2097 | state->fc_crc = ext4_chksum(sbi, crc: state->fc_crc, address: cur, |
2098 | EXT4_FC_TAG_BASE_LEN + tl.fc_len); |
2099 | break; |
2100 | case EXT4_FC_TAG_TAIL: |
2101 | state->fc_cur_tag++; |
2102 | memcpy(&tail, val, sizeof(tail)); |
2103 | state->fc_crc = ext4_chksum(sbi, crc: state->fc_crc, address: cur, |
2104 | EXT4_FC_TAG_BASE_LEN + |
2105 | offsetof(struct ext4_fc_tail, |
2106 | fc_crc)); |
2107 | if (le32_to_cpu(tail.fc_tid) == expected_tid && |
2108 | le32_to_cpu(tail.fc_crc) == state->fc_crc) { |
2109 | state->fc_replay_num_tags = state->fc_cur_tag; |
2110 | state->fc_regions_valid = |
2111 | state->fc_regions_used; |
2112 | } else { |
2113 | ret = state->fc_replay_num_tags ? |
2114 | JBD2_FC_REPLAY_STOP : -EFSBADCRC; |
2115 | } |
2116 | state->fc_crc = 0; |
2117 | break; |
2118 | case EXT4_FC_TAG_HEAD: |
2119 | memcpy(&head, val, sizeof(head)); |
2120 | if (le32_to_cpu(head.fc_features) & |
2121 | ~EXT4_FC_SUPPORTED_FEATURES) { |
2122 | ret = -EOPNOTSUPP; |
2123 | break; |
2124 | } |
2125 | if (le32_to_cpu(head.fc_tid) != expected_tid) { |
2126 | ret = JBD2_FC_REPLAY_STOP; |
2127 | break; |
2128 | } |
2129 | state->fc_cur_tag++; |
2130 | state->fc_crc = ext4_chksum(sbi, crc: state->fc_crc, address: cur, |
2131 | EXT4_FC_TAG_BASE_LEN + tl.fc_len); |
2132 | break; |
2133 | default: |
2134 | ret = state->fc_replay_num_tags ? |
2135 | JBD2_FC_REPLAY_STOP : -ECANCELED; |
2136 | } |
2137 | if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) |
2138 | break; |
2139 | } |
2140 | |
2141 | out_err: |
2142 | trace_ext4_fc_replay_scan(sb, error: ret, off); |
2143 | return ret; |
2144 | } |
2145 | |
2146 | /* |
2147 | * Main recovery path entry point. |
2148 | * The meaning of return codes is similar as above. |
2149 | */ |
2150 | static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, |
2151 | enum passtype pass, int off, tid_t expected_tid) |
2152 | { |
2153 | struct super_block *sb = journal->j_private; |
2154 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2155 | struct ext4_fc_tl_mem tl; |
2156 | __u8 *start, *end, *cur, *val; |
2157 | int ret = JBD2_FC_REPLAY_CONTINUE; |
2158 | struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; |
2159 | struct ext4_fc_tail tail; |
2160 | |
2161 | if (pass == PASS_SCAN) { |
2162 | state->fc_current_pass = PASS_SCAN; |
2163 | return ext4_fc_replay_scan(journal, bh, off, expected_tid); |
2164 | } |
2165 | |
2166 | if (state->fc_current_pass != pass) { |
2167 | state->fc_current_pass = pass; |
2168 | sbi->s_mount_state |= EXT4_FC_REPLAY; |
2169 | } |
2170 | if (!sbi->s_fc_replay_state.fc_replay_num_tags) { |
2171 | ext4_debug("Replay stops\n" ); |
2172 | ext4_fc_set_bitmaps_and_counters(sb); |
2173 | return 0; |
2174 | } |
2175 | |
2176 | #ifdef CONFIG_EXT4_DEBUG |
2177 | if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { |
2178 | pr_warn("Dropping fc block %d because max_replay set\n" , off); |
2179 | return JBD2_FC_REPLAY_STOP; |
2180 | } |
2181 | #endif |
2182 | |
2183 | start = (u8 *)bh->b_data; |
2184 | end = start + journal->j_blocksize; |
2185 | |
2186 | for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; |
2187 | cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { |
2188 | ext4_fc_get_tl(tl: &tl, val: cur); |
2189 | val = cur + EXT4_FC_TAG_BASE_LEN; |
2190 | |
2191 | if (state->fc_replay_num_tags == 0) { |
2192 | ret = JBD2_FC_REPLAY_STOP; |
2193 | ext4_fc_set_bitmaps_and_counters(sb); |
2194 | break; |
2195 | } |
2196 | |
2197 | ext4_debug("Replay phase, tag:%s\n" , tag2str(tl.fc_tag)); |
2198 | state->fc_replay_num_tags--; |
2199 | switch (tl.fc_tag) { |
2200 | case EXT4_FC_TAG_LINK: |
2201 | ret = ext4_fc_replay_link(sb, tl: &tl, val); |
2202 | break; |
2203 | case EXT4_FC_TAG_UNLINK: |
2204 | ret = ext4_fc_replay_unlink(sb, tl: &tl, val); |
2205 | break; |
2206 | case EXT4_FC_TAG_ADD_RANGE: |
2207 | ret = ext4_fc_replay_add_range(sb, tl: &tl, val); |
2208 | break; |
2209 | case EXT4_FC_TAG_CREAT: |
2210 | ret = ext4_fc_replay_create(sb, tl: &tl, val); |
2211 | break; |
2212 | case EXT4_FC_TAG_DEL_RANGE: |
2213 | ret = ext4_fc_replay_del_range(sb, tl: &tl, val); |
2214 | break; |
2215 | case EXT4_FC_TAG_INODE: |
2216 | ret = ext4_fc_replay_inode(sb, tl: &tl, val); |
2217 | break; |
2218 | case EXT4_FC_TAG_PAD: |
2219 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, ino: 0, |
2220 | priv1: tl.fc_len, priv2: 0); |
2221 | break; |
2222 | case EXT4_FC_TAG_TAIL: |
2223 | trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, |
2224 | ino: 0, priv1: tl.fc_len, priv2: 0); |
2225 | memcpy(&tail, val, sizeof(tail)); |
2226 | WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); |
2227 | break; |
2228 | case EXT4_FC_TAG_HEAD: |
2229 | break; |
2230 | default: |
2231 | trace_ext4_fc_replay(sb, tag: tl.fc_tag, ino: 0, priv1: tl.fc_len, priv2: 0); |
2232 | ret = -ECANCELED; |
2233 | break; |
2234 | } |
2235 | if (ret < 0) |
2236 | break; |
2237 | ret = JBD2_FC_REPLAY_CONTINUE; |
2238 | } |
2239 | return ret; |
2240 | } |
2241 | |
2242 | void ext4_fc_init(struct super_block *sb, journal_t *journal) |
2243 | { |
2244 | /* |
2245 | * We set replay callback even if fast commit disabled because we may |
2246 | * could still have fast commit blocks that need to be replayed even if |
2247 | * fast commit has now been turned off. |
2248 | */ |
2249 | journal->j_fc_replay_callback = ext4_fc_replay; |
2250 | if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) |
2251 | return; |
2252 | journal->j_fc_cleanup_callback = ext4_fc_cleanup; |
2253 | } |
2254 | |
2255 | static const char * const fc_ineligible_reasons[] = { |
2256 | [EXT4_FC_REASON_XATTR] = "Extended attributes changed" , |
2257 | [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename" , |
2258 | [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed" , |
2259 | [EXT4_FC_REASON_NOMEM] = "Insufficient memory" , |
2260 | [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot" , |
2261 | [EXT4_FC_REASON_RESIZE] = "Resize" , |
2262 | [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed" , |
2263 | [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op" , |
2264 | [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling" , |
2265 | [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename" , |
2266 | }; |
2267 | |
2268 | int ext4_fc_info_show(struct seq_file *seq, void *v) |
2269 | { |
2270 | struct ext4_sb_info *sbi = EXT4_SB(sb: (struct super_block *)seq->private); |
2271 | struct ext4_fc_stats *stats = &sbi->s_fc_stats; |
2272 | int i; |
2273 | |
2274 | if (v != SEQ_START_TOKEN) |
2275 | return 0; |
2276 | |
2277 | seq_printf(m: seq, |
2278 | fmt: "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n" , |
2279 | stats->fc_num_commits, stats->fc_ineligible_commits, |
2280 | stats->fc_numblks, |
2281 | div_u64(dividend: stats->s_fc_avg_commit_time, divisor: 1000)); |
2282 | seq_puts(m: seq, s: "Ineligible reasons:\n" ); |
2283 | for (i = 0; i < EXT4_FC_REASON_MAX; i++) |
2284 | seq_printf(m: seq, fmt: "\"%s\":\t%d\n" , fc_ineligible_reasons[i], |
2285 | stats->fc_ineligible_reason_count[i]); |
2286 | |
2287 | return 0; |
2288 | } |
2289 | |
2290 | int __init ext4_fc_init_dentry_cache(void) |
2291 | { |
2292 | ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, |
2293 | SLAB_RECLAIM_ACCOUNT); |
2294 | |
2295 | if (ext4_fc_dentry_cachep == NULL) |
2296 | return -ENOMEM; |
2297 | |
2298 | return 0; |
2299 | } |
2300 | |
2301 | void ext4_fc_destroy_dentry_cache(void) |
2302 | { |
2303 | kmem_cache_destroy(s: ext4_fc_dentry_cachep); |
2304 | } |
2305 | |