1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * io.c |
4 | * |
5 | * Buffer cache handling |
6 | * |
7 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
8 | */ |
9 | |
10 | #include <linux/fs.h> |
11 | #include <linux/types.h> |
12 | #include <linux/highmem.h> |
13 | #include <linux/bio.h> |
14 | |
15 | #include <cluster/masklog.h> |
16 | |
17 | #include "ocfs2.h" |
18 | |
19 | #include "alloc.h" |
20 | #include "inode.h" |
21 | #include "journal.h" |
22 | #include "uptodate.h" |
23 | #include "buffer_head_io.h" |
24 | #include "ocfs2_trace.h" |
25 | |
26 | /* |
27 | * Bits on bh->b_state used by ocfs2. |
28 | * |
29 | * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. |
30 | */ |
31 | enum ocfs2_state_bits { |
32 | BH_NeedsValidate = BH_JBDPrivateStart, |
33 | }; |
34 | |
35 | /* Expand the magic b_state functions */ |
36 | BUFFER_FNS(NeedsValidate, needs_validate); |
37 | |
38 | int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, |
39 | struct ocfs2_caching_info *ci) |
40 | { |
41 | int ret = 0; |
42 | |
43 | trace_ocfs2_write_block(block: (unsigned long long)bh->b_blocknr, ci); |
44 | |
45 | BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); |
46 | BUG_ON(buffer_jbd(bh)); |
47 | |
48 | /* No need to check for a soft readonly file system here. non |
49 | * journalled writes are only ever done on system files which |
50 | * can get modified during recovery even if read-only. */ |
51 | if (ocfs2_is_hard_readonly(osb)) { |
52 | ret = -EROFS; |
53 | mlog_errno(ret); |
54 | goto out; |
55 | } |
56 | |
57 | ocfs2_metadata_cache_io_lock(ci); |
58 | |
59 | lock_buffer(bh); |
60 | set_buffer_uptodate(bh); |
61 | |
62 | /* remove from dirty list before I/O. */ |
63 | clear_buffer_dirty(bh); |
64 | |
65 | get_bh(bh); /* for end_buffer_write_sync() */ |
66 | bh->b_end_io = end_buffer_write_sync; |
67 | submit_bh(REQ_OP_WRITE, bh); |
68 | |
69 | wait_on_buffer(bh); |
70 | |
71 | if (buffer_uptodate(bh)) { |
72 | ocfs2_set_buffer_uptodate(ci, bh); |
73 | } else { |
74 | /* We don't need to remove the clustered uptodate |
75 | * information for this bh as it's not marked locally |
76 | * uptodate. */ |
77 | ret = -EIO; |
78 | mlog_errno(ret); |
79 | } |
80 | |
81 | ocfs2_metadata_cache_io_unlock(ci); |
82 | out: |
83 | return ret; |
84 | } |
85 | |
86 | /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it |
87 | * will be easier to handle read failure. |
88 | */ |
89 | int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, |
90 | unsigned int nr, struct buffer_head *bhs[]) |
91 | { |
92 | int status = 0; |
93 | unsigned int i; |
94 | struct buffer_head *bh; |
95 | int new_bh = 0; |
96 | |
97 | trace_ocfs2_read_blocks_sync(val1: (unsigned long long)block, val2: nr); |
98 | |
99 | if (!nr) |
100 | goto bail; |
101 | |
102 | /* Don't put buffer head and re-assign it to NULL if it is allocated |
103 | * outside since the caller can't be aware of this alternation! |
104 | */ |
105 | new_bh = (bhs[0] == NULL); |
106 | |
107 | for (i = 0 ; i < nr ; i++) { |
108 | if (bhs[i] == NULL) { |
109 | bhs[i] = sb_getblk(sb: osb->sb, block: block++); |
110 | if (bhs[i] == NULL) { |
111 | status = -ENOMEM; |
112 | mlog_errno(status); |
113 | break; |
114 | } |
115 | } |
116 | bh = bhs[i]; |
117 | |
118 | if (buffer_jbd(bh)) { |
119 | trace_ocfs2_read_blocks_sync_jbd( |
120 | num: (unsigned long long)bh->b_blocknr); |
121 | continue; |
122 | } |
123 | |
124 | if (buffer_dirty(bh)) { |
125 | /* This should probably be a BUG, or |
126 | * at least return an error. */ |
127 | mlog(ML_ERROR, |
128 | "trying to sync read a dirty " |
129 | "buffer! (blocknr = %llu), skipping\n" , |
130 | (unsigned long long)bh->b_blocknr); |
131 | continue; |
132 | } |
133 | |
134 | lock_buffer(bh); |
135 | if (buffer_jbd(bh)) { |
136 | #ifdef CATCH_BH_JBD_RACES |
137 | mlog(ML_ERROR, |
138 | "block %llu had the JBD bit set " |
139 | "while I was in lock_buffer!" , |
140 | (unsigned long long)bh->b_blocknr); |
141 | BUG(); |
142 | #else |
143 | unlock_buffer(bh); |
144 | continue; |
145 | #endif |
146 | } |
147 | |
148 | get_bh(bh); /* for end_buffer_read_sync() */ |
149 | bh->b_end_io = end_buffer_read_sync; |
150 | submit_bh(REQ_OP_READ, bh); |
151 | } |
152 | |
153 | read_failure: |
154 | for (i = nr; i > 0; i--) { |
155 | bh = bhs[i - 1]; |
156 | |
157 | if (unlikely(status)) { |
158 | if (new_bh && bh) { |
159 | /* If middle bh fails, let previous bh |
160 | * finish its read and then put it to |
161 | * avoid bh leak |
162 | */ |
163 | if (!buffer_jbd(bh)) |
164 | wait_on_buffer(bh); |
165 | put_bh(bh); |
166 | bhs[i - 1] = NULL; |
167 | } else if (bh && buffer_uptodate(bh)) { |
168 | clear_buffer_uptodate(bh); |
169 | } |
170 | continue; |
171 | } |
172 | |
173 | /* No need to wait on the buffer if it's managed by JBD. */ |
174 | if (!buffer_jbd(bh)) |
175 | wait_on_buffer(bh); |
176 | |
177 | if (!buffer_uptodate(bh)) { |
178 | /* Status won't be cleared from here on out, |
179 | * so we can safely record this and loop back |
180 | * to cleanup the other buffers. */ |
181 | status = -EIO; |
182 | goto read_failure; |
183 | } |
184 | } |
185 | |
186 | bail: |
187 | return status; |
188 | } |
189 | |
190 | /* Caller must provide a bhs[] with all NULL or non-NULL entries, so it |
191 | * will be easier to handle read failure. |
192 | */ |
193 | int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, |
194 | struct buffer_head *bhs[], int flags, |
195 | int (*validate)(struct super_block *sb, |
196 | struct buffer_head *bh)) |
197 | { |
198 | int status = 0; |
199 | int i, ignore_cache = 0; |
200 | struct buffer_head *bh; |
201 | struct super_block *sb = ocfs2_metadata_cache_get_super(ci); |
202 | int new_bh = 0; |
203 | |
204 | trace_ocfs2_read_blocks_begin(ci, block: (unsigned long long)block, nr, flags); |
205 | |
206 | BUG_ON(!ci); |
207 | BUG_ON((flags & OCFS2_BH_READAHEAD) && |
208 | (flags & OCFS2_BH_IGNORE_CACHE)); |
209 | |
210 | if (bhs == NULL) { |
211 | status = -EINVAL; |
212 | mlog_errno(status); |
213 | goto bail; |
214 | } |
215 | |
216 | if (nr < 0) { |
217 | mlog(ML_ERROR, "asked to read %d blocks!\n" , nr); |
218 | status = -EINVAL; |
219 | mlog_errno(status); |
220 | goto bail; |
221 | } |
222 | |
223 | if (nr == 0) { |
224 | status = 0; |
225 | goto bail; |
226 | } |
227 | |
228 | /* Don't put buffer head and re-assign it to NULL if it is allocated |
229 | * outside since the caller can't be aware of this alternation! |
230 | */ |
231 | new_bh = (bhs[0] == NULL); |
232 | |
233 | ocfs2_metadata_cache_io_lock(ci); |
234 | for (i = 0 ; i < nr ; i++) { |
235 | if (bhs[i] == NULL) { |
236 | bhs[i] = sb_getblk(sb, block: block++); |
237 | if (bhs[i] == NULL) { |
238 | ocfs2_metadata_cache_io_unlock(ci); |
239 | status = -ENOMEM; |
240 | mlog_errno(status); |
241 | /* Don't forget to put previous bh! */ |
242 | break; |
243 | } |
244 | } |
245 | bh = bhs[i]; |
246 | ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); |
247 | |
248 | /* There are three read-ahead cases here which we need to |
249 | * be concerned with. All three assume a buffer has |
250 | * previously been submitted with OCFS2_BH_READAHEAD |
251 | * and it hasn't yet completed I/O. |
252 | * |
253 | * 1) The current request is sync to disk. This rarely |
254 | * happens these days, and never when performance |
255 | * matters - the code can just wait on the buffer |
256 | * lock and re-submit. |
257 | * |
258 | * 2) The current request is cached, but not |
259 | * readahead. ocfs2_buffer_uptodate() will return |
260 | * false anyway, so we'll wind up waiting on the |
261 | * buffer lock to do I/O. We re-check the request |
262 | * with after getting the lock to avoid a re-submit. |
263 | * |
264 | * 3) The current request is readahead (and so must |
265 | * also be a caching one). We short circuit if the |
266 | * buffer is locked (under I/O) and if it's in the |
267 | * uptodate cache. The re-check from #2 catches the |
268 | * case that the previous read-ahead completes just |
269 | * before our is-it-in-flight check. |
270 | */ |
271 | |
272 | if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { |
273 | trace_ocfs2_read_blocks_from_disk( |
274 | val1: (unsigned long long)bh->b_blocknr, |
275 | val2: (unsigned long long)ocfs2_metadata_cache_owner(ci)); |
276 | /* We're using ignore_cache here to say |
277 | * "go to disk" */ |
278 | ignore_cache = 1; |
279 | } |
280 | |
281 | trace_ocfs2_read_blocks_bh(ull: (unsigned long long)bh->b_blocknr, |
282 | value1: ignore_cache, value2: buffer_jbd(bh), value3: buffer_dirty(bh)); |
283 | |
284 | if (buffer_jbd(bh)) { |
285 | continue; |
286 | } |
287 | |
288 | if (ignore_cache) { |
289 | if (buffer_dirty(bh)) { |
290 | /* This should probably be a BUG, or |
291 | * at least return an error. */ |
292 | continue; |
293 | } |
294 | |
295 | /* A read-ahead request was made - if the |
296 | * buffer is already under read-ahead from a |
297 | * previously submitted request than we are |
298 | * done here. */ |
299 | if ((flags & OCFS2_BH_READAHEAD) |
300 | && ocfs2_buffer_read_ahead(ci, bh)) |
301 | continue; |
302 | |
303 | lock_buffer(bh); |
304 | if (buffer_jbd(bh)) { |
305 | #ifdef CATCH_BH_JBD_RACES |
306 | mlog(ML_ERROR, "block %llu had the JBD bit set " |
307 | "while I was in lock_buffer!" , |
308 | (unsigned long long)bh->b_blocknr); |
309 | BUG(); |
310 | #else |
311 | unlock_buffer(bh); |
312 | continue; |
313 | #endif |
314 | } |
315 | |
316 | /* Re-check ocfs2_buffer_uptodate() as a |
317 | * previously read-ahead buffer may have |
318 | * completed I/O while we were waiting for the |
319 | * buffer lock. */ |
320 | if (!(flags & OCFS2_BH_IGNORE_CACHE) |
321 | && !(flags & OCFS2_BH_READAHEAD) |
322 | && ocfs2_buffer_uptodate(ci, bh)) { |
323 | unlock_buffer(bh); |
324 | continue; |
325 | } |
326 | |
327 | get_bh(bh); /* for end_buffer_read_sync() */ |
328 | if (validate) |
329 | set_buffer_needs_validate(bh); |
330 | bh->b_end_io = end_buffer_read_sync; |
331 | submit_bh(REQ_OP_READ, bh); |
332 | continue; |
333 | } |
334 | } |
335 | |
336 | read_failure: |
337 | for (i = (nr - 1); i >= 0; i--) { |
338 | bh = bhs[i]; |
339 | |
340 | if (!(flags & OCFS2_BH_READAHEAD)) { |
341 | if (unlikely(status)) { |
342 | /* Clear the buffers on error including those |
343 | * ever succeeded in reading |
344 | */ |
345 | if (new_bh && bh) { |
346 | /* If middle bh fails, let previous bh |
347 | * finish its read and then put it to |
348 | * avoid bh leak |
349 | */ |
350 | if (!buffer_jbd(bh)) |
351 | wait_on_buffer(bh); |
352 | put_bh(bh); |
353 | bhs[i] = NULL; |
354 | } else if (bh && buffer_uptodate(bh)) { |
355 | clear_buffer_uptodate(bh); |
356 | } |
357 | continue; |
358 | } |
359 | /* We know this can't have changed as we hold the |
360 | * owner sem. Avoid doing any work on the bh if the |
361 | * journal has it. */ |
362 | if (!buffer_jbd(bh)) |
363 | wait_on_buffer(bh); |
364 | |
365 | if (!buffer_uptodate(bh)) { |
366 | /* Status won't be cleared from here on out, |
367 | * so we can safely record this and loop back |
368 | * to cleanup the other buffers. Don't need to |
369 | * remove the clustered uptodate information |
370 | * for this bh as it's not marked locally |
371 | * uptodate. */ |
372 | status = -EIO; |
373 | clear_buffer_needs_validate(bh); |
374 | goto read_failure; |
375 | } |
376 | |
377 | if (buffer_needs_validate(bh)) { |
378 | /* We never set NeedsValidate if the |
379 | * buffer was held by the journal, so |
380 | * that better not have changed */ |
381 | BUG_ON(buffer_jbd(bh)); |
382 | clear_buffer_needs_validate(bh); |
383 | status = validate(sb, bh); |
384 | if (status) |
385 | goto read_failure; |
386 | } |
387 | } |
388 | |
389 | /* Always set the buffer in the cache, even if it was |
390 | * a forced read, or read-ahead which hasn't yet |
391 | * completed. */ |
392 | ocfs2_set_buffer_uptodate(ci, bh); |
393 | } |
394 | ocfs2_metadata_cache_io_unlock(ci); |
395 | |
396 | trace_ocfs2_read_blocks_end(ull: (unsigned long long)block, value1: nr, |
397 | value2: flags, value3: ignore_cache); |
398 | |
399 | bail: |
400 | |
401 | return status; |
402 | } |
403 | |
404 | /* Check whether the blkno is the super block or one of the backups. */ |
405 | static void ocfs2_check_super_or_backup(struct super_block *sb, |
406 | sector_t blkno) |
407 | { |
408 | int i; |
409 | u64 backup_blkno; |
410 | |
411 | if (blkno == OCFS2_SUPER_BLOCK_BLKNO) |
412 | return; |
413 | |
414 | for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { |
415 | backup_blkno = ocfs2_backup_super_blkno(sb, index: i); |
416 | if (backup_blkno == blkno) |
417 | return; |
418 | } |
419 | |
420 | BUG(); |
421 | } |
422 | |
423 | /* |
424 | * Write super block and backups doesn't need to collaborate with journal, |
425 | * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed |
426 | * into this function. |
427 | */ |
428 | int ocfs2_write_super_or_backup(struct ocfs2_super *osb, |
429 | struct buffer_head *bh) |
430 | { |
431 | int ret = 0; |
432 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; |
433 | |
434 | BUG_ON(buffer_jbd(bh)); |
435 | ocfs2_check_super_or_backup(sb: osb->sb, blkno: bh->b_blocknr); |
436 | |
437 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { |
438 | ret = -EROFS; |
439 | mlog_errno(ret); |
440 | goto out; |
441 | } |
442 | |
443 | lock_buffer(bh); |
444 | set_buffer_uptodate(bh); |
445 | |
446 | /* remove from dirty list before I/O. */ |
447 | clear_buffer_dirty(bh); |
448 | |
449 | get_bh(bh); /* for end_buffer_write_sync() */ |
450 | bh->b_end_io = end_buffer_write_sync; |
451 | ocfs2_compute_meta_ecc(sb: osb->sb, data: bh->b_data, bc: &di->i_check); |
452 | submit_bh(REQ_OP_WRITE, bh); |
453 | |
454 | wait_on_buffer(bh); |
455 | |
456 | if (!buffer_uptodate(bh)) { |
457 | ret = -EIO; |
458 | mlog_errno(ret); |
459 | } |
460 | |
461 | out: |
462 | return ret; |
463 | } |
464 | |