1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. |
4 | * All Rights Reserved. |
5 | */ |
6 | #include "xfs.h" |
7 | #include "xfs_fs.h" |
8 | #include "xfs_shared.h" |
9 | #include "xfs_format.h" |
10 | #include "xfs_log_format.h" |
11 | #include "xfs_trans_resv.h" |
12 | #include "xfs_bit.h" |
13 | #include "xfs_sb.h" |
14 | #include "xfs_mount.h" |
15 | #include "xfs_inode.h" |
16 | #include "xfs_dir2.h" |
17 | #include "xfs_ialloc.h" |
18 | #include "xfs_alloc.h" |
19 | #include "xfs_rtalloc.h" |
20 | #include "xfs_bmap.h" |
21 | #include "xfs_trans.h" |
22 | #include "xfs_trans_priv.h" |
23 | #include "xfs_log.h" |
24 | #include "xfs_log_priv.h" |
25 | #include "xfs_error.h" |
26 | #include "xfs_quota.h" |
27 | #include "xfs_fsops.h" |
28 | #include "xfs_icache.h" |
29 | #include "xfs_sysfs.h" |
30 | #include "xfs_rmap_btree.h" |
31 | #include "xfs_refcount_btree.h" |
32 | #include "xfs_reflink.h" |
33 | #include "xfs_extent_busy.h" |
34 | #include "xfs_health.h" |
35 | #include "xfs_trace.h" |
36 | #include "xfs_ag.h" |
37 | #include "scrub/stats.h" |
38 | |
39 | static DEFINE_MUTEX(xfs_uuid_table_mutex); |
40 | static int xfs_uuid_table_size; |
41 | static uuid_t *xfs_uuid_table; |
42 | |
43 | void |
44 | xfs_uuid_table_free(void) |
45 | { |
46 | if (xfs_uuid_table_size == 0) |
47 | return; |
48 | kmem_free(ptr: xfs_uuid_table); |
49 | xfs_uuid_table = NULL; |
50 | xfs_uuid_table_size = 0; |
51 | } |
52 | |
53 | /* |
54 | * See if the UUID is unique among mounted XFS filesystems. |
55 | * Mount fails if UUID is nil or a FS with the same UUID is already mounted. |
56 | */ |
57 | STATIC int |
58 | xfs_uuid_mount( |
59 | struct xfs_mount *mp) |
60 | { |
61 | uuid_t *uuid = &mp->m_sb.sb_uuid; |
62 | int hole, i; |
63 | |
64 | /* Publish UUID in struct super_block */ |
65 | uuid_copy(dst: &mp->m_super->s_uuid, src: uuid); |
66 | |
67 | if (xfs_has_nouuid(mp)) |
68 | return 0; |
69 | |
70 | if (uuid_is_null(uuid)) { |
71 | xfs_warn(mp, "Filesystem has null UUID - can't mount" ); |
72 | return -EINVAL; |
73 | } |
74 | |
75 | mutex_lock(&xfs_uuid_table_mutex); |
76 | for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { |
77 | if (uuid_is_null(uuid: &xfs_uuid_table[i])) { |
78 | hole = i; |
79 | continue; |
80 | } |
81 | if (uuid_equal(u1: uuid, u2: &xfs_uuid_table[i])) |
82 | goto out_duplicate; |
83 | } |
84 | |
85 | if (hole < 0) { |
86 | xfs_uuid_table = krealloc(objp: xfs_uuid_table, |
87 | new_size: (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), |
88 | GFP_KERNEL | __GFP_NOFAIL); |
89 | hole = xfs_uuid_table_size++; |
90 | } |
91 | xfs_uuid_table[hole] = *uuid; |
92 | mutex_unlock(lock: &xfs_uuid_table_mutex); |
93 | |
94 | return 0; |
95 | |
96 | out_duplicate: |
97 | mutex_unlock(lock: &xfs_uuid_table_mutex); |
98 | xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount" , uuid); |
99 | return -EINVAL; |
100 | } |
101 | |
102 | STATIC void |
103 | xfs_uuid_unmount( |
104 | struct xfs_mount *mp) |
105 | { |
106 | uuid_t *uuid = &mp->m_sb.sb_uuid; |
107 | int i; |
108 | |
109 | if (xfs_has_nouuid(mp)) |
110 | return; |
111 | |
112 | mutex_lock(&xfs_uuid_table_mutex); |
113 | for (i = 0; i < xfs_uuid_table_size; i++) { |
114 | if (uuid_is_null(uuid: &xfs_uuid_table[i])) |
115 | continue; |
116 | if (!uuid_equal(u1: uuid, u2: &xfs_uuid_table[i])) |
117 | continue; |
118 | memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); |
119 | break; |
120 | } |
121 | ASSERT(i < xfs_uuid_table_size); |
122 | mutex_unlock(lock: &xfs_uuid_table_mutex); |
123 | } |
124 | |
125 | /* |
126 | * Check size of device based on the (data/realtime) block count. |
127 | * Note: this check is used by the growfs code as well as mount. |
128 | */ |
129 | int |
130 | xfs_sb_validate_fsb_count( |
131 | xfs_sb_t *sbp, |
132 | uint64_t nblocks) |
133 | { |
134 | ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); |
135 | ASSERT(sbp->sb_blocklog >= BBSHIFT); |
136 | |
137 | /* Limited by ULONG_MAX of page cache index */ |
138 | if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) |
139 | return -EFBIG; |
140 | return 0; |
141 | } |
142 | |
143 | /* |
144 | * xfs_readsb |
145 | * |
146 | * Does the initial read of the superblock. |
147 | */ |
148 | int |
149 | xfs_readsb( |
150 | struct xfs_mount *mp, |
151 | int flags) |
152 | { |
153 | unsigned int sector_size; |
154 | struct xfs_buf *bp; |
155 | struct xfs_sb *sbp = &mp->m_sb; |
156 | int error; |
157 | int loud = !(flags & XFS_MFSI_QUIET); |
158 | const struct xfs_buf_ops *buf_ops; |
159 | |
160 | ASSERT(mp->m_sb_bp == NULL); |
161 | ASSERT(mp->m_ddev_targp != NULL); |
162 | |
163 | /* |
164 | * For the initial read, we must guess at the sector |
165 | * size based on the block device. It's enough to |
166 | * get the sb_sectsize out of the superblock and |
167 | * then reread with the proper length. |
168 | * We don't verify it yet, because it may not be complete. |
169 | */ |
170 | sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); |
171 | buf_ops = NULL; |
172 | |
173 | /* |
174 | * Allocate a (locked) buffer to hold the superblock. This will be kept |
175 | * around at all times to optimize access to the superblock. Therefore, |
176 | * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count |
177 | * elevated. |
178 | */ |
179 | reread: |
180 | error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, |
181 | BTOBB(sector_size), XBF_NO_IOACCT, &bp, |
182 | buf_ops); |
183 | if (error) { |
184 | if (loud) |
185 | xfs_warn(mp, "SB validate failed with error %d." , error); |
186 | /* bad CRC means corrupted metadata */ |
187 | if (error == -EFSBADCRC) |
188 | error = -EFSCORRUPTED; |
189 | return error; |
190 | } |
191 | |
192 | /* |
193 | * Initialize the mount structure from the superblock. |
194 | */ |
195 | xfs_sb_from_disk(sbp, bp->b_addr); |
196 | |
197 | /* |
198 | * If we haven't validated the superblock, do so now before we try |
199 | * to check the sector size and reread the superblock appropriately. |
200 | */ |
201 | if (sbp->sb_magicnum != XFS_SB_MAGIC) { |
202 | if (loud) |
203 | xfs_warn(mp, "Invalid superblock magic number" ); |
204 | error = -EINVAL; |
205 | goto release_buf; |
206 | } |
207 | |
208 | /* |
209 | * We must be able to do sector-sized and sector-aligned IO. |
210 | */ |
211 | if (sector_size > sbp->sb_sectsize) { |
212 | if (loud) |
213 | xfs_warn(mp, "device supports %u byte sectors (not %u)" , |
214 | sector_size, sbp->sb_sectsize); |
215 | error = -ENOSYS; |
216 | goto release_buf; |
217 | } |
218 | |
219 | if (buf_ops == NULL) { |
220 | /* |
221 | * Re-read the superblock so the buffer is correctly sized, |
222 | * and properly verified. |
223 | */ |
224 | xfs_buf_relse(bp); |
225 | sector_size = sbp->sb_sectsize; |
226 | buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; |
227 | goto reread; |
228 | } |
229 | |
230 | mp->m_features |= xfs_sb_version_to_features(sbp); |
231 | xfs_reinit_percpu_counters(mp); |
232 | |
233 | /* no need to be quiet anymore, so reset the buf ops */ |
234 | bp->b_ops = &xfs_sb_buf_ops; |
235 | |
236 | mp->m_sb_bp = bp; |
237 | xfs_buf_unlock(bp); |
238 | return 0; |
239 | |
240 | release_buf: |
241 | xfs_buf_relse(bp); |
242 | return error; |
243 | } |
244 | |
245 | /* |
246 | * If the sunit/swidth change would move the precomputed root inode value, we |
247 | * must reject the ondisk change because repair will stumble over that. |
248 | * However, we allow the mount to proceed because we never rejected this |
249 | * combination before. Returns true to update the sb, false otherwise. |
250 | */ |
251 | static inline int |
252 | xfs_check_new_dalign( |
253 | struct xfs_mount *mp, |
254 | int new_dalign, |
255 | bool *update_sb) |
256 | { |
257 | struct xfs_sb *sbp = &mp->m_sb; |
258 | xfs_ino_t calc_ino; |
259 | |
260 | calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); |
261 | trace_xfs_check_new_dalign(mp, new_dalign, calc_rootino: calc_ino); |
262 | |
263 | if (sbp->sb_rootino == calc_ino) { |
264 | *update_sb = true; |
265 | return 0; |
266 | } |
267 | |
268 | xfs_warn(mp, |
269 | "Cannot change stripe alignment; would require moving root inode." ); |
270 | |
271 | /* |
272 | * XXX: Next time we add a new incompat feature, this should start |
273 | * returning -EINVAL to fail the mount. Until then, spit out a warning |
274 | * that we're ignoring the administrator's instructions. |
275 | */ |
276 | xfs_warn(mp, "Skipping superblock stripe alignment update." ); |
277 | *update_sb = false; |
278 | return 0; |
279 | } |
280 | |
281 | /* |
282 | * If we were provided with new sunit/swidth values as mount options, make sure |
283 | * that they pass basic alignment and superblock feature checks, and convert |
284 | * them into the same units (FSB) that everything else expects. This step |
285 | * /must/ be done before computing the inode geometry. |
286 | */ |
287 | STATIC int |
288 | xfs_validate_new_dalign( |
289 | struct xfs_mount *mp) |
290 | { |
291 | if (mp->m_dalign == 0) |
292 | return 0; |
293 | |
294 | /* |
295 | * If stripe unit and stripe width are not multiples |
296 | * of the fs blocksize turn off alignment. |
297 | */ |
298 | if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || |
299 | (BBTOB(mp->m_swidth) & mp->m_blockmask)) { |
300 | xfs_warn(mp, |
301 | "alignment check failed: sunit/swidth vs. blocksize(%d)" , |
302 | mp->m_sb.sb_blocksize); |
303 | return -EINVAL; |
304 | } |
305 | |
306 | /* |
307 | * Convert the stripe unit and width to FSBs. |
308 | */ |
309 | mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); |
310 | if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { |
311 | xfs_warn(mp, |
312 | "alignment check failed: sunit/swidth vs. agsize(%d)" , |
313 | mp->m_sb.sb_agblocks); |
314 | return -EINVAL; |
315 | } |
316 | |
317 | if (!mp->m_dalign) { |
318 | xfs_warn(mp, |
319 | "alignment check failed: sunit(%d) less than bsize(%d)" , |
320 | mp->m_dalign, mp->m_sb.sb_blocksize); |
321 | return -EINVAL; |
322 | } |
323 | |
324 | mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); |
325 | |
326 | if (!xfs_has_dalign(mp)) { |
327 | xfs_warn(mp, |
328 | "cannot change alignment: superblock does not support data alignment" ); |
329 | return -EINVAL; |
330 | } |
331 | |
332 | return 0; |
333 | } |
334 | |
335 | /* Update alignment values based on mount options and sb values. */ |
336 | STATIC int |
337 | xfs_update_alignment( |
338 | struct xfs_mount *mp) |
339 | { |
340 | struct xfs_sb *sbp = &mp->m_sb; |
341 | |
342 | if (mp->m_dalign) { |
343 | bool update_sb; |
344 | int error; |
345 | |
346 | if (sbp->sb_unit == mp->m_dalign && |
347 | sbp->sb_width == mp->m_swidth) |
348 | return 0; |
349 | |
350 | error = xfs_check_new_dalign(mp, new_dalign: mp->m_dalign, update_sb: &update_sb); |
351 | if (error || !update_sb) |
352 | return error; |
353 | |
354 | sbp->sb_unit = mp->m_dalign; |
355 | sbp->sb_width = mp->m_swidth; |
356 | mp->m_update_sb = true; |
357 | } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) { |
358 | mp->m_dalign = sbp->sb_unit; |
359 | mp->m_swidth = sbp->sb_width; |
360 | } |
361 | |
362 | return 0; |
363 | } |
364 | |
365 | /* |
366 | * precalculate the low space thresholds for dynamic speculative preallocation. |
367 | */ |
368 | void |
369 | xfs_set_low_space_thresholds( |
370 | struct xfs_mount *mp) |
371 | { |
372 | uint64_t dblocks = mp->m_sb.sb_dblocks; |
373 | uint64_t rtexts = mp->m_sb.sb_rextents; |
374 | int i; |
375 | |
376 | do_div(dblocks, 100); |
377 | do_div(rtexts, 100); |
378 | |
379 | for (i = 0; i < XFS_LOWSP_MAX; i++) { |
380 | mp->m_low_space[i] = dblocks * (i + 1); |
381 | mp->m_low_rtexts[i] = rtexts * (i + 1); |
382 | } |
383 | } |
384 | |
385 | /* |
386 | * Check that the data (and log if separate) is an ok size. |
387 | */ |
388 | STATIC int |
389 | xfs_check_sizes( |
390 | struct xfs_mount *mp) |
391 | { |
392 | struct xfs_buf *bp; |
393 | xfs_daddr_t d; |
394 | int error; |
395 | |
396 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); |
397 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { |
398 | xfs_warn(mp, "filesystem size mismatch detected" ); |
399 | return -EFBIG; |
400 | } |
401 | error = xfs_buf_read_uncached(target: mp->m_ddev_targp, |
402 | daddr: d - XFS_FSS_TO_BB(mp, 1), |
403 | numblks: XFS_FSS_TO_BB(mp, 1), flags: 0, bpp: &bp, NULL); |
404 | if (error) { |
405 | xfs_warn(mp, "last sector read failed" ); |
406 | return error; |
407 | } |
408 | xfs_buf_relse(bp); |
409 | |
410 | if (mp->m_logdev_targp == mp->m_ddev_targp) |
411 | return 0; |
412 | |
413 | d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); |
414 | if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { |
415 | xfs_warn(mp, "log size mismatch detected" ); |
416 | return -EFBIG; |
417 | } |
418 | error = xfs_buf_read_uncached(target: mp->m_logdev_targp, |
419 | daddr: d - XFS_FSB_TO_BB(mp, 1), |
420 | numblks: XFS_FSB_TO_BB(mp, 1), flags: 0, bpp: &bp, NULL); |
421 | if (error) { |
422 | xfs_warn(mp, "log device read failed" ); |
423 | return error; |
424 | } |
425 | xfs_buf_relse(bp); |
426 | return 0; |
427 | } |
428 | |
429 | /* |
430 | * Clear the quotaflags in memory and in the superblock. |
431 | */ |
432 | int |
433 | xfs_mount_reset_sbqflags( |
434 | struct xfs_mount *mp) |
435 | { |
436 | mp->m_qflags = 0; |
437 | |
438 | /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ |
439 | if (mp->m_sb.sb_qflags == 0) |
440 | return 0; |
441 | spin_lock(lock: &mp->m_sb_lock); |
442 | mp->m_sb.sb_qflags = 0; |
443 | spin_unlock(lock: &mp->m_sb_lock); |
444 | |
445 | if (!xfs_fs_writable(mp, level: SB_FREEZE_WRITE)) |
446 | return 0; |
447 | |
448 | return xfs_sync_sb(mp, false); |
449 | } |
450 | |
451 | uint64_t |
452 | xfs_default_resblks(xfs_mount_t *mp) |
453 | { |
454 | uint64_t resblks; |
455 | |
456 | /* |
457 | * We default to 5% or 8192 fsbs of space reserved, whichever is |
458 | * smaller. This is intended to cover concurrent allocation |
459 | * transactions when we initially hit enospc. These each require a 4 |
460 | * block reservation. Hence by default we cover roughly 2000 concurrent |
461 | * allocation reservations. |
462 | */ |
463 | resblks = mp->m_sb.sb_dblocks; |
464 | do_div(resblks, 20); |
465 | resblks = min_t(uint64_t, resblks, 8192); |
466 | return resblks; |
467 | } |
468 | |
469 | /* Ensure the summary counts are correct. */ |
470 | STATIC int |
471 | xfs_check_summary_counts( |
472 | struct xfs_mount *mp) |
473 | { |
474 | int error = 0; |
475 | |
476 | /* |
477 | * The AG0 superblock verifier rejects in-progress filesystems, |
478 | * so we should never see the flag set this far into mounting. |
479 | */ |
480 | if (mp->m_sb.sb_inprogress) { |
481 | xfs_err(mp, "sb_inprogress set after log recovery??" ); |
482 | WARN_ON(1); |
483 | return -EFSCORRUPTED; |
484 | } |
485 | |
486 | /* |
487 | * Now the log is mounted, we know if it was an unclean shutdown or |
488 | * not. If it was, with the first phase of recovery has completed, we |
489 | * have consistent AG blocks on disk. We have not recovered EFIs yet, |
490 | * but they are recovered transactionally in the second recovery phase |
491 | * later. |
492 | * |
493 | * If the log was clean when we mounted, we can check the summary |
494 | * counters. If any of them are obviously incorrect, we can recompute |
495 | * them from the AGF headers in the next step. |
496 | */ |
497 | if (xfs_is_clean(mp) && |
498 | (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || |
499 | !xfs_verify_icount(mp, mp->m_sb.sb_icount) || |
500 | mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) |
501 | xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); |
502 | |
503 | /* |
504 | * We can safely re-initialise incore superblock counters from the |
505 | * per-ag data. These may not be correct if the filesystem was not |
506 | * cleanly unmounted, so we waited for recovery to finish before doing |
507 | * this. |
508 | * |
509 | * If the filesystem was cleanly unmounted or the previous check did |
510 | * not flag anything weird, then we can trust the values in the |
511 | * superblock to be correct and we don't need to do anything here. |
512 | * Otherwise, recalculate the summary counters. |
513 | */ |
514 | if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) || |
515 | xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) { |
516 | error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); |
517 | if (error) |
518 | return error; |
519 | } |
520 | |
521 | /* |
522 | * Older kernels misused sb_frextents to reflect both incore |
523 | * reservations made by running transactions and the actual count of |
524 | * free rt extents in the ondisk metadata. Transactions committed |
525 | * during runtime can therefore contain a superblock update that |
526 | * undercounts the number of free rt extents tracked in the rt bitmap. |
527 | * A clean unmount record will have the correct frextents value since |
528 | * there can be no other transactions running at that point. |
529 | * |
530 | * If we're mounting the rt volume after recovering the log, recompute |
531 | * frextents from the rtbitmap file to fix the inconsistency. |
532 | */ |
533 | if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { |
534 | error = xfs_rtalloc_reinit_frextents(mp); |
535 | if (error) |
536 | return error; |
537 | } |
538 | |
539 | return 0; |
540 | } |
541 | |
542 | static void |
543 | xfs_unmount_check( |
544 | struct xfs_mount *mp) |
545 | { |
546 | if (xfs_is_shutdown(mp)) |
547 | return; |
548 | |
549 | if (percpu_counter_sum(fbc: &mp->m_ifree) > |
550 | percpu_counter_sum(fbc: &mp->m_icount)) { |
551 | xfs_alert(mp, "ifree/icount mismatch at unmount" ); |
552 | xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); |
553 | } |
554 | } |
555 | |
556 | /* |
557 | * Flush and reclaim dirty inodes in preparation for unmount. Inodes and |
558 | * internal inode structures can be sitting in the CIL and AIL at this point, |
559 | * so we need to unpin them, write them back and/or reclaim them before unmount |
560 | * can proceed. In other words, callers are required to have inactivated all |
561 | * inodes. |
562 | * |
563 | * An inode cluster that has been freed can have its buffer still pinned in |
564 | * memory because the transaction is still sitting in a iclog. The stale inodes |
565 | * on that buffer will be pinned to the buffer until the transaction hits the |
566 | * disk and the callbacks run. Pushing the AIL will skip the stale inodes and |
567 | * may never see the pinned buffer, so nothing will push out the iclog and |
568 | * unpin the buffer. |
569 | * |
570 | * Hence we need to force the log to unpin everything first. However, log |
571 | * forces don't wait for the discards they issue to complete, so we have to |
572 | * explicitly wait for them to complete here as well. |
573 | * |
574 | * Then we can tell the world we are unmounting so that error handling knows |
575 | * that the filesystem is going away and we should error out anything that we |
576 | * have been retrying in the background. This will prevent never-ending |
577 | * retries in AIL pushing from hanging the unmount. |
578 | * |
579 | * Finally, we can push the AIL to clean all the remaining dirty objects, then |
580 | * reclaim the remaining inodes that are still in memory at this point in time. |
581 | */ |
582 | static void |
583 | xfs_unmount_flush_inodes( |
584 | struct xfs_mount *mp) |
585 | { |
586 | xfs_log_force(mp, XFS_LOG_SYNC); |
587 | xfs_extent_busy_wait_all(mp); |
588 | flush_workqueue(xfs_discard_wq); |
589 | |
590 | set_bit(XFS_OPSTATE_UNMOUNTING, addr: &mp->m_opstate); |
591 | |
592 | xfs_ail_push_all_sync(mp->m_ail); |
593 | xfs_inodegc_stop(mp); |
594 | cancel_delayed_work_sync(dwork: &mp->m_reclaim_work); |
595 | xfs_reclaim_inodes(mp); |
596 | xfs_health_unmount(mp); |
597 | } |
598 | |
599 | static void |
600 | xfs_mount_setup_inode_geom( |
601 | struct xfs_mount *mp) |
602 | { |
603 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
604 | |
605 | igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp); |
606 | ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp)); |
607 | |
608 | xfs_ialloc_setup_geometry(mp); |
609 | } |
610 | |
611 | /* Compute maximum possible height for per-AG btree types for this fs. */ |
612 | static inline void |
613 | xfs_agbtree_compute_maxlevels( |
614 | struct xfs_mount *mp) |
615 | { |
616 | unsigned int levels; |
617 | |
618 | levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels); |
619 | levels = max(levels, mp->m_rmap_maxlevels); |
620 | mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); |
621 | } |
622 | |
623 | /* |
624 | * This function does the following on an initial mount of a file system: |
625 | * - reads the superblock from disk and init the mount struct |
626 | * - if we're a 32-bit kernel, do a size check on the superblock |
627 | * so we don't mount terabyte filesystems |
628 | * - init mount struct realtime fields |
629 | * - allocate inode hash table for fs |
630 | * - init directory manager |
631 | * - perform recovery and init the log manager |
632 | */ |
633 | int |
634 | xfs_mountfs( |
635 | struct xfs_mount *mp) |
636 | { |
637 | struct xfs_sb *sbp = &(mp->m_sb); |
638 | struct xfs_inode *rip; |
639 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
640 | uint64_t resblks; |
641 | uint quotamount = 0; |
642 | uint quotaflags = 0; |
643 | int error = 0; |
644 | |
645 | xfs_sb_mount_common(mp, sbp); |
646 | |
647 | /* |
648 | * Check for a mismatched features2 values. Older kernels read & wrote |
649 | * into the wrong sb offset for sb_features2 on some platforms due to |
650 | * xfs_sb_t not being 64bit size aligned when sb_features2 was added, |
651 | * which made older superblock reading/writing routines swap it as a |
652 | * 64-bit value. |
653 | * |
654 | * For backwards compatibility, we make both slots equal. |
655 | * |
656 | * If we detect a mismatched field, we OR the set bits into the existing |
657 | * features2 field in case it has already been modified; we don't want |
658 | * to lose any features. We then update the bad location with the ORed |
659 | * value so that older kernels will see any features2 flags. The |
660 | * superblock writeback code ensures the new sb_features2 is copied to |
661 | * sb_bad_features2 before it is logged or written to disk. |
662 | */ |
663 | if (xfs_sb_has_mismatched_features2(sbp)) { |
664 | xfs_warn(mp, "correcting sb_features alignment problem" ); |
665 | sbp->sb_features2 |= sbp->sb_bad_features2; |
666 | mp->m_update_sb = true; |
667 | } |
668 | |
669 | |
670 | /* always use v2 inodes by default now */ |
671 | if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { |
672 | mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; |
673 | mp->m_features |= XFS_FEAT_NLINK; |
674 | mp->m_update_sb = true; |
675 | } |
676 | |
677 | /* |
678 | * If we were given new sunit/swidth options, do some basic validation |
679 | * checks and convert the incore dalign and swidth values to the |
680 | * same units (FSB) that everything else uses. This /must/ happen |
681 | * before computing the inode geometry. |
682 | */ |
683 | error = xfs_validate_new_dalign(mp); |
684 | if (error) |
685 | goto out; |
686 | |
687 | xfs_alloc_compute_maxlevels(mp); |
688 | xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); |
689 | xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); |
690 | xfs_mount_setup_inode_geom(mp); |
691 | xfs_rmapbt_compute_maxlevels(mp); |
692 | xfs_refcountbt_compute_maxlevels(mp); |
693 | |
694 | xfs_agbtree_compute_maxlevels(mp); |
695 | |
696 | /* |
697 | * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks |
698 | * is NOT aligned turn off m_dalign since allocator alignment is within |
699 | * an ag, therefore ag has to be aligned at stripe boundary. Note that |
700 | * we must compute the free space and rmap btree geometry before doing |
701 | * this. |
702 | */ |
703 | error = xfs_update_alignment(mp); |
704 | if (error) |
705 | goto out; |
706 | |
707 | /* enable fail_at_unmount as default */ |
708 | mp->m_fail_unmount = true; |
709 | |
710 | error = xfs_sysfs_init(kobj: &mp->m_kobj, ktype: &xfs_mp_ktype, |
711 | NULL, name: mp->m_super->s_id); |
712 | if (error) |
713 | goto out; |
714 | |
715 | error = xfs_sysfs_init(kobj: &mp->m_stats.xs_kobj, ktype: &xfs_stats_ktype, |
716 | parent_kobj: &mp->m_kobj, name: "stats" ); |
717 | if (error) |
718 | goto out_remove_sysfs; |
719 | |
720 | xchk_stats_register(cs: mp->m_scrub_stats, parent: mp->m_debugfs); |
721 | |
722 | error = xfs_error_sysfs_init(mp); |
723 | if (error) |
724 | goto out_remove_scrub_stats; |
725 | |
726 | error = xfs_errortag_init(mp); |
727 | if (error) |
728 | goto out_remove_error_sysfs; |
729 | |
730 | error = xfs_uuid_mount(mp); |
731 | if (error) |
732 | goto out_remove_errortag; |
733 | |
734 | /* |
735 | * Update the preferred write size based on the information from the |
736 | * on-disk superblock. |
737 | */ |
738 | mp->m_allocsize_log = |
739 | max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); |
740 | mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); |
741 | |
742 | /* set the low space thresholds for dynamic preallocation */ |
743 | xfs_set_low_space_thresholds(mp); |
744 | |
745 | /* |
746 | * If enabled, sparse inode chunk alignment is expected to match the |
747 | * cluster size. Full inode chunk alignment must match the chunk size, |
748 | * but that is checked on sb read verification... |
749 | */ |
750 | if (xfs_has_sparseinodes(mp) && |
751 | mp->m_sb.sb_spino_align != |
752 | XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { |
753 | xfs_warn(mp, |
754 | "Sparse inode block alignment (%u) must match cluster size (%llu)." , |
755 | mp->m_sb.sb_spino_align, |
756 | XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)); |
757 | error = -EINVAL; |
758 | goto out_remove_uuid; |
759 | } |
760 | |
761 | /* |
762 | * Check that the data (and log if separate) is an ok size. |
763 | */ |
764 | error = xfs_check_sizes(mp); |
765 | if (error) |
766 | goto out_remove_uuid; |
767 | |
768 | /* |
769 | * Initialize realtime fields in the mount structure |
770 | */ |
771 | error = xfs_rtmount_init(mp); |
772 | if (error) { |
773 | xfs_warn(mp, "RT mount failed" ); |
774 | goto out_remove_uuid; |
775 | } |
776 | |
777 | /* |
778 | * Copies the low order bits of the timestamp and the randomly |
779 | * set "sequence" number out of a UUID. |
780 | */ |
781 | mp->m_fixedfsid[0] = |
782 | (get_unaligned_be16(p: &sbp->sb_uuid.b[8]) << 16) | |
783 | get_unaligned_be16(p: &sbp->sb_uuid.b[4]); |
784 | mp->m_fixedfsid[1] = get_unaligned_be32(p: &sbp->sb_uuid.b[0]); |
785 | |
786 | error = xfs_da_mount(mp); |
787 | if (error) { |
788 | xfs_warn(mp, "Failed dir/attr init: %d" , error); |
789 | goto out_remove_uuid; |
790 | } |
791 | |
792 | /* |
793 | * Initialize the precomputed transaction reservations values. |
794 | */ |
795 | xfs_trans_init(mp); |
796 | |
797 | /* |
798 | * Allocate and initialize the per-ag data. |
799 | */ |
800 | error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, |
801 | &mp->m_maxagi); |
802 | if (error) { |
803 | xfs_warn(mp, "Failed per-ag init: %d" , error); |
804 | goto out_free_dir; |
805 | } |
806 | |
807 | if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { |
808 | xfs_warn(mp, "no log defined" ); |
809 | error = -EFSCORRUPTED; |
810 | goto out_free_perag; |
811 | } |
812 | |
813 | error = xfs_inodegc_register_shrinker(mp); |
814 | if (error) |
815 | goto out_fail_wait; |
816 | |
817 | /* |
818 | * Log's mount-time initialization. The first part of recovery can place |
819 | * some items on the AIL, to be handled when recovery is finished or |
820 | * cancelled. |
821 | */ |
822 | error = xfs_log_mount(mp, log_target: mp->m_logdev_targp, |
823 | start_block: XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), |
824 | num_bblocks: XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); |
825 | if (error) { |
826 | xfs_warn(mp, "log mount failed" ); |
827 | goto out_inodegc_shrinker; |
828 | } |
829 | |
830 | /* Enable background inode inactivation workers. */ |
831 | xfs_inodegc_start(mp); |
832 | xfs_blockgc_start(mp); |
833 | |
834 | /* |
835 | * Now that we've recovered any pending superblock feature bit |
836 | * additions, we can finish setting up the attr2 behaviour for the |
837 | * mount. The noattr2 option overrides the superblock flag, so only |
838 | * check the superblock feature flag if the mount option is not set. |
839 | */ |
840 | if (xfs_has_noattr2(mp)) { |
841 | mp->m_features &= ~XFS_FEAT_ATTR2; |
842 | } else if (!xfs_has_attr2(mp) && |
843 | (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { |
844 | mp->m_features |= XFS_FEAT_ATTR2; |
845 | } |
846 | |
847 | /* |
848 | * Get and sanity-check the root inode. |
849 | * Save the pointer to it in the mount structure. |
850 | */ |
851 | error = xfs_iget(mp, NULL, ino: sbp->sb_rootino, XFS_IGET_UNTRUSTED, |
852 | XFS_ILOCK_EXCL, ipp: &rip); |
853 | if (error) { |
854 | xfs_warn(mp, |
855 | "Failed to read root inode 0x%llx, error %d" , |
856 | sbp->sb_rootino, -error); |
857 | goto out_log_dealloc; |
858 | } |
859 | |
860 | ASSERT(rip != NULL); |
861 | |
862 | if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { |
863 | xfs_warn(mp, "corrupted root inode %llu: not a directory" , |
864 | (unsigned long long)rip->i_ino); |
865 | xfs_iunlock(rip, XFS_ILOCK_EXCL); |
866 | error = -EFSCORRUPTED; |
867 | goto out_rele_rip; |
868 | } |
869 | mp->m_rootip = rip; /* save it */ |
870 | |
871 | xfs_iunlock(rip, XFS_ILOCK_EXCL); |
872 | |
873 | /* |
874 | * Initialize realtime inode pointers in the mount structure |
875 | */ |
876 | error = xfs_rtmount_inodes(mp); |
877 | if (error) { |
878 | /* |
879 | * Free up the root inode. |
880 | */ |
881 | xfs_warn(mp, "failed to read RT inodes" ); |
882 | goto out_rele_rip; |
883 | } |
884 | |
885 | /* Make sure the summary counts are ok. */ |
886 | error = xfs_check_summary_counts(mp); |
887 | if (error) |
888 | goto out_rtunmount; |
889 | |
890 | /* |
891 | * If this is a read-only mount defer the superblock updates until |
892 | * the next remount into writeable mode. Otherwise we would never |
893 | * perform the update e.g. for the root filesystem. |
894 | */ |
895 | if (mp->m_update_sb && !xfs_is_readonly(mp)) { |
896 | error = xfs_sync_sb(mp, false); |
897 | if (error) { |
898 | xfs_warn(mp, "failed to write sb changes" ); |
899 | goto out_rtunmount; |
900 | } |
901 | } |
902 | |
903 | /* |
904 | * Initialise the XFS quota management subsystem for this mount |
905 | */ |
906 | if (XFS_IS_QUOTA_ON(mp)) { |
907 | error = xfs_qm_newmount(mp, "amount, "aflags); |
908 | if (error) |
909 | goto out_rtunmount; |
910 | } else { |
911 | /* |
912 | * If a file system had quotas running earlier, but decided to |
913 | * mount without -o uquota/pquota/gquota options, revoke the |
914 | * quotachecked license. |
915 | */ |
916 | if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { |
917 | xfs_notice(mp, "resetting quota flags" ); |
918 | error = xfs_mount_reset_sbqflags(mp); |
919 | if (error) |
920 | goto out_rtunmount; |
921 | } |
922 | } |
923 | |
924 | /* |
925 | * Finish recovering the file system. This part needed to be delayed |
926 | * until after the root and real-time bitmap inodes were consistently |
927 | * read in. Temporarily create per-AG space reservations for metadata |
928 | * btree shape changes because space freeing transactions (for inode |
929 | * inactivation) require the per-AG reservation in lieu of reserving |
930 | * blocks. |
931 | */ |
932 | error = xfs_fs_reserve_ag_blocks(mp); |
933 | if (error && error == -ENOSPC) |
934 | xfs_warn(mp, |
935 | "ENOSPC reserving per-AG metadata pool, log recovery may fail." ); |
936 | error = xfs_log_mount_finish(mp); |
937 | xfs_fs_unreserve_ag_blocks(mp); |
938 | if (error) { |
939 | xfs_warn(mp, "log mount finish failed" ); |
940 | goto out_rtunmount; |
941 | } |
942 | |
943 | /* |
944 | * Now the log is fully replayed, we can transition to full read-only |
945 | * mode for read-only mounts. This will sync all the metadata and clean |
946 | * the log so that the recovery we just performed does not have to be |
947 | * replayed again on the next mount. |
948 | * |
949 | * We use the same quiesce mechanism as the rw->ro remount, as they are |
950 | * semantically identical operations. |
951 | */ |
952 | if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) |
953 | xfs_log_clean(mp); |
954 | |
955 | /* |
956 | * Complete the quota initialisation, post-log-replay component. |
957 | */ |
958 | if (quotamount) { |
959 | ASSERT(mp->m_qflags == 0); |
960 | mp->m_qflags = quotaflags; |
961 | |
962 | xfs_qm_mount_quotas(mp); |
963 | } |
964 | |
965 | /* |
966 | * Now we are mounted, reserve a small amount of unused space for |
967 | * privileged transactions. This is needed so that transaction |
968 | * space required for critical operations can dip into this pool |
969 | * when at ENOSPC. This is needed for operations like create with |
970 | * attr, unwritten extent conversion at ENOSPC, etc. Data allocations |
971 | * are not allowed to use this reserved space. |
972 | * |
973 | * This may drive us straight to ENOSPC on mount, but that implies |
974 | * we were already there on the last unmount. Warn if this occurs. |
975 | */ |
976 | if (!xfs_is_readonly(mp)) { |
977 | resblks = xfs_default_resblks(mp); |
978 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
979 | if (error) |
980 | xfs_warn(mp, |
981 | "Unable to allocate reserve blocks. Continuing without reserve pool." ); |
982 | |
983 | /* Reserve AG blocks for future btree expansion. */ |
984 | error = xfs_fs_reserve_ag_blocks(mp); |
985 | if (error && error != -ENOSPC) |
986 | goto out_agresv; |
987 | } |
988 | |
989 | return 0; |
990 | |
991 | out_agresv: |
992 | xfs_fs_unreserve_ag_blocks(mp); |
993 | xfs_qm_unmount_quotas(mp); |
994 | out_rtunmount: |
995 | xfs_rtunmount_inodes(mp); |
996 | out_rele_rip: |
997 | xfs_irele(ip: rip); |
998 | /* Clean out dquots that might be in memory after quotacheck. */ |
999 | xfs_qm_unmount(mp); |
1000 | |
1001 | /* |
1002 | * Inactivate all inodes that might still be in memory after a log |
1003 | * intent recovery failure so that reclaim can free them. Metadata |
1004 | * inodes and the root directory shouldn't need inactivation, but the |
1005 | * mount failed for some reason, so pull down all the state and flee. |
1006 | */ |
1007 | xfs_inodegc_flush(mp); |
1008 | |
1009 | /* |
1010 | * Flush all inode reclamation work and flush the log. |
1011 | * We have to do this /after/ rtunmount and qm_unmount because those |
1012 | * two will have scheduled delayed reclaim for the rt/quota inodes. |
1013 | * |
1014 | * This is slightly different from the unmountfs call sequence |
1015 | * because we could be tearing down a partially set up mount. In |
1016 | * particular, if log_mount_finish fails we bail out without calling |
1017 | * qm_unmount_quotas and therefore rely on qm_unmount to release the |
1018 | * quota inodes. |
1019 | */ |
1020 | xfs_unmount_flush_inodes(mp); |
1021 | out_log_dealloc: |
1022 | xfs_log_mount_cancel(mp); |
1023 | out_inodegc_shrinker: |
1024 | shrinker_free(shrinker: mp->m_inodegc_shrinker); |
1025 | out_fail_wait: |
1026 | if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) |
1027 | xfs_buftarg_drain(mp->m_logdev_targp); |
1028 | xfs_buftarg_drain(mp->m_ddev_targp); |
1029 | out_free_perag: |
1030 | xfs_free_perag(mp); |
1031 | out_free_dir: |
1032 | xfs_da_unmount(mp); |
1033 | out_remove_uuid: |
1034 | xfs_uuid_unmount(mp); |
1035 | out_remove_errortag: |
1036 | xfs_errortag_del(mp); |
1037 | out_remove_error_sysfs: |
1038 | xfs_error_sysfs_del(mp); |
1039 | out_remove_scrub_stats: |
1040 | xchk_stats_unregister(cs: mp->m_scrub_stats); |
1041 | xfs_sysfs_del(kobj: &mp->m_stats.xs_kobj); |
1042 | out_remove_sysfs: |
1043 | xfs_sysfs_del(kobj: &mp->m_kobj); |
1044 | out: |
1045 | return error; |
1046 | } |
1047 | |
1048 | /* |
1049 | * This flushes out the inodes,dquots and the superblock, unmounts the |
1050 | * log and makes sure that incore structures are freed. |
1051 | */ |
1052 | void |
1053 | xfs_unmountfs( |
1054 | struct xfs_mount *mp) |
1055 | { |
1056 | uint64_t resblks; |
1057 | int error; |
1058 | |
1059 | /* |
1060 | * Perform all on-disk metadata updates required to inactivate inodes |
1061 | * that the VFS evicted earlier in the unmount process. Freeing inodes |
1062 | * and discarding CoW fork preallocations can cause shape changes to |
1063 | * the free inode and refcount btrees, respectively, so we must finish |
1064 | * this before we discard the metadata space reservations. Metadata |
1065 | * inodes and the root directory do not require inactivation. |
1066 | */ |
1067 | xfs_inodegc_flush(mp); |
1068 | |
1069 | xfs_blockgc_stop(mp); |
1070 | xfs_fs_unreserve_ag_blocks(mp); |
1071 | xfs_qm_unmount_quotas(mp); |
1072 | xfs_rtunmount_inodes(mp); |
1073 | xfs_irele(ip: mp->m_rootip); |
1074 | |
1075 | xfs_unmount_flush_inodes(mp); |
1076 | |
1077 | xfs_qm_unmount(mp); |
1078 | |
1079 | /* |
1080 | * Unreserve any blocks we have so that when we unmount we don't account |
1081 | * the reserved free space as used. This is really only necessary for |
1082 | * lazy superblock counting because it trusts the incore superblock |
1083 | * counters to be absolutely correct on clean unmount. |
1084 | * |
1085 | * We don't bother correcting this elsewhere for lazy superblock |
1086 | * counting because on mount of an unclean filesystem we reconstruct the |
1087 | * correct counter value and this is irrelevant. |
1088 | * |
1089 | * For non-lazy counter filesystems, this doesn't matter at all because |
1090 | * we only every apply deltas to the superblock and hence the incore |
1091 | * value does not matter.... |
1092 | */ |
1093 | resblks = 0; |
1094 | error = xfs_reserve_blocks(mp, &resblks, NULL); |
1095 | if (error) |
1096 | xfs_warn(mp, "Unable to free reserved block pool. " |
1097 | "Freespace may not be correct on next mount." ); |
1098 | xfs_unmount_check(mp); |
1099 | |
1100 | xfs_log_unmount(mp); |
1101 | xfs_da_unmount(mp); |
1102 | xfs_uuid_unmount(mp); |
1103 | |
1104 | #if defined(DEBUG) |
1105 | xfs_errortag_clearall(mp); |
1106 | #endif |
1107 | shrinker_free(shrinker: mp->m_inodegc_shrinker); |
1108 | xfs_free_perag(mp); |
1109 | |
1110 | xfs_errortag_del(mp); |
1111 | xfs_error_sysfs_del(mp); |
1112 | xchk_stats_unregister(cs: mp->m_scrub_stats); |
1113 | xfs_sysfs_del(kobj: &mp->m_stats.xs_kobj); |
1114 | xfs_sysfs_del(kobj: &mp->m_kobj); |
1115 | } |
1116 | |
1117 | /* |
1118 | * Determine whether modifications can proceed. The caller specifies the minimum |
1119 | * freeze level for which modifications should not be allowed. This allows |
1120 | * certain operations to proceed while the freeze sequence is in progress, if |
1121 | * necessary. |
1122 | */ |
1123 | bool |
1124 | xfs_fs_writable( |
1125 | struct xfs_mount *mp, |
1126 | int level) |
1127 | { |
1128 | ASSERT(level > SB_UNFROZEN); |
1129 | if ((mp->m_super->s_writers.frozen >= level) || |
1130 | xfs_is_shutdown(mp) || xfs_is_readonly(mp)) |
1131 | return false; |
1132 | |
1133 | return true; |
1134 | } |
1135 | |
1136 | /* Adjust m_fdblocks or m_frextents. */ |
1137 | int |
1138 | xfs_mod_freecounter( |
1139 | struct xfs_mount *mp, |
1140 | struct percpu_counter *counter, |
1141 | int64_t delta, |
1142 | bool rsvd) |
1143 | { |
1144 | int64_t lcounter; |
1145 | long long res_used; |
1146 | uint64_t set_aside = 0; |
1147 | s32 batch; |
1148 | bool has_resv_pool; |
1149 | |
1150 | ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); |
1151 | has_resv_pool = (counter == &mp->m_fdblocks); |
1152 | if (rsvd) |
1153 | ASSERT(has_resv_pool); |
1154 | |
1155 | if (delta > 0) { |
1156 | /* |
1157 | * If the reserve pool is depleted, put blocks back into it |
1158 | * first. Most of the time the pool is full. |
1159 | */ |
1160 | if (likely(!has_resv_pool || |
1161 | mp->m_resblks == mp->m_resblks_avail)) { |
1162 | percpu_counter_add(fbc: counter, amount: delta); |
1163 | return 0; |
1164 | } |
1165 | |
1166 | spin_lock(lock: &mp->m_sb_lock); |
1167 | res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); |
1168 | |
1169 | if (res_used > delta) { |
1170 | mp->m_resblks_avail += delta; |
1171 | } else { |
1172 | delta -= res_used; |
1173 | mp->m_resblks_avail = mp->m_resblks; |
1174 | percpu_counter_add(fbc: counter, amount: delta); |
1175 | } |
1176 | spin_unlock(lock: &mp->m_sb_lock); |
1177 | return 0; |
1178 | } |
1179 | |
1180 | /* |
1181 | * Taking blocks away, need to be more accurate the closer we |
1182 | * are to zero. |
1183 | * |
1184 | * If the counter has a value of less than 2 * max batch size, |
1185 | * then make everything serialise as we are real close to |
1186 | * ENOSPC. |
1187 | */ |
1188 | if (__percpu_counter_compare(fbc: counter, rhs: 2 * XFS_FDBLOCKS_BATCH, |
1189 | XFS_FDBLOCKS_BATCH) < 0) |
1190 | batch = 1; |
1191 | else |
1192 | batch = XFS_FDBLOCKS_BATCH; |
1193 | |
1194 | /* |
1195 | * Set aside allocbt blocks because these blocks are tracked as free |
1196 | * space but not available for allocation. Technically this means that a |
1197 | * single reservation cannot consume all remaining free space, but the |
1198 | * ratio of allocbt blocks to usable free blocks should be rather small. |
1199 | * The tradeoff without this is that filesystems that maintain high |
1200 | * perag block reservations can over reserve physical block availability |
1201 | * and fail physical allocation, which leads to much more serious |
1202 | * problems (i.e. transaction abort, pagecache discards, etc.) than |
1203 | * slightly premature -ENOSPC. |
1204 | */ |
1205 | if (has_resv_pool) |
1206 | set_aside = xfs_fdblocks_unavailable(mp); |
1207 | percpu_counter_add_batch(fbc: counter, amount: delta, batch); |
1208 | if (__percpu_counter_compare(fbc: counter, rhs: set_aside, |
1209 | XFS_FDBLOCKS_BATCH) >= 0) { |
1210 | /* we had space! */ |
1211 | return 0; |
1212 | } |
1213 | |
1214 | /* |
1215 | * lock up the sb for dipping into reserves before releasing the space |
1216 | * that took us to ENOSPC. |
1217 | */ |
1218 | spin_lock(lock: &mp->m_sb_lock); |
1219 | percpu_counter_add(fbc: counter, amount: -delta); |
1220 | if (!has_resv_pool || !rsvd) |
1221 | goto fdblocks_enospc; |
1222 | |
1223 | lcounter = (long long)mp->m_resblks_avail + delta; |
1224 | if (lcounter >= 0) { |
1225 | mp->m_resblks_avail = lcounter; |
1226 | spin_unlock(lock: &mp->m_sb_lock); |
1227 | return 0; |
1228 | } |
1229 | xfs_warn_once(mp, |
1230 | "Reserve blocks depleted! Consider increasing reserve pool size." ); |
1231 | |
1232 | fdblocks_enospc: |
1233 | spin_unlock(lock: &mp->m_sb_lock); |
1234 | return -ENOSPC; |
1235 | } |
1236 | |
1237 | /* |
1238 | * Used to free the superblock along various error paths. |
1239 | */ |
1240 | void |
1241 | xfs_freesb( |
1242 | struct xfs_mount *mp) |
1243 | { |
1244 | struct xfs_buf *bp = mp->m_sb_bp; |
1245 | |
1246 | xfs_buf_lock(bp); |
1247 | mp->m_sb_bp = NULL; |
1248 | xfs_buf_relse(bp); |
1249 | } |
1250 | |
1251 | /* |
1252 | * If the underlying (data/log/rt) device is readonly, there are some |
1253 | * operations that cannot proceed. |
1254 | */ |
1255 | int |
1256 | xfs_dev_is_read_only( |
1257 | struct xfs_mount *mp, |
1258 | char *message) |
1259 | { |
1260 | if (xfs_readonly_buftarg(mp->m_ddev_targp) || |
1261 | xfs_readonly_buftarg(mp->m_logdev_targp) || |
1262 | (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { |
1263 | xfs_notice(mp, "%s required on read-only device." , message); |
1264 | xfs_notice(mp, "write access unavailable, cannot proceed." ); |
1265 | return -EROFS; |
1266 | } |
1267 | return 0; |
1268 | } |
1269 | |
1270 | /* Force the summary counters to be recalculated at next mount. */ |
1271 | void |
1272 | xfs_force_summary_recalc( |
1273 | struct xfs_mount *mp) |
1274 | { |
1275 | if (!xfs_has_lazysbcount(mp)) |
1276 | return; |
1277 | |
1278 | xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); |
1279 | } |
1280 | |
1281 | /* |
1282 | * Enable a log incompat feature flag in the primary superblock. The caller |
1283 | * cannot have any other transactions in progress. |
1284 | */ |
1285 | int |
1286 | xfs_add_incompat_log_feature( |
1287 | struct xfs_mount *mp, |
1288 | uint32_t feature) |
1289 | { |
1290 | struct xfs_dsb *dsb; |
1291 | int error; |
1292 | |
1293 | ASSERT(hweight32(feature) == 1); |
1294 | ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); |
1295 | |
1296 | /* |
1297 | * Force the log to disk and kick the background AIL thread to reduce |
1298 | * the chances that the bwrite will stall waiting for the AIL to unpin |
1299 | * the primary superblock buffer. This isn't a data integrity |
1300 | * operation, so we don't need a synchronous push. |
1301 | */ |
1302 | error = xfs_log_force(mp, XFS_LOG_SYNC); |
1303 | if (error) |
1304 | return error; |
1305 | xfs_ail_push_all(mp->m_ail); |
1306 | |
1307 | /* |
1308 | * Lock the primary superblock buffer to serialize all callers that |
1309 | * are trying to set feature bits. |
1310 | */ |
1311 | xfs_buf_lock(mp->m_sb_bp); |
1312 | xfs_buf_hold(bp: mp->m_sb_bp); |
1313 | |
1314 | if (xfs_is_shutdown(mp)) { |
1315 | error = -EIO; |
1316 | goto rele; |
1317 | } |
1318 | |
1319 | if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature)) |
1320 | goto rele; |
1321 | |
1322 | /* |
1323 | * Write the primary superblock to disk immediately, because we need |
1324 | * the log_incompat bit to be set in the primary super now to protect |
1325 | * the log items that we're going to commit later. |
1326 | */ |
1327 | dsb = mp->m_sb_bp->b_addr; |
1328 | xfs_sb_to_disk(dsb, &mp->m_sb); |
1329 | dsb->sb_features_log_incompat |= cpu_to_be32(feature); |
1330 | error = xfs_bwrite(bp: mp->m_sb_bp); |
1331 | if (error) |
1332 | goto shutdown; |
1333 | |
1334 | /* |
1335 | * Add the feature bits to the incore superblock before we unlock the |
1336 | * buffer. |
1337 | */ |
1338 | xfs_sb_add_incompat_log_features(&mp->m_sb, feature); |
1339 | xfs_buf_relse(bp: mp->m_sb_bp); |
1340 | |
1341 | /* Log the superblock to disk. */ |
1342 | return xfs_sync_sb(mp, false); |
1343 | shutdown: |
1344 | xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); |
1345 | rele: |
1346 | xfs_buf_relse(bp: mp->m_sb_bp); |
1347 | return error; |
1348 | } |
1349 | |
1350 | /* |
1351 | * Clear all the log incompat flags from the superblock. |
1352 | * |
1353 | * The caller cannot be in a transaction, must ensure that the log does not |
1354 | * contain any log items protected by any log incompat bit, and must ensure |
1355 | * that there are no other threads that depend on the state of the log incompat |
1356 | * feature flags in the primary super. |
1357 | * |
1358 | * Returns true if the superblock is dirty. |
1359 | */ |
1360 | bool |
1361 | xfs_clear_incompat_log_features( |
1362 | struct xfs_mount *mp) |
1363 | { |
1364 | bool ret = false; |
1365 | |
1366 | if (!xfs_has_crc(mp) || |
1367 | !xfs_sb_has_incompat_log_feature(&mp->m_sb, |
1368 | XFS_SB_FEAT_INCOMPAT_LOG_ALL) || |
1369 | xfs_is_shutdown(mp)) |
1370 | return false; |
1371 | |
1372 | /* |
1373 | * Update the incore superblock. We synchronize on the primary super |
1374 | * buffer lock to be consistent with the add function, though at least |
1375 | * in theory this shouldn't be necessary. |
1376 | */ |
1377 | xfs_buf_lock(mp->m_sb_bp); |
1378 | xfs_buf_hold(bp: mp->m_sb_bp); |
1379 | |
1380 | if (xfs_sb_has_incompat_log_feature(&mp->m_sb, |
1381 | XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { |
1382 | xfs_sb_remove_incompat_log_features(&mp->m_sb); |
1383 | ret = true; |
1384 | } |
1385 | |
1386 | xfs_buf_relse(bp: mp->m_sb_bp); |
1387 | return ret; |
1388 | } |
1389 | |
1390 | /* |
1391 | * Update the in-core delayed block counter. |
1392 | * |
1393 | * We prefer to update the counter without having to take a spinlock for every |
1394 | * counter update (i.e. batching). Each change to delayed allocation |
1395 | * reservations can change can easily exceed the default percpu counter |
1396 | * batching, so we use a larger batch factor here. |
1397 | * |
1398 | * Note that we don't currently have any callers requiring fast summation |
1399 | * (e.g. percpu_counter_read) so we can use a big batch value here. |
1400 | */ |
1401 | #define XFS_DELALLOC_BATCH (4096) |
1402 | void |
1403 | xfs_mod_delalloc( |
1404 | struct xfs_mount *mp, |
1405 | int64_t delta) |
1406 | { |
1407 | percpu_counter_add_batch(fbc: &mp->m_delalloc_blks, amount: delta, |
1408 | XFS_DELALLOC_BATCH); |
1409 | } |
1410 | |