// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 * Copyright (C) 2016 - 2020 Christoph Hellwig
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
#include <linux/buffer_head.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include <linux/stat.h>
#include "../fs/internal.h"
#include "blk.h"

/* Should we allow writing to mounted block devices? */
static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);

struct bdev_inode {
	struct block_device bdev;
	struct inode vfs_inode;
};

static inline struct bdev_inode *BDEV_I(struct inode *inode)
{
	return container_of(inode, struct bdev_inode, vfs_inode);
}

struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
	int ret;

	spin_lock(&inode->i_lock);
	while (inode->i_state & I_DIRTY) {
		spin_unlock(&inode->i_lock);
		ret = write_inode_now(inode, true);
		if (ret)
			pr_warn_ratelimited(
				"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
				bdev, ret);
		spin_lock(&inode->i_lock);
	}
	spin_unlock(&inode->i_lock);
}

/* Kill _all_ buffers and pagecache, dirty or not. */
static void kill_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping_empty(mapping))
		return;

	invalidate_bh_lrus();
	truncate_inode_pages(mapping, 0);
}

/* Invalidate clean unused buffers and pagecache. */
void invalidate_bdev(struct block_device *bdev)
{
	struct address_space *mapping = bdev->bd_inode->i_mapping;

	if (mapping->nrpages) {
		invalidate_bh_lrus();
		lru_add_drain_all();	/* make sure all lru add caches are flushed */
		invalidate_mapping_pages(mapping, 0, -1);
	}
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Drop all buffers & page cache for given bdev range. This function bails
 * with error if bdev has other exclusive owner (such as filesystem).
 */
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
			loff_t lstart, loff_t lend)
{
	/*
	 * If we don't hold exclusive handle for the device, upgrade to it
	 * while we discard the buffer cache to avoid discarding buffers
	 * under live filesystem.
	 */
	if (!(mode & BLK_OPEN_EXCL)) {
		int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
		if (err)
			goto invalidate;
	}

	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
	if (!(mode & BLK_OPEN_EXCL))
		bd_abort_claiming(bdev, truncate_bdev_range);
	return 0;

invalidate:
	/*
	 * Someone else has the handle exclusively open. Try invalidating
	 * instead. The 'end' argument is inclusive so the rounding is safe.
	 */
	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
					     lstart >> PAGE_SHIFT,
					     lend >> PAGE_SHIFT);
}
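
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * dropping the page cache for a byte range before discarding it on a
 * device that is open without BLK_OPEN_EXCL. "start" and "len" are
 * assumed to be validated, sector-aligned byte offsets.
 *
 *	loff_t end = start + len - 1;
 *	int err = truncate_bdev_range(bdev, mode, start, end);
 *	if (err)
 *		return err;
 *	// now safe to issue the discard for [start, end]
 */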

static void set_init_blocksize(struct block_device *bdev)
{
	unsigned int bsize = bdev_logical_block_size(bdev);
	loff_t size = i_size_read(bdev->bd_inode);

	while (bsize < PAGE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}

int set_blocksize(struct block_device *bdev, int size)
{
	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_logical_block_size(bdev))
		return -EINVAL;

	/* Don't change the size if it is the same as the current size */
	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
		sync_blockdev(bdev);
		bdev->bd_inode->i_blkbits = blksize_bits(size);
		kill_bdev(bdev);
	}
	return 0;
}

EXPORT_SYMBOL(set_blocksize);

int sb_set_blocksize(struct super_block *sb, int size)
{
	if (set_blocksize(sb->s_bdev, size))
		return 0;
	/* If we get here, we know size is a power of two
	 * and its value is between 512 and PAGE_SIZE */
	sb->s_blocksize = size;
	sb->s_blocksize_bits = blksize_bits(size);
	return sb->s_blocksize;
}

EXPORT_SYMBOL(sb_set_blocksize);

int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_logical_block_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}

EXPORT_SYMBOL(sb_min_blocksize);
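
/*
 * Sketch of the typical caller: a filesystem picking its block size from
 * fill_super. myfs_fill_super() and MYFS_BSIZE are hypothetical names;
 * the zero return of sb_min_blocksize()/sb_set_blocksize() signals that
 * the device's logical block size could not be accommodated.
 *
 *	static int myfs_fill_super(struct super_block *sb, void *data, int silent)
 *	{
 *		if (!sb_min_blocksize(sb, MYFS_BSIZE))
 *			return -EINVAL;
 *		...
 *	}
 */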

int sync_blockdev_nowait(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);

/*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping. Does not take the superblock lock.
 */
int sync_blockdev(struct block_device *bdev)
{
	if (!bdev)
		return 0;
	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);

int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
{
	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
			lstart, lend);
}
EXPORT_SYMBOL(sync_blockdev_range);
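
/*
 * Usage sketch: syncing just one byte range instead of the whole device,
 * e.g. before operating on that range directly. The offsets are
 * illustrative; note that @lend is inclusive.
 *
 *	err = sync_blockdev_range(bdev, pos, pos + len - 1);
 *	if (err)
 *		return err;
 */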

/**
 * bdev_freeze - lock a filesystem and force it into a consistent state
 * @bdev: blockdevice to lock
 *
 * If a superblock is found on this device, we take the s_umount semaphore
 * on it to make sure nobody unmounts until the snapshot creation is done.
 * The reference counter (bd_fsfreeze_count) guarantees that only the last
 * unfreeze process can actually unfreeze the frozen filesystem when multiple
 * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
 * counts down in bdev_thaw(). When it becomes 0, bdev_thaw() does the
 * actual unfreeze.
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_freeze(struct block_device *bdev)
{
	int error = 0;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
		return 0;
	}

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
		error = bdev->bd_holder_ops->freeze(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
		error = sync_blockdev(bdev);
	}

	if (error)
		atomic_dec(&bdev->bd_fsfreeze_count);

	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_freeze);

/**
 * bdev_thaw - unlock filesystem
 * @bdev: blockdevice to unlock
 *
 * Unlocks the filesystem and marks it writeable again after bdev_freeze().
 *
 * Return: On success zero is returned, negative error code on failure.
 */
int bdev_thaw(struct block_device *bdev)
{
	int error = -EINVAL, nr_freeze;

	mutex_lock(&bdev->bd_fsfreeze_mutex);

	/*
	 * If this returns < 0 it means that @bd_fsfreeze_count was
	 * already 0 and no decrement was performed.
	 */
	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
	if (nr_freeze < 0)
		goto out;

	error = 0;
	if (nr_freeze > 0)
		goto out;

	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
		error = bdev->bd_holder_ops->thaw(bdev);
		lockdep_assert_not_held(&bdev->bd_holder_lock);
	} else {
		mutex_unlock(&bdev->bd_holder_lock);
	}

	if (error)
		atomic_inc(&bdev->bd_fsfreeze_count);
out:
	mutex_unlock(&bdev->bd_fsfreeze_mutex);
	return error;
}
EXPORT_SYMBOL(bdev_thaw);
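
/*
 * Minimal freeze/thaw pairing, roughly as a snapshot implementation would
 * use it; take_snapshot() stands in for whatever work needs the device in
 * a consistent state and is not a function defined here.
 *
 *	error = bdev_freeze(bdev);
 *	if (error)
 *		return error;
 *	take_snapshot();
 *	return bdev_thaw(bdev);
 */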

/*
 * pseudo-fs
 */

static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
static struct kmem_cache *bdev_cachep __ro_after_init;

static struct inode *bdev_alloc_inode(struct super_block *sb)
{
	struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);

	if (!ei)
		return NULL;
	memset(&ei->bdev, 0, sizeof(ei->bdev));
	return &ei->vfs_inode;
}

static void bdev_free_inode(struct inode *inode)
{
	struct block_device *bdev = I_BDEV(inode);

	free_percpu(bdev->bd_stats);
	kfree(bdev->bd_meta_info);

	if (!bdev_is_partition(bdev)) {
		if (bdev->bd_disk && bdev->bd_disk->bdi)
			bdi_put(bdev->bd_disk->bdi);
		kfree(bdev->bd_disk);
	}

	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
		blk_free_ext_minor(MINOR(bdev->bd_dev));

	kmem_cache_free(bdev_cachep, BDEV_I(inode));
}

static void init_once(void *data)
{
	struct bdev_inode *ei = data;

	inode_init_once(&ei->vfs_inode);
}

static void bdev_evict_inode(struct inode *inode)
{
	truncate_inode_pages_final(&inode->i_data);
	invalidate_inode_buffers(inode); /* is it needed here? */
	clear_inode(inode);
}

static const struct super_operations bdev_sops = {
	.statfs = simple_statfs,
	.alloc_inode = bdev_alloc_inode,
	.free_inode = bdev_free_inode,
	.drop_inode = generic_delete_inode,
	.evict_inode = bdev_evict_inode,
};

static int bd_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	fc->s_iflags |= SB_I_CGROUPWB;
	ctx->ops = &bdev_sops;
	return 0;
}

static struct file_system_type bd_type = {
	.name = "bdev",
	.init_fs_context = bd_init_fs_context,
	.kill_sb = kill_anon_super,
};

struct super_block *blockdev_superblock __ro_after_init;
struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
				SLAB_ACCOUNT|SLAB_PANIC),
			init_once);
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = blockdev_mnt->mnt_sb;	/* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = new_inode(blockdev_superblock);
	if (!inode)
		return NULL;
	inode->i_mode = S_IFBLK;
	inode->i_rdev = 0;
	inode->i_data.a_ops = &def_blk_aops;
	mapping_set_gfp_mask(&inode->i_data, GFP_USER);

	bdev = I_BDEV(inode);
	mutex_init(&bdev->bd_fsfreeze_mutex);
	spin_lock_init(&bdev->bd_size_lock);
	mutex_init(&bdev->bd_holder_lock);
	bdev->bd_partno = partno;
	bdev->bd_inode = inode;
	bdev->bd_queue = disk->queue;
	if (partno)
		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
	else
		bdev->bd_has_submit_bio = false;
	bdev->bd_stats = alloc_percpu(struct disk_stats);
	if (!bdev->bd_stats) {
		iput(inode);
		return NULL;
	}
	bdev->bd_disk = disk;
	return bdev;
}

void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
	spin_lock(&bdev->bd_size_lock);
	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
	bdev->bd_nr_sectors = sectors;
	spin_unlock(&bdev->bd_size_lock);
}

void bdev_add(struct block_device *bdev, dev_t dev)
{
	if (bdev_stable_writes(bdev))
		mapping_set_stable_writes(bdev->bd_inode->i_mapping);
	bdev->bd_dev = dev;
	bdev->bd_inode->i_rdev = dev;
	bdev->bd_inode->i_ino = dev;
	insert_inode_hash(bdev->bd_inode);
}

long nr_blockdev_pages(void)
{
	struct inode *inode;
	long ret = 0;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
		ret += inode->i_mapping->nrpages;
	spin_unlock(&blockdev_superblock->s_inode_list_lock);

	return ret;
}

/**
 * bd_may_claim - test whether a block device can be claimed
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops
 *
 * Test whether @bdev can be claimed by @holder.
 *
 * RETURNS:
 * %true if @bdev can be claimed, %false otherwise.
 */
static bool bd_may_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	lockdep_assert_held(&bdev_lock);

	if (bdev->bd_holder) {
		/*
		 * The same holder can always re-claim.
		 */
		if (bdev->bd_holder == holder) {
			if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
				return false;
			return true;
		}
		return false;
	}

	/*
	 * If the whole device's holder is set to bd_may_claim, a partition on
	 * the device is claimed, but not the whole device.
	 */
	if (whole != bdev &&
	    whole->bd_holder && whole->bd_holder != bd_may_claim)
		return false;
	return true;
}

/**
 * bd_prepare_to_claim - claim a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 * @hops: holder ops.
 *
 * Claim @bdev. This function fails if @bdev is already claimed by another
 * holder and waits if another claiming is in progress. On successful
 * return, the caller has ownership of bd_claiming and bd_holder[s].
 *
 * RETURNS:
 * 0 if @bdev can be claimed, -EBUSY otherwise.
 */
int bd_prepare_to_claim(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	if (WARN_ON_ONCE(!holder))
		return -EINVAL;
retry:
	mutex_lock(&bdev_lock);
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, holder, hops)) {
		mutex_unlock(&bdev_lock);
		return -EBUSY;
	}

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		goto retry;
	}

	/* yay, all mine */
	whole->bd_claiming = holder;
	mutex_unlock(&bdev_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */

static void bd_clear_claiming(struct block_device *whole, void *holder)
{
	lockdep_assert_held(&bdev_lock);
	/* tell others that we're done */
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);
}

/**
 * bd_finish_claiming - finish claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 * @hops: block device holder operations
 *
 * Finish exclusive open of a block device. Mark the device as exclusively
 * open by the holder and wake up all waiters for exclusive open to finish.
 */
static void bd_finish_claiming(struct block_device *bdev, void *holder,
		const struct blk_holder_ops *hops)
{
	struct block_device *whole = bdev_whole(bdev);

	mutex_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, holder, hops));
	/*
	 * Note that for a whole device bd_holders will be incremented twice,
	 * and bd_holder will be set to bd_may_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_may_claim;
	bdev->bd_holders++;
	mutex_lock(&bdev->bd_holder_lock);
	bdev->bd_holder = holder;
	bdev->bd_holder_ops = hops;
	mutex_unlock(&bdev->bd_holder_lock);
	bd_clear_claiming(whole, holder);
	mutex_unlock(&bdev_lock);
}

/**
 * bd_abort_claiming - abort claiming of a block device
 * @bdev: block device of interest
 * @holder: holder that has claimed @bdev
 *
 * Abort claiming of a block device when the exclusive open failed. This can be
 * also used when exclusive open is not actually desired and we just needed
 * to block other exclusive openers for a while.
 */
void bd_abort_claiming(struct block_device *bdev, void *holder)
{
	mutex_lock(&bdev_lock);
	bd_clear_claiming(bdev_whole(bdev), holder);
	mutex_unlock(&bdev_lock);
}
EXPORT_SYMBOL(bd_abort_claiming);
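
/*
 * Sketch of the prepare/abort pairing described above: block other
 * exclusive openers for a while without keeping a claim, the way
 * truncate_bdev_range() in this file uses it. Any unique pointer works as
 * the holder cookie; truncate_bdev_range() simply passes its own function
 * pointer.
 *
 *	if (!bd_prepare_to_claim(bdev, holder_cookie, NULL)) {
 *		// exclusive openers now wait on bd_claiming
 *		do_work_that_must_not_race_an_exclusive_open();
 *		bd_abort_claiming(bdev, holder_cookie);
 *	}
 */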

static void bd_end_claim(struct block_device *bdev, void *holder)
{
	struct block_device *whole = bdev_whole(bdev);
	bool unblock = false;

	/*
	 * Release a claim on the device. The holder fields are protected with
	 * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
	 */
	mutex_lock(&bdev_lock);
	WARN_ON_ONCE(bdev->bd_holder != holder);
	WARN_ON_ONCE(--bdev->bd_holders < 0);
	WARN_ON_ONCE(--whole->bd_holders < 0);
	if (!bdev->bd_holders) {
		mutex_lock(&bdev->bd_holder_lock);
		bdev->bd_holder = NULL;
		bdev->bd_holder_ops = NULL;
		mutex_unlock(&bdev->bd_holder_lock);
		if (bdev->bd_write_holder)
			unblock = true;
	}
	if (!whole->bd_holders)
		whole->bd_holder = NULL;
	mutex_unlock(&bdev_lock);

	/*
	 * If this was the last claim, remove holder link and unblock event
	 * polling if it was a write holder.
	 */
	if (unblock) {
		disk_unblock_events(bdev->bd_disk);
		bdev->bd_write_holder = false;
	}
}

static void blkdev_flush_mapping(struct block_device *bdev)
{
	WARN_ON_ONCE(bdev->bd_holders);
	sync_blockdev(bdev);
	kill_bdev(bdev);
	bdev_write_inode(bdev);
}

static void blkdev_put_whole(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_openers))
		blkdev_flush_mapping(bdev);
	if (bdev->bd_disk->fops->release)
		bdev->bd_disk->fops->release(bdev->bd_disk);
}

static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (disk->fops->open) {
		ret = disk->fops->open(disk, mode);
		if (ret) {
			/* avoid ghost partitions on a removed medium */
			if (ret == -ENOMEDIUM &&
			    test_bit(GD_NEED_PART_SCAN, &disk->state))
				bdev_disk_changed(disk, true);
			return ret;
		}
	}

	if (!atomic_read(&bdev->bd_openers))
		set_init_blocksize(bdev);
	atomic_inc(&bdev->bd_openers);
	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
		/*
		 * Only return scanning errors if we are called from contexts
		 * that explicitly want them, e.g. the BLKRRPART ioctl.
		 */
		ret = bdev_disk_changed(disk, false);
		if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
			blkdev_put_whole(bdev);
			return ret;
		}
	}
	return 0;
}

static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
	struct gendisk *disk = part->bd_disk;
	int ret;

	ret = blkdev_get_whole(bdev_whole(part), mode);
	if (ret)
		return ret;

	ret = -ENXIO;
	if (!bdev_nr_sectors(part))
		goto out_blkdev_put;

	if (!atomic_read(&part->bd_openers)) {
		disk->open_partitions++;
		set_init_blocksize(part);
	}
	atomic_inc(&part->bd_openers);
	return 0;

out_blkdev_put:
	blkdev_put_whole(bdev_whole(part));
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires an exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device, so passing an error pointer as
	 * @holder makes no sense anyway.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);

	if (atomic_dec_and_test(&part->bd_openers)) {
		blkdev_flush_mapping(part);
		whole->bd_disk->open_partitions--;
	}
	blkdev_put_whole(whole);
}

struct block_device *blkdev_get_no_open(dev_t dev)
{
	struct block_device *bdev;
	struct inode *inode;

	inode = ilookup(blockdev_superblock, dev);
	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
		blk_request_module(dev);
		inode = ilookup(blockdev_superblock, dev);
		if (inode)
			pr_warn_ratelimited(
"block device autoloading is deprecated and will be removed.\n");
	}
	if (!inode)
		return NULL;

	/* switch from the inode reference to a device model reference: */
	bdev = &BDEV_I(inode)->bdev;
	if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
		bdev = NULL;
	iput(inode);
	return bdev;
}

void blkdev_put_no_open(struct block_device *bdev)
{
	put_device(&bdev->bd_device);
}

static bool bdev_writes_blocked(struct block_device *bdev)
{
	return bdev->bd_writers < 0;
}

static void bdev_block_writes(struct block_device *bdev)
{
	bdev->bd_writers--;
}

static void bdev_unblock_writes(struct block_device *bdev)
{
	bdev->bd_writers++;
}

static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return true;
	/* Writes blocked? */
	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
		return false;
	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
		return false;
	return true;
}

static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
{
	if (bdev_allow_write_mounted)
		return;

	/* Claim exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_block_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		bdev->bd_writers++;
}

static inline bool bdev_unclaimed(const struct file *bdev_file)
{
	return bdev_file->private_data == BDEV_I(bdev_file->f_mapping->host);
}

static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	if (bdev_unclaimed(bdev_file))
		return;

	bdev = file_bdev(bdev_file);

	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
		bdev_unblock_writes(bdev);
	else if (bdev_file->f_mode & FMODE_WRITE)
		bdev->bd_writers--;
}

/**
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access. Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * zero on success, -errno on failure.
 */
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	bool unblock_events = true;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);

	mutex_lock(&disk->open_mutex);
	ret = -ENXIO;
	if (!disk_live(disk))
		goto abort_claiming;
	if (!try_module_get(disk->fops->owner))
		goto abort_claiming;
	ret = -EBUSY;
	if (!bdev_may_open(bdev, mode))
		goto abort_claiming;
	if (bdev_is_partition(bdev))
		ret = blkdev_get_part(bdev, mode);
	else
		ret = blkdev_get_whole(bdev, mode);
	if (ret)
		goto put_module;
	bdev_claim_write_access(bdev, mode);
	if (holder) {
		bd_finish_claiming(bdev, holder, hops);

		/*
		 * Block event polling for write claims if requested. Any write
		 * holder makes the write_holder state stick until all are
		 * released. This is good enough and tracking individual
		 * writeable references is too fragile given the way @mode is
		 * used in blkdev_get/put().
		 */
		if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
			bdev->bd_write_holder = true;
			unblock_events = false;
		}
	}
	mutex_unlock(&disk->open_mutex);

	if (unblock_events)
		disk_unblock_events(disk);

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	if (mode & BLK_OPEN_RESTRICT_WRITES)
		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
	if (holder)
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
	return ret;
}

/*
 * BLK_OPEN_WRITE_IOCTL is a historical quirk of the floppy driver, which
 * allows ioctls if the file was opened for writing, but allows neither
 * reads nor writes. Make sure that this quirk is reflected in @f_flags.
 *
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(bdev->bd_inode);

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct file *file;
	dev_t dev;
	int error;

	error = lookup_bdev(path, &dev);
	if (error)
		return ERR_PTR(error);

	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return file;
}
EXPORT_SYMBOL(bdev_file_open_by_path);
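
/*
 * Typical exclusive open/close pairing for the file based API above, as a
 * filesystem-style caller might write it. The device path and the use of
 * "sb" as the holder cookie are assumptions of this sketch.
 *
 *	struct file *bdev_file;
 *
 *	bdev_file = bdev_file_open_by_path("/dev/vda1",
 *			BLK_OPEN_READ | BLK_OPEN_WRITE, sb, NULL);
 *	if (IS_ERR(bdev_file))
 *		return PTR_ERR(bdev_file);
 *	...
 *	bdev_fput(bdev_file);	// yields the claim, then puts the file
 */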

static inline void bd_yield_claim(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;

	lockdep_assert_held(&bdev->bd_disk->open_mutex);

	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
		return;

	if (!bdev_unclaimed(bdev_file))
		bd_end_claim(bdev, holder);
}

void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one. If someone else
	 * opens the block device between now and the decrement of bd_openers
	 * then we did a sync that we didn't need to, but that's not the end
	 * of the world and we want to avoid long (possibly several-minute)
	 * syncs while holding the mutex.
	 */
	if (atomic_read(&bdev->bd_openers) == 1)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev_file);

	if (holder)
		bd_yield_claim(bdev_file);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
	 * event. This is to ensure detection of media removal commanded
	 * from userland - e.g. eject(1).
	 */
	disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);

	if (bdev_is_partition(bdev))
		blkdev_put_part(bdev);
	else
		blkdev_put_whole(bdev);
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
}

/**
 * bdev_fput - yield claim to the block device and put the file
 * @bdev_file: open block device
 *
 * Yield claim on the block device and put the file. Ensure that the
 * block device can be reclaimed before the file is closed, which is a
 * deferred operation.
 */
void bdev_fput(struct file *bdev_file)
{
	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
		return;

	if (bdev_file->private_data) {
		struct block_device *bdev = file_bdev(bdev_file);
		struct gendisk *disk = bdev->bd_disk;

		mutex_lock(&disk->open_mutex);
		bdev_yield_write_access(bdev_file);
		bd_yield_claim(bdev_file);
		/*
		 * Tell release we already gave up our hold on the
		 * device and if write restrictions are available that
		 * we already gave up write access to the device.
		 */
		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
		mutex_unlock(&disk->open_mutex);
	}

	fput(bdev_file);
}
EXPORT_SYMBOL(bdev_fput);

/**
 * lookup_bdev() - Look up a struct block_device by name.
 * @pathname: Name of the block device in the filesystem.
 * @dev: Pointer to the block device's dev_t, if found.
 *
 * Lookup the block device's dev_t at @pathname in the current
 * namespace if possible and return it in @dev.
 *
 * Context: May sleep.
 * Return: 0 if succeeded, negative errno otherwise.
 */
int lookup_bdev(const char *pathname, dev_t *dev)
{
	struct inode *inode;
	struct path path;
	int error;

	if (!pathname || !*pathname)
		return -EINVAL;

	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
	if (error)
		return error;

	inode = d_backing_inode(path.dentry);
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto out_path_put;
	error = -EACCES;
	if (!may_open_dev(&path))
		goto out_path_put;

	*dev = inode->i_rdev;
	error = 0;
out_path_put:
	path_put(&path);
	return error;
}
EXPORT_SYMBOL(lookup_bdev);
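
/*
 * Usage sketch: resolving a device node path to a dev_t without opening
 * the device. The path is illustrative.
 *
 *	dev_t dev;
 *
 *	error = lookup_bdev("/dev/sda1", &dev);
 *	if (!error)
 *		pr_debug("resolved to %u:%u\n", MAJOR(dev), MINOR(dev));
 */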

/**
 * bdev_mark_dead - mark a block device as dead
 * @bdev: block device to operate on
 * @surprise: indicate a surprise removal
 *
 * Tell the file system that this device or its media is dead. If @surprise is
 * set to %true the device or media is already gone, if not we are preparing
 * for an orderly removal.
 *
 * This calls into the file system, which then typically syncs out all dirty
 * data and writes back inodes and then invalidates any cached data in the
 * inodes on the file system. In addition we also invalidate the block device
 * mapping.
 */
void bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	mutex_lock(&bdev->bd_holder_lock);
	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
		bdev->bd_holder_ops->mark_dead(bdev, surprise);
	else {
		mutex_unlock(&bdev->bd_holder_lock);
		sync_blockdev(bdev);
	}

	invalidate_bdev(bdev);
}
/*
 * New drivers should not use this directly. There are some drivers however
 * that need this for historical reasons. For example, the DASD driver has
 * historically had a shutdown to offline mode that doesn't actually remove the
 * gendisk that otherwise looks a lot like a safe device removal.
 */
EXPORT_SYMBOL_GPL(bdev_mark_dead);
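
/*
 * Hypothetical sketch of the holder-ops side consumed by bdev_freeze(),
 * bdev_thaw() and bdev_mark_dead(): a filesystem registers the ops when it
 * opens the device exclusively. myfs_freeze() and friends are assumed
 * callbacks, not symbols defined here.
 *
 *	static const struct blk_holder_ops myfs_holder_ops = {
 *		.freeze		= myfs_freeze,
 *		.thaw		= myfs_thaw,
 *		.mark_dead	= myfs_mark_dead,
 *	};
 *
 *	bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
 *					  sb, &myfs_holder_ops);
 */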

void sync_bdevs(bool wait)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&blockdev_superblock->s_inode_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;
		struct block_device *bdev;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&blockdev_superblock->s_inode_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * s_inode_list_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * s_inode_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;
		bdev = I_BDEV(inode);

		mutex_lock(&bdev->bd_disk->open_mutex);
		if (!atomic_read(&bdev->bd_openers)) {
			; /* skip */
		} else if (wait) {
			/*
			 * We keep the error status of individual mappings so
			 * that applications can catch the writeback error using
			 * fsync(2). See filemap_fdatawait_keep_errors() for
			 * details.
			 */
			filemap_fdatawait_keep_errors(inode->i_mapping);
		} else {
			filemap_fdatawrite(inode->i_mapping);
		}
		mutex_unlock(&bdev->bd_disk->open_mutex);

		spin_lock(&blockdev_superblock->s_inode_list_lock);
	}
	spin_unlock(&blockdev_superblock->s_inode_list_lock);
	iput(old_inode);
}

/*
 * Handle STATX_DIOALIGN for block devices.
 *
 * Note that the inode passed to this is the inode of a block device node file,
 * not the block device's internal inode. Therefore it is *not* valid to use
 * I_BDEV() here; the block device has to be looked up by i_rdev instead.
 */
void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
{
	struct block_device *bdev;

	bdev = blkdev_get_no_open(inode->i_rdev);
	if (!bdev)
		return;

	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
	stat->dio_offset_align = bdev_logical_block_size(bdev);
	stat->result_mask |= STATX_DIOALIGN;

	blkdev_put_no_open(bdev);
}

static int __init setup_bdev_allow_write_mounted(char *str)
{
	if (kstrtobool(str, &bdev_allow_write_mounted))
		pr_warn("Invalid option string for bdev_allow_write_mounted:"
			" '%s'\n", str);
	return 1;
}
__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);