bdev.c source code [linux/block/bdev.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 1991, 1992 Linus Torvalds
4	* Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
5	* Copyright (C) 2016 - 2020 Christoph Hellwig
6	*/
7
8	#include <linux/init.h>
9	#include <linux/mm.h>
10	#include <linux/slab.h>
11	#include <linux/kmod.h>
12	#include <linux/major.h>
13	#include <linux/device_cgroup.h>
14	#include <linux/blkdev.h>
15	#include <linux/blk-integrity.h>
16	#include <linux/backing-dev.h>
17	#include <linux/module.h>
18	#include <linux/blkpg.h>
19	#include <linux/magic.h>
20	#include <linux/buffer_head.h>
21	#include <linux/swap.h>
22	#include <linux/writeback.h>
23	#include <linux/mount.h>
24	#include <linux/pseudo_fs.h>
25	#include <linux/uio.h>
26	#include <linux/namei.h>
27	#include <linux/part_stat.h>
28	#include <linux/uaccess.h>
29	#include <linux/stat.h>
30	#include "../fs/internal.h"
31	#include "blk.h"
32
33	/ Should we allow writing to mounted block devices? /
34	static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
35
36	struct bdev_inode {
37	struct block_device bdev;
38	struct inode vfs_inode;
39	};
40
41	static inline struct bdev_inode BDEV_I(struct* inode *inode)
42	{
43	return container_of(inode, struct bdev_inode, vfs_inode);
44	}
45
46	struct block_device I_BDEV(struct* inode *inode)
47	{
48	return &BDEV_I(inode)->bdev;
49	}
50	EXPORT_SYMBOL(I_BDEV);
51
52	struct block_device file_bdev(struct* file *bdev_file)
53	{
54	return I_BDEV(bdev_file->f_mapping->host);
55	}
56	EXPORT_SYMBOL(file_bdev);
57
58	static void bdev_write_inode(struct block_device *bdev)
59	{
60	struct inode *inode = bdev->bd_inode;
61	int ret;
62
63	spin_lock(lock: &inode->i_lock);
64	while (inode->i_state & I_DIRTY) {
65	spin_unlock(lock: &inode->i_lock);
66	ret = write_inode_now(inode, sync: true);
67	if (ret)
68	pr_warn_ratelimited(
69	"VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
70	bdev, ret);
71	spin_lock(lock: &inode->i_lock);
72	}
73	spin_unlock(lock: &inode->i_lock);
74	}
75
76	/ Kill _all_ buffers and pagecache , dirty or not.. /
77	static void kill_bdev(struct block_device *bdev)
78	{
79	struct address_space *mapping = bdev->bd_inode->i_mapping;
80
81	if (mapping_empty(mapping))
82	return;
83
84	invalidate_bh_lrus();
85	truncate_inode_pages(mapping, `0`);
86	}
87
88	/ Invalidate clean unused buffers and pagecache. /
89	void invalidate_bdev(struct block_device *bdev)
90	{
91	struct address_space *mapping = bdev->bd_inode->i_mapping;
92
93	if (mapping->nrpages) {
94	invalidate_bh_lrus();
95	lru_add_drain_all(); / make sure all lru add caches are flushed /
96	invalidate_mapping_pages(mapping, start: `0`, end: -`1`);
97	}
98	}
99	EXPORT_SYMBOL(invalidate_bdev);
100
101	/*
102	* Drop all buffers & page cache for given bdev range. This function bails
103	* with error if bdev has other exclusive owner (such as filesystem).
104	*/
105	int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
106	loff_t lstart, loff_t lend)
107	{
108	/*
109	* If we don't hold exclusive handle for the device, upgrade to it
110	* while we discard the buffer cache to avoid discarding buffers
111	* under live filesystem.
112	*/
113	if (!(mode & BLK_OPEN_EXCL)) {
114	int err = bd_prepare_to_claim(bdev, holder: truncate_bdev_range, NULL);
115	if (err)
116	goto invalidate;
117	}
118
119	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
120	if (!(mode & BLK_OPEN_EXCL))
121	bd_abort_claiming(bdev, holder: truncate_bdev_range);
122	return `0`;
123
124	invalidate:
125	/*
126	* Someone else has handle exclusively open. Try invalidating instead.
127	* The 'end' argument is inclusive so the rounding is safe.
128	*/
129	return invalidate_inode_pages2_range(mapping: bdev->bd_inode->i_mapping,
130	start: lstart >> PAGE_SHIFT,
131	end: lend >> PAGE_SHIFT);
132	}
133
134	static void set_init_blocksize(struct block_device *bdev)
135	{
136	unsigned int bsize = bdev_logical_block_size(bdev);
137	loff_t size = i_size_read(inode: bdev->bd_inode);
138
139	while (bsize < PAGE_SIZE) {
140	if (size & bsize)
141	break;
142	bsize <<= `1`;
143	}
144	bdev->bd_inode->i_blkbits = blksize_bits(size: bsize);
145	}
146
147	int set_blocksize(struct block_device bdev, int* size)
148	{
149	/ Size must be a power of two, and between 512 and PAGE_SIZE /
150	if (size > PAGE_SIZE \|\| size < `512` \|\| !is_power_of_2(n: size))
151	return -EINVAL;
152
153	/ Size cannot be smaller than the size supported by the device /
154	if (size < bdev_logical_block_size(bdev))
155	return -EINVAL;
156
157	/ Don't change the size if it is same as current /
158	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
159	sync_blockdev(bdev);
160	bdev->bd_inode->i_blkbits = blksize_bits(size);
161	kill_bdev(bdev);
162	}
163	return `0`;
164	}
165
166	EXPORT_SYMBOL(set_blocksize);
167
168	int sb_set_blocksize(struct super_block sb, int* size)
169	{
170	if (set_blocksize(sb->s_bdev, size))
171	return `0`;
172	/ If we get here, we know size is power of two*
173	* and it's value is between 512 and PAGE_SIZE */
174	sb->s_blocksize = size;
175	sb->s_blocksize_bits = blksize_bits(size);
176	return sb->s_blocksize;
177	}
178
179	EXPORT_SYMBOL(sb_set_blocksize);
180
181	int sb_min_blocksize(struct super_block sb, int* size)
182	{
183	int minsize = bdev_logical_block_size(bdev: sb->s_bdev);
184	if (size < minsize)
185	size = minsize;
186	return sb_set_blocksize(sb, size);
187	}
188
189	EXPORT_SYMBOL(sb_min_blocksize);
190
191	int sync_blockdev_nowait(struct block_device *bdev)
192	{
193	if (!bdev)
194	return `0`;
195	return filemap_flush(bdev->bd_inode->i_mapping);
196	}
197	EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
198
199	/*
200	* Write out and wait upon all the dirty data associated with a block
201	* device via its mapping. Does not take the superblock lock.
202	*/
203	int sync_blockdev(struct block_device *bdev)
204	{
205	if (!bdev)
206	return `0`;
207	return filemap_write_and_wait(mapping: bdev->bd_inode->i_mapping);
208	}
209	EXPORT_SYMBOL(sync_blockdev);
210
211	int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
212	{
213	return filemap_write_and_wait_range(mapping: bdev->bd_inode->i_mapping,
214	lstart, lend);
215	}
216	EXPORT_SYMBOL(sync_blockdev_range);
217
218	/**
219	* bdev_freeze - lock a filesystem and force it into a consistent state
220	* @bdev: blockdevice to lock
221	*
222	* If a superblock is found on this device, we take the s_umount semaphore
223	* on it to make sure nobody unmounts until the snapshot creation is done.
224	* The reference counter (bd_fsfreeze_count) guarantees that only the last
225	* unfreeze process can unfreeze the frozen filesystem actually when multiple
226	* freeze requests arrive simultaneously. It counts up in bdev_freeze() and
227	* count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze
228	* actually.
229	*
230	* Return: On success zero is returned, negative error code on failure.
231	*/
232	int bdev_freeze(struct block_device *bdev)
233	{
234	int error = `0`;
235
236	mutex_lock(&bdev->bd_fsfreeze_mutex);
237
238	if (atomic_inc_return(v: &bdev->bd_fsfreeze_count) > `1`) {
239	mutex_unlock(lock: &bdev->bd_fsfreeze_mutex);
240	return `0`;
241	}
242
243	mutex_lock(&bdev->bd_holder_lock);
244	if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
245	error = bdev->bd_holder_ops->freeze(bdev);
246	lockdep_assert_not_held(&bdev->bd_holder_lock);
247	} else {
248	mutex_unlock(lock: &bdev->bd_holder_lock);
249	error = sync_blockdev(bdev);
250	}
251
252	if (error)
253	atomic_dec(v: &bdev->bd_fsfreeze_count);
254
255	mutex_unlock(lock: &bdev->bd_fsfreeze_mutex);
256	return error;
257	}
258	EXPORT_SYMBOL(bdev_freeze);
259
260	/**
261	* bdev_thaw - unlock filesystem
262	* @bdev: blockdevice to unlock
263	*
264	* Unlocks the filesystem and marks it writeable again after bdev_freeze().
265	*
266	* Return: On success zero is returned, negative error code on failure.
267	*/
268	int bdev_thaw(struct block_device *bdev)
269	{
270	int error = -EINVAL, nr_freeze;
271
272	mutex_lock(&bdev->bd_fsfreeze_mutex);
273
274	/*
275	* If this returns < 0 it means that @bd_fsfreeze_count was
276	* already 0 and no decrement was performed.
277	*/
278	nr_freeze = atomic_dec_if_positive(v: &bdev->bd_fsfreeze_count);
279	if (nr_freeze < `0`)
280	goto out;
281
282	error = `0`;
283	if (nr_freeze > `0`)
284	goto out;
285
286	mutex_lock(&bdev->bd_holder_lock);
287	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
288	error = bdev->bd_holder_ops->thaw(bdev);
289	lockdep_assert_not_held(&bdev->bd_holder_lock);
290	} else {
291	mutex_unlock(lock: &bdev->bd_holder_lock);
292	}
293
294	if (error)
295	atomic_inc(v: &bdev->bd_fsfreeze_count);
296	out:
297	mutex_unlock(lock: &bdev->bd_fsfreeze_mutex);
298	return error;
299	}
300	EXPORT_SYMBOL(bdev_thaw);
301
302	/*
303	* pseudo-fs
304	*/
305
306	static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
307	static struct kmem_cache *bdev_cachep __ro_after_init;
308
309	static struct inode bdev_alloc_inode(struct* super_block *sb)
310	{
311	struct bdev_inode *ei = alloc_inode_sb(sb, cache: bdev_cachep, GFP_KERNEL);
312
313	if (!ei)
314	return NULL;
315	memset(&ei->bdev, `0`, sizeof(ei->bdev));
316	return &ei->vfs_inode;
317	}
318
319	static void bdev_free_inode(struct inode *inode)
320	{
321	struct block_device *bdev = I_BDEV(inode);
322
323	free_percpu(pdata: bdev->bd_stats);
324	kfree(objp: bdev->bd_meta_info);
325
326	if (!bdev_is_partition(bdev)) {
327	if (bdev->bd_disk && bdev->bd_disk->bdi)
328	bdi_put(bdi: bdev->bd_disk->bdi);
329	kfree(objp: bdev->bd_disk);
330	}
331
332	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
333	blk_free_ext_minor(MINOR(bdev->bd_dev));
334
335	kmem_cache_free(s: bdev_cachep, objp: BDEV_I(inode));
336	}
337
338	static void init_once(void *data)
339	{
340	struct bdev_inode *ei = data;
341
342	inode_init_once(&ei->vfs_inode);
343	}
344
345	static void bdev_evict_inode(struct inode *inode)
346	{
347	truncate_inode_pages_final(&inode->i_data);
348	invalidate_inode_buffers(inode); / is it needed here? /
349	clear_inode(inode);
350	}
351
352	static const struct super_operations bdev_sops = {
353	.statfs = simple_statfs,
354	.alloc_inode = bdev_alloc_inode,
355	.free_inode = bdev_free_inode,
356	.drop_inode = generic_delete_inode,
357	.evict_inode = bdev_evict_inode,
358	};
359
360	static int bd_init_fs_context(struct fs_context *fc)
361	{
362	struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
363	if (!ctx)
364	return -ENOMEM;
365	fc->s_iflags \|= SB_I_CGROUPWB;
366	ctx->ops = &bdev_sops;
367	return `0`;
368	}
369
370	static struct file_system_type bd_type = {
371	.name = "bdev",
372	.init_fs_context = bd_init_fs_context,
373	.kill_sb = kill_anon_super,
374	};
375
376	struct super_block *blockdev_superblock __ro_after_init;
377	struct vfsmount *blockdev_mnt __ro_after_init;
378	EXPORT_SYMBOL_GPL(blockdev_superblock);
379
380	void __init bdev_cache_init(void)
381	{
382	int err;
383
384	bdev_cachep = kmem_cache_create(name: "bdev_cache", size: sizeof(struct bdev_inode),
385	align: `0`, flags: (SLAB_HWCACHE_ALIGN\|SLAB_RECLAIM_ACCOUNT\|
386	SLAB_ACCOUNT\|SLAB_PANIC),
387	ctor: init_once);
388	err = register_filesystem(&bd_type);
389	if (err)
390	panic(fmt: "Cannot register bdev pseudo-fs");
391	blockdev_mnt = kern_mount(&bd_type);
392	if (IS_ERR(ptr: blockdev_mnt))
393	panic(fmt: "Cannot create bdev pseudo-fs");
394	blockdev_superblock = blockdev_mnt->mnt_sb; / For writeback /
395	}
396
397	struct block_device bdev_alloc(struct* gendisk *disk, u8 partno)
398	{
399	struct block_device *bdev;
400	struct inode *inode;
401
402	inode = new_inode(sb: blockdev_superblock);
403	if (!inode)
404	return NULL;
405	inode->i_mode = S_IFBLK;
406	inode->i_rdev = `0`;
407	inode->i_data.a_ops = &def_blk_aops;
408	mapping_set_gfp_mask(m: &inode->i_data, GFP_USER);
409
410	bdev = I_BDEV(inode);
411	mutex_init(&bdev->bd_fsfreeze_mutex);
412	spin_lock_init(&bdev->bd_size_lock);
413	mutex_init(&bdev->bd_holder_lock);
414	bdev->bd_partno = partno;
415	bdev->bd_inode = inode;
416	bdev->bd_queue = disk->queue;
417	if (partno)
418	bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
419	else
420	bdev->bd_has_submit_bio = false;
421	bdev->bd_stats = alloc_percpu(struct disk_stats);
422	if (!bdev->bd_stats) {
423	iput(inode);
424	return NULL;
425	}
426	bdev->bd_disk = disk;
427	return bdev;
428	}
429
430	void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
431	{
432	spin_lock(lock: &bdev->bd_size_lock);
433	i_size_write(inode: bdev->bd_inode, i_size: (loff_t)sectors << SECTOR_SHIFT);
434	bdev->bd_nr_sectors = sectors;
435	spin_unlock(lock: &bdev->bd_size_lock);
436	}
437
438	void bdev_add(struct block_device *bdev, dev_t dev)
439	{
440	if (bdev_stable_writes(bdev))
441	mapping_set_stable_writes(mapping: bdev->bd_inode->i_mapping);
442	bdev->bd_dev = dev;
443	bdev->bd_inode->i_rdev = dev;
444	bdev->bd_inode->i_ino = dev;
445	insert_inode_hash(inode: bdev->bd_inode);
446	}
447
448	long nr_blockdev_pages(void)
449	{
450	struct inode *inode;
451	long ret = `0`;
452
453	spin_lock(lock: &blockdev_superblock->s_inode_list_lock);
454	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
455	ret += inode->i_mapping->nrpages;
456	spin_unlock(lock: &blockdev_superblock->s_inode_list_lock);
457
458	return ret;
459	}
460
461	/**
462	* bd_may_claim - test whether a block device can be claimed
463	* @bdev: block device of interest
464	* @holder: holder trying to claim @bdev
465	* @hops: holder ops
466	*
467	* Test whether @bdev can be claimed by @holder.
468	*
469	* RETURNS:
470	* %true if @bdev can be claimed, %false otherwise.
471	*/
472	static bool bd_may_claim(struct block_device bdev, void* *holder,
473	const struct blk_holder_ops *hops)
474	{
475	struct block_device *whole = bdev_whole(bdev);
476
477	lockdep_assert_held(&bdev_lock);
478
479	if (bdev->bd_holder) {
480	/*
481	* The same holder can always re-claim.
482	*/
483	if (bdev->bd_holder == holder) {
484	if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
485	return false;
486	return true;
487	}
488	return false;
489	}
490
491	/*
492	* If the whole devices holder is set to bd_may_claim, a partition on
493	* the device is claimed, but not the whole device.
494	*/
495	if (whole != bdev &&
496	whole->bd_holder && whole->bd_holder != bd_may_claim)
497	return false;
498	return true;
499	}
500
501	/**
502	* bd_prepare_to_claim - claim a block device
503	* @bdev: block device of interest
504	* @holder: holder trying to claim @bdev
505	* @hops: holder ops.
506	*
507	* Claim @bdev. This function fails if @bdev is already claimed by another
508	* holder and waits if another claiming is in progress. return, the caller
509	* has ownership of bd_claiming and bd_holder[s].
510	*
511	* RETURNS:
512	* 0 if @bdev can be claimed, -EBUSY otherwise.
513	*/
514	int bd_prepare_to_claim(struct block_device bdev, void* *holder,
515	const struct blk_holder_ops *hops)
516	{
517	struct block_device *whole = bdev_whole(bdev);
518
519	if (WARN_ON_ONCE(!holder))
520	return -EINVAL;
521	retry:
522	mutex_lock(&bdev_lock);
523	/ if someone else claimed, fail /
524	if (!bd_may_claim(bdev, holder, hops)) {
525	mutex_unlock(lock: &bdev_lock);
526	return -EBUSY;
527	}
528
529	/ if claiming is already in progress, wait for it to finish /
530	if (whole->bd_claiming) {
531	wait_queue_head_t *wq = bit_waitqueue(word: &whole->bd_claiming, bit: `0`);
532	DEFINE_WAIT(wait);
533
534	prepare_to_wait(wq_head: wq, wq_entry: &wait, TASK_UNINTERRUPTIBLE);
535	mutex_unlock(lock: &bdev_lock);
536	schedule();
537	finish_wait(wq_head: wq, wq_entry: &wait);
538	goto retry;
539	}
540
541	/ yay, all mine /
542	whole->bd_claiming = holder;
543	mutex_unlock(lock: &bdev_lock);
544	return `0`;
545	}
546	EXPORT_SYMBOL_GPL(bd_prepare_to_claim); / only for the loop driver /
547
548	static void bd_clear_claiming(struct block_device whole, void* *holder)
549	{
550	lockdep_assert_held(&bdev_lock);
551	/ tell others that we're done /
552	BUG_ON(whole->bd_claiming != holder);
553	whole->bd_claiming = NULL;
554	wake_up_bit(word: &whole->bd_claiming, bit: `0`);
555	}
556
557	/**
558	* bd_finish_claiming - finish claiming of a block device
559	* @bdev: block device of interest
560	* @holder: holder that has claimed @bdev
561	* @hops: block device holder operations
562	*
563	* Finish exclusive open of a block device. Mark the device as exlusively
564	* open by the holder and wake up all waiters for exclusive open to finish.
565	*/
566	static void bd_finish_claiming(struct block_device bdev, void* *holder,
567	const struct blk_holder_ops *hops)
568	{
569	struct block_device *whole = bdev_whole(bdev);
570
571	mutex_lock(&bdev_lock);
572	BUG_ON(!bd_may_claim(bdev, holder, hops));
573	/*
574	* Note that for a whole device bd_holders will be incremented twice,
575	* and bd_holder will be set to bd_may_claim before being set to holder
576	*/
577	whole->bd_holders++;
578	whole->bd_holder = bd_may_claim;
579	bdev->bd_holders++;
580	mutex_lock(&bdev->bd_holder_lock);
581	bdev->bd_holder = holder;
582	bdev->bd_holder_ops = hops;
583	mutex_unlock(lock: &bdev->bd_holder_lock);
584	bd_clear_claiming(whole, holder);
585	mutex_unlock(lock: &bdev_lock);
586	}
587
588	/**
589	* bd_abort_claiming - abort claiming of a block device
590	* @bdev: block device of interest
591	* @holder: holder that has claimed @bdev
592	*
593	* Abort claiming of a block device when the exclusive open failed. This can be
594	* also used when exclusive open is not actually desired and we just needed
595	* to block other exclusive openers for a while.
596	*/
597	void bd_abort_claiming(struct block_device bdev, void* *holder)
598	{
599	mutex_lock(&bdev_lock);
600	bd_clear_claiming(bdev_whole(bdev), holder);
601	mutex_unlock(lock: &bdev_lock);
602	}
603	EXPORT_SYMBOL(bd_abort_claiming);
604
605	static void bd_end_claim(struct block_device bdev, void* *holder)
606	{
607	struct block_device *whole = bdev_whole(bdev);
608	bool unblock = false;
609
610	/*
611	* Release a claim on the device. The holder fields are protected with
612	* bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
613	*/
614	mutex_lock(&bdev_lock);
615	WARN_ON_ONCE(bdev->bd_holder != holder);
616	WARN_ON_ONCE(--bdev->bd_holders < `0`);
617	WARN_ON_ONCE(--whole->bd_holders < `0`);
618	if (!bdev->bd_holders) {
619	mutex_lock(&bdev->bd_holder_lock);
620	bdev->bd_holder = NULL;
621	bdev->bd_holder_ops = NULL;
622	mutex_unlock(lock: &bdev->bd_holder_lock);
623	if (bdev->bd_write_holder)
624	unblock = true;
625	}
626	if (!whole->bd_holders)
627	whole->bd_holder = NULL;
628	mutex_unlock(lock: &bdev_lock);
629
630	/*
631	* If this was the last claim, remove holder link and unblock evpoll if
632	* it was a write holder.
633	*/
634	if (unblock) {
635	disk_unblock_events(disk: bdev->bd_disk);
636	bdev->bd_write_holder = false;
637	}
638	}
639
640	static void blkdev_flush_mapping(struct block_device *bdev)
641	{
642	WARN_ON_ONCE(bdev->bd_holders);
643	sync_blockdev(bdev);
644	kill_bdev(bdev);
645	bdev_write_inode(bdev);
646	}
647
648	static void blkdev_put_whole(struct block_device *bdev)
649	{
650	if (atomic_dec_and_test(v: &bdev->bd_openers))
651	blkdev_flush_mapping(bdev);
652	if (bdev->bd_disk->fops->release)
653	bdev->bd_disk->fops->release(bdev->bd_disk);
654	}
655
656	static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
657	{
658	struct gendisk *disk = bdev->bd_disk;
659	int ret;
660
661	if (disk->fops->open) {
662	ret = disk->fops->open(disk, mode);
663	if (ret) {
664	/ avoid ghost partitions on a removed medium /
665	if (ret == -ENOMEDIUM &&
666	test_bit(GD_NEED_PART_SCAN, &disk->state))
667	bdev_disk_changed(disk, invalidate: true);
668	return ret;
669	}
670	}
671
672	if (!atomic_read(v: &bdev->bd_openers))
673	set_init_blocksize(bdev);
674	atomic_inc(v: &bdev->bd_openers);
675	if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
676	/*
677	* Only return scanning errors if we are called from contexts
678	* that explicitly want them, e.g. the BLKRRPART ioctl.
679	*/
680	ret = bdev_disk_changed(disk, invalidate: false);
681	if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
682	blkdev_put_whole(bdev);
683	return ret;
684	}
685	}
686	return `0`;
687	}
688
689	static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
690	{
691	struct gendisk *disk = part->bd_disk;
692	int ret;
693
694	ret = blkdev_get_whole(bdev_whole(part), mode);
695	if (ret)
696	return ret;
697
698	ret = -ENXIO;
699	if (!bdev_nr_sectors(bdev: part))
700	goto out_blkdev_put;
701
702	if (!atomic_read(v: &part->bd_openers)) {
703	disk->open_partitions++;
704	set_init_blocksize(part);
705	}
706	atomic_inc(v: &part->bd_openers);
707	return `0`;
708
709	out_blkdev_put:
710	blkdev_put_whole(bdev_whole(part));
711	return ret;
712	}
713
714	int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
715	{
716	int ret;
717
718	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
719	MAJOR(dev), MINOR(dev),
720	access: ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : `0`) \|
721	((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : `0`));
722	if (ret)
723	return ret;
724
725	/ Blocking writes requires exclusive opener /
726	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
727	return -EINVAL;
728
729	/*
730	* We're using error pointers to indicate to ->release() when we
731	* failed to open that block device. Also this doesn't make sense.
732	*/
733	if (WARN_ON_ONCE(IS_ERR(holder)))
734	return -EINVAL;
735
736	return `0`;
737	}
738
739	static void blkdev_put_part(struct block_device *part)
740	{
741	struct block_device *whole = bdev_whole(part);
742
743	if (atomic_dec_and_test(v: &part->bd_openers)) {
744	blkdev_flush_mapping(bdev: part);
745	whole->bd_disk->open_partitions--;
746	}
747	blkdev_put_whole(bdev: whole);
748	}
749
750	struct block_device *blkdev_get_no_open(dev_t dev)
751	{
752	struct block_device *bdev;
753	struct inode *inode;
754
755	inode = ilookup(sb: blockdev_superblock, ino: dev);
756	if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
757	blk_request_module(devt: dev);
758	inode = ilookup(sb: blockdev_superblock, ino: dev);
759	if (inode)
760	pr_warn_ratelimited(
761	"block device autoloading is deprecated and will be removed.\n");
762	}
763	if (!inode)
764	return NULL;
765
766	/ switch from the inode reference to a device mode one: /
767	bdev = &BDEV_I(inode)->bdev;
768	if (!kobject_get_unless_zero(kobj: &bdev->bd_device.kobj))
769	bdev = NULL;
770	iput(inode);
771	return bdev;
772	}
773
774	void blkdev_put_no_open(struct block_device *bdev)
775	{
776	put_device(dev: &bdev->bd_device);
777	}
778
779	static bool bdev_writes_blocked(struct block_device *bdev)
780	{
781	return bdev->bd_writers < `0`;
782	}
783
784	static void bdev_block_writes(struct block_device *bdev)
785	{
786	bdev->bd_writers--;
787	}
788
789	static void bdev_unblock_writes(struct block_device *bdev)
790	{
791	bdev->bd_writers++;
792	}
793
794	static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
795	{
796	if (bdev_allow_write_mounted)
797	return true;
798	/ Writes blocked? /
799	if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
800	return false;
801	if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > `0`)
802	return false;
803	return true;
804	}
805
806	static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
807	{
808	if (bdev_allow_write_mounted)
809	return;
810
811	/ Claim exclusive or shared write access. /
812	if (mode & BLK_OPEN_RESTRICT_WRITES)
813	bdev_block_writes(bdev);
814	else if (mode & BLK_OPEN_WRITE)
815	bdev->bd_writers++;
816	}
817
818	static inline bool bdev_unclaimed(const struct file *bdev_file)
819	{
820	return bdev_file->private_data == BDEV_I(inode: bdev_file->f_mapping->host);
821	}
822
823	static void bdev_yield_write_access(struct file *bdev_file)
824	{
825	struct block_device *bdev;
826
827	if (bdev_allow_write_mounted)
828	return;
829
830	if (bdev_unclaimed(bdev_file))
831	return;
832
833	bdev = file_bdev(bdev_file);
834
835	if (bdev_file->f_mode & FMODE_WRITE_RESTRICTED)
836	bdev_unblock_writes(bdev);
837	else if (bdev_file->f_mode & FMODE_WRITE)
838	bdev->bd_writers--;
839	}
840
841	/**
842	* bdev_open - open a block device
843	* @bdev: block device to open
844	* @mode: open mode (BLK_OPEN_*)
845	* @holder: exclusive holder identifier
846	* @hops: holder operations
847	* @bdev_file: file for the block device
848	*
849	* Open the block device. If @holder is not %NULL, the block device is opened
850	* with exclusive access. Exclusive opens may nest for the same @holder.
851	*
852	* CONTEXT:
853	* Might sleep.
854	*
855	* RETURNS:
856	* zero on success, -errno on failure.
857	*/
858	int bdev_open(struct block_device bdev, blk_mode_t mode, void* *holder,
859	const struct blk_holder_ops hops, struct* file *bdev_file)
860	{
861	bool unblock_events = true;
862	struct gendisk *disk = bdev->bd_disk;
863	int ret;
864
865	if (holder) {
866	mode \|= BLK_OPEN_EXCL;
867	ret = bd_prepare_to_claim(bdev, holder, hops);
868	if (ret)
869	return ret;
870	} else {
871	if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
872	return -EIO;
873	}
874
875	disk_block_events(disk);
876
877	mutex_lock(&disk->open_mutex);
878	ret = -ENXIO;
879	if (!disk_live(disk))
880	goto abort_claiming;
881	if (!try_module_get(module: disk->fops->owner))
882	goto abort_claiming;
883	ret = -EBUSY;
884	if (!bdev_may_open(bdev, mode))
885	goto abort_claiming;
886	if (bdev_is_partition(bdev))
887	ret = blkdev_get_part(part: bdev, mode);
888	else
889	ret = blkdev_get_whole(bdev, mode);
890	if (ret)
891	goto put_module;
892	bdev_claim_write_access(bdev, mode);
893	if (holder) {
894	bd_finish_claiming(bdev, holder, hops);
895
896	/*
897	* Block event polling for write claims if requested. Any write
898	* holder makes the write_holder state stick until all are
899	* released. This is good enough and tracking individual
900	* writeable reference is too fragile given the way @mode is
901	* used in blkdev_get/put().
902	*/
903	if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
904	(disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
905	bdev->bd_write_holder = true;
906	unblock_events = false;
907	}
908	}
909	mutex_unlock(lock: &disk->open_mutex);
910
911	if (unblock_events)
912	disk_unblock_events(disk);
913
914	bdev_file->f_flags \|= O_LARGEFILE;
915	bdev_file->f_mode \|= FMODE_BUF_RASYNC \| FMODE_CAN_ODIRECT;
916	if (bdev_nowait(bdev))
917	bdev_file->f_mode \|= FMODE_NOWAIT;
918	if (mode & BLK_OPEN_RESTRICT_WRITES)
919	bdev_file->f_mode \|= FMODE_WRITE_RESTRICTED;
920	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
921	bdev_file->f_wb_err = filemap_sample_wb_err(mapping: bdev_file->f_mapping);
922	bdev_file->private_data = holder;
923
924	return `0`;
925	put_module:
926	module_put(module: disk->fops->owner);
927	abort_claiming:
928	if (holder)
929	bd_abort_claiming(bdev, holder);
930	mutex_unlock(lock: &disk->open_mutex);
931	disk_unblock_events(disk);
932	return ret;
933	}
934
935	/*
936	* If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
937	* associated with the floppy driver where it has allowed ioctls if the
938	* file was opened for writing, but does not allow reads or writes.
939	* Make sure that this quirk is reflected in @f_flags.
940	*
941	* It can also happen if a block device is opened as O_RDWR \| O_WRONLY.
942	*/
943	static unsigned blk_to_file_flags(blk_mode_t mode)
944	{
945	unsigned int flags = `0`;
946
947	if ((mode & (BLK_OPEN_READ \| BLK_OPEN_WRITE)) ==
948	(BLK_OPEN_READ \| BLK_OPEN_WRITE))
949	flags \|= O_RDWR;
950	else if (mode & BLK_OPEN_WRITE_IOCTL)
951	flags \|= O_RDWR \| O_WRONLY;
952	else if (mode & BLK_OPEN_WRITE)
953	flags \|= O_WRONLY;
954	else if (mode & BLK_OPEN_READ)
955	flags \|= O_RDONLY; / homeopathic, because O_RDONLY is 0 /
956	else
957	WARN_ON_ONCE(true);
958
959	if (mode & BLK_OPEN_NDELAY)
960	flags \|= O_NDELAY;
961
962	return flags;
963	}
964
965	struct file bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void* *holder,
966	const struct blk_holder_ops *hops)
967	{
968	struct file *bdev_file;
969	struct block_device *bdev;
970	unsigned int flags;
971	int ret;
972
973	ret = bdev_permission(dev, mode, holder);
974	if (ret)
975	return ERR_PTR(error: ret);
976
977	bdev = blkdev_get_no_open(dev);
978	if (!bdev)
979	return ERR_PTR(error: -ENXIO);
980
981	flags = blk_to_file_flags(mode);
982	bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
983	blockdev_mnt, "", flags: flags \| O_LARGEFILE, &def_blk_fops);
984	if (IS_ERR(ptr: bdev_file)) {
985	blkdev_put_no_open(bdev);
986	return bdev_file;
987	}
988	ihold(inode: bdev->bd_inode);
989
990	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
991	if (ret) {
992	/ We failed to open the block device. Let ->release() know. /
993	bdev_file->private_data = ERR_PTR(error: ret);
994	fput(bdev_file);
995	return ERR_PTR(error: ret);
996	}
997	return bdev_file;
998	}
999	EXPORT_SYMBOL(bdev_file_open_by_dev);
1000
1001	struct file bdev_file_open_by_path(const* char *path, blk_mode_t mode,
1002	void *holder,
1003	const struct blk_holder_ops *hops)
1004	{
1005	struct file *file;
1006	dev_t dev;
1007	int error;
1008
1009	error = lookup_bdev(pathname: path, dev: &dev);
1010	if (error)
1011	return ERR_PTR(error);
1012
1013	file = bdev_file_open_by_dev(dev, mode, holder, hops);
1014	if (!IS_ERR(ptr: file) && (mode & BLK_OPEN_WRITE)) {
1015	if (bdev_read_only(bdev: file_bdev(file))) {
1016	fput(file);
1017	file = ERR_PTR(error: -EACCES);
1018	}
1019	}
1020
1021	return file;
1022	}
1023	EXPORT_SYMBOL(bdev_file_open_by_path);
1024
1025	static inline void bd_yield_claim(struct file *bdev_file)
1026	{
1027	struct block_device *bdev = file_bdev(bdev_file);
1028	void *holder = bdev_file->private_data;
1029
1030	lockdep_assert_held(&bdev->bd_disk->open_mutex);
1031
1032	if (WARN_ON_ONCE(IS_ERR_OR_NULL(holder)))
1033	return;
1034
1035	if (!bdev_unclaimed(bdev_file))
1036	bd_end_claim(bdev, holder);
1037	}
1038
1039	void bdev_release(struct file *bdev_file)
1040	{
1041	struct block_device *bdev = file_bdev(bdev_file);
1042	void *holder = bdev_file->private_data;
1043	struct gendisk *disk = bdev->bd_disk;
1044
1045	/ We failed to open that block device. /
1046	if (IS_ERR(ptr: holder))
1047	goto put_no_open;
1048
1049	/*
1050	* Sync early if it looks like we're the last one. If someone else
1051	* opens the block device between now and the decrement of bd_openers
1052	* then we did a sync that we didn't need to, but that's not the end
1053	* of the world and we want to avoid long (could be several minute)
1054	* syncs while holding the mutex.
1055	*/
1056	if (atomic_read(v: &bdev->bd_openers) == `1`)
1057	sync_blockdev(bdev);
1058
1059	mutex_lock(&disk->open_mutex);
1060	bdev_yield_write_access(bdev_file);
1061
1062	if (holder)
1063	bd_yield_claim(bdev_file);
1064
1065	/*
1066	* Trigger event checking and tell drivers to flush MEDIA_CHANGE
1067	* event. This is to ensure detection of media removal commanded
1068	* from userland - e.g. eject(1).
1069	*/
1070	disk_flush_events(disk, mask: DISK_EVENT_MEDIA_CHANGE);
1071
1072	if (bdev_is_partition(bdev))
1073	blkdev_put_part(part: bdev);
1074	else
1075	blkdev_put_whole(bdev);
1076	mutex_unlock(lock: &disk->open_mutex);
1077
1078	module_put(module: disk->fops->owner);
1079	put_no_open:
1080	blkdev_put_no_open(bdev);
1081	}
1082
1083	/**
1084	* bdev_fput - yield claim to the block device and put the file
1085	* @bdev_file: open block device
1086	*
1087	* Yield claim on the block device and put the file. Ensure that the
1088	* block device can be reclaimed before the file is closed which is a
1089	* deferred operation.
1090	*/
1091	void bdev_fput(struct file *bdev_file)
1092	{
1093	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
1094	return;
1095
1096	if (bdev_file->private_data) {
1097	struct block_device *bdev = file_bdev(bdev_file);
1098	struct gendisk *disk = bdev->bd_disk;
1099
1100	mutex_lock(&disk->open_mutex);
1101	bdev_yield_write_access(bdev_file);
1102	bd_yield_claim(bdev_file);
1103	/*
1104	* Tell release we already gave up our hold on the
1105	* device and if write restrictions are available that
1106	* we already gave up write access to the device.
1107	*/
1108	bdev_file->private_data = BDEV_I(inode: bdev_file->f_mapping->host);
1109	mutex_unlock(lock: &disk->open_mutex);
1110	}
1111
1112	fput(bdev_file);
1113	}
1114	EXPORT_SYMBOL(bdev_fput);
1115
1116	/**
1117	* lookup_bdev() - Look up a struct block_device by name.
1118	* @pathname: Name of the block device in the filesystem.
1119	* @dev: Pointer to the block device's dev_t, if found.
1120	*
1121	* Lookup the block device's dev_t at @pathname in the current
1122	* namespace if possible and return it in @dev.
1123	*
1124	* Context: May sleep.
1125	* Return: 0 if succeeded, negative errno otherwise.
1126	*/
1127	int lookup_bdev(const char pathname, dev_t dev)
1128	{
1129	struct inode *inode;
1130	struct path path;
1131	int error;
1132
1133	if (!pathname \|\| !*pathname)
1134	return -EINVAL;
1135
1136	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
1137	if (error)
1138	return error;
1139
1140	inode = d_backing_inode(upper: path.dentry);
1141	error = -ENOTBLK;
1142	if (!S_ISBLK(inode->i_mode))
1143	goto out_path_put;
1144	error = -EACCES;
1145	if (!may_open_dev(path: &path))
1146	goto out_path_put;
1147
1148	*dev = inode->i_rdev;
1149	error = `0`;
1150	out_path_put:
1151	path_put(&path);
1152	return error;
1153	}
1154	EXPORT_SYMBOL(lookup_bdev);
1155
1156	/**
1157	* bdev_mark_dead - mark a block device as dead
1158	* @bdev: block device to operate on
1159	* @surprise: indicate a surprise removal
1160	*
1161	* Tell the file system that this devices or media is dead. If @surprise is set
1162	* to %true the device or media is already gone, if not we are preparing for an
1163	* orderly removal.
1164	*
1165	* This calls into the file system, which then typicall syncs out all dirty data
1166	* and writes back inodes and then invalidates any cached data in the inodes on
1167	* the file system. In addition we also invalidate the block device mapping.
1168	*/
1169	void bdev_mark_dead(struct block_device *bdev, bool surprise)
1170	{
1171	mutex_lock(&bdev->bd_holder_lock);
1172	if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
1173	bdev->bd_holder_ops->mark_dead(bdev, surprise);
1174	else {
1175	mutex_unlock(lock: &bdev->bd_holder_lock);
1176	sync_blockdev(bdev);
1177	}
1178
1179	invalidate_bdev(bdev);
1180	}
1181	/*
1182	* New drivers should not use this directly. There are some drivers however
1183	* that needs this for historical reasons. For example, the DASD driver has
1184	* historically had a shutdown to offline mode that doesn't actually remove the
1185	* gendisk that otherwise looks a lot like a safe device removal.
1186	*/
1187	EXPORT_SYMBOL_GPL(bdev_mark_dead);
1188
1189	void sync_bdevs(bool wait)
1190	{
1191	struct inode inode, old_inode = NULL;
1192
1193	spin_lock(lock: &blockdev_superblock->s_inode_list_lock);
1194	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
1195	struct address_space *mapping = inode->i_mapping;
1196	struct block_device *bdev;
1197
1198	spin_lock(lock: &inode->i_lock);
1199	if (inode->i_state & (I_FREEING\|I_WILL_FREE\|I_NEW) \|\|
1200	mapping->nrpages == `0`) {
1201	spin_unlock(lock: &inode->i_lock);
1202	continue;
1203	}
1204	__iget(inode);
1205	spin_unlock(lock: &inode->i_lock);
1206	spin_unlock(lock: &blockdev_superblock->s_inode_list_lock);
1207	/*
1208	* We hold a reference to 'inode' so it couldn't have been
1209	* removed from s_inodes list while we dropped the
1210	* s_inode_list_lock We cannot iput the inode now as we can
1211	* be holding the last reference and we cannot iput it under
1212	* s_inode_list_lock. So we keep the reference and iput it
1213	* later.
1214	*/
1215	iput(old_inode);
1216	old_inode = inode;
1217	bdev = I_BDEV(inode);
1218
1219	mutex_lock(&bdev->bd_disk->open_mutex);
1220	if (!atomic_read(v: &bdev->bd_openers)) {
1221	; / skip /
1222	} else if (wait) {
1223	/*
1224	* We keep the error status of individual mapping so
1225	* that applications can catch the writeback error using
1226	* fsync(2). See filemap_fdatawait_keep_errors() for
1227	* details.
1228	*/
1229	filemap_fdatawait_keep_errors(mapping: inode->i_mapping);
1230	} else {
1231	filemap_fdatawrite(inode->i_mapping);
1232	}
1233	mutex_unlock(lock: &bdev->bd_disk->open_mutex);
1234
1235	spin_lock(lock: &blockdev_superblock->s_inode_list_lock);
1236	}
1237	spin_unlock(lock: &blockdev_superblock->s_inode_list_lock);
1238	iput(old_inode);
1239	}
1240
1241	/*
1242	* Handle STATX_DIOALIGN for block devices.
1243	*
1244	* Note that the inode passed to this is the inode of a block device node file,
1245	* not the block device's internal inode. Therefore it is not valid to use
1246	* I_BDEV() here; the block device has to be looked up by i_rdev instead.
1247	*/
1248	void bdev_statx_dioalign(struct inode inode, struct* kstat *stat)
1249	{
1250	struct block_device *bdev;
1251
1252	bdev = blkdev_get_no_open(dev: inode->i_rdev);
1253	if (!bdev)
1254	return;
1255
1256	stat->dio_mem_align = bdev_dma_alignment(bdev) + `1`;
1257	stat->dio_offset_align = bdev_logical_block_size(bdev);
1258	stat->result_mask \|= STATX_DIOALIGN;
1259
1260	blkdev_put_no_open(bdev);
1261	}
1262
1263	static int __init setup_bdev_allow_write_mounted(char *str)
1264	{
1265	if (kstrtobool(s: str, res: &bdev_allow_write_mounted))
1266	pr_warn("Invalid option string for bdev_allow_write_mounted:"
1267	" '%s'\n", str);
1268	return `1`;
1269	}
1270	__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
1271

/* source code of linux/block/bdev.c */