buffer.c source code [linux/fs/buffer.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/buffer.c
4	*
5	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
6	*/
7
8	/*
9	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10	*
11	* Removed a lot of unnecessary code and simplified things now that
12	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13	*
14	* Speed up hash, lru, and free list operations. Use gfp() for allocating
15	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16	*
17	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
18	*
19	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20	*/
21
22	#include <linux/kernel.h>
23	#include <linux/sched/signal.h>
24	#include <linux/syscalls.h>
25	#include <linux/fs.h>
26	#include <linux/iomap.h>
27	#include <linux/mm.h>
28	#include <linux/percpu.h>
29	#include <linux/slab.h>
30	#include <linux/capability.h>
31	#include <linux/blkdev.h>
32	#include <linux/file.h>
33	#include <linux/quotaops.h>
34	#include <linux/highmem.h>
35	#include <linux/export.h>
36	#include <linux/backing-dev.h>
37	#include <linux/writeback.h>
38	#include <linux/hash.h>
39	#include <linux/suspend.h>
40	#include <linux/buffer_head.h>
41	#include <linux/task_io_accounting_ops.h>
42	#include <linux/bio.h>
43	#include <linux/cpu.h>
44	#include <linux/bitops.h>
45	#include <linux/mpage.h>
46	#include <linux/bit_spinlock.h>
47	#include <linux/pagevec.h>
48	#include <linux/sched/mm.h>
49	#include <trace/events/block.h>
50	#include <linux/fscrypt.h>
51	#include <linux/fsverity.h>
52	#include <linux/sched/isolation.h>
53
54	#include "internal.h"
55
56	static int fsync_buffers_list(spinlock_t lock, struct* list_head *list);
57	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
58	struct writeback_control *wbc);
59
60	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
61
62	inline void touch_buffer(struct buffer_head *bh)
63	{
64	trace_block_touch_buffer(bh);
65	folio_mark_accessed(bh->b_folio);
66	}
67	EXPORT_SYMBOL(touch_buffer);
68
69	void __lock_buffer(struct buffer_head *bh)
70	{
71	wait_on_bit_lock_io(word: &bh->b_state, bit: BH_Lock, TASK_UNINTERRUPTIBLE);
72	}
73	EXPORT_SYMBOL(__lock_buffer);
74
75	void unlock_buffer(struct buffer_head *bh)
76	{
77	clear_bit_unlock(nr: BH_Lock, addr: &bh->b_state);
78	smp_mb__after_atomic();
79	wake_up_bit(word: &bh->b_state, bit: BH_Lock);
80	}
81	EXPORT_SYMBOL(unlock_buffer);
82
83	/*
84	* Returns if the folio has dirty or writeback buffers. If all the buffers
85	* are unlocked and clean then the folio_test_dirty information is stale. If
86	* any of the buffers are locked, it is assumed they are locked for IO.
87	*/
88	void buffer_check_dirty_writeback(struct folio *folio,
89	bool dirty, bool writeback)
90	{
91	struct buffer_head head, bh;
92	*dirty = false;
93	*writeback = false;
94
95	BUG_ON(!folio_test_locked(folio));
96
97	head = folio_buffers(folio);
98	if (!head)
99	return;
100
101	if (folio_test_writeback(folio))
102	*writeback = true;
103
104	bh = head;
105	do {
106	if (buffer_locked(bh))
107	*writeback = true;
108
109	if (buffer_dirty(bh))
110	*dirty = true;
111
112	bh = bh->b_this_page;
113	} while (bh != head);
114	}
115
116	/*
117	* Block until a buffer comes unlocked. This doesn't stop it
118	* from becoming locked again - you have to lock it yourself
119	* if you want to preserve its state.
120	*/
121	void __wait_on_buffer(struct buffer_head * bh)
122	{
123	wait_on_bit_io(word: &bh->b_state, bit: BH_Lock, TASK_UNINTERRUPTIBLE);
124	}
125	EXPORT_SYMBOL(__wait_on_buffer);
126
127	static void buffer_io_error(struct buffer_head bh, char* *msg)
128	{
129	if (!test_bit(BH_Quiet, &bh->b_state))
130	printk_ratelimited(KERN_ERR
131	"Buffer I/O error on dev %pg, logical block %llu%s\n",
132	bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
133	}
134
/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only use the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 *
 * @uptodate is non-zero when the read succeeded; the buffer's uptodate
 * flag is set or cleared to match before the buffer is unlocked.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
153
/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer.  A reference on @bh is dropped with put_bh()
 * afterwards.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
164
/*
 * Synchronous write completion handler.  On success the buffer is marked
 * uptodate; on failure the error is logged, recorded with
 * mark_buffer_write_io_error() and the uptodate flag is cleared.  Either
 * way the buffer is unlocked and a reference on it is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
178
179	/*
180	* Various filesystems appear to want __find_get_block to be non-blocking.
181	* But it's the page lock which protects the buffers. To get around this,
182	* we get exclusion from try_to_free_buffers with the blockdev mapping's
183	* private_lock.
184	*
185	* Hack idea: for the blockdev mapping, private_lock contention
186	* may be quite high. This code could TryLock the page, and if that
187	* succeeds, there is no need to take private_lock.
188	*/
189	static struct buffer_head *
190	__find_get_block_slow(struct block_device *bdev, sector_t block)
191	{
192	struct inode *bd_inode = bdev->bd_inode;
193	struct address_space *bd_mapping = bd_inode->i_mapping;
194	struct buffer_head *ret = NULL;
195	pgoff_t index;
196	struct buffer_head *bh;
197	struct buffer_head *head;
198	struct folio *folio;
199	int all_mapped = `1`;
200	static DEFINE_RATELIMIT_STATE(last_warned, HZ, `1`);
201
202	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
203	folio = __filemap_get_folio(mapping: bd_mapping, index, FGP_ACCESSED, gfp: `0`);
204	if (IS_ERR(ptr: folio))
205	goto out;
206
207	spin_lock(lock: &bd_mapping->private_lock);
208	head = folio_buffers(folio);
209	if (!head)
210	goto out_unlock;
211	bh = head;
212	do {
213	if (!buffer_mapped(bh))
214	all_mapped = `0`;
215	else if (bh->b_blocknr == block) {
216	ret = bh;
217	get_bh(bh);
218	goto out_unlock;
219	}
220	bh = bh->b_this_page;
221	} while (bh != head);
222
223	/ we might be here because some of the buffers on this page are*
224	* not mapped. This is due to various races between
225	* file io on the block device and getblk. It gets dealt with
226	* elsewhere, don't buffer_error if we had some unmapped buffers
227	*/
228	ratelimit_set_flags(rs: &last_warned, RATELIMIT_MSG_ON_RELEASE);
229	if (all_mapped && __ratelimit(&last_warned)) {
230	printk("__find_get_block_slow() failed. block=%llu, "
231	"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
232	"device %pg blocksize: %d\n",
233	(unsigned long long)block,
234	(unsigned long long)bh->b_blocknr,
235	bh->b_state, bh->b_size, bdev,
236	`1` << bd_inode->i_blkbits);
237	}
238	out_unlock:
239	spin_unlock(lock: &bd_mapping->private_lock);
240	folio_put(folio);
241	out:
242	return ret;
243	}
244
245	static void end_buffer_async_read(struct buffer_head bh, int* uptodate)
246	{
247	unsigned long flags;
248	struct buffer_head *first;
249	struct buffer_head *tmp;
250	struct folio *folio;
251	int folio_uptodate = `1`;
252
253	BUG_ON(!buffer_async_read(bh));
254
255	folio = bh->b_folio;
256	if (uptodate) {
257	set_buffer_uptodate(bh);
258	} else {
259	clear_buffer_uptodate(bh);
260	buffer_io_error(bh, msg: ", async page read");
261	folio_set_error(folio);
262	}
263
264	/*
265	* Be _very_ careful from here on. Bad things can happen if
266	* two buffer heads end IO at almost the same time and both
267	* decide that the page is now completely done.
268	*/
269	first = folio_buffers(folio);
270	spin_lock_irqsave(&first->b_uptodate_lock, flags);
271	clear_buffer_async_read(bh);
272	unlock_buffer(bh);
273	tmp = bh;
274	do {
275	if (!buffer_uptodate(bh: tmp))
276	folio_uptodate = `0`;
277	if (buffer_async_read(bh: tmp)) {
278	BUG_ON(!buffer_locked(tmp));
279	goto still_busy;
280	}
281	tmp = tmp->b_this_page;
282	} while (tmp != bh);
283	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
284
285	folio_end_read(folio, success: folio_uptodate);
286	return;
287
288	still_busy:
289	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
290	return;
291	}
292
293	struct postprocess_bh_ctx {
294	struct work_struct work;
295	struct buffer_head *bh;
296	};
297
298	static void verify_bh(struct work_struct *work)
299	{
300	struct postprocess_bh_ctx *ctx =
301	container_of(work, struct postprocess_bh_ctx, work);
302	struct buffer_head *bh = ctx->bh;
303	bool valid;
304
305	valid = fsverity_verify_blocks(folio: bh->b_folio, len: bh->b_size, offset: bh_offset(bh));
306	end_buffer_async_read(bh, uptodate: valid);
307	kfree(objp: ctx);
308	}
309
310	static bool need_fsverity(struct buffer_head *bh)
311	{
312	struct folio *folio = bh->b_folio;
313	struct inode *inode = folio->mapping->host;
314
315	return fsverity_active(inode) &&
316	/ needed by ext4 /
317	folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
318	}
319
320	static void decrypt_bh(struct work_struct *work)
321	{
322	struct postprocess_bh_ctx *ctx =
323	container_of(work, struct postprocess_bh_ctx, work);
324	struct buffer_head *bh = ctx->bh;
325	int err;
326
327	err = fscrypt_decrypt_pagecache_blocks(folio: bh->b_folio, len: bh->b_size,
328	offs: bh_offset(bh));
329	if (err == `0` && need_fsverity(bh)) {
330	/*
331	* We use different work queues for decryption and for verity
332	* because verity may require reading metadata pages that need
333	* decryption, and we shouldn't recurse to the same workqueue.
334	*/
335	INIT_WORK(&ctx->work, verify_bh);
336	fsverity_enqueue_verify_work(work: &ctx->work);
337	return;
338	}
339	end_buffer_async_read(bh, uptodate: err == `0`);
340	kfree(objp: ctx);
341	}
342
343	/*
344	* I/O completion handler for block_read_full_folio() - pages
345	* which come unlocked at the end of I/O.
346	*/
347	static void end_buffer_async_read_io(struct buffer_head bh, int* uptodate)
348	{
349	struct inode *inode = bh->b_folio->mapping->host;
350	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
351	bool verify = need_fsverity(bh);
352
353	/ Decrypt (with fscrypt) and/or verify (with fsverity) if needed. /
354	if (uptodate && (decrypt \|\| verify)) {
355	struct postprocess_bh_ctx *ctx =
356	kmalloc(size: sizeof(*ctx), GFP_ATOMIC);
357
358	if (ctx) {
359	ctx->bh = bh;
360	if (decrypt) {
361	INIT_WORK(&ctx->work, decrypt_bh);
362	fscrypt_enqueue_decrypt_work(&ctx->work);
363	} else {
364	INIT_WORK(&ctx->work, verify_bh);
365	fsverity_enqueue_verify_work(work: &ctx->work);
366	}
367	return;
368	}
369	uptodate = `0`;
370	}
371	end_buffer_async_read(bh, uptodate);
372	}
373
374	/*
375	* Completion handler for block_write_full_page() - pages which are unlocked
376	* during I/O, and which have PageWriteback cleared upon I/O completion.
377	*/
378	void end_buffer_async_write(struct buffer_head bh, int* uptodate)
379	{
380	unsigned long flags;
381	struct buffer_head *first;
382	struct buffer_head *tmp;
383	struct folio *folio;
384
385	BUG_ON(!buffer_async_write(bh));
386
387	folio = bh->b_folio;
388	if (uptodate) {
389	set_buffer_uptodate(bh);
390	} else {
391	buffer_io_error(bh, msg: ", lost async page write");
392	mark_buffer_write_io_error(bh);
393	clear_buffer_uptodate(bh);
394	folio_set_error(folio);
395	}
396
397	first = folio_buffers(folio);
398	spin_lock_irqsave(&first->b_uptodate_lock, flags);
399
400	clear_buffer_async_write(bh);
401	unlock_buffer(bh);
402	tmp = bh->b_this_page;
403	while (tmp != bh) {
404	if (buffer_async_write(bh: tmp)) {
405	BUG_ON(!buffer_locked(tmp));
406	goto still_busy;
407	}
408	tmp = tmp->b_this_page;
409	}
410	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
411	folio_end_writeback(folio);
412	return;
413
414	still_busy:
415	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
416	return;
417	}
418	EXPORT_SYMBOL(end_buffer_async_write);
419
420	/*
421	* If a page's buffers are under async readin (end_buffer_async_read
422	* completion) then there is a possibility that another thread of
423	* control could lock one of the buffers after it has completed
424	* but while some of the other buffers have not completed. This
425	* locked buffer would confuse end_buffer_async_read() into not unlocking
426	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
427	* that this buffer is not under async I/O.
428	*
429	* The page comes unlocked when it has no locked buffer_async buffers
430	* left.
431	*
432	* PageLocked prevents anyone starting new async I/O reads any of
433	* the buffers.
434	*
435	* PageWriteback is used to prevent simultaneous writeout of the same
436	* page.
437	*
438	* PageLocked prevents anyone from starting writeback of a page which is
439	* under read I/O (PageWriteback is only ever set against a locked page).
440	*/
441	static void mark_buffer_async_read(struct buffer_head *bh)
442	{
443	bh->b_end_io = end_buffer_async_read_io;
444	set_buffer_async_read(bh);
445	}
446
447	static void mark_buffer_async_write_endio(struct buffer_head *bh,
448	bh_end_io_t *handler)
449	{
450	bh->b_end_io = handler;
451	set_buffer_async_write(bh);
452	}
453
454	void mark_buffer_async_write(struct buffer_head *bh)
455	{
456	mark_buffer_async_write_endio(bh, handler: end_buffer_async_write);
457	}
458	EXPORT_SYMBOL(mark_buffer_async_write);
459
460
461	/*
462	* fs/buffer.c contains helper functions for buffer-backed address space's
463	* fsync functions. A common requirement for buffer-based filesystems is
464	* that certain data from the backing blockdev needs to be written out for
465	* a successful fsync(). For example, ext2 indirect blocks need to be
466	* written back and waited upon before fsync() returns.
467	*
468	* The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
469	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
470	* management of a list of dependent buffers at ->i_mapping->private_list.
471	*
472	* Locking is a little subtle: try_to_free_buffers() will remove buffers
473	* from their controlling inode's queue when they are being freed. But
474	* try_to_free_buffers() will be operating against the blockdev mapping
475	* at the time, not against the S_ISREG file which depends on those buffers.
476	* So the locking for private_list is via the private_lock in the address_space
477	* which backs the buffers. Which is different from the address_space
478	* against which the buffers are listed. So for a particular address_space,
479	* mapping->private_lock does not protect mapping->private_list! In fact,
480	* mapping->private_list will always be protected by the backing blockdev's
481	* ->private_lock.
482	*
483	* Which introduces a requirement: all buffers on an address_space's
484	* ->private_list must be from the same address_space: the blockdev's.
485	*
486	* address_spaces which do not place buffers at ->private_list via these
487	* utility functions are free to use private_lock and private_list for
488	* whatever they want. The only requirement is that list_empty(private_list)
489	* be true at clear_inode() time.
490	*
491	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
492	* filesystems should do that. invalidate_inode_buffers() should just go
493	* BUG_ON(!list_empty).
494	*
495	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
496	* take an address_space, not an inode. And it should be called
497	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
498	* queued up.
499	*
500	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
501	* list if it is already on a list. Because if the buffer is on a list,
502	* it must already be on the right one. If not, the filesystem is being
503	* silly. This will save a ton of locking. But first we have to ensure
504	* that buffers are taken off the old inode's list when they are freed
505	* (presumably in truncate). That requires careful auditing of all
506	* filesystems (do it inside bforget()). It could also be done by bringing
507	* b_inode back.
508	*/
509
510	/*
511	* The buffer's backing address_space's private_lock must be held
512	*/
513	static void __remove_assoc_queue(struct buffer_head *bh)
514	{
515	list_del_init(entry: &bh->b_assoc_buffers);
516	WARN_ON(!bh->b_assoc_map);
517	bh->b_assoc_map = NULL;
518	}
519
520	int inode_has_buffers(struct inode *inode)
521	{
522	return !list_empty(head: &inode->i_data.private_list);
523	}
524
525	/*
526	* osync is designed to support O_SYNC io. It waits synchronously for
527	* all already-submitted IO to complete, but does not queue any new
528	* writes to the disk.
529	*
530	* To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
531	* as you dirty the buffers, and then use osync_inode_buffers to wait for
532	* completion. Any other dirty buffers which are not yet queued for
533	* write will not be flushed to disk by the osync.
534	*/
535	static int osync_buffers_list(spinlock_t lock, struct* list_head *list)
536	{
537	struct buffer_head *bh;
538	struct list_head *p;
539	int err = `0`;
540
541	spin_lock(lock);
542	repeat:
543	list_for_each_prev(p, list) {
544	bh = BH_ENTRY(p);
545	if (buffer_locked(bh)) {
546	get_bh(bh);
547	spin_unlock(lock);
548	wait_on_buffer(bh);
549	if (!buffer_uptodate(bh))
550	err = -EIO;
551	brelse(bh);
552	spin_lock(lock);
553	goto repeat;
554	}
555	}
556	spin_unlock(lock);
557	return err;
558	}
559
560	/**
561	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
562	* @mapping: the mapping which wants those buffers written
563	*
564	* Starts I/O against the buffers at mapping->private_list, and waits upon
565	* that I/O.
566	*
567	* Basically, this is a convenience function for fsync().
568	* @mapping is a file or directory which needs those buffers to be written for
569	* a successful fsync().
570	*/
571	int sync_mapping_buffers(struct address_space *mapping)
572	{
573	struct address_space *buffer_mapping = mapping->private_data;
574
575	if (buffer_mapping == NULL \|\| list_empty(head: &mapping->private_list))
576	return `0`;
577
578	return fsync_buffers_list(lock: &buffer_mapping->private_lock,
579	list: &mapping->private_list);
580	}
581	EXPORT_SYMBOL(sync_mapping_buffers);
582
583	/**
584	* generic_buffers_fsync_noflush - generic buffer fsync implementation
585	* for simple filesystems with no inode lock
586	*
587	* @file: file to synchronize
588	* @start: start offset in bytes
589	* @end: end offset in bytes (inclusive)
590	* @datasync: only synchronize essential metadata if true
591	*
592	* This is a generic implementation of the fsync method for simple
593	* filesystems which track all non-inode metadata in the buffers list
594	* hanging off the address_space structure.
595	*/
596	int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
597	bool datasync)
598	{
599	struct inode *inode = file->f_mapping->host;
600	int err;
601	int ret;
602
603	err = file_write_and_wait_range(file, start, end);
604	if (err)
605	return err;
606
607	ret = sync_mapping_buffers(inode->i_mapping);
608	if (!(inode->i_state & I_DIRTY_ALL))
609	goto out;
610	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
611	goto out;
612
613	err = sync_inode_metadata(inode, wait: `1`);
614	if (ret == `0`)
615	ret = err;
616
617	out:
618	/ check and advance again to catch errors after syncing out buffers /
619	err = file_check_and_advance_wb_err(file);
620	if (ret == `0`)
621	ret = err;
622	return ret;
623	}
624	EXPORT_SYMBOL(generic_buffers_fsync_noflush);
625
626	/**
627	* generic_buffers_fsync - generic buffer fsync implementation
628	* for simple filesystems with no inode lock
629	*
630	* @file: file to synchronize
631	* @start: start offset in bytes
632	* @end: end offset in bytes (inclusive)
633	* @datasync: only synchronize essential metadata if true
634	*
635	* This is a generic implementation of the fsync method for simple
636	* filesystems which track all non-inode metadata in the buffers list
637	* hanging off the address_space structure. This also makes sure that
638	* a device cache flush operation is called at the end.
639	*/
640	int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
641	bool datasync)
642	{
643	struct inode *inode = file->f_mapping->host;
644	int ret;
645
646	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
647	if (!ret)
648	ret = blkdev_issue_flush(bdev: inode->i_sb->s_bdev);
649	return ret;
650	}
651	EXPORT_SYMBOL(generic_buffers_fsync);
652
653	/*
654	* Called when we've recently written block `bblock', and it is known that
655	* `bblock' was for a buffer_boundary() buffer. This means that the block at
656	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
657	* dirty, schedule it for IO. So that indirects merge nicely with their data.
658	*/
659	void write_boundary_block(struct block_device *bdev,
660	sector_t bblock, unsigned blocksize)
661	{
662	struct buffer_head *bh = __find_get_block(bdev, block: bblock + `1`, size: blocksize);
663	if (bh) {
664	if (buffer_dirty(bh))
665	write_dirty_buffer(bh, op_flags: `0`);
666	put_bh(bh);
667	}
668	}
669
670	void mark_buffer_dirty_inode(struct buffer_head bh, struct* inode *inode)
671	{
672	struct address_space *mapping = inode->i_mapping;
673	struct address_space *buffer_mapping = bh->b_folio->mapping;
674
675	mark_buffer_dirty(bh);
676	if (!mapping->private_data) {
677	mapping->private_data = buffer_mapping;
678	} else {
679	BUG_ON(mapping->private_data != buffer_mapping);
680	}
681	if (!bh->b_assoc_map) {
682	spin_lock(lock: &buffer_mapping->private_lock);
683	list_move_tail(list: &bh->b_assoc_buffers,
684	head: &mapping->private_list);
685	bh->b_assoc_map = mapping;
686	spin_unlock(lock: &buffer_mapping->private_lock);
687	}
688	}
689	EXPORT_SYMBOL(mark_buffer_dirty_inode);
690
691	/*
692	* Add a page to the dirty page list.
693	*
694	* It is a sad fact of life that this function is called from several places
695	* deeply under spinlocking. It may not sleep.
696	*
697	* If the page has buffers, the uptodate buffers are set dirty, to preserve
698	* dirty-state coherency between the page and the buffers. It the page does
699	* not have buffers then when they are later attached they will all be set
700	* dirty.
701	*
702	* The buffers are dirtied before the page is dirtied. There's a small race
703	* window in which a writepage caller may see the page cleanness but not the
704	* buffer dirtiness. That's fine. If this code were to set the page dirty
705	* before the buffers, a concurrent writepage caller could clear the page dirty
706	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
707	* page on the dirty page list.
708	*
709	* We use private_lock to lock against try_to_free_buffers while using the
710	* page's buffer list. Also use this to protect against clean buffers being
711	* added to the page after it was set dirty.
712	*
713	* FIXME: may need to call ->reservepage here as well. That's rather up to the
714	* address_space though.
715	*/
716	bool block_dirty_folio(struct address_space mapping, struct* folio *folio)
717	{
718	struct buffer_head *head;
719	bool newly_dirty;
720
721	spin_lock(lock: &mapping->private_lock);
722	head = folio_buffers(folio);
723	if (head) {
724	struct buffer_head *bh = head;
725
726	do {
727	set_buffer_dirty(bh);
728	bh = bh->b_this_page;
729	} while (bh != head);
730	}
731	/*
732	* Lock out page's memcg migration to keep PageDirty
733	* synchronized with per-memcg dirty page counters.
734	*/
735	folio_memcg_lock(folio);
736	newly_dirty = !folio_test_set_dirty(folio);
737	spin_unlock(lock: &mapping->private_lock);
738
739	if (newly_dirty)
740	__folio_mark_dirty(folio, mapping, warn: `1`);
741
742	folio_memcg_unlock(folio);
743
744	if (newly_dirty)
745	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
746
747	return newly_dirty;
748	}
749	EXPORT_SYMBOL(block_dirty_folio);
750
751	/*
752	* Write out and wait upon a list of buffers.
753	*
754	* We have conflicting pressures: we want to make sure that all
755	* initially dirty buffers get waited on, but that any subsequently
756	* dirtied buffers don't. After all, we don't want fsync to last
757	* forever if somebody is actively writing to the file.
758	*
759	* Do this in two main stages: first we copy dirty buffers to a
760	* temporary inode list, queueing the writes as we go. Then we clean
761	* up, waiting for those writes to complete.
762	*
763	* During this second stage, any subsequent updates to the file may end
764	* up refiling the buffer on the original inode's dirty list again, so
765	* there is a chance we will end up with a buffer queued for write but
766	* not yet completed on that list. So, as a final cleanup we go through
767	* the osync code to catch these locked, dirty buffers without requeuing
768	* any newly dirty buffers for write.
769	*/
770	static int fsync_buffers_list(spinlock_t lock, struct* list_head *list)
771	{
772	struct buffer_head *bh;
773	struct list_head tmp;
774	struct address_space *mapping;
775	int err = `0`, err2;
776	struct blk_plug plug;
777
778	INIT_LIST_HEAD(list: &tmp);
779	blk_start_plug(&plug);
780
781	spin_lock(lock);
782	while (!list_empty(head: list)) {
783	bh = BH_ENTRY(list->next);
784	mapping = bh->b_assoc_map;
785	__remove_assoc_queue(bh);
786	/ Avoid race with mark_buffer_dirty_inode() which does*
787	* a lockless check and we rely on seeing the dirty bit */
788	smp_mb();
789	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
790	list_add(new: &bh->b_assoc_buffers, head: &tmp);
791	bh->b_assoc_map = mapping;
792	if (buffer_dirty(bh)) {
793	get_bh(bh);
794	spin_unlock(lock);
795	/*
796	* Ensure any pending I/O completes so that
797	* write_dirty_buffer() actually writes the
798	* current contents - it is a noop if I/O is
799	* still in flight on potentially older
800	* contents.
801	*/
802	write_dirty_buffer(bh, REQ_SYNC);
803
804	/*
805	* Kick off IO for the previous mapping. Note
806	* that we will not run the very last mapping,
807	* wait_on_buffer() will do that for us
808	* through sync_buffer().
809	*/
810	brelse(bh);
811	spin_lock(lock);
812	}
813	}
814	}
815
816	spin_unlock(lock);
817	blk_finish_plug(&plug);
818	spin_lock(lock);
819
820	while (!list_empty(head: &tmp)) {
821	bh = BH_ENTRY(tmp.prev);
822	get_bh(bh);
823	mapping = bh->b_assoc_map;
824	__remove_assoc_queue(bh);
825	/ Avoid race with mark_buffer_dirty_inode() which does*
826	* a lockless check and we rely on seeing the dirty bit */
827	smp_mb();
828	if (buffer_dirty(bh)) {
829	list_add(new: &bh->b_assoc_buffers,
830	head: &mapping->private_list);
831	bh->b_assoc_map = mapping;
832	}
833	spin_unlock(lock);
834	wait_on_buffer(bh);
835	if (!buffer_uptodate(bh))
836	err = -EIO;
837	brelse(bh);
838	spin_lock(lock);
839	}
840
841	spin_unlock(lock);
842	err2 = osync_buffers_list(lock, list);
843	if (err)
844	return err;
845	else
846	return err2;
847	}
848
849	/*
850	* Invalidate any and all dirty buffers on a given inode. We are
851	* probably unmounting the fs, but that doesn't mean we have already
852	* done a sync(). Just drop the buffers from the inode list.
853	*
854	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
855	* assumes that all the buffers are against the blockdev. Not true
856	* for reiserfs.
857	*/
858	void invalidate_inode_buffers(struct inode *inode)
859	{
860	if (inode_has_buffers(inode)) {
861	struct address_space *mapping = &inode->i_data;
862	struct list_head *list = &mapping->private_list;
863	struct address_space *buffer_mapping = mapping->private_data;
864
865	spin_lock(lock: &buffer_mapping->private_lock);
866	while (!list_empty(head: list))
867	__remove_assoc_queue(BH_ENTRY(list->next));
868	spin_unlock(lock: &buffer_mapping->private_lock);
869	}
870	}
871	EXPORT_SYMBOL(invalidate_inode_buffers);
872
873	/*
874	* Remove any clean buffers from the inode's buffer list. This is called
875	* when we're trying to free the inode itself. Those buffers can pin it.
876	*
877	* Returns true if all buffers were removed.
878	*/
879	int remove_inode_buffers(struct inode *inode)
880	{
881	int ret = `1`;
882
883	if (inode_has_buffers(inode)) {
884	struct address_space *mapping = &inode->i_data;
885	struct list_head *list = &mapping->private_list;
886	struct address_space *buffer_mapping = mapping->private_data;
887
888	spin_lock(lock: &buffer_mapping->private_lock);
889	while (!list_empty(head: list)) {
890	struct buffer_head *bh = BH_ENTRY(list->next);
891	if (buffer_dirty(bh)) {
892	ret = `0`;
893	break;
894	}
895	__remove_assoc_queue(bh);
896	}
897	spin_unlock(lock: &buffer_mapping->private_lock);
898	}
899	return ret;
900	}
901
902	/*
903	* Create the appropriate buffers when given a folio for data area and
904	* the size of each buffer.. Use the bh->b_this_page linked list to
905	* follow the buffers created. Return NULL if unable to create more
906	* buffers.
907	*
908	* The retry flag is used to differentiate async IO (paging, swapping)
909	* which may not fail from ordinary buffer allocations.
910	*/
911	struct buffer_head folio_alloc_buffers(struct* folio folio, unsigned* long size,
912	gfp_t gfp)
913	{
914	struct buffer_head bh, head;
915	long offset;
916	struct mem_cgroup memcg, old_memcg;
917
918	/ The folio lock pins the memcg /
919	memcg = folio_memcg(folio);
920	old_memcg = set_active_memcg(memcg);
921
922	head = NULL;
923	offset = folio_size(folio);
924	while ((offset -= size) >= `0`) {
925	bh = alloc_buffer_head(gfp_flags: gfp);
926	if (!bh)
927	goto no_grow;
928
929	bh->b_this_page = head;
930	bh->b_blocknr = -`1`;
931	head = bh;
932
933	bh->b_size = size;
934
935	/ Link the buffer to its folio /
936	folio_set_bh(bh, folio, offset);
937	}
938	out:
939	set_active_memcg(old_memcg);
940	return head;
941	/*
942	* In case anything failed, we just free everything we got.
943	*/
944	no_grow:
945	if (head) {
946	do {
947	bh = head;
948	head = head->b_this_page;
949	free_buffer_head(bh);
950	} while (head);
951	}
952
953	goto out;
954	}
955	EXPORT_SYMBOL_GPL(folio_alloc_buffers);
956
957	struct buffer_head alloc_page_buffers(struct* page page, unsigned* long size,
958	bool retry)
959	{
960	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT;
961	if (retry)
962	gfp \|= __GFP_NOFAIL;
963
964	return folio_alloc_buffers(page_folio(page), size, gfp);
965	}
966	EXPORT_SYMBOL_GPL(alloc_page_buffers);
967
968	static inline void link_dev_buffers(struct folio *folio,
969	struct buffer_head *head)
970	{
971	struct buffer_head bh, tail;
972
973	bh = head;
974	do {
975	tail = bh;
976	bh = bh->b_this_page;
977	} while (bh);
978	tail->b_this_page = head;
979	folio_attach_private(folio, data: head);
980	}
981
982	static sector_t blkdev_max_block(struct block_device bdev, unsigned* int size)
983	{
984	sector_t retval = ~((sector_t)`0`);
985	loff_t sz = bdev_nr_bytes(bdev);
986
987	if (sz) {
988	unsigned int sizebits = blksize_bits(size);
989	retval = (sz >> sizebits);
990	}
991	return retval;
992	}
993
994	/*
995	* Initialise the state of a blockdev folio's buffers.
996	*/
997	static sector_t folio_init_buffers(struct folio *folio,
998	struct block_device bdev, sector_t block, int* size)
999	{
1000	struct buffer_head *head = folio_buffers(folio);
1001	struct buffer_head *bh = head;
1002	bool uptodate = folio_test_uptodate(folio);
1003	sector_t end_block = blkdev_max_block(bdev, size);
1004
1005	do {
1006	if (!buffer_mapped(bh)) {
1007	bh->b_end_io = NULL;
1008	bh->b_private = NULL;
1009	bh->b_bdev = bdev;
1010	bh->b_blocknr = block;
1011	if (uptodate)
1012	set_buffer_uptodate(bh);
1013	if (block < end_block)
1014	set_buffer_mapped(bh);
1015	}
1016	block++;
1017	bh = bh->b_this_page;
1018	} while (bh != head);
1019
1020	/*
1021	* Caller needs to validate requested block against end of device.
1022	*/
1023	return end_block;
1024	}
1025
1026	/*
1027	* Create the page-cache page that contains the requested block.
1028	*
1029	* This is used purely for blockdev mappings.
1030	*/
1031	static int
1032	grow_dev_page(struct block_device *bdev, sector_t block,
1033	pgoff_t index, int size, int sizebits, gfp_t gfp)
1034	{
1035	struct inode *inode = bdev->bd_inode;
1036	struct folio *folio;
1037	struct buffer_head *bh;
1038	sector_t end_block;
1039	int ret = `0`;
1040
1041	folio = __filemap_get_folio(mapping: inode->i_mapping, index,
1042	FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT, gfp);
1043	if (IS_ERR(ptr: folio))
1044	return PTR_ERR(ptr: folio);
1045
1046	bh = folio_buffers(folio);
1047	if (bh) {
1048	if (bh->b_size == size) {
1049	end_block = folio_init_buffers(folio, bdev,
1050	block: (sector_t)index << sizebits, size);
1051	goto done;
1052	}
1053	if (!try_to_free_buffers(folio))
1054	goto failed;
1055	}
1056
1057	ret = -ENOMEM;
1058	bh = folio_alloc_buffers(folio, size, gfp \| __GFP_ACCOUNT);
1059	if (!bh)
1060	goto failed;
1061
1062	/*
1063	* Link the folio to the buffers and initialise them. Take the
1064	* lock to be atomic wrt __find_get_block(), which does not
1065	* run under the folio lock.
1066	*/
1067	spin_lock(lock: &inode->i_mapping->private_lock);
1068	link_dev_buffers(folio, head: bh);
1069	end_block = folio_init_buffers(folio, bdev,
1070	block: (sector_t)index << sizebits, size);
1071	spin_unlock(lock: &inode->i_mapping->private_lock);
1072	done:
1073	ret = (block < end_block) ? `1` : -ENXIO;
1074	failed:
1075	folio_unlock(folio);
1076	folio_put(folio);
1077	return ret;
1078	}
1079
1080	/*
1081	* Create buffers for the specified block device block's page. If
1082	* that page was dirty, the buffers are set dirty also.
1083	*/
1084	static int
1085	grow_buffers(struct block_device bdev, sector_t block, int* size, gfp_t gfp)
1086	{
1087	pgoff_t index;
1088	int sizebits;
1089
1090	sizebits = PAGE_SHIFT - __ffs(size);
1091	index = block >> sizebits;
1092
1093	/*
1094	* Check for a block which wants to lie outside our maximum possible
1095	* pagecache index. (this comparison is done using sector_t types).
1096	*/
1097	if (unlikely(index != block >> sizebits)) {
1098	printk(KERN_ERR "%s: requested out-of-range block %llu for "
1099	"device %pg\n",
1100	__func__, (unsigned long long)block,
1101	bdev);
1102	return -EIO;
1103	}
1104
1105	/ Create a page with the proper size buffers.. /
1106	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1107	}
1108
1109	static struct buffer_head *
1110	__getblk_slow(struct block_device *bdev, sector_t block,
1111	unsigned size, gfp_t gfp)
1112	{
1113	/ Size must be multiple of hard sectorsize /
1114	if (unlikely(size & (bdev_logical_block_size(bdev)-`1`) \|\|
1115	(size < `512` \|\| size > PAGE_SIZE))) {
1116	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1117	size);
1118	printk(KERN_ERR "logical block size: %d\n",
1119	bdev_logical_block_size(bdev));
1120
1121	dump_stack();
1122	return NULL;
1123	}
1124
1125	for (;;) {
1126	struct buffer_head *bh;
1127	int ret;
1128
1129	bh = __find_get_block(bdev, block, size);
1130	if (bh)
1131	return bh;
1132
1133	ret = grow_buffers(bdev, block, size, gfp);
1134	if (ret < `0`)
1135	return NULL;
1136	}
1137	}
1138
1139	/*
1140	* The relationship between dirty buffers and dirty pages:
1141	*
1142	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
1143	* the page is tagged dirty in the page cache.
1144	*
1145	* At all times, the dirtiness of the buffers represents the dirtiness of
1146	* subsections of the page. If the page has buffers, the page dirty bit is
1147	* merely a hint about the true dirty state.
1148	*
1149	* When a page is set dirty in its entirety, all its buffers are marked dirty
1150	* (if the page has buffers).
1151	*
1152	* When a buffer is marked dirty, its page is dirtied, but the page's other
1153	* buffers are not.
1154	*
1155	* Also. When blockdev buffers are explicitly read with bread(), they
1156	* individually become uptodate. But their backing page remains not
1157	* uptodate - even if all of its buffers are uptodate. A subsequent
1158	* block_read_full_folio() against that folio will discover all the uptodate
1159	* buffers, will set the folio uptodate and will perform no I/O.
1160	*/
1161
1162	/**
1163	* mark_buffer_dirty - mark a buffer_head as needing writeout
1164	* @bh: the buffer_head to mark dirty
1165	*
1166	* mark_buffer_dirty() will set the dirty bit against the buffer, then set
1167	* its backing page dirty, then tag the page as dirty in the page cache
1168	* and then attach the address_space's inode to its superblock's dirty
1169	* inode list.
1170	*
1171	* mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
1172	* i_pages lock and mapping->host->i_lock.
1173	*/
1174	void mark_buffer_dirty(struct buffer_head *bh)
1175	{
1176	WARN_ON_ONCE(!buffer_uptodate(bh));
1177
1178	trace_block_dirty_buffer(bh);
1179
1180	/*
1181	* Very carefully optimize the it-is-already-dirty case.
1182	*
1183	* Don't let the final "is it dirty" escape to before we
1184	* perhaps modified the buffer.
1185	*/
1186	if (buffer_dirty(bh)) {
1187	smp_mb();
1188	if (buffer_dirty(bh))
1189	return;
1190	}
1191
1192	if (!test_set_buffer_dirty(bh)) {
1193	struct folio *folio = bh->b_folio;
1194	struct address_space *mapping = NULL;
1195
1196	folio_memcg_lock(folio);
1197	if (!folio_test_set_dirty(folio)) {
1198	mapping = folio->mapping;
1199	if (mapping)
1200	__folio_mark_dirty(folio, mapping, warn: `0`);
1201	}
1202	folio_memcg_unlock(folio);
1203	if (mapping)
1204	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1205	}
1206	}
1207	EXPORT_SYMBOL(mark_buffer_dirty);
1208
1209	void mark_buffer_write_io_error(struct buffer_head *bh)
1210	{
1211	set_buffer_write_io_error(bh);
1212	/ FIXME: do we need to set this in both places? /
1213	if (bh->b_folio && bh->b_folio->mapping)
1214	mapping_set_error(mapping: bh->b_folio->mapping, error: -EIO);
1215	if (bh->b_assoc_map) {
1216	mapping_set_error(mapping: bh->b_assoc_map, error: -EIO);
1217	errseq_set(eseq: &bh->b_assoc_map->host->i_sb->s_wb_err, err: -EIO);
1218	}
1219	}
1220	EXPORT_SYMBOL(mark_buffer_write_io_error);
1221
1222	/*
1223	* Decrement a buffer_head's reference count. If all buffers against a page
1224	* have zero reference count, are clean and unlocked, and if the page is clean
1225	* and unlocked then try_to_free_buffers() may strip the buffers from the page
1226	* in preparation for freeing it (sometimes, rarely, buffers are removed from
1227	* a page but it ends up not being freed, and buffers may later be reattached).
1228	*/
1229	void __brelse(struct buffer_head * buf)
1230	{
1231	if (atomic_read(v: &buf->b_count)) {
1232	put_bh(bh: buf);
1233	return;
1234	}
1235	WARN(`1`, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1236	}
1237	EXPORT_SYMBOL(__brelse);
1238
1239	/*
1240	* bforget() is like brelse(), except it discards any
1241	* potentially dirty data.
1242	*/
1243	void __bforget(struct buffer_head *bh)
1244	{
1245	clear_buffer_dirty(bh);
1246	if (bh->b_assoc_map) {
1247	struct address_space *buffer_mapping = bh->b_folio->mapping;
1248
1249	spin_lock(lock: &buffer_mapping->private_lock);
1250	list_del_init(entry: &bh->b_assoc_buffers);
1251	bh->b_assoc_map = NULL;
1252	spin_unlock(lock: &buffer_mapping->private_lock);
1253	}
1254	__brelse(bh);
1255	}
1256	EXPORT_SYMBOL(__bforget);
1257
1258	static struct buffer_head __bread_slow(struct* buffer_head *bh)
1259	{
1260	lock_buffer(bh);
1261	if (buffer_uptodate(bh)) {
1262	unlock_buffer(bh);
1263	return bh;
1264	} else {
1265	get_bh(bh);
1266	bh->b_end_io = end_buffer_read_sync;
1267	submit_bh(REQ_OP_READ, bh);
1268	wait_on_buffer(bh);
1269	if (buffer_uptodate(bh))
1270	return bh;
1271	}
1272	brelse(bh);
1273	return NULL;
1274	}
1275
1276	/*
1277	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1278	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1279	* refcount elevated by one when they're in an LRU. A buffer can only appear
1280	* once in a particular CPU's LRU. A single buffer can be present in multiple
1281	* CPU's LRUs at the same time.
1282	*
1283	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
1284	* sb_find_get_block().
1285	*
1286	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
1287	* a local interrupt disable for that.
1288	*/
1289
1290	#define BH_LRU_SIZE 16
1291
1292	struct bh_lru {
1293	struct buffer_head *bhs[BH_LRU_SIZE];
1294	};
1295
1296	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1297
1298	#ifdef CONFIG_SMP
1299	#define bh_lru_lock() local_irq_disable()
1300	#define bh_lru_unlock() local_irq_enable()
1301	#else
1302	#define bh_lru_lock() preempt_disable()
1303	#define bh_lru_unlock() preempt_enable()
1304	#endif
1305
/*
 * Sanity check used on entry to the LRU helpers: BUG if interrupts are
 * disabled.  Compiles to nothing when irqs_disabled is not a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1312
1313	/*
1314	* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1315	* inserted at the front, and the buffer_head at the back if any is evicted.
1316	* Or, if already in the LRU it is moved to the front.
1317	*/
1318	static void bh_lru_install(struct buffer_head *bh)
1319	{
1320	struct buffer_head *evictee = bh;
1321	struct bh_lru *b;
1322	int i;
1323
1324	check_irqs_on();
1325	bh_lru_lock();
1326
1327	/*
1328	* the refcount of buffer_head in bh_lru prevents dropping the
1329	* attached page(i.e., try_to_free_buffers) so it could cause
1330	* failing page migration.
1331	* Skip putting upcoming bh into bh_lru until migration is done.
1332	*/
1333	if (lru_cache_disabled() \|\| cpu_is_isolated(smp_processor_id())) {
1334	bh_lru_unlock();
1335	return;
1336	}
1337
1338	b = this_cpu_ptr(&bh_lrus);
1339	for (i = `0`; i < BH_LRU_SIZE; i++) {
1340	swap(evictee, b->bhs[i]);
1341	if (evictee == bh) {
1342	bh_lru_unlock();
1343	return;
1344	}
1345	}
1346
1347	get_bh(bh);
1348	bh_lru_unlock();
1349	brelse(bh: evictee);
1350	}
1351
1352	/*
1353	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
1354	*/
1355	static struct buffer_head *
1356	lookup_bh_lru(struct block_device bdev, sector_t block, unsigned* size)
1357	{
1358	struct buffer_head *ret = NULL;
1359	unsigned int i;
1360
1361	check_irqs_on();
1362	bh_lru_lock();
1363	if (cpu_is_isolated(smp_processor_id())) {
1364	bh_lru_unlock();
1365	return NULL;
1366	}
1367	for (i = `0`; i < BH_LRU_SIZE; i++) {
1368	struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1369
1370	if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1371	bh->b_size == size) {
1372	if (i) {
1373	while (i) {
1374	__this_cpu_write(bh_lrus.bhs[i],
1375	__this_cpu_read(bh_lrus.bhs[i - `1`]));
1376	i--;
1377	}
1378	__this_cpu_write(bh_lrus.bhs[`0`], bh);
1379	}
1380	get_bh(bh);
1381	ret = bh;
1382	break;
1383	}
1384	}
1385	bh_lru_unlock();
1386	return ret;
1387	}
1388
1389	/*
1390	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
1391	* it in the LRU and mark it as accessed. If it is not present then return
1392	* NULL
1393	*/
1394	struct buffer_head *
1395	__find_get_block(struct block_device bdev, sector_t block, unsigned* size)
1396	{
1397	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1398
1399	if (bh == NULL) {
1400	/ __find_get_block_slow will mark the page accessed /
1401	bh = __find_get_block_slow(bdev, block);
1402	if (bh)
1403	bh_lru_install(bh);
1404	} else
1405	touch_buffer(bh);
1406
1407	return bh;
1408	}
1409	EXPORT_SYMBOL(__find_get_block);
1410
1411	/**
1412	* bdev_getblk - Get a buffer_head in a block device's buffer cache.
1413	* @bdev: The block device.
1414	* @block: The block number.
1415	* @size: The size of buffer_heads for this @bdev.
1416	* @gfp: The memory allocation flags to use.
1417	*
1418	* Return: The buffer head, or NULL if memory could not be allocated.
1419	*/
1420	struct buffer_head bdev_getblk(struct* block_device *bdev, sector_t block,
1421	unsigned size, gfp_t gfp)
1422	{
1423	struct buffer_head *bh = __find_get_block(bdev, block, size);
1424
1425	might_alloc(gfp_mask: gfp);
1426	if (bh)
1427	return bh;
1428
1429	return __getblk_slow(bdev, block, size, gfp);
1430	}
1431	EXPORT_SYMBOL(bdev_getblk);
1432
1433	/*
1434	* Do async read-ahead on a buffer..
1435	*/
1436	void __breadahead(struct block_device bdev, sector_t block, unsigned* size)
1437	{
1438	struct buffer_head *bh = bdev_getblk(bdev, block, size,
1439	GFP_NOWAIT \| __GFP_MOVABLE);
1440
1441	if (likely(bh)) {
1442	bh_readahead(bh, REQ_RAHEAD);
1443	brelse(bh);
1444	}
1445	}
1446	EXPORT_SYMBOL(__breadahead);
1447
1448	/**
1449	* __bread_gfp() - reads a specified block and returns the bh
1450	* @bdev: the block_device to read from
1451	* @block: number of block
1452	* @size: size (in bytes) to read
1453	* @gfp: page allocation flag
1454	*
1455	* Reads a specified block, and returns buffer head that contains it.
1456	* The page cache can be allocated from non-movable area
1457	* not to prevent page migration if you set gfp to zero.
1458	* It returns NULL if the block was unreadable.
1459	*/
1460	struct buffer_head *
1461	__bread_gfp(struct block_device *bdev, sector_t block,
1462	unsigned size, gfp_t gfp)
1463	{
1464	struct buffer_head *bh;
1465
1466	gfp \|= mapping_gfp_constraint(mapping: bdev->bd_inode->i_mapping, gfp_mask: ~__GFP_FS);
1467
1468	/*
1469	* Prefer looping in the allocator rather than here, at least that
1470	* code knows what it's doing.
1471	*/
1472	gfp \|= __GFP_NOFAIL;
1473
1474	bh = bdev_getblk(bdev, block, size, gfp);
1475
1476	if (likely(bh) && !buffer_uptodate(bh))
1477	bh = __bread_slow(bh);
1478	return bh;
1479	}
1480	EXPORT_SYMBOL(__bread_gfp);
1481
1482	static void __invalidate_bh_lrus(struct bh_lru *b)
1483	{
1484	int i;
1485
1486	for (i = `0`; i < BH_LRU_SIZE; i++) {
1487	brelse(bh: b->bhs[i]);
1488	b->bhs[i] = NULL;
1489	}
1490	}
1491	/*
1492	* invalidate_bh_lrus() is called rarely - but not only at unmount.
1493	* This doesn't race because it runs in each cpu either in irq
1494	* or with preempt disabled.
1495	*/
1496	static void invalidate_bh_lru(void *arg)
1497	{
1498	struct bh_lru *b = &get_cpu_var(bh_lrus);
1499
1500	__invalidate_bh_lrus(b);
1501	put_cpu_var(bh_lrus);
1502	}
1503
1504	bool has_bh_in_lru(int cpu, void *dummy)
1505	{
1506	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1507	int i;
1508
1509	for (i = `0`; i < BH_LRU_SIZE; i++) {
1510	if (b->bhs[i])
1511	return true;
1512	}
1513
1514	return false;
1515	}
1516
1517	void invalidate_bh_lrus(void)
1518	{
1519	on_each_cpu_cond(cond_func: has_bh_in_lru, func: invalidate_bh_lru, NULL, wait: `1`);
1520	}
1521	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1522
1523	/*
1524	* It's called from workqueue context so we need a bh_lru_lock to close
1525	* the race with preemption/irq.
1526	*/
1527	void invalidate_bh_lrus_cpu(void)
1528	{
1529	struct bh_lru *b;
1530
1531	bh_lru_lock();
1532	b = this_cpu_ptr(&bh_lrus);
1533	__invalidate_bh_lrus(b);
1534	bh_lru_unlock();
1535	}
1536
1537	void folio_set_bh(struct buffer_head bh, struct* folio *folio,
1538	unsigned long offset)
1539	{
1540	bh->b_folio = folio;
1541	BUG_ON(offset >= folio_size(folio));
1542	if (folio_test_highmem(folio))
1543	/*
1544	* This catches illegal uses and preserves the offset:
1545	*/
1546	bh->b_data = (char *)(`0` + offset);
1547	else
1548	bh->b_data = folio_address(folio) + offset;
1549	}
1550	EXPORT_SYMBOL(folio_set_bh);
1551
1552	/*
1553	* Called when truncating a buffer on a page completely.
1554	*/
1555
1556	/ Bits that are cleared during an invalidate /
1557	#define BUFFER_FLAGS_DISCARD \
1558	(1 << BH_Mapped \| 1 << BH_New \| 1 << BH_Req \| \
1559	1 << BH_Delay \| 1 << BH_Unwritten)
1560
1561	static void discard_buffer(struct buffer_head * bh)
1562	{
1563	unsigned long b_state;
1564
1565	lock_buffer(bh);
1566	clear_buffer_dirty(bh);
1567	bh->b_bdev = NULL;
1568	b_state = READ_ONCE(bh->b_state);
1569	do {
1570	} while (!try_cmpxchg(&bh->b_state, &b_state,
1571	b_state & ~BUFFER_FLAGS_DISCARD));
1572	unlock_buffer(bh);
1573	}
1574
1575	/**
1576	* block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1577	* @folio: The folio which is affected.
1578	* @offset: start of the range to invalidate
1579	* @length: length of the range to invalidate
1580	*
1581	* block_invalidate_folio() is called when all or part of the folio has been
1582	* invalidated by a truncate operation.
1583	*
1584	* block_invalidate_folio() does not have to release all buffers, but it must
1585	* ensure that no dirty buffer is left outside @offset and that no I/O
1586	* is underway against any of the blocks which are outside the truncation
1587	* point. Because the caller is about to free (and possibly reuse) those
1588	* blocks on-disk.
1589	*/
1590	void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1591	{
1592	struct buffer_head head, bh, *next;
1593	size_t curr_off = `0`;
1594	size_t stop = length + offset;
1595
1596	BUG_ON(!folio_test_locked(folio));
1597
1598	/*
1599	* Check for overflow
1600	*/
1601	BUG_ON(stop > folio_size(folio) \|\| stop < length);
1602
1603	head = folio_buffers(folio);
1604	if (!head)
1605	return;
1606
1607	bh = head;
1608	do {
1609	size_t next_off = curr_off + bh->b_size;
1610	next = bh->b_this_page;
1611
1612	/*
1613	* Are we still fully in range ?
1614	*/
1615	if (next_off > stop)
1616	goto out;
1617
1618	/*
1619	* is this block fully invalidated?
1620	*/
1621	if (offset <= curr_off)
1622	discard_buffer(bh);
1623	curr_off = next_off;
1624	bh = next;
1625	} while (bh != head);
1626
1627	/*
1628	* We release buffers only if the entire folio is being invalidated.
1629	* The get_block cached value has been unconditionally invalidated,
1630	* so real IO is not possible anymore.
1631	*/
1632	if (length == folio_size(folio))
1633	filemap_release_folio(folio, gfp: `0`);
1634	out:
1635	return;
1636	}
1637	EXPORT_SYMBOL(block_invalidate_folio);
1638
1639	/*
1640	* We attach and possibly dirty the buffers atomically wrt
1641	* block_dirty_folio() via private_lock. try_to_free_buffers
1642	* is already excluded via the folio lock.
1643	*/
1644	struct buffer_head create_empty_buffers(struct* folio *folio,
1645	unsigned long blocksize, unsigned long b_state)
1646	{
1647	struct buffer_head bh, head, *tail;
1648	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT \| __GFP_NOFAIL;
1649
1650	head = folio_alloc_buffers(folio, blocksize, gfp);
1651	bh = head;
1652	do {
1653	bh->b_state \|= b_state;
1654	tail = bh;
1655	bh = bh->b_this_page;
1656	} while (bh);
1657	tail->b_this_page = head;
1658
1659	spin_lock(lock: &folio->mapping->private_lock);
1660	if (folio_test_uptodate(folio) \|\| folio_test_dirty(folio)) {
1661	bh = head;
1662	do {
1663	if (folio_test_dirty(folio))
1664	set_buffer_dirty(bh);
1665	if (folio_test_uptodate(folio))
1666	set_buffer_uptodate(bh);
1667	bh = bh->b_this_page;
1668	} while (bh != head);
1669	}
1670	folio_attach_private(folio, data: head);
1671	spin_unlock(lock: &folio->mapping->private_lock);
1672
1673	return head;
1674	}
1675	EXPORT_SYMBOL(create_empty_buffers);
1676
1677	/**
1678	* clean_bdev_aliases: clean a range of buffers in block device
1679	* @bdev: Block device to clean buffers in
1680	* @block: Start of a range of blocks to clean
1681	* @len: Number of blocks to clean
1682	*
1683	* We are taking a range of blocks for data and we don't want writeback of any
1684	* buffer-cache aliases starting from return from this function and until the
1685	* moment when something will explicitly mark the buffer dirty (hopefully that
1686	* will not happen until we will free that block ;-) We don't even need to mark
1687	* it not-uptodate - nobody can expect anything from a newly allocated buffer
1688	* anyway. We used to use unmap_buffer() for such invalidation, but that was
1689	* wrong. We definitely don't want to mark the alias unmapped, for example - it
1690	* would confuse anyone who might pick it with bread() afterwards...
1691	*
1692	* Also.. Note that bforget() doesn't lock the buffer. So there can be
1693	* writeout I/O going on against recently-freed buffers. We don't wait on that
1694	* I/O in bforget() - it's more efficient to wait on the I/O only if we really
1695	* need to. That happens here.
1696	*/
1697	void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1698	{
1699	struct inode *bd_inode = bdev->bd_inode;
1700	struct address_space *bd_mapping = bd_inode->i_mapping;
1701	struct folio_batch fbatch;
1702	pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
1703	pgoff_t end;
1704	int i, count;
1705	struct buffer_head *bh;
1706	struct buffer_head *head;
1707
1708	end = (block + len - `1`) >> (PAGE_SHIFT - bd_inode->i_blkbits);
1709	folio_batch_init(fbatch: &fbatch);
1710	while (filemap_get_folios(mapping: bd_mapping, start: &index, end, fbatch: &fbatch)) {
1711	count = folio_batch_count(fbatch: &fbatch);
1712	for (i = `0`; i < count; i++) {
1713	struct folio *folio = fbatch.folios[i];
1714
1715	if (!folio_buffers(folio))
1716	continue;
1717	/*
1718	* We use folio lock instead of bd_mapping->private_lock
1719	* to pin buffers here since we can afford to sleep and
1720	* it scales better than a global spinlock lock.
1721	*/
1722	folio_lock(folio);
1723	/ Recheck when the folio is locked which pins bhs /
1724	head = folio_buffers(folio);
1725	if (!head)
1726	goto unlock_page;
1727	bh = head;
1728	do {
1729	if (!buffer_mapped(bh) \|\| (bh->b_blocknr < block))
1730	goto next;
1731	if (bh->b_blocknr >= block + len)
1732	break;
1733	clear_buffer_dirty(bh);
1734	wait_on_buffer(bh);
1735	clear_buffer_req(bh);
1736	next:
1737	bh = bh->b_this_page;
1738	} while (bh != head);
1739	unlock_page:
1740	folio_unlock(folio);
1741	}
1742	folio_batch_release(fbatch: &fbatch);
1743	cond_resched();
1744	/ End of range already reached? /
1745	if (index > end \|\| !index)
1746	break;
1747	}
1748	}
1749	EXPORT_SYMBOL(clean_bdev_aliases);
1750
/*
 * Size is a power-of-two in the range 512..PAGE_SIZE,
 * and the case we care about most is PAGE_SIZE.
 *
 * So this could possibly be written with those
 * constraints in mind (relevant mostly if some
 * architecture has a slow bit-scan instruction)
 *
 * Returns ilog2() of @blocksize.
 */
static inline int block_size_bits(unsigned int blocksize)
{
	const int bits = ilog2(blocksize);

	return bits;
}
1763
1764	static struct buffer_head folio_create_buffers(struct* folio *folio,
1765	struct inode *inode,
1766	unsigned int b_state)
1767	{
1768	struct buffer_head *bh;
1769
1770	BUG_ON(!folio_test_locked(folio));
1771
1772	bh = folio_buffers(folio);
1773	if (!bh)
1774	bh = create_empty_buffers(folio,
1775	`1` << READ_ONCE(inode->i_blkbits), b_state);
1776	return bh;
1777	}
1778
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
1791
1792	/*
1793	* While block_write_full_page is writing back the dirty buffers under
1794	* the page lock, whoever dirtied the buffers may decide to clean them
1795	* again at any time. We handle that by only looking at the buffer
1796	* state inside lock_buffer().
1797	*
1798	* If block_write_full_page() is called for regular writeback
1799	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1800	* locked buffer. This only can happen if someone has written the buffer
1801	* directly, with submit_bh(). At the address_space level PageWriteback
1802	* prevents this contention from occurring.
1803	*
1804	* If block_write_full_page() is called with wbc->sync_mode ==
1805	* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1806	* causes the writes to be flagged as synchronous writes.
1807	*/
1808	int __block_write_full_folio(struct inode inode, struct* folio *folio,
1809	get_block_t get_block, struct* writeback_control *wbc,
1810	bh_end_io_t *handler)
1811	{
1812	int err;
1813	sector_t block;
1814	sector_t last_block;
1815	struct buffer_head bh, head;
1816	unsigned int blocksize, bbits;
1817	int nr_underway = `0`;
1818	blk_opf_t write_flags = wbc_to_write_flags(wbc);
1819
1820	head = folio_create_buffers(folio, inode,
1821	b_state: (`1` << BH_Dirty) \| (`1` << BH_Uptodate));
1822
1823	/*
1824	* Be very careful. We have no exclusion from block_dirty_folio
1825	* here, and the (potentially unmapped) buffers may become dirty at
1826	* any time. If a buffer becomes dirty here after we've inspected it
1827	* then we just miss that fact, and the folio stays dirty.
1828	*
1829	* Buffers outside i_size may be dirtied by block_dirty_folio;
1830	* handle that here by just cleaning them.
1831	*/
1832
1833	bh = head;
1834	blocksize = bh->b_size;
1835	bbits = block_size_bits(blocksize);
1836
1837	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
1838	last_block = (i_size_read(inode) - `1`) >> bbits;
1839
1840	/*
1841	* Get all the dirty buffers mapped to disk addresses and
1842	* handle any aliases from the underlying blockdev's mapping.
1843	*/
1844	do {
1845	if (block > last_block) {
1846	/*
1847	* mapped buffers outside i_size will occur, because
1848	* this folio can be outside i_size when there is a
1849	* truncate in progress.
1850	*/
1851	/*
1852	* The buffer was zeroed by block_write_full_page()
1853	*/
1854	clear_buffer_dirty(bh);
1855	set_buffer_uptodate(bh);
1856	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
1857	buffer_dirty(bh)) {
1858	WARN_ON(bh->b_size != blocksize);
1859	err = get_block(inode, block, bh, `1`);
1860	if (err)
1861	goto recover;
1862	clear_buffer_delay(bh);
1863	if (buffer_new(bh)) {
1864	/ blockdev mappings never come here /
1865	clear_buffer_new(bh);
1866	clean_bdev_bh_alias(bh);
1867	}
1868	}
1869	bh = bh->b_this_page;
1870	block++;
1871	} while (bh != head);
1872
1873	do {
1874	if (!buffer_mapped(bh))
1875	continue;
1876	/*
1877	* If it's a fully non-blocking write attempt and we cannot
1878	* lock the buffer then redirty the folio. Note that this can
1879	* potentially cause a busy-wait loop from writeback threads
1880	* and kswapd activity, but those code paths have their own
1881	* higher-level throttling.
1882	*/
1883	if (wbc->sync_mode != WB_SYNC_NONE) {
1884	lock_buffer(bh);
1885	} else if (!trylock_buffer(bh)) {
1886	folio_redirty_for_writepage(wbc, folio);
1887	continue;
1888	}
1889	if (test_clear_buffer_dirty(bh)) {
1890	mark_buffer_async_write_endio(bh, handler);
1891	} else {
1892	unlock_buffer(bh);
1893	}
1894	} while ((bh = bh->b_this_page) != head);
1895
1896	/*
1897	* The folio and its buffers are protected by the writeback flag,
1898	* so we can drop the bh refcounts early.
1899	*/
1900	BUG_ON(folio_test_writeback(folio));
1901	folio_start_writeback(folio);
1902
1903	do {
1904	struct buffer_head *next = bh->b_this_page;
1905	if (buffer_async_write(bh)) {
1906	submit_bh_wbc(opf: REQ_OP_WRITE \| write_flags, bh, wbc);
1907	nr_underway++;
1908	}
1909	bh = next;
1910	} while (bh != head);
1911	folio_unlock(folio);
1912
1913	err = `0`;
1914	done:
1915	if (nr_underway == `0`) {
1916	/*
1917	* The folio was marked dirty, but the buffers were
1918	* clean. Someone wrote them back by hand with
1919	* write_dirty_buffer/submit_bh. A rare case.
1920	*/
1921	folio_end_writeback(folio);
1922
1923	/*
1924	* The folio and buffer_heads can be released at any time from
1925	* here on.
1926	*/
1927	}
1928	return err;
1929
1930	recover:
1931	/*
1932	* ENOSPC, or some other error. We may already have added some
1933	* blocks to the file, so we need to write these out to avoid
1934	* exposing stale data.
1935	* The folio is currently locked and not marked for writeback
1936	*/
1937	bh = head;
1938	/ Recovery: lock and submit the mapped buffers /
1939	do {
1940	if (buffer_mapped(bh) && buffer_dirty(bh) &&
1941	!buffer_delay(bh)) {
1942	lock_buffer(bh);
1943	mark_buffer_async_write_endio(bh, handler);
1944	} else {
1945	/*
1946	* The buffer may have been set dirty during
1947	* attachment to a dirty folio.
1948	*/
1949	clear_buffer_dirty(bh);
1950	}
1951	} while ((bh = bh->b_this_page) != head);
1952	folio_set_error(folio);
1953	BUG_ON(folio_test_writeback(folio));
1954	mapping_set_error(mapping: folio->mapping, error: err);
1955	folio_start_writeback(folio);
1956	do {
1957	struct buffer_head *next = bh->b_this_page;
1958	if (buffer_async_write(bh)) {
1959	clear_buffer_dirty(bh);
1960	submit_bh_wbc(opf: REQ_OP_WRITE \| write_flags, bh, wbc);
1961	nr_underway++;
1962	}
1963	bh = next;
1964	} while (bh != head);
1965	folio_unlock(folio);
1966	goto done;
1967	}
1968	EXPORT_SYMBOL(__block_write_full_folio);
1969
1970	/*
1971	* If a folio has any new buffers, zero them out here, and mark them uptodate
1972	* and dirty so they'll be written out (in order to prevent uninitialised
1973	* block data from leaking). And clear the new bit.
1974	*/
1975	void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1976	{
1977	size_t block_start, block_end;
1978	struct buffer_head head, bh;
1979
1980	BUG_ON(!folio_test_locked(folio));
1981	head = folio_buffers(folio);
1982	if (!head)
1983	return;
1984
1985	bh = head;
1986	block_start = `0`;
1987	do {
1988	block_end = block_start + bh->b_size;
1989
1990	if (buffer_new(bh)) {
1991	if (block_end > from && block_start < to) {
1992	if (!folio_test_uptodate(folio)) {
1993	size_t start, xend;
1994
1995	start = max(from, block_start);
1996	xend = min(to, block_end);
1997
1998	folio_zero_segment(folio, start, xend);
1999	set_buffer_uptodate(bh);
2000	}
2001
2002	clear_buffer_new(bh);
2003	mark_buffer_dirty(bh);
2004	}
2005	}
2006
2007	block_start = block_end;
2008	bh = bh->b_this_page;
2009	} while (bh != head);
2010	}
2011	EXPORT_SYMBOL(folio_zero_new_buffers);
2012
2013	static int
2014	iomap_to_bh(struct inode inode, sector_t block, struct* buffer_head *bh,
2015	const struct iomap *iomap)
2016	{
2017	loff_t offset = block << inode->i_blkbits;
2018
2019	bh->b_bdev = iomap->bdev;
2020
2021	/*
2022	* Block points to offset in file we need to map, iomap contains
2023	* the offset at which the map starts. If the map ends before the
2024	* current block, then do not map the buffer and let the caller
2025	* handle it.
2026	*/
2027	if (offset >= iomap->offset + iomap->length)
2028	return -EIO;
2029
2030	switch (iomap->type) {
2031	case IOMAP_HOLE:
2032	/*
2033	* If the buffer is not up to date or beyond the current EOF,
2034	* we need to mark it as new to ensure sub-block zeroing is
2035	* executed if necessary.
2036	*/
2037	if (!buffer_uptodate(bh) \|\|
2038	(offset >= i_size_read(inode)))
2039	set_buffer_new(bh);
2040	return `0`;
2041	case IOMAP_DELALLOC:
2042	if (!buffer_uptodate(bh) \|\|
2043	(offset >= i_size_read(inode)))
2044	set_buffer_new(bh);
2045	set_buffer_uptodate(bh);
2046	set_buffer_mapped(bh);
2047	set_buffer_delay(bh);
2048	return `0`;
2049	case IOMAP_UNWRITTEN:
2050	/*
2051	* For unwritten regions, we always need to ensure that regions
2052	* in the block we are not writing to are zeroed. Mark the
2053	* buffer as new to ensure this.
2054	*/
2055	set_buffer_new(bh);
2056	set_buffer_unwritten(bh);
2057	fallthrough;
2058	case IOMAP_MAPPED:
2059	if ((iomap->flags & IOMAP_F_NEW) \|\|
2060	offset >= i_size_read(inode)) {
2061	/*
2062	* This can happen if truncating the block device races
2063	* with the check in the caller as i_size updates on
2064	* block devices aren't synchronized by i_rwsem for
2065	* block devices.
2066	*/
2067	if (S_ISBLK(inode->i_mode))
2068	return -EIO;
2069	set_buffer_new(bh);
2070	}
2071	bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2072	inode->i_blkbits;
2073	set_buffer_mapped(bh);
2074	return `0`;
2075	default:
2076	WARN_ON_ONCE(`1`);
2077	return -EIO;
2078	}
2079	}
2080
2081	int __block_write_begin_int(struct folio folio, loff_t pos, unsigned* len,
2082	get_block_t get_block, const* struct iomap *iomap)
2083	{
2084	unsigned from = pos & (PAGE_SIZE - `1`);
2085	unsigned to = from + len;
2086	struct inode *inode = folio->mapping->host;
2087	unsigned block_start, block_end;
2088	sector_t block;
2089	int err = `0`;
2090	unsigned blocksize, bbits;
2091	struct buffer_head bh, head, wait[`2`], *wait_bh=wait;
2092
2093	BUG_ON(!folio_test_locked(folio));
2094	BUG_ON(from > PAGE_SIZE);
2095	BUG_ON(to > PAGE_SIZE);
2096	BUG_ON(from > to);
2097
2098	head = folio_create_buffers(folio, inode, b_state: `0`);
2099	blocksize = head->b_size;
2100	bbits = block_size_bits(blocksize);
2101
2102	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2103
2104	for(bh = head, block_start = `0`; bh != head \|\| !block_start;
2105	block++, block_start=block_end, bh = bh->b_this_page) {
2106	block_end = block_start + blocksize;
2107	if (block_end <= from \|\| block_start >= to) {
2108	if (folio_test_uptodate(folio)) {
2109	if (!buffer_uptodate(bh))
2110	set_buffer_uptodate(bh);
2111	}
2112	continue;
2113	}
2114	if (buffer_new(bh))
2115	clear_buffer_new(bh);
2116	if (!buffer_mapped(bh)) {
2117	WARN_ON(bh->b_size != blocksize);
2118	if (get_block)
2119	err = get_block(inode, block, bh, `1`);
2120	else
2121	err = iomap_to_bh(inode, block, bh, iomap);
2122	if (err)
2123	break;
2124
2125	if (buffer_new(bh)) {
2126	clean_bdev_bh_alias(bh);
2127	if (folio_test_uptodate(folio)) {
2128	clear_buffer_new(bh);
2129	set_buffer_uptodate(bh);
2130	mark_buffer_dirty(bh);
2131	continue;
2132	}
2133	if (block_end > to \|\| block_start < from)
2134	folio_zero_segments(folio,
2135	start1: to, xend1: block_end,
2136	start2: block_start, xend2: from);
2137	continue;
2138	}
2139	}
2140	if (folio_test_uptodate(folio)) {
2141	if (!buffer_uptodate(bh))
2142	set_buffer_uptodate(bh);
2143	continue;
2144	}
2145	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2146	!buffer_unwritten(bh) &&
2147	(block_start < from \|\| block_end > to)) {
2148	bh_read_nowait(bh, op_flags: `0`);
2149	*wait_bh++=bh;
2150	}
2151	}
2152	/*
2153	* If we issued read requests - let them complete.
2154	*/
2155	while(wait_bh > wait) {
2156	wait_on_buffer(bh: *--wait_bh);
2157	if (!buffer_uptodate(bh: *wait_bh))
2158	err = -EIO;
2159	}
2160	if (unlikely(err))
2161	folio_zero_new_buffers(folio, from, to);
2162	return err;
2163	}
2164
2165	int __block_write_begin(struct page page, loff_t pos, unsigned* len,
2166	get_block_t *get_block)
2167	{
2168	return __block_write_begin_int(page_folio(page), pos, len, get_block,
2169	NULL);
2170	}
2171	EXPORT_SYMBOL(__block_write_begin);
2172
2173	static void __block_commit_write(struct folio *folio, size_t from, size_t to)
2174	{
2175	size_t block_start, block_end;
2176	bool partial = false;
2177	unsigned blocksize;
2178	struct buffer_head bh, head;
2179
2180	bh = head = folio_buffers(folio);
2181	blocksize = bh->b_size;
2182
2183	block_start = `0`;
2184	do {
2185	block_end = block_start + blocksize;
2186	if (block_end <= from \|\| block_start >= to) {
2187	if (!buffer_uptodate(bh))
2188	partial = true;
2189	} else {
2190	set_buffer_uptodate(bh);
2191	mark_buffer_dirty(bh);
2192	}
2193	if (buffer_new(bh))
2194	clear_buffer_new(bh);
2195
2196	block_start = block_end;
2197	bh = bh->b_this_page;
2198	} while (bh != head);
2199
2200	/*
2201	* If this is a partial write which happened to make all buffers
2202	* uptodate then we can optimize away a bogus read_folio() for
2203	* the next read(). Here we 'discover' whether the folio went
2204	* uptodate as a result of this (potentially partial) write.
2205	*/
2206	if (!partial)
2207	folio_mark_uptodate(folio);
2208	}
2209
2210	/*
2211	* block_write_begin takes care of the basic task of block allocation and
2212	* bringing partial write blocks uptodate first.
2213	*
2214	* The filesystem needs to handle block truncation upon failure.
2215	*/
2216	int block_write_begin(struct address_space mapping, loff_t pos, unsigned* len,
2217	struct page *pagep, get_block_t get_block)
2218	{
2219	pgoff_t index = pos >> PAGE_SHIFT;
2220	struct page *page;
2221	int status;
2222
2223	page = grab_cache_page_write_begin(mapping, index);
2224	if (!page)
2225	return -ENOMEM;
2226
2227	status = __block_write_begin(page, pos, len, get_block);
2228	if (unlikely(status)) {
2229	unlock_page(page);
2230	put_page(page);
2231	page = NULL;
2232	}
2233
2234	*pagep = page;
2235	return status;
2236	}
2237	EXPORT_SYMBOL(block_write_begin);
2238
2239	int block_write_end(struct file file, struct* address_space *mapping,
2240	loff_t pos, unsigned len, unsigned copied,
2241	struct page page, void* *fsdata)
2242	{
2243	struct folio *folio = page_folio(page);
2244	size_t start = pos - folio_pos(folio);
2245
2246	if (unlikely(copied < len)) {
2247	/*
2248	* The buffers that were written will now be uptodate, so
2249	* we don't have to worry about a read_folio reading them
2250	* and overwriting a partial write. However if we have
2251	* encountered a short write and only partially written
2252	* into a buffer, it will not be marked uptodate, so a
2253	* read_folio might come in and destroy our partial write.
2254	*
2255	* Do the simplest thing, and just treat any short write to a
2256	* non uptodate folio as a zero-length write, and force the
2257	* caller to redo the whole thing.
2258	*/
2259	if (!folio_test_uptodate(folio))
2260	copied = `0`;
2261
2262	folio_zero_new_buffers(folio, start+copied, start+len);
2263	}
2264	flush_dcache_folio(folio);
2265
2266	/ This could be a short (even 0-length) commit /
2267	__block_commit_write(folio, from: start, to: start + copied);
2268
2269	return copied;
2270	}
2271	EXPORT_SYMBOL(block_write_end);
2272
2273	int generic_write_end(struct file file, struct* address_space *mapping,
2274	loff_t pos, unsigned len, unsigned copied,
2275	struct page page, void* *fsdata)
2276	{
2277	struct inode *inode = mapping->host;
2278	loff_t old_size = inode->i_size;
2279	bool i_size_changed = false;
2280
2281	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2282
2283	/*
2284	* No need to use i_size_read() here, the i_size cannot change under us
2285	* because we hold i_rwsem.
2286	*
2287	* But it's important to update i_size while still holding page lock:
2288	* page writeout could otherwise come in and zero beyond i_size.
2289	*/
2290	if (pos + copied > inode->i_size) {
2291	i_size_write(inode, i_size: pos + copied);
2292	i_size_changed = true;
2293	}
2294
2295	unlock_page(page);
2296	put_page(page);
2297
2298	if (old_size < pos)
2299	pagecache_isize_extended(inode, from: old_size, to: pos);
2300	/*
2301	* Don't mark the inode dirty under page lock. First, it unnecessarily
2302	* makes the holding time of page lock longer. Second, it forces lock
2303	* ordering of page lock and transaction start for journaling
2304	* filesystems.
2305	*/
2306	if (i_size_changed)
2307	mark_inode_dirty(inode);
2308	return copied;
2309	}
2310	EXPORT_SYMBOL(generic_write_end);
2311
2312	/*
2313	* block_is_partially_uptodate checks whether buffers within a folio are
2314	* uptodate or not.
2315	*
2316	* Returns true if all buffers which correspond to the specified part
2317	* of the folio are uptodate.
2318	*/
2319	bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2320	{
2321	unsigned block_start, block_end, blocksize;
2322	unsigned to;
2323	struct buffer_head bh, head;
2324	bool ret = true;
2325
2326	head = folio_buffers(folio);
2327	if (!head)
2328	return false;
2329	blocksize = head->b_size;
2330	to = min_t(unsigned, folio_size(folio) - from, count);
2331	to = from + to;
2332	if (from < blocksize && to > folio_size(folio) - blocksize)
2333	return false;
2334
2335	bh = head;
2336	block_start = `0`;
2337	do {
2338	block_end = block_start + blocksize;
2339	if (block_end > from && block_start < to) {
2340	if (!buffer_uptodate(bh)) {
2341	ret = false;
2342	break;
2343	}
2344	if (block_end >= to)
2345	break;
2346	}
2347	block_start = block_end;
2348	bh = bh->b_this_page;
2349	} while (bh != head);
2350
2351	return ret;
2352	}
2353	EXPORT_SYMBOL(block_is_partially_uptodate);
2354
2355	/*
2356	* Generic "read_folio" function for block devices that have the normal
2357	* get_block functionality. This is most of the block device filesystems.
2358	* Reads the folio asynchronously --- the unlock_buffer() and
2359	* set/clear_buffer_uptodate() functions propagate buffer state into the
2360	* folio once IO has completed.
2361	*/
2362	int block_read_full_folio(struct folio folio, get_block_t get_block)
2363	{
2364	struct inode *inode = folio->mapping->host;
2365	sector_t iblock, lblock;
2366	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
2367	unsigned int blocksize, bbits;
2368	int nr, i;
2369	int fully_mapped = `1`;
2370	bool page_error = false;
2371	loff_t limit = i_size_read(inode);
2372
2373	/ This is needed for ext4. /
2374	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2375	limit = inode->i_sb->s_maxbytes;
2376
2377	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2378
2379	head = folio_create_buffers(folio, inode, b_state: `0`);
2380	blocksize = head->b_size;
2381	bbits = block_size_bits(blocksize);
2382
2383	iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
2384	lblock = (limit+blocksize-`1`) >> bbits;
2385	bh = head;
2386	nr = `0`;
2387	i = `0`;
2388
2389	do {
2390	if (buffer_uptodate(bh))
2391	continue;
2392
2393	if (!buffer_mapped(bh)) {
2394	int err = `0`;
2395
2396	fully_mapped = `0`;
2397	if (iblock < lblock) {
2398	WARN_ON(bh->b_size != blocksize);
2399	err = get_block(inode, iblock, bh, `0`);
2400	if (err) {
2401	folio_set_error(folio);
2402	page_error = true;
2403	}
2404	}
2405	if (!buffer_mapped(bh)) {
2406	folio_zero_range(folio, start: i * blocksize,
2407	length: blocksize);
2408	if (!err)
2409	set_buffer_uptodate(bh);
2410	continue;
2411	}
2412	/*
2413	* get_block() might have updated the buffer
2414	* synchronously
2415	*/
2416	if (buffer_uptodate(bh))
2417	continue;
2418	}
2419	arr[nr++] = bh;
2420	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2421
2422	if (fully_mapped)
2423	folio_set_mappedtodisk(folio);
2424
2425	if (!nr) {
2426	/*
2427	* All buffers are uptodate or get_block() returned an
2428	* error when trying to map them - we can finish the read.
2429	*/
2430	folio_end_read(folio, success: !page_error);
2431	return `0`;
2432	}
2433
2434	/ Stage two: lock the buffers /
2435	for (i = `0`; i < nr; i++) {
2436	bh = arr[i];
2437	lock_buffer(bh);
2438	mark_buffer_async_read(bh);
2439	}
2440
2441	/*
2442	* Stage 3: start the IO. Check for uptodateness
2443	* inside the buffer lock in case another process reading
2444	* the underlying blockdev brought it uptodate (the sct fix).
2445	*/
2446	for (i = `0`; i < nr; i++) {
2447	bh = arr[i];
2448	if (buffer_uptodate(bh))
2449	end_buffer_async_read(bh, uptodate: `1`);
2450	else
2451	submit_bh(REQ_OP_READ, bh);
2452	}
2453	return `0`;
2454	}
2455	EXPORT_SYMBOL(block_read_full_folio);
2456
2457	/ utility function for filesystems that need to do work on expanding*
2458	* truncates. Uses filesystem pagecache writes to allow the filesystem to
2459	* deal with the hole.
2460	*/
2461	int generic_cont_expand_simple(struct inode *inode, loff_t size)
2462	{
2463	struct address_space *mapping = inode->i_mapping;
2464	const struct address_space_operations *aops = mapping->a_ops;
2465	struct page *page;
2466	void *fsdata = NULL;
2467	int err;
2468
2469	err = inode_newsize_ok(inode, offset: size);
2470	if (err)
2471	goto out;
2472
2473	err = aops->write_begin(NULL, mapping, size, `0`, &page, &fsdata);
2474	if (err)
2475	goto out;
2476
2477	err = aops->write_end(NULL, mapping, size, `0`, `0`, page, fsdata);
2478	BUG_ON(err > `0`);
2479
2480	out:
2481	return err;
2482	}
2483	EXPORT_SYMBOL(generic_cont_expand_simple);
2484
2485	static int cont_expand_zero(struct file file, struct* address_space *mapping,
2486	loff_t pos, loff_t *bytes)
2487	{
2488	struct inode *inode = mapping->host;
2489	const struct address_space_operations *aops = mapping->a_ops;
2490	unsigned int blocksize = i_blocksize(node: inode);
2491	struct page *page;
2492	void *fsdata = NULL;
2493	pgoff_t index, curidx;
2494	loff_t curpos;
2495	unsigned zerofrom, offset, len;
2496	int err = `0`;
2497
2498	index = pos >> PAGE_SHIFT;
2499	offset = pos & ~PAGE_MASK;
2500
2501	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2502	zerofrom = curpos & ~PAGE_MASK;
2503	if (zerofrom & (blocksize-`1`)) {
2504	*bytes \|= (blocksize-`1`);
2505	(*bytes)++;
2506	}
2507	len = PAGE_SIZE - zerofrom;
2508
2509	err = aops->write_begin(file, mapping, curpos, len,
2510	&page, &fsdata);
2511	if (err)
2512	goto out;
2513	zero_user(page, start: zerofrom, size: len);
2514	err = aops->write_end(file, mapping, curpos, len, len,
2515	page, fsdata);
2516	if (err < `0`)
2517	goto out;
2518	BUG_ON(err != len);
2519	err = `0`;
2520
2521	balance_dirty_pages_ratelimited(mapping);
2522
2523	if (fatal_signal_pending(current)) {
2524	err = -EINTR;
2525	goto out;
2526	}
2527	}
2528
2529	/ page covers the boundary, find the boundary offset /
2530	if (index == curidx) {
2531	zerofrom = curpos & ~PAGE_MASK;
2532	/ if we will expand the thing last block will be filled /
2533	if (offset <= zerofrom) {
2534	goto out;
2535	}
2536	if (zerofrom & (blocksize-`1`)) {
2537	*bytes \|= (blocksize-`1`);
2538	(*bytes)++;
2539	}
2540	len = offset - zerofrom;
2541
2542	err = aops->write_begin(file, mapping, curpos, len,
2543	&page, &fsdata);
2544	if (err)
2545	goto out;
2546	zero_user(page, start: zerofrom, size: len);
2547	err = aops->write_end(file, mapping, curpos, len, len,
2548	page, fsdata);
2549	if (err < `0`)
2550	goto out;
2551	BUG_ON(err != len);
2552	err = `0`;
2553	}
2554	out:
2555	return err;
2556	}
2557
2558	/*
2559	* For moronic filesystems that do not allow holes in file.
2560	* We may have to extend the file.
2561	*/
2562	int cont_write_begin(struct file file, struct* address_space *mapping,
2563	loff_t pos, unsigned len,
2564	struct page *pagep, void* **fsdata,
2565	get_block_t get_block, loff_t bytes)
2566	{
2567	struct inode *inode = mapping->host;
2568	unsigned int blocksize = i_blocksize(node: inode);
2569	unsigned int zerofrom;
2570	int err;
2571
2572	err = cont_expand_zero(file, mapping, pos, bytes);
2573	if (err)
2574	return err;
2575
2576	zerofrom = *bytes & ~PAGE_MASK;
2577	if (pos+len > *bytes && zerofrom & (blocksize-`1`)) {
2578	*bytes \|= (blocksize-`1`);
2579	(*bytes)++;
2580	}
2581
2582	return block_write_begin(mapping, pos, len, pagep, get_block);
2583	}
2584	EXPORT_SYMBOL(cont_write_begin);
2585
/*
 * block_commit_write - page-based wrapper around __block_commit_write()
 * @page: page that was written to
 * @from: start offset of the written range
 * @to: end offset (exclusive) of the written range
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
	struct folio *folio = page_folio(page);
	__block_commit_write(folio, from, to);
}
EXPORT_SYMBOL(block_commit_write);
2592
2593	/*
2594	* block_page_mkwrite() is not allowed to change the file size as it gets
2595	* called from a page fault handler when a page is first dirtied. Hence we must
2596	* be careful to check for EOF conditions here. We set the page up correctly
2597	* for a written page which means we get ENOSPC checking when writing into
2598	* holes and correct delalloc and unwritten extent mapping on filesystems that
2599	* support these features.
2600	*
2601	* We are not allowed to take the i_mutex here so we have to play games to
2602	* protect against truncate races as the page could now be beyond EOF. Because
2603	* truncate writes the inode size before removing pages, once we have the
2604	* page lock we can determine safely if the page is beyond EOF. If it is not
2605	* beyond EOF, then the page is guaranteed safe against truncation until we
2606	* unlock the page.
2607	*
2608	* Direct callers of this function should protect against filesystem freezing
2609	* using sb_start_pagefault() - sb_end_pagefault() functions.
2610	*/
2611	int block_page_mkwrite(struct vm_area_struct vma, struct* vm_fault *vmf,
2612	get_block_t get_block)
2613	{
2614	struct folio *folio = page_folio(vmf->page);
2615	struct inode *inode = file_inode(f: vma->vm_file);
2616	unsigned long end;
2617	loff_t size;
2618	int ret;
2619
2620	folio_lock(folio);
2621	size = i_size_read(inode);
2622	if ((folio->mapping != inode->i_mapping) \|\|
2623	(folio_pos(folio) >= size)) {
2624	/ We overload EFAULT to mean page got truncated /
2625	ret = -EFAULT;
2626	goto out_unlock;
2627	}
2628
2629	end = folio_size(folio);
2630	/ folio is wholly or partially inside EOF /
2631	if (folio_pos(folio) + end > size)
2632	end = size - folio_pos(folio);
2633
2634	ret = __block_write_begin_int(folio, pos: `0`, len: end, get_block, NULL);
2635	if (unlikely(ret))
2636	goto out_unlock;
2637
2638	__block_commit_write(folio, from: `0`, to: end);
2639
2640	folio_mark_dirty(folio);
2641	folio_wait_stable(folio);
2642	return `0`;
2643	out_unlock:
2644	folio_unlock(folio);
2645	return ret;
2646	}
2647	EXPORT_SYMBOL(block_page_mkwrite);
2648
2649	int block_truncate_page(struct address_space *mapping,
2650	loff_t from, get_block_t *get_block)
2651	{
2652	pgoff_t index = from >> PAGE_SHIFT;
2653	unsigned blocksize;
2654	sector_t iblock;
2655	size_t offset, length, pos;
2656	struct inode *inode = mapping->host;
2657	struct folio *folio;
2658	struct buffer_head *bh;
2659	int err = `0`;
2660
2661	blocksize = i_blocksize(node: inode);
2662	length = from & (blocksize - `1`);
2663
2664	/ Block boundary? Nothing to do /
2665	if (!length)
2666	return `0`;
2667
2668	length = blocksize - length;
2669	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2670
2671	folio = filemap_grab_folio(mapping, index);
2672	if (IS_ERR(ptr: folio))
2673	return PTR_ERR(ptr: folio);
2674
2675	bh = folio_buffers(folio);
2676	if (!bh)
2677	bh = create_empty_buffers(folio, blocksize, `0`);
2678
2679	/ Find the buffer that contains "offset" /
2680	offset = offset_in_folio(folio, from);
2681	pos = blocksize;
2682	while (offset >= pos) {
2683	bh = bh->b_this_page;
2684	iblock++;
2685	pos += blocksize;
2686	}
2687
2688	if (!buffer_mapped(bh)) {
2689	WARN_ON(bh->b_size != blocksize);
2690	err = get_block(inode, iblock, bh, `0`);
2691	if (err)
2692	goto unlock;
2693	/ unmapped? It's a hole - nothing to do /
2694	if (!buffer_mapped(bh))
2695	goto unlock;
2696	}
2697
2698	/ Ok, it's mapped. Make sure it's up-to-date /
2699	if (folio_test_uptodate(folio))
2700	set_buffer_uptodate(bh);
2701
2702	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2703	err = bh_read(bh, op_flags: `0`);
2704	/ Uhhuh. Read error. Complain and punt. /
2705	if (err < `0`)
2706	goto unlock;
2707	}
2708
2709	folio_zero_range(folio, start: offset, length);
2710	mark_buffer_dirty(bh);
2711
2712	unlock:
2713	folio_unlock(folio);
2714	folio_put(folio);
2715
2716	return err;
2717	}
2718	EXPORT_SYMBOL(block_truncate_page);
2719
2720	/*
2721	* The generic ->writepage function for buffer-backed address_spaces
2722	*/
2723	int block_write_full_page(struct page page, get_block_t get_block,
2724	struct writeback_control *wbc)
2725	{
2726	struct folio *folio = page_folio(page);
2727	struct inode * const inode = folio->mapping->host;
2728	loff_t i_size = i_size_read(inode);
2729
2730	/ Is the folio fully inside i_size? /
2731	if (folio_pos(folio) + folio_size(folio) <= i_size)
2732	return __block_write_full_folio(inode, folio, get_block, wbc,
2733	end_buffer_async_write);
2734
2735	/ Is the folio fully outside i_size? (truncate in progress) /
2736	if (folio_pos(folio) >= i_size) {
2737	folio_unlock(folio);
2738	return `0`; / don't care /
2739	}
2740
2741	/*
2742	* The folio straddles i_size. It must be zeroed out on each and every
2743	* writepage invocation because it may be mmapped. "A file is mapped
2744	* in multiples of the page size. For a file that is not a multiple of
2745	* the page size, the remaining memory is zeroed when mapped, and
2746	* writes to that region are not written out to the file."
2747	*/
2748	folio_zero_segment(folio, offset_in_folio(folio, i_size),
2749	xend: folio_size(folio));
2750	return __block_write_full_folio(inode, folio, get_block, wbc,
2751	end_buffer_async_write);
2752	}
2753	EXPORT_SYMBOL(block_write_full_page);
2754
2755	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2756	get_block_t *get_block)
2757	{
2758	struct inode *inode = mapping->host;
2759	struct buffer_head tmp = {
2760	.b_size = i_blocksize(node: inode),
2761	};
2762
2763	get_block(inode, block, &tmp, `0`);
2764	return tmp.b_blocknr;
2765	}
2766	EXPORT_SYMBOL(generic_block_bmap);
2767
2768	static void end_bio_bh_io_sync(struct bio *bio)
2769	{
2770	struct buffer_head *bh = bio->bi_private;
2771
2772	if (unlikely(bio_flagged(bio, BIO_QUIET)))
2773	set_bit(nr: BH_Quiet, addr: &bh->b_state);
2774
2775	bh->b_end_io(bh, !bio->bi_status);
2776	bio_put(bio);
2777	}
2778
2779	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2780	struct writeback_control *wbc)
2781	{
2782	const enum req_op op = opf & REQ_OP_MASK;
2783	struct bio *bio;
2784
2785	BUG_ON(!buffer_locked(bh));
2786	BUG_ON(!buffer_mapped(bh));
2787	BUG_ON(!bh->b_end_io);
2788	BUG_ON(buffer_delay(bh));
2789	BUG_ON(buffer_unwritten(bh));
2790
2791	/*
2792	* Only clear out a write error when rewriting
2793	*/
2794	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2795	clear_buffer_write_io_error(bh);
2796
2797	if (buffer_meta(bh))
2798	opf \|= REQ_META;
2799	if (buffer_prio(bh))
2800	opf \|= REQ_PRIO;
2801
2802	bio = bio_alloc(bdev: bh->b_bdev, nr_vecs: `1`, opf, GFP_NOIO);
2803
2804	fscrypt_set_bio_crypt_ctx_bh(bio, first_bh: bh, GFP_NOIO);
2805
2806	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> `9`);
2807
2808	__bio_add_page(bio, page: bh->b_page, len: bh->b_size, off: bh_offset(bh));
2809
2810	bio->bi_end_io = end_bio_bh_io_sync;
2811	bio->bi_private = bh;
2812
2813	/ Take care of bh's that straddle the end of the device /
2814	guard_bio_eod(bio);
2815
2816	if (wbc) {
2817	wbc_init_bio(wbc, bio);
2818	wbc_account_cgroup_owner(wbc, page: bh->b_page, bytes: bh->b_size);
2819	}
2820
2821	submit_bio(bio);
2822	}
2823
2824	void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2825	{
2826	submit_bh_wbc(opf, bh, NULL);
2827	}
2828	EXPORT_SYMBOL(submit_bh);
2829
2830	void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2831	{
2832	lock_buffer(bh);
2833	if (!test_clear_buffer_dirty(bh)) {
2834	unlock_buffer(bh);
2835	return;
2836	}
2837	bh->b_end_io = end_buffer_write_sync;
2838	get_bh(bh);
2839	submit_bh(REQ_OP_WRITE \| op_flags, bh);
2840	}
2841	EXPORT_SYMBOL(write_dirty_buffer);
2842
2843	/*
2844	* For a data-integrity writeout, we need to wait upon any in-progress I/O
2845	* and then start new I/O and then wait upon it. The caller must have a ref on
2846	* the buffer_head.
2847	*/
2848	int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2849	{
2850	WARN_ON(atomic_read(&bh->b_count) < `1`);
2851	lock_buffer(bh);
2852	if (test_clear_buffer_dirty(bh)) {
2853	/*
2854	* The bh should be mapped, but it might not be if the
2855	* device was hot-removed. Not much we can do but fail the I/O.
2856	*/
2857	if (!buffer_mapped(bh)) {
2858	unlock_buffer(bh);
2859	return -EIO;
2860	}
2861
2862	get_bh(bh);
2863	bh->b_end_io = end_buffer_write_sync;
2864	submit_bh(REQ_OP_WRITE \| op_flags, bh);
2865	wait_on_buffer(bh);
2866	if (!buffer_uptodate(bh))
2867	return -EIO;
2868	} else {
2869	unlock_buffer(bh);
2870	}
2871	return `0`;
2872	}
2873	EXPORT_SYMBOL(__sync_dirty_buffer);
2874
2875	int sync_dirty_buffer(struct buffer_head *bh)
2876	{
2877	return __sync_dirty_buffer(bh, REQ_SYNC);
2878	}
2879	EXPORT_SYMBOL(sync_dirty_buffer);
2880
2881	/*
2882	* try_to_free_buffers() checks if all the buffers on this particular folio
2883	* are unused, and releases them if so.
2884	*
2885	* Exclusion against try_to_free_buffers may be obtained by either
2886	* locking the folio or by holding its mapping's private_lock.
2887	*
2888	* If the folio is dirty but all the buffers are clean then we need to
2889	* be sure to mark the folio clean as well. This is because the folio
2890	* may be against a block device, and a later reattachment of buffers
2891	* to a dirty folio will set all buffers dirty. Which would corrupt
2892	* filesystem data on the same device.
2893	*
2894	* The same applies to regular filesystem folios: if all the buffers are
2895	* clean then we set the folio clean and proceed. To do that, we require
2896	* total exclusion from block_dirty_folio(). That is obtained with
2897	* private_lock.
2898	*
2899	* try_to_free_buffers() is non-blocking.
2900	*/
2901	static inline int buffer_busy(struct buffer_head *bh)
2902	{
2903	return atomic_read(v: &bh->b_count) \|
2904	(bh->b_state & ((`1` << BH_Dirty) \| (`1` << BH_Lock)));
2905	}
2906
2907	static bool
2908	drop_buffers(struct folio folio, struct* buffer_head **buffers_to_free)
2909	{
2910	struct buffer_head *head = folio_buffers(folio);
2911	struct buffer_head *bh;
2912
2913	bh = head;
2914	do {
2915	if (buffer_busy(bh))
2916	goto failed;
2917	bh = bh->b_this_page;
2918	} while (bh != head);
2919
2920	do {
2921	struct buffer_head *next = bh->b_this_page;
2922
2923	if (bh->b_assoc_map)
2924	__remove_assoc_queue(bh);
2925	bh = next;
2926	} while (bh != head);
2927	*buffers_to_free = head;
2928	folio_detach_private(folio);
2929	return true;
2930	failed:
2931	return false;
2932	}
2933
2934	bool try_to_free_buffers(struct folio *folio)
2935	{
2936	struct address_space * const mapping = folio->mapping;
2937	struct buffer_head *buffers_to_free = NULL;
2938	bool ret = `0`;
2939
2940	BUG_ON(!folio_test_locked(folio));
2941	if (folio_test_writeback(folio))
2942	return false;
2943
2944	if (mapping == NULL) { / can this still happen? /
2945	ret = drop_buffers(folio, buffers_to_free: &buffers_to_free);
2946	goto out;
2947	}
2948
2949	spin_lock(lock: &mapping->private_lock);
2950	ret = drop_buffers(folio, buffers_to_free: &buffers_to_free);
2951
2952	/*
2953	* If the filesystem writes its buffers by hand (eg ext3)
2954	* then we can have clean buffers against a dirty folio. We
2955	* clean the folio here; otherwise the VM will never notice
2956	* that the filesystem did any IO at all.
2957	*
2958	* Also, during truncate, discard_buffer will have marked all
2959	* the folio's buffers clean. We discover that here and clean
2960	* the folio also.
2961	*
2962	* private_lock must be held over this entire operation in order
2963	* to synchronise against block_dirty_folio and prevent the
2964	* dirty bit from being lost.
2965	*/
2966	if (ret)
2967	folio_cancel_dirty(folio);
2968	spin_unlock(lock: &mapping->private_lock);
2969	out:
2970	if (buffers_to_free) {
2971	struct buffer_head *bh = buffers_to_free;
2972
2973	do {
2974	struct buffer_head *next = bh->b_this_page;
2975	free_buffer_head(bh);
2976	bh = next;
2977	} while (bh != buffers_to_free);
2978	}
2979	return ret;
2980	}
2981	EXPORT_SYMBOL(try_to_free_buffers);
2982
2983	/*
2984	* Buffer-head allocation
2985	*/
2986	static struct kmem_cache *bh_cachep __ro_after_init;
2987
2988	/*
2989	* Once the number of bh's in the machine exceeds this level, we start
2990	* stripping them in writeback.
2991	*/
2992	static unsigned long max_buffer_heads __ro_after_init;
2993
2994	int buffer_heads_over_limit;
2995
2996	struct bh_accounting {
2997	int nr; / Number of live bh's /
2998	int ratelimit; / Limit cacheline bouncing /
2999	};
3000
3001	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {`0`, `0`};
3002
3003	static void recalc_bh_state(void)
3004	{
3005	int i;
3006	int tot = `0`;
3007
3008	if (__this_cpu_inc_return(bh_accounting.ratelimit) - `1` < `4096`)
3009	return;
3010	__this_cpu_write(bh_accounting.ratelimit, `0`);
3011	for_each_online_cpu(i)
3012	tot += per_cpu(bh_accounting, i).nr;
3013	buffer_heads_over_limit = (tot > max_buffer_heads);
3014	}
3015
3016	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3017	{
3018	struct buffer_head *ret = kmem_cache_zalloc(k: bh_cachep, flags: gfp_flags);
3019	if (ret) {
3020	INIT_LIST_HEAD(list: &ret->b_assoc_buffers);
3021	spin_lock_init(&ret->b_uptodate_lock);
3022	preempt_disable();
3023	__this_cpu_inc(bh_accounting.nr);
3024	recalc_bh_state();
3025	preempt_enable();
3026	}
3027	return ret;
3028	}
3029	EXPORT_SYMBOL(alloc_buffer_head);
3030
3031	void free_buffer_head(struct buffer_head *bh)
3032	{
3033	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3034	kmem_cache_free(s: bh_cachep, objp: bh);
3035	preempt_disable();
3036	__this_cpu_dec(bh_accounting.nr);
3037	recalc_bh_state();
3038	preempt_enable();
3039	}
3040	EXPORT_SYMBOL(free_buffer_head);
3041
3042	static int buffer_exit_cpu_dead(unsigned int cpu)
3043	{
3044	int i;
3045	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3046
3047	for (i = `0`; i < BH_LRU_SIZE; i++) {
3048	brelse(bh: b->bhs[i]);
3049	b->bhs[i] = NULL;
3050	}
3051	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3052	per_cpu(bh_accounting, cpu).nr = `0`;
3053	return `0`;
3054	}
3055
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return: 1 if the buffer is up-to-date (the buffer is left unlocked), or
 * 0 with the buffer locked if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		/* Re-check now that we hold the buffer lock. */
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3074
3075	/**
3076	* __bh_read - Submit read for a locked buffer
3077	* @bh: struct buffer_head
3078	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3079	* @wait: wait until reading finish
3080	*
3081	* Returns zero on success or don't wait, and -EIO on error.
3082	*/
3083	int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3084	{
3085	int ret = `0`;
3086
3087	BUG_ON(!buffer_locked(bh));
3088
3089	get_bh(bh);
3090	bh->b_end_io = end_buffer_read_sync;
3091	submit_bh(REQ_OP_READ \| op_flags, bh);
3092	if (wait) {
3093	wait_on_buffer(bh);
3094	if (!buffer_uptodate(bh))
3095	ret = -EIO;
3096	}
3097	return ret;
3098	}
3099	EXPORT_SYMBOL(__bh_read);
3100
3101	/**
3102	* __bh_read_batch - Submit read for a batch of unlocked buffers
3103	* @nr: entry number of the buffer batch
3104	* @bhs: a batch of struct buffer_head
3105	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3106	* @force_lock: force to get a lock on the buffer if set, otherwise drops any
3107	* buffer that cannot lock.
3108	*
3109	* Returns zero on success or don't wait, and -EIO on error.
3110	*/
3111	void __bh_read_batch(int nr, struct buffer_head *bhs[],
3112	blk_opf_t op_flags, bool force_lock)
3113	{
3114	int i;
3115
3116	for (i = `0`; i < nr; i++) {
3117	struct buffer_head *bh = bhs[i];
3118
3119	if (buffer_uptodate(bh))
3120	continue;
3121
3122	if (force_lock)
3123	lock_buffer(bh);
3124	else
3125	if (!trylock_buffer(bh))
3126	continue;
3127
3128	if (buffer_uptodate(bh)) {
3129	unlock_buffer(bh);
3130	continue;
3131	}
3132
3133	bh->b_end_io = end_buffer_read_sync;
3134	get_bh(bh);
3135	submit_bh(REQ_OP_READ \| op_flags, bh);
3136	}
3137	}
3138	EXPORT_SYMBOL(__bh_read_batch);
3139
3140	void __init buffer_init(void)
3141	{
3142	unsigned long nrpages;
3143	int ret;
3144
3145	bh_cachep = kmem_cache_create(name: "buffer_head",
3146	size: sizeof(struct buffer_head), align: `0`,
3147	flags: (SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
3148	SLAB_MEM_SPREAD),
3149	NULL);
3150
3151	/*
3152	* Limit the bh occupancy to 10% of ZONE_NORMAL
3153	*/
3154	nrpages = (nr_free_buffer_pages() * `10`) / `100`;
3155	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3156	ret = cpuhp_setup_state_nocalls(state: CPUHP_FS_BUFF_DEAD, name: "fs/buffer:dead",
3157	NULL, teardown: buffer_exit_cpu_dead);
3158	WARN_ON(ret < `0`);
3159	}
3160

source code of linux/fs/buffer.c