buffer.c source code [linux/fs/buffer.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/buffer.c
4	*
5	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
6	*/
7
8	/*
9	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10	*
11	* Removed a lot of unnecessary code and simplified things now that
12	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13	*
14	* Speed up hash, lru, and free list operations. Use gfp() for allocating
15	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16	*
17	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
18	*
19	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
20	*/
21
22	#include <linux/kernel.h>
23	#include <linux/sched/signal.h>
24	#include <linux/syscalls.h>
25	#include <linux/fs.h>
26	#include <linux/iomap.h>
27	#include <linux/mm.h>
28	#include <linux/percpu.h>
29	#include <linux/slab.h>
30	#include <linux/capability.h>
31	#include <linux/blkdev.h>
32	#include <linux/file.h>
33	#include <linux/quotaops.h>
34	#include <linux/highmem.h>
35	#include <linux/export.h>
36	#include <linux/backing-dev.h>
37	#include <linux/writeback.h>
38	#include <linux/hash.h>
39	#include <linux/suspend.h>
40	#include <linux/buffer_head.h>
41	#include <linux/task_io_accounting_ops.h>
42	#include <linux/bio.h>
43	#include <linux/cpu.h>
44	#include <linux/bitops.h>
45	#include <linux/mpage.h>
46	#include <linux/bit_spinlock.h>
47	#include <linux/pagevec.h>
48	#include <linux/sched/mm.h>
49	#include <trace/events/block.h>
50	#include <linux/fscrypt.h>
51	#include <linux/fsverity.h>
52	#include <linux/sched/isolation.h>
53
54	#include "internal.h"
55
56	static int fsync_buffers_list(spinlock_t lock, struct* list_head *list);
57	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
58	enum rw_hint hint, struct writeback_control *wbc);
59
60	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
61
62	inline void touch_buffer(struct buffer_head *bh)
63	{
64	trace_block_touch_buffer(bh);
65	folio_mark_accessed(bh->b_folio);
66	}
67	EXPORT_SYMBOL(touch_buffer);
68
69	void __lock_buffer(struct buffer_head *bh)
70	{
71	wait_on_bit_lock_io(word: &bh->b_state, bit: BH_Lock, TASK_UNINTERRUPTIBLE);
72	}
73	EXPORT_SYMBOL(__lock_buffer);
74
75	void unlock_buffer(struct buffer_head *bh)
76	{
77	clear_bit_unlock(nr: BH_Lock, addr: &bh->b_state);
78	smp_mb__after_atomic();
79	wake_up_bit(word: &bh->b_state, bit: BH_Lock);
80	}
81	EXPORT_SYMBOL(unlock_buffer);
82
83	/*
84	* Returns if the folio has dirty or writeback buffers. If all the buffers
85	* are unlocked and clean then the folio_test_dirty information is stale. If
86	* any of the buffers are locked, it is assumed they are locked for IO.
87	*/
88	void buffer_check_dirty_writeback(struct folio *folio,
89	bool dirty, bool writeback)
90	{
91	struct buffer_head head, bh;
92	*dirty = false;
93	*writeback = false;
94
95	BUG_ON(!folio_test_locked(folio));
96
97	head = folio_buffers(folio);
98	if (!head)
99	return;
100
101	if (folio_test_writeback(folio))
102	*writeback = true;
103
104	bh = head;
105	do {
106	if (buffer_locked(bh))
107	*writeback = true;
108
109	if (buffer_dirty(bh))
110	*dirty = true;
111
112	bh = bh->b_this_page;
113	} while (bh != head);
114	}
115
116	/*
117	* Block until a buffer comes unlocked. This doesn't stop it
118	* from becoming locked again - you have to lock it yourself
119	* if you want to preserve its state.
120	*/
121	void __wait_on_buffer(struct buffer_head * bh)
122	{
123	wait_on_bit_io(word: &bh->b_state, bit: BH_Lock, TASK_UNINTERRUPTIBLE);
124	}
125	EXPORT_SYMBOL(__wait_on_buffer);
126
127	static void buffer_io_error(struct buffer_head bh, char* *msg)
128	{
129	if (!test_bit(BH_Quiet, &bh->b_state))
130	printk_ratelimited(KERN_ERR
131	"Buffer I/O error on dev %pg, logical block %llu%s\n",
132	bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
133	}
134
/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}
153
/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer, then drop one reference on it.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
164
/*
 * Synchronous end-of-write handler.  On success the buffer is marked
 * uptodate; on failure the error is logged, recorded via
 * mark_buffer_write_io_error() and the uptodate bit is cleared.  In both
 * cases the buffer is unlocked and one reference is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
178
179	/*
180	* Various filesystems appear to want __find_get_block to be non-blocking.
181	* But it's the page lock which protects the buffers. To get around this,
182	* we get exclusion from try_to_free_buffers with the blockdev mapping's
183	* i_private_lock.
184	*
185	* Hack idea: for the blockdev mapping, i_private_lock contention
186	* may be quite high. This code could TryLock the page, and if that
187	* succeeds, there is no need to take i_private_lock.
188	*/
189	static struct buffer_head *
190	__find_get_block_slow(struct block_device *bdev, sector_t block)
191	{
192	struct inode *bd_inode = bdev->bd_inode;
193	struct address_space *bd_mapping = bd_inode->i_mapping;
194	struct buffer_head *ret = NULL;
195	pgoff_t index;
196	struct buffer_head *bh;
197	struct buffer_head *head;
198	struct folio *folio;
199	int all_mapped = `1`;
200	static DEFINE_RATELIMIT_STATE(last_warned, HZ, `1`);
201
202	index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
203	folio = __filemap_get_folio(mapping: bd_mapping, index, FGP_ACCESSED, gfp: `0`);
204	if (IS_ERR(ptr: folio))
205	goto out;
206
207	spin_lock(lock: &bd_mapping->i_private_lock);
208	head = folio_buffers(folio);
209	if (!head)
210	goto out_unlock;
211	bh = head;
212	do {
213	if (!buffer_mapped(bh))
214	all_mapped = `0`;
215	else if (bh->b_blocknr == block) {
216	ret = bh;
217	get_bh(bh);
218	goto out_unlock;
219	}
220	bh = bh->b_this_page;
221	} while (bh != head);
222
223	/ we might be here because some of the buffers on this page are*
224	* not mapped. This is due to various races between
225	* file io on the block device and getblk. It gets dealt with
226	* elsewhere, don't buffer_error if we had some unmapped buffers
227	*/
228	ratelimit_set_flags(rs: &last_warned, RATELIMIT_MSG_ON_RELEASE);
229	if (all_mapped && __ratelimit(&last_warned)) {
230	printk("__find_get_block_slow() failed. block=%llu, "
231	"b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
232	"device %pg blocksize: %d\n",
233	(unsigned long long)block,
234	(unsigned long long)bh->b_blocknr,
235	bh->b_state, bh->b_size, bdev,
236	`1` << bd_inode->i_blkbits);
237	}
238	out_unlock:
239	spin_unlock(lock: &bd_mapping->i_private_lock);
240	folio_put(folio);
241	out:
242	return ret;
243	}
244
245	static void end_buffer_async_read(struct buffer_head bh, int* uptodate)
246	{
247	unsigned long flags;
248	struct buffer_head *first;
249	struct buffer_head *tmp;
250	struct folio *folio;
251	int folio_uptodate = `1`;
252
253	BUG_ON(!buffer_async_read(bh));
254
255	folio = bh->b_folio;
256	if (uptodate) {
257	set_buffer_uptodate(bh);
258	} else {
259	clear_buffer_uptodate(bh);
260	buffer_io_error(bh, msg: ", async page read");
261	folio_set_error(folio);
262	}
263
264	/*
265	* Be _very_ careful from here on. Bad things can happen if
266	* two buffer heads end IO at almost the same time and both
267	* decide that the page is now completely done.
268	*/
269	first = folio_buffers(folio);
270	spin_lock_irqsave(&first->b_uptodate_lock, flags);
271	clear_buffer_async_read(bh);
272	unlock_buffer(bh);
273	tmp = bh;
274	do {
275	if (!buffer_uptodate(bh: tmp))
276	folio_uptodate = `0`;
277	if (buffer_async_read(bh: tmp)) {
278	BUG_ON(!buffer_locked(tmp));
279	goto still_busy;
280	}
281	tmp = tmp->b_this_page;
282	} while (tmp != bh);
283	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
284
285	folio_end_read(folio, success: folio_uptodate);
286	return;
287
288	still_busy:
289	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
290	return;
291	}
292
293	struct postprocess_bh_ctx {
294	struct work_struct work;
295	struct buffer_head *bh;
296	};
297
298	static void verify_bh(struct work_struct *work)
299	{
300	struct postprocess_bh_ctx *ctx =
301	container_of(work, struct postprocess_bh_ctx, work);
302	struct buffer_head *bh = ctx->bh;
303	bool valid;
304
305	valid = fsverity_verify_blocks(folio: bh->b_folio, len: bh->b_size, offset: bh_offset(bh));
306	end_buffer_async_read(bh, uptodate: valid);
307	kfree(objp: ctx);
308	}
309
310	static bool need_fsverity(struct buffer_head *bh)
311	{
312	struct folio *folio = bh->b_folio;
313	struct inode *inode = folio->mapping->host;
314
315	return fsverity_active(inode) &&
316	/ needed by ext4 /
317	folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
318	}
319
320	static void decrypt_bh(struct work_struct *work)
321	{
322	struct postprocess_bh_ctx *ctx =
323	container_of(work, struct postprocess_bh_ctx, work);
324	struct buffer_head *bh = ctx->bh;
325	int err;
326
327	err = fscrypt_decrypt_pagecache_blocks(folio: bh->b_folio, len: bh->b_size,
328	offs: bh_offset(bh));
329	if (err == `0` && need_fsverity(bh)) {
330	/*
331	* We use different work queues for decryption and for verity
332	* because verity may require reading metadata pages that need
333	* decryption, and we shouldn't recurse to the same workqueue.
334	*/
335	INIT_WORK(&ctx->work, verify_bh);
336	fsverity_enqueue_verify_work(work: &ctx->work);
337	return;
338	}
339	end_buffer_async_read(bh, uptodate: err == `0`);
340	kfree(objp: ctx);
341	}
342
343	/*
344	* I/O completion handler for block_read_full_folio() - pages
345	* which come unlocked at the end of I/O.
346	*/
347	static void end_buffer_async_read_io(struct buffer_head bh, int* uptodate)
348	{
349	struct inode *inode = bh->b_folio->mapping->host;
350	bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
351	bool verify = need_fsverity(bh);
352
353	/ Decrypt (with fscrypt) and/or verify (with fsverity) if needed. /
354	if (uptodate && (decrypt \|\| verify)) {
355	struct postprocess_bh_ctx *ctx =
356	kmalloc(size: sizeof(*ctx), GFP_ATOMIC);
357
358	if (ctx) {
359	ctx->bh = bh;
360	if (decrypt) {
361	INIT_WORK(&ctx->work, decrypt_bh);
362	fscrypt_enqueue_decrypt_work(&ctx->work);
363	} else {
364	INIT_WORK(&ctx->work, verify_bh);
365	fsverity_enqueue_verify_work(work: &ctx->work);
366	}
367	return;
368	}
369	uptodate = `0`;
370	}
371	end_buffer_async_read(bh, uptodate);
372	}
373
374	/*
375	* Completion handler for block_write_full_folio() - folios which are unlocked
376	* during I/O, and which have the writeback flag cleared upon I/O completion.
377	*/
378	static void end_buffer_async_write(struct buffer_head bh, int* uptodate)
379	{
380	unsigned long flags;
381	struct buffer_head *first;
382	struct buffer_head *tmp;
383	struct folio *folio;
384
385	BUG_ON(!buffer_async_write(bh));
386
387	folio = bh->b_folio;
388	if (uptodate) {
389	set_buffer_uptodate(bh);
390	} else {
391	buffer_io_error(bh, msg: ", lost async page write");
392	mark_buffer_write_io_error(bh);
393	clear_buffer_uptodate(bh);
394	folio_set_error(folio);
395	}
396
397	first = folio_buffers(folio);
398	spin_lock_irqsave(&first->b_uptodate_lock, flags);
399
400	clear_buffer_async_write(bh);
401	unlock_buffer(bh);
402	tmp = bh->b_this_page;
403	while (tmp != bh) {
404	if (buffer_async_write(bh: tmp)) {
405	BUG_ON(!buffer_locked(tmp));
406	goto still_busy;
407	}
408	tmp = tmp->b_this_page;
409	}
410	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
411	folio_end_writeback(folio);
412	return;
413
414	still_busy:
415	spin_unlock_irqrestore(lock: &first->b_uptodate_lock, flags);
416	return;
417	}
418
419	/*
420	* If a page's buffers are under async readin (end_buffer_async_read
421	* completion) then there is a possibility that another thread of
422	* control could lock one of the buffers after it has completed
423	* but while some of the other buffers have not completed. This
424	* locked buffer would confuse end_buffer_async_read() into not unlocking
425	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
426	* that this buffer is not under async I/O.
427	*
428	* The page comes unlocked when it has no locked buffer_async buffers
429	* left.
430	*
431	* PageLocked prevents anyone starting new async I/O reads any of
432	* the buffers.
433	*
434	* PageWriteback is used to prevent simultaneous writeout of the same
435	* page.
436	*
437	* PageLocked prevents anyone from starting writeback of a page which is
438	* under read I/O (PageWriteback is only ever set against a locked page).
439	*/
440	static void mark_buffer_async_read(struct buffer_head *bh)
441	{
442	bh->b_end_io = end_buffer_async_read_io;
443	set_buffer_async_read(bh);
444	}
445
446	static void mark_buffer_async_write_endio(struct buffer_head *bh,
447	bh_end_io_t *handler)
448	{
449	bh->b_end_io = handler;
450	set_buffer_async_write(bh);
451	}
452
453	void mark_buffer_async_write(struct buffer_head *bh)
454	{
455	mark_buffer_async_write_endio(bh, handler: end_buffer_async_write);
456	}
457	EXPORT_SYMBOL(mark_buffer_async_write);
458
459
460	/*
461	* fs/buffer.c contains helper functions for buffer-backed address space's
462	* fsync functions. A common requirement for buffer-based filesystems is
463	* that certain data from the backing blockdev needs to be written out for
464	* a successful fsync(). For example, ext2 indirect blocks need to be
465	* written back and waited upon before fsync() returns.
466	*
467	* The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
468	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
469	* management of a list of dependent buffers at ->i_mapping->i_private_list.
470	*
471	* Locking is a little subtle: try_to_free_buffers() will remove buffers
472	* from their controlling inode's queue when they are being freed. But
473	* try_to_free_buffers() will be operating against the blockdev mapping
474	* at the time, not against the S_ISREG file which depends on those buffers.
475	* So the locking for i_private_list is via the i_private_lock in the address_space
476	* which backs the buffers. Which is different from the address_space
477	* against which the buffers are listed. So for a particular address_space,
478	* mapping->i_private_lock does not protect mapping->i_private_list! In fact,
479	* mapping->i_private_list will always be protected by the backing blockdev's
480	* ->i_private_lock.
481	*
482	* Which introduces a requirement: all buffers on an address_space's
483	* ->i_private_list must be from the same address_space: the blockdev's.
484	*
485	* address_spaces which do not place buffers at ->i_private_list via these
486	* utility functions are free to use i_private_lock and i_private_list for
487	* whatever they want. The only requirement is that list_empty(i_private_list)
488	* be true at clear_inode() time.
489	*
490	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
491	* filesystems should do that. invalidate_inode_buffers() should just go
492	* BUG_ON(!list_empty).
493	*
494	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
495	* take an address_space, not an inode. And it should be called
496	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
497	* queued up.
498	*
499	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
500	* list if it is already on a list. Because if the buffer is on a list,
501	* it must already be on the right one. If not, the filesystem is being
502	* silly. This will save a ton of locking. But first we have to ensure
503	* that buffers are taken off the old inode's list when they are freed
504	* (presumably in truncate). That requires careful auditing of all
505	* filesystems (do it inside bforget()). It could also be done by bringing
506	* b_inode back.
507	*/
508
509	/*
510	* The buffer's backing address_space's i_private_lock must be held
511	*/
512	static void __remove_assoc_queue(struct buffer_head *bh)
513	{
514	list_del_init(entry: &bh->b_assoc_buffers);
515	WARN_ON(!bh->b_assoc_map);
516	bh->b_assoc_map = NULL;
517	}
518
519	int inode_has_buffers(struct inode *inode)
520	{
521	return !list_empty(head: &inode->i_data.i_private_list);
522	}
523
524	/*
525	* osync is designed to support O_SYNC io. It waits synchronously for
526	* all already-submitted IO to complete, but does not queue any new
527	* writes to the disk.
528	*
529	* To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
530	* as you dirty the buffers, and then use osync_inode_buffers to wait for
531	* completion. Any other dirty buffers which are not yet queued for
532	* write will not be flushed to disk by the osync.
533	*/
534	static int osync_buffers_list(spinlock_t lock, struct* list_head *list)
535	{
536	struct buffer_head *bh;
537	struct list_head *p;
538	int err = `0`;
539
540	spin_lock(lock);
541	repeat:
542	list_for_each_prev(p, list) {
543	bh = BH_ENTRY(p);
544	if (buffer_locked(bh)) {
545	get_bh(bh);
546	spin_unlock(lock);
547	wait_on_buffer(bh);
548	if (!buffer_uptodate(bh))
549	err = -EIO;
550	brelse(bh);
551	spin_lock(lock);
552	goto repeat;
553	}
554	}
555	spin_unlock(lock);
556	return err;
557	}
558
559	/**
560	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
561	* @mapping: the mapping which wants those buffers written
562	*
563	* Starts I/O against the buffers at mapping->i_private_list, and waits upon
564	* that I/O.
565	*
566	* Basically, this is a convenience function for fsync().
567	* @mapping is a file or directory which needs those buffers to be written for
568	* a successful fsync().
569	*/
570	int sync_mapping_buffers(struct address_space *mapping)
571	{
572	struct address_space *buffer_mapping = mapping->i_private_data;
573
574	if (buffer_mapping == NULL \|\| list_empty(head: &mapping->i_private_list))
575	return `0`;
576
577	return fsync_buffers_list(lock: &buffer_mapping->i_private_lock,
578	list: &mapping->i_private_list);
579	}
580	EXPORT_SYMBOL(sync_mapping_buffers);
581
582	/**
583	* generic_buffers_fsync_noflush - generic buffer fsync implementation
584	* for simple filesystems with no inode lock
585	*
586	* @file: file to synchronize
587	* @start: start offset in bytes
588	* @end: end offset in bytes (inclusive)
589	* @datasync: only synchronize essential metadata if true
590	*
591	* This is a generic implementation of the fsync method for simple
592	* filesystems which track all non-inode metadata in the buffers list
593	* hanging off the address_space structure.
594	*/
595	int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
596	bool datasync)
597	{
598	struct inode *inode = file->f_mapping->host;
599	int err;
600	int ret;
601
602	err = file_write_and_wait_range(file, start, end);
603	if (err)
604	return err;
605
606	ret = sync_mapping_buffers(inode->i_mapping);
607	if (!(inode->i_state & I_DIRTY_ALL))
608	goto out;
609	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
610	goto out;
611
612	err = sync_inode_metadata(inode, wait: `1`);
613	if (ret == `0`)
614	ret = err;
615
616	out:
617	/ check and advance again to catch errors after syncing out buffers /
618	err = file_check_and_advance_wb_err(file);
619	if (ret == `0`)
620	ret = err;
621	return ret;
622	}
623	EXPORT_SYMBOL(generic_buffers_fsync_noflush);
624
625	/**
626	* generic_buffers_fsync - generic buffer fsync implementation
627	* for simple filesystems with no inode lock
628	*
629	* @file: file to synchronize
630	* @start: start offset in bytes
631	* @end: end offset in bytes (inclusive)
632	* @datasync: only synchronize essential metadata if true
633	*
634	* This is a generic implementation of the fsync method for simple
635	* filesystems which track all non-inode metadata in the buffers list
636	* hanging off the address_space structure. This also makes sure that
637	* a device cache flush operation is called at the end.
638	*/
639	int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
640	bool datasync)
641	{
642	struct inode *inode = file->f_mapping->host;
643	int ret;
644
645	ret = generic_buffers_fsync_noflush(file, start, end, datasync);
646	if (!ret)
647	ret = blkdev_issue_flush(bdev: inode->i_sb->s_bdev);
648	return ret;
649	}
650	EXPORT_SYMBOL(generic_buffers_fsync);
651
652	/*
653	* Called when we've recently written block `bblock', and it is known that
654	* `bblock' was for a buffer_boundary() buffer. This means that the block at
655	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
656	* dirty, schedule it for IO. So that indirects merge nicely with their data.
657	*/
658	void write_boundary_block(struct block_device *bdev,
659	sector_t bblock, unsigned blocksize)
660	{
661	struct buffer_head *bh = __find_get_block(bdev, block: bblock + `1`, size: blocksize);
662	if (bh) {
663	if (buffer_dirty(bh))
664	write_dirty_buffer(bh, op_flags: `0`);
665	put_bh(bh);
666	}
667	}
668
669	void mark_buffer_dirty_inode(struct buffer_head bh, struct* inode *inode)
670	{
671	struct address_space *mapping = inode->i_mapping;
672	struct address_space *buffer_mapping = bh->b_folio->mapping;
673
674	mark_buffer_dirty(bh);
675	if (!mapping->i_private_data) {
676	mapping->i_private_data = buffer_mapping;
677	} else {
678	BUG_ON(mapping->i_private_data != buffer_mapping);
679	}
680	if (!bh->b_assoc_map) {
681	spin_lock(lock: &buffer_mapping->i_private_lock);
682	list_move_tail(list: &bh->b_assoc_buffers,
683	head: &mapping->i_private_list);
684	bh->b_assoc_map = mapping;
685	spin_unlock(lock: &buffer_mapping->i_private_lock);
686	}
687	}
688	EXPORT_SYMBOL(mark_buffer_dirty_inode);
689
690	/*
691	* Add a page to the dirty page list.
692	*
693	* It is a sad fact of life that this function is called from several places
694	* deeply under spinlocking. It may not sleep.
695	*
696	* If the page has buffers, the uptodate buffers are set dirty, to preserve
697	* dirty-state coherency between the page and the buffers. It the page does
698	* not have buffers then when they are later attached they will all be set
699	* dirty.
700	*
701	* The buffers are dirtied before the page is dirtied. There's a small race
702	* window in which a writepage caller may see the page cleanness but not the
703	* buffer dirtiness. That's fine. If this code were to set the page dirty
704	* before the buffers, a concurrent writepage caller could clear the page dirty
705	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
706	* page on the dirty page list.
707	*
708	* We use i_private_lock to lock against try_to_free_buffers while using the
709	* page's buffer list. Also use this to protect against clean buffers being
710	* added to the page after it was set dirty.
711	*
712	* FIXME: may need to call ->reservepage here as well. That's rather up to the
713	* address_space though.
714	*/
715	bool block_dirty_folio(struct address_space mapping, struct* folio *folio)
716	{
717	struct buffer_head *head;
718	bool newly_dirty;
719
720	spin_lock(lock: &mapping->i_private_lock);
721	head = folio_buffers(folio);
722	if (head) {
723	struct buffer_head *bh = head;
724
725	do {
726	set_buffer_dirty(bh);
727	bh = bh->b_this_page;
728	} while (bh != head);
729	}
730	/*
731	* Lock out page's memcg migration to keep PageDirty
732	* synchronized with per-memcg dirty page counters.
733	*/
734	folio_memcg_lock(folio);
735	newly_dirty = !folio_test_set_dirty(folio);
736	spin_unlock(lock: &mapping->i_private_lock);
737
738	if (newly_dirty)
739	__folio_mark_dirty(folio, mapping, warn: `1`);
740
741	folio_memcg_unlock(folio);
742
743	if (newly_dirty)
744	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
745
746	return newly_dirty;
747	}
748	EXPORT_SYMBOL(block_dirty_folio);
749
750	/*
751	* Write out and wait upon a list of buffers.
752	*
753	* We have conflicting pressures: we want to make sure that all
754	* initially dirty buffers get waited on, but that any subsequently
755	* dirtied buffers don't. After all, we don't want fsync to last
756	* forever if somebody is actively writing to the file.
757	*
758	* Do this in two main stages: first we copy dirty buffers to a
759	* temporary inode list, queueing the writes as we go. Then we clean
760	* up, waiting for those writes to complete.
761	*
762	* During this second stage, any subsequent updates to the file may end
763	* up refiling the buffer on the original inode's dirty list again, so
764	* there is a chance we will end up with a buffer queued for write but
765	* not yet completed on that list. So, as a final cleanup we go through
766	* the osync code to catch these locked, dirty buffers without requeuing
767	* any newly dirty buffers for write.
768	*/
769	static int fsync_buffers_list(spinlock_t lock, struct* list_head *list)
770	{
771	struct buffer_head *bh;
772	struct list_head tmp;
773	struct address_space *mapping;
774	int err = `0`, err2;
775	struct blk_plug plug;
776
777	INIT_LIST_HEAD(list: &tmp);
778	blk_start_plug(&plug);
779
780	spin_lock(lock);
781	while (!list_empty(head: list)) {
782	bh = BH_ENTRY(list->next);
783	mapping = bh->b_assoc_map;
784	__remove_assoc_queue(bh);
785	/ Avoid race with mark_buffer_dirty_inode() which does*
786	* a lockless check and we rely on seeing the dirty bit */
787	smp_mb();
788	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
789	list_add(new: &bh->b_assoc_buffers, head: &tmp);
790	bh->b_assoc_map = mapping;
791	if (buffer_dirty(bh)) {
792	get_bh(bh);
793	spin_unlock(lock);
794	/*
795	* Ensure any pending I/O completes so that
796	* write_dirty_buffer() actually writes the
797	* current contents - it is a noop if I/O is
798	* still in flight on potentially older
799	* contents.
800	*/
801	write_dirty_buffer(bh, REQ_SYNC);
802
803	/*
804	* Kick off IO for the previous mapping. Note
805	* that we will not run the very last mapping,
806	* wait_on_buffer() will do that for us
807	* through sync_buffer().
808	*/
809	brelse(bh);
810	spin_lock(lock);
811	}
812	}
813	}
814
815	spin_unlock(lock);
816	blk_finish_plug(&plug);
817	spin_lock(lock);
818
819	while (!list_empty(head: &tmp)) {
820	bh = BH_ENTRY(tmp.prev);
821	get_bh(bh);
822	mapping = bh->b_assoc_map;
823	__remove_assoc_queue(bh);
824	/ Avoid race with mark_buffer_dirty_inode() which does*
825	* a lockless check and we rely on seeing the dirty bit */
826	smp_mb();
827	if (buffer_dirty(bh)) {
828	list_add(new: &bh->b_assoc_buffers,
829	head: &mapping->i_private_list);
830	bh->b_assoc_map = mapping;
831	}
832	spin_unlock(lock);
833	wait_on_buffer(bh);
834	if (!buffer_uptodate(bh))
835	err = -EIO;
836	brelse(bh);
837	spin_lock(lock);
838	}
839
840	spin_unlock(lock);
841	err2 = osync_buffers_list(lock, list);
842	if (err)
843	return err;
844	else
845	return err2;
846	}
847
848	/*
849	* Invalidate any and all dirty buffers on a given inode. We are
850	* probably unmounting the fs, but that doesn't mean we have already
851	* done a sync(). Just drop the buffers from the inode list.
852	*
853	* NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
854	* assumes that all the buffers are against the blockdev. Not true
855	* for reiserfs.
856	*/
857	void invalidate_inode_buffers(struct inode *inode)
858	{
859	if (inode_has_buffers(inode)) {
860	struct address_space *mapping = &inode->i_data;
861	struct list_head *list = &mapping->i_private_list;
862	struct address_space *buffer_mapping = mapping->i_private_data;
863
864	spin_lock(lock: &buffer_mapping->i_private_lock);
865	while (!list_empty(head: list))
866	__remove_assoc_queue(BH_ENTRY(list->next));
867	spin_unlock(lock: &buffer_mapping->i_private_lock);
868	}
869	}
870	EXPORT_SYMBOL(invalidate_inode_buffers);
871
872	/*
873	* Remove any clean buffers from the inode's buffer list. This is called
874	* when we're trying to free the inode itself. Those buffers can pin it.
875	*
876	* Returns true if all buffers were removed.
877	*/
878	int remove_inode_buffers(struct inode *inode)
879	{
880	int ret = `1`;
881
882	if (inode_has_buffers(inode)) {
883	struct address_space *mapping = &inode->i_data;
884	struct list_head *list = &mapping->i_private_list;
885	struct address_space *buffer_mapping = mapping->i_private_data;
886
887	spin_lock(lock: &buffer_mapping->i_private_lock);
888	while (!list_empty(head: list)) {
889	struct buffer_head *bh = BH_ENTRY(list->next);
890	if (buffer_dirty(bh)) {
891	ret = `0`;
892	break;
893	}
894	__remove_assoc_queue(bh);
895	}
896	spin_unlock(lock: &buffer_mapping->i_private_lock);
897	}
898	return ret;
899	}
900
901	/*
902	* Create the appropriate buffers when given a folio for data area and
903	* the size of each buffer.. Use the bh->b_this_page linked list to
904	* follow the buffers created. Return NULL if unable to create more
905	* buffers.
906	*
907	* The retry flag is used to differentiate async IO (paging, swapping)
908	* which may not fail from ordinary buffer allocations.
909	*/
910	struct buffer_head folio_alloc_buffers(struct* folio folio, unsigned* long size,
911	gfp_t gfp)
912	{
913	struct buffer_head bh, head;
914	long offset;
915	struct mem_cgroup memcg, old_memcg;
916
917	/ The folio lock pins the memcg /
918	memcg = folio_memcg(folio);
919	old_memcg = set_active_memcg(memcg);
920
921	head = NULL;
922	offset = folio_size(folio);
923	while ((offset -= size) >= `0`) {
924	bh = alloc_buffer_head(gfp_flags: gfp);
925	if (!bh)
926	goto no_grow;
927
928	bh->b_this_page = head;
929	bh->b_blocknr = -`1`;
930	head = bh;
931
932	bh->b_size = size;
933
934	/ Link the buffer to its folio /
935	folio_set_bh(bh, folio, offset);
936	}
937	out:
938	set_active_memcg(old_memcg);
939	return head;
940	/*
941	* In case anything failed, we just free everything we got.
942	*/
943	no_grow:
944	if (head) {
945	do {
946	bh = head;
947	head = head->b_this_page;
948	free_buffer_head(bh);
949	} while (head);
950	}
951
952	goto out;
953	}
954	EXPORT_SYMBOL_GPL(folio_alloc_buffers);
955
956	struct buffer_head alloc_page_buffers(struct* page page, unsigned* long size,
957	bool retry)
958	{
959	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT;
960	if (retry)
961	gfp \|= __GFP_NOFAIL;
962
963	return folio_alloc_buffers(page_folio(page), size, gfp);
964	}
965	EXPORT_SYMBOL_GPL(alloc_page_buffers);
966
967	static inline void link_dev_buffers(struct folio *folio,
968	struct buffer_head *head)
969	{
970	struct buffer_head bh, tail;
971
972	bh = head;
973	do {
974	tail = bh;
975	bh = bh->b_this_page;
976	} while (bh);
977	tail->b_this_page = head;
978	folio_attach_private(folio, data: head);
979	}
980
981	static sector_t blkdev_max_block(struct block_device bdev, unsigned* int size)
982	{
983	sector_t retval = ~((sector_t)`0`);
984	loff_t sz = bdev_nr_bytes(bdev);
985
986	if (sz) {
987	unsigned int sizebits = blksize_bits(size);
988	retval = (sz >> sizebits);
989	}
990	return retval;
991	}
992
993	/*
994	* Initialise the state of a blockdev folio's buffers.
995	*/
996	static sector_t folio_init_buffers(struct folio *folio,
997	struct block_device bdev, unsigned* size)
998	{
999	struct buffer_head *head = folio_buffers(folio);
1000	struct buffer_head *bh = head;
1001	bool uptodate = folio_test_uptodate(folio);
1002	sector_t block = div_u64(dividend: folio_pos(folio), divisor: size);
1003	sector_t end_block = blkdev_max_block(bdev, size);
1004
1005	do {
1006	if (!buffer_mapped(bh)) {
1007	bh->b_end_io = NULL;
1008	bh->b_private = NULL;
1009	bh->b_bdev = bdev;
1010	bh->b_blocknr = block;
1011	if (uptodate)
1012	set_buffer_uptodate(bh);
1013	if (block < end_block)
1014	set_buffer_mapped(bh);
1015	}
1016	block++;
1017	bh = bh->b_this_page;
1018	} while (bh != head);
1019
1020	/*
1021	* Caller needs to validate requested block against end of device.
1022	*/
1023	return end_block;
1024	}
1025
1026	/*
1027	* Create the page-cache folio that contains the requested block.
1028	*
1029	* This is used purely for blockdev mappings.
1030	*
1031	* Returns false if we have a failure which cannot be cured by retrying
1032	* without sleeping. Returns true if we succeeded, or the caller should retry.
1033	*/
1034	static bool grow_dev_folio(struct block_device *bdev, sector_t block,
1035	pgoff_t index, unsigned size, gfp_t gfp)
1036	{
1037	struct inode *inode = bdev->bd_inode;
1038	struct folio *folio;
1039	struct buffer_head *bh;
1040	sector_t end_block = `0`;
1041
1042	folio = __filemap_get_folio(mapping: inode->i_mapping, index,
1043	FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT, gfp);
1044	if (IS_ERR(ptr: folio))
1045	return false;
1046
1047	bh = folio_buffers(folio);
1048	if (bh) {
1049	if (bh->b_size == size) {
1050	end_block = folio_init_buffers(folio, bdev, size);
1051	goto unlock;
1052	}
1053
1054	/*
1055	* Retrying may succeed; for example the folio may finish
1056	* writeback, or buffers may be cleaned. This should not
1057	* happen very often; maybe we have old buffers attached to
1058	* this blockdev's page cache and we're trying to change
1059	* the block size?
1060	*/
1061	if (!try_to_free_buffers(folio)) {
1062	end_block = ~`0ULL`;
1063	goto unlock;
1064	}
1065	}
1066
1067	bh = folio_alloc_buffers(folio, size, gfp \| __GFP_ACCOUNT);
1068	if (!bh)
1069	goto unlock;
1070
1071	/*
1072	* Link the folio to the buffers and initialise them. Take the
1073	* lock to be atomic wrt __find_get_block(), which does not
1074	* run under the folio lock.
1075	*/
1076	spin_lock(lock: &inode->i_mapping->i_private_lock);
1077	link_dev_buffers(folio, head: bh);
1078	end_block = folio_init_buffers(folio, bdev, size);
1079	spin_unlock(lock: &inode->i_mapping->i_private_lock);
1080	unlock:
1081	folio_unlock(folio);
1082	folio_put(folio);
1083	return block < end_block;
1084	}
1085
1086	/*
1087	* Create buffers for the specified block device block's folio. If
1088	* that folio was dirty, the buffers are set dirty also. Returns false
1089	* if we've hit a permanent error.
1090	*/
1091	static bool grow_buffers(struct block_device *bdev, sector_t block,
1092	unsigned size, gfp_t gfp)
1093	{
1094	loff_t pos;
1095
1096	/*
1097	* Check for a block which lies outside our maximum possible
1098	* pagecache index.
1099	*/
1100	if (check_mul_overflow(block, (sector_t)size, &pos) \|\| pos > MAX_LFS_FILESIZE) {
1101	printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
1102	__func__, (unsigned long long)block,
1103	bdev);
1104	return false;
1105	}
1106
1107	/ Create a folio with the proper size buffers /
1108	return grow_dev_folio(bdev, block, index: pos / PAGE_SIZE, size, gfp);
1109	}
1110
1111	static struct buffer_head *
1112	__getblk_slow(struct block_device *bdev, sector_t block,
1113	unsigned size, gfp_t gfp)
1114	{
1115	/ Size must be multiple of hard sectorsize /
1116	if (unlikely(size & (bdev_logical_block_size(bdev)-`1`) \|\|
1117	(size < `512` \|\| size > PAGE_SIZE))) {
1118	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1119	size);
1120	printk(KERN_ERR "logical block size: %d\n",
1121	bdev_logical_block_size(bdev));
1122
1123	dump_stack();
1124	return NULL;
1125	}
1126
1127	for (;;) {
1128	struct buffer_head *bh;
1129
1130	bh = __find_get_block(bdev, block, size);
1131	if (bh)
1132	return bh;
1133
1134	if (!grow_buffers(bdev, block, size, gfp))
1135	return NULL;
1136	}
1137	}
1138
1139	/*
1140	* The relationship between dirty buffers and dirty pages:
1141	*
1142	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
1143	* the page is tagged dirty in the page cache.
1144	*
1145	* At all times, the dirtiness of the buffers represents the dirtiness of
1146	* subsections of the page. If the page has buffers, the page dirty bit is
1147	* merely a hint about the true dirty state.
1148	*
1149	* When a page is set dirty in its entirety, all its buffers are marked dirty
1150	* (if the page has buffers).
1151	*
1152	* When a buffer is marked dirty, its page is dirtied, but the page's other
1153	* buffers are not.
1154	*
1155	* Also. When blockdev buffers are explicitly read with bread(), they
1156	* individually become uptodate. But their backing page remains not
1157	* uptodate - even if all of its buffers are uptodate. A subsequent
1158	* block_read_full_folio() against that folio will discover all the uptodate
1159	* buffers, will set the folio uptodate and will perform no I/O.
1160	*/
1161
1162	/**
1163	* mark_buffer_dirty - mark a buffer_head as needing writeout
1164	* @bh: the buffer_head to mark dirty
1165	*
1166	* mark_buffer_dirty() will set the dirty bit against the buffer, then set
1167	* its backing page dirty, then tag the page as dirty in the page cache
1168	* and then attach the address_space's inode to its superblock's dirty
1169	* inode list.
1170	*
1171	* mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
1172	* i_pages lock and mapping->host->i_lock.
1173	*/
1174	void mark_buffer_dirty(struct buffer_head *bh)
1175	{
1176	WARN_ON_ONCE(!buffer_uptodate(bh));
1177
1178	trace_block_dirty_buffer(bh);
1179
1180	/*
1181	* Very carefully optimize the it-is-already-dirty case.
1182	*
1183	* Don't let the final "is it dirty" escape to before we
1184	* perhaps modified the buffer.
1185	*/
1186	if (buffer_dirty(bh)) {
1187	smp_mb();
1188	if (buffer_dirty(bh))
1189	return;
1190	}
1191
1192	if (!test_set_buffer_dirty(bh)) {
1193	struct folio *folio = bh->b_folio;
1194	struct address_space *mapping = NULL;
1195
1196	folio_memcg_lock(folio);
1197	if (!folio_test_set_dirty(folio)) {
1198	mapping = folio->mapping;
1199	if (mapping)
1200	__folio_mark_dirty(folio, mapping, warn: `0`);
1201	}
1202	folio_memcg_unlock(folio);
1203	if (mapping)
1204	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1205	}
1206	}
1207	EXPORT_SYMBOL(mark_buffer_dirty);
1208
1209	void mark_buffer_write_io_error(struct buffer_head *bh)
1210	{
1211	set_buffer_write_io_error(bh);
1212	/ FIXME: do we need to set this in both places? /
1213	if (bh->b_folio && bh->b_folio->mapping)
1214	mapping_set_error(mapping: bh->b_folio->mapping, error: -EIO);
1215	if (bh->b_assoc_map) {
1216	mapping_set_error(mapping: bh->b_assoc_map, error: -EIO);
1217	errseq_set(eseq: &bh->b_assoc_map->host->i_sb->s_wb_err, err: -EIO);
1218	}
1219	}
1220	EXPORT_SYMBOL(mark_buffer_write_io_error);
1221
1222	/*
1223	* Decrement a buffer_head's reference count. If all buffers against a page
1224	* have zero reference count, are clean and unlocked, and if the page is clean
1225	* and unlocked then try_to_free_buffers() may strip the buffers from the page
1226	* in preparation for freeing it (sometimes, rarely, buffers are removed from
1227	* a page but it ends up not being freed, and buffers may later be reattached).
1228	*/
1229	void __brelse(struct buffer_head * buf)
1230	{
1231	if (atomic_read(v: &buf->b_count)) {
1232	put_bh(bh: buf);
1233	return;
1234	}
1235	WARN(`1`, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1236	}
1237	EXPORT_SYMBOL(__brelse);
1238
1239	/*
1240	* bforget() is like brelse(), except it discards any
1241	* potentially dirty data.
1242	*/
1243	void __bforget(struct buffer_head *bh)
1244	{
1245	clear_buffer_dirty(bh);
1246	if (bh->b_assoc_map) {
1247	struct address_space *buffer_mapping = bh->b_folio->mapping;
1248
1249	spin_lock(lock: &buffer_mapping->i_private_lock);
1250	list_del_init(entry: &bh->b_assoc_buffers);
1251	bh->b_assoc_map = NULL;
1252	spin_unlock(lock: &buffer_mapping->i_private_lock);
1253	}
1254	__brelse(bh);
1255	}
1256	EXPORT_SYMBOL(__bforget);
1257
1258	static struct buffer_head __bread_slow(struct* buffer_head *bh)
1259	{
1260	lock_buffer(bh);
1261	if (buffer_uptodate(bh)) {
1262	unlock_buffer(bh);
1263	return bh;
1264	} else {
1265	get_bh(bh);
1266	bh->b_end_io = end_buffer_read_sync;
1267	submit_bh(REQ_OP_READ, bh);
1268	wait_on_buffer(bh);
1269	if (buffer_uptodate(bh))
1270	return bh;
1271	}
1272	brelse(bh);
1273	return NULL;
1274	}
1275
1276	/*
1277	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1278	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1279	* refcount elevated by one when they're in an LRU. A buffer can only appear
1280	* once in a particular CPU's LRU. A single buffer can be present in multiple
1281	* CPU's LRUs at the same time.
1282	*
1283	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
1284	* sb_find_get_block().
1285	*
1286	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
1287	* a local interrupt disable for that.
1288	*/
1289
1290	#define BH_LRU_SIZE 16
1291
1292	struct bh_lru {
1293	struct buffer_head *bhs[BH_LRU_SIZE];
1294	};
1295
1296	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1297
1298	#ifdef CONFIG_SMP
1299	#define bh_lru_lock() local_irq_disable()
1300	#define bh_lru_unlock() local_irq_enable()
1301	#else
1302	#define bh_lru_lock() preempt_disable()
1303	#define bh_lru_unlock() preempt_enable()
1304	#endif
1305
/*
 * Sanity check: BUG if interrupts are disabled.  The check is only
 * compiled in when irqs_disabled is available as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1312
1313	/*
1314	* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
1315	* inserted at the front, and the buffer_head at the back if any is evicted.
1316	* Or, if already in the LRU it is moved to the front.
1317	*/
1318	static void bh_lru_install(struct buffer_head *bh)
1319	{
1320	struct buffer_head *evictee = bh;
1321	struct bh_lru *b;
1322	int i;
1323
1324	check_irqs_on();
1325	bh_lru_lock();
1326
1327	/*
1328	* the refcount of buffer_head in bh_lru prevents dropping the
1329	* attached page(i.e., try_to_free_buffers) so it could cause
1330	* failing page migration.
1331	* Skip putting upcoming bh into bh_lru until migration is done.
1332	*/
1333	if (lru_cache_disabled() \|\| cpu_is_isolated(smp_processor_id())) {
1334	bh_lru_unlock();
1335	return;
1336	}
1337
1338	b = this_cpu_ptr(&bh_lrus);
1339	for (i = `0`; i < BH_LRU_SIZE; i++) {
1340	swap(evictee, b->bhs[i]);
1341	if (evictee == bh) {
1342	bh_lru_unlock();
1343	return;
1344	}
1345	}
1346
1347	get_bh(bh);
1348	bh_lru_unlock();
1349	brelse(bh: evictee);
1350	}
1351
1352	/*
1353	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
1354	*/
1355	static struct buffer_head *
1356	lookup_bh_lru(struct block_device bdev, sector_t block, unsigned* size)
1357	{
1358	struct buffer_head *ret = NULL;
1359	unsigned int i;
1360
1361	check_irqs_on();
1362	bh_lru_lock();
1363	if (cpu_is_isolated(smp_processor_id())) {
1364	bh_lru_unlock();
1365	return NULL;
1366	}
1367	for (i = `0`; i < BH_LRU_SIZE; i++) {
1368	struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1369
1370	if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1371	bh->b_size == size) {
1372	if (i) {
1373	while (i) {
1374	__this_cpu_write(bh_lrus.bhs[i],
1375	__this_cpu_read(bh_lrus.bhs[i - `1`]));
1376	i--;
1377	}
1378	__this_cpu_write(bh_lrus.bhs[`0`], bh);
1379	}
1380	get_bh(bh);
1381	ret = bh;
1382	break;
1383	}
1384	}
1385	bh_lru_unlock();
1386	return ret;
1387	}
1388
1389	/*
1390	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
1391	* it in the LRU and mark it as accessed. If it is not present then return
1392	* NULL
1393	*/
1394	struct buffer_head *
1395	__find_get_block(struct block_device bdev, sector_t block, unsigned* size)
1396	{
1397	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1398
1399	if (bh == NULL) {
1400	/ __find_get_block_slow will mark the page accessed /
1401	bh = __find_get_block_slow(bdev, block);
1402	if (bh)
1403	bh_lru_install(bh);
1404	} else
1405	touch_buffer(bh);
1406
1407	return bh;
1408	}
1409	EXPORT_SYMBOL(__find_get_block);
1410
1411	/**
1412	* bdev_getblk - Get a buffer_head in a block device's buffer cache.
1413	* @bdev: The block device.
1414	* @block: The block number.
1415	* @size: The size of buffer_heads for this @bdev.
1416	* @gfp: The memory allocation flags to use.
1417	*
1418	* Return: The buffer head, or NULL if memory could not be allocated.
1419	*/
1420	struct buffer_head bdev_getblk(struct* block_device *bdev, sector_t block,
1421	unsigned size, gfp_t gfp)
1422	{
1423	struct buffer_head *bh = __find_get_block(bdev, block, size);
1424
1425	might_alloc(gfp_mask: gfp);
1426	if (bh)
1427	return bh;
1428
1429	return __getblk_slow(bdev, block, size, gfp);
1430	}
1431	EXPORT_SYMBOL(bdev_getblk);
1432
1433	/*
1434	* Do async read-ahead on a buffer..
1435	*/
1436	void __breadahead(struct block_device bdev, sector_t block, unsigned* size)
1437	{
1438	struct buffer_head *bh = bdev_getblk(bdev, block, size,
1439	GFP_NOWAIT \| __GFP_MOVABLE);
1440
1441	if (likely(bh)) {
1442	bh_readahead(bh, REQ_RAHEAD);
1443	brelse(bh);
1444	}
1445	}
1446	EXPORT_SYMBOL(__breadahead);
1447
1448	/**
1449	* __bread_gfp() - reads a specified block and returns the bh
1450	* @bdev: the block_device to read from
1451	* @block: number of block
1452	* @size: size (in bytes) to read
1453	* @gfp: page allocation flag
1454	*
1455	* Reads a specified block, and returns buffer head that contains it.
1456	* The page cache can be allocated from non-movable area
1457	* not to prevent page migration if you set gfp to zero.
1458	* It returns NULL if the block was unreadable.
1459	*/
1460	struct buffer_head *
1461	__bread_gfp(struct block_device *bdev, sector_t block,
1462	unsigned size, gfp_t gfp)
1463	{
1464	struct buffer_head *bh;
1465
1466	gfp \|= mapping_gfp_constraint(mapping: bdev->bd_inode->i_mapping, gfp_mask: ~__GFP_FS);
1467
1468	/*
1469	* Prefer looping in the allocator rather than here, at least that
1470	* code knows what it's doing.
1471	*/
1472	gfp \|= __GFP_NOFAIL;
1473
1474	bh = bdev_getblk(bdev, block, size, gfp);
1475
1476	if (likely(bh) && !buffer_uptodate(bh))
1477	bh = __bread_slow(bh);
1478	return bh;
1479	}
1480	EXPORT_SYMBOL(__bread_gfp);
1481
1482	static void __invalidate_bh_lrus(struct bh_lru *b)
1483	{
1484	int i;
1485
1486	for (i = `0`; i < BH_LRU_SIZE; i++) {
1487	brelse(bh: b->bhs[i]);
1488	b->bhs[i] = NULL;
1489	}
1490	}
1491	/*
1492	* invalidate_bh_lrus() is called rarely - but not only at unmount.
1493	* This doesn't race because it runs in each cpu either in irq
1494	* or with preempt disabled.
1495	*/
1496	static void invalidate_bh_lru(void *arg)
1497	{
1498	struct bh_lru *b = &get_cpu_var(bh_lrus);
1499
1500	__invalidate_bh_lrus(b);
1501	put_cpu_var(bh_lrus);
1502	}
1503
1504	bool has_bh_in_lru(int cpu, void *dummy)
1505	{
1506	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1507	int i;
1508
1509	for (i = `0`; i < BH_LRU_SIZE; i++) {
1510	if (b->bhs[i])
1511	return true;
1512	}
1513
1514	return false;
1515	}
1516
1517	void invalidate_bh_lrus(void)
1518	{
1519	on_each_cpu_cond(cond_func: has_bh_in_lru, func: invalidate_bh_lru, NULL, wait: `1`);
1520	}
1521	EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1522
1523	/*
1524	* It's called from workqueue context so we need a bh_lru_lock to close
1525	* the race with preemption/irq.
1526	*/
1527	void invalidate_bh_lrus_cpu(void)
1528	{
1529	struct bh_lru *b;
1530
1531	bh_lru_lock();
1532	b = this_cpu_ptr(&bh_lrus);
1533	__invalidate_bh_lrus(b);
1534	bh_lru_unlock();
1535	}
1536
1537	void folio_set_bh(struct buffer_head bh, struct* folio *folio,
1538	unsigned long offset)
1539	{
1540	bh->b_folio = folio;
1541	BUG_ON(offset >= folio_size(folio));
1542	if (folio_test_highmem(folio))
1543	/*
1544	* This catches illegal uses and preserves the offset:
1545	*/
1546	bh->b_data = (char *)(`0` + offset);
1547	else
1548	bh->b_data = folio_address(folio) + offset;
1549	}
1550	EXPORT_SYMBOL(folio_set_bh);
1551
1552	/*
1553	* Called when truncating a buffer on a page completely.
1554	*/
1555
1556	/ Bits that are cleared during an invalidate /
1557	#define BUFFER_FLAGS_DISCARD \
1558	(1 << BH_Mapped \| 1 << BH_New \| 1 << BH_Req \| \
1559	1 << BH_Delay \| 1 << BH_Unwritten)
1560
1561	static void discard_buffer(struct buffer_head * bh)
1562	{
1563	unsigned long b_state;
1564
1565	lock_buffer(bh);
1566	clear_buffer_dirty(bh);
1567	bh->b_bdev = NULL;
1568	b_state = READ_ONCE(bh->b_state);
1569	do {
1570	} while (!try_cmpxchg(&bh->b_state, &b_state,
1571	b_state & ~BUFFER_FLAGS_DISCARD));
1572	unlock_buffer(bh);
1573	}
1574
1575	/**
1576	* block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
1577	* @folio: The folio which is affected.
1578	* @offset: start of the range to invalidate
1579	* @length: length of the range to invalidate
1580	*
1581	* block_invalidate_folio() is called when all or part of the folio has been
1582	* invalidated by a truncate operation.
1583	*
1584	* block_invalidate_folio() does not have to release all buffers, but it must
1585	* ensure that no dirty buffer is left outside @offset and that no I/O
1586	* is underway against any of the blocks which are outside the truncation
1587	* point. Because the caller is about to free (and possibly reuse) those
1588	* blocks on-disk.
1589	*/
1590	void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
1591	{
1592	struct buffer_head head, bh, *next;
1593	size_t curr_off = `0`;
1594	size_t stop = length + offset;
1595
1596	BUG_ON(!folio_test_locked(folio));
1597
1598	/*
1599	* Check for overflow
1600	*/
1601	BUG_ON(stop > folio_size(folio) \|\| stop < length);
1602
1603	head = folio_buffers(folio);
1604	if (!head)
1605	return;
1606
1607	bh = head;
1608	do {
1609	size_t next_off = curr_off + bh->b_size;
1610	next = bh->b_this_page;
1611
1612	/*
1613	* Are we still fully in range ?
1614	*/
1615	if (next_off > stop)
1616	goto out;
1617
1618	/*
1619	* is this block fully invalidated?
1620	*/
1621	if (offset <= curr_off)
1622	discard_buffer(bh);
1623	curr_off = next_off;
1624	bh = next;
1625	} while (bh != head);
1626
1627	/*
1628	* We release buffers only if the entire folio is being invalidated.
1629	* The get_block cached value has been unconditionally invalidated,
1630	* so real IO is not possible anymore.
1631	*/
1632	if (length == folio_size(folio))
1633	filemap_release_folio(folio, gfp: `0`);
1634	out:
1635	return;
1636	}
1637	EXPORT_SYMBOL(block_invalidate_folio);
1638
1639	/*
1640	* We attach and possibly dirty the buffers atomically wrt
1641	* block_dirty_folio() via i_private_lock. try_to_free_buffers
1642	* is already excluded via the folio lock.
1643	*/
1644	struct buffer_head create_empty_buffers(struct* folio *folio,
1645	unsigned long blocksize, unsigned long b_state)
1646	{
1647	struct buffer_head bh, head, *tail;
1648	gfp_t gfp = GFP_NOFS \| __GFP_ACCOUNT \| __GFP_NOFAIL;
1649
1650	head = folio_alloc_buffers(folio, blocksize, gfp);
1651	bh = head;
1652	do {
1653	bh->b_state \|= b_state;
1654	tail = bh;
1655	bh = bh->b_this_page;
1656	} while (bh);
1657	tail->b_this_page = head;
1658
1659	spin_lock(lock: &folio->mapping->i_private_lock);
1660	if (folio_test_uptodate(folio) \|\| folio_test_dirty(folio)) {
1661	bh = head;
1662	do {
1663	if (folio_test_dirty(folio))
1664	set_buffer_dirty(bh);
1665	if (folio_test_uptodate(folio))
1666	set_buffer_uptodate(bh);
1667	bh = bh->b_this_page;
1668	} while (bh != head);
1669	}
1670	folio_attach_private(folio, data: head);
1671	spin_unlock(lock: &folio->mapping->i_private_lock);
1672
1673	return head;
1674	}
1675	EXPORT_SYMBOL(create_empty_buffers);
1676
1677	/**
1678	* clean_bdev_aliases: clean a range of buffers in block device
1679	* @bdev: Block device to clean buffers in
1680	* @block: Start of a range of blocks to clean
1681	* @len: Number of blocks to clean
1682	*
1683	* We are taking a range of blocks for data and we don't want writeback of any
1684	* buffer-cache aliases starting from return from this function and until the
1685	* moment when something will explicitly mark the buffer dirty (hopefully that
1686	* will not happen until we will free that block ;-) We don't even need to mark
1687	* it not-uptodate - nobody can expect anything from a newly allocated buffer
1688	* anyway. We used to use unmap_buffer() for such invalidation, but that was
1689	* wrong. We definitely don't want to mark the alias unmapped, for example - it
1690	* would confuse anyone who might pick it with bread() afterwards...
1691	*
1692	* Also.. Note that bforget() doesn't lock the buffer. So there can be
1693	* writeout I/O going on against recently-freed buffers. We don't wait on that
1694	* I/O in bforget() - it's more efficient to wait on the I/O only if we really
1695	* need to. That happens here.
1696	*/
1697	void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
1698	{
1699	struct inode *bd_inode = bdev->bd_inode;
1700	struct address_space *bd_mapping = bd_inode->i_mapping;
1701	struct folio_batch fbatch;
1702	pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
1703	pgoff_t end;
1704	int i, count;
1705	struct buffer_head *bh;
1706	struct buffer_head *head;
1707
1708	end = ((loff_t)(block + len - `1`) << bd_inode->i_blkbits) / PAGE_SIZE;
1709	folio_batch_init(fbatch: &fbatch);
1710	while (filemap_get_folios(mapping: bd_mapping, start: &index, end, fbatch: &fbatch)) {
1711	count = folio_batch_count(fbatch: &fbatch);
1712	for (i = `0`; i < count; i++) {
1713	struct folio *folio = fbatch.folios[i];
1714
1715	if (!folio_buffers(folio))
1716	continue;
1717	/*
1718	* We use folio lock instead of bd_mapping->i_private_lock
1719	* to pin buffers here since we can afford to sleep and
1720	* it scales better than a global spinlock lock.
1721	*/
1722	folio_lock(folio);
1723	/ Recheck when the folio is locked which pins bhs /
1724	head = folio_buffers(folio);
1725	if (!head)
1726	goto unlock_page;
1727	bh = head;
1728	do {
1729	if (!buffer_mapped(bh) \|\| (bh->b_blocknr < block))
1730	goto next;
1731	if (bh->b_blocknr >= block + len)
1732	break;
1733	clear_buffer_dirty(bh);
1734	wait_on_buffer(bh);
1735	clear_buffer_req(bh);
1736	next:
1737	bh = bh->b_this_page;
1738	} while (bh != head);
1739	unlock_page:
1740	folio_unlock(folio);
1741	}
1742	folio_batch_release(fbatch: &fbatch);
1743	cond_resched();
1744	/ End of range already reached? /
1745	if (index > end \|\| !index)
1746	break;
1747	}
1748	}
1749	EXPORT_SYMBOL(clean_bdev_aliases);
1750
1751	static struct buffer_head folio_create_buffers(struct* folio *folio,
1752	struct inode *inode,
1753	unsigned int b_state)
1754	{
1755	struct buffer_head *bh;
1756
1757	BUG_ON(!folio_test_locked(folio));
1758
1759	bh = folio_buffers(folio);
1760	if (!bh)
1761	bh = create_empty_buffers(folio,
1762	`1` << READ_ONCE(inode->i_blkbits), b_state);
1763	return bh;
1764	}
1765
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
1778
1779	/*
1780	* While block_write_full_folio is writing back the dirty buffers under
1781	* the page lock, whoever dirtied the buffers may decide to clean them
1782	* again at any time. We handle that by only looking at the buffer
1783	* state inside lock_buffer().
1784	*
1785	* If block_write_full_folio() is called for regular writeback
1786	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1787	* locked buffer. This only can happen if someone has written the buffer
1788	* directly, with submit_bh(). At the address_space level PageWriteback
1789	* prevents this contention from occurring.
1790	*
1791	* If block_write_full_folio() is called with wbc->sync_mode ==
1792	* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
1793	* causes the writes to be flagged as synchronous writes.
1794	*/
1795	int __block_write_full_folio(struct inode inode, struct* folio *folio,
1796	get_block_t get_block, struct* writeback_control *wbc)
1797	{
1798	int err;
1799	sector_t block;
1800	sector_t last_block;
1801	struct buffer_head bh, head;
1802	size_t blocksize;
1803	int nr_underway = `0`;
1804	blk_opf_t write_flags = wbc_to_write_flags(wbc);
1805
1806	head = folio_create_buffers(folio, inode,
1807	b_state: (`1` << BH_Dirty) \| (`1` << BH_Uptodate));
1808
1809	/*
1810	* Be very careful. We have no exclusion from block_dirty_folio
1811	* here, and the (potentially unmapped) buffers may become dirty at
1812	* any time. If a buffer becomes dirty here after we've inspected it
1813	* then we just miss that fact, and the folio stays dirty.
1814	*
1815	* Buffers outside i_size may be dirtied by block_dirty_folio;
1816	* handle that here by just cleaning them.
1817	*/
1818
1819	bh = head;
1820	blocksize = bh->b_size;
1821
1822	block = div_u64(dividend: folio_pos(folio), divisor: blocksize);
1823	last_block = div_u64(dividend: i_size_read(inode) - `1`, divisor: blocksize);
1824
1825	/*
1826	* Get all the dirty buffers mapped to disk addresses and
1827	* handle any aliases from the underlying blockdev's mapping.
1828	*/
1829	do {
1830	if (block > last_block) {
1831	/*
1832	* mapped buffers outside i_size will occur, because
1833	* this folio can be outside i_size when there is a
1834	* truncate in progress.
1835	*/
1836	/*
1837	* The buffer was zeroed by block_write_full_folio()
1838	*/
1839	clear_buffer_dirty(bh);
1840	set_buffer_uptodate(bh);
1841	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
1842	buffer_dirty(bh)) {
1843	WARN_ON(bh->b_size != blocksize);
1844	err = get_block(inode, block, bh, `1`);
1845	if (err)
1846	goto recover;
1847	clear_buffer_delay(bh);
1848	if (buffer_new(bh)) {
1849	/ blockdev mappings never come here /
1850	clear_buffer_new(bh);
1851	clean_bdev_bh_alias(bh);
1852	}
1853	}
1854	bh = bh->b_this_page;
1855	block++;
1856	} while (bh != head);
1857
1858	do {
1859	if (!buffer_mapped(bh))
1860	continue;
1861	/*
1862	* If it's a fully non-blocking write attempt and we cannot
1863	* lock the buffer then redirty the folio. Note that this can
1864	* potentially cause a busy-wait loop from writeback threads
1865	* and kswapd activity, but those code paths have their own
1866	* higher-level throttling.
1867	*/
1868	if (wbc->sync_mode != WB_SYNC_NONE) {
1869	lock_buffer(bh);
1870	} else if (!trylock_buffer(bh)) {
1871	folio_redirty_for_writepage(wbc, folio);
1872	continue;
1873	}
1874	if (test_clear_buffer_dirty(bh)) {
1875	mark_buffer_async_write_endio(bh,
1876	handler: end_buffer_async_write);
1877	} else {
1878	unlock_buffer(bh);
1879	}
1880	} while ((bh = bh->b_this_page) != head);
1881
1882	/*
1883	* The folio and its buffers are protected by the writeback flag,
1884	* so we can drop the bh refcounts early.
1885	*/
1886	BUG_ON(folio_test_writeback(folio));
1887	folio_start_writeback(folio);
1888
1889	do {
1890	struct buffer_head *next = bh->b_this_page;
1891	if (buffer_async_write(bh)) {
1892	submit_bh_wbc(opf: REQ_OP_WRITE \| write_flags, bh,
1893	hint: inode->i_write_hint, wbc);
1894	nr_underway++;
1895	}
1896	bh = next;
1897	} while (bh != head);
1898	folio_unlock(folio);
1899
1900	err = `0`;
1901	done:
1902	if (nr_underway == `0`) {
1903	/*
1904	* The folio was marked dirty, but the buffers were
1905	* clean. Someone wrote them back by hand with
1906	* write_dirty_buffer/submit_bh. A rare case.
1907	*/
1908	folio_end_writeback(folio);
1909
1910	/*
1911	* The folio and buffer_heads can be released at any time from
1912	* here on.
1913	*/
1914	}
1915	return err;
1916
1917	recover:
1918	/*
1919	* ENOSPC, or some other error. We may already have added some
1920	* blocks to the file, so we need to write these out to avoid
1921	* exposing stale data.
1922	* The folio is currently locked and not marked for writeback
1923	*/
1924	bh = head;
1925	/ Recovery: lock and submit the mapped buffers /
1926	do {
1927	if (buffer_mapped(bh) && buffer_dirty(bh) &&
1928	!buffer_delay(bh)) {
1929	lock_buffer(bh);
1930	mark_buffer_async_write_endio(bh,
1931	handler: end_buffer_async_write);
1932	} else {
1933	/*
1934	* The buffer may have been set dirty during
1935	* attachment to a dirty folio.
1936	*/
1937	clear_buffer_dirty(bh);
1938	}
1939	} while ((bh = bh->b_this_page) != head);
1940	folio_set_error(folio);
1941	BUG_ON(folio_test_writeback(folio));
1942	mapping_set_error(mapping: folio->mapping, error: err);
1943	folio_start_writeback(folio);
1944	do {
1945	struct buffer_head *next = bh->b_this_page;
1946	if (buffer_async_write(bh)) {
1947	clear_buffer_dirty(bh);
1948	submit_bh_wbc(opf: REQ_OP_WRITE \| write_flags, bh,
1949	hint: inode->i_write_hint, wbc);
1950	nr_underway++;
1951	}
1952	bh = next;
1953	} while (bh != head);
1954	folio_unlock(folio);
1955	goto done;
1956	}
1957	EXPORT_SYMBOL(__block_write_full_folio);
1958
1959	/*
1960	* If a folio has any new buffers, zero them out here, and mark them uptodate
1961	* and dirty so they'll be written out (in order to prevent uninitialised
1962	* block data from leaking). And clear the new bit.
1963	*/
1964	void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
1965	{
1966	size_t block_start, block_end;
1967	struct buffer_head head, bh;
1968
1969	BUG_ON(!folio_test_locked(folio));
1970	head = folio_buffers(folio);
1971	if (!head)
1972	return;
1973
1974	bh = head;
1975	block_start = `0`;
1976	do {
1977	block_end = block_start + bh->b_size;
1978
1979	if (buffer_new(bh)) {
1980	if (block_end > from && block_start < to) {
1981	if (!folio_test_uptodate(folio)) {
1982	size_t start, xend;
1983
1984	start = max(from, block_start);
1985	xend = min(to, block_end);
1986
1987	folio_zero_segment(folio, start, xend);
1988	set_buffer_uptodate(bh);
1989	}
1990
1991	clear_buffer_new(bh);
1992	mark_buffer_dirty(bh);
1993	}
1994	}
1995
1996	block_start = block_end;
1997	bh = bh->b_this_page;
1998	} while (bh != head);
1999	}
2000	EXPORT_SYMBOL(folio_zero_new_buffers);
2001
2002	static int
2003	iomap_to_bh(struct inode inode, sector_t block, struct* buffer_head *bh,
2004	const struct iomap *iomap)
2005	{
2006	loff_t offset = (loff_t)block << inode->i_blkbits;
2007
2008	bh->b_bdev = iomap->bdev;
2009
2010	/*
2011	* Block points to offset in file we need to map, iomap contains
2012	* the offset at which the map starts. If the map ends before the
2013	* current block, then do not map the buffer and let the caller
2014	* handle it.
2015	*/
2016	if (offset >= iomap->offset + iomap->length)
2017	return -EIO;
2018
2019	switch (iomap->type) {
2020	case IOMAP_HOLE:
2021	/*
2022	* If the buffer is not up to date or beyond the current EOF,
2023	* we need to mark it as new to ensure sub-block zeroing is
2024	* executed if necessary.
2025	*/
2026	if (!buffer_uptodate(bh) \|\|
2027	(offset >= i_size_read(inode)))
2028	set_buffer_new(bh);
2029	return `0`;
2030	case IOMAP_DELALLOC:
2031	if (!buffer_uptodate(bh) \|\|
2032	(offset >= i_size_read(inode)))
2033	set_buffer_new(bh);
2034	set_buffer_uptodate(bh);
2035	set_buffer_mapped(bh);
2036	set_buffer_delay(bh);
2037	return `0`;
2038	case IOMAP_UNWRITTEN:
2039	/*
2040	* For unwritten regions, we always need to ensure that regions
2041	* in the block we are not writing to are zeroed. Mark the
2042	* buffer as new to ensure this.
2043	*/
2044	set_buffer_new(bh);
2045	set_buffer_unwritten(bh);
2046	fallthrough;
2047	case IOMAP_MAPPED:
2048	if ((iomap->flags & IOMAP_F_NEW) \|\|
2049	offset >= i_size_read(inode)) {
2050	/*
2051	* This can happen if truncating the block device races
2052	* with the check in the caller as i_size updates on
2053	* block devices aren't synchronized by i_rwsem for
2054	* block devices.
2055	*/
2056	if (S_ISBLK(inode->i_mode))
2057	return -EIO;
2058	set_buffer_new(bh);
2059	}
2060	bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
2061	inode->i_blkbits;
2062	set_buffer_mapped(bh);
2063	return `0`;
2064	default:
2065	WARN_ON_ONCE(`1`);
2066	return -EIO;
2067	}
2068	}
2069
2070	int __block_write_begin_int(struct folio folio, loff_t pos, unsigned* len,
2071	get_block_t get_block, const* struct iomap *iomap)
2072	{
2073	size_t from = offset_in_folio(folio, pos);
2074	size_t to = from + len;
2075	struct inode *inode = folio->mapping->host;
2076	size_t block_start, block_end;
2077	sector_t block;
2078	int err = `0`;
2079	size_t blocksize;
2080	struct buffer_head bh, head, wait[`2`], *wait_bh=wait;
2081
2082	BUG_ON(!folio_test_locked(folio));
2083	BUG_ON(to > folio_size(folio));
2084	BUG_ON(from > to);
2085
2086	head = folio_create_buffers(folio, inode, b_state: `0`);
2087	blocksize = head->b_size;
2088	block = div_u64(dividend: folio_pos(folio), divisor: blocksize);
2089
2090	for (bh = head, block_start = `0`; bh != head \|\| !block_start;
2091	block++, block_start=block_end, bh = bh->b_this_page) {
2092	block_end = block_start + blocksize;
2093	if (block_end <= from \|\| block_start >= to) {
2094	if (folio_test_uptodate(folio)) {
2095	if (!buffer_uptodate(bh))
2096	set_buffer_uptodate(bh);
2097	}
2098	continue;
2099	}
2100	if (buffer_new(bh))
2101	clear_buffer_new(bh);
2102	if (!buffer_mapped(bh)) {
2103	WARN_ON(bh->b_size != blocksize);
2104	if (get_block)
2105	err = get_block(inode, block, bh, `1`);
2106	else
2107	err = iomap_to_bh(inode, block, bh, iomap);
2108	if (err)
2109	break;
2110
2111	if (buffer_new(bh)) {
2112	clean_bdev_bh_alias(bh);
2113	if (folio_test_uptodate(folio)) {
2114	clear_buffer_new(bh);
2115	set_buffer_uptodate(bh);
2116	mark_buffer_dirty(bh);
2117	continue;
2118	}
2119	if (block_end > to \|\| block_start < from)
2120	folio_zero_segments(folio,
2121	start1: to, xend1: block_end,
2122	start2: block_start, xend2: from);
2123	continue;
2124	}
2125	}
2126	if (folio_test_uptodate(folio)) {
2127	if (!buffer_uptodate(bh))
2128	set_buffer_uptodate(bh);
2129	continue;
2130	}
2131	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2132	!buffer_unwritten(bh) &&
2133	(block_start < from \|\| block_end > to)) {
2134	bh_read_nowait(bh, op_flags: `0`);
2135	*wait_bh++=bh;
2136	}
2137	}
2138	/*
2139	* If we issued read requests - let them complete.
2140	*/
2141	while(wait_bh > wait) {
2142	wait_on_buffer(bh: *--wait_bh);
2143	if (!buffer_uptodate(bh: *wait_bh))
2144	err = -EIO;
2145	}
2146	if (unlikely(err))
2147	folio_zero_new_buffers(folio, from, to);
2148	return err;
2149	}
2150
2151	int __block_write_begin(struct page page, loff_t pos, unsigned* len,
2152	get_block_t *get_block)
2153	{
2154	return __block_write_begin_int(page_folio(page), pos, len, get_block,
2155	NULL);
2156	}
2157	EXPORT_SYMBOL(__block_write_begin);
2158
2159	static void __block_commit_write(struct folio *folio, size_t from, size_t to)
2160	{
2161	size_t block_start, block_end;
2162	bool partial = false;
2163	unsigned blocksize;
2164	struct buffer_head bh, head;
2165
2166	bh = head = folio_buffers(folio);
2167	blocksize = bh->b_size;
2168
2169	block_start = `0`;
2170	do {
2171	block_end = block_start + blocksize;
2172	if (block_end <= from \|\| block_start >= to) {
2173	if (!buffer_uptodate(bh))
2174	partial = true;
2175	} else {
2176	set_buffer_uptodate(bh);
2177	mark_buffer_dirty(bh);
2178	}
2179	if (buffer_new(bh))
2180	clear_buffer_new(bh);
2181
2182	block_start = block_end;
2183	bh = bh->b_this_page;
2184	} while (bh != head);
2185
2186	/*
2187	* If this is a partial write which happened to make all buffers
2188	* uptodate then we can optimize away a bogus read_folio() for
2189	* the next read(). Here we 'discover' whether the folio went
2190	* uptodate as a result of this (potentially partial) write.
2191	*/
2192	if (!partial)
2193	folio_mark_uptodate(folio);
2194	}
2195
2196	/*
2197	* block_write_begin takes care of the basic task of block allocation and
2198	* bringing partial write blocks uptodate first.
2199	*
2200	* The filesystem needs to handle block truncation upon failure.
2201	*/
2202	int block_write_begin(struct address_space mapping, loff_t pos, unsigned* len,
2203	struct page *pagep, get_block_t get_block)
2204	{
2205	pgoff_t index = pos >> PAGE_SHIFT;
2206	struct page *page;
2207	int status;
2208
2209	page = grab_cache_page_write_begin(mapping, index);
2210	if (!page)
2211	return -ENOMEM;
2212
2213	status = __block_write_begin(page, pos, len, get_block);
2214	if (unlikely(status)) {
2215	unlock_page(page);
2216	put_page(page);
2217	page = NULL;
2218	}
2219
2220	*pagep = page;
2221	return status;
2222	}
2223	EXPORT_SYMBOL(block_write_begin);
2224
2225	int block_write_end(struct file file, struct* address_space *mapping,
2226	loff_t pos, unsigned len, unsigned copied,
2227	struct page page, void* *fsdata)
2228	{
2229	struct folio *folio = page_folio(page);
2230	size_t start = pos - folio_pos(folio);
2231
2232	if (unlikely(copied < len)) {
2233	/*
2234	* The buffers that were written will now be uptodate, so
2235	* we don't have to worry about a read_folio reading them
2236	* and overwriting a partial write. However if we have
2237	* encountered a short write and only partially written
2238	* into a buffer, it will not be marked uptodate, so a
2239	* read_folio might come in and destroy our partial write.
2240	*
2241	* Do the simplest thing, and just treat any short write to a
2242	* non uptodate folio as a zero-length write, and force the
2243	* caller to redo the whole thing.
2244	*/
2245	if (!folio_test_uptodate(folio))
2246	copied = `0`;
2247
2248	folio_zero_new_buffers(folio, start+copied, start+len);
2249	}
2250	flush_dcache_folio(folio);
2251
2252	/ This could be a short (even 0-length) commit /
2253	__block_commit_write(folio, from: start, to: start + copied);
2254
2255	return copied;
2256	}
2257	EXPORT_SYMBOL(block_write_end);
2258
2259	int generic_write_end(struct file file, struct* address_space *mapping,
2260	loff_t pos, unsigned len, unsigned copied,
2261	struct page page, void* *fsdata)
2262	{
2263	struct inode *inode = mapping->host;
2264	loff_t old_size = inode->i_size;
2265	bool i_size_changed = false;
2266
2267	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2268
2269	/*
2270	* No need to use i_size_read() here, the i_size cannot change under us
2271	* because we hold i_rwsem.
2272	*
2273	* But it's important to update i_size while still holding page lock:
2274	* page writeout could otherwise come in and zero beyond i_size.
2275	*/
2276	if (pos + copied > inode->i_size) {
2277	i_size_write(inode, i_size: pos + copied);
2278	i_size_changed = true;
2279	}
2280
2281	unlock_page(page);
2282	put_page(page);
2283
2284	if (old_size < pos)
2285	pagecache_isize_extended(inode, from: old_size, to: pos);
2286	/*
2287	* Don't mark the inode dirty under page lock. First, it unnecessarily
2288	* makes the holding time of page lock longer. Second, it forces lock
2289	* ordering of page lock and transaction start for journaling
2290	* filesystems.
2291	*/
2292	if (i_size_changed)
2293	mark_inode_dirty(inode);
2294	return copied;
2295	}
2296	EXPORT_SYMBOL(generic_write_end);
2297
2298	/*
2299	* block_is_partially_uptodate checks whether buffers within a folio are
2300	* uptodate or not.
2301	*
2302	* Returns true if all buffers which correspond to the specified part
2303	* of the folio are uptodate.
2304	*/
2305	bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
2306	{
2307	unsigned block_start, block_end, blocksize;
2308	unsigned to;
2309	struct buffer_head bh, head;
2310	bool ret = true;
2311
2312	head = folio_buffers(folio);
2313	if (!head)
2314	return false;
2315	blocksize = head->b_size;
2316	to = min_t(unsigned, folio_size(folio) - from, count);
2317	to = from + to;
2318	if (from < blocksize && to > folio_size(folio) - blocksize)
2319	return false;
2320
2321	bh = head;
2322	block_start = `0`;
2323	do {
2324	block_end = block_start + blocksize;
2325	if (block_end > from && block_start < to) {
2326	if (!buffer_uptodate(bh)) {
2327	ret = false;
2328	break;
2329	}
2330	if (block_end >= to)
2331	break;
2332	}
2333	block_start = block_end;
2334	bh = bh->b_this_page;
2335	} while (bh != head);
2336
2337	return ret;
2338	}
2339	EXPORT_SYMBOL(block_is_partially_uptodate);
2340
2341	/*
2342	* Generic "read_folio" function for block devices that have the normal
2343	* get_block functionality. This is most of the block device filesystems.
2344	* Reads the folio asynchronously --- the unlock_buffer() and
2345	* set/clear_buffer_uptodate() functions propagate buffer state into the
2346	* folio once IO has completed.
2347	*/
2348	int block_read_full_folio(struct folio folio, get_block_t get_block)
2349	{
2350	struct inode *inode = folio->mapping->host;
2351	sector_t iblock, lblock;
2352	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
2353	size_t blocksize;
2354	int nr, i;
2355	int fully_mapped = `1`;
2356	bool page_error = false;
2357	loff_t limit = i_size_read(inode);
2358
2359	/ This is needed for ext4. /
2360	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
2361	limit = inode->i_sb->s_maxbytes;
2362
2363	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
2364
2365	head = folio_create_buffers(folio, inode, b_state: `0`);
2366	blocksize = head->b_size;
2367
2368	iblock = div_u64(dividend: folio_pos(folio), divisor: blocksize);
2369	lblock = div_u64(dividend: limit + blocksize - `1`, divisor: blocksize);
2370	bh = head;
2371	nr = `0`;
2372	i = `0`;
2373
2374	do {
2375	if (buffer_uptodate(bh))
2376	continue;
2377
2378	if (!buffer_mapped(bh)) {
2379	int err = `0`;
2380
2381	fully_mapped = `0`;
2382	if (iblock < lblock) {
2383	WARN_ON(bh->b_size != blocksize);
2384	err = get_block(inode, iblock, bh, `0`);
2385	if (err) {
2386	folio_set_error(folio);
2387	page_error = true;
2388	}
2389	}
2390	if (!buffer_mapped(bh)) {
2391	folio_zero_range(folio, start: i * blocksize,
2392	length: blocksize);
2393	if (!err)
2394	set_buffer_uptodate(bh);
2395	continue;
2396	}
2397	/*
2398	* get_block() might have updated the buffer
2399	* synchronously
2400	*/
2401	if (buffer_uptodate(bh))
2402	continue;
2403	}
2404	arr[nr++] = bh;
2405	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2406
2407	if (fully_mapped)
2408	folio_set_mappedtodisk(folio);
2409
2410	if (!nr) {
2411	/*
2412	* All buffers are uptodate or get_block() returned an
2413	* error when trying to map them - we can finish the read.
2414	*/
2415	folio_end_read(folio, success: !page_error);
2416	return `0`;
2417	}
2418
2419	/ Stage two: lock the buffers /
2420	for (i = `0`; i < nr; i++) {
2421	bh = arr[i];
2422	lock_buffer(bh);
2423	mark_buffer_async_read(bh);
2424	}
2425
2426	/*
2427	* Stage 3: start the IO. Check for uptodateness
2428	* inside the buffer lock in case another process reading
2429	* the underlying blockdev brought it uptodate (the sct fix).
2430	*/
2431	for (i = `0`; i < nr; i++) {
2432	bh = arr[i];
2433	if (buffer_uptodate(bh))
2434	end_buffer_async_read(bh, uptodate: `1`);
2435	else
2436	submit_bh(REQ_OP_READ, bh);
2437	}
2438	return `0`;
2439	}
2440	EXPORT_SYMBOL(block_read_full_folio);
2441
2442	/ utility function for filesystems that need to do work on expanding*
2443	* truncates. Uses filesystem pagecache writes to allow the filesystem to
2444	* deal with the hole.
2445	*/
2446	int generic_cont_expand_simple(struct inode *inode, loff_t size)
2447	{
2448	struct address_space *mapping = inode->i_mapping;
2449	const struct address_space_operations *aops = mapping->a_ops;
2450	struct page *page;
2451	void *fsdata = NULL;
2452	int err;
2453
2454	err = inode_newsize_ok(inode, offset: size);
2455	if (err)
2456	goto out;
2457
2458	err = aops->write_begin(NULL, mapping, size, `0`, &page, &fsdata);
2459	if (err)
2460	goto out;
2461
2462	err = aops->write_end(NULL, mapping, size, `0`, `0`, page, fsdata);
2463	BUG_ON(err > `0`);
2464
2465	out:
2466	return err;
2467	}
2468	EXPORT_SYMBOL(generic_cont_expand_simple);
2469
2470	static int cont_expand_zero(struct file file, struct* address_space *mapping,
2471	loff_t pos, loff_t *bytes)
2472	{
2473	struct inode *inode = mapping->host;
2474	const struct address_space_operations *aops = mapping->a_ops;
2475	unsigned int blocksize = i_blocksize(node: inode);
2476	struct page *page;
2477	void *fsdata = NULL;
2478	pgoff_t index, curidx;
2479	loff_t curpos;
2480	unsigned zerofrom, offset, len;
2481	int err = `0`;
2482
2483	index = pos >> PAGE_SHIFT;
2484	offset = pos & ~PAGE_MASK;
2485
2486	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2487	zerofrom = curpos & ~PAGE_MASK;
2488	if (zerofrom & (blocksize-`1`)) {
2489	*bytes \|= (blocksize-`1`);
2490	(*bytes)++;
2491	}
2492	len = PAGE_SIZE - zerofrom;
2493
2494	err = aops->write_begin(file, mapping, curpos, len,
2495	&page, &fsdata);
2496	if (err)
2497	goto out;
2498	zero_user(page, start: zerofrom, size: len);
2499	err = aops->write_end(file, mapping, curpos, len, len,
2500	page, fsdata);
2501	if (err < `0`)
2502	goto out;
2503	BUG_ON(err != len);
2504	err = `0`;
2505
2506	balance_dirty_pages_ratelimited(mapping);
2507
2508	if (fatal_signal_pending(current)) {
2509	err = -EINTR;
2510	goto out;
2511	}
2512	}
2513
2514	/ page covers the boundary, find the boundary offset /
2515	if (index == curidx) {
2516	zerofrom = curpos & ~PAGE_MASK;
2517	/ if we will expand the thing last block will be filled /
2518	if (offset <= zerofrom) {
2519	goto out;
2520	}
2521	if (zerofrom & (blocksize-`1`)) {
2522	*bytes \|= (blocksize-`1`);
2523	(*bytes)++;
2524	}
2525	len = offset - zerofrom;
2526
2527	err = aops->write_begin(file, mapping, curpos, len,
2528	&page, &fsdata);
2529	if (err)
2530	goto out;
2531	zero_user(page, start: zerofrom, size: len);
2532	err = aops->write_end(file, mapping, curpos, len, len,
2533	page, fsdata);
2534	if (err < `0`)
2535	goto out;
2536	BUG_ON(err != len);
2537	err = `0`;
2538	}
2539	out:
2540	return err;
2541	}
2542
2543	/*
2544	* For moronic filesystems that do not allow holes in file.
2545	* We may have to extend the file.
2546	*/
2547	int cont_write_begin(struct file file, struct* address_space *mapping,
2548	loff_t pos, unsigned len,
2549	struct page *pagep, void* **fsdata,
2550	get_block_t get_block, loff_t bytes)
2551	{
2552	struct inode *inode = mapping->host;
2553	unsigned int blocksize = i_blocksize(node: inode);
2554	unsigned int zerofrom;
2555	int err;
2556
2557	err = cont_expand_zero(file, mapping, pos, bytes);
2558	if (err)
2559	return err;
2560
2561	zerofrom = *bytes & ~PAGE_MASK;
2562	if (pos+len > *bytes && zerofrom & (blocksize-`1`)) {
2563	*bytes \|= (blocksize-`1`);
2564	(*bytes)++;
2565	}
2566
2567	return block_write_begin(mapping, pos, len, pagep, get_block);
2568	}
2569	EXPORT_SYMBOL(cont_write_begin);
2570
/*
 * block_commit_write - page-based wrapper for __block_commit_write()
 *
 * Converts @page to its folio and commits the byte range [@from, @to)
 * within it.
 */
void block_commit_write(struct page *page, unsigned from, unsigned to)
{
	struct folio *folio = page_folio(page);
	__block_commit_write(folio, from, to);
}
EXPORT_SYMBOL(block_commit_write);
2577
2578	/*
2579	* block_page_mkwrite() is not allowed to change the file size as it gets
2580	* called from a page fault handler when a page is first dirtied. Hence we must
2581	* be careful to check for EOF conditions here. We set the page up correctly
2582	* for a written page which means we get ENOSPC checking when writing into
2583	* holes and correct delalloc and unwritten extent mapping on filesystems that
2584	* support these features.
2585	*
2586	* We are not allowed to take the i_mutex here so we have to play games to
2587	* protect against truncate races as the page could now be beyond EOF. Because
2588	* truncate writes the inode size before removing pages, once we have the
2589	* page lock we can determine safely if the page is beyond EOF. If it is not
2590	* beyond EOF, then the page is guaranteed safe against truncation until we
2591	* unlock the page.
2592	*
2593	* Direct callers of this function should protect against filesystem freezing
2594	* using sb_start_pagefault() - sb_end_pagefault() functions.
2595	*/
2596	int block_page_mkwrite(struct vm_area_struct vma, struct* vm_fault *vmf,
2597	get_block_t get_block)
2598	{
2599	struct folio *folio = page_folio(vmf->page);
2600	struct inode *inode = file_inode(f: vma->vm_file);
2601	unsigned long end;
2602	loff_t size;
2603	int ret;
2604
2605	folio_lock(folio);
2606	size = i_size_read(inode);
2607	if ((folio->mapping != inode->i_mapping) \|\|
2608	(folio_pos(folio) >= size)) {
2609	/ We overload EFAULT to mean page got truncated /
2610	ret = -EFAULT;
2611	goto out_unlock;
2612	}
2613
2614	end = folio_size(folio);
2615	/ folio is wholly or partially inside EOF /
2616	if (folio_pos(folio) + end > size)
2617	end = size - folio_pos(folio);
2618
2619	ret = __block_write_begin_int(folio, pos: `0`, len: end, get_block, NULL);
2620	if (unlikely(ret))
2621	goto out_unlock;
2622
2623	__block_commit_write(folio, from: `0`, to: end);
2624
2625	folio_mark_dirty(folio);
2626	folio_wait_stable(folio);
2627	return `0`;
2628	out_unlock:
2629	folio_unlock(folio);
2630	return ret;
2631	}
2632	EXPORT_SYMBOL(block_page_mkwrite);
2633
2634	int block_truncate_page(struct address_space *mapping,
2635	loff_t from, get_block_t *get_block)
2636	{
2637	pgoff_t index = from >> PAGE_SHIFT;
2638	unsigned blocksize;
2639	sector_t iblock;
2640	size_t offset, length, pos;
2641	struct inode *inode = mapping->host;
2642	struct folio *folio;
2643	struct buffer_head *bh;
2644	int err = `0`;
2645
2646	blocksize = i_blocksize(node: inode);
2647	length = from & (blocksize - `1`);
2648
2649	/ Block boundary? Nothing to do /
2650	if (!length)
2651	return `0`;
2652
2653	length = blocksize - length;
2654	iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
2655
2656	folio = filemap_grab_folio(mapping, index);
2657	if (IS_ERR(ptr: folio))
2658	return PTR_ERR(ptr: folio);
2659
2660	bh = folio_buffers(folio);
2661	if (!bh)
2662	bh = create_empty_buffers(folio, blocksize, `0`);
2663
2664	/ Find the buffer that contains "offset" /
2665	offset = offset_in_folio(folio, from);
2666	pos = blocksize;
2667	while (offset >= pos) {
2668	bh = bh->b_this_page;
2669	iblock++;
2670	pos += blocksize;
2671	}
2672
2673	if (!buffer_mapped(bh)) {
2674	WARN_ON(bh->b_size != blocksize);
2675	err = get_block(inode, iblock, bh, `0`);
2676	if (err)
2677	goto unlock;
2678	/ unmapped? It's a hole - nothing to do /
2679	if (!buffer_mapped(bh))
2680	goto unlock;
2681	}
2682
2683	/ Ok, it's mapped. Make sure it's up-to-date /
2684	if (folio_test_uptodate(folio))
2685	set_buffer_uptodate(bh);
2686
2687	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2688	err = bh_read(bh, op_flags: `0`);
2689	/ Uhhuh. Read error. Complain and punt. /
2690	if (err < `0`)
2691	goto unlock;
2692	}
2693
2694	folio_zero_range(folio, start: offset, length);
2695	mark_buffer_dirty(bh);
2696
2697	unlock:
2698	folio_unlock(folio);
2699	folio_put(folio);
2700
2701	return err;
2702	}
2703	EXPORT_SYMBOL(block_truncate_page);
2704
2705	/*
2706	* The generic ->writepage function for buffer-backed address_spaces
2707	*/
2708	int block_write_full_folio(struct folio folio, struct* writeback_control *wbc,
2709	void *get_block)
2710	{
2711	struct inode * const inode = folio->mapping->host;
2712	loff_t i_size = i_size_read(inode);
2713
2714	/ Is the folio fully inside i_size? /
2715	if (folio_pos(folio) + folio_size(folio) <= i_size)
2716	return __block_write_full_folio(inode, folio, get_block, wbc);
2717
2718	/ Is the folio fully outside i_size? (truncate in progress) /
2719	if (folio_pos(folio) >= i_size) {
2720	folio_unlock(folio);
2721	return `0`; / don't care /
2722	}
2723
2724	/*
2725	* The folio straddles i_size. It must be zeroed out on each and every
2726	* writepage invocation because it may be mmapped. "A file is mapped
2727	* in multiples of the page size. For a file that is not a multiple of
2728	* the page size, the remaining memory is zeroed when mapped, and
2729	* writes to that region are not written out to the file."
2730	*/
2731	folio_zero_segment(folio, offset_in_folio(folio, i_size),
2732	xend: folio_size(folio));
2733	return __block_write_full_folio(inode, folio, get_block, wbc);
2734	}
2735
2736	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2737	get_block_t *get_block)
2738	{
2739	struct inode *inode = mapping->host;
2740	struct buffer_head tmp = {
2741	.b_size = i_blocksize(node: inode),
2742	};
2743
2744	get_block(inode, block, &tmp, `0`);
2745	return tmp.b_blocknr;
2746	}
2747	EXPORT_SYMBOL(generic_block_bmap);
2748
2749	static void end_bio_bh_io_sync(struct bio *bio)
2750	{
2751	struct buffer_head *bh = bio->bi_private;
2752
2753	if (unlikely(bio_flagged(bio, BIO_QUIET)))
2754	set_bit(nr: BH_Quiet, addr: &bh->b_state);
2755
2756	bh->b_end_io(bh, !bio->bi_status);
2757	bio_put(bio);
2758	}
2759
2760	static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
2761	enum rw_hint write_hint,
2762	struct writeback_control *wbc)
2763	{
2764	const enum req_op op = opf & REQ_OP_MASK;
2765	struct bio *bio;
2766
2767	BUG_ON(!buffer_locked(bh));
2768	BUG_ON(!buffer_mapped(bh));
2769	BUG_ON(!bh->b_end_io);
2770	BUG_ON(buffer_delay(bh));
2771	BUG_ON(buffer_unwritten(bh));
2772
2773	/*
2774	* Only clear out a write error when rewriting
2775	*/
2776	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
2777	clear_buffer_write_io_error(bh);
2778
2779	if (buffer_meta(bh))
2780	opf \|= REQ_META;
2781	if (buffer_prio(bh))
2782	opf \|= REQ_PRIO;
2783
2784	bio = bio_alloc(bdev: bh->b_bdev, nr_vecs: `1`, opf, GFP_NOIO);
2785
2786	fscrypt_set_bio_crypt_ctx_bh(bio, first_bh: bh, GFP_NOIO);
2787
2788	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> `9`);
2789	bio->bi_write_hint = write_hint;
2790
2791	__bio_add_page(bio, page: bh->b_page, len: bh->b_size, off: bh_offset(bh));
2792
2793	bio->bi_end_io = end_bio_bh_io_sync;
2794	bio->bi_private = bh;
2795
2796	/ Take care of bh's that straddle the end of the device /
2797	guard_bio_eod(bio);
2798
2799	if (wbc) {
2800	wbc_init_bio(wbc, bio);
2801	wbc_account_cgroup_owner(wbc, page: bh->b_page, bytes: bh->b_size);
2802	}
2803
2804	submit_bio(bio);
2805	}
2806
2807	void submit_bh(blk_opf_t opf, struct buffer_head *bh)
2808	{
2809	submit_bh_wbc(opf, bh, write_hint: WRITE_LIFE_NOT_SET, NULL);
2810	}
2811	EXPORT_SYMBOL(submit_bh);
2812
2813	void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2814	{
2815	lock_buffer(bh);
2816	if (!test_clear_buffer_dirty(bh)) {
2817	unlock_buffer(bh);
2818	return;
2819	}
2820	bh->b_end_io = end_buffer_write_sync;
2821	get_bh(bh);
2822	submit_bh(REQ_OP_WRITE \| op_flags, bh);
2823	}
2824	EXPORT_SYMBOL(write_dirty_buffer);
2825
2826	/*
2827	* For a data-integrity writeout, we need to wait upon any in-progress I/O
2828	* and then start new I/O and then wait upon it. The caller must have a ref on
2829	* the buffer_head.
2830	*/
2831	int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
2832	{
2833	WARN_ON(atomic_read(&bh->b_count) < `1`);
2834	lock_buffer(bh);
2835	if (test_clear_buffer_dirty(bh)) {
2836	/*
2837	* The bh should be mapped, but it might not be if the
2838	* device was hot-removed. Not much we can do but fail the I/O.
2839	*/
2840	if (!buffer_mapped(bh)) {
2841	unlock_buffer(bh);
2842	return -EIO;
2843	}
2844
2845	get_bh(bh);
2846	bh->b_end_io = end_buffer_write_sync;
2847	submit_bh(REQ_OP_WRITE \| op_flags, bh);
2848	wait_on_buffer(bh);
2849	if (!buffer_uptodate(bh))
2850	return -EIO;
2851	} else {
2852	unlock_buffer(bh);
2853	}
2854	return `0`;
2855	}
2856	EXPORT_SYMBOL(__sync_dirty_buffer);
2857
2858	int sync_dirty_buffer(struct buffer_head *bh)
2859	{
2860	return __sync_dirty_buffer(bh, REQ_SYNC);
2861	}
2862	EXPORT_SYMBOL(sync_dirty_buffer);
2863
2864	/*
2865	* try_to_free_buffers() checks if all the buffers on this particular folio
2866	* are unused, and releases them if so.
2867	*
2868	* Exclusion against try_to_free_buffers may be obtained by either
2869	* locking the folio or by holding its mapping's i_private_lock.
2870	*
2871	* If the folio is dirty but all the buffers are clean then we need to
2872	* be sure to mark the folio clean as well. This is because the folio
2873	* may be against a block device, and a later reattachment of buffers
2874	* to a dirty folio will set all buffers dirty. Which would corrupt
2875	* filesystem data on the same device.
2876	*
2877	* The same applies to regular filesystem folios: if all the buffers are
2878	* clean then we set the folio clean and proceed. To do that, we require
2879	* total exclusion from block_dirty_folio(). That is obtained with
2880	* i_private_lock.
2881	*
2882	* try_to_free_buffers() is non-blocking.
2883	*/
2884	static inline int buffer_busy(struct buffer_head *bh)
2885	{
2886	return atomic_read(v: &bh->b_count) \|
2887	(bh->b_state & ((`1` << BH_Dirty) \| (`1` << BH_Lock)));
2888	}
2889
2890	static bool
2891	drop_buffers(struct folio folio, struct* buffer_head **buffers_to_free)
2892	{
2893	struct buffer_head *head = folio_buffers(folio);
2894	struct buffer_head *bh;
2895
2896	bh = head;
2897	do {
2898	if (buffer_busy(bh))
2899	goto failed;
2900	bh = bh->b_this_page;
2901	} while (bh != head);
2902
2903	do {
2904	struct buffer_head *next = bh->b_this_page;
2905
2906	if (bh->b_assoc_map)
2907	__remove_assoc_queue(bh);
2908	bh = next;
2909	} while (bh != head);
2910	*buffers_to_free = head;
2911	folio_detach_private(folio);
2912	return true;
2913	failed:
2914	return false;
2915	}
2916
2917	bool try_to_free_buffers(struct folio *folio)
2918	{
2919	struct address_space * const mapping = folio->mapping;
2920	struct buffer_head *buffers_to_free = NULL;
2921	bool ret = `0`;
2922
2923	BUG_ON(!folio_test_locked(folio));
2924	if (folio_test_writeback(folio))
2925	return false;
2926
2927	if (mapping == NULL) { / can this still happen? /
2928	ret = drop_buffers(folio, buffers_to_free: &buffers_to_free);
2929	goto out;
2930	}
2931
2932	spin_lock(lock: &mapping->i_private_lock);
2933	ret = drop_buffers(folio, buffers_to_free: &buffers_to_free);
2934
2935	/*
2936	* If the filesystem writes its buffers by hand (eg ext3)
2937	* then we can have clean buffers against a dirty folio. We
2938	* clean the folio here; otherwise the VM will never notice
2939	* that the filesystem did any IO at all.
2940	*
2941	* Also, during truncate, discard_buffer will have marked all
2942	* the folio's buffers clean. We discover that here and clean
2943	* the folio also.
2944	*
2945	* i_private_lock must be held over this entire operation in order
2946	* to synchronise against block_dirty_folio and prevent the
2947	* dirty bit from being lost.
2948	*/
2949	if (ret)
2950	folio_cancel_dirty(folio);
2951	spin_unlock(lock: &mapping->i_private_lock);
2952	out:
2953	if (buffers_to_free) {
2954	struct buffer_head *bh = buffers_to_free;
2955
2956	do {
2957	struct buffer_head *next = bh->b_this_page;
2958	free_buffer_head(bh);
2959	bh = next;
2960	} while (bh != buffers_to_free);
2961	}
2962	return ret;
2963	}
2964	EXPORT_SYMBOL(try_to_free_buffers);
2965
2966	/*
2967	* Buffer-head allocation
2968	*/
2969	static struct kmem_cache *bh_cachep __ro_after_init;
2970
2971	/*
2972	* Once the number of bh's in the machine exceeds this level, we start
2973	* stripping them in writeback.
2974	*/
2975	static unsigned long max_buffer_heads __ro_after_init;
2976
2977	int buffer_heads_over_limit;
2978
2979	struct bh_accounting {
2980	int nr; / Number of live bh's /
2981	int ratelimit; / Limit cacheline bouncing /
2982	};
2983
2984	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {`0`, `0`};
2985
2986	static void recalc_bh_state(void)
2987	{
2988	int i;
2989	int tot = `0`;
2990
2991	if (__this_cpu_inc_return(bh_accounting.ratelimit) - `1` < `4096`)
2992	return;
2993	__this_cpu_write(bh_accounting.ratelimit, `0`);
2994	for_each_online_cpu(i)
2995	tot += per_cpu(bh_accounting, i).nr;
2996	buffer_heads_over_limit = (tot > max_buffer_heads);
2997	}
2998
2999	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3000	{
3001	struct buffer_head *ret = kmem_cache_zalloc(k: bh_cachep, flags: gfp_flags);
3002	if (ret) {
3003	INIT_LIST_HEAD(list: &ret->b_assoc_buffers);
3004	spin_lock_init(&ret->b_uptodate_lock);
3005	preempt_disable();
3006	__this_cpu_inc(bh_accounting.nr);
3007	recalc_bh_state();
3008	preempt_enable();
3009	}
3010	return ret;
3011	}
3012	EXPORT_SYMBOL(alloc_buffer_head);
3013
3014	void free_buffer_head(struct buffer_head *bh)
3015	{
3016	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3017	kmem_cache_free(s: bh_cachep, objp: bh);
3018	preempt_disable();
3019	__this_cpu_dec(bh_accounting.nr);
3020	recalc_bh_state();
3021	preempt_enable();
3022	}
3023	EXPORT_SYMBOL(free_buffer_head);
3024
3025	static int buffer_exit_cpu_dead(unsigned int cpu)
3026	{
3027	int i;
3028	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3029
3030	for (i = `0`; i < BH_LRU_SIZE; i++) {
3031	brelse(bh: b->bhs[i]);
3032	b->bhs[i] = NULL;
3033	}
3034	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3035	per_cpu(bh_accounting, cpu).nr = `0`;
3036	return `0`;
3037	}
3038
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		/* Re-check under the lock: it may have become uptodate. */
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3057
3058	/**
3059	* __bh_read - Submit read for a locked buffer
3060	* @bh: struct buffer_head
3061	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3062	* @wait: wait until reading finish
3063	*
3064	* Returns zero on success or don't wait, and -EIO on error.
3065	*/
3066	int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
3067	{
3068	int ret = `0`;
3069
3070	BUG_ON(!buffer_locked(bh));
3071
3072	get_bh(bh);
3073	bh->b_end_io = end_buffer_read_sync;
3074	submit_bh(REQ_OP_READ \| op_flags, bh);
3075	if (wait) {
3076	wait_on_buffer(bh);
3077	if (!buffer_uptodate(bh))
3078	ret = -EIO;
3079	}
3080	return ret;
3081	}
3082	EXPORT_SYMBOL(__bh_read);
3083
3084	/**
3085	* __bh_read_batch - Submit read for a batch of unlocked buffers
3086	* @nr: entry number of the buffer batch
3087	* @bhs: a batch of struct buffer_head
3088	* @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
3089	* @force_lock: force to get a lock on the buffer if set, otherwise drops any
3090	* buffer that cannot lock.
3091	*
3092	* Returns zero on success or don't wait, and -EIO on error.
3093	*/
3094	void __bh_read_batch(int nr, struct buffer_head *bhs[],
3095	blk_opf_t op_flags, bool force_lock)
3096	{
3097	int i;
3098
3099	for (i = `0`; i < nr; i++) {
3100	struct buffer_head *bh = bhs[i];
3101
3102	if (buffer_uptodate(bh))
3103	continue;
3104
3105	if (force_lock)
3106	lock_buffer(bh);
3107	else
3108	if (!trylock_buffer(bh))
3109	continue;
3110
3111	if (buffer_uptodate(bh)) {
3112	unlock_buffer(bh);
3113	continue;
3114	}
3115
3116	bh->b_end_io = end_buffer_read_sync;
3117	get_bh(bh);
3118	submit_bh(REQ_OP_READ \| op_flags, bh);
3119	}
3120	}
3121	EXPORT_SYMBOL(__bh_read_batch);
3122
3123	void __init buffer_init(void)
3124	{
3125	unsigned long nrpages;
3126	int ret;
3127
3128	bh_cachep = KMEM_CACHE(buffer_head,
3129	SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC);
3130	/*
3131	* Limit the bh occupancy to 10% of ZONE_NORMAL
3132	*/
3133	nrpages = (nr_free_buffer_pages() * `10`) / `100`;
3134	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3135	ret = cpuhp_setup_state_nocalls(state: CPUHP_FS_BUFF_DEAD, name: "fs/buffer:dead",
3136	NULL, teardown: buffer_exit_cpu_dead);
3137	WARN_ON(ret < `0`);
3138	}
3139

/* source code of linux/fs/buffer.c */