aops.c source code [linux/fs/ocfs2/aops.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
4	*/
5
6	#include <linux/fs.h>
7	#include <linux/slab.h>
8	#include <linux/highmem.h>
9	#include <linux/pagemap.h>
10	#include <asm/byteorder.h>
11	#include <linux/swap.h>
12	#include <linux/mpage.h>
13	#include <linux/quotaops.h>
14	#include <linux/blkdev.h>
15	#include <linux/uio.h>
16	#include <linux/mm.h>
17
18	#include <cluster/masklog.h>
19
20	#include "ocfs2.h"
21
22	#include "alloc.h"
23	#include "aops.h"
24	#include "dlmglue.h"
25	#include "extent_map.h"
26	#include "file.h"
27	#include "inode.h"
28	#include "journal.h"
29	#include "suballoc.h"
30	#include "super.h"
31	#include "symlink.h"
32	#include "refcounttree.h"
33	#include "ocfs2_trace.h"
34
35	#include "buffer_head_io.h"
36	#include "dir.h"
37	#include "namei.h"
38	#include "sysfile.h"
39
40	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
41	struct buffer_head bh_result, int* create)
42	{
43	int err = -EIO;
44	int status;
45	struct ocfs2_dinode *fe = NULL;
46	struct buffer_head *bh = NULL;
47	struct buffer_head *buffer_cache_bh = NULL;
48	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
49	void *kaddr;
50
51	trace_ocfs2_symlink_get_block(
52	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
53	iblock: (unsigned long long)iblock, bh_result, create);
54
55	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
56
57	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + `1`) {
58	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
59	(unsigned long long)iblock);
60	goto bail;
61	}
62
63	status = ocfs2_read_inode_block(inode, bh: &bh);
64	if (status < `0`) {
65	mlog_errno(status);
66	goto bail;
67	}
68	fe = (struct ocfs2_dinode *) bh->b_data;
69
70	if ((u64)iblock >= ocfs2_clusters_to_blocks(sb: inode->i_sb,
71	le32_to_cpu(fe->i_clusters))) {
72	err = -ENOMEM;
73	mlog(ML_ERROR, "block offset is outside the allocated size: "
74	"%llu\n", (unsigned long long)iblock);
75	goto bail;
76	}
77
78	/ We don't use the page cache to create symlink data, so if*
79	* need be, copy it over from the buffer cache. */
80	if (!buffer_uptodate(bh: bh_result) && ocfs2_inode_is_new(inode)) {
81	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[`0`].e_blkno) +
82	iblock;
83	buffer_cache_bh = sb_getblk(sb: osb->sb, block: blkno);
84	if (!buffer_cache_bh) {
85	err = -ENOMEM;
86	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
87	goto bail;
88	}
89
90	/ we haven't locked out transactions, so a commit*
91	* could've happened. Since we've got a reference on
92	* the bh, even if it commits while we're doing the
93	* copy, the data is still good. */
94	if (buffer_jbd(bh: buffer_cache_bh)
95	&& ocfs2_inode_is_new(inode)) {
96	kaddr = kmap_atomic(page: bh_result->b_page);
97	if (!kaddr) {
98	mlog(ML_ERROR, "couldn't kmap!\n");
99	goto bail;
100	}
101	memcpy(kaddr + (bh_result->b_size * iblock),
102	buffer_cache_bh->b_data,
103	bh_result->b_size);
104	kunmap_atomic(kaddr);
105	set_buffer_uptodate(bh_result);
106	}
107	brelse(bh: buffer_cache_bh);
108	}
109
110	map_bh(bh: bh_result, sb: inode->i_sb,
111	le64_to_cpu(fe->id2.i_list.l_recs[`0`].e_blkno) + iblock);
112
113	err = `0`;
114
115	bail:
116	brelse(bh);
117
118	return err;
119	}
120
121	static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
122	struct buffer_head bh_result, int* create)
123	{
124	int ret = `0`;
125	struct ocfs2_inode_info *oi = OCFS2_I(inode);
126
127	down_read(sem: &oi->ip_alloc_sem);
128	ret = ocfs2_get_block(inode, iblock, bh_result, create);
129	up_read(sem: &oi->ip_alloc_sem);
130
131	return ret;
132	}
133
134	int ocfs2_get_block(struct inode *inode, sector_t iblock,
135	struct buffer_head bh_result, int* create)
136	{
137	int err = `0`;
138	unsigned int ext_flags;
139	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
140	u64 p_blkno, count, past_eof;
141	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
142
143	trace_ocfs2_get_block(ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
144	iblock: (unsigned long long)iblock, bh_result, create);
145
146	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
147	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
148	inode, inode->i_ino);
149
150	if (S_ISLNK(inode->i_mode)) {
151	/ this always does I/O for some reason. /
152	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
153	goto bail;
154	}
155
156	err = ocfs2_extent_map_get_blocks(inode, v_blkno: iblock, p_blkno: &p_blkno, ret_count: &count,
157	extent_flags: &ext_flags);
158	if (err) {
159	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
160	"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
161	(unsigned long long)p_blkno);
162	goto bail;
163	}
164
165	if (max_blocks < count)
166	count = max_blocks;
167
168	/*
169	* ocfs2 never allocates in this function - the only time we
170	* need to use BH_New is when we're extending i_size on a file
171	* system which doesn't support holes, in which case BH_New
172	* allows __block_write_begin() to zero.
173	*
174	* If we see this on a sparse file system, then a truncate has
175	* raced us and removed the cluster. In this case, we clear
176	* the buffers dirty and uptodate bits and let the buffer code
177	* ignore it as a hole.
178	*/
179	if (create && p_blkno == `0` && ocfs2_sparse_alloc(osb)) {
180	clear_buffer_dirty(bh: bh_result);
181	clear_buffer_uptodate(bh: bh_result);
182	goto bail;
183	}
184
185	/ Treat the unwritten extent as a hole for zeroing purposes. /
186	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
187	map_bh(bh: bh_result, sb: inode->i_sb, block: p_blkno);
188
189	bh_result->b_size = count << inode->i_blkbits;
190
191	if (!ocfs2_sparse_alloc(osb)) {
192	if (p_blkno == `0`) {
193	err = -EIO;
194	mlog(ML_ERROR,
195	"iblock = %llu p_blkno = %llu blkno=(%llu)\n",
196	(unsigned long long)iblock,
197	(unsigned long long)p_blkno,
198	(unsigned long long)OCFS2_I(inode)->ip_blkno);
199	mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
200	dump_stack();
201	goto bail;
202	}
203	}
204
205	past_eof = ocfs2_blocks_for_bytes(sb: inode->i_sb, bytes: i_size_read(inode));
206
207	trace_ocfs2_get_block_end(val1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
208	val2: (unsigned long long)past_eof);
209	if (create && (iblock >= past_eof))
210	set_buffer_new(bh_result);
211
212	bail:
213	if (err < `0`)
214	err = -EIO;
215
216	return err;
217	}
218
219	int ocfs2_read_inline_data(struct inode inode, struct* page *page,
220	struct buffer_head *di_bh)
221	{
222	void *kaddr;
223	loff_t size;
224	struct ocfs2_dinode di = (struct* ocfs2_dinode *)di_bh->b_data;
225
226	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
227	ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
228	(unsigned long long)OCFS2_I(inode)->ip_blkno);
229	return -EROFS;
230	}
231
232	size = i_size_read(inode);
233
234	if (size > PAGE_SIZE \|\|
235	size > ocfs2_max_inline_data_with_xattr(sb: inode->i_sb, di)) {
236	ocfs2_error(inode->i_sb,
237	"Inode %llu has with inline data has bad size: %Lu\n",
238	(unsigned long long)OCFS2_I(inode)->ip_blkno,
239	(unsigned long long)size);
240	return -EROFS;
241	}
242
243	kaddr = kmap_atomic(page);
244	if (size)
245	memcpy(kaddr, di->id2.i_data.id_data, size);
246	/ Clear the remaining part of the page /
247	memset(kaddr + size, `0`, PAGE_SIZE - size);
248	flush_dcache_page(page);
249	kunmap_atomic(kaddr);
250
251	SetPageUptodate(page);
252
253	return `0`;
254	}
255
256	static int ocfs2_readpage_inline(struct inode inode, struct* page *page)
257	{
258	int ret;
259	struct buffer_head *di_bh = NULL;
260
261	BUG_ON(!PageLocked(page));
262	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
263
264	ret = ocfs2_read_inode_block(inode, bh: &di_bh);
265	if (ret) {
266	mlog_errno(ret);
267	goto out;
268	}
269
270	ret = ocfs2_read_inline_data(inode, page, di_bh);
271	out:
272	unlock_page(page);
273
274	brelse(bh: di_bh);
275	return ret;
276	}
277
278	static int ocfs2_read_folio(struct file file, struct* folio *folio)
279	{
280	struct inode *inode = folio->mapping->host;
281	struct ocfs2_inode_info *oi = OCFS2_I(inode);
282	loff_t start = folio_pos(folio);
283	int ret, unlock = `1`;
284
285	trace_ocfs2_readpage(val1: (unsigned long long)oi->ip_blkno, val2: folio->index);
286
287	ret = ocfs2_inode_lock_with_page(inode, NULL, ex: `0`, page: &folio->page);
288	if (ret != `0`) {
289	if (ret == AOP_TRUNCATED_PAGE)
290	unlock = `0`;
291	mlog_errno(ret);
292	goto out;
293	}
294
295	if (down_read_trylock(sem: &oi->ip_alloc_sem) == `0`) {
296	/*
297	* Unlock the folio and cycle ip_alloc_sem so that we don't
298	* busyloop waiting for ip_alloc_sem to unlock
299	*/
300	ret = AOP_TRUNCATED_PAGE;
301	folio_unlock(folio);
302	unlock = `0`;
303	down_read(sem: &oi->ip_alloc_sem);
304	up_read(sem: &oi->ip_alloc_sem);
305	goto out_inode_unlock;
306	}
307
308	/*
309	* i_size might have just been updated as we grabed the meta lock. We
310	* might now be discovering a truncate that hit on another node.
311	* block_read_full_folio->get_block freaks out if it is asked to read
312	* beyond the end of a file, so we check here. Callers
313	* (generic_file_read, vm_ops->fault) are clever enough to check i_size
314	* and notice that the folio they just read isn't needed.
315	*
316	* XXX sys_readahead() seems to get that wrong?
317	*/
318	if (start >= i_size_read(inode)) {
319	folio_zero_segment(folio, start: `0`, xend: folio_size(folio));
320	folio_mark_uptodate(folio);
321	ret = `0`;
322	goto out_alloc;
323	}
324
325	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
326	ret = ocfs2_readpage_inline(inode, page: &folio->page);
327	else
328	ret = block_read_full_folio(folio, ocfs2_get_block);
329	unlock = `0`;
330
331	out_alloc:
332	up_read(sem: &oi->ip_alloc_sem);
333	out_inode_unlock:
334	ocfs2_inode_unlock(inode, ex: `0`);
335	out:
336	if (unlock)
337	folio_unlock(folio);
338	return ret;
339	}
340
341	/*
342	* This is used only for read-ahead. Failures or difficult to handle
343	* situations are safe to ignore.
344	*
345	* Right now, we don't bother with BH_Boundary - in-inode extent lists
346	* are quite large (243 extents on 4k blocks), so most inodes don't
347	* grow out to a tree. If need be, detecting boundary extents could
348	* trivially be added in a future version of ocfs2_get_block().
349	*/
350	static void ocfs2_readahead(struct readahead_control *rac)
351	{
352	int ret;
353	struct inode *inode = rac->mapping->host;
354	struct ocfs2_inode_info *oi = OCFS2_I(inode);
355
356	/*
357	* Use the nonblocking flag for the dlm code to avoid page
358	* lock inversion, but don't bother with retrying.
359	*/
360	ret = ocfs2_inode_lock_full(inode, NULL, `0`, OCFS2_LOCK_NONBLOCK);
361	if (ret)
362	return;
363
364	if (down_read_trylock(sem: &oi->ip_alloc_sem) == `0`)
365	goto out_unlock;
366
367	/*
368	* Don't bother with inline-data. There isn't anything
369	* to read-ahead in that case anyway...
370	*/
371	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
372	goto out_up;
373
374	/*
375	* Check whether a remote node truncated this file - we just
376	* drop out in that case as it's not worth handling here.
377	*/
378	if (readahead_pos(rac) >= i_size_read(inode))
379	goto out_up;
380
381	mpage_readahead(rac, get_block: ocfs2_get_block);
382
383	out_up:
384	up_read(sem: &oi->ip_alloc_sem);
385	out_unlock:
386	ocfs2_inode_unlock(inode, ex: `0`);
387	}
388
389	/ Note: Because we don't support holes, our allocation has*
390	* already happened (allocation writes zeros to the file data)
391	* so we don't have to worry about ordered writes in
392	* ocfs2_writepage.
393	*
394	* ->writepage is called during the process of invalidating the page cache
395	* during blocked lock processing. It can't block on any cluster locks
396	* to during block mapping. It's relying on the fact that the block
397	* mapping can't have disappeared under the dirty pages that it is
398	* being asked to write back.
399	*/
400	static int ocfs2_writepage(struct page page, struct* writeback_control *wbc)
401	{
402	trace_ocfs2_writepage(
403	val1: (unsigned long long)OCFS2_I(inode: page->mapping->host)->ip_blkno,
404	val2: page->index);
405
406	return block_write_full_page(page, get_block: ocfs2_get_block, wbc);
407	}
408
409	/ Taken from ext3. We don't necessarily need the full blown*
410	* functionality yet, but IMHO it's better to cut and paste the whole
411	* thing so we can avoid introducing our own bugs (and easily pick up
412	* their fixes when they happen) --Mark */
413	int walk_page_buffers( handle_t *handle,
414	struct buffer_head *head,
415	unsigned from,
416	unsigned to,
417	int *partial,
418	int (fn)( handle_t handle,
419	struct buffer_head *bh))
420	{
421	struct buffer_head *bh;
422	unsigned block_start, block_end;
423	unsigned blocksize = head->b_size;
424	int err, ret = `0`;
425	struct buffer_head *next;
426
427	for ( bh = head, block_start = `0`;
428	ret == `0` && (bh != head \|\| !block_start);
429	block_start = block_end, bh = next)
430	{
431	next = bh->b_this_page;
432	block_end = block_start + blocksize;
433	if (block_end <= from \|\| block_start >= to) {
434	if (partial && !buffer_uptodate(bh))
435	*partial = `1`;
436	continue;
437	}
438	err = (*fn)(handle, bh);
439	if (!ret)
440	ret = err;
441	}
442	return ret;
443	}
444
445	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
446	{
447	sector_t status;
448	u64 p_blkno = `0`;
449	int err = `0`;
450	struct inode *inode = mapping->host;
451
452	trace_ocfs2_bmap(val1: (unsigned long long)OCFS2_I(inode)->ip_blkno,
453	val2: (unsigned long long)block);
454
455	/*
456	* The swap code (ab-)uses ->bmap to get a block mapping and then
457	* bypasseѕ the file system for actual I/O. We really can't allow
458	* that on refcounted inodes, so we have to skip out here. And yes,
459	* 0 is the magic code for a bmap error..
460	*/
461	if (ocfs2_is_refcount_inode(inode))
462	return `0`;
463
464	/ We don't need to lock journal system files, since they aren't*
465	* accessed concurrently from multiple nodes.
466	*/
467	if (!INODE_JOURNAL(inode)) {
468	err = ocfs2_inode_lock(inode, NULL, `0`);
469	if (err) {
470	if (err != -ENOENT)
471	mlog_errno(err);
472	goto bail;
473	}
474	down_read(sem: &OCFS2_I(inode)->ip_alloc_sem);
475	}
476
477	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
478	err = ocfs2_extent_map_get_blocks(inode, v_blkno: block, p_blkno: &p_blkno, NULL,
479	NULL);
480
481	if (!INODE_JOURNAL(inode)) {
482	up_read(sem: &OCFS2_I(inode)->ip_alloc_sem);
483	ocfs2_inode_unlock(inode, ex: `0`);
484	}
485
486	if (err) {
487	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
488	(unsigned long long)block);
489	mlog_errno(err);
490	goto bail;
491	}
492
493	bail:
494	status = err ? `0` : p_blkno;
495
496	return status;
497	}
498
499	static bool ocfs2_release_folio(struct folio *folio, gfp_t wait)
500	{
501	if (!folio_buffers(folio))
502	return false;
503	return try_to_free_buffers(folio);
504	}
505
506	static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
507	u32 cpos,
508	unsigned int *start,
509	unsigned int *end)
510	{
511	unsigned int cluster_start = `0`, cluster_end = PAGE_SIZE;
512
513	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
514	unsigned int cpp;
515
516	cpp = `1` << (PAGE_SHIFT - osb->s_clustersize_bits);
517
518	cluster_start = cpos % cpp;
519	cluster_start = cluster_start << osb->s_clustersize_bits;
520
521	cluster_end = cluster_start + osb->s_clustersize;
522	}
523
524	BUG_ON(cluster_start > PAGE_SIZE);
525	BUG_ON(cluster_end > PAGE_SIZE);
526
527	if (start)
528	*start = cluster_start;
529	if (end)
530	*end = cluster_end;
531	}
532
533	/*
534	* 'from' and 'to' are the region in the page to avoid zeroing.
535	*
536	* If pagesize > clustersize, this function will avoid zeroing outside
537	* of the cluster boundary.
538	*
539	* from == to == 0 is code for "zero the entire cluster region"
540	*/
541	static void ocfs2_clear_page_regions(struct page *page,
542	struct ocfs2_super *osb, u32 cpos,
543	unsigned from, unsigned to)
544	{
545	void *kaddr;
546	unsigned int cluster_start, cluster_end;
547
548	ocfs2_figure_cluster_boundaries(osb, cpos, start: &cluster_start, end: &cluster_end);
549
550	kaddr = kmap_atomic(page);
551
552	if (from \|\| to) {
553	if (from > cluster_start)
554	memset(kaddr + cluster_start, `0`, from - cluster_start);
555	if (to < cluster_end)
556	memset(kaddr + to, `0`, cluster_end - to);
557	} else {
558	memset(kaddr + cluster_start, `0`, cluster_end - cluster_start);
559	}
560
561	kunmap_atomic(kaddr);
562	}
563
564	/*
565	* Nonsparse file systems fully allocate before we get to the write
566	* code. This prevents ocfs2_write() from tagging the write as an
567	* allocating one, which means ocfs2_map_page_blocks() might try to
568	* read-in the blocks at the tail of our file. Avoid reading them by
569	* testing i_size against each block offset.
570	*/
571	static int ocfs2_should_read_blk(struct inode inode, struct* folio *folio,
572	unsigned int block_start)
573	{
574	u64 offset = folio_pos(folio) + block_start;
575
576	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
577	return `1`;
578
579	if (i_size_read(inode) > offset)
580	return `1`;
581
582	return `0`;
583	}
584
585	/*
586	* Some of this taken from __block_write_begin(). We already have our
587	* mapping by now though, and the entire write will be allocating or
588	* it won't, so not much need to use BH_New.
589	*
590	* This will also skip zeroing, which is handled externally.
591	*/
592	int ocfs2_map_page_blocks(struct page page, u64 p_blkno,
593	struct inode inode, unsigned* int from,
594	unsigned int to, int new)
595	{
596	struct folio *folio = page_folio(page);
597	int ret = `0`;
598	struct buffer_head head, bh, wait[`2`], *wait_bh = wait;
599	unsigned int block_end, block_start;
600	unsigned int bsize = i_blocksize(node: inode);
601
602	head = folio_buffers(folio);
603	if (!head)
604	head = create_empty_buffers(folio, blocksize: bsize, b_state: `0`);
605
606	for (bh = head, block_start = `0`; bh != head \|\| !block_start;
607	bh = bh->b_this_page, block_start += bsize) {
608	block_end = block_start + bsize;
609
610	clear_buffer_new(bh);
611
612	/*
613	* Ignore blocks outside of our i/o range -
614	* they may belong to unallocated clusters.
615	*/
616	if (block_start >= to \|\| block_end <= from) {
617	if (folio_test_uptodate(folio))
618	set_buffer_uptodate(bh);
619	continue;
620	}
621
622	/*
623	* For an allocating write with cluster size >= page
624	* size, we always write the entire page.
625	*/
626	if (new)
627	set_buffer_new(bh);
628
629	if (!buffer_mapped(bh)) {
630	map_bh(bh, sb: inode->i_sb, block: *p_blkno);
631	clean_bdev_bh_alias(bh);
632	}
633
634	if (folio_test_uptodate(folio)) {
635	set_buffer_uptodate(bh);
636	} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
637	!buffer_new(bh) &&
638	ocfs2_should_read_blk(inode, folio, block_start) &&
639	(block_start < from \|\| block_end > to)) {
640	bh_read_nowait(bh, op_flags: `0`);
641	*wait_bh++=bh;
642	}
643
644	p_blkno = p_blkno + `1`;
645	}
646
647	/*
648	* If we issued read requests - let them complete.
649	*/
650	while(wait_bh > wait) {
651	wait_on_buffer(bh: *--wait_bh);
652	if (!buffer_uptodate(bh: *wait_bh))
653	ret = -EIO;
654	}
655
656	if (ret == `0` \|\| !new)
657	return ret;
658
659	/*
660	* If we get -EIO above, zero out any newly allocated blocks
661	* to avoid exposing stale data.
662	*/
663	bh = head;
664	block_start = `0`;
665	do {
666	block_end = block_start + bsize;
667	if (block_end <= from)
668	goto next_bh;
669	if (block_start >= to)
670	break;
671
672	folio_zero_range(folio, start: block_start, length: bh->b_size);
673	set_buffer_uptodate(bh);
674	mark_buffer_dirty(bh);
675
676	next_bh:
677	block_start = block_end;
678	bh = bh->b_this_page;
679	} while (bh != head);
680
681	return ret;
682	}
683
/*
 * Capacity of the w_pages[] array in a write context: one page when a
 * page is at least as large as the largest cluster, otherwise enough
 * pages to cover the largest cluster.
 */
#if (PAGE_SIZE < OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#else
#define OCFS2_MAX_CTXT_PAGES	1
#endif

/* Capacity of the w_desc[] array in a write context. */
#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
691
692	struct ocfs2_unwritten_extent {
693	struct list_head ue_node;
694	struct list_head ue_ip_node;
695	u32 ue_cpos;
696	u32 ue_phys;
697	};
698
699	/*
700	* Describe the state of a single cluster to be written to.
701	*/
702	struct ocfs2_write_cluster_desc {
703	u32 c_cpos;
704	u32 c_phys;
705	/*
706	* Give this a unique field because c_phys eventually gets
707	* filled.
708	*/
709	unsigned c_new;
710	unsigned c_clear_unwritten;
711	unsigned c_needs_zero;
712	};
713
714	struct ocfs2_write_ctxt {
715	/ Logical cluster position / len of write /
716	u32 w_cpos;
717	u32 w_clen;
718
719	/ First cluster allocated in a nonsparse extend /
720	u32 w_first_new_cpos;
721
722	/ Type of caller. Must be one of buffer, mmap, direct. /
723	ocfs2_write_type_t w_type;
724
725	struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
726
727	/*
728	* This is true if page_size > cluster_size.
729	*
730	* It triggers a set of special cases during write which might
731	* have to deal with allocating writes to partial pages.
732	*/
733	unsigned int w_large_pages;
734
735	/*
736	* Pages involved in this write.
737	*
738	* w_target_page is the page being written to by the user.
739	*
740	* w_pages is an array of pages which always contains
741	* w_target_page, and in the case of an allocating write with
742	* page_size < cluster size, it will contain zero'd and mapped
743	* pages adjacent to w_target_page which need to be written
744	* out in so that future reads from that region will get
745	* zero's.
746	*/
747	unsigned int w_num_pages;
748	struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
749	struct page *w_target_page;
750
751	/*
752	* w_target_locked is used for page_mkwrite path indicating no unlocking
753	* against w_target_page in ocfs2_write_end_nolock.
754	*/
755	unsigned int w_target_locked:`1`;
756
757	/*
758	* ocfs2_write_end() uses this to know what the real range to
759	* write in the target should be.
760	*/
761	unsigned int w_target_from;
762	unsigned int w_target_to;
763
764	/*
765	* We could use journal_current_handle() but this is cleaner,
766	* IMHO -Mark
767	*/
768	handle_t *w_handle;
769
770	struct buffer_head *w_di_bh;
771
772	struct ocfs2_cached_dealloc_ctxt w_dealloc;
773
774	struct list_head w_unwritten_list;
775	unsigned int w_unwritten_count;
776	};
777
/*
 * Unlock, mark accessed and drop one reference on each non-NULL entry
 * of @pages[0..@num_pages). NULL slots are skipped.
 */
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
{
	int i;

	for(i = 0; i < num_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			mark_page_accessed(pages[i]);
			put_page(pages[i]);
		}
	}
}
790
791	static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
792	{
793	int i;
794
795	/*
796	* w_target_locked is only set to true in the page_mkwrite() case.
797	* The intent is to allow us to lock the target page from write_begin()
798	* to write_end(). The caller must hold a ref on w_target_page.
799	*/
800	if (wc->w_target_locked) {
801	BUG_ON(!wc->w_target_page);
802	for (i = `0`; i < wc->w_num_pages; i++) {
803	if (wc->w_target_page == wc->w_pages[i]) {
804	wc->w_pages[i] = NULL;
805	break;
806	}
807	}
808	mark_page_accessed(wc->w_target_page);
809	put_page(page: wc->w_target_page);
810	}
811	ocfs2_unlock_and_free_pages(pages: wc->w_pages, num_pages: wc->w_num_pages);
812	}
813
814	static void ocfs2_free_unwritten_list(struct inode *inode,
815	struct list_head *head)
816	{
817	struct ocfs2_inode_info *oi = OCFS2_I(inode);
818	struct ocfs2_unwritten_extent ue = NULL, tmp = NULL;
819
820	list_for_each_entry_safe(ue, tmp, head, ue_node) {
821	list_del(entry: &ue->ue_node);
822	spin_lock(lock: &oi->ip_lock);
823	list_del(entry: &ue->ue_ip_node);
824	spin_unlock(lock: &oi->ip_lock);
825	kfree(objp: ue);
826	}
827	}
828
829	static void ocfs2_free_write_ctxt(struct inode *inode,
830	struct ocfs2_write_ctxt *wc)
831	{
832	ocfs2_free_unwritten_list(inode, head: &wc->w_unwritten_list);
833	ocfs2_unlock_pages(wc);
834	brelse(bh: wc->w_di_bh);
835	kfree(objp: wc);
836	}
837
838	static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
839	struct ocfs2_super *osb, loff_t pos,
840	unsigned len, ocfs2_write_type_t type,
841	struct buffer_head *di_bh)
842	{
843	u32 cend;
844	struct ocfs2_write_ctxt *wc;
845
846	wc = kzalloc(size: sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
847	if (!wc)
848	return -ENOMEM;
849
850	wc->w_cpos = pos >> osb->s_clustersize_bits;
851	wc->w_first_new_cpos = UINT_MAX;
852	cend = (pos + len - `1`) >> osb->s_clustersize_bits;
853	wc->w_clen = cend - wc->w_cpos + `1`;
854	get_bh(bh: di_bh);
855	wc->w_di_bh = di_bh;
856	wc->w_type = type;
857
858	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
859	wc->w_large_pages = `1`;
860	else
861	wc->w_large_pages = `0`;
862
863	ocfs2_init_dealloc_ctxt(c: &wc->w_dealloc);
864	INIT_LIST_HEAD(list: &wc->w_unwritten_list);
865
866	*wcp = wc;
867
868	return `0`;
869	}
870
871	/*
872	* If a page has any new buffers, zero them out here, and mark them uptodate
873	* and dirty so they'll be written out (in order to prevent uninitialised
874	* block data from leaking). And clear the new bit.
875	*/
876	static void ocfs2_zero_new_buffers(struct page page, unsigned* from, unsigned to)
877	{
878	unsigned int block_start, block_end;
879	struct buffer_head head, bh;
880
881	BUG_ON(!PageLocked(page));
882	if (!page_has_buffers(page))
883	return;
884
885	bh = head = page_buffers(page);
886	block_start = `0`;
887	do {
888	block_end = block_start + bh->b_size;
889
890	if (buffer_new(bh)) {
891	if (block_end > from && block_start < to) {
892	if (!PageUptodate(page)) {
893	unsigned start, end;
894
895	start = max(from, block_start);
896	end = min(to, block_end);
897
898	zero_user_segment(page, start, end);
899	set_buffer_uptodate(bh);
900	}
901
902	clear_buffer_new(bh);
903	mark_buffer_dirty(bh);
904	}
905	}
906
907	block_start = block_end;
908	bh = bh->b_this_page;
909	} while (bh != head);
910	}
911
912	/*
913	* Only called when we have a failure during allocating write to write
914	* zero's to the newly allocated region.
915	*/
916	static void ocfs2_write_failure(struct inode *inode,
917	struct ocfs2_write_ctxt *wc,
918	loff_t user_pos, unsigned user_len)
919	{
920	int i;
921	unsigned from = user_pos & (PAGE_SIZE - `1`),
922	to = user_pos + user_len;
923	struct page *tmppage;
924
925	if (wc->w_target_page)
926	ocfs2_zero_new_buffers(page: wc->w_target_page, from, to);
927
928	for(i = `0`; i < wc->w_num_pages; i++) {
929	tmppage = wc->w_pages[i];
930
931	if (tmppage && page_has_buffers(tmppage)) {
932	if (ocfs2_should_order_data(inode))
933	ocfs2_jbd2_inode_add_write(handle: wc->w_handle, inode,
934	start_byte: user_pos, length: user_len);
935
936	block_commit_write(page: tmppage, from, to);
937	}
938	}
939	}
940
941	static int ocfs2_prepare_page_for_write(struct inode inode, u64 p_blkno,
942	struct ocfs2_write_ctxt *wc,
943	struct page *page, u32 cpos,
944	loff_t user_pos, unsigned user_len,
945	int new)
946	{
947	int ret;
948	unsigned int map_from = `0`, map_to = `0`;
949	unsigned int cluster_start, cluster_end;
950	unsigned int user_data_from = `0`, user_data_to = `0`;
951
952	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
953	start: &cluster_start, end: &cluster_end);
954
955	/ treat the write as new if the a hole/lseek spanned across*
956	* the page boundary.
957	*/
958	new = new \| ((i_size_read(inode) <= page_offset(page)) &&
959	(page_offset(page) <= user_pos));
960
961	if (page == wc->w_target_page) {
962	map_from = user_pos & (PAGE_SIZE - `1`);
963	map_to = map_from + user_len;
964
965	if (new)
966	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
967	from: cluster_start, to: cluster_end,
968	new);
969	else
970	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
971	from: map_from, to: map_to, new);
972	if (ret) {
973	mlog_errno(ret);
974	goto out;
975	}
976
977	user_data_from = map_from;
978	user_data_to = map_to;
979	if (new) {
980	map_from = cluster_start;
981	map_to = cluster_end;
982	}
983	} else {
984	/*
985	* If we haven't allocated the new page yet, we
986	* shouldn't be writing it out without copying user
987	* data. This is likely a math error from the caller.
988	*/
989	BUG_ON(!new);
990
991	map_from = cluster_start;
992	map_to = cluster_end;
993
994	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
995	from: cluster_start, to: cluster_end, new);
996	if (ret) {
997	mlog_errno(ret);
998	goto out;
999	}
1000	}
1001
1002	/*
1003	* Parts of newly allocated pages need to be zero'd.
1004	*
1005	* Above, we have also rewritten 'to' and 'from' - as far as
1006	* the rest of the function is concerned, the entire cluster
1007	* range inside of a page needs to be written.
1008	*
1009	* We can skip this if the page is up to date - it's already
1010	* been zero'd from being read in as a hole.
1011	*/
1012	if (new && !PageUptodate(page))
1013	ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
1014	cpos, from: user_data_from, to: user_data_to);
1015
1016	flush_dcache_page(page);
1017
1018	out:
1019	return ret;
1020	}
1021
1022	/*
1023	* This function will only grab one clusters worth of pages.
1024	*/
1025	static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1026	struct ocfs2_write_ctxt *wc,
1027	u32 cpos, loff_t user_pos,
1028	unsigned user_len, int new,
1029	struct page *mmap_page)
1030	{
1031	int ret = `0`, i;
1032	unsigned long start, target_index, end_index, index;
1033	struct inode *inode = mapping->host;
1034	loff_t last_byte;
1035
1036	target_index = user_pos >> PAGE_SHIFT;
1037
1038	/*
1039	* Figure out how many pages we'll be manipulating here. For
1040	* non allocating write, we just change the one
1041	* page. Otherwise, we'll need a whole clusters worth. If we're
1042	* writing past i_size, we only need enough pages to cover the
1043	* last page of the write.
1044	*/
1045	if (new) {
1046	wc->w_num_pages = ocfs2_pages_per_cluster(sb: inode->i_sb);
1047	start = ocfs2_align_clusters_to_page_index(sb: inode->i_sb, clusters: cpos);
1048	/*
1049	* We need the index past the last page we could possibly
1050	* touch. This is the page past the end of the write or
1051	* i_size, whichever is greater.
1052	*/
1053	last_byte = max(user_pos + user_len, i_size_read(inode));
1054	BUG_ON(last_byte < `1`);
1055	end_index = ((last_byte - `1`) >> PAGE_SHIFT) + `1`;
1056	if ((start + wc->w_num_pages) > end_index)
1057	wc->w_num_pages = end_index - start;
1058	} else {
1059	wc->w_num_pages = `1`;
1060	start = target_index;
1061	}
1062	end_index = (user_pos + user_len - `1`) >> PAGE_SHIFT;
1063
1064	for(i = `0`; i < wc->w_num_pages; i++) {
1065	index = start + i;
1066
1067	if (index >= target_index && index <= end_index &&
1068	wc->w_type == OCFS2_WRITE_MMAP) {
1069	/*
1070	* ocfs2_pagemkwrite() is a little different
1071	* and wants us to directly use the page
1072	* passed in.
1073	*/
1074	lock_page(page: mmap_page);
1075
1076	/ Exit and let the caller retry /
1077	if (mmap_page->mapping != mapping) {
1078	WARN_ON(mmap_page->mapping);
1079	unlock_page(page: mmap_page);
1080	ret = -EAGAIN;
1081	goto out;
1082	}
1083
1084	get_page(page: mmap_page);
1085	wc->w_pages[i] = mmap_page;
1086	wc->w_target_locked = true;
1087	} else if (index >= target_index && index <= end_index &&
1088	wc->w_type == OCFS2_WRITE_DIRECT) {
1089	/ Direct write has no mapping page. /
1090	wc->w_pages[i] = NULL;
1091	continue;
1092	} else {
1093	wc->w_pages[i] = find_or_create_page(mapping, index,
1094	GFP_NOFS);
1095	if (!wc->w_pages[i]) {
1096	ret = -ENOMEM;
1097	mlog_errno(ret);
1098	goto out;
1099	}
1100	}
1101	wait_for_stable_page(page: wc->w_pages[i]);
1102
1103	if (index == target_index)
1104	wc->w_target_page = wc->w_pages[i];
1105	}
1106	out:
1107	if (ret)
1108	wc->w_target_locked = false;
1109	return ret;
1110	}
1111
1112	/*
1113	* Prepare a single cluster for write one cluster into the file.
1114	*/
1115	static int ocfs2_write_cluster(struct address_space *mapping,
1116	u32 phys, unsigned* int new,
1117	unsigned int clear_unwritten,
1118	unsigned int should_zero,
1119	struct ocfs2_alloc_context *data_ac,
1120	struct ocfs2_alloc_context *meta_ac,
1121	struct ocfs2_write_ctxt *wc, u32 cpos,
1122	loff_t user_pos, unsigned user_len)
1123	{
1124	int ret, i;
1125	u64 p_blkno;
1126	struct inode *inode = mapping->host;
1127	struct ocfs2_extent_tree et;
1128	int bpc = ocfs2_clusters_to_blocks(sb: inode->i_sb, clusters: `1`);
1129
1130	if (new) {
1131	u32 tmp_pos;
1132
1133	/*
1134	* This is safe to call with the page locks - it won't take
1135	* any additional semaphores or cluster locks.
1136	*/
1137	tmp_pos = cpos;
1138	ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
1139	logical_offset: &tmp_pos, clusters_to_add: `1`, mark_unwritten: !clear_unwritten,
1140	fe_bh: wc->w_di_bh, handle: wc->w_handle,
1141	data_ac, meta_ac, NULL);
1142	/*
1143	* This shouldn't happen because we must have already
1144	* calculated the correct meta data allocation required. The
1145	* internal tree allocation code should know how to increase
1146	* transaction credits itself.
1147	*
1148	* If need be, we could handle -EAGAIN for a
1149	* RESTART_TRANS here.
1150	*/
1151	mlog_bug_on_msg(ret == -EAGAIN,
1152	"Inode %llu: EAGAIN return during allocation.\n",
1153	(unsigned long long)OCFS2_I(inode)->ip_blkno);
1154	if (ret < `0`) {
1155	mlog_errno(ret);
1156	goto out;
1157	}
1158	} else if (clear_unwritten) {
1159	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode),
1160	bh: wc->w_di_bh);
1161	ret = ocfs2_mark_extent_written(inode, et: &et,
1162	handle: wc->w_handle, cpos, len: `1`, phys: *phys,
1163	meta_ac, dealloc: &wc->w_dealloc);
1164	if (ret < `0`) {
1165	mlog_errno(ret);
1166	goto out;
1167	}
1168	}
1169
1170	/*
1171	* The only reason this should fail is due to an inability to
1172	* find the extent added.
1173	*/
1174	ret = ocfs2_get_clusters(inode, v_cluster: cpos, p_cluster: phys, NULL, NULL);
1175	if (ret < `0`) {
1176	mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
1177	"at logical cluster %u",
1178	(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
1179	goto out;
1180	}
1181
1182	BUG_ON(*phys == `0`);
1183
1184	p_blkno = ocfs2_clusters_to_blocks(sb: inode->i_sb, clusters: *phys);
1185	if (!should_zero)
1186	p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - `1`);
1187
1188	for(i = `0`; i < wc->w_num_pages; i++) {
1189	int tmpret;
1190
1191	/ This is the direct io target page. /
1192	if (wc->w_pages[i] == NULL) {
1193	p_blkno++;
1194	continue;
1195	}
1196
1197	tmpret = ocfs2_prepare_page_for_write(inode, p_blkno: &p_blkno, wc,
1198	page: wc->w_pages[i], cpos,
1199	user_pos, user_len,
1200	new: should_zero);
1201	if (tmpret) {
1202	mlog_errno(tmpret);
1203	if (ret == `0`)
1204	ret = tmpret;
1205	}
1206	}
1207
1208	/*
1209	* We only have cleanup to do in case of allocating write.
1210	*/
1211	if (ret && new)
1212	ocfs2_write_failure(inode, wc, user_pos, user_len);
1213
1214	out:
1215
1216	return ret;
1217	}
1218
1219	static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
1220	struct ocfs2_alloc_context *data_ac,
1221	struct ocfs2_alloc_context *meta_ac,
1222	struct ocfs2_write_ctxt *wc,
1223	loff_t pos, unsigned len)
1224	{
1225	int ret, i;
1226	loff_t cluster_off;
1227	unsigned int local_len = len;
1228	struct ocfs2_write_cluster_desc *desc;
1229	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
1230
1231	for (i = `0`; i < wc->w_clen; i++) {
1232	desc = &wc->w_desc[i];
1233
1234	/*
1235	* We have to make sure that the total write passed in
1236	* doesn't extend past a single cluster.
1237	*/
1238	local_len = len;
1239	cluster_off = pos & (osb->s_clustersize - `1`);
1240	if ((cluster_off + local_len) > osb->s_clustersize)
1241	local_len = osb->s_clustersize - cluster_off;
1242
1243	ret = ocfs2_write_cluster(mapping, phys: &desc->c_phys,
1244	new: desc->c_new,
1245	clear_unwritten: desc->c_clear_unwritten,
1246	should_zero: desc->c_needs_zero,
1247	data_ac, meta_ac,
1248	wc, cpos: desc->c_cpos, user_pos: pos, user_len: local_len);
1249	if (ret) {
1250	mlog_errno(ret);
1251	goto out;
1252	}
1253
1254	len -= local_len;
1255	pos += local_len;
1256	}
1257
1258	ret = `0`;
1259	out:
1260	return ret;
1261	}
1262
1263	/*
1264	* ocfs2_write_end() wants to know which parts of the target page it
1265	* should complete the write on. It's easiest to compute them ahead of
1266	* time when a more complete view of the write is available.
1267	*/
1268	static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
1269	struct ocfs2_write_ctxt *wc,
1270	loff_t pos, unsigned len, int alloc)
1271	{
1272	struct ocfs2_write_cluster_desc *desc;
1273
1274	wc->w_target_from = pos & (PAGE_SIZE - `1`);
1275	wc->w_target_to = wc->w_target_from + len;
1276
1277	if (alloc == `0`)
1278	return;
1279
1280	/*
1281	* Allocating write - we may have different boundaries based
1282	* on page size and cluster size.
1283	*
1284	* NOTE: We can no longer compute one value from the other as
1285	* the actual write length and user provided length may be
1286	* different.
1287	*/
1288
1289	if (wc->w_large_pages) {
1290	/*
1291	* We only care about the 1st and last cluster within
1292	* our range and whether they should be zero'd or not. Either
1293	* value may be extended out to the start/end of a
1294	* newly allocated cluster.
1295	*/
1296	desc = &wc->w_desc[`0`];
1297	if (desc->c_needs_zero)
1298	ocfs2_figure_cluster_boundaries(osb,
1299	cpos: desc->c_cpos,
1300	start: &wc->w_target_from,
1301	NULL);
1302
1303	desc = &wc->w_desc[wc->w_clen - `1`];
1304	if (desc->c_needs_zero)
1305	ocfs2_figure_cluster_boundaries(osb,
1306	cpos: desc->c_cpos,
1307	NULL,
1308	end: &wc->w_target_to);
1309	} else {
1310	wc->w_target_from = `0`;
1311	wc->w_target_to = PAGE_SIZE;
1312	}
1313	}
1314
1315	/*
1316	* Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
1317	* do the zero work. And should not to clear UNWRITTEN since it will be cleared
1318	* by the direct io procedure.
1319	* If this is a new extent that allocated by direct io, we should mark it in
1320	* the ip_unwritten_list.
1321	*/
1322	static int ocfs2_unwritten_check(struct inode *inode,
1323	struct ocfs2_write_ctxt *wc,
1324	struct ocfs2_write_cluster_desc *desc)
1325	{
1326	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1327	struct ocfs2_unwritten_extent ue = NULL, new = NULL;
1328	int ret = `0`;
1329
1330	if (!desc->c_needs_zero)
1331	return `0`;
1332
1333	retry:
1334	spin_lock(lock: &oi->ip_lock);
1335	/ Needs not to zero no metter buffer or direct. The one who is zero*
1336	* the cluster is doing zero. And he will clear unwritten after all
1337	* cluster io finished. */
1338	list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
1339	if (desc->c_cpos == ue->ue_cpos) {
1340	BUG_ON(desc->c_new);
1341	desc->c_needs_zero = `0`;
1342	desc->c_clear_unwritten = `0`;
1343	goto unlock;
1344	}
1345	}
1346
1347	if (wc->w_type != OCFS2_WRITE_DIRECT)
1348	goto unlock;
1349
1350	if (new == NULL) {
1351	spin_unlock(lock: &oi->ip_lock);
1352	new = kmalloc(size: sizeof(struct ocfs2_unwritten_extent),
1353	GFP_NOFS);
1354	if (new == NULL) {
1355	ret = -ENOMEM;
1356	goto out;
1357	}
1358	goto retry;
1359	}
1360	/ This direct write will doing zero. /
1361	new->ue_cpos = desc->c_cpos;
1362	new->ue_phys = desc->c_phys;
1363	desc->c_clear_unwritten = `0`;
1364	list_add_tail(new: &new->ue_ip_node, head: &oi->ip_unwritten_list);
1365	list_add_tail(new: &new->ue_node, head: &wc->w_unwritten_list);
1366	wc->w_unwritten_count++;
1367	new = NULL;
1368	unlock:
1369	spin_unlock(lock: &oi->ip_lock);
1370	out:
1371	kfree(objp: new);
1372	return ret;
1373	}
1374
1375	/*
1376	* Populate each single-cluster write descriptor in the write context
1377	* with information about the i/o to be done.
1378	*
1379	* Returns the number of clusters that will have to be allocated, as
1380	* well as a worst case estimate of the number of extent records that
1381	* would have to be created during a write to an unwritten region.
1382	*/
1383	static int ocfs2_populate_write_desc(struct inode *inode,
1384	struct ocfs2_write_ctxt *wc,
1385	unsigned int *clusters_to_alloc,
1386	unsigned int *extents_to_split)
1387	{
1388	int ret;
1389	struct ocfs2_write_cluster_desc *desc;
1390	unsigned int num_clusters = `0`;
1391	unsigned int ext_flags = `0`;
1392	u32 phys = `0`;
1393	int i;
1394
1395	*clusters_to_alloc = `0`;
1396	*extents_to_split = `0`;
1397
1398	for (i = `0`; i < wc->w_clen; i++) {
1399	desc = &wc->w_desc[i];
1400	desc->c_cpos = wc->w_cpos + i;
1401
1402	if (num_clusters == `0`) {
1403	/*
1404	* Need to look up the next extent record.
1405	*/
1406	ret = ocfs2_get_clusters(inode, v_cluster: desc->c_cpos, p_cluster: &phys,
1407	num_clusters: &num_clusters, extent_flags: &ext_flags);
1408	if (ret) {
1409	mlog_errno(ret);
1410	goto out;
1411	}
1412
1413	/ We should already CoW the refcountd extent. /
1414	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1415
1416	/*
1417	* Assume worst case - that we're writing in
1418	* the middle of the extent.
1419	*
1420	* We can assume that the write proceeds from
1421	* left to right, in which case the extent
1422	* insert code is smart enough to coalesce the
1423	* next splits into the previous records created.
1424	*/
1425	if (ext_flags & OCFS2_EXT_UNWRITTEN)
1426	extents_to_split = extents_to_split + `2`;
1427	} else if (phys) {
1428	/*
1429	* Only increment phys if it doesn't describe
1430	* a hole.
1431	*/
1432	phys++;
1433	}
1434
1435	/*
1436	* If w_first_new_cpos is < UINT_MAX, we have a non-sparse
1437	* file that got extended. w_first_new_cpos tells us
1438	* where the newly allocated clusters are so we can
1439	* zero them.
1440	*/
1441	if (desc->c_cpos >= wc->w_first_new_cpos) {
1442	BUG_ON(phys == `0`);
1443	desc->c_needs_zero = `1`;
1444	}
1445
1446	desc->c_phys = phys;
1447	if (phys == `0`) {
1448	desc->c_new = `1`;
1449	desc->c_needs_zero = `1`;
1450	desc->c_clear_unwritten = `1`;
1451	clusters_to_alloc = clusters_to_alloc + `1`;
1452	}
1453
1454	if (ext_flags & OCFS2_EXT_UNWRITTEN) {
1455	desc->c_clear_unwritten = `1`;
1456	desc->c_needs_zero = `1`;
1457	}
1458
1459	ret = ocfs2_unwritten_check(inode, wc, desc);
1460	if (ret) {
1461	mlog_errno(ret);
1462	goto out;
1463	}
1464
1465	num_clusters--;
1466	}
1467
1468	ret = `0`;
1469	out:
1470	return ret;
1471	}
1472
1473	static int ocfs2_write_begin_inline(struct address_space *mapping,
1474	struct inode *inode,
1475	struct ocfs2_write_ctxt *wc)
1476	{
1477	int ret;
1478	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1479	struct page *page;
1480	handle_t *handle;
1481	struct ocfs2_dinode di = (struct* ocfs2_dinode *)wc->w_di_bh->b_data;
1482
1483	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1484	if (IS_ERR(ptr: handle)) {
1485	ret = PTR_ERR(ptr: handle);
1486	mlog_errno(ret);
1487	goto out;
1488	}
1489
1490	page = find_or_create_page(mapping, index: `0`, GFP_NOFS);
1491	if (!page) {
1492	ocfs2_commit_trans(osb, handle);
1493	ret = -ENOMEM;
1494	mlog_errno(ret);
1495	goto out;
1496	}
1497	/*
1498	* If we don't set w_num_pages then this page won't get unlocked
1499	* and freed on cleanup of the write context.
1500	*/
1501	wc->w_pages[`0`] = wc->w_target_page = page;
1502	wc->w_num_pages = `1`;
1503
1504	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh: wc->w_di_bh,
1505	OCFS2_JOURNAL_ACCESS_WRITE);
1506	if (ret) {
1507	ocfs2_commit_trans(osb, handle);
1508
1509	mlog_errno(ret);
1510	goto out;
1511	}
1512
1513	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1514	ocfs2_set_inode_data_inline(inode, di);
1515
1516	if (!PageUptodate(page)) {
1517	ret = ocfs2_read_inline_data(inode, page, di_bh: wc->w_di_bh);
1518	if (ret) {
1519	ocfs2_commit_trans(osb, handle);
1520
1521	goto out;
1522	}
1523	}
1524
1525	wc->w_handle = handle;
1526	out:
1527	return ret;
1528	}
1529
1530	int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
1531	{
1532	struct ocfs2_dinode di = (struct* ocfs2_dinode *)di_bh->b_data;
1533
1534	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
1535	return `1`;
1536	return `0`;
1537	}
1538
1539	static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1540	struct inode *inode, loff_t pos,
1541	unsigned len, struct page *mmap_page,
1542	struct ocfs2_write_ctxt *wc)
1543	{
1544	int ret, written = `0`;
1545	loff_t end = pos + len;
1546	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1547	struct ocfs2_dinode *di = NULL;
1548
1549	trace_ocfs2_try_to_write_inline_data(ino: (unsigned long long)oi->ip_blkno,
1550	len, pos: (unsigned long long)pos,
1551	flags: oi->ip_dyn_features);
1552
1553	/*
1554	* Handle inodes which already have inline data 1st.
1555	*/
1556	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1557	if (mmap_page == NULL &&
1558	ocfs2_size_fits_inline_data(di_bh: wc->w_di_bh, new_size: end))
1559	goto do_inline_write;
1560
1561	/*
1562	* The write won't fit - we have to give this inode an
1563	* inline extent list now.
1564	*/
1565	ret = ocfs2_convert_inline_data_to_extents(inode, di_bh: wc->w_di_bh);
1566	if (ret)
1567	mlog_errno(ret);
1568	goto out;
1569	}
1570
1571	/*
1572	* Check whether the inode can accept inline data.
1573	*/
1574	if (oi->ip_clusters != `0` \|\| i_size_read(inode) != `0`)
1575	return `0`;
1576
1577	/*
1578	* Check whether the write can fit.
1579	*/
1580	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1581	if (mmap_page \|\|
1582	end > ocfs2_max_inline_data_with_xattr(sb: inode->i_sb, di))
1583	return `0`;
1584
1585	do_inline_write:
1586	ret = ocfs2_write_begin_inline(mapping, inode, wc);
1587	if (ret) {
1588	mlog_errno(ret);
1589	goto out;
1590	}
1591
1592	/*
1593	* This signals to the caller that the data can be written
1594	* inline.
1595	*/
1596	written = `1`;
1597	out:
1598	return written ? written : ret;
1599	}
1600
1601	/*
1602	* This function only does anything for file systems which can't
1603	* handle sparse files.
1604	*
1605	* What we want to do here is fill in any hole between the current end
1606	* of allocation and the end of our write. That way the rest of the
1607	* write path can treat it as an non-allocating write, which has no
1608	* special case code for sparse/nonsparse files.
1609	*/
1610	static int ocfs2_expand_nonsparse_inode(struct inode *inode,
1611	struct buffer_head *di_bh,
1612	loff_t pos, unsigned len,
1613	struct ocfs2_write_ctxt *wc)
1614	{
1615	int ret;
1616	loff_t newsize = pos + len;
1617
1618	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1619
1620	if (newsize <= i_size_read(inode))
1621	return `0`;
1622
1623	ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size: newsize, zero_to: pos);
1624	if (ret)
1625	mlog_errno(ret);
1626
1627	/ There is no wc if this is call from direct. /
1628	if (wc)
1629	wc->w_first_new_cpos =
1630	ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: i_size_read(inode));
1631
1632	return ret;
1633	}
1634
1635	static int ocfs2_zero_tail(struct inode inode, struct* buffer_head *di_bh,
1636	loff_t pos)
1637	{
1638	int ret = `0`;
1639
1640	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
1641	if (pos > i_size_read(inode))
1642	ret = ocfs2_zero_extend(inode, di_bh, zero_to: pos);
1643
1644	return ret;
1645	}
1646
1647	int ocfs2_write_begin_nolock(struct address_space *mapping,
1648	loff_t pos, unsigned len, ocfs2_write_type_t type,
1649	struct page *pagep, void* **fsdata,
1650	struct buffer_head di_bh, struct* page *mmap_page)
1651	{
1652	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
1653	unsigned int clusters_to_alloc, extents_to_split, clusters_need = `0`;
1654	struct ocfs2_write_ctxt *wc;
1655	struct inode *inode = mapping->host;
1656	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1657	struct ocfs2_dinode *di;
1658	struct ocfs2_alloc_context *data_ac = NULL;
1659	struct ocfs2_alloc_context *meta_ac = NULL;
1660	handle_t *handle;
1661	struct ocfs2_extent_tree et;
1662	int try_free = `1`, ret1;
1663
1664	try_again:
1665	ret = ocfs2_alloc_write_ctxt(wcp: &wc, osb, pos, len, type, di_bh);
1666	if (ret) {
1667	mlog_errno(ret);
1668	return ret;
1669	}
1670
1671	if (ocfs2_supports_inline_data(osb)) {
1672	ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
1673	mmap_page, wc);
1674	if (ret == `1`) {
1675	ret = `0`;
1676	goto success;
1677	}
1678	if (ret < `0`) {
1679	mlog_errno(ret);
1680	goto out;
1681	}
1682	}
1683
1684	/ Direct io change i_size late, should not zero tail here. /
1685	if (type != OCFS2_WRITE_DIRECT) {
1686	if (ocfs2_sparse_alloc(osb))
1687	ret = ocfs2_zero_tail(inode, di_bh, pos);
1688	else
1689	ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
1690	len, wc);
1691	if (ret) {
1692	mlog_errno(ret);
1693	goto out;
1694	}
1695	}
1696
1697	ret = ocfs2_check_range_for_refcount(inode, pos, count: len);
1698	if (ret < `0`) {
1699	mlog_errno(ret);
1700	goto out;
1701	} else if (ret == `1`) {
1702	clusters_need = wc->w_clen;
1703	ret = ocfs2_refcount_cow(inode, di_bh,
1704	cpos: wc->w_cpos, write_len: wc->w_clen, UINT_MAX);
1705	if (ret) {
1706	mlog_errno(ret);
1707	goto out;
1708	}
1709	}
1710
1711	ret = ocfs2_populate_write_desc(inode, wc, clusters_to_alloc: &clusters_to_alloc,
1712	extents_to_split: &extents_to_split);
1713	if (ret) {
1714	mlog_errno(ret);
1715	goto out;
1716	}
1717	clusters_need += clusters_to_alloc;
1718
1719	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1720
1721	trace_ocfs2_write_begin_nolock(
1722	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1723	i_size: (long long)i_size_read(inode),
1724	le32_to_cpu(di->i_clusters),
1725	pos, len, flags: type, page: mmap_page,
1726	clusters: clusters_to_alloc, extents_to_split);
1727
1728	/*
1729	* We set w_target_from, w_target_to here so that
1730	* ocfs2_write_end() knows which range in the target page to
1731	* write out. An allocation requires that we write the entire
1732	* cluster range.
1733	*/
1734	if (clusters_to_alloc \|\| extents_to_split) {
1735	/*
1736	* XXX: We are stretching the limits of
1737	* ocfs2_lock_allocators(). It greatly over-estimates
1738	* the work to be done.
1739	*/
1740	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode),
1741	bh: wc->w_di_bh);
1742	ret = ocfs2_lock_allocators(inode, et: &et,
1743	clusters_to_add: clusters_to_alloc, extents_to_split,
1744	data_ac: &data_ac, meta_ac: &meta_ac);
1745	if (ret) {
1746	mlog_errno(ret);
1747	goto out;
1748	}
1749
1750	if (data_ac)
1751	data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
1752
1753	credits = ocfs2_calc_extend_credits(sb: inode->i_sb,
1754	root_el: &di->id2.i_list);
1755	} else if (type == OCFS2_WRITE_DIRECT)
1756	/ direct write needs not to start trans if no extents alloc. /
1757	goto success;
1758
1759	/*
1760	* We have to zero sparse allocated clusters, unwritten extent clusters,
1761	* and non-sparse clusters we just extended. For non-sparse writes,
1762	* we know zeros will only be needed in the first and/or last cluster.
1763	*/
1764	if (wc->w_clen && (wc->w_desc[`0`].c_needs_zero \|\|
1765	wc->w_desc[wc->w_clen - `1`].c_needs_zero))
1766	cluster_of_pages = `1`;
1767	else
1768	cluster_of_pages = `0`;
1769
1770	ocfs2_set_target_boundaries(osb, wc, pos, len, alloc: cluster_of_pages);
1771
1772	handle = ocfs2_start_trans(osb, max_buffs: credits);
1773	if (IS_ERR(ptr: handle)) {
1774	ret = PTR_ERR(ptr: handle);
1775	mlog_errno(ret);
1776	goto out;
1777	}
1778
1779	wc->w_handle = handle;
1780
1781	if (clusters_to_alloc) {
1782	ret = dquot_alloc_space_nodirty(inode,
1783	nr: ocfs2_clusters_to_bytes(sb: osb->sb, clusters: clusters_to_alloc));
1784	if (ret)
1785	goto out_commit;
1786	}
1787
1788	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh: wc->w_di_bh,
1789	OCFS2_JOURNAL_ACCESS_WRITE);
1790	if (ret) {
1791	mlog_errno(ret);
1792	goto out_quota;
1793	}
1794
1795	/*
1796	* Fill our page array first. That way we've grabbed enough so
1797	* that we can zero and flush if we error after adding the
1798	* extent.
1799	*/
1800	ret = ocfs2_grab_pages_for_write(mapping, wc, cpos: wc->w_cpos, user_pos: pos, user_len: len,
1801	new: cluster_of_pages, mmap_page);
1802	if (ret) {
1803	/*
1804	* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
1805	* the target page. In this case, we exit with no error and no target
1806	* page. This will trigger the caller, page_mkwrite(), to re-try
1807	* the operation.
1808	*/
1809	if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
1810	BUG_ON(wc->w_target_page);
1811	ret = `0`;
1812	goto out_quota;
1813	}
1814
1815	mlog_errno(ret);
1816	goto out_quota;
1817	}
1818
1819	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1820	len);
1821	if (ret) {
1822	mlog_errno(ret);
1823	goto out_quota;
1824	}
1825
1826	if (data_ac)
1827	ocfs2_free_alloc_context(ac: data_ac);
1828	if (meta_ac)
1829	ocfs2_free_alloc_context(ac: meta_ac);
1830
1831	success:
1832	if (pagep)
1833	*pagep = wc->w_target_page;
1834	*fsdata = wc;
1835	return `0`;
1836	out_quota:
1837	if (clusters_to_alloc)
1838	dquot_free_space(inode,
1839	nr: ocfs2_clusters_to_bytes(sb: osb->sb, clusters: clusters_to_alloc));
1840	out_commit:
1841	ocfs2_commit_trans(osb, handle);
1842
1843	out:
1844	/*
1845	* The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
1846	* even in case of error here like ENOSPC and ENOMEM. So, we need
1847	* to unlock the target page manually to prevent deadlocks when
1848	* retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
1849	* to VM code.
1850	*/
1851	if (wc->w_target_locked)
1852	unlock_page(page: mmap_page);
1853
1854	ocfs2_free_write_ctxt(inode, wc);
1855
1856	if (data_ac) {
1857	ocfs2_free_alloc_context(ac: data_ac);
1858	data_ac = NULL;
1859	}
1860	if (meta_ac) {
1861	ocfs2_free_alloc_context(ac: meta_ac);
1862	meta_ac = NULL;
1863	}
1864
1865	if (ret == -ENOSPC && try_free) {
1866	/*
1867	* Try to free some truncate log so that we can have enough
1868	* clusters to allocate.
1869	*/
1870	try_free = `0`;
1871
1872	ret1 = ocfs2_try_to_free_truncate_log(osb, needed: clusters_need);
1873	if (ret1 == `1`)
1874	goto try_again;
1875
1876	if (ret1 < `0`)
1877	mlog_errno(ret1);
1878	}
1879
1880	return ret;
1881	}
1882
1883	static int ocfs2_write_begin(struct file file, struct* address_space *mapping,
1884	loff_t pos, unsigned len,
1885	struct page *pagep, void* **fsdata)
1886	{
1887	int ret;
1888	struct buffer_head *di_bh = NULL;
1889	struct inode *inode = mapping->host;
1890
1891	ret = ocfs2_inode_lock(inode, &di_bh, `1`);
1892	if (ret) {
1893	mlog_errno(ret);
1894	return ret;
1895	}
1896
1897	/*
1898	* Take alloc sem here to prevent concurrent lookups. That way
1899	* the mapping, zeroing and tree manipulation within
1900	* ocfs2_write() will be safe against ->read_folio(). This
1901	* should also serve to lock out allocation from a shared
1902	* writeable region.
1903	*/
1904	down_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
1905
1906	ret = ocfs2_write_begin_nolock(mapping, pos, len, type: OCFS2_WRITE_BUFFER,
1907	pagep, fsdata, di_bh, NULL);
1908	if (ret) {
1909	mlog_errno(ret);
1910	goto out_fail;
1911	}
1912
1913	brelse(bh: di_bh);
1914
1915	return `0`;
1916
1917	out_fail:
1918	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
1919
1920	brelse(bh: di_bh);
1921	ocfs2_inode_unlock(inode, ex: `1`);
1922
1923	return ret;
1924	}
1925
1926	static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1927	unsigned len, unsigned *copied,
1928	struct ocfs2_dinode *di,
1929	struct ocfs2_write_ctxt *wc)
1930	{
1931	void *kaddr;
1932
1933	if (unlikely(*copied < len)) {
1934	if (!PageUptodate(page: wc->w_target_page)) {
1935	*copied = `0`;
1936	return;
1937	}
1938	}
1939
1940	kaddr = kmap_atomic(page: wc->w_target_page);
1941	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1942	kunmap_atomic(kaddr);
1943
1944	trace_ocfs2_write_end_inline(
1945	ino: (unsigned long long)OCFS2_I(inode)->ip_blkno,
1946	pos: (unsigned long long)pos, copied: *copied,
1947	le16_to_cpu(di->id2.i_data.id_count),
1948	le16_to_cpu(di->i_dyn_features));
1949	}
1950
1951	int ocfs2_write_end_nolock(struct address_space *mapping,
1952	loff_t pos, unsigned len, unsigned copied, void *fsdata)
1953	{
1954	int i, ret;
1955	unsigned from, to, start = pos & (PAGE_SIZE - `1`);
1956	struct inode *inode = mapping->host;
1957	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1958	struct ocfs2_write_ctxt *wc = fsdata;
1959	struct ocfs2_dinode di = (struct* ocfs2_dinode *)wc->w_di_bh->b_data;
1960	handle_t *handle = wc->w_handle;
1961	struct page *tmppage;
1962
1963	BUG_ON(!list_empty(&wc->w_unwritten_list));
1964
1965	if (handle) {
1966	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode),
1967	bh: wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1968	if (ret) {
1969	copied = ret;
1970	mlog_errno(ret);
1971	goto out;
1972	}
1973	}
1974
1975	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1976	ocfs2_write_end_inline(inode, pos, len, copied: &copied, di, wc);
1977	goto out_write_size;
1978	}
1979
1980	if (unlikely(copied < len) && wc->w_target_page) {
1981	loff_t new_isize;
1982
1983	if (!PageUptodate(page: wc->w_target_page))
1984	copied = `0`;
1985
1986	new_isize = max_t(loff_t, i_size_read(inode), pos + copied);
1987	if (new_isize > page_offset(page: wc->w_target_page))
1988	ocfs2_zero_new_buffers(page: wc->w_target_page, from: start+copied,
1989	to: start+len);
1990	else {
1991	/*
1992	* When page is fully beyond new isize (data copy
1993	* failed), do not bother zeroing the page. Invalidate
1994	* it instead so that writeback does not get confused
1995	* put page & buffer dirty bits into inconsistent
1996	* state.
1997	*/
1998	block_invalidate_folio(page_folio(wc->w_target_page),
1999	offset: `0`, PAGE_SIZE);
2000	}
2001	}
2002	if (wc->w_target_page)
2003	flush_dcache_page(page: wc->w_target_page);
2004
2005	for(i = `0`; i < wc->w_num_pages; i++) {
2006	tmppage = wc->w_pages[i];
2007
2008	/ This is the direct io target page. /
2009	if (tmppage == NULL)
2010	continue;
2011
2012	if (tmppage == wc->w_target_page) {
2013	from = wc->w_target_from;
2014	to = wc->w_target_to;
2015
2016	BUG_ON(from > PAGE_SIZE \|\|
2017	to > PAGE_SIZE \|\|
2018	to < from);
2019	} else {
2020	/*
2021	* Pages adjacent to the target (if any) imply
2022	* a hole-filling write in which case we want
2023	* to flush their entire range.
2024	*/
2025	from = `0`;
2026	to = PAGE_SIZE;
2027	}
2028
2029	if (page_has_buffers(tmppage)) {
2030	if (handle && ocfs2_should_order_data(inode)) {
2031	loff_t start_byte =
2032	((loff_t)tmppage->index << PAGE_SHIFT) +
2033	from;
2034	loff_t length = to - from;
2035	ocfs2_jbd2_inode_add_write(handle, inode,
2036	start_byte, length);
2037	}
2038	block_commit_write(page: tmppage, from, to);
2039	}
2040	}
2041
2042	out_write_size:
2043	/ Direct io do not update i_size here. /
2044	if (wc->w_type != OCFS2_WRITE_DIRECT) {
2045	pos += copied;
2046	if (pos > i_size_read(inode)) {
2047	i_size_write(inode, i_size: pos);
2048	mark_inode_dirty(inode);
2049	}
2050	inode->i_blocks = ocfs2_inode_sector_count(inode);
2051	di->i_size = cpu_to_le64((u64)i_size_read(inode));
2052	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
2053	di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode));
2054	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode));
2055	if (handle)
2056	ocfs2_update_inode_fsync_trans(handle, inode, datasync: `1`);
2057	}
2058	if (handle)
2059	ocfs2_journal_dirty(handle, bh: wc->w_di_bh);
2060
2061	out:
2062	/ unlock pages before dealloc since it needs acquiring j_trans_barrier*
2063	* lock, or it will cause a deadlock since journal commit threads holds
2064	* this lock and will ask for the page lock when flushing the data.
2065	* put it here to preserve the unlock order.
2066	*/
2067	ocfs2_unlock_pages(wc);
2068
2069	if (handle)
2070	ocfs2_commit_trans(osb, handle);
2071
2072	ocfs2_run_deallocs(osb, ctxt: &wc->w_dealloc);
2073
2074	brelse(bh: wc->w_di_bh);
2075	kfree(objp: wc);
2076
2077	return copied;
2078	}
2079
2080	static int ocfs2_write_end(struct file file, struct* address_space *mapping,
2081	loff_t pos, unsigned len, unsigned copied,
2082	struct page page, void* *fsdata)
2083	{
2084	int ret;
2085	struct inode *inode = mapping->host;
2086
2087	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
2088
2089	up_write(sem: &OCFS2_I(inode)->ip_alloc_sem);
2090	ocfs2_inode_unlock(inode, ex: `1`);
2091
2092	return ret;
2093	}
2094
2095	struct ocfs2_dio_write_ctxt {
2096	struct list_head dw_zero_list;
2097	unsigned dw_zero_count;
2098	int dw_orphaned;
2099	pid_t dw_writer_pid;
2100	};
2101
2102	static struct ocfs2_dio_write_ctxt *
2103	ocfs2_dio_alloc_write_ctx(struct buffer_head bh, int* *alloc)
2104	{
2105	struct ocfs2_dio_write_ctxt *dwc = NULL;
2106
2107	if (bh->b_private)
2108	return bh->b_private;
2109
2110	dwc = kmalloc(size: sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
2111	if (dwc == NULL)
2112	return NULL;
2113	INIT_LIST_HEAD(list: &dwc->dw_zero_list);
2114	dwc->dw_zero_count = `0`;
2115	dwc->dw_orphaned = `0`;
2116	dwc->dw_writer_pid = task_pid_nr(current);
2117	bh->b_private = dwc;
2118	*alloc = `1`;
2119
2120	return dwc;
2121	}
2122
2123	static void ocfs2_dio_free_write_ctx(struct inode *inode,
2124	struct ocfs2_dio_write_ctxt *dwc)
2125	{
2126	ocfs2_free_unwritten_list(inode, head: &dwc->dw_zero_list);
2127	kfree(objp: dwc);
2128	}
2129
2130	/*
2131	* TODO: Make this into a generic get_blocks function.
2132	*
2133	* From do_direct_io in direct-io.c:
2134	* "So what we do is to permit the ->get_blocks function to populate
2135	* bh.b_size with the size of IO which is permitted at this offset and
2136	* this i_blkbits."
2137	*
2138	* This function is called directly from get_more_blocks in direct-io.c.
2139	*
2140	* called like this: dio->get_blocks(dio->inode, fs_startblk,
2141	* fs_count, map_bh, dio->rw == WRITE);
2142	*/
2143	static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
2144	struct buffer_head bh_result, int* create)
2145	{
2146	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2147	struct ocfs2_inode_info *oi = OCFS2_I(inode);
2148	struct ocfs2_write_ctxt *wc;
2149	struct ocfs2_write_cluster_desc *desc = NULL;
2150	struct ocfs2_dio_write_ctxt *dwc = NULL;
2151	struct buffer_head *di_bh = NULL;
2152	u64 p_blkno;
2153	unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
2154	loff_t pos = iblock << i_blkbits;
2155	sector_t endblk = (i_size_read(inode) - `1`) >> i_blkbits;
2156	unsigned len, total_len = bh_result->b_size;
2157	int ret = `0`, first_get_block = `0`;
2158
2159	len = osb->s_clustersize - (pos & (osb->s_clustersize - `1`));
2160	len = min(total_len, len);
2161
2162	/*
2163	* bh_result->b_size is count in get_more_blocks according to write
2164	* "pos" and "end", we need map twice to return different buffer state:
2165	* 1. area in file size, not set NEW;
2166	* 2. area out file size, set NEW.
2167	*
2168	* iblock endblk
2169	* \|--------\|---------\|---------\|---------
2170	* \|<-------area in file------->\|
2171	*/
2172
2173	if ((iblock <= endblk) &&
2174	((iblock + ((len - `1`) >> i_blkbits)) > endblk))
2175	len = (endblk - iblock + `1`) << i_blkbits;
2176
2177	mlog(`0`, "get block of %lu at %llu:%u req %u\n",
2178	inode->i_ino, pos, len, total_len);
2179
2180	/*
2181	* Because we need to change file size in ocfs2_dio_end_io_write(), or
2182	* we may need to add it to orphan dir. So can not fall to fast path
2183	* while file size will be changed.
2184	*/
2185	if (pos + total_len <= i_size_read(inode)) {
2186
2187	/ This is the fast path for re-write. /
2188	ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
2189	if (buffer_mapped(bh: bh_result) &&
2190	!buffer_new(bh: bh_result) &&
2191	ret == `0`)
2192	goto out;
2193
2194	/ Clear state set by ocfs2_get_block. /
2195	bh_result->b_state = `0`;
2196	}
2197
2198	dwc = ocfs2_dio_alloc_write_ctx(bh: bh_result, alloc: &first_get_block);
2199	if (unlikely(dwc == NULL)) {
2200	ret = -ENOMEM;
2201	mlog_errno(ret);
2202	goto out;
2203	}
2204
2205	if (ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: pos + total_len) >
2206	ocfs2_clusters_for_bytes(sb: inode->i_sb, bytes: i_size_read(inode)) &&
2207	!dwc->dw_orphaned) {
2208	/*
2209	* when we are going to alloc extents beyond file size, add the
2210	* inode to orphan dir, so we can recall those spaces when
2211	* system crashed during write.
2212	*/
2213	ret = ocfs2_add_inode_to_orphan(osb, inode);
2214	if (ret < `0`) {
2215	mlog_errno(ret);
2216	goto out;
2217	}
2218	dwc->dw_orphaned = `1`;
2219	}
2220
2221	ret = ocfs2_inode_lock(inode, &di_bh, `1`);
2222	if (ret) {
2223	mlog_errno(ret);
2224	goto out;
2225	}
2226
2227	down_write(sem: &oi->ip_alloc_sem);
2228
2229	if (first_get_block) {
2230	if (ocfs2_sparse_alloc(osb))
2231	ret = ocfs2_zero_tail(inode, di_bh, pos);
2232	else
2233	ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
2234	len: total_len, NULL);
2235	if (ret < `0`) {
2236	mlog_errno(ret);
2237	goto unlock;
2238	}
2239	}
2240
2241	ret = ocfs2_write_begin_nolock(mapping: inode->i_mapping, pos, len,
2242	type: OCFS2_WRITE_DIRECT, NULL,
2243	fsdata: (void **)&wc, di_bh, NULL);
2244	if (ret) {
2245	mlog_errno(ret);
2246	goto unlock;
2247	}
2248
2249	desc = &wc->w_desc[`0`];
2250
2251	p_blkno = ocfs2_clusters_to_blocks(sb: inode->i_sb, clusters: desc->c_phys);
2252	BUG_ON(p_blkno == `0`);
2253	p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(sb: inode->i_sb, clusters: `1`) - `1`);
2254
2255	map_bh(bh: bh_result, sb: inode->i_sb, block: p_blkno);
2256	bh_result->b_size = len;
2257	if (desc->c_needs_zero)
2258	set_buffer_new(bh_result);
2259
2260	if (iblock > endblk)
2261	set_buffer_new(bh_result);
2262
2263	/ May sleep in end_io. It should not happen in a irq context. So defer*
2264	* it to dio work queue. */
2265	set_buffer_defer_completion(bh_result);
2266
2267	if (!list_empty(head: &wc->w_unwritten_list)) {
2268	struct ocfs2_unwritten_extent *ue = NULL;
2269
2270	ue = list_first_entry(&wc->w_unwritten_list,
2271	struct ocfs2_unwritten_extent,
2272	ue_node);
2273	BUG_ON(ue->ue_cpos != desc->c_cpos);
2274	/ The physical address may be 0, fill it. /
2275	ue->ue_phys = desc->c_phys;
2276
2277	list_splice_tail_init(list: &wc->w_unwritten_list, head: &dwc->dw_zero_list);
2278	dwc->dw_zero_count += wc->w_unwritten_count;
2279	}
2280
2281	ret = ocfs2_write_end_nolock(mapping: inode->i_mapping, pos, len, copied: len, fsdata: wc);
2282	BUG_ON(ret != len);
2283	ret = `0`;
2284	unlock:
2285	up_write(sem: &oi->ip_alloc_sem);
2286	ocfs2_inode_unlock(inode, ex: `1`);
2287	brelse(bh: di_bh);
2288	out:
2289	if (ret < `0`)
2290	ret = -EIO;
2291	return ret;
2292	}
2293
2294	static int ocfs2_dio_end_io_write(struct inode *inode,
2295	struct ocfs2_dio_write_ctxt *dwc,
2296	loff_t offset,
2297	ssize_t bytes)
2298	{
2299	struct ocfs2_cached_dealloc_ctxt dealloc;
2300	struct ocfs2_extent_tree et;
2301	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2302	struct ocfs2_inode_info *oi = OCFS2_I(inode);
2303	struct ocfs2_unwritten_extent *ue = NULL;
2304	struct buffer_head *di_bh = NULL;
2305	struct ocfs2_dinode *di;
2306	struct ocfs2_alloc_context *data_ac = NULL;
2307	struct ocfs2_alloc_context *meta_ac = NULL;
2308	handle_t *handle = NULL;
2309	loff_t end = offset + bytes;
2310	int ret = `0`, credits = `0`;
2311
2312	ocfs2_init_dealloc_ctxt(c: &dealloc);
2313
2314	/ We do clear unwritten, delete orphan, change i_size here. If neither*
2315	* of these happen, we can skip all this. */
2316	if (list_empty(head: &dwc->dw_zero_list) &&
2317	end <= i_size_read(inode) &&
2318	!dwc->dw_orphaned)
2319	goto out;
2320
2321	ret = ocfs2_inode_lock(inode, &di_bh, `1`);
2322	if (ret < `0`) {
2323	mlog_errno(ret);
2324	goto out;
2325	}
2326
2327	down_write(sem: &oi->ip_alloc_sem);
2328
2329	/ Delete orphan before acquire i_rwsem. /
2330	if (dwc->dw_orphaned) {
2331	BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
2332
2333	end = end > i_size_read(inode) ? end : `0`;
2334
2335	ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
2336	update_isize: !!end, end);
2337	if (ret < `0`)
2338	mlog_errno(ret);
2339	}
2340
2341	di = (struct ocfs2_dinode *)di_bh->b_data;
2342
2343	ocfs2_init_dinode_extent_tree(et: &et, ci: INODE_CACHE(inode), bh: di_bh);
2344
2345	/ Attach dealloc with extent tree in case that we may reuse extents*
2346	* which are already unlinked from current extent tree due to extent
2347	* rotation and merging.
2348	*/
2349	et.et_dealloc = &dealloc;
2350
2351	ret = ocfs2_lock_allocators(inode, et: &et, clusters_to_add: `0`, extents_to_split: dwc->dw_zero_count*`2`,
2352	data_ac: &data_ac, meta_ac: &meta_ac);
2353	if (ret) {
2354	mlog_errno(ret);
2355	goto unlock;
2356	}
2357
2358	credits = ocfs2_calc_extend_credits(sb: inode->i_sb, root_el: &di->id2.i_list);
2359
2360	handle = ocfs2_start_trans(osb, max_buffs: credits);
2361	if (IS_ERR(ptr: handle)) {
2362	ret = PTR_ERR(ptr: handle);
2363	mlog_errno(ret);
2364	goto unlock;
2365	}
2366	ret = ocfs2_journal_access_di(handle, ci: INODE_CACHE(inode), bh: di_bh,
2367	OCFS2_JOURNAL_ACCESS_WRITE);
2368	if (ret) {
2369	mlog_errno(ret);
2370	goto commit;
2371	}
2372
2373	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
2374	ret = ocfs2_mark_extent_written(inode, et: &et, handle,
2375	cpos: ue->ue_cpos, len: `1`,
2376	phys: ue->ue_phys,
2377	meta_ac, dealloc: &dealloc);
2378	if (ret < `0`) {
2379	mlog_errno(ret);
2380	break;
2381	}
2382	}
2383
2384	if (end > i_size_read(inode)) {
2385	ret = ocfs2_set_inode_size(handle, inode, fe_bh: di_bh, new_i_size: end);
2386	if (ret < `0`)
2387	mlog_errno(ret);
2388	}
2389	commit:
2390	ocfs2_commit_trans(osb, handle);
2391	unlock:
2392	up_write(sem: &oi->ip_alloc_sem);
2393	ocfs2_inode_unlock(inode, ex: `1`);
2394	brelse(bh: di_bh);
2395	out:
2396	if (data_ac)
2397	ocfs2_free_alloc_context(ac: data_ac);
2398	if (meta_ac)
2399	ocfs2_free_alloc_context(ac: meta_ac);
2400	ocfs2_run_deallocs(osb, ctxt: &dealloc);
2401	ocfs2_dio_free_write_ctx(inode, dwc);
2402
2403	return ret;
2404	}
2405
2406	/*
2407	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
2408	* particularly interested in the aio/dio case. We use the rw_lock DLM lock
2409	* to protect io on one node from truncation on another.
2410	*/
2411	static int ocfs2_dio_end_io(struct kiocb *iocb,
2412	loff_t offset,
2413	ssize_t bytes,
2414	void *private)
2415	{
2416	struct inode *inode = file_inode(f: iocb->ki_filp);
2417	int level;
2418	int ret = `0`;
2419
2420	/ this io's submitter should not have unlocked this before we could /
2421	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
2422
2423	if (bytes <= `0`)
2424	mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
2425	(long long)bytes);
2426	if (private) {
2427	if (bytes > `0`)
2428	ret = ocfs2_dio_end_io_write(inode, dwc: private, offset,
2429	bytes);
2430	else
2431	ocfs2_dio_free_write_ctx(inode, dwc: private);
2432	}
2433
2434	ocfs2_iocb_clear_rw_locked(iocb);
2435
2436	level = ocfs2_iocb_rw_locked_level(iocb);
2437	ocfs2_rw_unlock(inode, write: level);
2438	return ret;
2439	}
2440
2441	static ssize_t ocfs2_direct_IO(struct kiocb iocb, struct* iov_iter *iter)
2442	{
2443	struct file *file = iocb->ki_filp;
2444	struct inode *inode = file->f_mapping->host;
2445	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2446	get_block_t *get_block;
2447
2448	/*
2449	* Fallback to buffered I/O if we see an inode without
2450	* extents.
2451	*/
2452	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2453	return `0`;
2454
2455	/ Fallback to buffered I/O if we do not support append dio. /
2456	if (iocb->ki_pos + iter->count > i_size_read(inode) &&
2457	!ocfs2_supports_append_dio(osb))
2458	return `0`;
2459
2460	if (iov_iter_rw(i: iter) == READ)
2461	get_block = ocfs2_lock_get_block;
2462	else
2463	get_block = ocfs2_dio_wr_get_block;
2464
2465	return __blockdev_direct_IO(iocb, inode, bdev: inode->i_sb->s_bdev,
2466	iter, get_block,
2467	end_io: ocfs2_dio_end_io, flags: `0`);
2468	}
2469
2470	const struct address_space_operations ocfs2_aops = {
2471	.dirty_folio = block_dirty_folio,
2472	.read_folio = ocfs2_read_folio,
2473	.readahead = ocfs2_readahead,
2474	.writepage = ocfs2_writepage,
2475	.write_begin = ocfs2_write_begin,
2476	.write_end = ocfs2_write_end,
2477	.bmap = ocfs2_bmap,
2478	.direct_IO = ocfs2_direct_IO,
2479	.invalidate_folio = block_invalidate_folio,
2480	.release_folio = ocfs2_release_folio,
2481	.migrate_folio = buffer_migrate_folio,
2482	.is_partially_uptodate = block_is_partially_uptodate,
2483	.error_remove_page = generic_error_remove_page,
2484	};
2485

source code of linux/fs/ocfs2/aops.c