file.c source code [linux/fs/btrfs/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2007 Oracle. All rights reserved.
4	*/
5
6	#include <linux/fs.h>
7	#include <linux/pagemap.h>
8	#include <linux/time.h>
9	#include <linux/init.h>
10	#include <linux/string.h>
11	#include <linux/backing-dev.h>
12	#include <linux/falloc.h>
13	#include <linux/writeback.h>
14	#include <linux/compat.h>
15	#include <linux/slab.h>
16	#include <linux/btrfs.h>
17	#include <linux/uio.h>
18	#include <linux/iversion.h>
19	#include <linux/fsverity.h>
20	#include <linux/iomap.h>
21	#include "ctree.h"
22	#include "disk-io.h"
23	#include "transaction.h"
24	#include "btrfs_inode.h"
25	#include "print-tree.h"
26	#include "tree-log.h"
27	#include "locking.h"
28	#include "volumes.h"
29	#include "qgroup.h"
30	#include "compression.h"
31	#include "delalloc-space.h"
32	#include "reflink.h"
33	#include "subpage.h"
34	#include "fs.h"
35	#include "accessors.h"
36	#include "extent-tree.h"
37	#include "file-item.h"
38	#include "ioctl.h"
39	#include "file.h"
40	#include "super.h"
41
42	/ simple helper to fault in pages and copy. This should go away*
43	* and be replaced with calls into generic code.
44	*/
45	static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
46	struct page **prepared_pages,
47	struct iov_iter *i)
48	{
49	size_t copied = `0`;
50	size_t total_copied = `0`;
51	int pg = `0`;
52	int offset = offset_in_page(pos);
53
54	while (write_bytes > `0`) {
55	size_t count = min_t(size_t,
56	PAGE_SIZE - offset, write_bytes);
57	struct page *page = prepared_pages[pg];
58	/*
59	* Copy data from userspace to the current page
60	*/
61	copied = copy_page_from_iter_atomic(page, offset, bytes: count, i);
62
63	/ Flush processor's dcache for this page /
64	flush_dcache_page(page);
65
66	/*
67	* if we get a partial write, we can end up with
68	* partially up to date pages. These add
69	* a lot of complexity, so make sure they don't
70	* happen by forcing this copy to be retried.
71	*
72	* The rest of the btrfs_file_write code will fall
73	* back to page at a time copies after we return 0.
74	*/
75	if (unlikely(copied < count)) {
76	if (!PageUptodate(page)) {
77	iov_iter_revert(i, bytes: copied);
78	copied = `0`;
79	}
80	if (!copied)
81	break;
82	}
83
84	write_bytes -= copied;
85	total_copied += copied;
86	offset += copied;
87	if (offset == PAGE_SIZE) {
88	pg++;
89	offset = `0`;
90	}
91	}
92	return total_copied;
93	}
94
95	/*
96	* unlocks pages after btrfs_file_write is done with them
97	*/
98	static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
99	struct page **pages, size_t num_pages,
100	u64 pos, u64 copied)
101	{
102	size_t i;
103	u64 block_start = round_down(pos, fs_info->sectorsize);
104	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
105
106	ASSERT(block_len <= U32_MAX);
107	for (i = `0`; i < num_pages; i++) {
108	/ page checked is some magic around finding pages that*
109	* have been modified without going through btrfs_set_page_dirty
110	* clear it here. There should be no need to mark the pages
111	* accessed as prepare_pages should have marked them accessed
112	* in prepare_pages via find_or_create_page()
113	*/
114	btrfs_page_clamp_clear_checked(fs_info, page: pages[i], start: block_start,
115	len: block_len);
116	unlock_page(page: pages[i]);
117	put_page(page: pages[i]);
118	}
119	}
120
121	/*
122	* After btrfs_copy_from_user(), update the following things for delalloc:
123	* - Mark newly dirtied pages as DELALLOC in the io tree.
124	* Used to advise which range is to be written back.
125	* - Mark modified pages as Uptodate/Dirty and not needing COW fixup
126	* - Update inode size for past EOF write
127	*/
128	int btrfs_dirty_pages(struct btrfs_inode inode, struct* page **pages,
129	size_t num_pages, loff_t pos, size_t write_bytes,
130	struct extent_state **cached, bool noreserve)
131	{
132	struct btrfs_fs_info *fs_info = inode->root->fs_info;
133	int err = `0`;
134	int i;
135	u64 num_bytes;
136	u64 start_pos;
137	u64 end_of_last_block;
138	u64 end_pos = pos + write_bytes;
139	loff_t isize = i_size_read(inode: &inode->vfs_inode);
140	unsigned int extra_bits = `0`;
141
142	if (write_bytes == `0`)
143	return `0`;
144
145	if (noreserve)
146	extra_bits \|= EXTENT_NORESERVE;
147
148	start_pos = round_down(pos, fs_info->sectorsize);
149	num_bytes = round_up(write_bytes + pos - start_pos,
150	fs_info->sectorsize);
151	ASSERT(num_bytes <= U32_MAX);
152
153	end_of_last_block = start_pos + num_bytes - `1`;
154
155	/*
156	* The pages may have already been dirty, clear out old accounting so
157	* we can set things up properly
158	*/
159	clear_extent_bit(tree: &inode->io_tree, start: start_pos, end: end_of_last_block,
160	bits: EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG,
161	cached);
162
163	err = btrfs_set_extent_delalloc(inode, start: start_pos, end: end_of_last_block,
164	extra_bits, cached_state: cached);
165	if (err)
166	return err;
167
168	for (i = `0`; i < num_pages; i++) {
169	struct page *p = pages[i];
170
171	btrfs_page_clamp_set_uptodate(fs_info, page: p, start: start_pos, len: num_bytes);
172	btrfs_page_clamp_clear_checked(fs_info, page: p, start: start_pos, len: num_bytes);
173	btrfs_page_clamp_set_dirty(fs_info, page: p, start: start_pos, len: num_bytes);
174	}
175
176	/*
177	* we've only changed i_size in ram, and we haven't updated
178	* the disk i_size. There is no need to log the inode
179	* at this time.
180	*/
181	if (end_pos > isize)
182	i_size_write(inode: &inode->vfs_inode, i_size: end_pos);
183	return `0`;
184	}
185
186	/*
187	* this is very complex, but the basic idea is to drop all extents
188	* in the range start - end. hint_block is filled in with a block number
189	* that would be a good hint to the block allocator for this file.
190	*
191	* If an extent intersects the range but is not entirely inside the range
192	* it is either truncated or split. Anything entirely inside the range
193	* is deleted from the tree.
194	*
195	* Note: the VFS' inode number of bytes is not updated, it's up to the caller
196	* to deal with that. We set the field 'bytes_found' of the arguments structure
197	* with the number of allocated bytes found in the target range, so that the
198	* caller can update the inode's number of bytes in an atomic way when
199	* replacing extents in a range to avoid races with stat(2).
200	*/
201	int btrfs_drop_extents(struct btrfs_trans_handle *trans,
202	struct btrfs_root root, struct* btrfs_inode *inode,
203	struct btrfs_drop_extents_args *args)
204	{
205	struct btrfs_fs_info *fs_info = root->fs_info;
206	struct extent_buffer *leaf;
207	struct btrfs_file_extent_item *fi;
208	struct btrfs_ref ref = { `0` };
209	struct btrfs_key key;
210	struct btrfs_key new_key;
211	u64 ino = btrfs_ino(inode);
212	u64 search_start = args->start;
213	u64 disk_bytenr = `0`;
214	u64 num_bytes = `0`;
215	u64 extent_offset = `0`;
216	u64 extent_end = `0`;
217	u64 last_end = args->start;
218	int del_nr = `0`;
219	int del_slot = `0`;
220	int extent_type;
221	int recow;
222	int ret;
223	int modify_tree = -`1`;
224	int update_refs;
225	int found = `0`;
226	struct btrfs_path *path = args->path;
227
228	args->bytes_found = `0`;
229	args->extent_inserted = false;
230
231	/ Must always have a path if ->replace_extent is true /
232	ASSERT(!(args->replace_extent && !args->path));
233
234	if (!path) {
235	path = btrfs_alloc_path();
236	if (!path) {
237	ret = -ENOMEM;
238	goto out;
239	}
240	}
241
242	if (args->drop_cache)
243	btrfs_drop_extent_map_range(inode, start: args->start, end: args->end - `1`, skip_pinned: false);
244
245	if (args->start >= inode->disk_i_size && !args->replace_extent)
246	modify_tree = `0`;
247
248	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
249	while (`1`) {
250	recow = `0`;
251	ret = btrfs_lookup_file_extent(trans, root, path, objectid: ino,
252	bytenr: search_start, mod: modify_tree);
253	if (ret < `0`)
254	break;
255	if (ret > `0` && path->slots[`0`] > `0` && search_start == args->start) {
256	leaf = path->nodes[`0`];
257	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`] - `1`);
258	if (key.objectid == ino &&
259	key.type == BTRFS_EXTENT_DATA_KEY)
260	path->slots[`0`]--;
261	}
262	ret = `0`;
263	next_slot:
264	leaf = path->nodes[`0`];
265	if (path->slots[`0`] >= btrfs_header_nritems(eb: leaf)) {
266	BUG_ON(del_nr > `0`);
267	ret = btrfs_next_leaf(root, path);
268	if (ret < `0`)
269	break;
270	if (ret > `0`) {
271	ret = `0`;
272	break;
273	}
274	leaf = path->nodes[`0`];
275	recow = `1`;
276	}
277
278	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
279
280	if (key.objectid > ino)
281	break;
282	if (WARN_ON_ONCE(key.objectid < ino) \|\|
283	key.type < BTRFS_EXTENT_DATA_KEY) {
284	ASSERT(del_nr == `0`);
285	path->slots[`0`]++;
286	goto next_slot;
287	}
288	if (key.type > BTRFS_EXTENT_DATA_KEY \|\| key.offset >= args->end)
289	break;
290
291	fi = btrfs_item_ptr(leaf, path->slots[`0`],
292	struct btrfs_file_extent_item);
293	extent_type = btrfs_file_extent_type(eb: leaf, s: fi);
294
295	if (extent_type == BTRFS_FILE_EXTENT_REG \|\|
296	extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
297	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: fi);
298	num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: fi);
299	extent_offset = btrfs_file_extent_offset(eb: leaf, s: fi);
300	extent_end = key.offset +
301	btrfs_file_extent_num_bytes(eb: leaf, s: fi);
302	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
303	extent_end = key.offset +
304	btrfs_file_extent_ram_bytes(eb: leaf, s: fi);
305	} else {
306	/ can't happen /
307	BUG();
308	}
309
310	/*
311	* Don't skip extent items representing 0 byte lengths. They
312	* used to be created (bug) if while punching holes we hit
313	* -ENOSPC condition. So if we find one here, just ensure we
314	* delete it, otherwise we would insert a new file extent item
315	* with the same key (offset) as that 0 bytes length file
316	* extent item in the call to setup_items_for_insert() later
317	* in this function.
318	*/
319	if (extent_end == key.offset && extent_end >= search_start) {
320	last_end = extent_end;
321	goto delete_extent_item;
322	}
323
324	if (extent_end <= search_start) {
325	path->slots[`0`]++;
326	goto next_slot;
327	}
328
329	found = `1`;
330	search_start = max(key.offset, args->start);
331	if (recow \|\| !modify_tree) {
332	modify_tree = -`1`;
333	btrfs_release_path(p: path);
334	continue;
335	}
336
337	/*
338	* \| - range to drop - \|
339	* \| -------- extent -------- \|
340	*/
341	if (args->start > key.offset && args->end < extent_end) {
342	BUG_ON(del_nr > `0`);
343	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
344	ret = -EOPNOTSUPP;
345	break;
346	}
347
348	memcpy(&new_key, &key, sizeof(new_key));
349	new_key.offset = args->start;
350	ret = btrfs_duplicate_item(trans, root, path,
351	new_key: &new_key);
352	if (ret == -EAGAIN) {
353	btrfs_release_path(p: path);
354	continue;
355	}
356	if (ret < `0`)
357	break;
358
359	leaf = path->nodes[`0`];
360	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
361	struct btrfs_file_extent_item);
362	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
363	val: args->start - key.offset);
364
365	fi = btrfs_item_ptr(leaf, path->slots[`0`],
366	struct btrfs_file_extent_item);
367
368	extent_offset += args->start - key.offset;
369	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: extent_offset);
370	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
371	val: extent_end - args->start);
372	btrfs_mark_buffer_dirty(trans, buf: leaf);
373
374	if (update_refs && disk_bytenr > `0`) {
375	btrfs_init_generic_ref(generic_ref: &ref,
376	action: BTRFS_ADD_DELAYED_REF,
377	bytenr: disk_bytenr, len: num_bytes, parent: `0`,
378	owning_root: root->root_key.objectid);
379	btrfs_init_data_ref(generic_ref: &ref,
380	ref_root: root->root_key.objectid,
381	ino: new_key.objectid,
382	offset: args->start - extent_offset,
383	mod_root: `0`, skip_qgroup: false);
384	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
385	if (ret) {
386	btrfs_abort_transaction(trans, ret);
387	break;
388	}
389	}
390	key.offset = args->start;
391	}
392	/*
393	* From here on out we will have actually dropped something, so
394	* last_end can be updated.
395	*/
396	last_end = extent_end;
397
398	/*
399	* \| ---- range to drop ----- \|
400	* \| -------- extent -------- \|
401	*/
402	if (args->start <= key.offset && args->end < extent_end) {
403	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
404	ret = -EOPNOTSUPP;
405	break;
406	}
407
408	memcpy(&new_key, &key, sizeof(new_key));
409	new_key.offset = args->end;
410	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
411
412	extent_offset += args->end - key.offset;
413	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: extent_offset);
414	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
415	val: extent_end - args->end);
416	btrfs_mark_buffer_dirty(trans, buf: leaf);
417	if (update_refs && disk_bytenr > `0`)
418	args->bytes_found += args->end - key.offset;
419	break;
420	}
421
422	search_start = extent_end;
423	/*
424	* \| ---- range to drop ----- \|
425	* \| -------- extent -------- \|
426	*/
427	if (args->start > key.offset && args->end >= extent_end) {
428	BUG_ON(del_nr > `0`);
429	if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
430	ret = -EOPNOTSUPP;
431	break;
432	}
433
434	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
435	val: args->start - key.offset);
436	btrfs_mark_buffer_dirty(trans, buf: leaf);
437	if (update_refs && disk_bytenr > `0`)
438	args->bytes_found += extent_end - args->start;
439	if (args->end == extent_end)
440	break;
441
442	path->slots[`0`]++;
443	goto next_slot;
444	}
445
446	/*
447	* \| ---- range to drop ----- \|
448	* \| ------ extent ------ \|
449	*/
450	if (args->start <= key.offset && args->end >= extent_end) {
451	delete_extent_item:
452	if (del_nr == `0`) {
453	del_slot = path->slots[`0`];
454	del_nr = `1`;
455	} else {
456	BUG_ON(del_slot + del_nr != path->slots[`0`]);
457	del_nr++;
458	}
459
460	if (update_refs &&
461	extent_type == BTRFS_FILE_EXTENT_INLINE) {
462	args->bytes_found += extent_end - key.offset;
463	extent_end = ALIGN(extent_end,
464	fs_info->sectorsize);
465	} else if (update_refs && disk_bytenr > `0`) {
466	btrfs_init_generic_ref(generic_ref: &ref,
467	action: BTRFS_DROP_DELAYED_REF,
468	bytenr: disk_bytenr, len: num_bytes, parent: `0`,
469	owning_root: root->root_key.objectid);
470	btrfs_init_data_ref(generic_ref: &ref,
471	ref_root: root->root_key.objectid,
472	ino: key.objectid,
473	offset: key.offset - extent_offset, mod_root: `0`,
474	skip_qgroup: false);
475	ret = btrfs_free_extent(trans, ref: &ref);
476	if (ret) {
477	btrfs_abort_transaction(trans, ret);
478	break;
479	}
480	args->bytes_found += extent_end - key.offset;
481	}
482
483	if (args->end == extent_end)
484	break;
485
486	if (path->slots[`0`] + `1` < btrfs_header_nritems(eb: leaf)) {
487	path->slots[`0`]++;
488	goto next_slot;
489	}
490
491	ret = btrfs_del_items(trans, root, path, slot: del_slot,
492	nr: del_nr);
493	if (ret) {
494	btrfs_abort_transaction(trans, ret);
495	break;
496	}
497
498	del_nr = `0`;
499	del_slot = `0`;
500
501	btrfs_release_path(p: path);
502	continue;
503	}
504
505	BUG();
506	}
507
508	if (!ret && del_nr > `0`) {
509	/*
510	* Set path->slots[0] to first slot, so that after the delete
511	* if items are move off from our leaf to its immediate left or
512	* right neighbor leafs, we end up with a correct and adjusted
513	* path->slots[0] for our insertion (if args->replace_extent).
514	*/
515	path->slots[`0`] = del_slot;
516	ret = btrfs_del_items(trans, root, path, slot: del_slot, nr: del_nr);
517	if (ret)
518	btrfs_abort_transaction(trans, ret);
519	}
520
521	leaf = path->nodes[`0`];
522	/*
523	* If btrfs_del_items() was called, it might have deleted a leaf, in
524	* which case it unlocked our path, so check path->locks[0] matches a
525	* write lock.
526	*/
527	if (!ret && args->replace_extent &&
528	path->locks[`0`] == BTRFS_WRITE_LOCK &&
529	btrfs_leaf_free_space(leaf) >=
530	sizeof(struct btrfs_item) + args->extent_item_size) {
531
532	key.objectid = ino;
533	key.type = BTRFS_EXTENT_DATA_KEY;
534	key.offset = args->start;
535	if (!del_nr && path->slots[`0`] < btrfs_header_nritems(eb: leaf)) {
536	struct btrfs_key slot_key;
537
538	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &slot_key, nr: path->slots[`0`]);
539	if (btrfs_comp_cpu_keys(k1: &key, k2: &slot_key) > `0`)
540	path->slots[`0`]++;
541	}
542	btrfs_setup_item_for_insert(trans, root, path, key: &key,
543	data_size: args->extent_item_size);
544	args->extent_inserted = true;
545	}
546
547	if (!args->path)
548	btrfs_free_path(p: path);
549	else if (!args->extent_inserted)
550	btrfs_release_path(p: path);
551	out:
552	args->drop_end = found ? min(args->end, last_end) : args->end;
553
554	return ret;
555	}
556
557	static int extent_mergeable(struct extent_buffer leaf, int* slot,
558	u64 objectid, u64 bytenr, u64 orig_offset,
559	u64 start, u64 end)
560	{
561	struct btrfs_file_extent_item *fi;
562	struct btrfs_key key;
563	u64 extent_end;
564
565	if (slot < `0` \|\| slot >= btrfs_header_nritems(eb: leaf))
566	return `0`;
567
568	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
569	if (key.objectid != objectid \|\| key.type != BTRFS_EXTENT_DATA_KEY)
570	return `0`;
571
572	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
573	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_REG \|\|
574	btrfs_file_extent_disk_bytenr(eb: leaf, s: fi) != bytenr \|\|
575	btrfs_file_extent_offset(eb: leaf, s: fi) != key.offset - orig_offset \|\|
576	btrfs_file_extent_compression(eb: leaf, s: fi) \|\|
577	btrfs_file_extent_encryption(eb: leaf, s: fi) \|\|
578	btrfs_file_extent_other_encoding(eb: leaf, s: fi))
579	return `0`;
580
581	extent_end = key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi);
582	if ((start && start != key.offset) \|\| (end && end != extent_end))
583	return `0`;
584
585	*start = key.offset;
586	*end = extent_end;
587	return `1`;
588	}
589
590	/*
591	* Mark extent in the range start - end as written.
592	*
593	* This changes extent type from 'pre-allocated' to 'regular'. If only
594	* part of extent is marked as written, the extent will be split into
595	* two or three.
596	*/
597	int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
598	struct btrfs_inode *inode, u64 start, u64 end)
599	{
600	struct btrfs_root *root = inode->root;
601	struct extent_buffer *leaf;
602	struct btrfs_path *path;
603	struct btrfs_file_extent_item *fi;
604	struct btrfs_ref ref = { `0` };
605	struct btrfs_key key;
606	struct btrfs_key new_key;
607	u64 bytenr;
608	u64 num_bytes;
609	u64 extent_end;
610	u64 orig_offset;
611	u64 other_start;
612	u64 other_end;
613	u64 split;
614	int del_nr = `0`;
615	int del_slot = `0`;
616	int recow;
617	int ret = `0`;
618	u64 ino = btrfs_ino(inode);
619
620	path = btrfs_alloc_path();
621	if (!path)
622	return -ENOMEM;
623	again:
624	recow = `0`;
625	split = start;
626	key.objectid = ino;
627	key.type = BTRFS_EXTENT_DATA_KEY;
628	key.offset = split;
629
630	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: -`1`, cow: `1`);
631	if (ret < `0`)
632	goto out;
633	if (ret > `0` && path->slots[`0`] > `0`)
634	path->slots[`0`]--;
635
636	leaf = path->nodes[`0`];
637	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
638	if (key.objectid != ino \|\|
639	key.type != BTRFS_EXTENT_DATA_KEY) {
640	ret = -EINVAL;
641	btrfs_abort_transaction(trans, ret);
642	goto out;
643	}
644	fi = btrfs_item_ptr(leaf, path->slots[`0`],
645	struct btrfs_file_extent_item);
646	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_PREALLOC) {
647	ret = -EINVAL;
648	btrfs_abort_transaction(trans, ret);
649	goto out;
650	}
651	extent_end = key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi);
652	if (key.offset > start \|\| extent_end < end) {
653	ret = -EINVAL;
654	btrfs_abort_transaction(trans, ret);
655	goto out;
656	}
657
658	bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: fi);
659	num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: fi);
660	orig_offset = key.offset - btrfs_file_extent_offset(eb: leaf, s: fi);
661	memcpy(&new_key, &key, sizeof(new_key));
662
663	if (start == key.offset && end < extent_end) {
664	other_start = `0`;
665	other_end = start;
666	if (extent_mergeable(leaf, slot: path->slots[`0`] - `1`,
667	objectid: ino, bytenr, orig_offset,
668	start: &other_start, end: &other_end)) {
669	new_key.offset = end;
670	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
671	fi = btrfs_item_ptr(leaf, path->slots[`0`],
672	struct btrfs_file_extent_item);
673	btrfs_set_file_extent_generation(eb: leaf, s: fi,
674	val: trans->transid);
675	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
676	val: extent_end - end);
677	btrfs_set_file_extent_offset(eb: leaf, s: fi,
678	val: end - orig_offset);
679	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
680	struct btrfs_file_extent_item);
681	btrfs_set_file_extent_generation(eb: leaf, s: fi,
682	val: trans->transid);
683	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
684	val: end - other_start);
685	btrfs_mark_buffer_dirty(trans, buf: leaf);
686	goto out;
687	}
688	}
689
690	if (start > key.offset && end == extent_end) {
691	other_start = end;
692	other_end = `0`;
693	if (extent_mergeable(leaf, slot: path->slots[`0`] + `1`,
694	objectid: ino, bytenr, orig_offset,
695	start: &other_start, end: &other_end)) {
696	fi = btrfs_item_ptr(leaf, path->slots[`0`],
697	struct btrfs_file_extent_item);
698	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
699	val: start - key.offset);
700	btrfs_set_file_extent_generation(eb: leaf, s: fi,
701	val: trans->transid);
702	path->slots[`0`]++;
703	new_key.offset = start;
704	btrfs_set_item_key_safe(trans, path, new_key: &new_key);
705
706	fi = btrfs_item_ptr(leaf, path->slots[`0`],
707	struct btrfs_file_extent_item);
708	btrfs_set_file_extent_generation(eb: leaf, s: fi,
709	val: trans->transid);
710	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
711	val: other_end - start);
712	btrfs_set_file_extent_offset(eb: leaf, s: fi,
713	val: start - orig_offset);
714	btrfs_mark_buffer_dirty(trans, buf: leaf);
715	goto out;
716	}
717	}
718
719	while (start > key.offset \|\| end < extent_end) {
720	if (key.offset == start)
721	split = end;
722
723	new_key.offset = split;
724	ret = btrfs_duplicate_item(trans, root, path, new_key: &new_key);
725	if (ret == -EAGAIN) {
726	btrfs_release_path(p: path);
727	goto again;
728	}
729	if (ret < `0`) {
730	btrfs_abort_transaction(trans, ret);
731	goto out;
732	}
733
734	leaf = path->nodes[`0`];
735	fi = btrfs_item_ptr(leaf, path->slots[`0`] - `1`,
736	struct btrfs_file_extent_item);
737	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
738	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
739	val: split - key.offset);
740
741	fi = btrfs_item_ptr(leaf, path->slots[`0`],
742	struct btrfs_file_extent_item);
743
744	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
745	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: split - orig_offset);
746	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
747	val: extent_end - split);
748	btrfs_mark_buffer_dirty(trans, buf: leaf);
749
750	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_ADD_DELAYED_REF, bytenr,
751	len: num_bytes, parent: `0`, owning_root: root->root_key.objectid);
752	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid, ino,
753	offset: orig_offset, mod_root: `0`, skip_qgroup: false);
754	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
755	if (ret) {
756	btrfs_abort_transaction(trans, ret);
757	goto out;
758	}
759
760	if (split == start) {
761	key.offset = start;
762	} else {
763	if (start != key.offset) {
764	ret = -EINVAL;
765	btrfs_abort_transaction(trans, ret);
766	goto out;
767	}
768	path->slots[`0`]--;
769	extent_end = end;
770	}
771	recow = `1`;
772	}
773
774	other_start = end;
775	other_end = `0`;
776	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_DROP_DELAYED_REF, bytenr,
777	len: num_bytes, parent: `0`, owning_root: root->root_key.objectid);
778	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid, ino, offset: orig_offset,
779	mod_root: `0`, skip_qgroup: false);
780	if (extent_mergeable(leaf, slot: path->slots[`0`] + `1`,
781	objectid: ino, bytenr, orig_offset,
782	start: &other_start, end: &other_end)) {
783	if (recow) {
784	btrfs_release_path(p: path);
785	goto again;
786	}
787	extent_end = other_end;
788	del_slot = path->slots[`0`] + `1`;
789	del_nr++;
790	ret = btrfs_free_extent(trans, ref: &ref);
791	if (ret) {
792	btrfs_abort_transaction(trans, ret);
793	goto out;
794	}
795	}
796	other_start = `0`;
797	other_end = start;
798	if (extent_mergeable(leaf, slot: path->slots[`0`] - `1`,
799	objectid: ino, bytenr, orig_offset,
800	start: &other_start, end: &other_end)) {
801	if (recow) {
802	btrfs_release_path(p: path);
803	goto again;
804	}
805	key.offset = other_start;
806	del_slot = path->slots[`0`];
807	del_nr++;
808	ret = btrfs_free_extent(trans, ref: &ref);
809	if (ret) {
810	btrfs_abort_transaction(trans, ret);
811	goto out;
812	}
813	}
814	if (del_nr == `0`) {
815	fi = btrfs_item_ptr(leaf, path->slots[`0`],
816	struct btrfs_file_extent_item);
817	btrfs_set_file_extent_type(eb: leaf, s: fi,
818	val: BTRFS_FILE_EXTENT_REG);
819	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
820	btrfs_mark_buffer_dirty(trans, buf: leaf);
821	} else {
822	fi = btrfs_item_ptr(leaf, del_slot - `1`,
823	struct btrfs_file_extent_item);
824	btrfs_set_file_extent_type(eb: leaf, s: fi,
825	val: BTRFS_FILE_EXTENT_REG);
826	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
827	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi,
828	val: extent_end - key.offset);
829	btrfs_mark_buffer_dirty(trans, buf: leaf);
830
831	ret = btrfs_del_items(trans, root, path, slot: del_slot, nr: del_nr);
832	if (ret < `0`) {
833	btrfs_abort_transaction(trans, ret);
834	goto out;
835	}
836	}
837	out:
838	btrfs_free_path(p: path);
839	return ret;
840	}
841
842	/*
843	* on error we return an unlocked page and the error value
844	* on success we return a locked page and 0
845	*/
846	static int prepare_uptodate_page(struct inode *inode,
847	struct page *page, u64 pos,
848	bool force_uptodate)
849	{
850	struct folio *folio = page_folio(page);
851	int ret = `0`;
852
853	if (((pos & (PAGE_SIZE - `1`)) \|\| force_uptodate) &&
854	!PageUptodate(page)) {
855	ret = btrfs_read_folio(NULL, folio);
856	if (ret)
857	return ret;
858	lock_page(page);
859	if (!PageUptodate(page)) {
860	unlock_page(page);
861	return -EIO;
862	}
863
864	/*
865	* Since btrfs_read_folio() will unlock the folio before it
866	* returns, there is a window where btrfs_release_folio() can be
867	* called to release the page. Here we check both inode
868	* mapping and PagePrivate() to make sure the page was not
869	* released.
870	*
871	* The private flag check is essential for subpage as we need
872	* to store extra bitmap using page->private.
873	*/
874	if (page->mapping != inode->i_mapping \|\| !PagePrivate(page)) {
875	unlock_page(page);
876	return -EAGAIN;
877	}
878	}
879	return `0`;
880	}
881
882	static fgf_t get_prepare_fgp_flags(bool nowait)
883	{
884	fgf_t fgp_flags = FGP_LOCK \| FGP_ACCESSED \| FGP_CREAT;
885
886	if (nowait)
887	fgp_flags \|= FGP_NOWAIT;
888
889	return fgp_flags;
890	}
891
892	static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
893	{
894	gfp_t gfp;
895
896	gfp = btrfs_alloc_write_mask(mapping: inode->i_mapping);
897	if (nowait) {
898	gfp &= ~__GFP_DIRECT_RECLAIM;
899	gfp \|= GFP_NOWAIT;
900	}
901
902	return gfp;
903	}
904
905	/*
906	* this just gets pages into the page cache and locks them down.
907	*/
908	static noinline int prepare_pages(struct inode inode, struct* page **pages,
909	size_t num_pages, loff_t pos,
910	size_t write_bytes, bool force_uptodate,
911	bool nowait)
912	{
913	int i;
914	unsigned long index = pos >> PAGE_SHIFT;
915	gfp_t mask = get_prepare_gfp_flags(inode, nowait);
916	fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
917	int err = `0`;
918	int faili;
919
920	for (i = `0`; i < num_pages; i++) {
921	again:
922	pages[i] = pagecache_get_page(mapping: inode->i_mapping, index: index + i,
923	fgp_flags, gfp: mask \| __GFP_WRITE);
924	if (!pages[i]) {
925	faili = i - `1`;
926	if (nowait)
927	err = -EAGAIN;
928	else
929	err = -ENOMEM;
930	goto fail;
931	}
932
933	err = set_page_extent_mapped(pages[i]);
934	if (err < `0`) {
935	faili = i;
936	goto fail;
937	}
938
939	if (i == `0`)
940	err = prepare_uptodate_page(inode, page: pages[i], pos,
941	force_uptodate);
942	if (!err && i == num_pages - `1`)
943	err = prepare_uptodate_page(inode, page: pages[i],
944	pos: pos + write_bytes, force_uptodate: false);
945	if (err) {
946	put_page(page: pages[i]);
947	if (!nowait && err == -EAGAIN) {
948	err = `0`;
949	goto again;
950	}
951	faili = i - `1`;
952	goto fail;
953	}
954	wait_on_page_writeback(page: pages[i]);
955	}
956
957	return `0`;
958	fail:
959	while (faili >= `0`) {
960	unlock_page(page: pages[faili]);
961	put_page(page: pages[faili]);
962	faili--;
963	}
964	return err;
965
966	}
967
968	/*
969	* This function locks the extent and properly waits for data=ordered extents
970	* to finish before allowing the pages to be modified if need.
971	*
972	* The return value:
973	* 1 - the extent is locked
974	* 0 - the extent is not locked, and everything is OK
975	* -EAGAIN - need re-prepare the pages
976	* the other < 0 number - Something wrong happens
977	*/
978	static noinline int
979	lock_and_cleanup_extent_if_need(struct btrfs_inode inode, struct* page **pages,
980	size_t num_pages, loff_t pos,
981	size_t write_bytes,
982	u64 lockstart, u64 lockend, bool nowait,
983	struct extent_state **cached_state)
984	{
985	struct btrfs_fs_info *fs_info = inode->root->fs_info;
986	u64 start_pos;
987	u64 last_pos;
988	int i;
989	int ret = `0`;
990
991	start_pos = round_down(pos, fs_info->sectorsize);
992	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - `1`;
993
994	if (start_pos < inode->vfs_inode.i_size) {
995	struct btrfs_ordered_extent *ordered;
996
997	if (nowait) {
998	if (!try_lock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos,
999	cached: cached_state)) {
1000	for (i = `0`; i < num_pages; i++) {
1001	unlock_page(page: pages[i]);
1002	put_page(page: pages[i]);
1003	pages[i] = NULL;
1004	}
1005
1006	return -EAGAIN;
1007	}
1008	} else {
1009	lock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos, cached: cached_state);
1010	}
1011
1012	ordered = btrfs_lookup_ordered_range(inode, file_offset: start_pos,
1013	len: last_pos - start_pos + `1`);
1014	if (ordered &&
1015	ordered->file_offset + ordered->num_bytes > start_pos &&
1016	ordered->file_offset <= last_pos) {
1017	unlock_extent(tree: &inode->io_tree, start: start_pos, end: last_pos,
1018	cached: cached_state);
1019	for (i = `0`; i < num_pages; i++) {
1020	unlock_page(page: pages[i]);
1021	put_page(page: pages[i]);
1022	}
1023	btrfs_start_ordered_extent(entry: ordered);
1024	btrfs_put_ordered_extent(entry: ordered);
1025	return -EAGAIN;
1026	}
1027	if (ordered)
1028	btrfs_put_ordered_extent(entry: ordered);
1029
1030	*lockstart = start_pos;
1031	*lockend = last_pos;
1032	ret = `1`;
1033	}
1034
1035	/*
1036	* We should be called after prepare_pages() which should have locked
1037	* all pages in the range.
1038	*/
1039	for (i = `0`; i < num_pages; i++)
1040	WARN_ON(!PageLocked(pages[i]));
1041
1042	return ret;
1043	}
1044
1045	/*
1046	* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1047	*
1048	* @pos: File offset.
1049	* @write_bytes: The length to write, will be updated to the nocow writeable
1050	* range.
1051	*
1052	* This function will flush ordered extents in the range to ensure proper
1053	* nocow checks.
1054	*
1055	* Return:
1056	* > 0 If we can nocow, and updates @write_bytes.
1057	* 0 If we can't do a nocow write.
1058	* -EAGAIN If we can't do a nocow write because snapshoting of the inode's
1059	* root is in progress.
1060	* < 0 If an error happened.
1061	*
1062	* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1063	*/
1064	int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1065	size_t *write_bytes, bool nowait)
1066	{
1067	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1068	struct btrfs_root *root = inode->root;
1069	struct extent_state *cached_state = NULL;
1070	u64 lockstart, lockend;
1071	u64 num_bytes;
1072	int ret;
1073
1074	if (!(inode->flags & (BTRFS_INODE_NODATACOW \| BTRFS_INODE_PREALLOC)))
1075	return `0`;
1076
1077	if (!btrfs_drew_try_write_lock(lock: &root->snapshot_lock))
1078	return -EAGAIN;
1079
1080	lockstart = round_down(pos, fs_info->sectorsize);
1081	lockend = round_up(pos + *write_bytes,
1082	fs_info->sectorsize) - `1`;
1083	num_bytes = lockend - lockstart + `1`;
1084
1085	if (nowait) {
1086	if (!btrfs_try_lock_ordered_range(inode, start: lockstart, end: lockend,
1087	cached_state: &cached_state)) {
1088	btrfs_drew_write_unlock(lock: &root->snapshot_lock);
1089	return -EAGAIN;
1090	}
1091	} else {
1092	btrfs_lock_and_flush_ordered_range(inode, start: lockstart, end: lockend,
1093	cached_state: &cached_state);
1094	}
1095	ret = can_nocow_extent(inode: &inode->vfs_inode, offset: lockstart, len: &num_bytes,
1096	NULL, NULL, NULL, nowait, strict: false);
1097	if (ret <= `0`)
1098	btrfs_drew_write_unlock(lock: &root->snapshot_lock);
1099	else
1100	write_bytes = min_t(size_t, write_bytes ,
1101	num_bytes - pos + lockstart);
1102	unlock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
1103
1104	return ret;
1105	}
1106
1107	void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1108	{
1109	btrfs_drew_write_unlock(lock: &inode->root->snapshot_lock);
1110	}
1111
1112	static void update_time_for_write(struct inode *inode)
1113	{
1114	struct timespec64 now, ts;
1115
1116	if (IS_NOCMTIME(inode))
1117	return;
1118
1119	now = current_time(inode);
1120	ts = inode_get_mtime(inode);
1121	if (!timespec64_equal(a: &ts, b: &now))
1122	inode_set_mtime_to_ts(inode, ts: now);
1123
1124	ts = inode_get_ctime(inode);
1125	if (!timespec64_equal(a: &ts, b: &now))
1126	inode_set_ctime_to_ts(inode, ts: now);
1127
1128	if (IS_I_VERSION(inode))
1129	inode_inc_iversion(inode);
1130	}
1131
1132	static int btrfs_write_check(struct kiocb iocb, struct* iov_iter *from,
1133	size_t count)
1134	{
1135	struct file *file = iocb->ki_filp;
1136	struct inode *inode = file_inode(f: file);
1137	struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
1138	loff_t pos = iocb->ki_pos;
1139	int ret;
1140	loff_t oldsize;
1141	loff_t start_pos;
1142
1143	/*
1144	* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1145	* prealloc flags, as without those flags we always have to COW. We will
1146	* later check if we can really COW into the target range (using
1147	* can_nocow_extent() at btrfs_get_blocks_direct_write()).
1148	*/
1149	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1150	!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW \| BTRFS_INODE_PREALLOC)))
1151	return -EAGAIN;
1152
1153	ret = file_remove_privs(file);
1154	if (ret)
1155	return ret;
1156
1157	/*
1158	* We reserve space for updating the inode when we reserve space for the
1159	* extent we are going to write, so we will enospc out there. We don't
1160	* need to start yet another transaction to update the inode as we will
1161	* update the inode when we finish writing whatever data we write.
1162	*/
1163	update_time_for_write(inode);
1164
1165	start_pos = round_down(pos, fs_info->sectorsize);
1166	oldsize = i_size_read(inode);
1167	if (start_pos > oldsize) {
1168	/ Expand hole size to cover write data, preventing empty gap /
1169	loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1170
1171	ret = btrfs_cont_expand(inode: BTRFS_I(inode), oldsize, size: end_pos);
1172	if (ret)
1173	return ret;
1174	}
1175
1176	return `0`;
1177	}
1178
1179	static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1180	struct iov_iter *i)
1181	{
1182	struct file *file = iocb->ki_filp;
1183	loff_t pos;
1184	struct inode *inode = file_inode(f: file);
1185	struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
1186	struct page **pages = NULL;
1187	struct extent_changeset *data_reserved = NULL;
1188	u64 release_bytes = `0`;
1189	u64 lockstart;
1190	u64 lockend;
1191	size_t num_written = `0`;
1192	int nrptrs;
1193	ssize_t ret;
1194	bool only_release_metadata = false;
1195	bool force_page_uptodate = false;
1196	loff_t old_isize = i_size_read(inode);
1197	unsigned int ilock_flags = `0`;
1198	const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1199	unsigned int bdp_flags = (nowait ? BDP_ASYNC : `0`);
1200
1201	if (nowait)
1202	ilock_flags \|= BTRFS_ILOCK_TRY;
1203
1204	ret = btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags);
1205	if (ret < `0`)
1206	return ret;
1207
1208	ret = generic_write_checks(iocb, i);
1209	if (ret <= `0`)
1210	goto out;
1211
1212	ret = btrfs_write_check(iocb, from: i, count: ret);
1213	if (ret < `0`)
1214	goto out;
1215
1216	pos = iocb->ki_pos;
1217	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1218	PAGE_SIZE / (sizeof(struct page *)));
1219	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1220	nrptrs = max(nrptrs, `8`);
1221	pages = kmalloc_array(n: nrptrs, size: sizeof(struct page *), GFP_KERNEL);
1222	if (!pages) {
1223	ret = -ENOMEM;
1224	goto out;
1225	}
1226
1227	while (iov_iter_count(i) > `0`) {
1228	struct extent_state *cached_state = NULL;
1229	size_t offset = offset_in_page(pos);
1230	size_t sector_offset;
1231	size_t write_bytes = min(iov_iter_count(i),
1232	nrptrs * (size_t)PAGE_SIZE -
1233	offset);
1234	size_t num_pages;
1235	size_t reserve_bytes;
1236	size_t dirty_pages;
1237	size_t copied;
1238	size_t dirty_sectors;
1239	size_t num_sectors;
1240	int extents_locked;
1241
1242	/*
1243	* Fault pages before locking them in prepare_pages
1244	* to avoid recursive lock
1245	*/
1246	if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1247	ret = -EFAULT;
1248	break;
1249	}
1250
1251	only_release_metadata = false;
1252	sector_offset = pos & (fs_info->sectorsize - `1`);
1253
1254	extent_changeset_release(changeset: data_reserved);
1255	ret = btrfs_check_data_free_space(inode: BTRFS_I(inode),
1256	reserved: &data_reserved, start: pos,
1257	len: write_bytes, noflush: nowait);
1258	if (ret < `0`) {
1259	int can_nocow;
1260
1261	if (nowait && (ret == -ENOSPC \|\| ret == -EAGAIN)) {
1262	ret = -EAGAIN;
1263	break;
1264	}
1265
1266	/*
1267	* If we don't have to COW at the offset, reserve
1268	* metadata only. write_bytes may get smaller than
1269	* requested here.
1270	*/
1271	can_nocow = btrfs_check_nocow_lock(inode: BTRFS_I(inode), pos,
1272	write_bytes: &write_bytes, nowait);
1273	if (can_nocow < `0`)
1274	ret = can_nocow;
1275	if (can_nocow > `0`)
1276	ret = `0`;
1277	if (ret)
1278	break;
1279	only_release_metadata = true;
1280	}
1281
1282	num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1283	WARN_ON(num_pages > nrptrs);
1284	reserve_bytes = round_up(write_bytes + sector_offset,
1285	fs_info->sectorsize);
1286	WARN_ON(reserve_bytes == `0`);
1287	ret = btrfs_delalloc_reserve_metadata(inode: BTRFS_I(inode),
1288	num_bytes: reserve_bytes,
1289	disk_num_bytes: reserve_bytes, noflush: nowait);
1290	if (ret) {
1291	if (!only_release_metadata)
1292	btrfs_free_reserved_data_space(inode: BTRFS_I(inode),
1293	reserved: data_reserved, start: pos,
1294	len: write_bytes);
1295	else
1296	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1297
1298	if (nowait && ret == -ENOSPC)
1299	ret = -EAGAIN;
1300	break;
1301	}
1302
1303	release_bytes = reserve_bytes;
1304	again:
1305	ret = balance_dirty_pages_ratelimited_flags(mapping: inode->i_mapping, flags: bdp_flags);
1306	if (ret) {
1307	btrfs_delalloc_release_extents(inode: BTRFS_I(inode), num_bytes: reserve_bytes);
1308	break;
1309	}
1310
1311	/*
1312	* This is going to setup the pages array with the number of
1313	* pages we want, so we don't really need to worry about the
1314	* contents of pages from loop to loop
1315	*/
1316	ret = prepare_pages(inode, pages, num_pages,
1317	pos, write_bytes, force_uptodate: force_page_uptodate, nowait: false);
1318	if (ret) {
1319	btrfs_delalloc_release_extents(inode: BTRFS_I(inode),
1320	num_bytes: reserve_bytes);
1321	break;
1322	}
1323
1324	extents_locked = lock_and_cleanup_extent_if_need(
1325	inode: BTRFS_I(inode), pages,
1326	num_pages, pos, write_bytes, lockstart: &lockstart,
1327	lockend: &lockend, nowait, cached_state: &cached_state);
1328	if (extents_locked < `0`) {
1329	if (!nowait && extents_locked == -EAGAIN)
1330	goto again;
1331
1332	btrfs_delalloc_release_extents(inode: BTRFS_I(inode),
1333	num_bytes: reserve_bytes);
1334	ret = extents_locked;
1335	break;
1336	}
1337
1338	copied = btrfs_copy_from_user(pos, write_bytes, prepared_pages: pages, i);
1339
1340	num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1341	dirty_sectors = round_up(copied + sector_offset,
1342	fs_info->sectorsize);
1343	dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1344
1345	/*
1346	* if we have trouble faulting in the pages, fall
1347	* back to one page at a time
1348	*/
1349	if (copied < write_bytes)
1350	nrptrs = `1`;
1351
1352	if (copied == `0`) {
1353	force_page_uptodate = true;
1354	dirty_sectors = `0`;
1355	dirty_pages = `0`;
1356	} else {
1357	force_page_uptodate = false;
1358	dirty_pages = DIV_ROUND_UP(copied + offset,
1359	PAGE_SIZE);
1360	}
1361
1362	if (num_sectors > dirty_sectors) {
1363	/ release everything except the sectors we dirtied /
1364	release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1365	if (only_release_metadata) {
1366	btrfs_delalloc_release_metadata(inode: BTRFS_I(inode),
1367	num_bytes: release_bytes, qgroup_free: true);
1368	} else {
1369	u64 __pos;
1370
1371	__pos = round_down(pos,
1372	fs_info->sectorsize) +
1373	(dirty_pages << PAGE_SHIFT);
1374	btrfs_delalloc_release_space(inode: BTRFS_I(inode),
1375	reserved: data_reserved, start: __pos,
1376	len: release_bytes, qgroup_free: true);
1377	}
1378	}
1379
1380	release_bytes = round_up(copied + sector_offset,
1381	fs_info->sectorsize);
1382
1383	ret = btrfs_dirty_pages(inode: BTRFS_I(inode), pages,
1384	num_pages: dirty_pages, pos, write_bytes: copied,
1385	cached: &cached_state, noreserve: only_release_metadata);
1386
1387	/*
1388	* If we have not locked the extent range, because the range's
1389	* start offset is >= i_size, we might still have a non-NULL
1390	* cached extent state, acquired while marking the extent range
1391	* as delalloc through btrfs_dirty_pages(). Therefore free any
1392	* possible cached extent state to avoid a memory leak.
1393	*/
1394	if (extents_locked)
1395	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart,
1396	end: lockend, cached: &cached_state);
1397	else
1398	free_extent_state(state: cached_state);
1399
1400	btrfs_delalloc_release_extents(inode: BTRFS_I(inode), num_bytes: reserve_bytes);
1401	if (ret) {
1402	btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1403	break;
1404	}
1405
1406	release_bytes = `0`;
1407	if (only_release_metadata)
1408	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1409
1410	btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1411
1412	cond_resched();
1413
1414	pos += copied;
1415	num_written += copied;
1416	}
1417
1418	kfree(objp: pages);
1419
1420	if (release_bytes) {
1421	if (only_release_metadata) {
1422	btrfs_check_nocow_unlock(inode: BTRFS_I(inode));
1423	btrfs_delalloc_release_metadata(inode: BTRFS_I(inode),
1424	num_bytes: release_bytes, qgroup_free: true);
1425	} else {
1426	btrfs_delalloc_release_space(inode: BTRFS_I(inode),
1427	reserved: data_reserved,
1428	round_down(pos, fs_info->sectorsize),
1429	len: release_bytes, qgroup_free: true);
1430	}
1431	}
1432
1433	extent_changeset_free(changeset: data_reserved);
1434	if (num_written > `0`) {
1435	pagecache_isize_extended(inode, from: old_isize, to: iocb->ki_pos);
1436	iocb->ki_pos += num_written;
1437	}
1438	out:
1439	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1440	return num_written ? num_written : ret;
1441	}
1442
1443	static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1444	const struct iov_iter *iter, loff_t offset)
1445	{
1446	const u32 blocksize_mask = fs_info->sectorsize - `1`;
1447
1448	if (offset & blocksize_mask)
1449	return -EINVAL;
1450
1451	if (iov_iter_alignment(i: iter) & blocksize_mask)
1452	return -EINVAL;
1453
1454	return `0`;
1455	}
1456
1457	static ssize_t btrfs_direct_write(struct kiocb iocb, struct* iov_iter *from)
1458	{
1459	struct file *file = iocb->ki_filp;
1460	struct inode *inode = file_inode(f: file);
1461	struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
1462	loff_t pos;
1463	ssize_t written = `0`;
1464	ssize_t written_buffered;
1465	size_t prev_left = `0`;
1466	loff_t endbyte;
1467	ssize_t err;
1468	unsigned int ilock_flags = `0`;
1469	struct iomap_dio *dio;
1470
1471	if (iocb->ki_flags & IOCB_NOWAIT)
1472	ilock_flags \|= BTRFS_ILOCK_TRY;
1473
1474	/*
1475	* If the write DIO is within EOF, use a shared lock and also only if
1476	* security bits will likely not be dropped by file_remove_privs() called
1477	* from btrfs_write_check(). Either will need to be rechecked after the
1478	* lock was acquired.
1479	*/
1480	if (iocb->ki_pos + iov_iter_count(i: from) <= i_size_read(inode) && IS_NOSEC(inode))
1481	ilock_flags \|= BTRFS_ILOCK_SHARED;
1482
1483	relock:
1484	err = btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags);
1485	if (err < `0`)
1486	return err;
1487
1488	/ Shared lock cannot be used with security bits set. /
1489	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1490	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1491	ilock_flags &= ~BTRFS_ILOCK_SHARED;
1492	goto relock;
1493	}
1494
1495	err = generic_write_checks(iocb, from);
1496	if (err <= `0`) {
1497	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1498	return err;
1499	}
1500
1501	err = btrfs_write_check(iocb, from, count: err);
1502	if (err < `0`) {
1503	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1504	goto out;
1505	}
1506
1507	pos = iocb->ki_pos;
1508	/*
1509	* Re-check since file size may have changed just before taking the
1510	* lock or pos may have changed because of O_APPEND in generic_write_check()
1511	*/
1512	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1513	pos + iov_iter_count(i: from) > i_size_read(inode)) {
1514	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1515	ilock_flags &= ~BTRFS_ILOCK_SHARED;
1516	goto relock;
1517	}
1518
1519	if (check_direct_IO(fs_info, iter: from, offset: pos)) {
1520	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1521	goto buffered;
1522	}
1523
1524	/*
1525	* The iov_iter can be mapped to the same file range we are writing to.
1526	* If that's the case, then we will deadlock in the iomap code, because
1527	* it first calls our callback btrfs_dio_iomap_begin(), which will create
1528	* an ordered extent, and after that it will fault in the pages that the
1529	* iov_iter refers to. During the fault in we end up in the readahead
1530	* pages code (starting at btrfs_readahead()), which will lock the range,
1531	* find that ordered extent and then wait for it to complete (at
1532	* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1533	* obviously the ordered extent can never complete as we didn't submit
1534	* yet the respective bio(s). This always happens when the buffer is
1535	* memory mapped to the same file range, since the iomap DIO code always
1536	* invalidates pages in the target file range (after starting and waiting
1537	* for any writeback).
1538	*
1539	* So here we disable page faults in the iov_iter and then retry if we
1540	* got -EFAULT, faulting in the pages before the retry.
1541	*/
1542	from->nofault = true;
1543	dio = btrfs_dio_write(iocb, iter: from, done_before: written);
1544	from->nofault = false;
1545
1546	/*
1547	* iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1548	* iocb, and that needs to lock the inode. So unlock it before calling
1549	* iomap_dio_complete() to avoid a deadlock.
1550	*/
1551	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags);
1552
1553	if (IS_ERR_OR_NULL(ptr: dio))
1554	err = PTR_ERR_OR_ZERO(ptr: dio);
1555	else
1556	err = iomap_dio_complete(dio);
1557
1558	/ No increment (+=) because iomap returns a cumulative value. /
1559	if (err > `0`)
1560	written = err;
1561
1562	if (iov_iter_count(i: from) > `0` && (err == -EFAULT \|\| err > `0`)) {
1563	const size_t left = iov_iter_count(i: from);
1564	/*
1565	* We have more data left to write. Try to fault in as many as
1566	* possible of the remainder pages and retry. We do this without
1567	* releasing and locking again the inode, to prevent races with
1568	* truncate.
1569	*
1570	* Also, in case the iov refers to pages in the file range of the
1571	* file we want to write to (due to a mmap), we could enter an
1572	* infinite loop if we retry after faulting the pages in, since
1573	* iomap will invalidate any pages in the range early on, before
1574	* it tries to fault in the pages of the iov. So we keep track of
1575	* how much was left of iov in the previous EFAULT and fallback
1576	* to buffered IO in case we haven't made any progress.
1577	*/
1578	if (left == prev_left) {
1579	err = -ENOTBLK;
1580	} else {
1581	fault_in_iov_iter_readable(i: from, bytes: left);
1582	prev_left = left;
1583	goto relock;
1584	}
1585	}
1586
1587	/*
1588	* If 'err' is -ENOTBLK or we have not written all data, then it means
1589	* we must fallback to buffered IO.
1590	*/
1591	if ((err < `0` && err != -ENOTBLK) \|\| !iov_iter_count(i: from))
1592	goto out;
1593
1594	buffered:
1595	/*
1596	* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1597	* it must retry the operation in a context where blocking is acceptable,
1598	* because even if we end up not blocking during the buffered IO attempt
1599	* below, we will block when flushing and waiting for the IO.
1600	*/
1601	if (iocb->ki_flags & IOCB_NOWAIT) {
1602	err = -EAGAIN;
1603	goto out;
1604	}
1605
1606	pos = iocb->ki_pos;
1607	written_buffered = btrfs_buffered_write(iocb, i: from);
1608	if (written_buffered < `0`) {
1609	err = written_buffered;
1610	goto out;
1611	}
1612	/*
1613	* Ensure all data is persisted. We want the next direct IO read to be
1614	* able to read what was just written.
1615	*/
1616	endbyte = pos + written_buffered - `1`;
1617	err = btrfs_fdatawrite_range(inode, start: pos, end: endbyte);
1618	if (err)
1619	goto out;
1620	err = filemap_fdatawait_range(inode->i_mapping, lstart: pos, lend: endbyte);
1621	if (err)
1622	goto out;
1623	written += written_buffered;
1624	iocb->ki_pos = pos + written_buffered;
1625	invalidate_mapping_pages(mapping: file->f_mapping, start: pos >> PAGE_SHIFT,
1626	end: endbyte >> PAGE_SHIFT);
1627	out:
1628	return err < `0` ? err : written;
1629	}
1630
1631	static ssize_t btrfs_encoded_write(struct kiocb iocb, struct* iov_iter *from,
1632	const struct btrfs_ioctl_encoded_io_args *encoded)
1633	{
1634	struct file *file = iocb->ki_filp;
1635	struct inode *inode = file_inode(f: file);
1636	loff_t count;
1637	ssize_t ret;
1638
1639	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: `0`);
1640	count = encoded->len;
1641	ret = generic_write_checks_count(iocb, count: &count);
1642	if (ret == `0` && count != encoded->len) {
1643	/*
1644	* The write got truncated by generic_write_checks_count(). We
1645	* can't do a partial encoded write.
1646	*/
1647	ret = -EFBIG;
1648	}
1649	if (ret \|\| encoded->len == `0`)
1650	goto out;
1651
1652	ret = btrfs_write_check(iocb, from, count: encoded->len);
1653	if (ret < `0`)
1654	goto out;
1655
1656	ret = btrfs_do_encoded_write(iocb, from, encoded);
1657	out:
1658	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: `0`);
1659	return ret;
1660	}
1661
1662	ssize_t btrfs_do_write_iter(struct kiocb iocb, struct* iov_iter *from,
1663	const struct btrfs_ioctl_encoded_io_args *encoded)
1664	{
1665	struct file *file = iocb->ki_filp;
1666	struct btrfs_inode *inode = BTRFS_I(inode: file_inode(f: file));
1667	ssize_t num_written, num_sync;
1668
1669	/*
1670	* If the fs flips readonly due to some impossible error, although we
1671	* have opened a file as writable, we have to stop this write operation
1672	* to ensure consistency.
1673	*/
1674	if (BTRFS_FS_ERROR(inode->root->fs_info))
1675	return -EROFS;
1676
1677	if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1678	return -EOPNOTSUPP;
1679
1680	if (encoded) {
1681	num_written = btrfs_encoded_write(iocb, from, encoded);
1682	num_sync = encoded->len;
1683	} else if (iocb->ki_flags & IOCB_DIRECT) {
1684	num_written = btrfs_direct_write(iocb, from);
1685	num_sync = num_written;
1686	} else {
1687	num_written = btrfs_buffered_write(iocb, i: from);
1688	num_sync = num_written;
1689	}
1690
1691	btrfs_set_inode_last_sub_trans(inode);
1692
1693	if (num_sync > `0`) {
1694	num_sync = generic_write_sync(iocb, count: num_sync);
1695	if (num_sync < `0`)
1696	num_written = num_sync;
1697	}
1698
1699	return num_written;
1700	}
1701
1702	static ssize_t btrfs_file_write_iter(struct kiocb iocb, struct* iov_iter *from)
1703	{
1704	return btrfs_do_write_iter(iocb, from, NULL);
1705	}
1706
1707	int btrfs_release_file(struct inode inode, struct* file *filp)
1708	{
1709	struct btrfs_file_private *private = filp->private_data;
1710
1711	if (private) {
1712	kfree(objp: private->filldir_buf);
1713	free_extent_state(state: private->llseek_cached_state);
1714	kfree(objp: private);
1715	filp->private_data = NULL;
1716	}
1717
1718	/*
1719	* Set by setattr when we are about to truncate a file from a non-zero
1720	* size to a zero size. This tries to flush down new bytes that may
1721	* have been written if the application were using truncate to replace
1722	* a file in place.
1723	*/
1724	if (test_and_clear_bit(nr: BTRFS_INODE_FLUSH_ON_CLOSE,
1725	addr: &BTRFS_I(inode)->runtime_flags))
1726	filemap_flush(inode->i_mapping);
1727	return `0`;
1728	}
1729
1730	static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1731	{
1732	int ret;
1733	struct blk_plug plug;
1734
1735	/*
1736	* This is only called in fsync, which would do synchronous writes, so
1737	* a plug can merge adjacent IOs as much as possible. Esp. in case of
1738	* multiple disks using raid profile, a large IO can be split to
1739	* several segments of stripe length (currently 64K).
1740	*/
1741	blk_start_plug(&plug);
1742	ret = btrfs_fdatawrite_range(inode, start, end);
1743	blk_finish_plug(&plug);
1744
1745	return ret;
1746	}
1747
1748	static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1749	{
1750	struct btrfs_inode *inode = BTRFS_I(inode: ctx->inode);
1751	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1752
1753	if (btrfs_inode_in_log(inode, generation: btrfs_get_fs_generation(fs_info)) &&
1754	list_empty(head: &ctx->ordered_extents))
1755	return true;
1756
1757	/*
1758	* If we are doing a fast fsync we can not bail out if the inode's
1759	* last_trans is <= then the last committed transaction, because we only
1760	* update the last_trans of the inode during ordered extent completion,
1761	* and for a fast fsync we don't wait for that, we only wait for the
1762	* writeback to complete.
1763	*/
1764	if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) &&
1765	(test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) \|\|
1766	list_empty(head: &ctx->ordered_extents)))
1767	return true;
1768
1769	return false;
1770	}
1771
1772	/*
1773	* fsync call for both files and directories. This logs the inode into
1774	* the tree log instead of forcing full commits whenever possible.
1775	*
1776	* It needs to call filemap_fdatawait so that all ordered extent updates are
1777	* in the metadata btree are up to date for copying to the log.
1778	*
1779	* It drops the inode mutex before doing the tree log commit. This is an
1780	* important optimization for directories because holding the mutex prevents
1781	* new operations on the dir while we write to disk.
1782	*/
1783	int btrfs_sync_file(struct file file, loff_t start, loff_t end, int* datasync)
1784	{
1785	struct dentry *dentry = file_dentry(file);
1786	struct inode *inode = d_inode(dentry);
1787	struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
1788	struct btrfs_root *root = BTRFS_I(inode)->root;
1789	struct btrfs_trans_handle *trans;
1790	struct btrfs_log_ctx ctx;
1791	int ret = `0`, err;
1792	u64 len;
1793	bool full_sync;
1794
1795	trace_btrfs_sync_file(file, datasync);
1796
1797	btrfs_init_log_ctx(ctx: &ctx, inode);
1798
1799	/*
1800	* Always set the range to a full range, otherwise we can get into
1801	* several problems, from missing file extent items to represent holes
1802	* when not using the NO_HOLES feature, to log tree corruption due to
1803	* races between hole detection during logging and completion of ordered
1804	* extents outside the range, to missing checksums due to ordered extents
1805	* for which we flushed only a subset of their pages.
1806	*/
1807	start = `0`;
1808	end = LLONG_MAX;
1809	len = (u64)LLONG_MAX + `1`;
1810
1811	/*
1812	* We write the dirty pages in the range and wait until they complete
1813	* out of the ->i_mutex. If so, we can flush the dirty pages by
1814	* multi-task, and make the performance up. See
1815	* btrfs_wait_ordered_range for an explanation of the ASYNC check.
1816	*/
1817	ret = start_ordered_ops(inode, start, end);
1818	if (ret)
1819	goto out;
1820
1821	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1822
1823	atomic_inc(v: &root->log_batch);
1824
1825	/*
1826	* Before we acquired the inode's lock and the mmap lock, someone may
1827	* have dirtied more pages in the target range. We need to make sure
1828	* that writeback for any such pages does not start while we are logging
1829	* the inode, because if it does, any of the following might happen when
1830	* we are not doing a full inode sync:
1831	*
1832	* 1) We log an extent after its writeback finishes but before its
1833	* checksums are added to the csum tree, leading to -EIO errors
1834	* when attempting to read the extent after a log replay.
1835	*
1836	* 2) We can end up logging an extent before its writeback finishes.
1837	* Therefore after the log replay we will have a file extent item
1838	* pointing to an unwritten extent (and no data checksums as well).
1839	*
1840	* So trigger writeback for any eventual new dirty pages and then we
1841	* wait for all ordered extents to complete below.
1842	*/
1843	ret = start_ordered_ops(inode, start, end);
1844	if (ret) {
1845	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1846	goto out;
1847	}
1848
1849	/*
1850	* Always check for the full sync flag while holding the inode's lock,
1851	* to avoid races with other tasks. The flag must be either set all the
1852	* time during logging or always off all the time while logging.
1853	* We check the flag here after starting delalloc above, because when
1854	* running delalloc the full sync flag may be set if we need to drop
1855	* extra extent map ranges due to temporary memory allocation failures.
1856	*/
1857	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1858	&BTRFS_I(inode)->runtime_flags);
1859
1860	/*
1861	* We have to do this here to avoid the priority inversion of waiting on
1862	* IO of a lower priority task while holding a transaction open.
1863	*
1864	* For a full fsync we wait for the ordered extents to complete while
1865	* for a fast fsync we wait just for writeback to complete, and then
1866	* attach the ordered extents to the transaction so that a transaction
1867	* commit waits for their completion, to avoid data loss if we fsync,
1868	* the current transaction commits before the ordered extents complete
1869	* and a power failure happens right after that.
1870	*
1871	* For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1872	* logical address recorded in the ordered extent may change. We need
1873	* to wait for the IO to stabilize the logical address.
1874	*/
1875	if (full_sync \|\| btrfs_is_zoned(fs_info)) {
1876	ret = btrfs_wait_ordered_range(inode, start, len);
1877	} else {
1878	/*
1879	* Get our ordered extents as soon as possible to avoid doing
1880	* checksum lookups in the csum tree, and use instead the
1881	* checksums attached to the ordered extents.
1882	*/
1883	btrfs_get_ordered_extents_for_logging(inode: BTRFS_I(inode),
1884	list: &ctx.ordered_extents);
1885	ret = filemap_fdatawait_range(inode->i_mapping, lstart: start, lend: end);
1886	}
1887
1888	if (ret)
1889	goto out_release_extents;
1890
1891	atomic_inc(v: &root->log_batch);
1892
1893	if (skip_inode_logging(ctx: &ctx)) {
1894	/*
1895	* We've had everything committed since the last time we were
1896	* modified so clear this flag in case it was set for whatever
1897	* reason, it's no longer relevant.
1898	*/
1899	clear_bit(nr: BTRFS_INODE_NEEDS_FULL_SYNC,
1900	addr: &BTRFS_I(inode)->runtime_flags);
1901	/*
1902	* An ordered extent might have started before and completed
1903	* already with io errors, in which case the inode was not
1904	* updated and we end up here. So check the inode's mapping
1905	* for any errors that might have happened since we last
1906	* checked called fsync.
1907	*/
1908	ret = filemap_check_wb_err(mapping: inode->i_mapping, since: file->f_wb_err);
1909	goto out_release_extents;
1910	}
1911
1912	/*
1913	* We use start here because we will need to wait on the IO to complete
1914	* in btrfs_sync_log, which could require joining a transaction (for
1915	* example checking cross references in the nocow path). If we use join
1916	* here we could get into a situation where we're waiting on IO to
1917	* happen that is blocked on a transaction trying to commit. With start
1918	* we inc the extwriter counter, so we wait for all extwriters to exit
1919	* before we start blocking joiners. This comment is to keep somebody
1920	* from thinking they are super smart and changing this to
1921	* btrfs_join_transaction coughJosefcough.
1922	*/
1923	trans = btrfs_start_transaction(root, num_items: `0`);
1924	if (IS_ERR(ptr: trans)) {
1925	ret = PTR_ERR(ptr: trans);
1926	goto out_release_extents;
1927	}
1928	trans->in_fsync = true;
1929
1930	ret = btrfs_log_dentry_safe(trans, dentry, ctx: &ctx);
1931	btrfs_release_log_ctx_extents(ctx: &ctx);
1932	if (ret < `0`) {
1933	/ Fallthrough and commit/free transaction. /
1934	ret = BTRFS_LOG_FORCE_COMMIT;
1935	}
1936
1937	/ we've logged all the items and now have a consistent*
1938	* version of the file in the log. It is possible that
1939	* someone will come in and modify the file, but that's
1940	* fine because the log is consistent on disk, and we
1941	* have references to all of the file's extents
1942	*
1943	* It is possible that someone will come in and log the
1944	* file again, but that will end up using the synchronization
1945	* inside btrfs_sync_log to keep things safe.
1946	*/
1947	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
1948
1949	if (ret == BTRFS_NO_LOG_SYNC) {
1950	ret = btrfs_end_transaction(trans);
1951	goto out;
1952	}
1953
1954	/ We successfully logged the inode, attempt to sync the log. /
1955	if (!ret) {
1956	ret = btrfs_sync_log(trans, root, ctx: &ctx);
1957	if (!ret) {
1958	ret = btrfs_end_transaction(trans);
1959	goto out;
1960	}
1961	}
1962
1963	/*
1964	* At this point we need to commit the transaction because we had
1965	* btrfs_need_log_full_commit() or some other error.
1966	*
1967	* If we didn't do a full sync we have to stop the trans handle, wait on
1968	* the ordered extents, start it again and commit the transaction. If
1969	* we attempt to wait on the ordered extents here we could deadlock with
1970	* something like fallocate() that is holding the extent lock trying to
1971	* start a transaction while some other thread is trying to commit the
1972	* transaction while we (fsync) are currently holding the transaction
1973	* open.
1974	*/
1975	if (!full_sync) {
1976	ret = btrfs_end_transaction(trans);
1977	if (ret)
1978	goto out;
1979	ret = btrfs_wait_ordered_range(inode, start, len);
1980	if (ret)
1981	goto out;
1982
1983	/*
1984	* This is safe to use here because we're only interested in
1985	* making sure the transaction that had the ordered extents is
1986	* committed. We aren't waiting on anything past this point,
1987	* we're purely getting the transaction and committing it.
1988	*/
1989	trans = btrfs_attach_transaction_barrier(root);
1990	if (IS_ERR(ptr: trans)) {
1991	ret = PTR_ERR(ptr: trans);
1992
1993	/*
1994	* We committed the transaction and there's no currently
1995	* running transaction, this means everything we care
1996	* about made it to disk and we are done.
1997	*/
1998	if (ret == -ENOENT)
1999	ret = `0`;
2000	goto out;
2001	}
2002	}
2003
2004	ret = btrfs_commit_transaction(trans);
2005	out:
2006	ASSERT(list_empty(&ctx.list));
2007	ASSERT(list_empty(&ctx.conflict_inodes));
2008	err = file_check_and_advance_wb_err(file);
2009	if (!ret)
2010	ret = err;
2011	return ret > `0` ? -EIO : ret;
2012
2013	out_release_extents:
2014	btrfs_release_log_ctx_extents(ctx: &ctx);
2015	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2016	goto out;
2017	}
2018
2019	static const struct vm_operations_struct btrfs_file_vm_ops = {
2020	.fault = filemap_fault,
2021	.map_pages = filemap_map_pages,
2022	.page_mkwrite = btrfs_page_mkwrite,
2023	};
2024
2025	static int btrfs_file_mmap(struct file filp, struct* vm_area_struct *vma)
2026	{
2027	struct address_space *mapping = filp->f_mapping;
2028
2029	if (!mapping->a_ops->read_folio)
2030	return -ENOEXEC;
2031
2032	file_accessed(file: filp);
2033	vma->vm_ops = &btrfs_file_vm_ops;
2034
2035	return `0`;
2036	}
2037
2038	static int hole_mergeable(struct btrfs_inode inode, struct* extent_buffer *leaf,
2039	int slot, u64 start, u64 end)
2040	{
2041	struct btrfs_file_extent_item *fi;
2042	struct btrfs_key key;
2043
2044	if (slot < `0` \|\| slot >= btrfs_header_nritems(eb: leaf))
2045	return `0`;
2046
2047	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
2048	if (key.objectid != btrfs_ino(inode) \|\|
2049	key.type != BTRFS_EXTENT_DATA_KEY)
2050	return `0`;
2051
2052	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2053
2054	if (btrfs_file_extent_type(eb: leaf, s: fi) != BTRFS_FILE_EXTENT_REG)
2055	return `0`;
2056
2057	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: fi))
2058	return `0`;
2059
2060	if (key.offset == end)
2061	return `1`;
2062	if (key.offset + btrfs_file_extent_num_bytes(eb: leaf, s: fi) == start)
2063	return `1`;
2064	return `0`;
2065	}
2066
2067	static int fill_holes(struct btrfs_trans_handle *trans,
2068	struct btrfs_inode *inode,
2069	struct btrfs_path *path, u64 offset, u64 end)
2070	{
2071	struct btrfs_fs_info *fs_info = trans->fs_info;
2072	struct btrfs_root *root = inode->root;
2073	struct extent_buffer *leaf;
2074	struct btrfs_file_extent_item *fi;
2075	struct extent_map *hole_em;
2076	struct btrfs_key key;
2077	int ret;
2078
2079	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2080	goto out;
2081
2082	key.objectid = btrfs_ino(inode);
2083	key.type = BTRFS_EXTENT_DATA_KEY;
2084	key.offset = offset;
2085
2086	ret = btrfs_search_slot(trans, root, key: &key, p: path, ins_len: `0`, cow: `1`);
2087	if (ret <= `0`) {
2088	/*
2089	* We should have dropped this offset, so if we find it then
2090	* something has gone horribly wrong.
2091	*/
2092	if (ret == `0`)
2093	ret = -EINVAL;
2094	return ret;
2095	}
2096
2097	leaf = path->nodes[`0`];
2098	if (hole_mergeable(inode, leaf, slot: path->slots[`0`] - `1`, start: offset, end)) {
2099	u64 num_bytes;
2100
2101	path->slots[`0`]--;
2102	fi = btrfs_item_ptr(leaf, path->slots[`0`],
2103	struct btrfs_file_extent_item);
2104	num_bytes = btrfs_file_extent_num_bytes(eb: leaf, s: fi) +
2105	end - offset;
2106	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi, val: num_bytes);
2107	btrfs_set_file_extent_ram_bytes(eb: leaf, s: fi, val: num_bytes);
2108	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: `0`);
2109	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
2110	btrfs_mark_buffer_dirty(trans, buf: leaf);
2111	goto out;
2112	}
2113
2114	if (hole_mergeable(inode, leaf, slot: path->slots[`0`], start: offset, end)) {
2115	u64 num_bytes;
2116
2117	key.offset = offset;
2118	btrfs_set_item_key_safe(trans, path, new_key: &key);
2119	fi = btrfs_item_ptr(leaf, path->slots[`0`],
2120	struct btrfs_file_extent_item);
2121	num_bytes = btrfs_file_extent_num_bytes(eb: leaf, s: fi) + end -
2122	offset;
2123	btrfs_set_file_extent_num_bytes(eb: leaf, s: fi, val: num_bytes);
2124	btrfs_set_file_extent_ram_bytes(eb: leaf, s: fi, val: num_bytes);
2125	btrfs_set_file_extent_offset(eb: leaf, s: fi, val: `0`);
2126	btrfs_set_file_extent_generation(eb: leaf, s: fi, val: trans->transid);
2127	btrfs_mark_buffer_dirty(trans, buf: leaf);
2128	goto out;
2129	}
2130	btrfs_release_path(p: path);
2131
2132	ret = btrfs_insert_hole_extent(trans, root, objectid: btrfs_ino(inode), pos: offset,
2133	num_bytes: end - offset);
2134	if (ret)
2135	return ret;
2136
2137	out:
2138	btrfs_release_path(p: path);
2139
2140	hole_em = alloc_extent_map();
2141	if (!hole_em) {
2142	btrfs_drop_extent_map_range(inode, start: offset, end: end - `1`, skip_pinned: false);
2143	btrfs_set_inode_full_sync(inode);
2144	} else {
2145	hole_em->start = offset;
2146	hole_em->len = end - offset;
2147	hole_em->ram_bytes = hole_em->len;
2148	hole_em->orig_start = offset;
2149
2150	hole_em->block_start = EXTENT_MAP_HOLE;
2151	hole_em->block_len = `0`;
2152	hole_em->orig_block_len = `0`;
2153	hole_em->compress_type = BTRFS_COMPRESS_NONE;
2154	hole_em->generation = trans->transid;
2155
2156	ret = btrfs_replace_extent_map_range(inode, new_em: hole_em, modified: true);
2157	free_extent_map(em: hole_em);
2158	if (ret)
2159	btrfs_set_inode_full_sync(inode);
2160	}
2161
2162	return `0`;
2163	}
2164
2165	/*
2166	* Find a hole extent on given inode and change start/len to the end of hole
2167	* extent.(hole/vacuum extent whose em->start <= start &&
2168	* em->start + em->len > start)
2169	* When a hole extent is found, return 1 and modify start/len.
2170	*/
2171	static int find_first_non_hole(struct btrfs_inode inode, u64 start, u64 *len)
2172	{
2173	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2174	struct extent_map *em;
2175	int ret = `0`;
2176
2177	em = btrfs_get_extent(inode, NULL, pg_offset: `0`,
2178	round_down(*start, fs_info->sectorsize),
2179	round_up(*len, fs_info->sectorsize));
2180	if (IS_ERR(ptr: em))
2181	return PTR_ERR(ptr: em);
2182
2183	/ Hole or vacuum extent(only exists in no-hole mode) /
2184	if (em->block_start == EXTENT_MAP_HOLE) {
2185	ret = `1`;
2186	len = em->start + em->len > start + *len ?
2187	`0` : start + len - em->start - em->len;
2188	*start = em->start + em->len;
2189	}
2190	free_extent_map(em);
2191	return ret;
2192	}
2193
2194	static void btrfs_punch_hole_lock_range(struct inode *inode,
2195	const u64 lockstart,
2196	const u64 lockend,
2197	struct extent_state **cached_state)
2198	{
2199	/*
2200	* For subpage case, if the range is not at page boundary, we could
2201	* have pages at the leading/tailing part of the range.
2202	* This could lead to dead loop since filemap_range_has_page()
2203	* will always return true.
2204	* So here we need to do extra page alignment for
2205	* filemap_range_has_page().
2206	*/
2207	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2208	const u64 page_lockend = round_down(lockend + `1`, PAGE_SIZE) - `1`;
2209
2210	while (`1`) {
2211	truncate_pagecache_range(inode, offset: lockstart, end: lockend);
2212
2213	lock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2214	cached: cached_state);
2215	/*
2216	* We can't have ordered extents in the range, nor dirty/writeback
2217	* pages, because we have locked the inode's VFS lock in exclusive
2218	* mode, we have locked the inode's i_mmap_lock in exclusive mode,
2219	* we have flushed all delalloc in the range and we have waited
2220	* for any ordered extents in the range to complete.
2221	* We can race with anyone reading pages from this range, so after
2222	* locking the range check if we have pages in the range, and if
2223	* we do, unlock the range and retry.
2224	*/
2225	if (!filemap_range_has_page(inode->i_mapping, lstart: page_lockstart,
2226	lend: page_lockend))
2227	break;
2228
2229	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2230	cached: cached_state);
2231	}
2232
2233	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode), start: lockstart, end: lockend);
2234	}
2235
2236	static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2237	struct btrfs_inode *inode,
2238	struct btrfs_path *path,
2239	struct btrfs_replace_extent_info *extent_info,
2240	const u64 replace_len,
2241	const u64 bytes_to_drop)
2242	{
2243	struct btrfs_fs_info *fs_info = trans->fs_info;
2244	struct btrfs_root *root = inode->root;
2245	struct btrfs_file_extent_item *extent;
2246	struct extent_buffer *leaf;
2247	struct btrfs_key key;
2248	int slot;
2249	struct btrfs_ref ref = { `0` };
2250	int ret;
2251
2252	if (replace_len == `0`)
2253	return `0`;
2254
2255	if (extent_info->disk_offset == `0` &&
2256	btrfs_fs_incompat(fs_info, NO_HOLES)) {
2257	btrfs_update_inode_bytes(inode, add_bytes: `0`, del_bytes: bytes_to_drop);
2258	return `0`;
2259	}
2260
2261	key.objectid = btrfs_ino(inode);
2262	key.type = BTRFS_EXTENT_DATA_KEY;
2263	key.offset = extent_info->file_offset;
2264	ret = btrfs_insert_empty_item(trans, root, path, key: &key,
2265	data_size: sizeof(struct btrfs_file_extent_item));
2266	if (ret)
2267	return ret;
2268	leaf = path->nodes[`0`];
2269	slot = path->slots[`0`];
2270	write_extent_buffer(eb: leaf, src: extent_info->extent_buf,
2271	btrfs_item_ptr_offset(leaf, slot),
2272	len: sizeof(struct btrfs_file_extent_item));
2273	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2274	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2275	btrfs_set_file_extent_offset(eb: leaf, s: extent, val: extent_info->data_offset);
2276	btrfs_set_file_extent_num_bytes(eb: leaf, s: extent, val: replace_len);
2277	if (extent_info->is_new_extent)
2278	btrfs_set_file_extent_generation(eb: leaf, s: extent, val: trans->transid);
2279	btrfs_mark_buffer_dirty(trans, buf: leaf);
2280	btrfs_release_path(p: path);
2281
2282	ret = btrfs_inode_set_file_extent_range(inode, start: extent_info->file_offset,
2283	len: replace_len);
2284	if (ret)
2285	return ret;
2286
2287	/ If it's a hole, nothing more needs to be done. /
2288	if (extent_info->disk_offset == `0`) {
2289	btrfs_update_inode_bytes(inode, add_bytes: `0`, del_bytes: bytes_to_drop);
2290	return `0`;
2291	}
2292
2293	btrfs_update_inode_bytes(inode, add_bytes: replace_len, del_bytes: bytes_to_drop);
2294
2295	if (extent_info->is_new_extent && extent_info->insertions == `0`) {
2296	key.objectid = extent_info->disk_offset;
2297	key.type = BTRFS_EXTENT_ITEM_KEY;
2298	key.offset = extent_info->disk_len;
2299	ret = btrfs_alloc_reserved_file_extent(trans, root,
2300	owner: btrfs_ino(inode),
2301	offset: extent_info->file_offset,
2302	ram_bytes: extent_info->qgroup_reserved,
2303	ins: &key);
2304	} else {
2305	u64 ref_offset;
2306
2307	btrfs_init_generic_ref(generic_ref: &ref, action: BTRFS_ADD_DELAYED_REF,
2308	bytenr: extent_info->disk_offset,
2309	len: extent_info->disk_len, parent: `0`,
2310	owning_root: root->root_key.objectid);
2311	ref_offset = extent_info->file_offset - extent_info->data_offset;
2312	btrfs_init_data_ref(generic_ref: &ref, ref_root: root->root_key.objectid,
2313	ino: btrfs_ino(inode), offset: ref_offset, mod_root: `0`, skip_qgroup: false);
2314	ret = btrfs_inc_extent_ref(trans, generic_ref: &ref);
2315	}
2316
2317	extent_info->insertions++;
2318
2319	return ret;
2320	}
2321
2322	/*
2323	* The respective range must have been previously locked, as well as the inode.
2324	* The end offset is inclusive (last byte of the range).
2325	* @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2326	* the file range with an extent.
2327	* When not punching a hole, we don't want to end up in a state where we dropped
2328	* extents without inserting a new one, so we must abort the transaction to avoid
2329	* a corruption.
2330	*/
2331	int btrfs_replace_file_extents(struct btrfs_inode *inode,
2332	struct btrfs_path path, const* u64 start,
2333	const u64 end,
2334	struct btrfs_replace_extent_info *extent_info,
2335	struct btrfs_trans_handle **trans_out)
2336	{
2337	struct btrfs_drop_extents_args drop_args = { `0` };
2338	struct btrfs_root *root = inode->root;
2339	struct btrfs_fs_info *fs_info = root->fs_info;
2340	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
2341	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2342	struct btrfs_trans_handle *trans = NULL;
2343	struct btrfs_block_rsv *rsv;
2344	unsigned int rsv_count;
2345	u64 cur_offset;
2346	u64 len = end - start;
2347	int ret = `0`;
2348
2349	if (end <= start)
2350	return -EINVAL;
2351
2352	rsv = btrfs_alloc_block_rsv(fs_info, type: BTRFS_BLOCK_RSV_TEMP);
2353	if (!rsv) {
2354	ret = -ENOMEM;
2355	goto out;
2356	}
2357	rsv->size = btrfs_calc_insert_metadata_size(fs_info, num_items: `1`);
2358	rsv->failfast = true;
2359
2360	/*
2361	* 1 - update the inode
2362	* 1 - removing the extents in the range
2363	* 1 - adding the hole extent if no_holes isn't set or if we are
2364	* replacing the range with a new extent
2365	*/
2366	if (!btrfs_fs_incompat(fs_info, NO_HOLES) \|\| extent_info)
2367	rsv_count = `3`;
2368	else
2369	rsv_count = `2`;
2370
2371	trans = btrfs_start_transaction(root, num_items: rsv_count);
2372	if (IS_ERR(ptr: trans)) {
2373	ret = PTR_ERR(ptr: trans);
2374	trans = NULL;
2375	goto out_free;
2376	}
2377
2378	ret = btrfs_block_rsv_migrate(src_rsv: &fs_info->trans_block_rsv, dst_rsv: rsv,
2379	num_bytes: min_size, update_size: false);
2380	if (WARN_ON(ret))
2381	goto out_trans;
2382	trans->block_rsv = rsv;
2383
2384	cur_offset = start;
2385	drop_args.path = path;
2386	drop_args.end = end + `1`;
2387	drop_args.drop_cache = true;
2388	while (cur_offset < end) {
2389	drop_args.start = cur_offset;
2390	ret = btrfs_drop_extents(trans, root, inode, args: &drop_args);
2391	/ If we are punching a hole decrement the inode's byte count /
2392	if (!extent_info)
2393	btrfs_update_inode_bytes(inode, add_bytes: `0`,
2394	del_bytes: drop_args.bytes_found);
2395	if (ret != -ENOSPC) {
2396	/*
2397	* The only time we don't want to abort is if we are
2398	* attempting to clone a partial inline extent, in which
2399	* case we'll get EOPNOTSUPP. However if we aren't
2400	* clone we need to abort no matter what, because if we
2401	* got EOPNOTSUPP via prealloc then we messed up and
2402	* need to abort.
2403	*/
2404	if (ret &&
2405	(ret != -EOPNOTSUPP \|\|
2406	(extent_info && extent_info->is_new_extent)))
2407	btrfs_abort_transaction(trans, ret);
2408	break;
2409	}
2410
2411	trans->block_rsv = &fs_info->trans_block_rsv;
2412
2413	if (!extent_info && cur_offset < drop_args.drop_end &&
2414	cur_offset < ino_size) {
2415	ret = fill_holes(trans, inode, path, offset: cur_offset,
2416	end: drop_args.drop_end);
2417	if (ret) {
2418	/*
2419	* If we failed then we didn't insert our hole
2420	* entries for the area we dropped, so now the
2421	* fs is corrupted, so we must abort the
2422	* transaction.
2423	*/
2424	btrfs_abort_transaction(trans, ret);
2425	break;
2426	}
2427	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2428	/*
2429	* We are past the i_size here, but since we didn't
2430	* insert holes we need to clear the mapped area so we
2431	* know to not set disk_i_size in this area until a new
2432	* file extent is inserted here.
2433	*/
2434	ret = btrfs_inode_clear_file_extent_range(inode,
2435	start: cur_offset,
2436	len: drop_args.drop_end - cur_offset);
2437	if (ret) {
2438	/*
2439	* We couldn't clear our area, so we could
2440	* presumably adjust up and corrupt the fs, so
2441	* we need to abort.
2442	*/
2443	btrfs_abort_transaction(trans, ret);
2444	break;
2445	}
2446	}
2447
2448	if (extent_info &&
2449	drop_args.drop_end > extent_info->file_offset) {
2450	u64 replace_len = drop_args.drop_end -
2451	extent_info->file_offset;
2452
2453	ret = btrfs_insert_replace_extent(trans, inode, path,
2454	extent_info, replace_len,
2455	bytes_to_drop: drop_args.bytes_found);
2456	if (ret) {
2457	btrfs_abort_transaction(trans, ret);
2458	break;
2459	}
2460	extent_info->data_len -= replace_len;
2461	extent_info->data_offset += replace_len;
2462	extent_info->file_offset += replace_len;
2463	}
2464
2465	/*
2466	* We are releasing our handle on the transaction, balance the
2467	* dirty pages of the btree inode and flush delayed items, and
2468	* then get a new transaction handle, which may now point to a
2469	* new transaction in case someone else may have committed the
2470	* transaction we used to replace/drop file extent items. So
2471	* bump the inode's iversion and update mtime and ctime except
2472	* if we are called from a dedupe context. This is because a
2473	* power failure/crash may happen after the transaction is
2474	* committed and before we finish replacing/dropping all the
2475	* file extent items we need.
2476	*/
2477	inode_inc_iversion(inode: &inode->vfs_inode);
2478
2479	if (!extent_info \|\| extent_info->update_times)
2480	inode_set_mtime_to_ts(inode: &inode->vfs_inode,
2481	ts: inode_set_ctime_current(inode: &inode->vfs_inode));
2482
2483	ret = btrfs_update_inode(trans, inode);
2484	if (ret)
2485	break;
2486
2487	btrfs_end_transaction(trans);
2488	btrfs_btree_balance_dirty(fs_info);
2489
2490	trans = btrfs_start_transaction(root, num_items: rsv_count);
2491	if (IS_ERR(ptr: trans)) {
2492	ret = PTR_ERR(ptr: trans);
2493	trans = NULL;
2494	break;
2495	}
2496
2497	ret = btrfs_block_rsv_migrate(src_rsv: &fs_info->trans_block_rsv,
2498	dst_rsv: rsv, num_bytes: min_size, update_size: false);
2499	if (WARN_ON(ret))
2500	break;
2501	trans->block_rsv = rsv;
2502
2503	cur_offset = drop_args.drop_end;
2504	len = end - cur_offset;
2505	if (!extent_info && len) {
2506	ret = find_first_non_hole(inode, start: &cur_offset, len: &len);
2507	if (unlikely(ret < `0`))
2508	break;
2509	if (ret && !len) {
2510	ret = `0`;
2511	break;
2512	}
2513	}
2514	}
2515
2516	/*
2517	* If we were cloning, force the next fsync to be a full one since we
2518	* we replaced (or just dropped in the case of cloning holes when
2519	* NO_HOLES is enabled) file extent items and did not setup new extent
2520	* maps for the replacement extents (or holes).
2521	*/
2522	if (extent_info && !extent_info->is_new_extent)
2523	btrfs_set_inode_full_sync(inode);
2524
2525	if (ret)
2526	goto out_trans;
2527
2528	trans->block_rsv = &fs_info->trans_block_rsv;
2529	/*
2530	* If we are using the NO_HOLES feature we might have had already an
2531	* hole that overlaps a part of the region [lockstart, lockend] and
2532	* ends at (or beyond) lockend. Since we have no file extent items to
2533	* represent holes, drop_end can be less than lockend and so we must
2534	* make sure we have an extent map representing the existing hole (the
2535	* call to __btrfs_drop_extents() might have dropped the existing extent
2536	* map representing the existing hole), otherwise the fast fsync path
2537	* will not record the existence of the hole region
2538	* [existing_hole_start, lockend].
2539	*/
2540	if (drop_args.drop_end <= end)
2541	drop_args.drop_end = end + `1`;
2542	/*
2543	* Don't insert file hole extent item if it's for a range beyond eof
2544	* (because it's useless) or if it represents a 0 bytes range (when
2545	* cur_offset == drop_end).
2546	*/
2547	if (!extent_info && cur_offset < ino_size &&
2548	cur_offset < drop_args.drop_end) {
2549	ret = fill_holes(trans, inode, path, offset: cur_offset,
2550	end: drop_args.drop_end);
2551	if (ret) {
2552	/ Same comment as above. /
2553	btrfs_abort_transaction(trans, ret);
2554	goto out_trans;
2555	}
2556	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2557	/ See the comment in the loop above for the reasoning here. /
2558	ret = btrfs_inode_clear_file_extent_range(inode, start: cur_offset,
2559	len: drop_args.drop_end - cur_offset);
2560	if (ret) {
2561	btrfs_abort_transaction(trans, ret);
2562	goto out_trans;
2563	}
2564
2565	}
2566	if (extent_info) {
2567	ret = btrfs_insert_replace_extent(trans, inode, path,
2568	extent_info, replace_len: extent_info->data_len,
2569	bytes_to_drop: drop_args.bytes_found);
2570	if (ret) {
2571	btrfs_abort_transaction(trans, ret);
2572	goto out_trans;
2573	}
2574	}
2575
2576	out_trans:
2577	if (!trans)
2578	goto out_free;
2579
2580	trans->block_rsv = &fs_info->trans_block_rsv;
2581	if (ret)
2582	btrfs_end_transaction(trans);
2583	else
2584	*trans_out = trans;
2585	out_free:
2586	btrfs_free_block_rsv(fs_info, rsv);
2587	out:
2588	return ret;
2589	}
2590
2591	static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2592	{
2593	struct inode *inode = file_inode(f: file);
2594	struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
2595	struct btrfs_root *root = BTRFS_I(inode)->root;
2596	struct extent_state *cached_state = NULL;
2597	struct btrfs_path *path;
2598	struct btrfs_trans_handle *trans = NULL;
2599	u64 lockstart;
2600	u64 lockend;
2601	u64 tail_start;
2602	u64 tail_len;
2603	u64 orig_start = offset;
2604	int ret = `0`;
2605	bool same_block;
2606	u64 ino_size;
2607	bool truncated_block = false;
2608	bool updated_inode = false;
2609
2610	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2611
2612	ret = btrfs_wait_ordered_range(inode, start: offset, len);
2613	if (ret)
2614	goto out_only_mutex;
2615
2616	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2617	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &offset, len: &len);
2618	if (ret < `0`)
2619	goto out_only_mutex;
2620	if (ret && !len) {
2621	/ Already in a large hole /
2622	ret = `0`;
2623	goto out_only_mutex;
2624	}
2625
2626	ret = file_modified(file);
2627	if (ret)
2628	goto out_only_mutex;
2629
2630	lockstart = round_up(offset, fs_info->sectorsize);
2631	lockend = round_down(offset + len, fs_info->sectorsize) - `1`;
2632	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2633	== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - `1`));
2634	/*
2635	* We needn't truncate any block which is beyond the end of the file
2636	* because we are sure there is no data there.
2637	*/
2638	/*
2639	* Only do this if we are in the same block and we aren't doing the
2640	* entire block.
2641	*/
2642	if (same_block && len < fs_info->sectorsize) {
2643	if (offset < ino_size) {
2644	truncated_block = true;
2645	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len,
2646	front: `0`);
2647	} else {
2648	ret = `0`;
2649	}
2650	goto out_only_mutex;
2651	}
2652
2653	/ zero back part of the first block /
2654	if (offset < ino_size) {
2655	truncated_block = true;
2656	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len: `0`, front: `0`);
2657	if (ret) {
2658	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2659	return ret;
2660	}
2661	}
2662
2663	/ Check the aligned pages after the first unaligned page,*
2664	* if offset != orig_start, which means the first unaligned page
2665	* including several following pages are already in holes,
2666	* the extra check can be skipped */
2667	if (offset == orig_start) {
2668	/ after truncate page, check hole again /
2669	len = offset + len - lockstart;
2670	offset = lockstart;
2671	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &offset, len: &len);
2672	if (ret < `0`)
2673	goto out_only_mutex;
2674	if (ret && !len) {
2675	ret = `0`;
2676	goto out_only_mutex;
2677	}
2678	lockstart = offset;
2679	}
2680
2681	/ Check the tail unaligned part is in a hole /
2682	tail_start = lockend + `1`;
2683	tail_len = offset + len - tail_start;
2684	if (tail_len) {
2685	ret = find_first_non_hole(inode: BTRFS_I(inode), start: &tail_start, len: &tail_len);
2686	if (unlikely(ret < `0`))
2687	goto out_only_mutex;
2688	if (!ret) {
2689	/ zero the front end of the last page /
2690	if (tail_start + tail_len < ino_size) {
2691	truncated_block = true;
2692	ret = btrfs_truncate_block(inode: BTRFS_I(inode),
2693	from: tail_start + tail_len,
2694	len: `0`, front: `1`);
2695	if (ret)
2696	goto out_only_mutex;
2697	}
2698	}
2699	}
2700
2701	if (lockend < lockstart) {
2702	ret = `0`;
2703	goto out_only_mutex;
2704	}
2705
2706	btrfs_punch_hole_lock_range(inode, lockstart, lockend, cached_state: &cached_state);
2707
2708	path = btrfs_alloc_path();
2709	if (!path) {
2710	ret = -ENOMEM;
2711	goto out;
2712	}
2713
2714	ret = btrfs_replace_file_extents(inode: BTRFS_I(inode), path, start: lockstart,
2715	end: lockend, NULL, trans_out: &trans);
2716	btrfs_free_path(p: path);
2717	if (ret)
2718	goto out;
2719
2720	ASSERT(trans != NULL);
2721	inode_inc_iversion(inode);
2722	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
2723	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2724	updated_inode = true;
2725	btrfs_end_transaction(trans);
2726	btrfs_btree_balance_dirty(fs_info);
2727	out:
2728	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
2729	cached: &cached_state);
2730	out_only_mutex:
2731	if (!updated_inode && truncated_block && !ret) {
2732	/*
2733	* If we only end up zeroing part of a page, we still need to
2734	* update the inode item, so that all the time fields are
2735	* updated as well as the necessary btrfs inode in memory fields
2736	* for detecting, at fsync time, if the inode isn't yet in the
2737	* log tree or it's there but not up to date.
2738	*/
2739	struct timespec64 now = inode_set_ctime_current(inode);
2740
2741	inode_inc_iversion(inode);
2742	inode_set_mtime_to_ts(inode, ts: now);
2743	trans = btrfs_start_transaction(root, num_items: `1`);
2744	if (IS_ERR(ptr: trans)) {
2745	ret = PTR_ERR(ptr: trans);
2746	} else {
2747	int ret2;
2748
2749	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2750	ret2 = btrfs_end_transaction(trans);
2751	if (!ret)
2752	ret = ret2;
2753	}
2754	}
2755	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
2756	return ret;
2757	}
2758
2759	/ Helper structure to record which range is already reserved /
2760	struct falloc_range {
2761	struct list_head list;
2762	u64 start;
2763	u64 len;
2764	};
2765
2766	/*
2767	* Helper function to add falloc range
2768	*
2769	* Caller should have locked the larger range of extent containing
2770	* [start, len)
2771	*/
2772	static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2773	{
2774	struct falloc_range *range = NULL;
2775
2776	if (!list_empty(head)) {
2777	/*
2778	* As fallocate iterates by bytenr order, we only need to check
2779	* the last range.
2780	*/
2781	range = list_last_entry(head, struct falloc_range, list);
2782	if (range->start + range->len == start) {
2783	range->len += len;
2784	return `0`;
2785	}
2786	}
2787
2788	range = kmalloc(size: sizeof(*range), GFP_KERNEL);
2789	if (!range)
2790	return -ENOMEM;
2791	range->start = start;
2792	range->len = len;
2793	list_add_tail(new: &range->list, head);
2794	return `0`;
2795	}
2796
2797	static int btrfs_fallocate_update_isize(struct inode *inode,
2798	const u64 end,
2799	const int mode)
2800	{
2801	struct btrfs_trans_handle *trans;
2802	struct btrfs_root *root = BTRFS_I(inode)->root;
2803	int ret;
2804	int ret2;
2805
2806	if (mode & FALLOC_FL_KEEP_SIZE \|\| end <= i_size_read(inode))
2807	return `0`;
2808
2809	trans = btrfs_start_transaction(root, num_items: `1`);
2810	if (IS_ERR(ptr: trans))
2811	return PTR_ERR(ptr: trans);
2812
2813	inode_set_ctime_current(inode);
2814	i_size_write(inode, i_size: end);
2815	btrfs_inode_safe_disk_i_size_write(inode: BTRFS_I(inode), new_i_size: `0`);
2816	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
2817	ret2 = btrfs_end_transaction(trans);
2818
2819	return ret ? ret : ret2;
2820	}
2821
/*
 * Classification of the extent found at a range boundary, as returned by
 * btrfs_zero_range_check_range_boundary().
 */
enum {
	/* Neither a hole nor a prealloc extent. */
	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
	/* Extent map has EXTENT_FLAG_PREALLOC set. */
	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
	/* Extent map's block_start is EXTENT_MAP_HOLE. */
	RANGE_BOUNDARY_HOLE = 2,
};
2827
2828	static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2829	u64 offset)
2830	{
2831	const u64 sectorsize = inode->root->fs_info->sectorsize;
2832	struct extent_map *em;
2833	int ret;
2834
2835	offset = round_down(offset, sectorsize);
2836	em = btrfs_get_extent(inode, NULL, pg_offset: `0`, start: offset, end: sectorsize);
2837	if (IS_ERR(ptr: em))
2838	return PTR_ERR(ptr: em);
2839
2840	if (em->block_start == EXTENT_MAP_HOLE)
2841	ret = RANGE_BOUNDARY_HOLE;
2842	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2843	ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2844	else
2845	ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2846
2847	free_extent_map(em);
2848	return ret;
2849	}
2850
2851	static int btrfs_zero_range(struct inode *inode,
2852	loff_t offset,
2853	loff_t len,
2854	const int mode)
2855	{
2856	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2857	struct extent_map *em;
2858	struct extent_changeset *data_reserved = NULL;
2859	int ret;
2860	u64 alloc_hint = `0`;
2861	const u64 sectorsize = fs_info->sectorsize;
2862	u64 alloc_start = round_down(offset, sectorsize);
2863	u64 alloc_end = round_up(offset + len, sectorsize);
2864	u64 bytes_to_reserve = `0`;
2865	bool space_reserved = false;
2866
2867	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, pg_offset: `0`, start: alloc_start,
2868	end: alloc_end - alloc_start);
2869	if (IS_ERR(ptr: em)) {
2870	ret = PTR_ERR(ptr: em);
2871	goto out;
2872	}
2873
2874	/*
2875	* Avoid hole punching and extent allocation for some cases. More cases
2876	* could be considered, but these are unlikely common and we keep things
2877	* as simple as possible for now. Also, intentionally, if the target
2878	* range contains one or more prealloc extents together with regular
2879	* extents and holes, we drop all the existing extents and allocate a
2880	* new prealloc extent, so that we get a larger contiguous disk extent.
2881	*/
2882	if (em->start <= alloc_start &&
2883	test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2884	const u64 em_end = em->start + em->len;
2885
2886	if (em_end >= offset + len) {
2887	/*
2888	* The whole range is already a prealloc extent,
2889	* do nothing except updating the inode's i_size if
2890	* needed.
2891	*/
2892	free_extent_map(em);
2893	ret = btrfs_fallocate_update_isize(inode, end: offset + len,
2894	mode);
2895	goto out;
2896	}
2897	/*
2898	* Part of the range is already a prealloc extent, so operate
2899	* only on the remaining part of the range.
2900	*/
2901	alloc_start = em_end;
2902	ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2903	len = offset + len - alloc_start;
2904	offset = alloc_start;
2905	alloc_hint = em->block_start + em->len;
2906	}
2907	free_extent_map(em);
2908
2909	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2910	BTRFS_BYTES_TO_BLKS(fs_info, offset + len - `1`)) {
2911	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, pg_offset: `0`, start: alloc_start,
2912	end: sectorsize);
2913	if (IS_ERR(ptr: em)) {
2914	ret = PTR_ERR(ptr: em);
2915	goto out;
2916	}
2917
2918	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2919	free_extent_map(em);
2920	ret = btrfs_fallocate_update_isize(inode, end: offset + len,
2921	mode);
2922	goto out;
2923	}
2924	if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
2925	free_extent_map(em);
2926	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len,
2927	front: `0`);
2928	if (!ret)
2929	ret = btrfs_fallocate_update_isize(inode,
2930	end: offset + len,
2931	mode);
2932	return ret;
2933	}
2934	free_extent_map(em);
2935	alloc_start = round_down(offset, sectorsize);
2936	alloc_end = alloc_start + sectorsize;
2937	goto reserve_space;
2938	}
2939
2940	alloc_start = round_up(offset, sectorsize);
2941	alloc_end = round_down(offset + len, sectorsize);
2942
2943	/*
2944	* For unaligned ranges, check the pages at the boundaries, they might
2945	* map to an extent, in which case we need to partially zero them, or
2946	* they might map to a hole, in which case we need our allocation range
2947	* to cover them.
2948	*/
2949	if (!IS_ALIGNED(offset, sectorsize)) {
2950	ret = btrfs_zero_range_check_range_boundary(inode: BTRFS_I(inode),
2951	offset);
2952	if (ret < `0`)
2953	goto out;
2954	if (ret == RANGE_BOUNDARY_HOLE) {
2955	alloc_start = round_down(offset, sectorsize);
2956	ret = `0`;
2957	} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2958	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset, len: `0`, front: `0`);
2959	if (ret)
2960	goto out;
2961	} else {
2962	ret = `0`;
2963	}
2964	}
2965
2966	if (!IS_ALIGNED(offset + len, sectorsize)) {
2967	ret = btrfs_zero_range_check_range_boundary(inode: BTRFS_I(inode),
2968	offset: offset + len);
2969	if (ret < `0`)
2970	goto out;
2971	if (ret == RANGE_BOUNDARY_HOLE) {
2972	alloc_end = round_up(offset + len, sectorsize);
2973	ret = `0`;
2974	} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2975	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: offset + len,
2976	len: `0`, front: `1`);
2977	if (ret)
2978	goto out;
2979	} else {
2980	ret = `0`;
2981	}
2982	}
2983
2984	reserve_space:
2985	if (alloc_start < alloc_end) {
2986	struct extent_state *cached_state = NULL;
2987	const u64 lockstart = alloc_start;
2988	const u64 lockend = alloc_end - `1`;
2989
2990	bytes_to_reserve = alloc_end - alloc_start;
2991	ret = btrfs_alloc_data_chunk_ondemand(inode: BTRFS_I(inode),
2992	bytes: bytes_to_reserve);
2993	if (ret < `0`)
2994	goto out;
2995	space_reserved = true;
2996	btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2997	cached_state: &cached_state);
2998	ret = btrfs_qgroup_reserve_data(inode: BTRFS_I(inode), reserved: &data_reserved,
2999	start: alloc_start, len: bytes_to_reserve);
3000	if (ret) {
3001	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart,
3002	end: lockend, cached: &cached_state);
3003	goto out;
3004	}
3005	ret = btrfs_prealloc_file_range(inode, mode, start: alloc_start,
3006	num_bytes: alloc_end - alloc_start,
3007	min_size: i_blocksize(node: inode),
3008	actual_len: offset + len, alloc_hint: &alloc_hint);
3009	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: lockstart, end: lockend,
3010	cached: &cached_state);
3011	/ btrfs_prealloc_file_range releases reserved space on error /
3012	if (ret) {
3013	space_reserved = false;
3014	goto out;
3015	}
3016	}
3017	ret = btrfs_fallocate_update_isize(inode, end: offset + len, mode);
3018	out:
3019	if (ret && space_reserved)
3020	btrfs_free_reserved_data_space(inode: BTRFS_I(inode), reserved: data_reserved,
3021	start: alloc_start, len: bytes_to_reserve);
3022	extent_changeset_free(changeset: data_reserved);
3023
3024	return ret;
3025	}
3026
3027	static long btrfs_fallocate(struct file file, int* mode,
3028	loff_t offset, loff_t len)
3029	{
3030	struct inode *inode = file_inode(f: file);
3031	struct extent_state *cached_state = NULL;
3032	struct extent_changeset *data_reserved = NULL;
3033	struct falloc_range *range;
3034	struct falloc_range *tmp;
3035	LIST_HEAD(reserve_list);
3036	u64 cur_offset;
3037	u64 last_byte;
3038	u64 alloc_start;
3039	u64 alloc_end;
3040	u64 alloc_hint = `0`;
3041	u64 locked_end;
3042	u64 actual_end = `0`;
3043	u64 data_space_needed = `0`;
3044	u64 data_space_reserved = `0`;
3045	u64 qgroup_reserved = `0`;
3046	struct extent_map *em;
3047	int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3048	int ret;
3049
3050	/ Do not allow fallocate in ZONED mode /
3051	if (btrfs_is_zoned(fs_info: btrfs_sb(sb: inode->i_sb)))
3052	return -EOPNOTSUPP;
3053
3054	alloc_start = round_down(offset, blocksize);
3055	alloc_end = round_up(offset + len, blocksize);
3056	cur_offset = alloc_start;
3057
3058	/ Make sure we aren't being give some crap mode /
3059	if (mode & ~(FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE \|
3060	FALLOC_FL_ZERO_RANGE))
3061	return -EOPNOTSUPP;
3062
3063	if (mode & FALLOC_FL_PUNCH_HOLE)
3064	return btrfs_punch_hole(file, offset, len);
3065
3066	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3067
3068	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3069	ret = inode_newsize_ok(inode, offset: offset + len);
3070	if (ret)
3071	goto out;
3072	}
3073
3074	ret = file_modified(file);
3075	if (ret)
3076	goto out;
3077
3078	/*
3079	* TODO: Move these two operations after we have checked
3080	* accurate reserved space, or fallocate can still fail but
3081	* with page truncated or size expanded.
3082	*
3083	* But that's a minor problem and won't do much harm BTW.
3084	*/
3085	if (alloc_start > inode->i_size) {
3086	ret = btrfs_cont_expand(inode: BTRFS_I(inode), oldsize: i_size_read(inode),
3087	size: alloc_start);
3088	if (ret)
3089	goto out;
3090	} else if (offset + len > inode->i_size) {
3091	/*
3092	* If we are fallocating from the end of the file onward we
3093	* need to zero out the end of the block if i_size lands in the
3094	* middle of a block.
3095	*/
3096	ret = btrfs_truncate_block(inode: BTRFS_I(inode), from: inode->i_size, len: `0`, front: `0`);
3097	if (ret)
3098	goto out;
3099	}
3100
3101	/*
3102	* We have locked the inode at the VFS level (in exclusive mode) and we
3103	* have locked the i_mmap_lock lock (in exclusive mode). Now before
3104	* locking the file range, flush all dealloc in the range and wait for
3105	* all ordered extents in the range to complete. After this we can lock
3106	* the file range and, due to the previous locking we did, we know there
3107	* can't be more delalloc or ordered extents in the range.
3108	*/
3109	ret = btrfs_wait_ordered_range(inode, start: alloc_start,
3110	len: alloc_end - alloc_start);
3111	if (ret)
3112	goto out;
3113
3114	if (mode & FALLOC_FL_ZERO_RANGE) {
3115	ret = btrfs_zero_range(inode, offset, len, mode);
3116	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3117	return ret;
3118	}
3119
3120	locked_end = alloc_end - `1`;
3121	lock_extent(tree: &BTRFS_I(inode)->io_tree, start: alloc_start, end: locked_end,
3122	cached: &cached_state);
3123
3124	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode), start: alloc_start, end: locked_end);
3125
3126	/ First, check if we exceed the qgroup limit /
3127	while (cur_offset < alloc_end) {
3128	em = btrfs_get_extent(inode: BTRFS_I(inode), NULL, pg_offset: `0`, start: cur_offset,
3129	end: alloc_end - cur_offset);
3130	if (IS_ERR(ptr: em)) {
3131	ret = PTR_ERR(ptr: em);
3132	break;
3133	}
3134	last_byte = min(extent_map_end(em), alloc_end);
3135	actual_end = min_t(u64, extent_map_end(em), offset + len);
3136	last_byte = ALIGN(last_byte, blocksize);
3137	if (em->block_start == EXTENT_MAP_HOLE \|\|
3138	(cur_offset >= inode->i_size &&
3139	!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3140	const u64 range_len = last_byte - cur_offset;
3141
3142	ret = add_falloc_range(head: &reserve_list, start: cur_offset, len: range_len);
3143	if (ret < `0`) {
3144	free_extent_map(em);
3145	break;
3146	}
3147	ret = btrfs_qgroup_reserve_data(inode: BTRFS_I(inode),
3148	reserved: &data_reserved, start: cur_offset, len: range_len);
3149	if (ret < `0`) {
3150	free_extent_map(em);
3151	break;
3152	}
3153	qgroup_reserved += range_len;
3154	data_space_needed += range_len;
3155	}
3156	free_extent_map(em);
3157	cur_offset = last_byte;
3158	}
3159
3160	if (!ret && data_space_needed > `0`) {
3161	/*
3162	* We are safe to reserve space here as we can't have delalloc
3163	* in the range, see above.
3164	*/
3165	ret = btrfs_alloc_data_chunk_ondemand(inode: BTRFS_I(inode),
3166	bytes: data_space_needed);
3167	if (!ret)
3168	data_space_reserved = data_space_needed;
3169	}
3170
3171	/*
3172	* If ret is still 0, means we're OK to fallocate.
3173	* Or just cleanup the list and exit.
3174	*/
3175	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3176	if (!ret) {
3177	ret = btrfs_prealloc_file_range(inode, mode,
3178	start: range->start,
3179	num_bytes: range->len, min_size: i_blocksize(node: inode),
3180	actual_len: offset + len, alloc_hint: &alloc_hint);
3181	/*
3182	* btrfs_prealloc_file_range() releases space even
3183	* if it returns an error.
3184	*/
3185	data_space_reserved -= range->len;
3186	qgroup_reserved -= range->len;
3187	} else if (data_space_reserved > `0`) {
3188	btrfs_free_reserved_data_space(inode: BTRFS_I(inode),
3189	reserved: data_reserved, start: range->start,
3190	len: range->len);
3191	data_space_reserved -= range->len;
3192	qgroup_reserved -= range->len;
3193	} else if (qgroup_reserved > `0`) {
3194	btrfs_qgroup_free_data(inode: BTRFS_I(inode), reserved: data_reserved,
3195	start: range->start, len: range->len);
3196	qgroup_reserved -= range->len;
3197	}
3198	list_del(entry: &range->list);
3199	kfree(objp: range);
3200	}
3201	if (ret < `0`)
3202	goto out_unlock;
3203
3204	/*
3205	* We didn't need to allocate any more space, but we still extended the
3206	* size of the file so we need to update i_size and the inode item.
3207	*/
3208	ret = btrfs_fallocate_update_isize(inode, end: actual_end, mode);
3209	out_unlock:
3210	unlock_extent(tree: &BTRFS_I(inode)->io_tree, start: alloc_start, end: locked_end,
3211	cached: &cached_state);
3212	out:
3213	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_MMAP);
3214	extent_changeset_free(changeset: data_reserved);
3215	return ret;
3216	}
3217
3218	/*
3219	* Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3220	* that has unflushed and/or flushing delalloc. There might be other adjacent
3221	* subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3222	* looping while it gets adjacent subranges, and merging them together.
3223	*/
3224	static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3225	struct extent_state **cached_state,
3226	bool *search_io_tree,
3227	u64 delalloc_start_ret, u64 delalloc_end_ret)
3228	{
3229	u64 len = end + `1` - start;
3230	u64 delalloc_len = `0`;
3231	struct btrfs_ordered_extent *oe;
3232	u64 oe_start;
3233	u64 oe_end;
3234
3235	/*
3236	* Search the io tree first for EXTENT_DELALLOC. If we find any, it
3237	* means we have delalloc (dirty pages) for which writeback has not
3238	* started yet.
3239	*/
3240	if (*search_io_tree) {
3241	spin_lock(lock: &inode->lock);
3242	if (inode->delalloc_bytes > `0`) {
3243	spin_unlock(lock: &inode->lock);
3244	*delalloc_start_ret = start;
3245	delalloc_len = count_range_bits(tree: &inode->io_tree,
3246	start: delalloc_start_ret, search_end: end,
3247	max_bytes: len, bits: EXTENT_DELALLOC, contig: `1`,
3248	cached_state);
3249	} else {
3250	spin_unlock(lock: &inode->lock);
3251	}
3252	}
3253
3254	if (delalloc_len > `0`) {
3255	/*
3256	* If delalloc was found then *delalloc_start_ret has a sector size
3257	* aligned value (rounded down).
3258	*/
3259	delalloc_end_ret = delalloc_start_ret + delalloc_len - `1`;
3260
3261	if (*delalloc_start_ret == start) {
3262	/ Delalloc for the whole range, nothing more to do. /
3263	if (*delalloc_end_ret == end)
3264	return true;
3265	/ Else trim our search range for ordered extents. /
3266	start = *delalloc_end_ret + `1`;
3267	len = end + `1` - start;
3268	}
3269	} else {
3270	/ No delalloc, future calls don't need to search again. /
3271	*search_io_tree = false;
3272	}
3273
3274	/*
3275	* Now also check if there's any ordered extent in the range.
3276	* We do this because:
3277	*
3278	* 1) When delalloc is flushed, the file range is locked, we clear the
3279	* EXTENT_DELALLOC bit from the io tree and create an extent map and
3280	* an ordered extent for the write. So we might just have been called
3281	* after delalloc is flushed and before the ordered extent completes
3282	* and inserts the new file extent item in the subvolume's btree;
3283	*
3284	* 2) We may have an ordered extent created by flushing delalloc for a
3285	* subrange that starts before the subrange we found marked with
3286	* EXTENT_DELALLOC in the io tree.
3287	*
3288	* We could also use the extent map tree to find such delalloc that is
3289	* being flushed, but using the ordered extents tree is more efficient
3290	* because it's usually much smaller as ordered extents are removed from
3291	* the tree once they complete. With the extent maps, we mau have them
3292	* in the extent map tree for a very long time, and they were either
3293	* created by previous writes or loaded by read operations.
3294	*/
3295	oe = btrfs_lookup_first_ordered_range(inode, file_offset: start, len);
3296	if (!oe)
3297	return (delalloc_len > `0`);
3298
3299	/ The ordered extent may span beyond our search range. /
3300	oe_start = max(oe->file_offset, start);
3301	oe_end = min(oe->file_offset + oe->num_bytes - `1`, end);
3302
3303	btrfs_put_ordered_extent(entry: oe);
3304
3305	/ Don't have unflushed delalloc, return the ordered extent range. /
3306	if (delalloc_len == `0`) {
3307	*delalloc_start_ret = oe_start;
3308	*delalloc_end_ret = oe_end;
3309	return true;
3310	}
3311
3312	/*
3313	* We have both unflushed delalloc (io_tree) and an ordered extent.
3314	* If the ranges are adjacent returned a combined range, otherwise
3315	* return the leftmost range.
3316	*/
3317	if (oe_start < *delalloc_start_ret) {
3318	if (oe_end < *delalloc_start_ret)
3319	*delalloc_end_ret = oe_end;
3320	*delalloc_start_ret = oe_start;
3321	} else if (*delalloc_end_ret + `1` == oe_start) {
3322	*delalloc_end_ret = oe_end;
3323	}
3324
3325	return true;
3326	}
3327
3328	/*
3329	* Check if there's delalloc in a given range.
3330	*
3331	* @inode: The inode.
3332	* @start: The start offset of the range. It does not need to be
3333	* sector size aligned.
3334	* @end: The end offset (inclusive value) of the search range.
3335	* It does not need to be sector size aligned.
3336	* @cached_state: Extent state record used for speeding up delalloc
3337	* searches in the inode's io_tree. Can be NULL.
3338	* @delalloc_start_ret: Output argument, set to the start offset of the
3339	* subrange found with delalloc (may not be sector size
3340	* aligned).
3341	* @delalloc_end_ret: Output argument, set to he end offset (inclusive value)
3342	* of the subrange found with delalloc.
3343	*
3344	* Returns true if a subrange with delalloc is found within the given range, and
3345	* if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3346	* end offsets of the subrange.
3347	*/
3348	bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3349	struct extent_state **cached_state,
3350	u64 delalloc_start_ret, u64 delalloc_end_ret)
3351	{
3352	u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3353	u64 prev_delalloc_end = `0`;
3354	bool search_io_tree = true;
3355	bool ret = false;
3356
3357	while (cur_offset <= end) {
3358	u64 delalloc_start;
3359	u64 delalloc_end;
3360	bool delalloc;
3361
3362	delalloc = find_delalloc_subrange(inode, start: cur_offset, end,
3363	cached_state, search_io_tree: &search_io_tree,
3364	delalloc_start_ret: &delalloc_start,
3365	delalloc_end_ret: &delalloc_end);
3366	if (!delalloc)
3367	break;
3368
3369	if (prev_delalloc_end == `0`) {
3370	/ First subrange found. /
3371	*delalloc_start_ret = max(delalloc_start, start);
3372	*delalloc_end_ret = delalloc_end;
3373	ret = true;
3374	} else if (delalloc_start == prev_delalloc_end + `1`) {
3375	/ Subrange adjacent to the previous one, merge them. /
3376	*delalloc_end_ret = delalloc_end;
3377	} else {
3378	/ Subrange not adjacent to the previous one, exit. /
3379	break;
3380	}
3381
3382	prev_delalloc_end = delalloc_end;
3383	cur_offset = delalloc_end + `1`;
3384	cond_resched();
3385	}
3386
3387	return ret;
3388	}
3389
3390	/*
3391	* Check if there's a hole or delalloc range in a range representing a hole (or
3392	* prealloc extent) found in the inode's subvolume btree.
3393	*
3394	* @inode: The inode.
3395	* @whence: Seek mode (SEEK_DATA or SEEK_HOLE).
3396	* @start: Start offset of the hole region. It does not need to be sector
3397	* size aligned.
3398	* @end: End offset (inclusive value) of the hole region. It does not
3399	* need to be sector size aligned.
3400	* @start_ret: Return parameter, used to set the start of the subrange in the
3401	* hole that matches the search criteria (seek mode), if such
3402	* subrange is found (return value of the function is true).
3403	* The value returned here may not be sector size aligned.
3404	*
3405	* Returns true if a subrange matching the given seek mode is found, and if one
3406	* is found, it updates @start_ret with the start of the subrange.
3407	*/
3408	static bool find_desired_extent_in_hole(struct btrfs_inode inode, int* whence,
3409	struct extent_state **cached_state,
3410	u64 start, u64 end, u64 *start_ret)
3411	{
3412	u64 delalloc_start;
3413	u64 delalloc_end;
3414	bool delalloc;
3415
3416	delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3417	delalloc_start_ret: &delalloc_start, delalloc_end_ret: &delalloc_end);
3418	if (delalloc && whence == SEEK_DATA) {
3419	*start_ret = delalloc_start;
3420	return true;
3421	}
3422
3423	if (delalloc && whence == SEEK_HOLE) {
3424	/*
3425	* We found delalloc but it starts after out start offset. So we
3426	* have a hole between our start offset and the delalloc start.
3427	*/
3428	if (start < delalloc_start) {
3429	*start_ret = start;
3430	return true;
3431	}
3432	/*
3433	* Delalloc range starts at our start offset.
3434	* If the delalloc range's length is smaller than our range,
3435	* then it means we have a hole that starts where the delalloc
3436	* subrange ends.
3437	*/
3438	if (delalloc_end < end) {
3439	*start_ret = delalloc_end + `1`;
3440	return true;
3441	}
3442
3443	/ There's delalloc for the whole range. /
3444	return false;
3445	}
3446
3447	if (!delalloc && whence == SEEK_HOLE) {
3448	*start_ret = start;
3449	return true;
3450	}
3451
3452	/*
3453	* No delalloc in the range and we are seeking for data. The caller has
3454	* to iterate to the next extent item in the subvolume btree.
3455	*/
3456	return false;
3457	}
3458
3459	static loff_t find_desired_extent(struct file file, loff_t offset, int* whence)
3460	{
3461	struct btrfs_inode *inode = BTRFS_I(inode: file->f_mapping->host);
3462	struct btrfs_file_private *private = file->private_data;
3463	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3464	struct extent_state *cached_state = NULL;
3465	struct extent_state **delalloc_cached_state;
3466	const loff_t i_size = i_size_read(inode: &inode->vfs_inode);
3467	const u64 ino = btrfs_ino(inode);
3468	struct btrfs_root *root = inode->root;
3469	struct btrfs_path *path;
3470	struct btrfs_key key;
3471	u64 last_extent_end;
3472	u64 lockstart;
3473	u64 lockend;
3474	u64 start;
3475	int ret;
3476	bool found = false;
3477
3478	if (i_size == `0` \|\| offset >= i_size)
3479	return -ENXIO;
3480
3481	/*
3482	* Quick path. If the inode has no prealloc extents and its number of
3483	* bytes used matches its i_size, then it can not have holes.
3484	*/
3485	if (whence == SEEK_HOLE &&
3486	!(inode->flags & BTRFS_INODE_PREALLOC) &&
3487	inode_get_bytes(inode: &inode->vfs_inode) == i_size)
3488	return i_size;
3489
3490	if (!private) {
3491	private = kzalloc(size: sizeof(*private), GFP_KERNEL);
3492	/*
3493	* No worries if memory allocation failed.
3494	* The private structure is used only for speeding up multiple
3495	* lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3496	* so everything will still be correct.
3497	*/
3498	file->private_data = private;
3499	}
3500
3501	if (private)
3502	delalloc_cached_state = &private->llseek_cached_state;
3503	else
3504	delalloc_cached_state = NULL;
3505
3506	/*
3507	* offset can be negative, in this case we start finding DATA/HOLE from
3508	* the very start of the file.
3509	*/
3510	start = max_t(loff_t, `0`, offset);
3511
3512	lockstart = round_down(start, fs_info->sectorsize);
3513	lockend = round_up(i_size, fs_info->sectorsize);
3514	if (lockend <= lockstart)
3515	lockend = lockstart + fs_info->sectorsize;
3516	lockend--;
3517
3518	path = btrfs_alloc_path();
3519	if (!path)
3520	return -ENOMEM;
3521	path->reada = READA_FORWARD;
3522
3523	key.objectid = ino;
3524	key.type = BTRFS_EXTENT_DATA_KEY;
3525	key.offset = start;
3526
3527	last_extent_end = lockstart;
3528
3529	lock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
3530
3531	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
3532	if (ret < `0`) {
3533	goto out;
3534	} else if (ret > `0` && path->slots[`0`] > `0`) {
3535	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
3536	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3537	path->slots[`0`]--;
3538	}
3539
3540	while (start < i_size) {
3541	struct extent_buffer *leaf = path->nodes[`0`];
3542	struct btrfs_file_extent_item *extent;
3543	u64 extent_end;
3544	u8 type;
3545
3546	if (path->slots[`0`] >= btrfs_header_nritems(eb: leaf)) {
3547	ret = btrfs_next_leaf(root, path);
3548	if (ret < `0`)
3549	goto out;
3550	else if (ret > `0`)
3551	break;
3552
3553	leaf = path->nodes[`0`];
3554	}
3555
3556	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
3557	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
3558	break;
3559
3560	extent_end = btrfs_file_extent_end(path);
3561
3562	/*
3563	* In the first iteration we may have a slot that points to an
3564	* extent that ends before our start offset, so skip it.
3565	*/
3566	if (extent_end <= start) {
3567	path->slots[`0`]++;
3568	continue;
3569	}
3570
3571	/ We have an implicit hole, NO_HOLES feature is likely set. /
3572	if (last_extent_end < key.offset) {
3573	u64 search_start = last_extent_end;
3574	u64 found_start;
3575
3576	/*
3577	* First iteration, @start matches @offset and it's
3578	* within the hole.
3579	*/
3580	if (start == offset)
3581	search_start = offset;
3582
3583	found = find_desired_extent_in_hole(inode, whence,
3584	cached_state: delalloc_cached_state,
3585	start: search_start,
3586	end: key.offset - `1`,
3587	start_ret: &found_start);
3588	if (found) {
3589	start = found_start;
3590	break;
3591	}
3592	/*
3593	* Didn't find data or a hole (due to delalloc) in the
3594	* implicit hole range, so need to analyze the extent.
3595	*/
3596	}
3597
3598	extent = btrfs_item_ptr(leaf, path->slots[`0`],
3599	struct btrfs_file_extent_item);
3600	type = btrfs_file_extent_type(eb: leaf, s: extent);
3601
3602	/*
3603	* Can't access the extent's disk_bytenr field if this is an
3604	* inline extent, since at that offset, it's where the extent
3605	* data starts.
3606	*/
3607	if (type == BTRFS_FILE_EXTENT_PREALLOC \|\|
3608	(type == BTRFS_FILE_EXTENT_REG &&
3609	btrfs_file_extent_disk_bytenr(eb: leaf, s: extent) == `0`)) {
3610	/*
3611	* Explicit hole or prealloc extent, search for delalloc.
3612	* A prealloc extent is treated like a hole.
3613	*/
3614	u64 search_start = key.offset;
3615	u64 found_start;
3616
3617	/*
3618	* First iteration, @start matches @offset and it's
3619	* within the hole.
3620	*/
3621	if (start == offset)
3622	search_start = offset;
3623
3624	found = find_desired_extent_in_hole(inode, whence,
3625	cached_state: delalloc_cached_state,
3626	start: search_start,
3627	end: extent_end - `1`,
3628	start_ret: &found_start);
3629	if (found) {
3630	start = found_start;
3631	break;
3632	}
3633	/*
3634	* Didn't find data or a hole (due to delalloc) in the
3635	* implicit hole range, so need to analyze the next
3636	* extent item.
3637	*/
3638	} else {
3639	/*
3640	* Found a regular or inline extent.
3641	* If we are seeking for data, adjust the start offset
3642	* and stop, we're done.
3643	*/
3644	if (whence == SEEK_DATA) {
3645	start = max_t(u64, key.offset, offset);
3646	found = true;
3647	break;
3648	}
3649	/*
3650	* Else, we are seeking for a hole, check the next file
3651	* extent item.
3652	*/
3653	}
3654
3655	start = extent_end;
3656	last_extent_end = extent_end;
3657	path->slots[`0`]++;
3658	if (fatal_signal_pending(current)) {
3659	ret = -EINTR;
3660	goto out;
3661	}
3662	cond_resched();
3663	}
3664
3665	/ We have an implicit hole from the last extent found up to i_size. /
3666	if (!found && start < i_size) {
3667	found = find_desired_extent_in_hole(inode, whence,
3668	cached_state: delalloc_cached_state, start,
3669	end: i_size - `1`, start_ret: &start);
3670	if (!found)
3671	start = i_size;
3672	}
3673
3674	out:
3675	unlock_extent(tree: &inode->io_tree, start: lockstart, end: lockend, cached: &cached_state);
3676	btrfs_free_path(p: path);
3677
3678	if (ret < `0`)
3679	return ret;
3680
3681	if (whence == SEEK_DATA && start >= i_size)
3682	return -ENXIO;
3683
3684	return min_t(loff_t, start, i_size);
3685	}
3686
3687	static loff_t btrfs_file_llseek(struct file file, loff_t offset, int* whence)
3688	{
3689	struct inode *inode = file->f_mapping->host;
3690
3691	switch (whence) {
3692	default:
3693	return generic_file_llseek(file, offset, whence);
3694	case SEEK_DATA:
3695	case SEEK_HOLE:
3696	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3697	offset = find_desired_extent(file, offset, whence);
3698	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3699	break;
3700	}
3701
3702	if (offset < `0`)
3703	return offset;
3704
3705	return vfs_setpos(file, offset, maxsize: inode->i_sb->s_maxbytes);
3706	}
3707
3708	static int btrfs_file_open(struct inode inode, struct* file *filp)
3709	{
3710	int ret;
3711
3712	filp->f_mode \|= FMODE_NOWAIT \| FMODE_BUF_RASYNC \| FMODE_BUF_WASYNC \|
3713	FMODE_CAN_ODIRECT;
3714
3715	ret = fsverity_file_open(inode, filp);
3716	if (ret)
3717	return ret;
3718	return generic_file_open(inode, filp);
3719	}
3720
3721	static int check_direct_read(struct btrfs_fs_info *fs_info,
3722	const struct iov_iter *iter, loff_t offset)
3723	{
3724	int ret;
3725	int i, seg;
3726
3727	ret = check_direct_IO(fs_info, iter, offset);
3728	if (ret < `0`)
3729	return ret;
3730
3731	if (!iter_is_iovec(i: iter))
3732	return `0`;
3733
3734	for (seg = `0`; seg < iter->nr_segs; seg++) {
3735	for (i = seg + `1`; i < iter->nr_segs; i++) {
3736	const struct iovec *iov1 = iter_iov(iter) + seg;
3737	const struct iovec *iov2 = iter_iov(iter) + i;
3738
3739	if (iov1->iov_base == iov2->iov_base)
3740	return -EINVAL;
3741	}
3742	}
3743	return `0`;
3744	}
3745
3746	static ssize_t btrfs_direct_read(struct kiocb iocb, struct* iov_iter *to)
3747	{
3748	struct inode *inode = file_inode(f: iocb->ki_filp);
3749	size_t prev_left = `0`;
3750	ssize_t read = `0`;
3751	ssize_t ret;
3752
3753	if (fsverity_active(inode))
3754	return `0`;
3755
3756	if (check_direct_read(fs_info: btrfs_sb(sb: inode->i_sb), iter: to, offset: iocb->ki_pos))
3757	return `0`;
3758
3759	btrfs_inode_lock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3760	again:
3761	/*
3762	* This is similar to what we do for direct IO writes, see the comment
3763	* at btrfs_direct_write(), but we also disable page faults in addition
3764	* to disabling them only at the iov_iter level. This is because when
3765	* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3766	* which can still trigger page fault ins despite having set ->nofault
3767	* to true of our 'to' iov_iter.
3768	*
3769	* The difference to direct IO writes is that we deadlock when trying
3770	* to lock the extent range in the inode's tree during he page reads
3771	* triggered by the fault in (while for writes it is due to waiting for
3772	* our own ordered extent). This is because for direct IO reads,
3773	* btrfs_dio_iomap_begin() returns with the extent range locked, which
3774	* is only unlocked in the endio callback (end_bio_extent_readpage()).
3775	*/
3776	pagefault_disable();
3777	to->nofault = true;
3778	ret = btrfs_dio_read(iocb, iter: to, done_before: read);
3779	to->nofault = false;
3780	pagefault_enable();
3781
3782	/ No increment (+=) because iomap returns a cumulative value. /
3783	if (ret > `0`)
3784	read = ret;
3785
3786	if (iov_iter_count(i: to) > `0` && (ret == -EFAULT \|\| ret > `0`)) {
3787	const size_t left = iov_iter_count(i: to);
3788
3789	if (left == prev_left) {
3790	/*
3791	* We didn't make any progress since the last attempt,
3792	* fallback to a buffered read for the remainder of the
3793	* range. This is just to avoid any possibility of looping
3794	* for too long.
3795	*/
3796	ret = read;
3797	} else {
3798	/*
3799	* We made some progress since the last retry or this is
3800	* the first time we are retrying. Fault in as many pages
3801	* as possible and retry.
3802	*/
3803	fault_in_iov_iter_writeable(i: to, bytes: left);
3804	prev_left = left;
3805	goto again;
3806	}
3807	}
3808	btrfs_inode_unlock(inode: BTRFS_I(inode), ilock_flags: BTRFS_ILOCK_SHARED);
3809	return ret < `0` ? ret : read;
3810	}
3811
3812	static ssize_t btrfs_file_read_iter(struct kiocb iocb, struct* iov_iter *to)
3813	{
3814	ssize_t ret = `0`;
3815
3816	if (iocb->ki_flags & IOCB_DIRECT) {
3817	ret = btrfs_direct_read(iocb, to);
3818	if (ret < `0` \|\| !iov_iter_count(i: to) \|\|
3819	iocb->ki_pos >= i_size_read(inode: file_inode(f: iocb->ki_filp)))
3820	return ret;
3821	}
3822
3823	return filemap_read(iocb, to, already_read: ret);
3824	}
3825
3826	const struct file_operations btrfs_file_operations = {
3827	.llseek = btrfs_file_llseek,
3828	.read_iter = btrfs_file_read_iter,
3829	.splice_read = filemap_splice_read,
3830	.write_iter = btrfs_file_write_iter,
3831	.splice_write = iter_file_splice_write,
3832	.mmap = btrfs_file_mmap,
3833	.open = btrfs_file_open,
3834	.release = btrfs_release_file,
3835	.get_unmapped_area = thp_get_unmapped_area,
3836	.fsync = btrfs_sync_file,
3837	.fallocate = btrfs_fallocate,
3838	.unlocked_ioctl = btrfs_ioctl,
3839	#ifdef CONFIG_COMPAT
3840	.compat_ioctl = btrfs_compat_ioctl,
3841	#endif
3842	.remap_file_range = btrfs_remap_file_range,
3843	};
3844
3845	int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3846	{
3847	int ret;
3848
3849	/*
3850	* So with compression we will find and lock a dirty page and clear the
3851	* first one as dirty, setup an async extent, and immediately return
3852	* with the entire range locked but with nobody actually marked with
3853	* writeback. So we can't just filemap_write_and_wait_range() and
3854	* expect it to work since it will just kick off a thread to do the
3855	* actual work. So we need to call filemap_fdatawrite_range _again_
3856	* since it will wait on the page lock, which won't be unlocked until
3857	* after the pages have been marked as writeback and so we're good to go
3858	* from there. We have to do this otherwise we'll miss the ordered
3859	* extents and that results in badness. Please Josef, do not think you
3860	* know better and pull this out at some point in the future, it is
3861	* right and you are wrong.
3862	*/
3863	ret = filemap_fdatawrite_range(mapping: inode->i_mapping, start, end);
3864	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3865	&BTRFS_I(inode)->runtime_flags))
3866	ret = filemap_fdatawrite_range(mapping: inode->i_mapping, start, end);
3867
3868	return ret;
3869	}
3870

source code of linux/fs/btrfs/file.c