// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/iomap.h>
#include <asm/unaligned.h>
#include <linux/fsverity.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "bio.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "file-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "file.h"
#include "acl.h"
#include "relocation.h"
#include "verity.h"
#include "super.h"
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	ssize_t submitted;
	struct extent_changeset *data_reserved;
	struct btrfs_ordered_extent *ordered;
	bool data_space_reserved;
	bool nocow_done;
};

struct btrfs_dio_private {
	/* Range of I/O */
	u64 file_offset;
	u32 bytes;

	/* This must be last */
	struct btrfs_bio bbio;
};

static struct bio_set btrfs_dio_bioset;

struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};

/*
 * Used by data_reloc_print_warning_inode() to pass needed info for filename
 * resolution and output of error message.
 */
struct data_reloc_warn {
	struct btrfs_path path;
	struct btrfs_fs_info *fs_info;
	u64 extent_item_size;
	u64 logical;
	int mirror_num;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;

static struct kmem_cache *btrfs_inode_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);

static noinline int run_delalloc_cow(struct btrfs_inode *inode,
				     struct page *locked_page, u64 start,
				     u64 end, struct writeback_control *wbc,
				     bool pages_dirty);
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

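/*
 * Backref walk callback used by print_data_reloc_error() below: given one
 * inode that references the corrupted extent, resolve all its path names and
 * print a detailed checksum error warning for each of them.
 */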
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
					  u64 root, void *warn_ctx)
{
	struct data_reloc_warn *warn = warn_ctx;
	struct btrfs_fs_info *fs_info = warn->fs_info;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	u32 nlink;
	int ret;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/* This makes the path point to (inum INODE_ITEM ioff). */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(&warn->path);
		goto err;
	}

	eb = warn->path.nodes[0];
	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(&warn->path);

	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, &warn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		/*
		 * -ENOMEM, not a critical error, just output a generic error
		 * without the filename.
		 */
		btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
			   warn->logical, warn->mirror_num, root, inum, offset);
		return ret;
	}
	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * We deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here.
	 */
	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
		btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
			   warn->logical, warn->mirror_num, root, inum, offset,
			   fs_info->sectorsize, nlink,
			   (char *)(unsigned long)ipath->fspath->val[i]);
	}

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn(fs_info,
"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
		   warn->logical, warn->mirror_num, root, inum, offset, ret);

	free_ipath(ipath);
	return ret;
}

/*
 * Do extra user-friendly error output (e.g. lookup all the affected files).
 *
 * The backref lookup is done on a best-effort basis: if it fails, we fall
 * back to the old, less detailed error message.
 */
static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
				   const u8 *csum, const u8 *csum_expected,
				   int mirror_num)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_path path = { 0 };
	struct btrfs_key found_key = { 0 };
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	const u32 csum_size = fs_info->csum_size;
	u64 logical;
	u64 flags;
	u32 item_size;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	logical = btrfs_get_reloc_bg_bytenr(fs_info);
	mutex_unlock(&fs_info->reloc_mutex);

	if (logical == U64_MAX) {
		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
		btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
		return;
	}

	logical += file_off;
	btrfs_warn_rl(fs_info,
"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
		inode->root->root_key.objectid,
		btrfs_ino(inode), file_off, logical,
		CSUM_FMT_VALUE(csum_size, csum),
		CSUM_FMT_VALUE(csum_size, csum_expected),
		mirror_num);

	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
	if (ret < 0) {
		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
			     logical, ret);
		return;
	}
	eb = path.nodes[0];
	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path.slots[0]);
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		unsigned long ptr = 0;
		u64 ref_root;
		u8 ref_level;

		while (true) {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			if (ret < 0) {
				btrfs_warn_rl(fs_info,
				"failed to resolve tree backref for logical %llu: %d",
					      logical, ret);
				break;
			}
			if (ret > 0)
				break;

			btrfs_warn_rl(fs_info,
"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
				logical, mirror_num,
				(ref_level ? "node" : "leaf"),
				ref_level, ref_root);
		}
		btrfs_release_path(&path);
	} else {
		struct btrfs_backref_walk_ctx ctx = { 0 };
		struct data_reloc_warn reloc_warn = { 0 };

		btrfs_release_path(&path);

		ctx.bytenr = found_key.objectid;
		ctx.extent_item_pos = logical - found_key.objectid;
		ctx.fs_info = fs_info;

		reloc_warn.logical = logical;
		reloc_warn.extent_item_size = found_key.offset;
		reloc_warn.mirror_num = mirror_num;
		reloc_warn.fs_info = fs_info;

		iterate_extent_inodes(&ctx, true,
				      data_reloc_print_warning_inode, &reloc_warn);
	}
}

static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
	struct btrfs_root *root = inode->root;
	const u32 csum_size = root->fs_info->csum_size;

	/* For data reloc tree, it's better to do a backref lookup instead. */
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		return print_data_reloc_error(inode, logical_start, csum,
					      csum_expected, mirror_num);

	/* Output without objectid, which is more meaningful */
	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
		btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			root->root_key.objectid, btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	} else {
		btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
			root->root_key.objectid, btrfs_ino(inode),
			logical_start,
			CSUM_FMT_VALUE(csum_size, csum),
			CSUM_FMT_VALUE(csum_size, csum_expected),
			mirror_num);
	}
}

/*
 * Lock inode i_rwsem based on arguments passed.
 *
 * ilock_flags can have the following bits set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first attempt
 *		     return -EAGAIN
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(&inode->vfs_inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(&inode->vfs_inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(&inode->vfs_inode);
	}
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&inode->i_mmap_lock);
	return 0;
}

/*
 * Unlock inode i_rwsem.
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&inode->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(&inode->vfs_inode);
	else
		inode_unlock(&inode->vfs_inode);
}
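
/*
 * A minimal usage sketch (hypothetical caller, for illustration only): take
 * the lock exclusively but bail out instead of blocking, as a nowait style
 * operation would:
 *
 *	if (btrfs_inode_lock(inode, BTRFS_ILOCK_TRY))
 *		return -EAGAIN;
 *	// ... modify the inode ...
 *	btrfs_inode_unlock(inode, 0);
 *
 * Note that btrfs_inode_unlock() only inspects BTRFS_ILOCK_SHARED and
 * BTRFS_ILOCK_MMAP, so BTRFS_ILOCK_TRY does not need to be passed back.
 */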

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: the caller must ensure that when an error happens, it must not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = 0, page_end = 0;
	struct page *page;

	if (locked_page) {
		page_start = page_offset(locked_page);
		page_end = page_start + PAGE_SIZE - 1;
	}

	while (index <= end_index) {
		/*
		 * For the locked page, btrfs_mark_ordered_io_finished() will
		 * be called on it in run_delalloc_range() for the error
		 * handling, which will clear the page Ordered bit and run the
		 * ordered extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
			index++;
			continue;
		}
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		index++;
		if (!page)
			continue;

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then btrfs_mark_ordered_io_finished() will handle
		 * the ordered extent accounting for the range.
		 */
		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
					       offset, bytes);
		put_page(page);
	}

	if (locked_page) {
		/* The locked page covers the full range, nothing needs to be done */
		if (bytes + offset <= page_start + PAGE_SIZE)
			return;
		/*
		 * In case this page belongs to the delalloc range being
		 * instantiated then skip it, since the first page of a range is
		 * going to be properly cleaned up by the caller of
		 * run_delalloc_range
		 */
		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
			offset = page_offset(locked_page) + PAGE_SIZE;
		}
	}

	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
}

static int btrfs_dirty_inode(struct btrfs_inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
{
	int err;

	if (args->default_acl) {
		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
				      ACL_TYPE_DEFAULT);
		if (err)
			return err;
	}
	if (args->acl) {
		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
		if (err)
			return err;
	}
	if (!args->default_acl && !args->acl)
		cache_no_acl(args->inode);
	return btrfs_xattr_security_init(trans, args->inode, args->dir,
					 &args->dentry->d_name);
}

/*
 * This does all the hard work of inserting an inline extent into the btree.
 * The caller should have done a btrfs_drop_extents() call first so that no
 * overlapping inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path,
				struct btrfs_inode *inode, bool extent_inserted,
				size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages,
				bool update_i_size)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	u64 i_size;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(inode);
		key.offset = 0;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;

		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
					 PAGE_SIZE);

			kaddr = kmap_local_page(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_local(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->vfs_inode.i_mapping, 0);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_local_page(page);
		write_extent_buffer(leaf, kaddr, ptr, size);
		kunmap_local(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity's
	 * sake.
	 */
	ret = btrfs_inode_set_file_extent_range(inode, 0,
					ALIGN(size, root->fs_info->sectorsize));
	if (ret)
		goto fail;

	/*
	 * We're an inline extent, so nobody can extend the file past i_size
	 * without locking a page we already have locked.
	 *
	 * We must do any i_size and inode updates before we unlock the pages.
	 * Otherwise we could end up racing with unlink.
	 */
	i_size = i_size_read(&inode->vfs_inode);
	if (update_i_size && size > i_size) {
		i_size_write(&inode->vfs_inode, size);
		i_size = size;
	}
	inode->disk_i_size = i_size;

fail:
	return ret;
}

/*
 * Conditionally insert an inline extent into the file. This does the checks
 * required to make sure the data is small enough to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
					  size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages,
					  bool update_i_size)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 data_len = (compressed_size ?: size);
	int ret;
	struct btrfs_path *path;

	/*
	 * We can create an inline extent if it ends at or beyond the current
	 * i_size, is no larger than a sector (decompressed), and the (possibly
	 * compressed) data fits in a leaf and the configured maximum inline
	 * size.
	 */
	if (size < i_size_read(&inode->vfs_inode) ||
	    size > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    data_len > fs_info->max_inline)
		return 1;
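
	/*
	 * A worked example of the check above (assuming the common 4K
	 * sectorsize and the default 2048 byte max_inline): a 1000 byte file
	 * can be inlined as-is; a 3000 byte file can be inlined only if its
	 * compressed size is at most 2048 bytes; a 5000 byte file can never
	 * be inlined, since its uncompressed size already exceeds the
	 * sectorsize.
	 */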

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &inode->block_rsv;

	drop_args.path = path;
	drop_args.start = 0;
	drop_args.end = fs_info->sectorsize;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
				   size, compressed_size, compress_type,
				   compressed_pages, update_i_size);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, inode);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	btrfs_set_inode_full_sync(inode);
out:
	/*
	 * Don't forget to free the reserved space: an inline extent doesn't
	 * count as a data extent, so free the qgroup reservation directly
	 * here. At reserve time the amount is always aligned to the page
	 * size, so just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct btrfs_inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	blk_opf_t write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	struct async_cow *async_cow;
};

struct async_cow {
	atomic_t num_chunks;
	struct async_chunk chunks[];
};

static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	if (!btrfs_inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
		     btrfs_ino(inode));
		return 0;
	}
	/*
	 * Special check for subpage.
	 *
	 * We lock the full page then run each delalloc range in the page, thus
	 * for the following case, we will hit some subpage specific corner case:
	 *
	 * 0		32K		64K
	 * |	|///////|	|///////|
	 *		\- A		\- B
	 *
	 * In the above case, both range A and range B will try to unlock the
	 * full page [0, 64K), causing the one finished later to have the page
	 * unlocked already, triggering various page lock requirement BUG_ON()s.
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * be enabled if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the trailing partial page will be locked until the full compression
	 * finishes, delaying the write of the other ranges.
	 *
	 * TODO: Make btrfs_run_delalloc_range() lock all delalloc ranges first
	 * to prevent any submitted async extent from unlocking the full page.
	 * By this, we can ensure for the subpage case that only the last
	 * async_cow will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!PAGE_ALIGNED(start) ||
		    !PAGE_ALIGNED(end + 1))
			return 0;
	}

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (inode->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u32 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode, small_write);
}

/*
 * Work queue callback to start compression on a file and its pages.
 *
 * This is done inside an ordered work queue, and the compression is spread
 * across many cpus. The actual IO submission is step two, and the ordered work
 * queue takes care of making sure that happens in the same order things were
 * put onto the queue by writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an entry onto the
 * work queue to write the uncompressed bytes. This makes sure that both
 * compressed inodes and uncompressed inodes are written in the same order that
 * the flusher thread sent them down.
 */
static void compress_file_range(struct btrfs_work *work)
{
	struct async_chunk *async_chunk =
		container_of(work, struct async_chunk, work);
	struct btrfs_inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned int poff;
	int i;
	int compress_type = fs_info->compress_type;

	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);

	/*
	 * We need to call clear_page_dirty_for_io on each page in the range.
	 * Otherwise applications with the file mmap'd can wander in and change
	 * the page contents while we are compressing them.
	 */
	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it. This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(&inode->vfs_inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	pages = NULL;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time. So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * Skip compression for a small file range (<= blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	/*
	 * For subpage case, we require full page alignment for the sector
	 * aligned range.
	 * Thus we must also check against @actual_end, not just @end.
	 */
	if (blocksize < PAGE_SIZE) {
		if (!PAGE_ALIGNED(start) ||
		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
			goto cleanup_and_bail_uncompressed;
	}

	total_compressed = min_t(unsigned long, total_compressed,
				 BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * We do compression for mount -o compress and when the inode has not
	 * been flagged as NOCOMPRESS. This flag can change at any time if we
	 * discover bad compression ratios.
	 */
	if (!inode_need_compress(inode, start, end))
		goto cleanup_and_bail_uncompressed;

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages) {
		/*
		 * Memory allocation failure is not a fatal error, we can fall
		 * back to uncompressed code.
		 */
		goto cleanup_and_bail_uncompressed;
	}

	if (inode->defrag_compress)
		compress_type = inode->defrag_compress;
	else if (inode->prop_compress)
		compress_type = inode->prop_compress;

	/* Compression level is applied here. */
	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
				   mapping, start, pages, &nr_pages, &total_in,
				   &total_compressed);
	if (ret)
		goto mark_incompressible;

	/*
	 * Zero the tail end of the last page, as we might be sending it down
	 * to disk.
	 */
	poff = offset_in_page(total_compressed);
	if (poff)
		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);

	/*
	 * Try to create an inline extent.
	 *
	 * If we didn't compress the entire range, try to create an uncompressed
	 * inline extent, else a compressed one.
	 *
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for the subpage case.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
		if (total_in < actual_end) {
			ret = cow_file_range_inline(inode, actual_end, 0,
						    BTRFS_COMPRESS_NONE, NULL,
						    false);
		} else {
			ret = cow_file_range_inline(inode, actual_end,
						    total_compressed,
						    compress_type, pages,
						    false);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;

			if (ret < 0)
				mapping_set_error(mapping, -EIO);

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
						     NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_START_WRITEBACK |
						     PAGE_END_WRITEBACK);
			goto free_pages;
		}
	}

	/*
	 * We aren't doing an inline extent. Round the compressed size up to a
	 * block size boundary so the allocator does sane things.
	 */
	total_compressed = ALIGN(total_compressed, blocksize);

	/*
	 * One last check to make sure the compression is really a win, compare
	 * the page count read with the blocks on disk, compression must free at
	 * least one sector.
	 */
	total_in = round_up(total_in, fs_info->sectorsize);
	if (total_compressed + blocksize > total_in)
		goto mark_incompressible;
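
	/*
	 * Worked example with a 4K block size: 64K (16 blocks) of input that
	 * compressed into at most 60K (15 blocks) is kept, since 60K + 4K is
	 * not greater than 64K; input that still needs all 16 blocks after
	 * the rounding above is rejected, as it would not free a single
	 * sector.
	 */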

	/*
	 * The async work queues will take care of doing actual allocation on
	 * disk for these compressed pages, and will submit the bios.
	 */
	add_async_extent(async_chunk, start, total_in, total_compressed, pages,
			 nr_pages, compress_type);
	if (start + total_in < end) {
		start += total_in;
		cond_resched();
		goto again;
	}
	return;

mark_incompressible:
	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
		inode->flags |= BTRFS_INODE_NOCOMPRESS;
cleanup_and_bail_uncompressed:
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
free_pages:
	if (pages) {
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
	}
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

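/*
 * Write out an async extent that skipped (or failed) compression: run the
 * plain COW path over its range and write the resulting pages back directly
 * via extent_write_locked_range(), instead of going through the compressed
 * submission path.
 */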
static void submit_uncompressed_range(struct btrfs_inode *inode,
				      struct async_extent *async_extent,
				      struct page *locked_page)
{
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.range_start = start,
		.range_end = end,
		.no_cgroup_owner = 1,
	};

	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
	wbc_detach_inode(&wbc);
	if (ret < 0) {
		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
		if (locked_page) {
			const u64 page_start = page_offset(locked_page);

			set_page_writeback(locked_page);
			end_page_writeback(locked_page);
			btrfs_mark_ordered_io_finished(inode, locked_page,
						       page_start, PAGE_SIZE,
						       !ret);
			mapping_set_error(locked_page->mapping, ret);
			unlock_page(locked_page);
		}
	}
}

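/*
 * Allocate disk space for one compressed async extent and submit it: reserve
 * an extent matching the compressed size, insert the extent map and ordered
 * extent, and hand the compressed pages to the bio layer. Extents that were
 * not compressed are routed to submit_uncompressed_range() instead.
 */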
static void submit_one_async_extent(struct async_chunk *async_chunk,
				    struct async_extent *async_extent,
				    u64 *alloc_hint)
{
	struct btrfs_inode *inode = async_chunk->inode;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_key ins;
	struct page *locked_page = NULL;
	struct extent_map *em;
	int ret = 0;
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;

	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(async_chunk->blkcg_css);

	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;

		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
	}
	lock_extent(io_tree, start, end, NULL);

	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
		submit_uncompressed_range(inode, async_extent, locked_page);
		goto done;
	}

	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		/*
		 * We used to retry here by falling back to the non-compressed
		 * path on ENOSPC. But if we can't reserve space even for the
		 * compressed size, there is no way the larger uncompressed
		 * size could succeed, so go directly to the error path.
		 */
		goto out_free;
	}

	/* Here we're doing allocation and writeback of the compressed pages */
	em = create_io_em(inode, start,
			  async_extent->ram_size,	/* len */
			  start,			/* orig_start */
			  ins.objectid,			/* block_start */
			  ins.offset,			/* block_len */
			  ins.offset,			/* orig_block_len */
			  async_extent->ram_size,	/* ram_bytes */
			  async_extent->compress_type,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);

	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
				       async_extent->ram_size,	/* num_bytes */
				       async_extent->ram_size,	/* ram_bytes */
				       ins.objectid,		/* disk_bytenr */
				       ins.offset,		/* disk_num_bytes */
				       0,			/* offset */
				       1 << BTRFS_ORDERED_COMPRESSED,
				       async_extent->compress_type);
	if (IS_ERR(ordered)) {
		btrfs_drop_extent_map_range(inode, start, end, false);
		ret = PTR_ERR(ordered);
		goto out_free_reserve;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK);
	btrfs_submit_compressed_write(ordered,
				      async_extent->pages,	/* compressed_pages */
				      async_extent->nr_pages,
				      async_chunk->write_flags, true);
	*alloc_hint = ins.objectid + ins.offset;
done:
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	kfree(async_extent);
	return;

out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
	extent_clear_unlock_delalloc(inode, start, end,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK);
	free_async_extent_pages(async_extent);
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
		    root->root_key.objectid, btrfs_ino(inode), start,
		    async_extent->ram_size, ret);
	kfree(async_extent);
}

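/*
 * Pick an allocation hint for a COW write: prefer the on-disk location of an
 * existing mapping overlapping [start, start + num_bytes), falling back to
 * the inode's first mapped block, so that new extents land near the data
 * they belong to.
 */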
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint. If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * When extent_io.c finds a delayed allocation range in the file, the
 * callbacks end up in this code. The basic idea is to allocate extents on
 * disk for the range, and create ordered data structs in RAM to track those
 * extents.
 *
 * locked_page is the page that writepage had locked already. We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * When this function fails, it unlocks all pages except @locked_page.
 *
 * When this function successfully creates an inline extent, it returns 1 and
 * unlocks all pages including locked_page and starts I/O on them.
 * (In reality inline extents are limited to a single page, so locked_page is
 * the only page handled anyway).
 *
 * When this function succeeds and creates a normal extent, the page locking
 * status depends on the passed in flags:
 *
 * - If @keep_locked is set, all pages are kept locked.
 * - Else all pages except for @locked_page are unlocked.
 *
 * When a failure happens in the second or later iteration of the
 * while-loop, the ordered extents created in previous iterations are kept
 * intact. So, the caller must clean them up by calling
 * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
 * example.
 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct page *locked_page, u64 start, u64 end,
				   u64 *done_offset,
				   bool keep_locked, bool no_inline)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 alloc_hint = 0;
	u64 orig_start = start;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 min_alloc_size;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(inode)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize, num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);

	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of page, that means data writeback
	 * is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
				       end + 1);

		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, actual_end, 0,
					    BTRFS_COMPRESS_NONE, NULL, false);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't determine if it's an inline extent or a
			 * compressed extent.
			 */
			unlock_page(locked_page);
			ret = 1;
			goto done;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);

	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
	if (btrfs_is_data_reloc_root(root))
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

	while (num_bytes > 0) {
		struct btrfs_ordered_extent *ordered;

		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   min_alloc_size, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret == -EAGAIN) {
			/*
			 * btrfs_reserve_extent only returns -EAGAIN for zoned
			 * file systems, which is an indication that there are
			 * no active zones to allocate from at the moment.
			 *
			 * If this is the first loop iteration, wait for at
			 * least one zone to finish before retrying the
			 * allocation. Otherwise ask the caller to write out
			 * the already allocated blocks before coming back to
			 * us, or return -ENOSPC if it can't handle retries.
			 */
			ASSERT(btrfs_is_zoned(fs_info));
			if (start == orig_start) {
				wait_on_bit_io(&inode->root->fs_info->flags,
					       BTRFS_FS_NEED_ZONE_FINISH,
					       TASK_UNINTERRUPTIBLE);
				continue;
			}
			if (done_offset) {
				*done_offset = start - 1;
				return 0;
			}
			ret = -ENOSPC;
		}
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset,	/* len */
				  start,			/* orig_start */
				  ins.objectid,			/* block_start */
				  ins.offset,			/* block_len */
				  ins.offset,			/* orig_block_len */
				  ram_size,			/* ram_bytes */
				  BTRFS_COMPRESS_NONE,		/* compress_type */
				  BTRFS_ORDERED_REGULAR		/* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
					ram_size, ins.objectid, cur_alloc_size,
					0, 1 << BTRFS_ORDERED_REGULAR,
					BTRFS_COMPRESS_NONE);
		if (IS_ERR(ordered)) {
			ret = PTR_ERR(ordered);
			goto out_drop_extent_cache;
		}

		if (btrfs_is_data_reloc_root(root)) {
			ret = btrfs_reloc_clone_csums(ordered);

			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_map_range(inode, start,
							    start + ram_size - 1,
							    false);
		}
		btrfs_put_ordered_extent(ordered);

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
		 *
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
		 */
		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
		page_ops |= PAGE_SET_ORDERED;

		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * On a btrfs_reloc_clone_csums() error: since @start was
		 * already increased, extent_clear_unlock_delalloc() at the
		 * out_unlock label won't free the metadata of the current
		 * ordered extent, so we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
done:
	if (done_offset)
		*done_offset = end;
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	/*
	 * Now, we have three regions to clean up:
	 *
	 * |-------(1)----|---(2)---|-------------(3)----------|
	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
	 *
	 * We process each region below.
	 */

	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;

	/*
	 * For the range (1). We have already instantiated the ordered extents
	 * for this region. They are cleaned up by
	 * btrfs_cleanup_ordered_extents() in e.g.
	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
	 * function.
	 *
	 * However, in case of @keep_locked, we still need to unlock the pages
	 * (except @locked_page) to ensure all the pages are unlocked.
	 */
	if (keep_locked && orig_start < start) {
		if (!locked_page)
			mapping_set_error(inode->vfs_inode.i_mapping, ret);
		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
					     locked_page, 0, page_ops);
	}

	/*
	 * For the range (2). If we reserved an extent for our delalloc range
	 * (or a subrange) and failed to create the respective ordered extent,
	 * then it means that when we reserved the extent we decremented the
	 * extent's size from the data space_info's bytes_may_use counter and
	 * incremented the space_info's bytes_reserved counter by the same
	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
	 * to decrement again the data space_info's bytes_may_use counter,
	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size - 1,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
	}

	/*
	 * For the range (3). We never touched the region. In addition to the
	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
	 * space_info's bytes_may_use counter, reserved in
	 * btrfs_check_data_free_space().
	 */
	if (start < end) {
		clear_bits |= EXTENT_CLEAR_DATA_RESV;
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
	}
	return ret;
}

/*
 * Phase two of compressed writeback. This is the ordered portion of the code,
 * which only gets called in the order the work was queued. We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 *
 * If called with @do_free == true then it'll try to finish the work and free
 * the work struct eventually.
 */
static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						       work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	struct async_extent *async_extent;
	unsigned long nr_pages;
	u64 alloc_hint = 0;

	if (do_free) {
		struct async_chunk *async_chunk;
		struct async_cow *async_cow;

		async_chunk = container_of(work, struct async_chunk, work);
		btrfs_add_delayed_iput(async_chunk->inode);
		if (async_chunk->blkcg_css)
			css_put(async_chunk->blkcg_css);

		async_cow = async_chunk->async_cow;
		if (atomic_dec_and_test(&async_cow->num_chunks))
			kvfree(async_cow);
		return;
	}

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	while (!list_empty(&async_chunk->extents)) {
		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);
		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
	}

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
}

static bool run_delalloc_compressed(struct btrfs_inode *inode,
				    struct page *locked_page, u64 start,
				    u64 end, struct writeback_control *wbc)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	unsigned nofs_flag;
	const blk_opf_t write_flags = wbc_to_write_flags(wbc);

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!ctx)
		return false;

	unlock_extent(&inode->io_tree, start, end, NULL);
	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		u64 cur_end = min(end, start + SZ_512K - 1);

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
		ihold(&inode->vfs_inode);
		async_chunk[i].async_cow = ctx;
		async_chunk[i].inode = inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and it's
		 * the original page we were actually given. As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to
		 * locked_page.
		 *
		 * This way we don't need racy decisions about who is supposed
		 * to unlock it.
		 */
1663 if (locked_page) {
1664 /*
1665 * Depending on the compressibility, the pages might or
1666 * might not go through async. We want all of them to
1667 * be accounted against wbc once. Let's do it here
1668 * before the paths diverge. wbc accounting is used
1669 * only for foreign writeback detection and doesn't
1670 * need full accuracy. Just account the whole thing
1671 * against the first page.
1672 */
1673 wbc_account_cgroup_owner(wbc, page: locked_page,
1674 bytes: cur_end - start);
1675 async_chunk[i].locked_page = locked_page;
1676 locked_page = NULL;
1677 } else {
1678 async_chunk[i].locked_page = NULL;
1679 }
1680
1681 if (blkcg_css != blkcg_root_css) {
1682 css_get(css: blkcg_css);
1683 async_chunk[i].blkcg_css = blkcg_css;
1684 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1685 } else {
1686 async_chunk[i].blkcg_css = NULL;
1687 }
1688
1689 btrfs_init_work(work: &async_chunk[i].work, func: compress_file_range,
1690 ordered_func: submit_compressed_extents);
1691
1692 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1693 atomic_add(i: nr_pages, v: &fs_info->async_delalloc_pages);
1694
1695 btrfs_queue_work(wq: fs_info->delalloc_workers, work: &async_chunk[i].work);
1696
1697 start = cur_end + 1;
1698 }
1699 return true;
1700}
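
/*
 * Editor's sketch, not part of the btrfs code: the chunking arithmetic used
 * by run_delalloc_compressed() above. A delalloc range [start, end]
 * (inclusive end, as used by the callers) is carved into SZ_512K chunks, so
 * e.g. a range of 1 MiB + 4 KiB yields three chunks of 512K, 512K and 4K.
 * The helper name is hypothetical.
 */
static inline u64 example_nr_async_chunks(u64 start, u64 end)
{
	/* Mirrors the DIV_ROUND_UP(end - start, SZ_512K) above. */
	return DIV_ROUND_UP(end - start, SZ_512K);
}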
1701
1702/*
1703 * Run the delalloc range from start to end, and write back any dirty pages
1704 * covered by the range.
1705 */
1706static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1707 struct page *locked_page, u64 start,
1708 u64 end, struct writeback_control *wbc,
1709 bool pages_dirty)
1710{
1711 u64 done_offset = end;
1712 int ret;
1713
1714 while (start <= end) {
1715 ret = cow_file_range(inode, locked_page, start, end, done_offset: &done_offset,
1716 keep_locked: true, no_inline: false);
1717 if (ret)
1718 return ret;
1719 extent_write_locked_range(inode: &inode->vfs_inode, locked_page, start,
1720 end: done_offset, wbc, pages_dirty);
1721 start = done_offset + 1;
1722 }
1723
1724 return 1;
1725}
1726
1727static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1728 u64 bytenr, u64 num_bytes, bool nowait)
1729{
1730 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1731 struct btrfs_ordered_sum *sums;
1732 int ret;
1733 LIST_HEAD(list);
1734
1735 ret = btrfs_lookup_csums_list(root: csum_root, start: bytenr, end: bytenr + num_bytes - 1,
1736 list: &list, search_commit: 0, nowait);
1737 if (ret == 0 && list_empty(head: &list))
1738 return 0;
1739
1740 while (!list_empty(head: &list)) {
1741 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1742 list_del(entry: &sums->list);
1743 kfree(objp: sums);
1744 }
1745 if (ret < 0)
1746 return ret;
1747 return 1;
1748}
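
/*
 * Editor's sketch, not part of the btrfs code: csum_exist_in_range() above is
 * tri-state (< 0 error, 0 no csums in the range, 1 at least one csum exists).
 * A hypothetical caller deciding whether NOCOW is still allowed:
 */
static inline bool example_csums_block_nocow(int csum_ret)
{
	/* Any csum found (1) or lookup failure (< 0) forces the COW path. */
	return csum_ret != 0;
}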
1749
1750static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1751 const u64 start, const u64 end)
1752{
1753 const bool is_space_ino = btrfs_is_free_space_inode(inode);
1754 const bool is_reloc_ino = btrfs_is_data_reloc_root(root: inode->root);
1755 const u64 range_bytes = end + 1 - start;
1756 struct extent_io_tree *io_tree = &inode->io_tree;
1757 u64 range_start = start;
1758 u64 count;
1759 int ret;
1760
1761 /*
1762 * If EXTENT_NORESERVE is set it means that when the buffered write was
1763 * made we did not have enough available data space and therefore did not
1764 * reserve data space for it, since we thought we could do NOCOW for the
1765 * respective file range (either there is a prealloc extent or the inode
1766 * has the NOCOW bit set).
1767 *
1768 * However when we need to fall back to COW mode (because for example the
1769 * block group for the corresponding extent was turned to RO mode by a
1770 * scrub or relocation) we need to do the following:
1771 *
1772 * 1) We increment the bytes_may_use counter of the data space info.
1773 * If COW succeeds, it allocates a new data extent and after doing
1774 * that it decrements the space info's bytes_may_use counter and
1775 * increments its bytes_reserved counter by the same amount (we do
1776 * this at btrfs_add_reserved_bytes()). So we need to increment the
1777 * bytes_may_use counter to compensate (when space is reserved at
1778 * buffered write time, the bytes_may_use counter is incremented);
1779 *
1780 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1781 * that if the COW path fails for any reason, it decrements (through
1782 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1783 * data space info, which we incremented in the step above.
1784 *
1785 * If we need to fall back to COW and the inode corresponds to a free
1786 * space cache inode or an inode of the data relocation tree, we must
1787 * also increment bytes_may_use of the data space_info for the same
1788 * reason. Space caches and relocated data extents always get a prealloc
1789 * extent for them, however scrub or balance may have set the block
1790 * group that contains that extent to RO mode and therefore force COW
1791 * when starting writeback.
1792 */
1793 count = count_range_bits(tree: io_tree, start: &range_start, search_end: end, max_bytes: range_bytes,
1794 bits: EXTENT_NORESERVE, contig: 0, NULL);
1795 if (count > 0 || is_space_ino || is_reloc_ino) {
1796 u64 bytes = count;
1797 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1798 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1799
1800 if (is_space_ino || is_reloc_ino)
1801 bytes = range_bytes;
1802
1803 spin_lock(lock: &sinfo->lock);
1804 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1805 spin_unlock(lock: &sinfo->lock);
1806
1807 if (count > 0)
1808 clear_extent_bit(tree: io_tree, start, end, bits: EXTENT_NORESERVE,
1809 NULL);
1810 }
1811
1812 /*
1813 * Don't try to create inline extents, as a mix of an inline extent
1814 * (written out and unlocked directly) and regular NOCOW extents
1815 * doesn't work.
1816 */
1817 ret = cow_file_range(inode, locked_page, start, end, NULL, keep_locked: false, no_inline: true);
1818 ASSERT(ret != 1);
1819 return ret;
1820}
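
/*
 * Editor's sketch, not part of the btrfs code: how fallback_to_cow() above
 * chooses the number of bytes to re-add to the space info's bytes_may_use.
 * Free space cache and relocation inodes compensate for the whole range,
 * other inodes only for the bytes that carried EXTENT_NORESERVE. Names are
 * hypothetical.
 */
static inline u64 example_cow_fallback_bytes(u64 noreserve_bytes,
					     u64 range_bytes,
					     bool space_or_reloc_ino)
{
	return space_or_reloc_ino ? range_bytes : noreserve_bytes;
}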
1821
1822struct can_nocow_file_extent_args {
1823 /* Input fields. */
1824
1825 /* Start file offset of the range we want to NOCOW. */
1826 u64 start;
1827 /* End file offset (inclusive) of the range we want to NOCOW. */
1828 u64 end;
1829 bool writeback_path;
1830 bool strict;
1831 /*
1832 * Free the path passed to can_nocow_file_extent() once it's not needed
1833 * anymore.
1834 */
1835 bool free_path;
1836
1837 /* Output fields. Only set when can_nocow_file_extent() returns 1. */
1838
1839 u64 disk_bytenr;
1840 u64 disk_num_bytes;
1841 u64 extent_offset;
1842 /* Number of bytes that can be written to in NOCOW mode. */
1843 u64 num_bytes;
1844};
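
/*
 * Editor's sketch, not part of the btrfs code: minimal initialization of
 * can_nocow_file_extent_args as done by the writeback path below, where
 * run_delalloc_nocow() sets .end and .writeback_path once and then updates
 * .start for each extent item it visits. The helper name is hypothetical.
 */
static inline void example_init_nocow_args(struct can_nocow_file_extent_args *args,
					   u64 start, u64 end)
{
	memset(args, 0, sizeof(*args));
	args->start = start;
	args->end = end;		/* Inclusive end offset. */
	args->writeback_path = true;
}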
1845
1846/*
1847 * Check if we can NOCOW the file extent that the path points to.
1848 * This function may return with the path released, so the caller should check
1849 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1850 *
1851 * Returns: < 0 on error
1852 * 0 if we can not NOCOW
1853 * 1 if we can NOCOW
1854 */
1855static int can_nocow_file_extent(struct btrfs_path *path,
1856 struct btrfs_key *key,
1857 struct btrfs_inode *inode,
1858 struct can_nocow_file_extent_args *args)
1859{
1860 const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1861 struct extent_buffer *leaf = path->nodes[0];
1862 struct btrfs_root *root = inode->root;
1863 struct btrfs_file_extent_item *fi;
1864 u64 extent_end;
1865 u8 extent_type;
1866 int can_nocow = 0;
1867 int ret = 0;
1868 bool nowait = path->nowait;
1869
1870 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1871 extent_type = btrfs_file_extent_type(eb: leaf, s: fi);
1872
1873 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1874 goto out;
1875
1876 /* Can't access these fields unless we know it's not an inline extent. */
1877 args->disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: fi);
1878 args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: fi);
1879 args->extent_offset = btrfs_file_extent_offset(eb: leaf, s: fi);
1880
1881 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1882 extent_type == BTRFS_FILE_EXTENT_REG)
1883 goto out;
1884
1885 /*
1886 * If the extent was created before the generation where the last snapshot
1887 * for its subvolume was created, then this implies the extent is shared,
1888 * hence we must COW.
1889 */
1890 if (!args->strict &&
1891 btrfs_file_extent_generation(eb: leaf, s: fi) <=
1892 btrfs_root_last_snapshot(s: &root->root_item))
1893 goto out;
1894
1895 /* An explicit hole, must COW. */
1896 if (args->disk_bytenr == 0)
1897 goto out;
1898
1899 /* Compressed/encrypted/encoded extents must be COWed. */
1900 if (btrfs_file_extent_compression(eb: leaf, s: fi) ||
1901 btrfs_file_extent_encryption(eb: leaf, s: fi) ||
1902 btrfs_file_extent_other_encoding(eb: leaf, s: fi))
1903 goto out;
1904
1905 extent_end = btrfs_file_extent_end(path);
1906
1907 /*
1908 * The following checks can be expensive, as they need to take other
1909 * locks and do btree or rbtree searches, so release the path to avoid
1910 * blocking other tasks for too long.
1911 */
1912 btrfs_release_path(p: path);
1913
1914 ret = btrfs_cross_ref_exist(root, objectid: btrfs_ino(inode),
1915 offset: key->offset - args->extent_offset,
1916 bytenr: args->disk_bytenr, strict: args->strict, path);
1917 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1918 if (ret != 0)
1919 goto out;
1920
1921 if (args->free_path) {
1922 /*
1923 * We don't need the path anymore, plus through the
1924 * csum_exist_in_range() call below we will end up allocating
1925 * another path. So free the path to avoid unnecessary extra
1926 * memory usage.
1927 */
1928 btrfs_free_path(p: path);
1929 path = NULL;
1930 }
1931
1932 /* If there are pending snapshots for this root, we must COW. */
1933 if (args->writeback_path && !is_freespace_inode &&
1934 atomic_read(v: &root->snapshot_force_cow))
1935 goto out;
1936
1937 args->disk_bytenr += args->extent_offset;
1938 args->disk_bytenr += args->start - key->offset;
1939 args->num_bytes = min(args->end + 1, extent_end) - args->start;
1940
1941 /*
1942 * Force COW if csums exist in the range. This ensures that csums for a
1943 * given extent are either valid or do not exist.
1944 */
1945 ret = csum_exist_in_range(fs_info: root->fs_info, bytenr: args->disk_bytenr, num_bytes: args->num_bytes,
1946 nowait);
1947 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1948 if (ret != 0)
1949 goto out;
1950
1951 can_nocow = 1;
1952 out:
1953 if (args->free_path && path)
1954 btrfs_free_path(p: path);
1955
1956 return ret < 0 ? ret : can_nocow;
1957}
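
/*
 * Editor's sketch, not part of the btrfs code: the physical address
 * computation near the end of can_nocow_file_extent(). For example, a file
 * extent item keyed at file offset 0 with disk_bytenr 1 MiB and extent_offset
 * 64 KiB, asked to NOCOW from file offset 128 KiB, resolves to the physical
 * byte 1 MiB + 64 KiB + 128 KiB. The helper name is hypothetical.
 */
static inline u64 example_nocow_disk_bytenr(u64 disk_bytenr, u64 extent_offset,
					    u64 start, u64 key_offset)
{
	return disk_bytenr + extent_offset + (start - key_offset);
}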
1958
1959/*
1960 * Called for NOCOW writeback. This checks for snapshots or COW copies
1961 * of the extents that exist in the file, and COWs the file as required.
1962 *
1963 * If no COW copies or snapshots exist, we write directly to the existing
1964 * blocks on disk.
1965 */
1966static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1967 struct page *locked_page,
1968 const u64 start, const u64 end)
1969{
1970 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1971 struct btrfs_root *root = inode->root;
1972 struct btrfs_path *path;
1973 u64 cow_start = (u64)-1;
1974 u64 cur_offset = start;
1975 int ret;
1976 bool check_prev = true;
1977 u64 ino = btrfs_ino(inode);
1978 struct can_nocow_file_extent_args nocow_args = { 0 };
1979
1980 /*
1981 * Normally on a zoned device we're only doing COW writes, but
1982 * relocation on a zoned filesystem serializes I/O so that we're only
1983 * writing sequentially, and we can end up here as well.
1984 */
1985 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
1986
1987 path = btrfs_alloc_path();
1988 if (!path) {
1989 ret = -ENOMEM;
1990 goto error;
1991 }
1992
1993 nocow_args.end = end;
1994 nocow_args.writeback_path = true;
1995
1996 while (1) {
1997 struct btrfs_block_group *nocow_bg = NULL;
1998 struct btrfs_ordered_extent *ordered;
1999 struct btrfs_key found_key;
2000 struct btrfs_file_extent_item *fi;
2001 struct extent_buffer *leaf;
2002 u64 extent_end;
2003 u64 ram_bytes;
2004 u64 nocow_end;
2005 int extent_type;
2006 bool is_prealloc;
2007
2008 ret = btrfs_lookup_file_extent(NULL, root, path, objectid: ino,
2009 bytenr: cur_offset, mod: 0);
2010 if (ret < 0)
2011 goto error;
2012
2013 /*
2014 * If there is no extent for our range when doing the initial
2015 * search, then go back to the previous slot as it will be the
2016 * one containing the search offset
2017 */
2018 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2019 leaf = path->nodes[0];
2020 btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key,
2021 nr: path->slots[0] - 1);
2022 if (found_key.objectid == ino &&
2023 found_key.type == BTRFS_EXTENT_DATA_KEY)
2024 path->slots[0]--;
2025 }
2026 check_prev = false;
2027next_slot:
2028 /* Go to next leaf if we have exhausted the current one */
2029 leaf = path->nodes[0];
2030 if (path->slots[0] >= btrfs_header_nritems(eb: leaf)) {
2031 ret = btrfs_next_leaf(root, path);
2032 if (ret < 0)
2033 goto error;
2034 if (ret > 0)
2035 break;
2036 leaf = path->nodes[0];
2037 }
2038
2039 btrfs_item_key_to_cpu(eb: leaf, cpu_key: &found_key, nr: path->slots[0]);
2040
2041 /* Didn't find anything for our INO */
2042 if (found_key.objectid > ino)
2043 break;
2044 /*
2045 * Keep searching until we find an EXTENT_DATA item or there are no
2046 * more extents for this inode.
2047 */
2048 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2049 found_key.type < BTRFS_EXTENT_DATA_KEY) {
2050 path->slots[0]++;
2051 goto next_slot;
2052 }
2053
2054 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2055 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2056 found_key.offset > end)
2057 break;
2058
2059 /*
2060 * If the found extent starts after the requested offset, there is a
2061 * hole before it: COW the gap up to where this extent begins.
2062 */
2063 if (found_key.offset > cur_offset) {
2064 extent_end = found_key.offset;
2065 extent_type = 0;
2066 goto must_cow;
2067 }
2068
2069 /*
2070 * Found an extent which begins before our range and potentially
2071 * intersects it.
2072 */
2073 fi = btrfs_item_ptr(leaf, path->slots[0],
2074 struct btrfs_file_extent_item);
2075 extent_type = btrfs_file_extent_type(eb: leaf, s: fi);
2076 /* If this is triggered then we have a memory corruption. */
2077 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2078 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2079 ret = -EUCLEAN;
2080 goto error;
2081 }
2082 ram_bytes = btrfs_file_extent_ram_bytes(eb: leaf, s: fi);
2083 extent_end = btrfs_file_extent_end(path);
2084
2085 /*
2086 * If the extent we got ends before our current offset, skip to
2087 * the next extent.
2088 */
2089 if (extent_end <= cur_offset) {
2090 path->slots[0]++;
2091 goto next_slot;
2092 }
2093
2094 nocow_args.start = cur_offset;
2095 ret = can_nocow_file_extent(path, key: &found_key, inode, args: &nocow_args);
2096 if (ret < 0)
2097 goto error;
2098 if (ret == 0)
2099 goto must_cow;
2100
2101 ret = 0;
2102 nocow_bg = btrfs_inc_nocow_writers(fs_info, bytenr: nocow_args.disk_bytenr);
2103 if (!nocow_bg) {
2104must_cow:
2105 /*
2106 * If we can't perform NOCOW writeback for the range,
2107 * then record the beginning of the range that needs to
2108 * be COWed. It will be written out before the next
2109 * NOCOW range if we find one, or when exiting this
2110 * loop.
2111 */
2112 if (cow_start == (u64)-1)
2113 cow_start = cur_offset;
2114 cur_offset = extent_end;
2115 if (cur_offset > end)
2116 break;
2117 if (!path->nodes[0])
2118 continue;
2119 path->slots[0]++;
2120 goto next_slot;
2121 }
2122
2123 /*
2124 * COW the range from cow_start to found_key.offset - 1. The key
2125 * contains the beginning of the first extent that can be NOCOWed,
2126 * which follows a range that needs to be COWed.
2127 */
2128 if (cow_start != (u64)-1) {
2129 ret = fallback_to_cow(inode, locked_page,
2130 start: cow_start, end: found_key.offset - 1);
2131 cow_start = (u64)-1;
2132 if (ret) {
2133 btrfs_dec_nocow_writers(bg: nocow_bg);
2134 goto error;
2135 }
2136 }
2137
2138 nocow_end = cur_offset + nocow_args.num_bytes - 1;
2139 is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2140 if (is_prealloc) {
2141 u64 orig_start = found_key.offset - nocow_args.extent_offset;
2142 struct extent_map *em;
2143
2144 em = create_io_em(inode, start: cur_offset, len: nocow_args.num_bytes,
2145 orig_start,
2146 block_start: nocow_args.disk_bytenr, /* block_start */
2147 block_len: nocow_args.num_bytes, /* block_len */
2148 orig_block_len: nocow_args.disk_num_bytes, /* orig_block_len */
2149 ram_bytes, compress_type: BTRFS_COMPRESS_NONE,
2150 type: BTRFS_ORDERED_PREALLOC);
2151 if (IS_ERR(ptr: em)) {
2152 btrfs_dec_nocow_writers(bg: nocow_bg);
2153 ret = PTR_ERR(ptr: em);
2154 goto error;
2155 }
2156 free_extent_map(em);
2157 }
2158
2159 ordered = btrfs_alloc_ordered_extent(inode, file_offset: cur_offset,
2160 num_bytes: nocow_args.num_bytes, ram_bytes: nocow_args.num_bytes,
2161 disk_bytenr: nocow_args.disk_bytenr, disk_num_bytes: nocow_args.num_bytes, offset: 0,
2162 flags: is_prealloc
2163 ? (1 << BTRFS_ORDERED_PREALLOC)
2164 : (1 << BTRFS_ORDERED_NOCOW),
2165 compress_type: BTRFS_COMPRESS_NONE);
2166 btrfs_dec_nocow_writers(bg: nocow_bg);
2167 if (IS_ERR(ptr: ordered)) {
2168 if (is_prealloc) {
2169 btrfs_drop_extent_map_range(inode, start: cur_offset,
2170 end: nocow_end, skip_pinned: false);
2171 }
2172 ret = PTR_ERR(ptr: ordered);
2173 goto error;
2174 }
2175
2176 if (btrfs_is_data_reloc_root(root))
2177 /*
2178 * Error handled later, as we must prevent
2179 * extent_clear_unlock_delalloc() in the error handler
2180 * from freeing metadata of the created ordered extent.
2181 */
2182 ret = btrfs_reloc_clone_csums(ordered);
2183 btrfs_put_ordered_extent(entry: ordered);
2184
2185 extent_clear_unlock_delalloc(inode, start: cur_offset, end: nocow_end,
2186 locked_page, bits_to_clear: EXTENT_LOCKED |
2187 EXTENT_DELALLOC |
2188 EXTENT_CLEAR_DATA_RESV,
2189 page_ops: PAGE_UNLOCK | PAGE_SET_ORDERED);
2190
2191 cur_offset = extent_end;
2192
2193 /*
2194 * btrfs_reloc_clone_csums() error, now we're OK to call error
2195 * handler, as metadata for created ordered extent will only
2196 * be freed by btrfs_finish_ordered_io().
2197 */
2198 if (ret)
2199 goto error;
2200 if (cur_offset > end)
2201 break;
2202 }
2203 btrfs_release_path(p: path);
2204
2205 if (cur_offset <= end && cow_start == (u64)-1)
2206 cow_start = cur_offset;
2207
2208 if (cow_start != (u64)-1) {
2209 cur_offset = end;
2210 ret = fallback_to_cow(inode, locked_page, start: cow_start, end);
2211 cow_start = (u64)-1;
2212 if (ret)
2213 goto error;
2214 }
2215
2216 btrfs_free_path(p: path);
2217 return 0;
2218
2219error:
2220 /*
2221 * If an error happened while a COW region is outstanding, cur_offset
2222 * needs to be reset to cow_start to ensure the COW region is unlocked
2223 * as well.
2224 */
2225 if (cow_start != (u64)-1)
2226 cur_offset = cow_start;
2227 if (cur_offset < end)
2228 extent_clear_unlock_delalloc(inode, start: cur_offset, end,
2229 locked_page, bits_to_clear: EXTENT_LOCKED |
2230 EXTENT_DELALLOC | EXTENT_DEFRAG |
2231 EXTENT_DO_ACCOUNTING, page_ops: PAGE_UNLOCK |
2232 PAGE_START_WRITEBACK |
2233 PAGE_END_WRITEBACK);
2234 btrfs_free_path(p: path);
2235 return ret;
2236}
2237
2238static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2239{
2240 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2241 if (inode->defrag_bytes &&
2242 test_range_bit_exists(tree: &inode->io_tree, start, end, bit: EXTENT_DEFRAG))
2243 return false;
2244 return true;
2245 }
2246 return false;
2247}
2248
2249/*
2250 * Function to process delayed allocation (create CoW) for ranges which are
2251 * being touched for the first time.
2252 */
2253int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2254 u64 start, u64 end, struct writeback_control *wbc)
2255{
2256 const bool zoned = btrfs_is_zoned(fs_info: inode->root->fs_info);
2257 int ret;
2258
2259 /*
2260 * The range must cover part of the @locked_page, or a return of 1
2261 * can confuse the caller.
2262 */
2263 ASSERT(!(end <= page_offset(locked_page) ||
2264 start >= page_offset(locked_page) + PAGE_SIZE));
2265
2266 if (should_nocow(inode, start, end)) {
2267 ret = run_delalloc_nocow(inode, locked_page, start, end);
2268 goto out;
2269 }
2270
2271 if (btrfs_inode_can_compress(inode) &&
2272 inode_need_compress(inode, start, end) &&
2273 run_delalloc_compressed(inode, locked_page, start, end, wbc))
2274 return 1;
2275
2276 if (zoned)
2277 ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
2278 pages_dirty: true);
2279 else
2280 ret = cow_file_range(inode, locked_page, start, end, NULL,
2281 keep_locked: false, no_inline: false);
2282
2283out:
2284 if (ret < 0)
2285 btrfs_cleanup_ordered_extents(inode, locked_page, offset: start,
2286 bytes: end - start + 1);
2287 return ret;
2288}
2289
2290void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2291 struct extent_state *orig, u64 split)
2292{
2293 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2294 u64 size;
2295
2296 /* not delalloc, ignore it */
2297 if (!(orig->state & EXTENT_DELALLOC))
2298 return;
2299
2300 size = orig->end - orig->start + 1;
2301 if (size > fs_info->max_extent_size) {
2302 u32 num_extents;
2303 u64 new_size;
2304
2305 /*
2306 * See the explanation in btrfs_merge_delalloc_extent; the same
2307 * applies here, just in reverse.
2308 */
2309 new_size = orig->end - split + 1;
2310 num_extents = count_max_extents(fs_info, size: new_size);
2311 new_size = split - orig->start;
2312 num_extents += count_max_extents(fs_info, size: new_size);
2313 if (count_max_extents(fs_info, size) >= num_extents)
2314 return;
2315 }
2316
2317 spin_lock(lock: &inode->lock);
2318 btrfs_mod_outstanding_extents(inode, mod: 1);
2319 spin_unlock(lock: &inode->lock);
2320}
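
/*
 * Editor's sketch, not part of the btrfs code: the outstanding extent delta
 * implied by btrfs_split_delalloc_extent() above, approximating
 * count_max_extents() with DIV_ROUND_UP. With a 128 MiB max_extent_size,
 * splitting a 256 MiB delalloc extent at 128 MiB + 4 KiB turns 2 accounted
 * extents into 3 (one more outstanding extent is needed), while a split at
 * exactly 128 MiB keeps the count at 2. Names are hypothetical.
 */
static inline u32 example_split_extent_delta(u64 len, u64 split_off,
					     u64 max_extent_size)
{
	u32 before = DIV_ROUND_UP(len, max_extent_size);
	u32 after = DIV_ROUND_UP(split_off, max_extent_size) +
		    DIV_ROUND_UP(len - split_off, max_extent_size);

	return after > before ? 1 : 0;
}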
2321
2322/*
2323 * Handle merged delayed allocation extents so we can keep track of new extents
2324 * that are just merged onto old extents, such as when we are doing sequential
2325 * writes, so we can properly account for the metadata space we'll need.
2326 */
2327void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2328 struct extent_state *other)
2329{
2330 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2331 u64 new_size, old_size;
2332 u32 num_extents;
2333
2334 /* not delalloc, ignore it */
2335 if (!(other->state & EXTENT_DELALLOC))
2336 return;
2337
2338 if (new->start > other->start)
2339 new_size = new->end - other->start + 1;
2340 else
2341 new_size = other->end - new->start + 1;
2342
2343 /* we're not bigger than the max, unreserve the space and go */
2344 if (new_size <= fs_info->max_extent_size) {
2345 spin_lock(lock: &inode->lock);
2346 btrfs_mod_outstanding_extents(inode, mod: -1);
2347 spin_unlock(lock: &inode->lock);
2348 return;
2349 }
2350
2351 /*
2352 * We have to add up either side to figure out how many extents were
2353 * accounted for before we merged into one big extent. If the number of
2354 * extents we accounted for is <= the amount we need for the new range
2355 * then we can return, otherwise drop. Think of it like this
2356 *
2357 * [ 4k][MAX_SIZE]
2358 *
2359 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2360 * need 2 outstanding extents, on one side we have 1 and the other side
2361 * we have 1 so they are == and we can return. But in this case
2362 *
2363 * [MAX_SIZE+4k][MAX_SIZE+4k]
2364 *
2365 * Each range on their own accounts for 2 extents, but merged together
2366 * they are only 3 extents worth of accounting, so we need to drop in
2367 * this case.
2368 */
2369 old_size = other->end - other->start + 1;
2370 num_extents = count_max_extents(fs_info, size: old_size);
2371 old_size = new->end - new->start + 1;
2372 num_extents += count_max_extents(fs_info, size: old_size);
2373 if (count_max_extents(fs_info, size: new_size) >= num_extents)
2374 return;
2375
2376 spin_lock(lock: &inode->lock);
2377 btrfs_mod_outstanding_extents(inode, mod: -1);
2378 spin_unlock(lock: &inode->lock);
2379}
2380
2381static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2382 struct btrfs_inode *inode)
2383{
2384 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2385
2386 spin_lock(lock: &root->delalloc_lock);
2387 if (list_empty(head: &inode->delalloc_inodes)) {
2388 list_add_tail(new: &inode->delalloc_inodes, head: &root->delalloc_inodes);
2389 set_bit(nr: BTRFS_INODE_IN_DELALLOC_LIST, addr: &inode->runtime_flags);
2390 root->nr_delalloc_inodes++;
2391 if (root->nr_delalloc_inodes == 1) {
2392 spin_lock(lock: &fs_info->delalloc_root_lock);
2393 BUG_ON(!list_empty(&root->delalloc_root));
2394 list_add_tail(new: &root->delalloc_root,
2395 head: &fs_info->delalloc_roots);
2396 spin_unlock(lock: &fs_info->delalloc_root_lock);
2397 }
2398 }
2399 spin_unlock(lock: &root->delalloc_lock);
2400}
2401
2402void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2403 struct btrfs_inode *inode)
2404{
2405 struct btrfs_fs_info *fs_info = root->fs_info;
2406
2407 if (!list_empty(head: &inode->delalloc_inodes)) {
2408 list_del_init(entry: &inode->delalloc_inodes);
2409 clear_bit(nr: BTRFS_INODE_IN_DELALLOC_LIST,
2410 addr: &inode->runtime_flags);
2411 root->nr_delalloc_inodes--;
2412 if (!root->nr_delalloc_inodes) {
2413 ASSERT(list_empty(&root->delalloc_inodes));
2414 spin_lock(lock: &fs_info->delalloc_root_lock);
2415 BUG_ON(list_empty(&root->delalloc_root));
2416 list_del_init(entry: &root->delalloc_root);
2417 spin_unlock(lock: &fs_info->delalloc_root_lock);
2418 }
2419 }
2420}
2421
2422static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2423 struct btrfs_inode *inode)
2424{
2425 spin_lock(lock: &root->delalloc_lock);
2426 __btrfs_del_delalloc_inode(root, inode);
2427 spin_unlock(lock: &root->delalloc_lock);
2428}
2429
2430/*
2431 * Properly track delayed allocation bytes in the inode and maintain the
2432 * list of inodes that have pending delalloc work to be done.
2433 */
2434void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2435 u32 bits)
2436{
2437 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2438
2439 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2440 WARN_ON(1);
2441 /*
2442 * The set_bit and clear_bit hooks normally require _irqsave/restore,
2443 * but in this case we are only testing for the DELALLOC
2444 * bit, which is only set or cleared with irqs on.
2445 */
2446 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2447 struct btrfs_root *root = inode->root;
2448 u64 len = state->end + 1 - state->start;
2449 u32 num_extents = count_max_extents(fs_info, size: len);
2450 bool do_list = !btrfs_is_free_space_inode(inode);
2451
2452 spin_lock(lock: &inode->lock);
2453 btrfs_mod_outstanding_extents(inode, mod: num_extents);
2454 spin_unlock(lock: &inode->lock);
2455
2456 /* For sanity tests */
2457 if (btrfs_is_testing(fs_info))
2458 return;
2459
2460 percpu_counter_add_batch(fbc: &fs_info->delalloc_bytes, amount: len,
2461 batch: fs_info->delalloc_batch);
2462 spin_lock(lock: &inode->lock);
2463 inode->delalloc_bytes += len;
2464 if (bits & EXTENT_DEFRAG)
2465 inode->defrag_bytes += len;
2466 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2467 &inode->runtime_flags))
2468 btrfs_add_delalloc_inodes(root, inode);
2469 spin_unlock(lock: &inode->lock);
2470 }
2471
2472 if (!(state->state & EXTENT_DELALLOC_NEW) &&
2473 (bits & EXTENT_DELALLOC_NEW)) {
2474 spin_lock(lock: &inode->lock);
2475 inode->new_delalloc_bytes += state->end + 1 - state->start;
2476 spin_unlock(lock: &inode->lock);
2477 }
2478}
2479
2480/*
2481 * Once a range is no longer delalloc this function ensures that proper
2482 * accounting happens.
2483 */
2484void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2485 struct extent_state *state, u32 bits)
2486{
2487 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2488 u64 len = state->end + 1 - state->start;
2489 u32 num_extents = count_max_extents(fs_info, size: len);
2490
2491 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2492 spin_lock(lock: &inode->lock);
2493 inode->defrag_bytes -= len;
2494 spin_unlock(lock: &inode->lock);
2495 }
2496
2497 /*
2498 * The set_bit and clear_bit hooks normally require _irqsave/restore,
2499 * but in this case we are only testing for the DELALLOC
2500 * bit, which is only set or cleared with irqs on.
2501 */
2502 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2503 struct btrfs_root *root = inode->root;
2504 bool do_list = !btrfs_is_free_space_inode(inode);
2505
2506 spin_lock(lock: &inode->lock);
2507 btrfs_mod_outstanding_extents(inode, mod: -num_extents);
2508 spin_unlock(lock: &inode->lock);
2509
2510 /*
2511 * We don't reserve metadata space for space cache inodes so we
2512 * don't need to call delalloc_release_metadata if there is an
2513 * error.
2514 */
2515 if (bits & EXTENT_CLEAR_META_RESV &&
2516 root != fs_info->tree_root)
2517 btrfs_delalloc_release_metadata(inode, num_bytes: len, qgroup_free: false);
2518
2519 /* For sanity tests. */
2520 if (btrfs_is_testing(fs_info))
2521 return;
2522
2523 if (!btrfs_is_data_reloc_root(root) &&
2524 do_list && !(state->state & EXTENT_NORESERVE) &&
2525 (bits & EXTENT_CLEAR_DATA_RESV))
2526 btrfs_free_reserved_data_space_noquota(fs_info, len);
2527
2528 percpu_counter_add_batch(fbc: &fs_info->delalloc_bytes, amount: -len,
2529 batch: fs_info->delalloc_batch);
2530 spin_lock(lock: &inode->lock);
2531 inode->delalloc_bytes -= len;
2532 if (do_list && inode->delalloc_bytes == 0 &&
2533 test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2534 &inode->runtime_flags))
2535 btrfs_del_delalloc_inode(root, inode);
2536 spin_unlock(lock: &inode->lock);
2537 }
2538
2539 if ((state->state & EXTENT_DELALLOC_NEW) &&
2540 (bits & EXTENT_DELALLOC_NEW)) {
2541 spin_lock(lock: &inode->lock);
2542 ASSERT(inode->new_delalloc_bytes >= len);
2543 inode->new_delalloc_bytes -= len;
2544 if (bits & EXTENT_ADD_INODE_BYTES)
2545 inode_add_bytes(inode: &inode->vfs_inode, bytes: len);
2546 spin_unlock(lock: &inode->lock);
2547 }
2548}
2549
2550static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2551 struct btrfs_ordered_extent *ordered)
2552{
2553 u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2554 u64 len = bbio->bio.bi_iter.bi_size;
2555 struct btrfs_ordered_extent *new;
2556 int ret;
2557
2558 /* Must always be called for the beginning of an ordered extent. */
2559 if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2560 return -EINVAL;
2561
2562 /* No need to split if the ordered extent covers the entire bio. */
2563 if (ordered->disk_num_bytes == len) {
2564 refcount_inc(r: &ordered->refs);
2565 bbio->ordered = ordered;
2566 return 0;
2567 }
2568
2569 /*
2570 * Don't split the extent_map for NOCOW extents, as we're writing into
2571 * a pre-existing one.
2572 */
2573 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2574 ret = split_extent_map(inode: bbio->inode, start: bbio->file_offset,
2575 len: ordered->num_bytes, pre: len,
2576 new_logical: ordered->disk_bytenr);
2577 if (ret)
2578 return ret;
2579 }
2580
2581 new = btrfs_split_ordered_extent(ordered, len);
2582 if (IS_ERR(ptr: new))
2583 return PTR_ERR(ptr: new);
2584 bbio->ordered = new;
2585 return 0;
2586}
2587
2588/*
2589 * Given a list of ordered sums, record them in the inode. This happens
2590 * at IO completion time based on sums calculated at bio submission time.
2591 */
2592static int add_pending_csums(struct btrfs_trans_handle *trans,
2593 struct list_head *list)
2594{
2595 struct btrfs_ordered_sum *sum;
2596 struct btrfs_root *csum_root = NULL;
2597 int ret;
2598
2599 list_for_each_entry(sum, list, list) {
2600 trans->adding_csums = true;
2601 if (!csum_root)
2602 csum_root = btrfs_csum_root(fs_info: trans->fs_info,
2603 bytenr: sum->logical);
2604 ret = btrfs_csum_file_blocks(trans, root: csum_root, sums: sum);
2605 trans->adding_csums = false;
2606 if (ret)
2607 return ret;
2608 }
2609 return 0;
2610}
2611
2612static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2613 const u64 start,
2614 const u64 len,
2615 struct extent_state **cached_state)
2616{
2617 u64 search_start = start;
2618 const u64 end = start + len - 1;
2619
2620 while (search_start < end) {
2621 const u64 search_len = end - search_start + 1;
2622 struct extent_map *em;
2623 u64 em_len;
2624 int ret = 0;
2625
2626 em = btrfs_get_extent(inode, NULL, pg_offset: 0, start: search_start, end: search_len);
2627 if (IS_ERR(ptr: em))
2628 return PTR_ERR(ptr: em);
2629
2630 if (em->block_start != EXTENT_MAP_HOLE)
2631 goto next;
2632
2633 em_len = em->len;
2634 if (em->start < search_start)
2635 em_len -= search_start - em->start;
2636 if (em_len > search_len)
2637 em_len = search_len;
2638
2639 ret = set_extent_bit(tree: &inode->io_tree, start: search_start,
2640 end: search_start + em_len - 1,
2641 bits: EXTENT_DELALLOC_NEW, cached_state);
2642next:
2643 search_start = extent_map_end(em);
2644 free_extent_map(em);
2645 if (ret)
2646 return ret;
2647 }
2648 return 0;
2649}
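
/*
 * Editor's sketch, not part of the btrfs code: clamping an extent map to the
 * searched range, as done in btrfs_find_new_delalloc_bytes() above for holes
 * that straddle the range boundaries. Names are hypothetical.
 */
static inline u64 example_clamp_em_len(u64 em_start, u64 em_len,
				       u64 search_start, u64 search_len)
{
	if (em_start < search_start)
		em_len -= search_start - em_start;
	return min(em_len, search_len);
}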
2650
2651int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2652 unsigned int extra_bits,
2653 struct extent_state **cached_state)
2654{
2655 WARN_ON(PAGE_ALIGNED(end));
2656
2657 if (start >= i_size_read(inode: &inode->vfs_inode) &&
2658 !(inode->flags & BTRFS_INODE_PREALLOC)) {
2659 /*
2660 * There can't be any extents following eof in this case so just
2661 * set the delalloc new bit for the range directly.
2662 */
2663 extra_bits |= EXTENT_DELALLOC_NEW;
2664 } else {
2665 int ret;
2666
2667 ret = btrfs_find_new_delalloc_bytes(inode, start,
2668 len: end + 1 - start,
2669 cached_state);
2670 if (ret)
2671 return ret;
2672 }
2673
2674 return set_extent_bit(tree: &inode->io_tree, start, end,
2675 bits: EXTENT_DELALLOC | extra_bits, cached_state);
2676}
2677
2678/* See btrfs_writepage_cow_fixup() below for details on why this is required. */
2679struct btrfs_writepage_fixup {
2680 struct page *page;
2681 struct btrfs_inode *inode;
2682 struct btrfs_work work;
2683};
2684
2685static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2686{
2687 struct btrfs_writepage_fixup *fixup =
2688 container_of(work, struct btrfs_writepage_fixup, work);
2689 struct btrfs_ordered_extent *ordered;
2690 struct extent_state *cached_state = NULL;
2691 struct extent_changeset *data_reserved = NULL;
2692 struct page *page = fixup->page;
2693 struct btrfs_inode *inode = fixup->inode;
2694 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2695 u64 page_start = page_offset(page);
2696 u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2697 int ret = 0;
2698 bool free_delalloc_space = true;
2699
2700 /*
2701 * This is similar to page_mkwrite: we need to reserve the space before
2702 * we take the page lock.
2703 */
2704 ret = btrfs_delalloc_reserve_space(inode, reserved: &data_reserved, start: page_start,
2705 PAGE_SIZE);
2706again:
2707 lock_page(page);
2708
2709 /*
2710 * Before we queued this fixup, we took a reference on the page.
2711 * page->mapping may go NULL, but it shouldn't be moved to a different
2712 * address space.
2713 */
2714 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2715 /*
2716 * Unfortunately this is a little tricky, either
2717 *
2718 * 1) We got here and our page had already been dealt with and
2719 * we reserved our space, thus ret == 0, so we need to just
2720 * drop our space reservation and bail. This can happen the
2721 * first time we come into the fixup worker, or could happen
2722 * while waiting for the ordered extent.
2723 * 2) Our page was already dealt with, but we happened to get an
2724 * ENOSPC above from the btrfs_delalloc_reserve_space. In
2725 * this case we obviously don't have anything to release, but
2726 * because the page was already dealt with we don't want to
2727 * mark the page with an error, so make sure we're resetting
2728 * ret to 0. This is why we have this check _before_ the ret
2729 * check, because we do not want to have a surprise ENOSPC
2730 * when the page was already properly dealt with.
2731 */
2732 if (!ret) {
2733 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2734 btrfs_delalloc_release_space(inode, reserved: data_reserved,
2735 start: page_start, PAGE_SIZE,
2736 qgroup_free: true);
2737 }
2738 ret = 0;
2739 goto out_page;
2740 }
2741
2742 /*
2743 * We can't mess with the page state unless it is locked, so now that
2744 * it is locked bail if we failed to make our space reservation.
2745 */
2746 if (ret)
2747 goto out_page;
2748
2749 lock_extent(tree: &inode->io_tree, start: page_start, end: page_end, cached: &cached_state);
2750
2751 /* already ordered? We're done */
2752 if (PageOrdered(page))
2753 goto out_reserved;
2754
2755 ordered = btrfs_lookup_ordered_range(inode, file_offset: page_start, PAGE_SIZE);
2756 if (ordered) {
2757 unlock_extent(tree: &inode->io_tree, start: page_start, end: page_end,
2758 cached: &cached_state);
2759 unlock_page(page);
2760 btrfs_start_ordered_extent(entry: ordered);
2761 btrfs_put_ordered_extent(entry: ordered);
2762 goto again;
2763 }
2764
2765 ret = btrfs_set_extent_delalloc(inode, start: page_start, end: page_end, extra_bits: 0,
2766 cached_state: &cached_state);
2767 if (ret)
2768 goto out_reserved;
2769
2770 /*
2771 * Everything went as planned, we're now the owner of a dirty page with
2772 * delayed allocation bits set and space reserved for our COW
2773 * destination.
2774 *
2775 * The page was dirty when we started, nothing should have cleaned it.
2776 */
2777 BUG_ON(!PageDirty(page));
2778 free_delalloc_space = false;
2779out_reserved:
2780 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2781 if (free_delalloc_space)
2782 btrfs_delalloc_release_space(inode, reserved: data_reserved, start: page_start,
2783 PAGE_SIZE, qgroup_free: true);
2784 unlock_extent(tree: &inode->io_tree, start: page_start, end: page_end, cached: &cached_state);
2785out_page:
2786 if (ret) {
2787 /*
2788 * We hit ENOSPC or other errors. Update the mapping and page
2789 * to reflect the errors and clean the page.
2790 */
2791 mapping_set_error(mapping: page->mapping, error: ret);
2792 btrfs_mark_ordered_io_finished(inode, page, file_offset: page_start,
2793 PAGE_SIZE, uptodate: !ret);
2794 clear_page_dirty_for_io(page);
2795 }
2796 btrfs_page_clear_checked(fs_info, page, start: page_start, PAGE_SIZE);
2797 unlock_page(page);
2798 put_page(page);
2799 kfree(objp: fixup);
2800 extent_changeset_free(changeset: data_reserved);
2801 /*
2802 * As a precaution, do a delayed iput in case it would be the last iput
2803 * that could need flushing space. Recursing back to the fixup worker would
2804 * deadlock.
2805 */
2806 btrfs_add_delayed_iput(inode);
2807}
2808
2809/*
2810 * There are a few paths in the higher layers of the kernel that directly
2811 * set the page dirty bit without asking the filesystem if it is a
2812 * good idea. This causes problems because we want to make sure COW
2813 * properly happens and the data=ordered rules are followed.
2814 *
2815 * In our case any range that doesn't have the ORDERED bit set
2816 * hasn't been properly set up for IO. We kick off an async process
2817 * to fix it up. The async helper will wait for ordered extents, set
2818 * the delalloc bit and make it safe to write the page.
2819 */
2820int btrfs_writepage_cow_fixup(struct page *page)
2821{
2822 struct inode *inode = page->mapping->host;
2823 struct btrfs_fs_info *fs_info = btrfs_sb(sb: inode->i_sb);
2824 struct btrfs_writepage_fixup *fixup;
2825
2826 /* This page already has an ordered extent covering it */
2827 if (PageOrdered(page))
2828 return 0;
2829
2830 /*
2831 * PageChecked is set below when we create a fixup worker for this page;
2832 * don't try to create another one if we're already PageChecked().
2833 *
2834 * The extent_io writepage code will redirty the page if we send back
2835 * EAGAIN.
2836 */
2837 if (PageChecked(page))
2838 return -EAGAIN;
2839
2840 fixup = kzalloc(size: sizeof(*fixup), GFP_NOFS);
2841 if (!fixup)
2842 return -EAGAIN;
2843
2844 /*
2845 * We are already holding a reference to this inode from
2846 * write_cache_pages. We need to hold it because the space reservation
2847 * takes place outside of the page lock, and we can't trust
2848 * page->mapping outside of the page lock.
2849 */
2850 ihold(inode);
2851 btrfs_page_set_checked(fs_info, page, start: page_offset(page), PAGE_SIZE);
2852 get_page(page);
2853 btrfs_init_work(work: &fixup->work, func: btrfs_writepage_fixup_worker, NULL);
2854 fixup->page = page;
2855 fixup->inode = BTRFS_I(inode);
2856 btrfs_queue_work(wq: fs_info->fixup_workers, work: &fixup->work);
2857
2858 return -EAGAIN;
2859}
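
/*
 * Editor's sketch, not part of the btrfs code: the contract of
 * btrfs_writepage_cow_fixup() as seen from the writepage path. Zero means the
 * page is already covered by an ordered extent and writeback may proceed;
 * -EAGAIN means a fixup worker is pending and the page must be redirtied and
 * skipped for now. The helper name is hypothetical.
 */
static inline bool example_page_ready_for_writeback(struct page *page)
{
	return btrfs_writepage_cow_fixup(page) == 0;
}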
2860
2861static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2862 struct btrfs_inode *inode, u64 file_pos,
2863 struct btrfs_file_extent_item *stack_fi,
2864 const bool update_inode_bytes,
2865 u64 qgroup_reserved)
2866{
2867 struct btrfs_root *root = inode->root;
2868 const u64 sectorsize = root->fs_info->sectorsize;
2869 struct btrfs_path *path;
2870 struct extent_buffer *leaf;
2871 struct btrfs_key ins;
2872 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(s: stack_fi);
2873 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(s: stack_fi);
2874 u64 offset = btrfs_stack_file_extent_offset(s: stack_fi);
2875 u64 num_bytes = btrfs_stack_file_extent_num_bytes(s: stack_fi);
2876 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(s: stack_fi);
2877 struct btrfs_drop_extents_args drop_args = { 0 };
2878 int ret;
2879
2880 path = btrfs_alloc_path();
2881 if (!path)
2882 return -ENOMEM;
2883
2884 /*
2885 * We may be replacing one extent in the tree with another.
2886 * The new extent is pinned in the extent map, and we don't want
2887 * to drop it from the cache until it is completely in the btree.
2888 *
2889 * So, tell btrfs_drop_extents to leave this extent in the cache.
2890 * The caller is expected to unpin it and allow it to be merged
2891 * with the others.
2892 */
2893 drop_args.path = path;
2894 drop_args.start = file_pos;
2895 drop_args.end = file_pos + num_bytes;
2896 drop_args.replace_extent = true;
2897 drop_args.extent_item_size = sizeof(*stack_fi);
2898 ret = btrfs_drop_extents(trans, root, inode, args: &drop_args);
2899 if (ret)
2900 goto out;
2901
2902 if (!drop_args.extent_inserted) {
2903 ins.objectid = btrfs_ino(inode);
2904 ins.offset = file_pos;
2905 ins.type = BTRFS_EXTENT_DATA_KEY;
2906
2907 ret = btrfs_insert_empty_item(trans, root, path, key: &ins,
2908 data_size: sizeof(*stack_fi));
2909 if (ret)
2910 goto out;
2911 }
2912 leaf = path->nodes[0];
2913 btrfs_set_stack_file_extent_generation(s: stack_fi, val: trans->transid);
2914 write_extent_buffer(eb: leaf, src: stack_fi,
2915 btrfs_item_ptr_offset(leaf, path->slots[0]),
2916 len: sizeof(struct btrfs_file_extent_item));
2917
2918 btrfs_mark_buffer_dirty(trans, buf: leaf);
2919 btrfs_release_path(p: path);
2920
2921 /*
2922 * If we dropped an inline extent here, we know the range it covered
2923 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2924 * number of bytes only for the range containing the inline extent.
2925 * The remainder of the range will be processed when clearing the
2926 * EXTENT_DELALLOC_NEW bit through the ordered extent completion.
2927 */
2928 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2929 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2930
2931 inline_size = drop_args.bytes_found - inline_size;
2932 btrfs_update_inode_bytes(inode, add_bytes: sectorsize, del_bytes: inline_size);
2933 drop_args.bytes_found -= inline_size;
2934 num_bytes -= sectorsize;
2935 }
2936
2937 if (update_inode_bytes)
2938 btrfs_update_inode_bytes(inode, add_bytes: num_bytes, del_bytes: drop_args.bytes_found);
2939
2940 ins.objectid = disk_bytenr;
2941 ins.offset = disk_num_bytes;
2942 ins.type = BTRFS_EXTENT_ITEM_KEY;
2943
2944 ret = btrfs_inode_set_file_extent_range(inode, start: file_pos, len: ram_bytes);
2945 if (ret)
2946 goto out;
2947
2948 ret = btrfs_alloc_reserved_file_extent(trans, root, owner: btrfs_ino(inode),
2949 offset: file_pos - offset,
2950 ram_bytes: qgroup_reserved, ins: &ins);
2951out:
2952 btrfs_free_path(p: path);
2953
2954 return ret;
2955}
2956
2957static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2958 u64 start, u64 len)
2959{
2960 struct btrfs_block_group *cache;
2961
2962 cache = btrfs_lookup_block_group(info: fs_info, bytenr: start);
2963 ASSERT(cache);
2964
2965 spin_lock(lock: &cache->lock);
2966 cache->delalloc_bytes -= len;
2967 spin_unlock(lock: &cache->lock);
2968
2969 btrfs_put_block_group(cache);
2970}
2971
2972static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2973 struct btrfs_ordered_extent *oe)
2974{
2975 struct btrfs_file_extent_item stack_fi;
2976 bool update_inode_bytes;
2977 u64 num_bytes = oe->num_bytes;
2978 u64 ram_bytes = oe->ram_bytes;
2979
2980 memset(&stack_fi, 0, sizeof(stack_fi));
2981 btrfs_set_stack_file_extent_type(s: &stack_fi, val: BTRFS_FILE_EXTENT_REG);
2982 btrfs_set_stack_file_extent_disk_bytenr(s: &stack_fi, val: oe->disk_bytenr);
2983 btrfs_set_stack_file_extent_disk_num_bytes(s: &stack_fi,
2984 val: oe->disk_num_bytes);
2985 btrfs_set_stack_file_extent_offset(s: &stack_fi, val: oe->offset);
2986 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
2987 num_bytes = oe->truncated_len;
2988 ram_bytes = num_bytes;
2989 }
2990 btrfs_set_stack_file_extent_num_bytes(s: &stack_fi, val: num_bytes);
2991 btrfs_set_stack_file_extent_ram_bytes(s: &stack_fi, val: ram_bytes);
2992 btrfs_set_stack_file_extent_compression(s: &stack_fi, val: oe->compress_type);
2993 /* Encryption and other encoding is reserved and all 0 */
2994
2995 /*
2996 * For delalloc, when completing an ordered extent we update the inode's
2997 * bytes when clearing the range in the inode's io tree, so pass false
2998 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2999 * except if the ordered extent was truncated.
3000 */
3001 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3002 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3003 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3004
3005 return insert_reserved_file_extent(trans, inode: BTRFS_I(inode: oe->inode),
3006 file_pos: oe->file_offset, stack_fi: &stack_fi,
3007 update_inode_bytes, qgroup_reserved: oe->qgroup_rsv);
3008}
3009
3010/*
3011 * As ordered data IO finishes, this gets called so we can finish
3012 * an ordered extent once the range of bytes in the file it covers is
3013 * fully written.
3014 */
3015int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3016{
3017 struct btrfs_inode *inode = BTRFS_I(inode: ordered_extent->inode);
3018 struct btrfs_root *root = inode->root;
3019 struct btrfs_fs_info *fs_info = root->fs_info;
3020 struct btrfs_trans_handle *trans = NULL;
3021 struct extent_io_tree *io_tree = &inode->io_tree;
3022 struct extent_state *cached_state = NULL;
3023 u64 start, end;
3024 int compress_type = 0;
3025 int ret = 0;
3026 u64 logical_len = ordered_extent->num_bytes;
3027 bool freespace_inode;
3028 bool truncated = false;
3029 bool clear_reserved_extent = true;
3030 unsigned int clear_bits = EXTENT_DEFRAG;
3031
3032 start = ordered_extent->file_offset;
3033 end = start + ordered_extent->num_bytes - 1;
3034
3035 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3036 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3037 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3038 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3039 clear_bits |= EXTENT_DELALLOC_NEW;
3040
3041 freespace_inode = btrfs_is_free_space_inode(inode);
3042 if (!freespace_inode)
3043 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3044
3045 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3046 ret = -EIO;
3047 goto out;
3048 }
3049
3050 if (btrfs_is_zoned(fs_info))
3051 btrfs_zone_finish_endio(fs_info, logical: ordered_extent->disk_bytenr,
3052 length: ordered_extent->disk_num_bytes);
3053
3054 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3055 truncated = true;
3056 logical_len = ordered_extent->truncated_len;
3057 /* Truncated the entire extent, don't bother adding */
3058 if (!logical_len)
3059 goto out;
3060 }
3061
3062 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3063 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3064
3065 btrfs_inode_safe_disk_i_size_write(inode, new_i_size: 0);
3066 if (freespace_inode)
3067 trans = btrfs_join_transaction_spacecache(root);
3068 else
3069 trans = btrfs_join_transaction(root);
3070 if (IS_ERR(ptr: trans)) {
3071 ret = PTR_ERR(ptr: trans);
3072 trans = NULL;
3073 goto out;
3074 }
3075 trans->block_rsv = &inode->block_rsv;
3076 ret = btrfs_update_inode_fallback(trans, inode);
3077 if (ret) /* -ENOMEM or corruption */
3078 btrfs_abort_transaction(trans, ret);
3079 goto out;
3080 }
3081
3082 clear_bits |= EXTENT_LOCKED;
3083 lock_extent(tree: io_tree, start, end, cached: &cached_state);
3084
3085 if (freespace_inode)
3086 trans = btrfs_join_transaction_spacecache(root);
3087 else
3088 trans = btrfs_join_transaction(root);
3089 if (IS_ERR(ptr: trans)) {
3090 ret = PTR_ERR(ptr: trans);
3091 trans = NULL;
3092 goto out;
3093 }
3094
3095 trans->block_rsv = &inode->block_rsv;
3096
3097 ret = btrfs_insert_raid_extent(trans, ordered_extent);
3098 if (ret)
3099 goto out;
3100
3101 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3102 compress_type = ordered_extent->compress_type;
3103 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3104 BUG_ON(compress_type);
3105 ret = btrfs_mark_extent_written(trans, inode,
3106 start: ordered_extent->file_offset,
3107 end: ordered_extent->file_offset +
3108 logical_len);
3109 btrfs_zoned_release_data_reloc_bg(fs_info, logical: ordered_extent->disk_bytenr,
3110 length: ordered_extent->disk_num_bytes);
3111 } else {
3112 BUG_ON(root == fs_info->tree_root);
3113 ret = insert_ordered_extent_file_extent(trans, oe: ordered_extent);
3114 if (!ret) {
3115 clear_reserved_extent = false;
3116 btrfs_release_delalloc_bytes(fs_info,
3117 start: ordered_extent->disk_bytenr,
3118 len: ordered_extent->disk_num_bytes);
3119 }
3120 }
3121 unpin_extent_cache(tree: &inode->extent_tree, start: ordered_extent->file_offset,
3122 len: ordered_extent->num_bytes, gen: trans->transid);
3123 if (ret < 0) {
3124 btrfs_abort_transaction(trans, ret);
3125 goto out;
3126 }
3127
3128 ret = add_pending_csums(trans, list: &ordered_extent->list);
3129 if (ret) {
3130 btrfs_abort_transaction(trans, ret);
3131 goto out;
3132 }
3133
3134 /*
3135 * If this is a new delalloc range, clear its new delalloc flag to
3136 * update the inode's number of bytes. This needs to be done
3137 * before updating the inode item.
3138 */
3139 if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3140 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3141 clear_extent_bit(tree: &inode->io_tree, start, end,
3142 bits: EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3143 cached: &cached_state);
3144
3145 btrfs_inode_safe_disk_i_size_write(inode, new_i_size: 0);
3146 ret = btrfs_update_inode_fallback(trans, inode);
3147 if (ret) { /* -ENOMEM or corruption */
3148 btrfs_abort_transaction(trans, ret);
3149 goto out;
3150 }
3151 ret = 0;
3152out:
3153 clear_extent_bit(tree: &inode->io_tree, start, end, bits: clear_bits,
3154 cached: &cached_state);
3155
3156 if (trans)
3157 btrfs_end_transaction(trans);
3158
3159 if (ret || truncated) {
3160 u64 unwritten_start = start;
3161
3162 /*
3163 * If we failed to finish this ordered extent for any reason we
3164 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3165 * extent, and mark the inode with the error if it wasn't
3166 * already set. Any error during writeback would have already
3167 * set the mapping error, so we need to set it if we're the ones
3168 * marking this ordered extent as failed.
3169 */
3170 if (ret && !test_and_set_bit(nr: BTRFS_ORDERED_IOERR,
3171 addr: &ordered_extent->flags))
3172 mapping_set_error(mapping: ordered_extent->inode->i_mapping, error: -EIO);
3173
3174 if (truncated)
3175 unwritten_start += logical_len;
3176 clear_extent_uptodate(tree: io_tree, start: unwritten_start, end, NULL);
3177
3178 /* Drop extent maps for the part of the extent we didn't write. */
3179 btrfs_drop_extent_map_range(inode, start: unwritten_start, end, skip_pinned: false);
3180
3181 /*
3182 * If the ordered extent had an IOERR or something else went
3183 * wrong we need to return the space for this ordered extent
3184 * back to the allocator. We only free the extent in the
3185 * truncated case if we didn't write out the extent at all.
3186 *
3187 * If we made it past insert_reserved_file_extent before we
3188 * errored out then we don't need to do this as the accounting
3189 * has already been done.
3190 */
3191 if ((ret || !logical_len) &&
3192 clear_reserved_extent &&
3193 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3194 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3195 /*
3196 * Discard the range before returning it back to the
3197 * free space pool
3198 */
3199 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3200 btrfs_discard_extent(fs_info,
3201 bytenr: ordered_extent->disk_bytenr,
3202 num_bytes: ordered_extent->disk_num_bytes,
3203 NULL);
3204 btrfs_free_reserved_extent(fs_info,
3205 start: ordered_extent->disk_bytenr,
3206 len: ordered_extent->disk_num_bytes, delalloc: 1);
3207 /*
3208 * Actually free the qgroup rsv which was released when
3209 * the ordered extent was created.
3210 */
3211 btrfs_qgroup_free_refroot(fs_info, ref_root: inode->root->root_key.objectid,
3212 num_bytes: ordered_extent->qgroup_rsv,
3213 type: BTRFS_QGROUP_RSV_DATA);
3214 }
3215 }
3216
3217 /*
3218 * This needs to be done to make sure anybody waiting knows we are done
3219 * updating everything for this ordered extent.
3220 */
3221 btrfs_remove_ordered_extent(btrfs_inode: inode, entry: ordered_extent);
3222
3223 /* once for us */
3224 btrfs_put_ordered_extent(entry: ordered_extent);
3225 /* once for the tree */
3226 btrfs_put_ordered_extent(entry: ordered_extent);
3227
3228 return ret;
3229}
3230
3231int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3232{
3233 if (btrfs_is_zoned(fs_info: btrfs_sb(sb: ordered->inode->i_sb)) &&
3234 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3235 list_empty(head: &ordered->bioc_list))
3236 btrfs_finish_ordered_zoned(ordered);
3237 return btrfs_finish_one_ordered(ordered_extent: ordered);
3238}
3239
3240/*
3241 * Verify the checksum for a single sector without any extra actions that depend
3242 * on the type of I/O.
3243 */
3244int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3245 u32 pgoff, u8 *csum, const u8 * const csum_expected)
3246{
3247 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3248 char *kaddr;
3249
3250 ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3251
3252 shash->tfm = fs_info->csum_shash;
3253
3254 kaddr = kmap_local_page(page) + pgoff;
3255 crypto_shash_digest(desc: shash, data: kaddr, len: fs_info->sectorsize, out: csum);
3256 kunmap_local(kaddr);
3257
3258 if (memcmp(p: csum, q: csum_expected, size: fs_info->csum_size))
3259 return -EIO;
3260 return 0;
3261}
3262
3263/*
3264 * Verify the checksum of a single data sector.
3265 *
3266 * @bbio: btrfs_io_bio which contains the csum
3267 * @dev: device the sector is on
3268 * @bio_offset: offset to the beginning of the bio (in bytes)
3269 * @bv: bio_vec to check
3270 *
3271 * Check if the checksum on a data block is valid. When a checksum mismatch is
3272 * detected, report the error and fill the corrupted range with zero.
3273 *
3274 * Return %true if the sector is ok or had no checksum to start with, else %false.
3275 */
3276bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3277 u32 bio_offset, struct bio_vec *bv)
3278{
3279 struct btrfs_inode *inode = bbio->inode;
3280 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3281 u64 file_offset = bbio->file_offset + bio_offset;
3282 u64 end = file_offset + bv->bv_len - 1;
3283 u8 *csum_expected;
3284 u8 csum[BTRFS_CSUM_SIZE];
3285
3286 ASSERT(bv->bv_len == fs_info->sectorsize);
3287
3288 if (!bbio->csum)
3289 return true;
3290
3291 if (btrfs_is_data_reloc_root(root: inode->root) &&
3292 test_range_bit(tree: &inode->io_tree, start: file_offset, end, bit: EXTENT_NODATASUM,
3293 NULL)) {
3294 /* Skip the range without csum for data reloc inode */
3295 clear_extent_bits(tree: &inode->io_tree, start: file_offset, end,
3296 bits: EXTENT_NODATASUM);
3297 return true;
3298 }
3299
	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
				fs_info->csum_size;
	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
				    csum_expected))
		goto zeroit;
	return true;

zeroit:
	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
				    bbio->mirror_num);
	if (dev)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
	memzero_bvec(bv);
	return false;
}

/*
 * Perform a delayed iput on @inode.
 *
 * @inode: The inode we want to perform iput on
 *
 * This function uses the generic vfs_inode::i_count to track whether we should
 * just decrement it (in case it's > 1) or, if this is the last iput, link the
 * inode to the delayed iput machinery. Delayed iputs are processed by the
 * cleaner kthread and at transaction and superblock commit time.
 */
void btrfs_add_delayed_iput(struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long flags;

	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
		return;

	atomic_inc(&fs_info->nr_delayed_iputs);
	/*
	 * Need to be irq safe here because we can be called from either an irq
	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
	 * context.
	 */
	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
	ASSERT(list_empty(&inode->delayed_iput));
	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);
}

static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
				    struct btrfs_inode *inode)
{
	list_del_init(&inode->delayed_iput);
	spin_unlock_irq(&fs_info->delayed_iput_lock);
	iput(&inode->vfs_inode);
	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
		wake_up(&fs_info->delayed_iputs_wait);
	spin_lock_irq(&fs_info->delayed_iput_lock);
}

static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	if (!list_empty(&inode->delayed_iput)) {
		spin_lock_irq(&fs_info->delayed_iput_lock);
		if (!list_empty(&inode->delayed_iput))
			run_delayed_iput_locked(fs_info, inode);
		spin_unlock_irq(&fs_info->delayed_iput_lock);
	}
}

void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	/*
	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
	 * calls btrfs_add_delayed_iput() and that needs to lock
	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
	 * prevent a deadlock.
	 */
	spin_lock_irq(&fs_info->delayed_iput_lock);
	while (!list_empty(&fs_info->delayed_iputs)) {
		struct btrfs_inode *inode;

		inode = list_first_entry(&fs_info->delayed_iputs,
				struct btrfs_inode, delayed_iput);
		run_delayed_iput_locked(fs_info, inode);
		if (need_resched()) {
			spin_unlock_irq(&fs_info->delayed_iput_lock);
			cond_resched();
			spin_lock_irq(&fs_info->delayed_iput_lock);
		}
	}
	spin_unlock_irq(&fs_info->delayed_iput_lock);
}

/*
 * Wait for flushing all delayed iputs
 *
 * @fs_info: the filesystem
 *
 * This will wait on any delayed iputs that are currently running, with
 * KILLABLE set. Once they are all done running we will return, unless we are
 * killed, in which case we return -EINTR. This helps in user operations like
 * fallocate etc. that might get blocked on the iputs.
 *
 * Return -EINTR if we were killed, 0 if nothing is pending.
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
			atomic_read(&fs_info->nr_delayed_iputs) == 0);
	if (ret)
		return -EINTR;
	return 0;
}
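
/*
 * Example (hypothetical caller sketch, assuming a flushing path that wants to
 * reclaim space held by inodes pending a final iput): run the queued iputs
 * and then wait for all of them, bailing out if the task is killed.
 */
static inline int example_flush_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	btrfs_run_delayed_iputs(fs_info);
	/* Returns -EINTR if we were killed while waiting. */
	return btrfs_wait_on_delayed_iputs(fs_info);
}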

/*
 * This creates an orphan entry for the given inode in case something goes wrong
 * in the middle of an unlink.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
	if (ret && ret != -EEXIST) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	return 0;
}

/*
 * We have done the delete so we can go ahead and remove the orphan item for
 * this particular inode.
 */
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *inode)
{
	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
}
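
/*
 * Typical pairing (sketch, simplified from the callers elsewhere in this
 * file): the unlink path adds the orphan item in the same transaction that
 * drops the last link, and eviction deletes it once the inode's items are
 * gone:
 *
 *	ret = btrfs_orphan_add(trans, inode);	// i_nlink reached zero
 *	...
 *	ret = btrfs_orphan_del(trans, inode);	// once eviction is done
 */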

/*
 * This cleans up any orphans that may be left on the list from the last use
 * of this root.
 */
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
	u64 last_objectid = 0;
	int ret = 0, nr_unlink = 0;

	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
		return 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	path->reada = READA_BACK;

	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		/*
		 * ret == 0 means we found what we were searching for, which
		 * is weird, but possible, so only screw with path if we didn't
		 * find the key and see if we have stuff that matches.
		 */
		if (ret > 0) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		/* release the path since we're done with it */
		btrfs_release_path(path);

		/*
		 * This is where we are basically btrfs_lookup, without the
		 * crossing root thing. We store the inode number in the
		 * offset of the orphan item.
		 */

		if (found_key.offset == last_objectid) {
			/*
			 * We found the same inode as before. This means we were
			 * not able to remove its items via eviction triggered
			 * by an iput(). A transaction abort may have happened,
			 * due to -ENOSPC for example, so try to grab the error
			 * that led to the transaction abort, if any.
			 */
			btrfs_err(fs_info,
				  "Error removing orphan entry, stopping orphan cleanup");
			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, last_objectid, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			inode = NULL;
			if (ret != -ENOENT)
				goto out;
		}

		if (!inode && root == fs_info->tree_root) {
			struct btrfs_root *dead_root;
			int is_dead_root = 0;

			/*
			 * This is an orphan in the tree root. Currently these
			 * could come from 2 sources:
			 *  a) a root (snapshot/subvolume) deletion in progress
			 *  b) a free space cache inode
			 * We need to distinguish those two, as the orphan item
			 * for a root must not get deleted before the deletion
			 * of the snapshot/subvolume's tree completes.
			 *
			 * btrfs_find_orphan_roots() ran before us, which has
			 * found all deleted roots and loaded them into
			 * fs_info->fs_roots_radix. So here we can find if an
			 * orphan item corresponds to a deleted root by looking
			 * up the root from that radix tree.
			 */

			spin_lock(&fs_info->fs_roots_radix_lock);
			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
						(unsigned long)found_key.objectid);
			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
				is_dead_root = 1;
			spin_unlock(&fs_info->fs_roots_radix_lock);

			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}

		}

		/*
		 * If we have an inode with links, there are a couple of
		 * possibilities:
		 *
		 * 1. We were halfway through creating fsverity metadata for the
		 * file. In that case, the orphan item represents incomplete
		 * fsverity metadata which must be cleaned up with
		 * btrfs_drop_verity_items and deleting the orphan item.
		 *
		 * 2. Old kernels (before v3.12) used to create an
		 * orphan item for truncate indicating that there were possibly
		 * extent items past i_size that needed to be deleted. In v3.12,
		 * truncate was changed to update i_size in sync with the extent
		 * items, but the (useless) orphan item was still created. Since
		 * v4.18, we don't create the orphan item for truncate at all.
		 *
		 * So, this item could mean that we need to do a truncate, but
		 * only if this filesystem was last used on a pre-v3.12 kernel
		 * and was not cleanly unmounted. The odds of that are quite
		 * slim, and it's a pain to do the truncate now, so just delete
		 * the orphan item.
		 *
		 * It's also possible that this orphan item was supposed to be
		 * deleted but wasn't. The inode number may have been reused,
		 * but either way, we can delete the orphan item.
		 */
		if (!inode || inode->i_nlink) {
			if (inode) {
				ret = btrfs_drop_verity_items(BTRFS_I(inode));
				iput(inode);
				inode = NULL;
				if (ret)
					goto out;
			}
			trans = btrfs_start_transaction(root, 1);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
			btrfs_debug(fs_info, "auto deleting %Lu",
				    found_key.objectid);
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
			btrfs_end_transaction(trans);
			if (ret)
				goto out;
			continue;
		}

		nr_unlink++;

		/* this will do delete_inode and everything for us */
		iput(inode);
	}
	/* release the path since we're done with it */
	btrfs_release_path(path);

	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
		trans = btrfs_join_transaction(root);
		if (!IS_ERR(trans))
			btrfs_end_transaction(trans);
	}

	if (nr_unlink)
		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);

out:
	if (ret)
		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
	btrfs_free_path(path);
	return ret;
}

/*
 * very simple check to peek ahead in the leaf looking for xattrs. If we
 * don't find any xattrs, we know there can't be any acls.
 *
 * slot is the slot the inode is in, objectid is the objectid of the inode
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
					  int slot, u64 objectid,
					  int *first_xattr_slot)
{
	u32 nritems = btrfs_header_nritems(leaf);
	struct btrfs_key found_key;
	static u64 xattr_access = 0;
	static u64 xattr_default = 0;
	int scanned = 0;

	if (!xattr_access) {
		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
	}

	slot++;
	*first_xattr_slot = -1;
	while (slot < nritems) {
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* we found a different objectid, there must not be acls */
		if (found_key.objectid != objectid)
			return 0;

		/* we found an xattr, assume we've got an acl */
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (*first_xattr_slot == -1)
				*first_xattr_slot = slot;
			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
		}

		/*
		 * we found a key greater than an xattr key, there can't
		 * be any acls later on
		 */
		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			return 0;

		slot++;
		scanned++;

		/*
		 * it goes inode, inode backrefs, xattrs, extents,
		 * so if there are a ton of hard links to an inode there can
		 * be a lot of backrefs. Don't waste time searching too hard,
		 * this is just an optimization
		 */
		if (scanned >= 8)
			break;
	}
	/*
	 * we hit the end of the leaf before we found an xattr or
	 * something larger than an xattr. We have to assume the inode
	 * has acls
	 */
	if (*first_xattr_slot == -1)
		*first_xattr_slot = slot;
	return 1;
}
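
/*
 * Key ordering illustration for the scan above (objectid 257, key types in
 * ascending order), which is why stopping at the first key type greater
 * than BTRFS_XATTR_ITEM_KEY is safe:
 *
 *	(257 INODE_ITEM 0)
 *	(257 INODE_REF <parent dir ino>)
 *	(257 XATTR_ITEM <name hash>)
 *	(257 EXTENT_DATA <file offset>)
 */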

/*
 * read an inode from the btree into the in-memory inode
 */
static int btrfs_read_locked_inode(struct inode *inode,
				   struct btrfs_path *in_path)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path = in_path;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));

	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		if (path != in_path)
			btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));

	inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
			btrfs_timespec_nsec(leaf, &inode_item->atime));

	inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
			btrfs_timespec_nsec(leaf, &inode_item->mtime));

	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
			btrfs_timespec_nsec(leaf, &inode_item->ctime));

	BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_inode_sequence(leaf, inode_item));
	inode->i_generation = BTRFS_I(inode)->generation;
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	BTRFS_I(inode)->index_cnt = (u64)-1;
	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);

cache_index:
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in delayed_nodes_tree.
	 */
	if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
	 * but it guarantees correctness at the expense of occasional full
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

	/*
	 * Same logic as for last_unlink_trans. We don't persist the generation
	 * of the last transaction where this inode was used for a reflink
	 * operation, so after eviction and reloading the inode we must be
	 * pessimistic and assume the last transaction that modified the inode.
	 */
	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		if (ret)
			btrfs_err(fs_info,
				  "error loading props for ino %llu (root %llu): %d",
				  btrfs_ino(BTRFS_I(inode)),
				  root->root_key.objectid, ret);
	}
	if (path != in_path)
		btrfs_free_path(path);

	if (!maybe_acls)
		cache_no_acl(inode);

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
		inode->i_op = &btrfs_dir_inode_operations;
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
		inode_nohighmem(inode);
		inode->i_mapping->a_ops = &btrfs_aops;
		break;
	default:
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		break;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
	return 0;
}

/*
 * given a leaf and an inode, copy the inode fields into the leaf
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode)
{
	struct btrfs_map_token token;
	u64 flags;

	btrfs_init_map_token(&token, leaf);

	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode_get_atime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode_get_atime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode_get_mtime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode_get_mtime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode_get_ctime_sec(inode));
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode_get_ctime_nsec(inode));

	btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
	btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);

	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
	btrfs_set_token_inode_generation(&token, item,
					 BTRFS_I(inode)->generation);
	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_token_inode_flags(&token, item, flags);
	btrfs_set_token_inode_block_group(&token, item, 0);
}
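
/*
 * Note on the flags encoding above (stated here for illustration):
 * btrfs_inode_combine_flags() packs the read-only flags (e.g. verity) into
 * the upper 32 bits of the 64-bit on-disk flags field, mirroring
 * btrfs_inode_split_flags() used when the inode item is read back in.
 */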

/*
 * copy everything in the in-memory inode into the btree.
 */
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
					    struct btrfs_inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto failed;
	}

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);

	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_set_inode_last_trans(trans, inode);
	ret = 0;
failed:
	btrfs_free_path(path);
	return ret;
}

/*
 * copy everything in the in-memory inode into the btree.
 */
int btrfs_update_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * If the inode is a free space inode, we can deadlock during commit
	 * if we put it into the delayed code.
	 *
	 * The data relocation inode should also be directly updated
	 * without delay.
	 */
	if (!btrfs_is_free_space_inode(inode)
	    && !btrfs_is_data_reloc_root(root)
	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
		btrfs_update_root_times(trans, root);

		ret = btrfs_delayed_update_inode(trans, inode);
		if (!ret)
			btrfs_set_inode_last_trans(trans, inode);
		return ret;
	}

	return btrfs_update_inode_item(trans, inode);
}

int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_update_inode(trans, inode);
	if (ret == -ENOSPC)
		return btrfs_update_inode_item(trans, inode);
	return ret;
}
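
/*
 * Example (hypothetical caller sketch, not part of the original file): the
 * usual pattern for persisting in-memory inode changes is to make them inside
 * a transaction and then call btrfs_update_inode(). The helper name is
 * invented for illustration; error handling is minimal.
 */
static inline int example_touch_inode(struct btrfs_inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* One item: the inode item we are going to update. */
	trans = btrfs_start_transaction(inode->root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	inode_inc_iversion(&inode->vfs_inode);
	ret = btrfs_update_inode(trans, inode);
	btrfs_end_transaction(trans);
	return ret;
}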

/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code. It removes a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory.
 */
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
				const struct fscrypt_str *name,
				struct btrfs_rename_ctx *rename_ctx)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
	u64 index;
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret)
		goto err;
	btrfs_release_path(path);

	/*
	 * If we don't have a dir index, we have to look up the inode ref to
	 * get it, and since we then have the inode ref at hand, we remove it
	 * directly; a delayed deletion would be pointless.
	 *
	 * But if we do have the dir index, there is no need to search for the
	 * inode ref. Since the inode ref is close to the inode item, it is
	 * better to delay its deletion and do it when we update the inode
	 * item.
	 */
	if (inode->dir_index) {
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
			index = inode->dir_index;
			goto skip_backref;
		}
	}

	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
	if (ret) {
		btrfs_info(fs_info,
			"failed to delete reference to %.*s, inode %llu parent %llu",
			name->len, name->name, ino, dir_ino);
		btrfs_abort_transaction(trans, ret);
		goto err;
	}
skip_backref:
	if (rename_ctx)
		rename_ctx->index = index;

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto err;
	}

	/*
	 * If we are in a rename context, we don't need to update anything in the
	 * log. That will be done later during the rename by btrfs_log_new_name().
	 * Besides that, doing it here would only cause extra unnecessary btree
	 * operations on the log tree, increasing latency for applications.
	 */
	if (!rename_ctx) {
		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
	}

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context. If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks. Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);
err:
	btrfs_free_path(path);
	if (ret)
		goto out;

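	/*
	 * A directory's i_size counts every name in it twice, once for the
	 * dir item and once for the dir index item, hence the "* 2" below.
	 */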
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
	inode_inc_iversion(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode(trans, dir);
out:
	return ret;
}

int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *dir, struct btrfs_inode *inode,
		       const struct fscrypt_str *name)
{
	int ret;

	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
	if (!ret) {
		drop_nlink(&inode->vfs_inode);
		ret = btrfs_update_inode(trans, inode);
	}
	return ret;
}

/*
 * helper to start a transaction for unlink and rmdir.
 *
 * unlink and rmdir are special in btrfs: they do not always free space, so
 * if we cannot make our reservations the normal way, try to see if there is
 * plenty of slack room in the global reserve to migrate; otherwise we cannot
 * allow the unlink to occur.
 */
static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
{
	struct btrfs_root *root = dir->root;

	return btrfs_start_transaction_fallback_global_rsv(root,
						BTRFS_UNLINK_METADATA_UNITS);
}

static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_trans_handle *trans;
	struct inode *inode = d_inode(dentry);
	int ret;
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	trans = __unlink_start_trans(BTRFS_I(dir));
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto fscrypt_free;
	}

	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				false);

	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				 &fname.disk_name);
	if (ret)
		goto end_trans;

	if (inode->i_nlink == 0) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
		if (ret)
			goto end_trans;
	}

end_trans:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
fscrypt_free:
	fscrypt_free_filename(&fname);
	return ret;
}

static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct btrfs_inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = dir->root;
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 index;
	int ret;
	u64 objectid;
	u64 dir_ino = btrfs_ino(dir);
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
	if (ret)
		return ret;

	/* This needs to handle no-key deletions later on */

	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = inode->root->root_key.objectid;
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->location.objectid;
	} else {
		WARN_ON(1);
		fscrypt_free_filename(&fname);
		return -EINVAL;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   &fname.disk_name, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation. In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it. Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
		btrfs_release_path(path);
	} else {
		ret = btrfs_del_root_ref(trans, objectid,
					 root->root_key.objectid, dir_ino,
					 &index, &fname.disk_name);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
	inode_inc_iversion(&dir->vfs_inode);
	inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
	ret = btrfs_update_inode_fallback(trans, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	fscrypt_free_filename(&fname);
	return ret;
}

/*
 * Helper to check if the subvolume references other subvolumes or if it's
 * default.
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct fscrypt_str name = FSTR_INIT("default", 7);
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
				   dir_id, &name, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == root->root_key.objectid) {
			ret = -EPERM;
			btrfs_err(fs_info,
				  "deleting default subvolume %llu is not allowed",
				  key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == root->root_key.objectid &&
		    key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}
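
/*
 * Aside on the search above: offset (u64)-1 is the usual btrfs idiom for
 * "find the last possible item with this objectid/type". The search lands
 * just past it (ret > 0 is expected, hence the BUG_ON(ret == 0)), so
 * stepping one slot back reveals whether any ROOT_REF item exists at all.
 */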

/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;
	u64 objectid = 0;

	if (!BTRFS_FS_ERROR(fs_info))
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		objectid = btrfs_ino(entry) + 1;
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			if (atomic_read(&inode->i_count) > 1)
				d_prune_aliases(inode);
			/*
			 * btrfs_drop_inode will have it removed from the inode
			 * cache when its usage count hits zero.
			 */
			iput(inode);
			cond_resched();
			spin_lock(&root->inode_lock);
			goto again;
		}

		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
}

int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
	struct btrfs_root *root = dir->root;
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *dest = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_rsv block_rsv;
	u64 root_flags;
	int ret;

	/*
	 * Don't allow to delete a subvolume with send in progress. This is
	 * inside the inode lock so the error handling that has to drop the bit
	 * again is not run concurrently.
	 */
	spin_lock(&dest->root_item_lock);
	if (dest->send_in_progress) {
		spin_unlock(&dest->root_item_lock);
		btrfs_warn(fs_info,
			   "attempt to delete subvolume %llu during send",
			   dest->root_key.objectid);
		return -EPERM;
	}
	if (atomic_read(&dest->nr_swapfiles)) {
		spin_unlock(&dest->root_item_lock);
		btrfs_warn(fs_info,
			   "attempt to delete subvolume %llu with active swapfile",
			   dest->root_key.objectid);
		return -EPERM;
	}
	root_flags = btrfs_root_flags(&dest->root_item);
	btrfs_set_root_flags(&dest->root_item,
			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
	spin_unlock(&dest->root_item_lock);

	down_write(&fs_info->subvol_sem);

	ret = may_destroy_subvol(dest);
	if (ret)
		goto out_up_write;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/*
	 * One for dir inode,
	 * two for dir entries,
	 * two for root ref/backref.
	 */
	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
	if (ret)
		goto out_up_write;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_release;
	}
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	btrfs_record_snapshot_destroy(trans, dir);

	ret = btrfs_unlink_subvol(trans, dir, dentry);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

	ret = btrfs_record_root_in_trans(trans, dest);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

	memset(&dest->root_item.drop_progress, 0,
	       sizeof(dest->root_item.drop_progress));
	btrfs_set_root_drop_level(&dest->root_item, 0);
	btrfs_set_root_refs(&dest->root_item, 0);

	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
		ret = btrfs_insert_orphan_item(trans,
					       fs_info->tree_root,
					       dest->root_key.objectid);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_end_trans;
		}
	}

	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
				     BTRFS_UUID_KEY_SUBVOL,
				     dest->root_key.objectid);
	if (ret && ret != -ENOENT) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}
	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
		ret = btrfs_uuid_tree_remove(trans,
					     dest->root_item.received_uuid,
					     BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					     dest->root_key.objectid);
		if (ret && ret != -ENOENT) {
			btrfs_abort_transaction(trans, ret);
			goto out_end_trans;
		}
	}

	free_anon_bdev(dest->anon_dev);
	dest->anon_dev = 0;
out_end_trans:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	ret = btrfs_end_transaction(trans);
	inode->i_flags |= S_DEAD;
out_release:
	btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
	up_write(&fs_info->subvol_sem);
	if (ret) {
		spin_lock(&dest->root_item_lock);
		root_flags = btrfs_root_flags(&dest->root_item);
		btrfs_set_root_flags(&dest->root_item,
				     root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
		spin_unlock(&dest->root_item_lock);
	} else {
		d_invalidate(dentry);
		btrfs_prune_dentries(dest);
		ASSERT(dest->send_in_progress == 0);
	}

	return ret;
}

static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	int err = 0;
	struct btrfs_trans_handle *trans;
	u64 last_unlink_trans;
	struct fscrypt_name fname;

	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
			btrfs_err(fs_info,
			"extent tree v2 doesn't support snapshot deletion yet");
			return -EOPNOTSUPP;
		}
		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
	}

	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
	if (err)
		return err;

	/* This needs to handle no-key deletions later on */

	trans = __unlink_start_trans(BTRFS_I(dir));
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_notrans;
	}

	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
		goto out;
	}

	err = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (err)
		goto out;

	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

	/* now the directory is empty */
	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
				 &fname.disk_name);
	if (!err) {
		btrfs_i_size_write(BTRFS_I(inode), 0);
		/*
		 * Propagate the last_unlink_trans value of the deleted dir to
		 * its parent directory. This is to prevent an unrecoverable
		 * log tree in the case we do something like this:
		 * 1) create dir foo
		 * 2) create snapshot under dir foo
		 * 3) delete the snapshot
		 * 4) rmdir foo
		 * 5) mkdir foo
		 * 6) fsync foo or some file inside foo
		 */
		if (last_unlink_trans >= trans->transid)
			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
	}
out:
	btrfs_end_transaction(trans);
out_notrans:
	btrfs_btree_balance_dirty(fs_info);
	fscrypt_free_filename(&fname);

	return err;
}

/*
 * Read a block, zero part of it and write it back.
 *
 * @inode - inode that we're zeroing
 * @from - the offset to start zeroing
 * @len - the length to zero, 0 to zero the entire range relative to the
 *	offset
 * @front - zero up to the offset instead of from the offset on
 *
 * This will find the block for the "from" offset, cow the block and zero the
 * part we want to zero. This is used with truncate and hole punching.
 */
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
			 int front)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	bool only_release_metadata = false;
	u32 blocksize = fs_info->sectorsize;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (blocksize - 1);
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	size_t write_bytes = blocksize;
	int ret = 0;
	u64 block_start;
	u64 block_end;

	if (IS_ALIGNED(offset, blocksize) &&
	    (!len || IS_ALIGNED(len, blocksize)))
		goto out;

	block_start = round_down(from, blocksize);
	block_end = block_start + blocksize - 1;

	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
					  blocksize, false);
	if (ret < 0) {
		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
			/* For nocow case, no need to reserve data space */
			only_release_metadata = true;
		} else {
			goto out;
		}
	}
	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
	if (ret < 0) {
		if (!only_release_metadata)
			btrfs_free_reserved_data_space(inode, data_reserved,
						       block_start, blocksize);
		goto out;
	}
again:
	page = find_or_create_page(mapping, index, mask);
	if (!page) {
		btrfs_delalloc_release_space(inode, data_reserved, block_start,
					     blocksize, true);
		btrfs_delalloc_release_extents(inode, blocksize);
		ret = -ENOMEM;
		goto out;
	}

	if (!PageUptodate(page)) {
		ret = btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto out_unlock;
		}
	}

	/*
	 * We unlock the page after the io is completed and then re-lock it
	 * above. release_folio() could have come in between that and cleared
	 * PagePrivate(), but left the page in the mapping. Set the page mapped
	 * here to make sure it's properly set for the subpage stuff.
	 */
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto out_unlock;

	wait_on_page_writeback(page);

	lock_extent(io_tree, block_start, block_end, &cached_state);

	ordered = btrfs_lookup_ordered_extent(inode, block_start);
	if (ordered) {
		unlock_extent(io_tree, block_start, block_end, &cached_state);
		unlock_page(page);
		put_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	clear_extent_bit(&inode->io_tree, block_start, block_end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 &cached_state);

	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
					&cached_state);
	if (ret) {
		unlock_extent(io_tree, block_start, block_end, &cached_state);
		goto out_unlock;
	}

	if (offset != blocksize) {
		if (!len)
			len = blocksize - offset;
		if (front)
			memzero_page(page, (block_start - page_offset(page)),
				     offset);
		else
			memzero_page(page, (block_start - page_offset(page)) + offset,
				     len);
	}
	btrfs_page_clear_checked(fs_info, page, block_start,
				 block_end + 1 - block_start);
	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
	unlock_extent(io_tree, block_start, block_end, &cached_state);

	if (only_release_metadata)
		set_extent_bit(&inode->io_tree, block_start, block_end,
			       EXTENT_NORESERVE, NULL);

out_unlock:
	if (ret) {
		if (only_release_metadata)
			btrfs_delalloc_release_metadata(inode, blocksize, true);
		else
			btrfs_delalloc_release_space(inode, data_reserved,
						     block_start, blocksize, true);
	}
	btrfs_delalloc_release_extents(inode, blocksize);
	unlock_page(page);
	put_page(page);
out:
	if (only_release_metadata)
		btrfs_check_nocow_unlock(inode);
	extent_changeset_free(data_reserved);
	return ret;
}
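
/*
 * Example (illustrative sketch, hypothetical helper name): a truncate to an
 * unaligned size zeroes the tail of the final block so no stale data is
 * exposed past the new EOF.
 */
static inline int example_zero_tail_block(struct btrfs_inode *inode,
					  loff_t new_size)
{
	/* len == 0: zero from new_size to the end of its block. */
	return btrfs_truncate_block(inode, new_size, 0, 0);
}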

static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_drop_extents_args drop_args = { 0 };
	int ret;

	/*
	 * If NO_HOLES is enabled, we don't need to do anything.
	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
	 * or btrfs_update_inode() will be called, which guarantees that the
	 * next fsync will know this inode was changed and needs to be logged.
	 */
	if (btrfs_fs_incompat(fs_info, NO_HOLES))
		return 0;

	/*
	 * 1 - for the one we're dropping
	 * 1 - for the one we're adding
	 * 1 - for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	drop_args.start = offset;
	drop_args.end = offset + len;
	drop_args.drop_cache = true;

	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
	} else {
		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
		btrfs_update_inode(trans, inode);
	}
	btrfs_end_transaction(trans);
	return ret;
}

/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for. So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
 * for the range between oldsize and size.
 */
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
	u64 block_end = ALIGN(size, fs_info->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int err = 0;

	/*
	 * If our size started in the middle of a block we need to zero out the
	 * rest of the block before we expand the i_size, otherwise we could
	 * expose stale data.
	 */
	err = btrfs_truncate_block(inode, oldsize, 0, 0);
	if (err)
		return err;

	if (size <= hole_start)
		return 0;

	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
					   &cached_state);
	cur_offset = hole_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      block_end - cur_offset);
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte, fs_info->sectorsize);
		hole_size = last_byte - cur_offset;

		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			struct extent_map *hole_em;

			err = maybe_insert_hole(inode, cur_offset, hole_size);
			if (err)
				break;

			err = btrfs_inode_set_file_extent_range(inode,
							cur_offset, hole_size);
			if (err)
				break;

			hole_em = alloc_extent_map();
			if (!hole_em) {
				btrfs_drop_extent_map_range(inode, cur_offset,
						cur_offset + hole_size - 1,
						false);
				btrfs_set_inode_full_sync(inode);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;

			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = btrfs_get_fs_generation(fs_info);

			err = btrfs_replace_extent_map_range(inode, hole_em, true);
			free_extent_map(hole_em);
		} else {
			err = btrfs_inode_set_file_extent_range(inode,
							cur_offset, hole_size);
			if (err)
				break;
		}
next:
		free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}
	free_extent_map(em);
	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
	return err;
}
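
/*
 * Worked example for the expansion above, assuming 4K sectors: growing a
 * file from i_size 6000 to 16384 first zeroes bytes 6000..8191 of the last
 * existing block via btrfs_truncate_block(), then covers 8192..16383 with
 * hole file extents (or just hole extent maps when NO_HOLES is enabled).
 */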

static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set. For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
			inode_set_mtime_to_ts(inode,
					      inode_set_ctime_current(inode));
		}
	}

	if (newsize > oldsize) {
		/*
		 * Don't do an expanding truncate while snapshotting is ongoing.
		 * This is to ensure the snapshot captures a fully consistent
		 * state of this file - if the snapshot captures this expanding
		 * truncation, it must capture all writes that happened before
		 * this truncation.
		 */
		btrfs_drew_write_lock(&root->snapshot_lock);
		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
		if (ret) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return ret;
		}

		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return PTR_ERR(trans);
		}

		i_size_write(inode, newsize);
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		pagecache_isize_extended(inode, oldsize, newsize);
		ret = btrfs_update_inode(trans, BTRFS_I(inode));
		btrfs_drew_write_unlock(&root->snapshot_lock);
		btrfs_end_transaction(trans);
	} else {
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		if (btrfs_is_zoned(fs_info)) {
			ret = btrfs_wait_ordered_range(inode,
					ALIGN(newsize, fs_info->sectorsize),
					(u64)-1);
			if (ret)
				return ret;
		}

		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure any new writes to the file get on disk
		 * on close.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		truncate_setsize(inode, newsize);

		inode_dio_wait(inode);

		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
		if (ret && inode->i_nlink) {
			int err;

			/*
			 * Truncate failed, so fix up the in-memory size. We
			 * adjusted disk_i_size down as we removed extents, so
			 * wait for disk_i_size to be stable and then update the
			 * in-memory size to match.
			 */
			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
			if (err)
				return err;
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
		}
	}

	return ret;
}

static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int err;

	if (btrfs_root_readonly(root))
		return -EROFS;

	err = setattr_prepare(idmap, dentry, attr);
	if (err)
		return err;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		err = btrfs_setsize(inode, attr);
		if (err)
			return err;
	}

	if (attr->ia_valid) {
		setattr_copy(idmap, inode, attr);
		inode_inc_iversion(inode);
		err = btrfs_dirty_inode(BTRFS_I(inode));

		if (!err && attr->ia_valid & ATTR_MODE)
			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
	}

	return err;
}

/*
 * While truncating the inode pages during eviction, we get the VFS
 * calling btrfs_invalidate_folio() against each folio of the inode. This
 * is slow because the calls to btrfs_invalidate_folio() result in a
 * huge number of calls to lock_extent() and clear_extent_bit(),
 * which keep merging and splitting extent_state structures over and over,
 * wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
 * skip all those expensive operations on a per-folio basis and do only
 * the ordered io finishing, while we release here the extent_map and
 * extent_state structures, without the excessive merging and splitting.
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct rb_node *node;

	ASSERT(inode->i_state & I_FREEING);
	truncate_inode_pages_final(&inode->i_data);

	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);

	/*
	 * Keep looping until we have no more ranges in the io tree.
	 * We can have ongoing bios started by readahead whose endio callback
	 * (extent_io.c:end_bio_extent_readpage) is still in progress (it has
	 * unlocked the pages in the bio but has not yet unlocked the ranges
	 * in the io tree). This means some ranges can still be locked while
	 * eviction has started, because before those bios were submitted
	 * (they are executed by a separate task, a work queue kthread) no
	 * inode references (inode->i_count) were taken, references that would
	 * be dropped in the end io callback of each bio.
	 * Therefore here we effectively end up waiting for those bios and for
	 * anyone else holding locked ranges without having bumped the inode's
	 * reference count - if we don't do it, when they access the inode's
	 * io_tree to unlock a range it may be too late, leading to a
	 * use-after-free issue.
	 */
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;
		u64 start;
		u64 end;
		unsigned state_flags;

		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		start = state->start;
		end = state->end;
		state_flags = state->state;
		spin_unlock(&io_tree->lock);

		lock_extent(io_tree, start, end, &cached_state);

		/*
		 * If still has DELALLOC flag, the extent didn't reach disk,
		 * and its reserved space won't be freed by delayed_ref.
		 * So we need to free its reserved space here.
		 * (Refer to comment in btrfs_invalidate_folio, case 2)
		 *
		 * Note, end is the bytenr of the last byte, so we need + 1 here.
		 */
		if (state_flags & EXTENT_DELALLOC)
			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
					       end - start + 1);

		clear_extent_bit(io_tree, start, end,
				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
				 &cached_state);

		cond_resched();
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}
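
/*
 * Illustrative sketch (not part of the build): the loop above is the usual
 * "peek under a spinlock, drop it before sleeping" pattern. A minimal
 * skeleton of that pattern, with hypothetical names, looks like:
 *
 *	spin_lock(&tree->lock);
 *	while (!RB_EMPTY_ROOT(&tree->state)) {
 *		// snapshot start/end of the first range under the spinlock
 *		spin_unlock(&tree->lock);
 *		lock_extent(tree, start, end, &cached);	// may sleep
 *		// clean up the range
 *		spin_lock(&tree->lock);
 *	}
 *	spin_unlock(&tree->lock);
 *
 * Dropping the spinlock before lock_extent() matters because lock_extent()
 * can block waiting for whoever currently holds the range locked.
 */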

static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
							struct btrfs_block_rsv *rsv)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
	int ret;

	/*
	 * Eviction should be taking place somewhere safe because of our
	 * delayed iputs. However the normal flushing code will run delayed
	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
	 *
	 * We reserve the delayed_refs_extra here again because we can't use
	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
	 * above. We reserve our extra bit here because we generate a ton of
	 * delayed refs activity by truncating.
	 *
	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can;
	 * if we fail to make this reservation we can re-try without the
	 * delayed_refs_extra so we can make some forward progress.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
				     BTRFS_RESERVE_FLUSH_EVICT);
	if (ret) {
		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
					     BTRFS_RESERVE_FLUSH_EVICT);
		if (ret) {
			btrfs_warn(fs_info,
				   "could not allocate space for delete; will truncate on mount");
			return ERR_PTR(-ENOSPC);
		}
		delayed_refs_extra = 0;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return trans;

	if (delayed_refs_extra) {
		trans->block_rsv = &fs_info->trans_block_rsv;
		trans->bytes_reserved = delayed_refs_extra;
		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
					delayed_refs_extra, true);
	}
	return trans;
}
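
/*
 * Condensed sketch of the fallback above (illustrative only): first try to
 * refill rsv->size plus the delayed-refs cushion; if that fails, retry with
 * just rsv->size and run without the cushion:
 *
 *	if (btrfs_block_rsv_refill(fs_info, rsv, rsv->size + extra, flush)) {
 *		if (btrfs_block_rsv_refill(fs_info, rsv, rsv->size, flush))
 *			return ERR_PTR(-ENOSPC);
 *		extra = 0;	// degraded, but can make forward progress
 *	}
 *
 * Only when the cushion was actually reserved is it migrated onto the
 * transaction's block reservation.
 */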

void btrfs_evict_inode(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv = NULL;
	int ret;

	trace_btrfs_inode_evict(inode);

	if (!root) {
		fsverity_cleanup_inode(inode);
		clear_inode(inode);
		return;
	}

	evict_inode_truncate_pages(inode);

	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
	     btrfs_is_free_space_inode(BTRFS_I(inode))))
		goto out;

	if (is_bad_inode(inode))
		goto out;

	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		goto out;

	if (inode->i_nlink > 0) {
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
		goto out;
	}

	/*
	 * This makes sure the inode item in tree is uptodate and the space for
	 * the inode update is released.
	 */
	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
	if (ret)
		goto out;

	/*
	 * This drops any pending insert or delete operations we have for this
	 * inode. We could have a delayed dir index deletion queued up, but
	 * we're removing the inode completely so that'll be taken care of in
	 * the truncate.
	 */
	btrfs_kill_delayed_inode_items(BTRFS_I(inode));

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		goto out;
	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
	rsv->failfast = true;

	btrfs_i_size_write(BTRFS_I(inode), 0);

	while (1) {
		struct btrfs_truncate_control control = {
			.inode = BTRFS_I(inode),
			.ino = btrfs_ino(BTRFS_I(inode)),
			.new_size = 0,
			.min_type = 0,
		};

		trans = evict_refill_and_join(root, rsv);
		if (IS_ERR(trans))
			goto out;

		trans->block_rsv = rsv;

		ret = btrfs_truncate_inode_items(trans, root, &control);
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
		/*
		 * We have not added new delayed items for our inode after we
		 * have flushed its delayed items, so no need to throttle on
		 * delayed items. However we have modified extent buffers.
		 */
		btrfs_btree_balance_dirty_nodelay(fs_info);
		if (ret && ret != -ENOSPC && ret != -EAGAIN)
			goto out;
		else if (!ret)
			break;
	}

	/*
	 * Errors here aren't a big deal, it just means we leave orphan items in
	 * the tree. They will be cleaned up on the next mount. If the inode
	 * number gets reused, cleanup deletes the orphan item without doing
	 * anything, and unlink reuses the existing orphan item.
	 *
	 * If it turns out that we are dropping too many of these, we might want
	 * to add a mechanism for retrying these after a commit.
	 */
	trans = evict_refill_and_join(root, rsv);
	if (!IS_ERR(trans)) {
		trans->block_rsv = rsv;
		btrfs_orphan_del(trans, BTRFS_I(inode));
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
	}

out:
	btrfs_free_block_rsv(fs_info, rsv);
	/*
	 * If we didn't successfully delete, the orphan item will still be in
	 * the tree and we'll retry on the next mount. Again, we might also want
	 * to retry these periodically in the future.
	 */
	btrfs_remove_delayed_node(BTRFS_I(inode));
	fsverity_cleanup_inode(inode);
	clear_inode(inode);
}

/*
 * Return the key found in the dir entry in the location pointer, fill @type
 * with BTRFS_FT_*, and return 0.
 *
 * If no dir entries were found, returns -ENOENT.
 * If found a corrupted location in dir entry, returns -EUCLEAN.
 */
static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
			       struct btrfs_key *location, u8 *type)
{
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	struct btrfs_root *root = dir->root;
	int ret = 0;
	struct fscrypt_name fname;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
	if (ret < 0)
		goto out;
	/*
	 * fscrypt_setup_filename() should never return a positive value, but
	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
	 */
	ASSERT(ret == 0);

	/* This needs to handle no-key deletions later on */

	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
				   &fname.disk_name, 0);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
	if (location->type != BTRFS_INODE_ITEM_KEY &&
	    location->type != BTRFS_ROOT_ITEM_KEY) {
		ret = -EUCLEAN;
		btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
			   __func__, fname.disk_name.name, btrfs_ino(dir),
			   location->objectid, location->type, location->offset);
	}
	if (!ret)
		*type = btrfs_dir_ftype(path->nodes[0], di);
out:
	fscrypt_free_filename(&fname);
	btrfs_free_path(path);
	return ret;
}
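
/*
 * Example use (illustrative sketch, made-up variables): resolve a dentry in
 * @dir to a key and a BTRFS_FT_* type, treating -ENOENT as a negative lookup:
 *
 *	struct btrfs_key loc;
 *	u8 type;
 *	int ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &loc, &type);
 *
 *	if (ret == -ENOENT)
 *		;	// no such entry
 *	else if (ret == 0 && loc.type == BTRFS_INODE_ITEM_KEY)
 *		;	// loc.objectid is the inode number
 *
 * A BTRFS_ROOT_ITEM_KEY result means the entry points at a subvolume and
 * needs the fixup_tree_root_location() treatment below.
 */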

/*
 * when we hit a tree root in a directory, the btrfs part of the inode
 * needs to be changed to reflect the root directory of the tree root. This
 * is kind of like crossing a mount point.
 */
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
				    struct btrfs_inode *dir,
				    struct dentry *dentry,
				    struct btrfs_key *location,
				    struct btrfs_root **sub_root)
{
	struct btrfs_path *path;
	struct btrfs_root *new_root;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;
	int err = 0;
	struct fscrypt_name fname;

	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
	if (ret)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	err = -ENOENT;
	key.objectid = dir->root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = location->objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret) {
		if (ret < 0)
			err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
		goto out;

	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
				   (unsigned long)(ref + 1), fname.disk_name.len);
	if (ret)
		goto out;

	btrfs_release_path(path);

	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
	if (IS_ERR(new_root)) {
		err = PTR_ERR(new_root);
		goto out;
	}

	*sub_root = new_root;
	location->objectid = btrfs_root_dirid(&new_root->root_item);
	location->type = BTRFS_INODE_ITEM_KEY;
	location->offset = 0;
	err = 0;
out:
	btrfs_free_path(path);
	fscrypt_free_filename(&fname);
	return err;
}

static void inode_tree_add(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_inode *entry;
	struct rb_node **p;
	struct rb_node *parent;
	struct rb_node *new = &inode->rb_node;
	u64 ino = btrfs_ino(inode);

	if (inode_unhashed(&inode->vfs_inode))
		return;
	parent = NULL;
	spin_lock(&root->inode_lock);
	p = &root->inode_tree.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_inode, rb_node);

		if (ino < btrfs_ino(entry))
			p = &parent->rb_left;
		else if (ino > btrfs_ino(entry))
			p = &parent->rb_right;
		else {
			WARN_ON(!(entry->vfs_inode.i_state &
				  (I_WILL_FREE | I_FREEING)));
			rb_replace_node(parent, new, &root->inode_tree);
			RB_CLEAR_NODE(parent);
			spin_unlock(&root->inode_lock);
			return;
		}
	}
	rb_link_node(new, parent, p);
	rb_insert_color(new, &root->inode_tree);
	spin_unlock(&root->inode_lock);
}

static void inode_tree_del(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	int empty = 0;

	spin_lock(&root->inode_lock);
	if (!RB_EMPTY_NODE(&inode->rb_node)) {
		rb_erase(&inode->rb_node, &root->inode_tree);
		RB_CLEAR_NODE(&inode->rb_node);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
	}
	spin_unlock(&root->inode_lock);

	if (empty && btrfs_root_refs(&root->root_item) == 0) {
		spin_lock(&root->inode_lock);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
		spin_unlock(&root->inode_lock);
		if (empty)
			btrfs_add_dead_root(root);
	}
}

static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
	struct btrfs_iget_args *args = p;

	inode->i_ino = args->ino;
	BTRFS_I(inode)->location.objectid = args->ino;
	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
	BTRFS_I(inode)->location.offset = 0;
	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
	BUG_ON(args->root && !BTRFS_I(inode)->root);

	if (args->root && args->root == args->root->fs_info->tree_root &&
	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
			&BTRFS_I(inode)->runtime_flags);
	return 0;
}

static int btrfs_find_actor(struct inode *inode, void *opaque)
{
	struct btrfs_iget_args *args = opaque;

	return args->ino == BTRFS_I(inode)->location.objectid &&
	       args->root == BTRFS_I(inode)->root;
}

static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
				       struct btrfs_root *root)
{
	struct inode *inode;
	struct btrfs_iget_args args;
	unsigned long hashval = btrfs_inode_hash(ino, root);

	args.ino = ino;
	args.root = root;

	inode = iget5_locked(s, hashval, btrfs_find_actor,
			     btrfs_init_locked_inode,
			     (void *)&args);
	return inode;
}
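
/*
 * Note on the hash/actor pair above (illustrative): the inode cache is keyed
 * on both the inode number and the root, so two snapshots can cache the
 * "same" inode number independently. A lookup is effectively:
 *
 *	args.ino = ino; args.root = root;
 *	inode = iget5_locked(sb, btrfs_inode_hash(ino, root),
 *			     btrfs_find_actor, btrfs_init_locked_inode, &args);
 *
 * where btrfs_find_actor() rejects any cached inode whose root differs, even
 * when the inode number matches.
 */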

/*
 * Get an inode object given its inode number and corresponding root.
 * Path can be preallocated to prevent recursing back to iget through
 * allocator. NULL is also valid but may require an additional allocation
 * later.
 */
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
			      struct btrfs_root *root, struct btrfs_path *path)
{
	struct inode *inode;

	inode = btrfs_iget_locked(s, ino, root);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		int ret;

		ret = btrfs_read_locked_inode(inode, path);
		if (!ret) {
			inode_tree_add(BTRFS_I(inode));
			unlock_new_inode(inode);
		} else {
			iget_failed(inode);
			/*
			 * ret > 0 can come from btrfs_search_slot called by
			 * btrfs_read_locked_inode, this means the inode item
			 * was not found.
			 */
			if (ret > 0)
				ret = -ENOENT;
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}

struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
{
	return btrfs_iget_path(s, ino, root, NULL);
}

static struct inode *new_simple_dir(struct inode *dir,
				    struct btrfs_key *key,
				    struct btrfs_root *root)
{
	struct timespec64 ts;
	struct inode *inode = new_inode(dir->i_sb);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	BTRFS_I(inode)->root = btrfs_grab_root(root);
	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);

	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
	/*
	 * We only need lookup, the rest is read-only and there's no inode
	 * associated with the dentry
	 */
	inode->i_op = &simple_dir_inode_operations;
	inode->i_opflags &= ~IOP_XATTR;
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;

	ts = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, ts);
	inode_set_atime_to_ts(inode, inode_get_atime(dir));
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	inode->i_uid = dir->i_uid;
	inode->i_gid = dir->i_gid;

	return inode;
}

static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);

static inline u8 btrfs_inode_type(struct inode *inode)
{
	return fs_umode_to_ftype(inode->i_mode);
}

struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct inode *inode;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *sub_root = root;
	struct btrfs_key location;
	u8 di_type = 0;
	int ret = 0;

	if (dentry->d_name.len > BTRFS_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
	if (ret < 0)
		return ERR_PTR(ret);

	if (location.type == BTRFS_INODE_ITEM_KEY) {
		inode = btrfs_iget(dir->i_sb, location.objectid, root);
		if (IS_ERR(inode))
			return inode;

		/* Do extra check against inode mode with di_type */
		if (btrfs_inode_type(inode) != di_type) {
			btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
				   inode->i_mode, btrfs_inode_type(inode),
				   di_type);
			iput(inode);
			return ERR_PTR(-EUCLEAN);
		}
		return inode;
	}

	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
				       &location, &sub_root);
	if (ret < 0) {
		if (ret != -ENOENT)
			inode = ERR_PTR(ret);
		else
			inode = new_simple_dir(dir, &location, root);
	} else {
		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
		btrfs_put_root(sub_root);

		if (IS_ERR(inode))
			return inode;

		down_read(&fs_info->cleanup_work_sem);
		if (!sb_rdonly(inode->i_sb))
			ret = btrfs_orphan_cleanup(sub_root);
		up_read(&fs_info->cleanup_work_sem);
		if (ret) {
			iput(inode);
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}

static int btrfs_dentry_delete(const struct dentry *dentry)
{
	struct btrfs_root *root;
	struct inode *inode = d_inode(dentry);

	if (!inode && !IS_ROOT(dentry))
		inode = d_inode(dentry->d_parent);

	if (inode) {
		root = BTRFS_I(inode)->root;
		if (btrfs_root_refs(&root->root_item) == 0)
			return 1;

		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
			return 1;
	}
	return 0;
}

static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
{
	struct inode *inode = btrfs_lookup_dentry(dir, dentry);

	if (inode == ERR_PTR(-ENOENT))
		inode = NULL;
	return d_splice_alias(inode, dentry);
}

/*
 * Find the highest existing sequence number in a directory and then set the
 * in-memory index_cnt variable to the first free sequence number.
 */
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_key key, found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = (u64)-1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* FIXME: we should be able to handle this */
	if (ret == 0)
		goto out;
	ret = 0;

	if (path->slots[0] == 0) {
		inode->index_cnt = BTRFS_DIR_START_INDEX;
		goto out;
	}

	path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

	if (found_key.objectid != btrfs_ino(inode) ||
	    found_key.type != BTRFS_DIR_INDEX_KEY) {
		inode->index_cnt = BTRFS_DIR_START_INDEX;
		goto out;
	}

	inode->index_cnt = found_key.offset + 1;
out:
	btrfs_free_path(path);
	return ret;
}
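
/*
 * The (u64)-1 offset above is the usual btree idiom for "find the last item
 * with this objectid/type". A sketch of the idiom (hypothetical key):
 *
 *	key.objectid = ino;
 *	key.type = BTRFS_DIR_INDEX_KEY;
 *	key.offset = (u64)-1;			// beyond any real index
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret > 0 && path->slots[0] > 0)
 *		path->slots[0]--;		// step back to the last item
 *
 * ret == 0 would mean an item with offset (u64)-1 actually exists, which is
 * why the FIXME above treats that case as unexpected.
 */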

static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
{
	int ret = 0;

	btrfs_inode_lock(dir, 0);
	if (dir->index_cnt == (u64)-1) {
		ret = btrfs_inode_delayed_dir_index_count(dir);
		if (ret) {
			ret = btrfs_set_inode_index_count(dir);
			if (ret)
				goto out;
		}
	}

	/* index_cnt is the index number of the next new entry, so decrement it. */
	*index = dir->index_cnt - 1;
out:
	btrfs_inode_unlock(dir, 0);

	return ret;
}

/*
 * All this infrastructure exists because dir_emit can fault, and we are holding
 * the tree lock when doing readdir. For now just allocate a buffer and copy
 * our information into that, and then dir_emit from the buffer. This is
 * similar to what NFS does, only we don't keep the buffer around in pagecache
 * because I'm afraid I'll mess that up. Long term we need to make filldir do
 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 * tree lock.
 */
static int btrfs_opendir(struct inode *inode, struct file *file)
{
	struct btrfs_file_private *private;
	u64 last_index;
	int ret;

	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
	if (ret)
		return ret;

	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
	if (!private)
		return -ENOMEM;
	private->last_index = last_index;
	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!private->filldir_buf) {
		kfree(private);
		return -ENOMEM;
	}
	file->private_data = private;
	return 0;
}

static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct btrfs_file_private *private = file->private_data;
	int ret;

	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
				       &private->last_index);
	if (ret)
		return ret;

	return generic_file_llseek(file, offset, whence);
}

struct dir_entry {
	u64 ino;
	u64 offset;
	unsigned type;
	int name_len;
};

static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
{
	while (entries--) {
		struct dir_entry *entry = addr;
		char *name = (char *)(entry + 1);

		ctx->pos = get_unaligned(&entry->offset);
		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
			      get_unaligned(&entry->ino),
			      get_unaligned(&entry->type)))
			return 1;
		addr += sizeof(struct dir_entry) +
			get_unaligned(&entry->name_len);
		ctx->pos++;
	}
	return 0;
}
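
/*
 * Layout of the filldir buffer consumed above (illustrative): entries are
 * packed back to back, each a struct dir_entry immediately followed by the
 * unterminated name bytes:
 *
 *	[ dir_entry | name ][ dir_entry | name ] ...
 *
 * The cursor therefore advances by sizeof(struct dir_entry) + name_len per
 * entry, and get_unaligned()/put_unaligned() are used because the variable
 * length names can leave every following header misaligned.
 */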

static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_private *private = file->private_data;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	void *addr;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;
	char *name_ptr;
	int name_len;
	int entries = 0;
	int total_len = 0;
	bool put = false;
	struct btrfs_key location;

	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	addr = private->filldir_buf;
	path->reada = READA_FORWARD;

	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
					      &ins_list, &del_list);

again:
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(BTRFS_I(inode));

	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
		struct dir_entry *entry;
		struct extent_buffer *leaf = path->nodes[0];
		u8 ftype;

		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		if (found_key.offset < ctx->pos)
			continue;
		if (found_key.offset > private->last_index)
			break;
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
			continue;
		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
		name_len = btrfs_dir_name_len(leaf, di);
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
		}

		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
		entry = addr;
		name_ptr = (char *)(entry + 1);
		read_extent_buffer(leaf, name_ptr,
				   (unsigned long)(di + 1), name_len);
		put_unaligned(name_len, &entry->name_len);
		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
	}
	/* Catch error encountered during iteration */
	if (ret < 0)
		goto err;

	btrfs_release_path(path);

	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;

	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
	if (ret)
		goto nopos;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset. This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir. Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
nopos:
	ret = 0;
err:
	if (put)
		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}

/*
 * This is somewhat expensive, updating the tree every time the
 * inode changes. But, it is most likely to find the inode in cache.
 * FIXME: needs more benchmarking; there is no reason other than performance
 * to keep or drop this code.
 */
static int btrfs_dirty_inode(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;

	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
		return 0;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_update_inode(trans, inode);
	if (ret == -ENOSPC || ret == -EDQUOT) {
		/* whoops, let's try again with the full transaction */
		btrfs_end_transaction(trans);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_update_inode(trans, inode);
	}
	btrfs_end_transaction(trans);
	if (inode->delayed_node)
		btrfs_balance_delayed_items(fs_info);

	return ret;
}
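
/*
 * The join-then-start fallback above is a common btrfs pattern (sketch):
 *
 *	trans = btrfs_join_transaction(root);	// piggyback, no reservation
 *	ret = btrfs_update_inode(trans, inode);
 *	if (ret == -ENOSPC || ret == -EDQUOT) {
 *		btrfs_end_transaction(trans);
 *		trans = btrfs_start_transaction(root, 1);  // reserve 1 item
 *		ret = btrfs_update_inode(trans, inode);
 *	}
 *
 * Joining avoids reserving metadata space up front, so the update can fail
 * with -ENOSPC/-EDQUOT where a full btrfs_start_transaction(root, 1), which
 * reserves space for one item, may still succeed.
 */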

/*
 * This is a copy of file_update_time. We need this so we can return error on
 * ENOSPC for updating the inode in the case of file write and mmap writes.
 */
static int btrfs_update_time(struct inode *inode, int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	bool dirty;

	if (btrfs_root_readonly(root))
		return -EROFS;

	dirty = inode_update_timestamps(inode, flags);
	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}

/*
 * Helper to find a free sequence number in a given directory. The current
 * code is very simple; later versions will do smarter things in the btree.
 */
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
{
	int ret = 0;

	if (dir->index_cnt == (u64)-1) {
		ret = btrfs_inode_delayed_dir_index_count(dir);
		if (ret) {
			ret = btrfs_set_inode_index_count(dir);
			if (ret)
				return ret;
		}
	}

	*index = dir->index_cnt;
	dir->index_cnt++;

	return ret;
}

static int btrfs_insert_inode_locked(struct inode *inode)
{
	struct btrfs_iget_args args;

	args.ino = BTRFS_I(inode)->location.objectid;
	args.root = BTRFS_I(inode)->root;

	return insert_inode_locked4(inode,
		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
		   btrfs_find_actor, &args);
}

int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
			    unsigned int *trans_num_items)
{
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	int ret;

	if (!args->orphan) {
		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
					     &args->fname);
		if (ret)
			return ret;
	}

	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
	if (ret) {
		fscrypt_free_filename(&args->fname);
		return ret;
	}

	/* 1 to add inode item */
	*trans_num_items = 1;
	/* 1 to add compression property */
	if (BTRFS_I(dir)->prop_compress)
		(*trans_num_items)++;
	/* 1 to add default ACL xattr */
	if (args->default_acl)
		(*trans_num_items)++;
	/* 1 to add access ACL xattr */
	if (args->acl)
		(*trans_num_items)++;
#ifdef CONFIG_SECURITY
	/* 1 to add LSM xattr */
	if (dir->i_security)
		(*trans_num_items)++;
#endif
	if (args->orphan) {
		/* 1 to add orphan item */
		(*trans_num_items)++;
	} else {
		/*
		 * 1 to add dir item
		 * 1 to add dir index
		 * 1 to update parent inode item
		 *
		 * No need for 1 unit for the inode ref item because it is
		 * inserted in a batch together with the inode item at
		 * btrfs_create_new_inode().
		 */
		*trans_num_items += 3;
	}
	return 0;
}
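
/*
 * Worked example for the counting above (assuming CONFIG_SECURITY and a
 * directory that triggers every optional item): creating a regular file with
 * a default ACL, an access ACL, an inherited compression property and an LSM
 * xattr reserves
 *
 *	1 (inode item) + 1 (prop) + 1 (default ACL) + 1 (ACL) + 1 (LSM)
 *	+ 3 (dir item, dir index, parent inode update) = 8 items,
 *
 * while an O_TMPFILE inode with none of the optional items reserves just
 * 1 (inode item) + 1 (orphan item) = 2.
 */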

void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
{
	posix_acl_release(args->acl);
	posix_acl_release(args->default_acl);
	fscrypt_free_filename(&args->fname);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
{
	unsigned int flags;

	flags = dir->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		inode->flags &= ~BTRFS_INODE_COMPRESS;
		inode->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
		inode->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		inode->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->vfs_inode.i_mode))
			inode->flags |= BTRFS_INODE_NODATASUM;
	}

	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
}
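
/*
 * Example of the inheritance rules above (illustrative): a file created in a
 * directory flagged NODATACOW (e.g. chattr +C on the dir) starts out with
 * NODATACOW and, if it is a regular file, NODATASUM as well, since in-place
 * nodatacow writes cannot keep checksums consistent. When the parent sets
 * both, NOCOMPRESS wins over COMPRESS.
 */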

int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args)
{
	struct timespec64 ts;
	struct inode *dir = args->dir;
	struct inode *inode = args->inode;
	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_root *root;
	struct btrfs_inode_item *inode_item;
	struct btrfs_key *location;
	struct btrfs_path *path;
	u64 objectid;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	struct btrfs_item_batch batch;
	unsigned long ptr;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!args->subvol)
		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
	root = BTRFS_I(inode)->root;

	ret = btrfs_get_free_objectid(root, &objectid);
	if (ret)
		goto out;
	inode->i_ino = objectid;

	if (args->orphan) {
		/*
		 * O_TMPFILE, set link count to 0, so that after this point, we
		 * fill in an inode item with the correct link count.
		 */
		set_nlink(inode, 0);
	} else {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
		if (ret)
			goto out;
	}
	/* index_cnt is ignored for everything but a dir. */
	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * Subvolumes don't inherit flags from their parent directory.
	 * Originally this was probably by accident, but we probably can't
	 * change it now without compatibility issues.
	 */
	if (!args->subvol)
		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));

	if (S_ISREG(inode->i_mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	location->type = BTRFS_INODE_ITEM_KEY;

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		if (!args->orphan)
			BTRFS_I(dir)->index_cnt--;
		goto out;
	}

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	btrfs_set_inode_full_sync(BTRFS_I(inode));

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (!args->orphan) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		if (args->subvol) {
			key[1].offset = objectid;
			sizes[1] = 2 + sizeof(*ref);
		} else {
			key[1].offset = btrfs_ino(BTRFS_I(dir));
			sizes[1] = name->len + sizeof(*ref);
		}
	}

	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
	batch.nr = args->orphan ? 1 : 2;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
	if (ret != 0) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	ts = simple_inode_init_ts(inode);
	BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
	BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;

	/*
	 * We're going to fill the inode item now, so at this point the inode
	 * must be fully initialized.
	 */

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			      sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (!args->orphan) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		ptr = (unsigned long)(ref + 1);
		if (args->subvol) {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
			write_extent_buffer(path->nodes[0], "..", ptr, 2);
		} else {
			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
						     name->len);
			btrfs_set_inode_ref_index(path->nodes[0], ref,
						  BTRFS_I(inode)->dir_index);
			write_extent_buffer(path->nodes[0], name->name, ptr,
					    name->len);
		}
	}

	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	/*
	 * We don't need the path anymore, plus inheriting properties, adding
	 * ACLs, security xattrs, orphan item or adding the link, will result in
	 * allocating yet another path. So just free our path.
	 */
	btrfs_free_path(path);
	path = NULL;

	if (args->subvol) {
		struct inode *parent;

		/*
		 * Subvolumes inherit properties from their parent subvolume,
		 * not the directory they were created in.
		 */
		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
				    BTRFS_I(dir)->root);
		if (IS_ERR(parent)) {
			ret = PTR_ERR(parent);
		} else {
			ret = btrfs_inode_inherit_props(trans, inode, parent);
			iput(parent);
		}
	} else {
		ret = btrfs_inode_inherit_props(trans, inode, dir);
	}
	if (ret) {
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
			  ret);
	}

	/*
	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
	 * probably a bug.
	 */
	if (!args->subvol) {
		ret = btrfs_init_inode_security(trans, args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto discard;
		}
	}

	inode_tree_add(BTRFS_I(inode));

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));

	btrfs_update_root_times(trans, root);

	if (args->orphan) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	} else {
		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				     0, BTRFS_I(inode)->dir_index);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto discard;
	}

	return 0;

discard:
	/*
	 * discard_new_inode() calls iput(), but the caller owns the reference
	 * to the inode.
	 */
	ihold(inode);
	discard_new_inode(inode);
out:
	btrfs_free_path(path);
	return ret;
}
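
/*
 * Illustration of the two-item batch built above (not build code): for a
 * non-orphan, non-subvolume inode the batch inserts the inode item and the
 * inode ref in one leaf operation:
 *
 *	key[0] = (objectid, BTRFS_INODE_ITEM_KEY, 0)
 *	key[1] = (objectid, BTRFS_INODE_REF_KEY, parent ino)
 *	sizes  = { sizeof(struct btrfs_inode_item),
 *		   sizeof(struct btrfs_inode_ref) + name->len }
 *
 * (a subvolume's ref uses the name ".." and hence length 2). Batching keeps
 * the two items adjacent in the tree and saves a second btrfs_search_slot()
 * compared to inserting them one at a time.
 */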

/*
 * Utility function to add 'inode' into 'parent_inode' with
 * a given name and a given sequence number.
 * If 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
		   const struct fscrypt_str *name, int add_backref, u64 index)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_root *root = parent_inode->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		memcpy(&key, &inode->root->root_key, sizeof(key));
	} else {
		key.objectid = ino;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
	}

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_add_root_ref(trans, key.objectid,
					 root->root_key.objectid, parent_ino,
					 index, name);
	} else if (add_backref) {
		ret = btrfs_insert_inode_ref(trans, root, name,
					     ino, parent_ino, index);
	}

	/* Nothing to clean up yet */
	if (ret)
		return ret;

	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
				    btrfs_inode_type(&inode->vfs_inode), index);
	if (ret == -EEXIST || ret == -EOVERFLOW)
		goto fail_dir_item;
	else if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
			   name->len * 2);
	inode_inc_iversion(&parent_inode->vfs_inode);
	/*
	 * If we are replaying a log tree, we do not want to update the mtime
	 * and ctime of the parent directory with the current time, since the
	 * log replay procedure is responsible for setting them to their correct
	 * values (the ones it had when the fsync was done).
	 */
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
		inode_set_mtime_to_ts(&parent_inode->vfs_inode,
				      inode_set_ctime_current(&parent_inode->vfs_inode));

	ret = btrfs_update_inode(trans, parent_inode);
	if (ret)
		btrfs_abort_transaction(trans, ret);
	return ret;

fail_dir_item:
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;
		err = btrfs_del_root_ref(trans, key.objectid,
					 root->root_key.objectid, parent_ino,
					 &local_index, name);
		if (err)
			btrfs_abort_transaction(trans, err);
	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
					  &local_index);
		if (err)
			btrfs_abort_transaction(trans, err);
	}

	/* Return the original error code */
	return ret;
}
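
/*
 * Note on the i_size update above (illustrative): directory i_size in btrfs
 * is the sum of the name lengths of its entries, and each entry stores the
 * name twice (once in the BTRFS_DIR_ITEM_KEY item, once in the
 * BTRFS_DIR_INDEX_KEY item), hence the "name->len * 2". For example, after
 * creating "a" and "bc" in an empty directory, its i_size is 1*2 + 2*2 = 6.
 */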

static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
			       struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
		.inode = inode,
	};
	unsigned int trans_num_items;
	struct btrfs_trans_handle *trans;
	int err;

	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (!err)
		d_instantiate_new(dentry, inode);

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (err)
		iput(inode);
	return err;
}

static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, dev_t rdev)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);
	return btrfs_create_common(dir, dentry, inode);
}

static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
			struct dentry *dentry, umode_t mode, bool excl)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	return btrfs_create_common(dir, dentry, inode);
}

static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = d_inode(old_dentry);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct fscrypt_name fname;
	u64 index;
	int err;
	int drop_inode = 0;

	/* do not allow sys_link's with other subvols of the same device */
	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
		return -EXDEV;

	if (inode->i_nlink >= BTRFS_LINK_MAX)
		return -EMLINK;

	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
	if (err)
		goto fail;

	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
	if (err)
		goto fail;

	/*
	 * 2 items for inode and inode ref
	 * 2 items for dir items
	 * 1 item for parent inode
	 * 1 item for orphan item deletion if O_TMPFILE
	 */
	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		trans = NULL;
		goto fail;
	}

	/* There are several dir indexes for this inode, clear the cache. */
	BTRFS_I(inode)->dir_index = 0ULL;
	inc_nlink(inode);
	inode_inc_iversion(inode);
	inode_set_ctime_current(inode);
	ihold(inode);
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     &fname.disk_name, 1, index);

	if (err) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;

		err = btrfs_update_inode(trans, BTRFS_I(inode));
		if (err)
			goto fail;
		if (inode->i_nlink == 1) {
			/*
			 * If new hard link count is 1, it's a file created
			 * with open(2) O_TMPFILE flag.
			 */
			err = btrfs_orphan_del(trans, BTRFS_I(inode));
			if (err)
				goto fail;
		}
		d_instantiate(dentry, inode);
		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
	}

fail:
	fscrypt_free_filename(&fname);
	if (trans)
		btrfs_end_transaction(trans);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;
	return btrfs_create_common(dir, dentry, inode);
}

static noinline int uncompress_inline(struct btrfs_path *path,
				      struct page *page,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
	int compress_type;

	compress_type = btrfs_file_extent_compression(leaf, item);
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);

	/*
	 * The decompression code contains a memset to fill in any space
	 * between the end of the uncompressed data and the end of max_size, in
	 * case the decompressed data ends up shorter than ram_bytes. That
	 * doesn't cover the hole between the end of an inline extent and the
	 * beginning of the next block, so we cover that region here.
	 */
	if (max_size < PAGE_SIZE)
		memzero_page(page, max_size, PAGE_SIZE - max_size);
	kfree(tmp);
	return ret;
}
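
/*
 * Worked example for the zeroing above (illustrative, 4K pages): an inline
 * extent with ram_bytes == 700 decompresses bytes 0..699 into the page;
 * btrfs_decompress() zeroes any gap up to max_size (700 here) if the output
 * came up short, and the memzero_page() call then clears bytes 700..4095 so
 * stale page contents never leak to userspace.
 */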

static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
			      struct page *page)
{
	struct btrfs_file_extent_item *fi;
	void *kaddr;
	size_t copy_size;

	if (!page || PageUptodate(page))
		return 0;

	ASSERT(page_offset(page) == 0);

	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
		return uncompress_inline(path, page, fi);

	copy_size = min_t(u64, PAGE_SIZE,
			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
	kaddr = kmap_local_page(page);
	read_extent_buffer(path->nodes[0], kaddr,
			   btrfs_file_extent_inline_start(fi), copy_size);
	kunmap_local(kaddr);
	if (copy_size < PAGE_SIZE)
		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
	return 0;
}

/*
 * Lookup the first extent overlapping a range in a file.
 *
 * @inode:	file to search in
 * @page:	page to read extent data into if the extent is inline
 * @pg_offset:	offset into @page to copy to
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * Return the first &struct extent_map which overlaps the given range, reading
 * it from the B-tree and caching it if necessary. Note that there may be more
 * extents which overlap the given range after the returned extent_map.
 *
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
 */
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	int extent_type = -1;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (em) {
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto out;
	}
	em->start = EXTENT_MAP_HOLE;
	em->orig_start = EXTENT_MAP_HOLE;
	em->len = (u64)-1;
	em->block_len = (u64)-1;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;

	/*
	 * The same explanation from load_free_space_cache applies here as
	 * well: we only read when we're loading the free space cache, and at
	 * that point the commit_root has everything we need.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}

	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
		ret = 0;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we backed up past the first extent we want to move
		 * forward and see if there is an extent in front of us,
		 * otherwise we'll say there is a hole for our whole search
		 * range, which can cause problems.
		 */
		extent_end = start;
		goto next;
	}

	extent_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	extent_end = btrfs_file_extent_end(path);
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		/* Only a regular file can have a regular/prealloc extent. */
		if (!S_ISREG(inode->vfs_inode.i_mode)) {
			ret = -EUCLEAN;
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
	}
next:
	if (start >= extent_end) {
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				goto not_found;

			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;

		/* New extent overlaps with existing one */
		em->start = start;
		em->orig_start = start;
		em->len = found_key.offset - start;
		em->block_start = EXTENT_MAP_HOLE;
		goto insert;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, em);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		/*
		 * Inline extents can only exist at file offset 0. This is
		 * ensured by the tree-checker and the inline extent creation
		 * path. Thus all members representing file offsets should be
		 * zero.
		 */
		ASSERT(pg_offset == 0);
		ASSERT(extent_start == 0);
		ASSERT(em->start == 0);

		/*
		 * btrfs_extent_item_to_extent_map() should have properly
		 * initialized em members already.
		 *
		 * Other members are not utilized for inline extents.
		 */
		ASSERT(em->block_start == EXTENT_MAP_INLINE);
		ASSERT(em->len == fs_info->sectorsize);

		ret = read_inline_extent(inode, path, page);
		if (ret < 0)
			goto out;
		goto insert;
	}
not_found:
	em->start = start;
	em->orig_start = start;
	em->len = len;
	em->block_start = EXTENT_MAP_HOLE;
insert:
	ret = 0;
	btrfs_release_path(path);
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		ret = -EIO;
		goto out;
	}

	write_lock(&em_tree->lock);
	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
	write_unlock(&em_tree->lock);
out:
	btrfs_free_path(path);

	trace_btrfs_get_extent(root, inode, em);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}
	return em;
}

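/*
 * Create the ordered extent (and, except for NOCOW writes, the pinned extent
 * map) covering a direct IO write to the range [@start, @start + @len).
 * Returns the extent map (NULL for NOCOW writes) or an ERR_PTR on failure.
 */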
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  const u64 start,
						  const u64 len,
						  const u64 orig_start,
						  const u64 block_start,
						  const u64 block_len,
						  const u64 orig_block_len,
						  const u64 ram_bytes,
						  const int type)
{
	struct extent_map *em = NULL;
	struct btrfs_ordered_extent *ordered;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = create_io_em(inode, start, len, orig_start, block_start,
				  block_len, orig_block_len, ram_bytes,
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  type);
		if (IS_ERR(em))
			goto out;
	}
	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
					     block_start, block_len, 0,
					     (1 << type) |
					     (1 << BTRFS_ORDERED_DIRECT),
					     BTRFS_COMPRESS_NONE);
	if (IS_ERR(ordered)) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_map_range(inode, start,
						    start + len - 1, false);
		}
		em = ERR_CAST(ordered);
	} else {
		ASSERT(!dio_data->ordered);
		dio_data->ordered = ordered;
	}
 out:

	return em;
}

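/*
 * Allocate a new data extent for a direct IO write and create the matching
 * pinned extent map and ordered extent. The reserved extent is released
 * again on failure.
 */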
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
						  struct btrfs_dio_data *dio_data,
						  u64 start, u64 len)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = get_extent_allocation_hint(inode, start, len);
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret)
		return ERR_PTR(ret);

	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
				     ins.objectid, ins.offset, ins.offset,
				     ins.offset, BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);

	return em;
}

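/*
 * Return true if the block group containing @bytenr is read-only or cannot
 * be found, in which case the extent must not be overwritten in place.
 */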
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	bool readonly = false;

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = true;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

/*
 * Check if we can do a nocow write into the range [@offset, @offset + @len).
 *
 * @offset:	File offset.
 * @len:	The length to write, will be updated to the nocow writeable
 *		range.
 * @orig_start:	(optional) Return the original file offset of the file extent.
 * @orig_block_len: (optional) Return the original on-disk length of the file
 *		extent.
 * @ram_bytes:	(optional) Return the ram_bytes of the file extent.
 * @strict:	If true, omit optimizations that might force us into
 *		unnecessary cow, e.g. don't trust the generation number.
 *
 * Return:
 * >0	and update @len if we can do a nocow write
 *  0	if we can't do a nocow write
 * <0	if an error happened
 *
 * NOTE: This only checks the file extents; the caller is responsible for
 * waiting for any ordered extents.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes, bool nowait, bool strict)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct can_nocow_file_extent_args nocow_args = { 0 };
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	int found_type;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->nowait = nowait;

	ret = btrfs_lookup_file_extent(NULL, root, path,
				       btrfs_ino(BTRFS_I(inode)), offset, 0);
	if (ret < 0)
		goto out;

	if (ret == 1) {
		if (path->slots[0] == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		path->slots[0]--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	if (btrfs_file_extent_end(path) <= offset)
		goto out;

	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (ram_bytes)
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);

	nocow_args.start = offset;
	nocow_args.end = offset + *len - 1;
	nocow_args.strict = strict;
	nocow_args.free_path = true;

	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
	/* can_nocow_file_extent() has freed the path. */
	path = NULL;

	if (ret != 1) {
		/* Treat errors as not being able to NOCOW. */
		ret = 0;
		goto out;
	}

	ret = 0;
	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
		goto out;

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		range_end = round_up(offset + nocow_args.num_bytes,
				     root->fs_info->sectorsize) - 1;
		ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	if (orig_start)
		*orig_start = key.offset - nocow_args.extent_offset;
	if (orig_block_len)
		*orig_block_len = nocow_args.disk_num_bytes;

	*len = nocow_args.num_bytes;
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

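/*
 * Lock the extent range for a direct IO. Wait for (or, in NOWAIT mode, fail
 * on) any ordered extents or pages in the range that could lead to stale
 * data or deadlocks, retrying until the range is clean.
 */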
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state,
			      unsigned int iomap_flags)
{
	const bool writing = (iomap_flags & IOMAP_WRITE);
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		if (nowait) {
			if (!try_lock_extent(io_tree, lockstart, lockend,
					     cached_state))
				return -EAGAIN;
		} else {
			lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, as we could have raced between the invalidate
		 * in generic_file_direct_write and locking the extent. The
		 * invalidate needs to happen so that reads after a write do
		 * not get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map: if at this point we have already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range has started (we unlock the
			 * ranges in the io tree only when the bios complete,
			 * and buffered writes always lock pages before
			 * attempting to lock the range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but for which we
			 * did not yet submit a corresponding bio (hence it can
			 * not complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	return ret;
}

/* The callers of this must take lock_extent(). */
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type)
{
	struct extent_map *em;
	int ret;

	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_REGULAR);

	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->orig_start = orig_start;
	em->len = len;
	em->block_len = block_len;
	em->block_start = block_start;
	em->orig_block_len = orig_block_len;
	em->ram_bytes = ram_bytes;
	em->generation = -1;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
	if (type == BTRFS_ORDERED_PREALLOC) {
		set_bit(EXTENT_FLAG_FILLING, &em->flags);
	} else if (type == BTRFS_ORDERED_COMPRESSED) {
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->compress_type = compress_type;
	}

	ret = btrfs_replace_extent_map_range(inode, em, true);
	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

	/* The em now holds 2 refs, callers need to call free_extent_map() once. */
	return em;
}

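/*
 * Prepare the extent for a direct IO write: reuse the existing extent when a
 * NOCOW or prealloc write is possible, otherwise allocate a new one, in both
 * cases reserving the needed metadata space and creating the ordered extent.
 * May replace *map and trim *lenp to the range actually covered.
 */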
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 *lenp,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;
	int type;
	u64 block_start, orig_start, orig_block_len, ram_bytes;
	struct btrfs_block_group *bg;
	bool can_nocow = false;
	bool space_reserved = false;
	u64 len = *lenp;
	u64 prev_len;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases:
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes, false, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}

	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
					      orig_start, block_start,
					      len, orig_block_len,
					      ram_bytes, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}

		dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space before,
		 * so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}

	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}

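/*
 * The ->iomap_begin callback for direct IO: lock the file range, find or
 * allocate (for writes) the extent backing [@start, @start + @length) and
 * translate it into @iomap. Falls back to buffered IO for inline or
 * compressed extents.
 */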
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = iter->private;
	u64 lockstart, lockend;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;
	u64 len = length;
	const u64 data_alloc_len = length;
	bool unlock_extents = false;

	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);

	lockstart = start;
	lockend = start + len - 1;

	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}

	memset(dio_data, 0, sizeof(*dio_data));

	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (ret && !(BTRFS_I(inode)->flags &
				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}

	/*
	 * If this errors out it's because we couldn't invalidate the page cache
	 * for this range and we need to fall back to buffered IO, or we are
	 * doing a NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok, for INLINE and COMPRESSED extents we need to fall back on
	 * buffered IO. INLINE is special, and we could probably kludge it in
	 * here, but it's still buffered, so for safety let's just fall back to
	 * the generic buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fall back to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fall back to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fall back to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}

	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fall back to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the
	 * caller asked for, resulting in an unexpected, and incorrect, short
	 * read. That is, the caller asked to read N bytes and we return less
	 * than that, which is wrong unless we are crossing EOF. This happens if
	 * we get a page fault error when trying to fault in pages for the
	 * buffer that is associated to the struct iov_iter passed to
	 * iomap_dio_rw(), and we have previously submitted bios for other
	 * extents in the range, in which case iomap_dio_rw() may return us
	 * EIOCBQUEUED if not all of those bios have completed by the time we
	 * get the page fault error, which we return back to our caller - we
	 * should only return EIOCBQUEUED after we have submitted bios for all
	 * the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		unlock_extents = true;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	} else {
		/*
		 * We need to unlock only the end area that we aren't using.
		 * The rest is going to be unlocked by the endio routine.
		 */
		lockstart = start + len;
		if (lockstart < lockend)
			unlock_extents = true;
	}

	if (unlock_extents)
		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			      &cached_state);
	else
		free_extent_state(cached_state);

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->block_start == EXTENT_MAP_HOLE) ||
	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = em->block_start + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = start;
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
	iomap->length = len;
	free_extent_map(em);

	return 0;

unlock_err:
	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
		      &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}

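/*
 * The ->iomap_end callback for direct IO: unlock any part of the range we
 * did not submit bios for and, for short writes, finish (with failure) the
 * unsubmitted tail of the ordered extent so it doesn't block other waiters.
 */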
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct btrfs_dio_data *dio_data = iter->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);
	int ret = 0;

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
			      NULL);
		return 0;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    pos, length, false);
		else
			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
				      pos + length - 1, NULL);
		ret = -ENOTBLK;
	}
	if (write) {
		btrfs_put_ordered_extent(dio_data->ordered);
		dio_data->ordered = NULL;
	}

	if (write)
		extent_changeset_free(dio_data->data_reserved);
	return ret;
}

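/*
 * Bio completion for direct IO: finish the ordered extent for writes or
 * unlock the extent range for reads, then pass the bio on to iomap.
 */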
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
{
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_inode *inode = bbio->inode;
	struct bio *bio = &bbio->bio;

	if (bio->bi_status) {
		btrfs_warn(inode->root->fs_info,
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
			   btrfs_ino(inode), bio->bi_opf,
			   dip->file_offset, dip->bytes, bio->bi_status);
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
					    dip->file_offset, dip->bytes,
					    !bio->bi_status);
	} else {
		unlock_extent(&inode->io_tree, dip->file_offset,
			      dip->file_offset + dip->bytes - 1, NULL);
	}

	bbio->bio.bi_private = bbio->private;
	iomap_dio_bio_end_io(bio);
}

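/*
 * Submit one direct IO bio, after initializing it from the iomap iterator
 * and, for partial writes, splitting the ordered extent to match the bio.
 */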
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
				loff_t file_offset)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_dio_private *dip =
		container_of(bbio, struct btrfs_dio_private, bbio);
	struct btrfs_dio_data *dio_data = iter->private;

	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
		       btrfs_dio_end_io, bio->bi_private);
	bbio->inode = BTRFS_I(iter->inode);
	bbio->file_offset = file_offset;

	dip->file_offset = file_offset;
	dip->bytes = bio->bi_iter.bi_size;

	dio_data->submitted += bio->bi_iter.bi_size;

	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}

	btrfs_submit_bio(bbio, 0);
}

static const struct iomap_ops btrfs_dio_iomap_ops = {
	.iomap_begin = btrfs_dio_iomap_begin,
	.iomap_end = btrfs_dio_iomap_end,
};

static const struct iomap_dio_ops btrfs_dio_ops = {
	.submit_io = btrfs_dio_submit_io,
	.bio_set = &btrfs_dio_bioset,
};

ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			    IOMAP_DIO_PARTIAL, &data, done_before);
}

struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
				  size_t done_before)
{
	struct btrfs_dio_data data = { 0 };

	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
			      IOMAP_DIO_PARTIAL, &data, done_before);
}

static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
			u64 start, u64 len)
{
	int ret;

	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	/*
	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
	 * file range (0 to LLONG_MAX), but that is not enough if we have
	 * compression enabled. The first filemap_fdatawrite_range() only kicks
	 * off the compression of data (in an async thread) and will return
	 * before the compression is done and writeback is started. A second
	 * filemap_fdatawrite_range() is needed to wait for the compression to
	 * complete and writeback to start. We also need to wait for ordered
	 * extents to complete, because our fiemap implementation uses mainly
	 * file extent items to list the extents, searching for extent maps
	 * only for file ranges with holes or prealloc extents to figure out
	 * if we have delalloc in those ranges.
	 */
	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
		if (ret)
			return ret;
	}

	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}

static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	return extent_writepages(mapping, wbc);
}

static void btrfs_readahead(struct readahead_control *rac)
{
	extent_readahead(rac);
}

/*
 * For release_folio() and invalidate_folio() we have a race window where
 * folio_end_writeback() is called but the subpage spinlock is not yet
 * released. If we continue to release/invalidate the folio, we could cause a
 * use-after-free for the subpage spinlock. So this function is to spin and
 * wait for the subpage spinlock.
 */
static void wait_subpage_spinlock(struct page *page)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct btrfs_subpage *subpage;

	if (!btrfs_is_subpage(fs_info, page))
		return;

	ASSERT(PagePrivate(page) && page->private);
	subpage = (struct btrfs_subpage *)page->private;

	/*
	 * This may look insane as we just acquire the spinlock and release it,
	 * without doing anything. But we just want to make sure no one is
	 * still holding the subpage spinlock.
	 * And since the page is neither dirty nor under writeback, and we
	 * have the page locked, the only possible way to hold a spinlock is
	 * from the endio function to clear page writeback.
	 *
	 * Here we just acquire the spinlock so that all existing callers
	 * should exit and we're safe to release/invalidate the page.
	 */
	spin_lock_irq(&subpage->lock);
	spin_unlock_irq(&subpage->lock);
}

static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	int ret = try_release_extent_mapping(&folio->page, gfp_flags);

	if (ret == 1) {
		wait_subpage_spinlock(&folio->page);
		clear_page_extent_mapped(&folio->page);
	}
	return ret;
}

static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;
	return __btrfs_release_folio(folio, gfp_flags);
}

#ifdef CONFIG_MIGRATION
static int btrfs_migrate_folio(struct address_space *mapping,
			       struct folio *dst, struct folio *src,
			       enum migrate_mode mode)
{
	int ret = filemap_migrate_folio(mapping, dst, src, mode);

	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (folio_test_ordered(src)) {
		folio_clear_ordered(src);
		folio_set_ordered(dst);
	}

	return MIGRATEPAGE_SUCCESS;
}
#else
#define btrfs_migrate_folio NULL
#endif

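/*
 * Invalidate part or all of a folio: wait for any running writeback, then
 * walk the folio range and clean up the extent state and ordered extent
 * accounting for every sub-range, releasing any qgroup reservations that
 * will never be used.
 */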
static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
				   size_t length)
{
	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_io_tree *tree = &inode->io_tree;
	struct extent_state *cached_state = NULL;
	u64 page_start = folio_pos(folio);
	u64 page_end = page_start + folio_size(folio) - 1;
	u64 cur;
	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;

	/*
	 * We have the folio locked, so no new ordered extent can be created
	 * on it, nor can any bio be submitted for this folio.
	 *
	 * But already submitted bios can still be finished on this folio.
	 * Furthermore, the endio function won't skip a folio that has Ordered
	 * (Private2) already cleared, so it's possible for endio and
	 * invalidate_folio to do the same ordered extent accounting twice
	 * on one folio.
	 *
	 * So here we wait for any submitted bios to finish, so that we won't
	 * do double ordered extent accounting on the same folio.
	 */
	folio_wait_writeback(folio);
	wait_subpage_spinlock(&folio->page);

	/*
	 * For the subpage case, we have call sites like
	 * btrfs_punch_hole_lock_range() which pass ranges not aligned to the
	 * sectorsize.
	 * If the range doesn't cover the full folio, we don't need to and
	 * shouldn't clear page extent mapped, as folio->private can still
	 * record subpage dirty bits for other parts of the range.
	 *
	 * For cases that invalidate the full folio even when the range
	 * doesn't cover the full folio, like invalidating the last folio,
	 * we're still safe to wait for the ordered extent to finish.
	 */
	if (!(offset == 0 && length == folio_size(folio))) {
		btrfs_release_folio(folio, GFP_NOFS);
		return;
	}

	if (!inode_evicting)
		lock_extent(tree, page_start, page_end, &cached_state);

	cur = page_start;
	while (cur < page_end) {
		struct btrfs_ordered_extent *ordered;
		u64 range_end;
		u32 range_len;
		u32 extra_flags = 0;

		ordered = btrfs_lookup_first_ordered_range(inode, cur,
							   page_end + 1 - cur);
		if (!ordered) {
			range_end = page_end;
			/*
			 * No ordered extent covering this range, we are safe
			 * to delete all extent states in the range.
			 */
			extra_flags = EXTENT_CLEAR_ALL_BITS;
			goto next;
		}
		if (ordered->file_offset > cur) {
			/*
			 * There is a range between [cur, oe->file_offset) not
			 * covered by any ordered extent.
			 * We are safe to delete all extent states, and handle
			 * the ordered extent in the next iteration.
			 */
			range_end = ordered->file_offset - 1;
			extra_flags = EXTENT_CLEAR_ALL_BITS;
			goto next;
		}

		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
				page_end);
		ASSERT(range_end + 1 - cur < U32_MAX);
		range_len = range_end + 1 - cur;
		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
			/*
			 * If Ordered (Private2) is cleared, it means endio has
			 * already been executed for the range.
			 * We can't delete the extent states as
			 * btrfs_finish_ordered_io() may still use some of them.
			 */
			goto next;
		}
		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);

		/*
		 * IO on this page will never be started, so we need to account
		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
		 * here, must leave that up for the ordered extent completion.
		 *
		 * This will also unlock the range for incoming
		 * btrfs_finish_ordered_io().
		 */
		if (!inode_evicting)
			clear_extent_bit(tree, cur, range_end,
					 EXTENT_DELALLOC |
					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
					 EXTENT_DEFRAG, &cached_state);

		spin_lock_irq(&inode->ordered_tree_lock);
		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
		ordered->truncated_len = min(ordered->truncated_len,
					     cur - ordered->file_offset);
		spin_unlock_irq(&inode->ordered_tree_lock);

		/*
		 * If the ordered extent has finished, we're safe to delete all
		 * the extent states of the range, otherwise
		 * btrfs_finish_ordered_io() will get executed by endio for
		 * other pages, so we can't delete extent states.
		 */
		if (btrfs_dec_test_ordered_pending(inode, &ordered,
						   cur, range_end + 1 - cur)) {
			btrfs_finish_ordered_io(ordered);
			/*
			 * The ordered extent has finished, now we're again
			 * safe to delete all extent states of the range.
			 */
			extra_flags = EXTENT_CLEAR_ALL_BITS;
		}
next:
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		/*
		 * Qgroup reserved space handler
		 * Sector(s) here will be either:
		 *
		 * 1) Already written to disk or bio already finished
		 *    Then its QGROUP_RESERVED bit in io_tree is already
		 *    cleared. Qgroup will be handled by its qgroup_record
		 *    then. btrfs_qgroup_free_data() call will do nothing here.
		 *
		 * 2) Not written to disk yet
		 *    Then btrfs_qgroup_free_data() call will clear the
		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
		 *    reserved data space, since the IO will never happen for
		 *    this page.
		 */
		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
		if (!inode_evicting) {
			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
					 EXTENT_DELALLOC | EXTENT_UPTODATE |
					 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
					 extra_flags, &cached_state);
		}
		cur = range_end + 1;
	}
	/*
	 * We have iterated through all ordered extents of the page, so the
	 * page should not have its Ordered (Private2) bit set anymore, or the
	 * above iteration did something wrong.
	 */
	ASSERT(!folio_test_ordered(folio));
	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
	if (!inode_evicting)
		__btrfs_release_folio(folio, GFP_NOFS);
	clear_page_extent_mapped(&folio->page);
}

/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page, which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF. Because
 * truncate_setsize() writes the inode size before removing pages, once we have
 * the page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	unsigned long zero_start;
	loff_t size;
	vm_fault_t ret;
	int ret2;
	int reserved = 0;
	u64 reserved_space;
	u64 page_start;
	u64 page_end;
	u64 end;

	reserved_space = PAGE_SIZE;

	sb_start_pagefault(inode->i_sb);
	page_start = page_offset(page);
	page_end = page_start + PAGE_SIZE - 1;
	end = page_end;

	/*
	 * Reserving delalloc space after obtaining the page lock can lead to
	 * deadlock. For example, if a dirty page is locked by this function
	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
	 * dirty page write out, then the btrfs_writepages() function could
	 * end up waiting indefinitely to get a lock on the page currently
	 * being processed by the btrfs_page_mkwrite() function.
	 */
	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
					    page_start, reserved_space);
	if (!ret2) {
		ret2 = file_update_time(vmf->vma->vm_file);
		reserved = 1;
	}
	if (ret2) {
		ret = vmf_error(ret2);
		if (reserved)
			goto out;
		goto out_noreserve;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	down_read(&BTRFS_I(inode)->i_mmap_lock);
	lock_page(page);
	size = i_size_read(inode);

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* Page got truncated out from underneath us. */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent(io_tree, page_start, page_end, &cached_state);
	ret2 = set_page_extent_mapped(page);
	if (ret2 < 0) {
		ret = vmf_error(ret2);
		unlock_extent(io_tree, page_start, page_end, &cached_state);
		goto out_unlock;
	}

	/*
	 * We can't set the delalloc bits if there are pending ordered
	 * extents. Drop our locks and wait for them to finish.
	 */
	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
					     PAGE_SIZE);
	if (ordered) {
		unlock_extent(io_tree, page_start, page_end, &cached_state);
		unlock_page(page);
		up_read(&BTRFS_I(inode)->i_mmap_lock);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
		reserved_space = round_up(size - page_start,
					  fs_info->sectorsize);
		if (reserved_space < PAGE_SIZE) {
			end = page_start + reserved_space - 1;
			btrfs_delalloc_release_space(BTRFS_I(inode),
					data_reserved, page_start,
					PAGE_SIZE - reserved_space, true);
		}
	}

	/*
	 * page_mkwrite gets called when the page is first dirtied after it's
	 * faulted in, but write(2) could also dirty a page and set delalloc
	 * bits, thus in this case, for space accounting reasons, we still
	 * need to clear any delalloc bits within this page range since we
	 * have to reserve data & metadata space before lock_page() (see above
	 * comments).
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, &cached_state);

	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
					 &cached_state);
	if (ret2) {
		unlock_extent(io_tree, page_start, page_end, &cached_state);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}

	/* Page is wholly or partially inside EOF. */
	if (page_start + PAGE_SIZE > size)
		zero_start = offset_in_page(size);
	else
		zero_start = PAGE_SIZE;

	if (zero_start != PAGE_SIZE)
		memzero_page(page, zero_start, PAGE_SIZE - zero_start);

	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);

	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));

	unlock_extent(io_tree, page_start, page_end, &cached_state);
	up_read(&BTRFS_I(inode)->i_mmap_lock);

	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return VM_FAULT_LOCKED;

out_unlock:
	unlock_page(page);
	up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
				     reserved_space, (ret != 0));
out_noreserve:
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return ret;
}

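/*
 * Truncate the inode's items on disk down to the current i_size, restarting
 * the transaction (and refilling the reservation) as many times as needed.
 */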
8240static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8241{
8242 struct btrfs_truncate_control control = {
8243 .inode = inode,
8244 .ino = btrfs_ino(inode),
8245 .min_type = BTRFS_EXTENT_DATA_KEY,
8246 .clear_extent_range = true,
8247 };
8248 struct btrfs_root *root = inode->root;
8249 struct btrfs_fs_info *fs_info = root->fs_info;
8250 struct btrfs_block_rsv *rsv;
8251 int ret;
8252 struct btrfs_trans_handle *trans;
8253 u64 mask = fs_info->sectorsize - 1;
8254 const u64 min_size = btrfs_calc_metadata_size(fs_info, num_items: 1);
8255
8256 if (!skip_writeback) {
8257 ret = btrfs_wait_ordered_range(inode: &inode->vfs_inode,
8258 start: inode->vfs_inode.i_size & (~mask),
8259 len: (u64)-1);
8260 if (ret)
8261 return ret;
8262 }
8263
8264 /*
8265 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
8266 * things going on here:
8267 *
8268 * 1) We need to reserve space to update our inode.
8269 *
8270 * 2) We need to have something to cache all the space that is going to
8271 * be free'd up by the truncate operation, but also have some slack
8272 * space reserved in case it uses space during the truncate (thank you
8273 * very much snapshotting).
8274 *
8275 * And we need these to be separate. The fact is we can use a lot of
8276 * space doing the truncate, and we have no earthly idea how much space
8277 * we will use, so we need the truncate reservation to be separate so it
8278 * doesn't end up using space reserved for updating the inode. We also
8279 * need to be able to stop the transaction and start a new one, which
8280 * means we need to be able to update the inode several times, and we
8281 * have no idea of knowing how many times that will be, so we can't just
8282 * reserve 1 item for the entirety of the operation, so that has to be
8283 * done separately as well.
8284 *
8285 * So that leaves us with
8286 *
8287 * 1) rsv - for the truncate reservation, which we will steal from the
8288 * transaction reservation.
8289 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8290 * updating the inode.
8291 */
8292 rsv = btrfs_alloc_block_rsv(fs_info, type: BTRFS_BLOCK_RSV_TEMP);
8293 if (!rsv)
8294 return -ENOMEM;
8295 rsv->size = min_size;
8296 rsv->failfast = true;
8297
8298 /*
8299 * 1 for the truncate slack space
8300 * 1 for updating the inode.
8301 */
8302 trans = btrfs_start_transaction(root, num_items: 2);
8303 if (IS_ERR(ptr: trans)) {
8304 ret = PTR_ERR(ptr: trans);
8305 goto out;
8306 }
8307
8308 /* Migrate the slack space for the truncate to our reserve */
8309 ret = btrfs_block_rsv_migrate(src_rsv: &fs_info->trans_block_rsv, dst_rsv: rsv,
8310 num_bytes: min_size, update_size: false);
8311 /*
8312 * We have reserved 2 metadata units when we started the transaction and
8313 * min_size matches 1 unit, so this should never fail, but if it does,
8314 * it's not critical we just fail truncation.
8315 */
8316 if (WARN_ON(ret)) {
8317 btrfs_end_transaction(trans);
8318 goto out;
8319 }
8320
8321 trans->block_rsv = rsv;
8322
	while (1) {
		struct extent_state *cached_state = NULL;
		const u64 new_size = inode->vfs_inode.i_size;
		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);

		control.new_size = new_size;
		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
		/*
		 * We want to drop from the next block forward in case this new
		 * size is not block aligned since we will be keeping the last
		 * block of the extent just the way it is.
		 */
		btrfs_drop_extent_map_range(inode,
					    ALIGN(new_size, fs_info->sectorsize),
					    (u64)-1, false);

		ret = btrfs_truncate_inode_items(trans, root, &control);

		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);

		trans->block_rsv = &fs_info->trans_block_rsv;
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		ret = btrfs_update_inode(trans, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		/*
		 * We have reserved 2 metadata units when we started the
		 * transaction and min_size matches 1 unit, so this should never
		 * fail, but if it does, it's not critical, we just fail
		 * truncation.
		 */
		if (WARN_ON(ret))
			break;

		trans->block_rsv = rsv;
	}

	/*
	 * We can't call btrfs_truncate_block inside a trans handle as we could
	 * deadlock with freeze. If we got BTRFS_NEED_TRUNCATE_BLOCK then we
	 * know we've truncated everything except the last little bit, and can
	 * do btrfs_truncate_block and then update the disk_i_size.
	 */
	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}

	if (trans) {
		int ret2;

		trans->block_rsv = &fs_info->trans_block_rsv;
		ret2 = btrfs_update_inode(trans, inode);
		if (ret2 && !ret)
			ret = ret2;

		ret2 = btrfs_end_transaction(trans);
		if (ret2 && !ret)
			ret = ret2;
		btrfs_btree_balance_dirty(fs_info);
	}
out:
	btrfs_free_block_rsv(fs_info, rsv);
	/*
	 * If we truncate and then write and fsync we normally would just write
	 * the extents that changed, which is a problem if we need to first
	 * truncate that entire inode. So set this flag so we write out all of
	 * the extents in the inode to the sync log so we're completely safe.
	 *
	 * If no extents were dropped or trimmed we don't need to force the next
	 * fsync to truncate all the inode's items from the log and re-log them
	 * all. This means the truncate operation did not change the file size,
	 * or changed it to a smaller size but there was only an implicit hole
	 * between the old i_size and the new i_size, and there were no prealloc
	 * extents beyond i_size to drop.
	 */
	if (control.extents_found > 0)
		btrfs_set_inode_full_sync(inode);

	return ret;
}

struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
				     struct inode *dir)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (inode) {
		/*
		 * Subvolumes don't inherit the sgid bit or the parent's gid if
		 * the parent's sgid bit is set. This is probably a bug.
		 */
		inode_init_owner(idmap, inode, NULL,
				 S_IFDIR | (~current_umask() & S_IRWXUGO));
		inode->i_op = &btrfs_dir_inode_operations;
		inode->i_fop = &btrfs_dir_file_operations;
	}
	return inode;
}

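/*
 * Allocate an in-memory btrfs inode from the dedicated slab cache and
 * initialize the btrfs specific fields to their defaults.
 */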
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->generation = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->new_delalloc_bytes = 0;
	ei->defrag_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->ro_flags = 0;
	ei->csum_bytes = 0;
	ei->index_cnt = (u64)-1;
	ei->dir_index = 0;
	ei->last_unlink_trans = 0;
	ei->last_reflink_trans = 0;
	ei->last_log_commit = 0;

	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	if (sb->s_magic != BTRFS_TEST_MAGIC)
		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
					      BTRFS_BLOCK_RSV_DELALLOC);
	ei->runtime_flags = 0;
	ei->prop_compress = BTRFS_COMPRESS_NONE;
	ei->defrag_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	ei->i_otime_sec = 0;
	ei->i_otime_nsec = 0;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
	ei->io_tree.inode = ei;
	extent_io_tree_init(fs_info, &ei->file_extent_tree,
			    IO_TREE_INODE_FILE_EXTENT);
	mutex_init(&ei->log_mutex);
	spin_lock_init(&ei->ordered_tree_lock);
	ei->ordered_tree = RB_ROOT;
	ei->ordered_tree_last = NULL;
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->delayed_iput);
	RB_CLEAR_NODE(&ei->rb_node);
	init_rwsem(&ei->i_mmap_lock);

	return inode;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif

void btrfs_free_inode(struct inode *inode)
{
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

void btrfs_destroy_inode(struct inode *vfs_inode)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	bool freespace_inode;

	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
	WARN_ON(vfs_inode->i_data.nrpages);
	WARN_ON(inode->block_rsv.reserved);
	WARN_ON(inode->block_rsv.size);
	WARN_ON(inode->outstanding_extents);
	if (!S_ISDIR(vfs_inode->i_mode)) {
		WARN_ON(inode->delalloc_bytes);
		WARN_ON(inode->new_delalloc_bytes);
	}
	WARN_ON(inode->csum_bytes);
	WARN_ON(inode->defrag_bytes);

	/*
	 * This can happen when we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		return;

	/*
	 * If this is a free space inode do not take the ordered extents lockdep
	 * map.
	 */
	freespace_inode = btrfs_is_free_space_inode(inode);

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;

		btrfs_err(root->fs_info,
			  "found ordered extent %llu %llu on inode cleanup",
			  ordered->file_offset, ordered->num_bytes);

		if (!freespace_inode)
			btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);

		btrfs_remove_ordered_extent(inode, ordered);
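		/*
		 * Drop both the reference taken by the lookup above and the
		 * base reference held since allocation, so the ordered extent
		 * is actually freed.
		 */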
		btrfs_put_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
	}
	btrfs_qgroup_check_reserved_leak(inode);
	inode_tree_del(inode);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
	btrfs_put_root(inode->root);
}

int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (root == NULL)
		return 1;

	/* The snapshot/subvolume tree is being deleted. */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 1;

	return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = foo;

	inode_init_once(&ei->vfs_inode);
}

void __cold btrfs_destroy_cachep(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the cache.
	 */
	rcu_barrier();
	bioset_exit(&btrfs_dio_bioset);
	kmem_cache_destroy(btrfs_inode_cachep);
}

int __init btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
					       sizeof(struct btrfs_inode), 0,
					       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
					       init_once);
	if (!btrfs_inode_cachep)
		goto fail;

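	/*
	 * struct btrfs_dio_private embeds its bio as the last member, so the
	 * offsetof() below tells the bioset how much front padding to allocate
	 * ahead of each bio for the containing structure.
	 */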
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}

static int btrfs_getattr(struct mnt_idmap *idmap,
			 const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int flags)
{
	u64 delalloc_bytes;
	u64 inode_bytes;
	struct inode *inode = d_inode(path->dentry);
	u32 blocksize = inode->i_sb->s_blocksize;
	u32 bi_flags = BTRFS_I(inode)->flags;
	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;

	stat->result_mask |= STATX_BTIME;
	stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
	if (bi_flags & BTRFS_INODE_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	if (bi_flags & BTRFS_INODE_COMPRESS)
		stat->attributes |= STATX_ATTR_COMPRESSED;
	if (bi_flags & BTRFS_INODE_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (bi_flags & BTRFS_INODE_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;
	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
		stat->attributes |= STATX_ATTR_VERITY;

	stat->attributes_mask |= (STATX_ATTR_APPEND |
				  STATX_ATTR_COMPRESSED |
				  STATX_ATTR_IMMUTABLE |
				  STATX_ATTR_NODUMP);

	generic_fillattr(idmap, request_mask, inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_dev;

	spin_lock(&BTRFS_I(inode)->lock);
	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
	inode_bytes = inode_get_bytes(inode);
	spin_unlock(&BTRFS_I(inode)->lock);
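	/*
	 * Include not yet allocated delalloc in st_blocks so that stat()
	 * accounts for buffered writes that have not been flushed to disk yet.
	 */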
	stat->blocks = (ALIGN(inode_bytes, blocksize) +
			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
	return 0;
}

static int btrfs_rename_exchange(struct inode *old_dir,
				 struct dentry *old_dentry,
				 struct inode *new_dir,
				 struct dentry *new_dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct btrfs_rename_ctx old_rename_ctx;
	struct btrfs_rename_ctx new_rename_ctx;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
	int ret2;
	bool need_abort = false;
	struct fscrypt_name old_fname, new_fname;
	struct fscrypt_str *old_name, *new_name;

	/*
	 * For non-subvolumes allow exchange only within one subvolume, in the
	 * same inode namespace. Two subvolumes (represented as directories)
	 * can be exchanged as they're a logical link and have a fixed inode
	 * number.
	 */
	if (root != dest &&
	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
		return -EXDEV;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	old_name = &old_fname.disk_name;
	new_name = &new_fname.disk_name;

	/* Close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);

	/*
	 * For each inode:
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 * 1 to update parent inode
	 *
	 * If the parents are the same, we only need to account for one parent
	 * inode update, hence 9 items instead of 2 * 5 = 10.
	 */
	trans_num_items = (old_dir == new_dir ? 9 : 10);
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode item
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
		trans_num_items += 4;
	else
		trans_num_items += 3;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
	if (ret)
		goto out_fail;
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
		if (ret)
			goto out_fail;
		need_abort = true;
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
		if (ret) {
			if (need_abort)
				btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
					BTRFS_I(new_inode), true);
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else { /* src is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(old_dentry->d_inode),
					   old_name, &old_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
	} else { /* dest is an inode */
		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
					   BTRFS_I(new_dentry->d_inode),
					   new_name, &new_rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_name, 0, old_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
			     old_name, 0, new_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   old_rename_ctx.index, new_dentry->d_parent);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
				   new_rename_ctx.index, old_dentry->d_parent);

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(dest);
out_fail:
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	fscrypt_free_filename(&new_fname);
	fscrypt_free_filename(&old_fname);
	return ret;
}

static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
					struct inode *dir)
{
	struct inode *inode;

	inode = new_inode(dir->i_sb);
	if (inode) {
		inode_init_owner(idmap, inode, dir,
				 S_IFCHR | WHITEOUT_MODE);
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
	}
	return inode;
}

static int btrfs_rename(struct mnt_idmap *idmap,
			struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_new_inode_args whiteout_args = {
		.dir = old_dir,
		.dentry = old_dentry,
	};
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
	struct btrfs_rename_ctx rename_ctx;
	u64 index = 0;
	int ret;
	int ret2;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	struct fscrypt_name old_fname, new_fname;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* We only allow renaming subvolume links between subvolumes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;

	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
	if (ret)
		return ret;

	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
	if (ret) {
		fscrypt_free_filename(&old_fname);
		return ret;
	}

	/* Check for collisions, even if the name isn't there. */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
	if (ret) {
		if (ret == -EEXIST) {
			/* We shouldn't get -EEXIST without a new_inode. */
			if (WARN_ON(!new_inode))
				goto out_fscrypt_names;
		} else {
			/* Maybe -EOVERFLOW. */
			goto out_fscrypt_names;
		}
	}
	ret = 0;

	/*
	 * We're using rename to replace one file with another. Start IO on it
	 * now so we don't add too much work to the end of the transaction.
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

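	/*
	 * RENAME_WHITEOUT: after the rename, a character device with
	 * WHITEOUT_DEV must exist at the old name, which overlayfs uses to
	 * mark the entry as deleted in the upper layer.
	 */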
	if (flags & RENAME_WHITEOUT) {
		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
		if (!whiteout_args.inode) {
			ret = -ENOMEM;
			goto out_fscrypt_names;
		}
		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
		if (ret)
			goto out_whiteout_inode;
	} else {
		/* 1 to update the old parent inode. */
		trans_num_items = 1;
	}

	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* Close the race window with snapshot create/destroy ioctl */
		down_read(&fs_info->subvol_sem);
		/*
		 * 1 to remove old root ref
		 * 1 to remove old root backref
		 * 1 to add new root ref
		 * 1 to add new root backref
		 */
		trans_num_items += 4;
	} else {
		/*
		 * 1 to update inode
		 * 1 to remove old inode ref
		 * 1 to add new inode ref
		 */
		trans_num_items += 3;
	}
	/*
	 * 1 to remove old dir item
	 * 1 to remove old dir index
	 * 1 to add new dir item
	 * 1 to add new dir index
	 */
	trans_num_items += 4;
	/* 1 to update new parent inode if it's not the same as the old parent */
	if (new_dir != old_dir)
		trans_num_items++;
	if (new_inode) {
		/*
		 * 1 to update inode
		 * 1 to remove inode ref
		 * 1 to remove dir item
		 * 1 to remove dir index
		 * 1 to possibly add orphan item
		 */
		trans_num_items += 5;
	}
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
					     index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
					BTRFS_I(old_inode), true);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
	} else {
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
					   BTRFS_I(d_inode(old_dentry)),
					   &old_fname.disk_name, &rename_ctx);
		if (!ret)
			ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
						 BTRFS_I(d_inode(new_dentry)),
						 &new_fname.disk_name);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans,
					       BTRFS_I(d_inode(new_dentry)));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     &new_fname.disk_name, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
				   rename_ctx.index, new_dentry->d_parent);

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_create_new_inode(trans, &whiteout_args);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		} else {
			unlock_new_inode(whiteout_args.inode);
			iput(whiteout_args.inode);
			whiteout_args.inode = NULL;
		}
	}
out_fail:
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);
	if (flags & RENAME_WHITEOUT)
		btrfs_new_inode_args_destroy(&whiteout_args);
out_whiteout_inode:
	if (flags & RENAME_WHITEOUT)
		iput(whiteout_args.inode);
out_fscrypt_names:
	fscrypt_free_filename(&old_fname);
	fscrypt_free_filename(&new_fname);
	return ret;
}

static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
			 struct dentry *old_dentry, struct inode *new_dir,
			 struct dentry *new_dentry, unsigned int flags)
{
	int ret;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					    new_dentry);
	else
		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
				   new_dentry, flags);

	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);

	return ret;
}

struct btrfs_delalloc_work {
	struct inode *inode;
	struct completion completion;
	struct list_head list;
	struct btrfs_work work;
};

static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	iput(inode);
	complete(&delalloc_work->completion);
}

static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	work = kmalloc(sizeof(*work), GFP_NOFS);
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);

	return work;
}

/*
 * Some fairly slow code that needs optimization. This walks the list of all
 * the inodes with pending delalloc and forces them to disk.
 */
static int start_delalloc_inodes(struct btrfs_root *root,
				 struct writeback_control *wbc, bool snapshot,
				 bool in_reclaim_context)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);
	LIST_HEAD(splice);
	int ret = 0;
	bool full_flush = wbc->nr_to_write == LONG_MAX;

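	/*
	 * Splice the delalloc list onto a local head so the lock can be
	 * dropped while flushing individual inodes; anything left on the
	 * local list is spliced back at "out".
	 */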
	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);

		if (in_reclaim_context &&
		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
			continue;

		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		if (snapshot)
			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				&binode->runtime_flags);
		if (full_flush) {
			work = btrfs_alloc_delalloc_work(inode);
			if (!work) {
				iput(inode);
				ret = -ENOMEM;
				goto out;
			}
			list_add_tail(&work->list, &works);
			btrfs_queue_work(root->fs_info->flush_workers,
					 &work->work);
		} else {
			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
			btrfs_add_delayed_iput(BTRFS_I(inode));
			if (ret || wbc->nr_to_write <= 0)
				goto out;
		}
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		wait_for_completion(&work->completion);
		kfree(work);
	}

	if (!list_empty(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}

int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}

int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
			       bool in_reclaim_context)
{
	struct writeback_control wbc = {
		.nr_to_write = nr,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
	struct btrfs_root *root;
	LIST_HEAD(splice);
	int ret;

	if (BTRFS_FS_ERROR(fs_info))
		return -EROFS;

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		/*
		 * Reset nr_to_write here so we know that we're doing a full
		 * flush.
		 */
		if (nr == LONG_MAX)
			wbc.nr_to_write = LONG_MAX;

		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
		btrfs_put_root(root);
		if (ret < 0 || wbc.nr_to_write <= 0)
			goto out;
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
out:
	if (!list_empty(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}

static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			 struct dentry *dentry, const char *symname)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = dentry,
	};
	unsigned int trans_num_items;
	int err;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode->i_mapping->a_ops = &btrfs_aops;
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	inode_set_bytes(inode, name_len);

	new_inode_args.inode = inode;
	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (err)
		goto out_inode;
	/* 1 additional item for the inline extent */
	trans_num_items++;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	err = btrfs_create_new_inode(trans, &new_inode_args);
	if (err)
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		btrfs_abort_transaction(trans, err);
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
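	/*
	 * The symlink target is stored inline in a single EXTENT_DATA item at
	 * file offset 0, so reading the link never needs a data extent.
	 */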
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_abort_transaction(trans, err);
		btrfs_free_path(path);
		discard_new_inode(inode);
		inode = NULL;
		goto out;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_free_path(path);

	d_instantiate_new(dentry, inode);
	err = 0;
out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (err)
		iput(inode);
	return err;
}

static struct btrfs_trans_handle *insert_prealloc_file_extent(
				struct btrfs_trans_handle *trans_in,
				struct btrfs_inode *inode,
				struct btrfs_key *ins,
				u64 file_offset)
{
	struct btrfs_file_extent_item stack_fi;
	struct btrfs_replace_extent_info extent_info;
	struct btrfs_trans_handle *trans = trans_in;
	struct btrfs_path *path;
	u64 start = ins->objectid;
	u64 len = ins->offset;
	int qgroup_released;
	int ret;

	memset(&stack_fi, 0, sizeof(stack_fi));

	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
	/* Encryption and other encoding is reserved and all 0 */

	qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
	if (qgroup_released < 0)
		return ERR_PTR(qgroup_released);

	if (trans) {
		ret = insert_reserved_file_extent(trans, inode,
						  file_offset, &stack_fi,
						  true, qgroup_released);
		if (ret)
			goto free_qgroup;
		return trans;
	}

	extent_info.disk_offset = start;
	extent_info.disk_len = len;
	extent_info.data_offset = 0;
	extent_info.data_len = len;
	extent_info.file_offset = file_offset;
	extent_info.extent_buf = (char *)&stack_fi;
	extent_info.is_new_extent = true;
	extent_info.update_times = true;
	extent_info.qgroup_reserved = qgroup_released;
	extent_info.insertions = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto free_qgroup;
	}

	ret = btrfs_replace_file_extents(inode, path, file_offset,
					 file_offset + len - 1, &extent_info,
					 &trans);
	btrfs_free_path(path);
	if (ret)
		goto free_qgroup;
	return trans;

free_qgroup:
	/*
	 * We have released the qgroup data range at the beginning of the
	 * function, and normally the qgroup_released bytes will be freed when
	 * committing the transaction.
	 * But if we error out early, we have to free what we have released,
	 * or we leak the qgroup data reservation.
	 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
				  inode->root->root_key.objectid, qgroup_released,
				  BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
}

static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;
	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks let's make its job easier by only searching for those
		 * sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
					   min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret)
			break;

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved. Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;

		last_alloc = ins.offset;
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
		 */
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			break;
		}

		em = alloc_extent_map();
		if (!em) {
			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
						    cur_offset + ins.offset - 1, false);
			btrfs_set_inode_full_sync(BTRFS_I(inode));
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode_set_ctime_current(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
		}

		ret = btrfs_update_inode(trans, BTRFS_I(inode));

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		if (own_trans) {
			btrfs_end_transaction(trans);
			trans = NULL;
		}
	}
	if (clear_offset < end)
		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
					       end - clear_offset + 1);
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}

static int btrfs_permission(struct mnt_idmap *idmap,
			    struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(idmap, inode, mask);
}

static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
			 struct file *file, umode_t mode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode;
	struct btrfs_new_inode_args new_inode_args = {
		.dir = dir,
		.dentry = file->f_path.dentry,
		.orphan = true,
	};
	unsigned int trans_num_items;
	int ret;

	inode = new_inode(dir->i_sb);
	if (!inode)
		return -ENOMEM;
	inode_init_owner(idmap, inode, dir, mode);
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;

	new_inode_args.inode = inode;
	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
	if (ret)
		goto out_inode;

	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_new_inode_args;
	}

	ret = btrfs_create_new_inode(trans, &new_inode_args);

	/*
	 * We set the number of links to 0 in btrfs_create_new_inode(), and
	 * here we set it to 1 because d_tmpfile() will issue a warning if the
	 * count is 0, through:
	 *
	 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);

	if (!ret) {
		d_tmpfile(file, inode);
		unlock_new_inode(inode);
		mark_inode_dirty(inode);
	}

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
	btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
	if (ret)
		iput(inode);
	return finish_open_simple(file, ret);
}

void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
	u32 len;

	ASSERT(end + 1 - start <= U32_MAX);
	len = end + 1 - start;
	while (index <= end_index) {
		page = find_get_page(inode->vfs_inode.i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */

		btrfs_page_set_writeback(fs_info, page, start, len);
		put_page(page);
		index++;
	}
}

int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
					     int compress_type)
{
	switch (compress_type) {
	case BTRFS_COMPRESS_NONE:
		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
	case BTRFS_COMPRESS_ZLIB:
		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
	case BTRFS_COMPRESS_LZO:
		/*
		 * The LZO format depends on the sector size. 64K is the maximum
		 * sector size that we support.
		 */
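		/*
		 * sectorsize_bits is log2(sectorsize), so with 4K sectors
		 * (bits == 12) this maps to ..._LZO_4K, 8K sectors map to
		 * ..._LZO_8K, and so on up to 64K.
		 */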
		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
			return -EINVAL;
		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
		       (fs_info->sectorsize_bits - 12);
	case BTRFS_COMPRESS_ZSTD:
		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
	default:
		return -EUCLEAN;
	}
}

static ssize_t btrfs_encoded_read_inline(
				struct kiocb *iocb,
				struct iov_iter *iter, u64 start,
				u64 lockend,
				struct extent_state **cached_state,
				u64 extent_start, size_t count,
				struct btrfs_ioctl_encoded_io_args *encoded,
				bool *unlocked)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *item;
	u64 ram_bytes;
	unsigned long ptr;
	void *tmp;
	ssize_t ret;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
				       extent_start, 0);
	if (ret) {
		if (ret > 0) {
			/* The extent item disappeared? */
			ret = -EIO;
		}
		goto out;
	}
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);

	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
	ptr = btrfs_file_extent_inline_start(item);

	encoded->len = min_t(u64, extent_start + ram_bytes,
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, item));
	if (ret < 0)
		goto out;
	encoded->compression = ret;
	if (encoded->compression) {
		size_t inline_size;

		inline_size = btrfs_file_extent_inline_item_len(leaf,
								path->slots[0]);
		if (inline_size > count) {
			ret = -ENOBUFS;
			goto out;
		}
		count = inline_size;
		encoded->unencoded_len = ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - extent_start;
	} else {
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
		ptr += iocb->ki_pos - extent_start;
	}

	tmp = kmalloc(count, GFP_NOFS);
	if (!tmp) {
		ret = -ENOMEM;
		goto out;
	}
	read_extent_buffer(leaf, tmp, ptr, count);
	btrfs_release_path(path);
	unlock_extent(io_tree, start, lockend, cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	*unlocked = true;

	ret = copy_to_iter(tmp, count, iter);
	if (ret != count)
		ret = -EFAULT;
	kfree(tmp);
out:
	btrfs_free_path(path);
	return ret;
}

struct btrfs_encoded_read_private {
	wait_queue_head_t wait;
	atomic_t pending;
	blk_status_t status;
};

static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
{
	struct btrfs_encoded_read_private *priv = bbio->private;

	if (bbio->bio.bi_status) {
		/*
		 * The memory barrier implied by the atomic_dec_return() here
		 * pairs with the memory barrier implied by the
		 * atomic_dec_return() or io_wait_event() in
		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
		 * write is observed before the load of status in
		 * btrfs_encoded_read_regular_fill_pages().
		 */
		WRITE_ONCE(priv->status, bbio->bio.bi_status);
	}
	if (!atomic_dec_return(&priv->pending))
		wake_up(&priv->wait);
	bio_put(&bbio->bio);
}

int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
					  u64 file_offset, u64 disk_bytenr,
					  u64 disk_io_size, struct page **pages)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_encoded_read_private priv = {
		.pending = ATOMIC_INIT(1),
	};
	unsigned long i = 0;
	struct btrfs_bio *bbio;

	init_waitqueue_head(&priv.wait);
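	/*
	 * priv.pending starts at 1 as a bias held by the submitter, so the
	 * endio handler cannot see the count hit zero (and signal completion)
	 * until every bio has been submitted and the bias is dropped below.
	 */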

	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
			       btrfs_encoded_read_endio, &priv);
	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
	bbio->inode = inode;

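	/*
	 * Add one page at a time; when the current bio cannot take another
	 * page, submit it and start a new bio at the next disk_bytenr.
	 */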
	do {
		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);

		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
			atomic_inc(&priv.pending);
			btrfs_submit_bio(bbio, 0);

			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
					       btrfs_encoded_read_endio, &priv);
			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
			bbio->inode = inode;
			continue;
		}

		i++;
		disk_bytenr += bytes;
		disk_io_size -= bytes;
	} while (disk_io_size);

	atomic_inc(&priv.pending);
	btrfs_submit_bio(bbio, 0);

	if (atomic_dec_return(&priv.pending))
		io_wait_event(priv.wait, !atomic_read(&priv.pending));
	/* See btrfs_encoded_read_endio() for ordering. */
	return blk_status_to_errno(READ_ONCE(priv.status));
}

static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
					  struct iov_iter *iter,
					  u64 start, u64 lockend,
					  struct extent_state **cached_state,
					  u64 disk_bytenr, u64 disk_io_size,
					  size_t count, bool compressed,
					  bool *unlocked)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct page **pages;
	unsigned long nr_pages, i;
	u64 cur;
	size_t page_offset;
	ssize_t ret;

	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;
	ret = btrfs_alloc_page_array(nr_pages, pages);
	if (ret) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
						    disk_io_size, pages);
	if (ret)
		goto out;

	unlock_extent(io_tree, start, lockend, cached_state);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	*unlocked = true;

	if (compressed) {
		i = 0;
		page_offset = 0;
	} else {
		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
	}
	cur = 0;
	while (cur < count) {
		size_t bytes = min_t(size_t, count - cur,
				     PAGE_SIZE - page_offset);

		if (copy_page_to_iter(pages[i], page_offset, bytes,
				      iter) != bytes) {
			ret = -EFAULT;
			goto out;
		}
		i++;
		cur += bytes;
		page_offset = 0;
	}
	ret = count;
out:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kfree(pages);
	return ret;
}
10035
10036ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10037 struct btrfs_ioctl_encoded_io_args *encoded)
10038{
10039 struct btrfs_inode *inode = BTRFS_I(inode: file_inode(f: iocb->ki_filp));
10040 struct btrfs_fs_info *fs_info = inode->root->fs_info;
10041 struct extent_io_tree *io_tree = &inode->io_tree;
10042 ssize_t ret;
10043 size_t count = iov_iter_count(i: iter);
10044 u64 start, lockend, disk_bytenr, disk_io_size;
10045 struct extent_state *cached_state = NULL;
10046 struct extent_map *em;
10047 bool unlocked = false;
10048
10049 file_accessed(file: iocb->ki_filp);
10050
10051 btrfs_inode_lock(inode, ilock_flags: BTRFS_ILOCK_SHARED);
10052
10053 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10054 btrfs_inode_unlock(inode, ilock_flags: BTRFS_ILOCK_SHARED);
10055 return 0;
10056 }
10057 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10058 /*
10059 * We don't know how long the extent containing iocb->ki_pos is, but if
10060 * it's compressed we know that it won't be longer than this.
10061 */
10062 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

	for (;;) {
		struct btrfs_ordered_extent *ordered;

		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
					       lockend - start + 1);
		if (ret)
			goto out_unlock_inode;
		lock_extent(io_tree, start, lockend, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     lockend - start + 1);
		if (!ordered)
			break;
		btrfs_put_ordered_extent(ordered);
		unlock_extent(io_tree, start, lockend, &cached_state);
		cond_resched();
	}

	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock_extent;
	}

	if (em->block_start == EXTENT_MAP_INLINE) {
		u64 extent_start = em->start;

		/*
		 * For inline extents we get everything we need out of the
		 * extent item.
		 */
		free_extent_map(em);
		em = NULL;
		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
						&cached_state, extent_start,
						count, encoded, &unlocked);
		goto out;
	}

	/*
	 * We only want to return up to EOF even if the extent extends beyond
	 * that.
	 */
	encoded->len = min_t(u64, extent_map_end(em),
			     inode->vfs_inode.i_size) - iocb->ki_pos;
	if (em->block_start == EXTENT_MAP_HOLE ||
	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
		disk_bytenr = EXTENT_MAP_HOLE;
		count = min_t(u64, count, encoded->len);
		encoded->len = count;
		encoded->unencoded_len = count;
	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
		disk_bytenr = em->block_start;
		/*
		 * Bail if the buffer isn't large enough to return the whole
		 * compressed extent.
		 */
		if (em->block_len > count) {
			ret = -ENOBUFS;
			goto out_em;
		}
		disk_io_size = em->block_len;
		count = em->block_len;
		encoded->unencoded_len = em->ram_bytes;
		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
		ret = btrfs_encoded_io_compression_from_extent(fs_info,
							       em->compress_type);
		if (ret < 0)
			goto out_em;
		encoded->compression = ret;
	} else {
		disk_bytenr = em->block_start + (start - em->start);
		if (encoded->len > count)
			encoded->len = count;
		/*
		 * Don't read beyond what we locked. This also limits the page
		 * allocations that we'll do.
		 */
		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
		count = start + disk_io_size - iocb->ki_pos;
		encoded->len = count;
		encoded->unencoded_len = count;
		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
	}
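	/*
	 * Worked example for the uncompressed branch above (4K sectors): with
	 * ki_pos = 6K inside a large regular extent, start was aligned down
	 * to 4K, so disk_io_size covers [4K, ki_pos + encoded->len), capped
	 * at lockend + 1; count then drops the 2K below ki_pos, and
	 * disk_io_size is rounded up to a whole sector for the actual read.
	 */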
	free_extent_map(em);
	em = NULL;

	if (disk_bytenr == EXTENT_MAP_HOLE) {
		unlock_extent(io_tree, start, lockend, &cached_state);
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		unlocked = true;
		ret = iov_iter_zero(count, iter);
		if (ret != count)
			ret = -EFAULT;
	} else {
		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
						 &cached_state, disk_bytenr,
						 disk_io_size, count,
						 encoded->compression,
						 &unlocked);
	}

out:
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
out_em:
	free_extent_map(em);
out_unlock_extent:
	if (!unlocked)
		unlock_extent(io_tree, start, lockend, &cached_state);
out_unlock_inode:
	if (!unlocked)
		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
	return ret;
}

ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			       const struct btrfs_ioctl_encoded_io_args *encoded)
{
	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct extent_changeset *data_reserved = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_ordered_extent *ordered;
	int compression;
	size_t orig_count;
	u64 start, end;
	u64 num_bytes, ram_bytes, disk_num_bytes;
	unsigned long nr_pages, i;
	struct page **pages;
	struct btrfs_key ins;
	bool extent_reserved = false;
	struct extent_map *em;
	ssize_t ret;

	switch (encoded->compression) {
	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
		compression = BTRFS_COMPRESS_ZLIB;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
		compression = BTRFS_COMPRESS_ZSTD;
		break;
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
		/* The sector size must match for LZO. */
		if (encoded->compression -
		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
		    fs_info->sectorsize_bits)
			return -EINVAL;
		compression = BTRFS_COMPRESS_LZO;
		break;
	default:
		return -EINVAL;
	}
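	/*
	 * The LZO_* values above are consecutive, so
	 * (compression - LZO_4K) + 12 recovers log2 of the sector size the
	 * data was compressed for: LZO_4K -> 12 (4K sectors) up to
	 * LZO_64K -> 16 (64K sectors). Only the value matching this
	 * filesystem's sector size is accepted.
	 */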
	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
		return -EINVAL;

	orig_count = iov_iter_count(from);

	/* The extent size must be sane. */
	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
		return -EINVAL;

	/*
	 * The compressed data must be smaller than the decompressed data.
	 *
	 * It's of course possible for data to compress to larger or the same
	 * size, but the buffered I/O path falls back to no compression for
	 * such data, and we don't want to break any assumptions by creating
	 * these extents.
	 *
	 * Note that this is less strict than the current check, which
	 * requires the compressed data to be at least one sector smaller
	 * than the decompressed data. We only want to enforce the weaker
	 * requirement from old kernels that it is at least one byte smaller.
	 */
	if (orig_count >= encoded->unencoded_len)
		return -EINVAL;

	/* The extent must start on a sector boundary. */
	start = iocb->ki_pos;
	if (!IS_ALIGNED(start, fs_info->sectorsize))
		return -EINVAL;

	/*
	 * The extent must end on a sector boundary. However, we allow a write
	 * which ends at or extends i_size to have an unaligned length; we
	 * round up the extent size and set i_size to the unaligned end.
	 */
	if (start + encoded->len < inode->vfs_inode.i_size &&
	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
		return -EINVAL;

	/* Finally, the offset in the unencoded data must be sector-aligned. */
	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
		return -EINVAL;

	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
	end = start + num_bytes - 1;

	/*
	 * If the extent cannot be inline, the compressed data on disk must be
	 * sector-aligned. For convenience, we extend it with zeroes if it
	 * isn't.
	 */
	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
	if (!pages)
		return -ENOMEM;
	for (i = 0; i < nr_pages; i++) {
		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
		char *kaddr;

		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
		if (!pages[i]) {
			ret = -ENOMEM;
			goto out_pages;
		}
		kaddr = kmap_local_page(pages[i]);
		if (copy_from_iter(kaddr, bytes, from) != bytes) {
			kunmap_local(kaddr);
			ret = -EFAULT;
			goto out_pages;
		}
		if (bytes < PAGE_SIZE)
			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
		kunmap_local(kaddr);
	}

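	/*
	 * Flush and wait on any ordered extents in the range, drop the page
	 * cache for it, then lock the extent range. If an ordered extent or
	 * a page shows up again in between (e.g. a concurrent access
	 * re-faulting the range), unlock and retry until the range is
	 * quiescent.
	 */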
	for (;;) {
		struct btrfs_ordered_extent *ordered;

		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
		if (ret)
			goto out_pages;
		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
						    start >> PAGE_SHIFT,
						    end >> PAGE_SHIFT);
		if (ret)
			goto out_pages;
		lock_extent(io_tree, start, end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
		if (!ordered &&
		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
			break;
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		unlock_extent(io_tree, start, end, &cached_state);
		cond_resched();
	}

	/*
	 * We don't use the higher-level delalloc space functions because our
	 * num_bytes and disk_num_bytes are different.
	 */
	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
	if (ret)
		goto out_unlock;
	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
	if (ret)
		goto out_free_data_space;
	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
					      false);
	if (ret)
		goto out_qgroup_free_data;

	/* Try an inline extent first. */
	if (start == 0 && encoded->unencoded_len == encoded->len &&
	    encoded->unencoded_offset == 0) {
		ret = cow_file_range_inline(inode, encoded->len, orig_count,
					    compression, pages, true);
		if (ret <= 0) {
			if (ret == 0)
				ret = orig_count;
			goto out_delalloc_release;
		}
	}
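
	/*
	 * A positive return from cow_file_range_inline() means the data could
	 * not be stored inline; fall through and allocate a regular
	 * compressed extent instead.
	 */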
	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
				   disk_num_bytes, 0, 0, &ins, 1, 1);
	if (ret)
		goto out_delalloc_release;
	extent_reserved = true;

	em = create_io_em(inode, start, num_bytes,
			  start - encoded->unencoded_offset, ins.objectid,
			  ins.offset, ins.offset, ram_bytes, compression,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserved;
	}
	free_extent_map(em);

	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
					     ins.objectid, ins.offset,
					     encoded->unencoded_offset,
					     (1 << BTRFS_ORDERED_ENCODED) |
					     (1 << BTRFS_ORDERED_COMPRESSED),
					     compression);
	if (IS_ERR(ordered)) {
		btrfs_drop_extent_map_range(inode, start, end, false);
		ret = PTR_ERR(ordered);
		goto out_free_reserved;
	}
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	if (start + encoded->len > inode->vfs_inode.i_size)
		i_size_write(&inode->vfs_inode, start + encoded->len);

	unlock_extent(io_tree, start, end, &cached_state);

	btrfs_delalloc_release_extents(inode, num_bytes);

	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
	ret = orig_count;
	goto out;

out_free_reserved:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_delalloc_release:
	btrfs_delalloc_release_extents(inode, num_bytes);
	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
out_qgroup_free_data:
	if (ret < 0)
		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
out_free_data_space:
	/*
	 * If btrfs_reserve_extent() succeeded, then we already decremented
	 * bytes_may_use.
	 */
	if (!extent_reserved)
		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
out_unlock:
	unlock_extent(io_tree, start, end, &cached_state);
out_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kvfree(pages);
out:
	if (ret >= 0)
		iocb->ki_pos += encoded->len;
	return ret;
}

#ifdef CONFIG_SWAP
/*
 * Add an entry indicating a block group or device which is pinned by a
 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
 * negative errno on failure.
 */
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
				  bool is_block_group)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp, *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	sp = kmalloc(sizeof(*sp), GFP_NOFS);
	if (!sp)
		return -ENOMEM;
	sp->ptr = ptr;
	sp->inode = inode;
	sp->is_block_group = is_block_group;
	sp->bg_extent_count = 1;
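
	/*
	 * Entries are keyed by (ptr, inode): ptr is either a block group or a
	 * device, so each object is pinned at most once per swapfile, and a
	 * repeated block group pin only bumps bg_extent_count below.
	 */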
	spin_lock(&fs_info->swapfile_pins_lock);
	p = &fs_info->swapfile_pins.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
		if (sp->ptr < entry->ptr ||
		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
			p = &(*p)->rb_left;
		} else if (sp->ptr > entry->ptr ||
			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
			p = &(*p)->rb_right;
		} else {
			if (is_block_group)
				entry->bg_extent_count++;
			spin_unlock(&fs_info->swapfile_pins_lock);
			kfree(sp);
			return 1;
		}
	}
	rb_link_node(&sp->node, parent, p);
	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
	spin_unlock(&fs_info->swapfile_pins_lock);
	return 0;
}

/* Free all of the entries pinned by this swapfile. */
static void btrfs_free_swapfile_pins(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node, *next;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = rb_first(&fs_info->swapfile_pins);
	while (node) {
		next = rb_next(node);
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (sp->inode == inode) {
			rb_erase(&sp->node, &fs_info->swapfile_pins);
			if (sp->is_block_group) {
				btrfs_dec_block_group_swap_extents(sp->ptr,
							sp->bg_extent_count);
				btrfs_put_block_group(sp->ptr);
			}
			kfree(sp);
		}
		node = next;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
}

struct btrfs_swap_info {
	u64 start;
	u64 block_start;
	u64 block_len;
	u64 lowest_ppage;
	u64 highest_ppage;
	unsigned long nr_pages;
	int nr_extents;
};

static int btrfs_add_swap_extent(struct swap_info_struct *sis,
				 struct btrfs_swap_info *bsi)
{
	unsigned long nr_pages;
	unsigned long max_pages;
	u64 first_ppage, first_ppage_reported, next_ppage;
	int ret;

	/*
	 * Our swapfile may have had its size extended after the swap header
	 * was written. In that case activating the swapfile should not go
	 * beyond the max size set in the swap header.
	 */
	if (bsi->nr_pages >= sis->max)
		return 0;

	max_pages = sis->max - bsi->nr_pages;
	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
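
	/*
	 * Only whole pages inside the physical range are usable for swap:
	 * e.g. with 4K pages, a range [10K, 26K) gives first_ppage = 3 and
	 * next_ppage = 6, i.e. the pages covering [12K, 24K).
	 */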
	if (first_ppage >= next_ppage)
		return 0;
	nr_pages = next_ppage - first_ppage;
	nr_pages = min(nr_pages, max_pages);

	first_ppage_reported = first_ppage;
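	/* The first page of the swapfile holds the swap header; skip it. */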
	if (bsi->start == 0)
		first_ppage_reported++;
	if (bsi->lowest_ppage > first_ppage_reported)
		bsi->lowest_ppage = first_ppage_reported;
	if (bsi->highest_ppage < (next_ppage - 1))
		bsi->highest_ppage = next_ppage - 1;

	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
	if (ret < 0)
		return ret;
	bsi->nr_extents += ret;
	bsi->nr_pages += nr_pages;
	return 0;
}

static void btrfs_swap_deactivate(struct file *file)
{
	struct inode *inode = file_inode(file);

	btrfs_free_swapfile_pins(inode);
	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	struct btrfs_device *device = NULL;
	struct btrfs_swap_info bsi = {
		.lowest_ppage = (sector_t)-1ULL,
	};
	int ret = 0;
	u64 isize;
	u64 start;

	/*
	 * If the swap file was just created, make sure delalloc is done. If
	 * the file changes again after this, the user is doing something
	 * stupid and we don't really care.
	 */
	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		return ret;

	/*
	 * The inode is locked, so these flags won't change after we check
	 * them.
	 */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
		btrfs_warn(fs_info, "swapfile must not be compressed");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_warn(fs_info, "swapfile must not be checksummed");
		return -EINVAL;
	}

	/*
	 * Balance or device remove/replace/resize can move stuff around from
	 * under us. The exclop protection makes sure they aren't running/won't
	 * run concurrently while we are mapping the swap extents, and
	 * fs_info->swapfile_pins prevents them from running while the swap
	 * file is active and moving the extents. Note that this also prevents
	 * a concurrent device add which isn't actually necessary, but it's not
	 * really worth the trouble to allow it.
	 */
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
		btrfs_warn(fs_info,
			   "cannot activate swapfile while exclusive operation is running");
		return -EBUSY;
	}

	/*
	 * Prevent snapshot creation while we are activating the swap file.
	 * We do not want to race with snapshot creation. If snapshot creation
	 * already started before we bumped nr_swapfiles from 0 to 1 and
	 * completes before the first write into the swap file after it is
	 * activated, then that write would fall back to COW.
	 */
	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
		btrfs_exclop_finish(fs_info);
		btrfs_warn(fs_info,
			   "cannot activate swapfile because snapshot creation is in progress");
		return -EINVAL;
	}
	/*
	 * Snapshots can create extents which require COW even if NODATACOW is
	 * set. We use this counter to prevent snapshots. We must increment it
	 * before walking the extents because we don't want a concurrent
	 * snapshot to run after we've already checked the extents.
	 *
	 * It is possible that the subvolume is marked for deletion but still
	 * not removed yet. To prevent this race, we check the root status
	 * before activating the swapfile.
	 */
	spin_lock(&root->root_item_lock);
	if (btrfs_root_dead(root)) {
		spin_unlock(&root->root_item_lock);

		btrfs_exclop_finish(fs_info);
		btrfs_warn(fs_info,
			   "cannot activate swapfile because subvolume %llu is being deleted",
			   root->root_key.objectid);
		return -EPERM;
	}
	atomic_inc(&root->nr_swapfiles);
	spin_unlock(&root->root_item_lock);

	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);

	lock_extent(io_tree, 0, isize - 1, &cached_state);
	start = 0;
	while (start < isize) {
		u64 logical_block_start, physical_block_start;
		struct btrfs_block_group *bg;
		u64 len = isize - start;

		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			btrfs_warn(fs_info, "swapfile must not have holes");
			ret = -EINVAL;
			goto out;
		}
		if (em->block_start == EXTENT_MAP_INLINE) {
			/*
			 * It's unlikely we'll ever actually find ourselves
			 * here, as a file small enough to fit inline won't be
			 * big enough to store more than the swap header, but
			 * in case something changes in the future, let's catch
			 * it here rather than later.
			 */
			btrfs_warn(fs_info, "swapfile must not be inline");
			ret = -EINVAL;
			goto out;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			btrfs_warn(fs_info, "swapfile must not be compressed");
			ret = -EINVAL;
			goto out;
		}

		logical_block_start = em->block_start + (start - em->start);
		len = min(len, em->len - (start - em->start));
		free_extent_map(em);
		em = NULL;

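		/*
		 * can_nocow_extent() returns 1 if we can NOCOW the range, 0 if
		 * writing into it would require COW (which disqualifies a
		 * swapfile) and a negative errno on failure.
		 */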
		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL,
				       false, true);
		if (ret < 0) {
			goto out;
		} else if (ret) {
			ret = 0;
		} else {
			btrfs_warn(fs_info,
				   "swapfile must not be copy-on-write");
			ret = -EINVAL;
			goto out;
		}

		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
			btrfs_warn(fs_info,
				   "swapfile must have single data profile");
			ret = -EINVAL;
			goto out;
		}

		if (device == NULL) {
			device = em->map_lookup->stripes[0].dev;
			ret = btrfs_add_swapfile_pin(inode, device, false);
			if (ret == 1)
				ret = 0;
			else if (ret)
				goto out;
		} else if (device != em->map_lookup->stripes[0].dev) {
			btrfs_warn(fs_info, "swapfile must be on one device");
			ret = -EINVAL;
			goto out;
		}

		physical_block_start = (em->map_lookup->stripes[0].physical +
					(logical_block_start - em->start));
		len = min(len, em->len - (logical_block_start - em->start));
		free_extent_map(em);
		em = NULL;

		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
		if (!bg) {
			btrfs_warn(fs_info,
				   "could not find block group containing swapfile");
			ret = -EINVAL;
			goto out;
		}

		if (!btrfs_inc_block_group_swap_extents(bg)) {
			btrfs_warn(fs_info,
				   "block group for swapfile at %llu is read-only%s",
				   bg->start,
				   atomic_read(&fs_info->scrubs_running) ?
					   " (scrub running)" : "");
			btrfs_put_block_group(bg);
			ret = -EINVAL;
			goto out;
		}

		ret = btrfs_add_swapfile_pin(inode, bg, true);
		if (ret) {
			btrfs_put_block_group(bg);
			if (ret == 1)
				ret = 0;
			else
				goto out;
		}

		if (bsi.block_len &&
		    bsi.block_start + bsi.block_len == physical_block_start) {
			bsi.block_len += len;
		} else {
			if (bsi.block_len) {
				ret = btrfs_add_swap_extent(sis, &bsi);
				if (ret)
					goto out;
			}
			bsi.start = start;
			bsi.block_start = physical_block_start;
			bsi.block_len = len;
		}

		start += len;
	}

	if (bsi.block_len)
		ret = btrfs_add_swap_extent(sis, &bsi);

out:
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	unlock_extent(io_tree, 0, isize - 1, &cached_state);

	if (ret)
		btrfs_swap_deactivate(file);

	btrfs_drew_write_unlock(&root->snapshot_lock);

	btrfs_exclop_finish(fs_info);

	if (ret)
		return ret;

	if (device)
		sis->bdev = device->bdev;
	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
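	/*
	 * Page 0 of the swapfile is the swap header, so the usable page count
	 * and the highest usable page index are both one less than nr_pages.
	 */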
	sis->max = bsi.nr_pages;
	sis->pages = bsi.nr_pages - 1;
	sis->highest_bit = bsi.nr_pages - 1;
	return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
#endif

/*
 * Update the number of bytes used in the VFS' inode. When we replace extents
 * in a range (clone, dedupe, fallocate's zero range), we must update the
 * number of bytes used by the inode in an atomic manner, so that concurrent
 * stat(2) calls always get a correct value.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
			      const u64 add_bytes,
			      const u64 del_bytes)
{
	if (add_bytes == del_bytes)
		return;

	spin_lock(&inode->lock);
	if (del_bytes > 0)
		inode_sub_bytes(&inode->vfs_inode, del_bytes);
	if (add_bytes > 0)
		inode_add_bytes(&inode->vfs_inode, add_bytes);
	spin_unlock(&inode->lock);
}

/*
 * Verify that there are no ordered extents for a given file range.
 *
 * @inode:   The target inode.
 * @start:   Start offset of the file range, should be sector size aligned.
 * @end:     End offset (inclusive) of the file range, its value +1 should be
 *           sector size aligned.
 *
 * This should typically be used for cases where we locked an inode's VFS lock
 * in exclusive mode, we have also locked the inode's i_mmap_lock in exclusive
 * mode, we have flushed all delalloc in the range, we have waited for all
 * ordered extents in the range to complete and finally we have locked the
 * file range in the inode's io_tree.
 */
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_ordered_extent *ordered;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
	if (ordered) {
		btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
			  start, end, btrfs_ino(inode), root->root_key.objectid,
			  ordered->file_offset,
			  ordered->file_offset + ordered->num_bytes - 1);
		btrfs_put_ordered_extent(ordered);
	}

	ASSERT(ordered == NULL);
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile	= btrfs_tmpfile,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= btrfs_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.open		= btrfs_opendir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

10906/*
10907 * btrfs doesn't support the bmap operation because swapfiles
10908 * use bmap to make a mapping of extents in the file. They assume
10909 * these extents won't change over the life of the file and they
10910 * use the bmap result to do IO directly to the drive.
10911 *
10912 * the btrfs bmap call would return logical addresses that aren't
10913 * suitable for IO and they also will change frequently as COW
10914 * operations happen. So, swapfile + btrfs == corruption.
10915 *
10916 * For now we're avoiding this by dropping bmap.
10917 */
static const struct address_space_operations btrfs_aops = {
	.read_folio	= btrfs_read_folio,
	.writepages	= btrfs_writepages,
	.readahead	= btrfs_readahead,
	.invalidate_folio = btrfs_invalidate_folio,
	.release_folio	= btrfs_release_folio,
	.migrate_folio	= btrfs_migrate_folio,
	.dirty_folio	= filemap_dirty_folio,
	.error_remove_page = generic_error_remove_page,
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_inode_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};