send.c source code [linux/fs/btrfs/send.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2012 Alexander Block. All rights reserved.
4	*/
5
6	#include <linux/bsearch.h>
7	#include <linux/fs.h>
8	#include <linux/file.h>
9	#include <linux/sort.h>
10	#include <linux/mount.h>
11	#include <linux/xattr.h>
12	#include <linux/posix_acl_xattr.h>
13	#include <linux/radix-tree.h>
14	#include <linux/vmalloc.h>
15	#include <linux/string.h>
16	#include <linux/compat.h>
17	#include <linux/crc32c.h>
18	#include <linux/fsverity.h>
19
20	#include "send.h"
21	#include "ctree.h"
22	#include "backref.h"
23	#include "locking.h"
24	#include "disk-io.h"
25	#include "btrfs_inode.h"
26	#include "transaction.h"
27	#include "compression.h"
28	#include "xattr.h"
29	#include "print-tree.h"
30	#include "accessors.h"
31	#include "dir-item.h"
32	#include "file-item.h"
33	#include "ioctl.h"
34	#include "verity.h"
35	#include "lru_cache.h"
36
37	/*
38	* Maximum number of references an extent can have in order for us to attempt to
39	* issue clone operations instead of write operations. This currently exists to
40	* avoid hitting limitations of the backreference walking code (taking a lot of
41	* time and using too much memory for extents with large number of references).
42	*/
43	#define SEND_MAX_EXTENT_REFS 1024
44
/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			/* First character of the path built so far. */
			char *start;
			/* Position of the terminating NUL byte of the path. */
			char *end;

			/* Current storage: inline_buf or a separate allocation. */
			char *buf;
			/* Size in bytes of the storage that buf points to. */
			unsigned short buf_len:15;
			/* Non-zero when elements are added on the left side. */
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, we'll have
		 * better packing in the slab and higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
/* Number of bytes available in inline_buf before a separate buffer is needed. */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
73
74
75	/ reused for each extent /
76	struct clone_root {
77	struct btrfs_root *root;
78	u64 ino;
79	u64 offset;
80	u64 num_bytes;
81	bool found_ref;
82	};
83
84	#define SEND_MAX_NAME_CACHE_SIZE 256
85
86	/*
87	* Limit the root_ids array of struct backref_cache_entry to 17 elements.
88	* This makes the size of a cache entry to be exactly 192 bytes on x86_64, which
89	* can be satisfied from the kmalloc-192 slab, without wasting any space.
90	* The most common case is to have a single root for cloning, which corresponds
91	* to the send root. Having the user specify more than 16 clone roots is not
92	* common, and in such rare cases we simply don't use caching if the number of
93	* cloning roots that lead down to a leaf is more than 17.
94	*/
95	#define SEND_MAX_BACKREF_CACHE_ROOTS 17
96
97	/*
98	* Max number of entries in the cache.
99	* With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
100	* maple tree's internal nodes, is 24K.
101	*/
102	#define SEND_MAX_BACKREF_CACHE_SIZE 128
103
104	/*
105	* A backref cache entry maps a leaf to a list of IDs of roots from which the
106	* leaf is accessible and we can use for clone operations.
107	* With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
108	* x86_64).
109	*/
110	struct backref_cache_entry {
111	struct btrfs_lru_cache_entry entry;
112	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
113	/ Number of valid elements in the root_ids array. /
114	int num_roots;
115	};
116
117	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
118	static_assert(offsetof(struct backref_cache_entry, entry) == `0`);
119
120	/*
121	* Max number of entries in the cache that stores directories that were already
122	* created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
123	* at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
124	* the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
125	*/
126	#define SEND_MAX_DIR_CREATED_CACHE_SIZE 64
127
128	/*
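129	* Max number of entries in the cache of directory utimes (presumably the
130	* dir_utimes_cache of struct send_ctx - TODO confirm; the previous text was a
130	* copy of the dir created cache comment). The cache uses raw struct
130	* btrfs_lru_cache_entry entries, so it uses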
131	* at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
132	* the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
133	*/
134	#define SEND_MAX_DIR_UTIMES_CACHE_SIZE 64
135
136	struct send_ctx {
137	struct file *send_filp;
138	loff_t send_off;
139	char *send_buf;
140	u32 send_size;
141	u32 send_max_size;
142	/*
143	* Whether BTRFS_SEND_A_DATA attribute was already added to current
144	* command (since protocol v2, data must be the last attribute).
145	*/
146	bool put_data;
147	struct page **send_buf_pages;
148	u64 flags; / 'flags' member of btrfs_ioctl_send_args is u64 /
149	/ Protocol version compatibility requested /
150	u32 proto;
151
152	struct btrfs_root *send_root;
153	struct btrfs_root *parent_root;
154	struct clone_root *clone_roots;
155	int clone_roots_cnt;
156
157	/ current state of the compare_tree call /
158	struct btrfs_path *left_path;
159	struct btrfs_path *right_path;
160	struct btrfs_key *cmp_key;
161
162	/*
163	* Keep track of the generation of the last transaction that was used
164	* for relocating a block group. This is periodically checked in order
165	* to detect if a relocation happened since the last check, so that we
166	* don't operate on stale extent buffers for nodes (level >= 1) or on
167	* stale disk_bytenr values of file extent items.
168	*/
169	u64 last_reloc_trans;
170
171	/*
172	* infos of the currently processed inode. In case of deleted inodes,
173	* these are the values from the deleted inode.
174	*/
175	u64 cur_ino;
176	u64 cur_inode_gen;
177	u64 cur_inode_size;
178	u64 cur_inode_mode;
179	u64 cur_inode_rdev;
180	u64 cur_inode_last_extent;
181	u64 cur_inode_next_write_offset;
182	bool cur_inode_new;
183	bool cur_inode_new_gen;
184	bool cur_inode_deleted;
185	bool ignore_cur_inode;
186	bool cur_inode_needs_verity;
187	void *verity_descriptor;
188
189	u64 send_progress;
190
191	struct list_head new_refs;
192	struct list_head deleted_refs;
193
194	struct btrfs_lru_cache name_cache;
195
196	/*
197	* The inode we are currently processing. It's not NULL only when we
198	* need to issue write commands for data extents from this inode.
199	*/
200	struct inode *cur_inode;
201	struct file_ra_state ra;
202	u64 page_cache_clear_start;
203	bool clean_page_cache;
204
205	/*
206	* We process inodes by their increasing order, so if before an
207	* incremental send we reverse the parent/child relationship of
208	* directories such that a directory with a lower inode number was
209	* the parent of a directory with a higher inode number, and the one
210	* becoming the new parent got renamed too, we can't rename/move the
211	* directory with lower inode number when we finish processing it - we
212	* must process the directory with higher inode number first, then
213	* rename/move it and then rename/move the directory with lower inode
214	* number. Example follows.
215	*
216	* Tree state when the first send was performed:
217	*
218	* .
219	* \|-- a (ino 257)
220	* \|-- b (ino 258)
221	* \|
222	* \|
223	* \|-- c (ino 259)
224	* \| \|-- d (ino 260)
225	* \|
226	* \|-- c2 (ino 261)
227	*
228	* Tree state when the second (incremental) send is performed:
229	*
230	* .
231	* \|-- a (ino 257)
232	* \|-- b (ino 258)
233	* \|-- c2 (ino 261)
234	* \|-- d2 (ino 260)
235	* \|-- cc (ino 259)
236	*
237	* The sequence of steps that lead to the second state was:
238	*
239	* mv /a/b/c/d /a/b/c2/d2
240	* mv /a/b/c /a/b/c2/d2/cc
241	*
242	* "c" has lower inode number, but we can't move it (2nd mv operation)
243	* before we move "d", which has higher inode number.
244	*
245	* So we just memorize which move/rename operations must be performed
246	* later when their respective parent is processed and moved/renamed.
247	*/
248
249	/ Indexed by parent directory inode number. /
250	struct rb_root pending_dir_moves;
251
252	/*
253	* Reverse index, indexed by the inode number of a directory that
254	* is waiting for the move/rename of its immediate parent before its
255	* own move/rename can be performed.
256	*/
257	struct rb_root waiting_dir_moves;
258
259	/*
260	* A directory that is going to be rm'ed might have a child directory
261	* which is in the pending directory moves index above. In this case,
262	* the directory can only be removed after the move/rename of its child
263	* is performed. Example:
264	*
265	* Parent snapshot:
266	*
267	* . (ino 256)
268	* \|-- a/ (ino 257)
269	* \|-- b/ (ino 258)
270	* \|-- c/ (ino 259)
271	* \| \|-- x/ (ino 260)
272	* \|
273	* \|-- y/ (ino 261)
274	*
275	* Send snapshot:
276	*
277	* . (ino 256)
278	* \|-- a/ (ino 257)
279	* \|-- b/ (ino 258)
280	* \|-- YY/ (ino 261)
281	* \|-- x/ (ino 260)
282	*
283	* Sequence of steps that lead to the send snapshot:
284	* rm -f /a/b/c/foo.txt
285	* mv /a/b/y /a/b/YY
286	* mv /a/b/c/x /a/b/YY
287	* rmdir /a/b/c
288	*
289	* When the child is processed, its move/rename is delayed until its
290	* parent is processed (as explained above), but all other operations
291	* like update utimes, chown, chgrp, etc, are performed and the paths
292	* that it uses for those operations must use the orphanized name of
293	* its parent (the directory we're going to rm later), so we need to
294	* memorize that name.
295	*
296	* Indexed by the inode number of the directory to be deleted.
297	*/
298	struct rb_root orphan_dirs;
299
300	struct rb_root rbtree_new_refs;
301	struct rb_root rbtree_deleted_refs;
302
303	struct btrfs_lru_cache backref_cache;
304	u64 backref_cache_last_reloc_trans;
305
306	struct btrfs_lru_cache dir_created_cache;
307	struct btrfs_lru_cache dir_utimes_cache;
308	};
309
310	struct pending_dir_move {
311	struct rb_node node;
312	struct list_head list;
313	u64 parent_ino;
314	u64 ino;
315	u64 gen;
316	struct list_head update_refs;
317	};
318
319	struct waiting_dir_move {
320	struct rb_node node;
321	u64 ino;
322	/*
323	* There might be some directory that could not be removed because it
324	* was waiting for this directory inode to be moved first. Therefore
325	* after this directory is moved, we can try to rmdir the ino rmdir_ino.
326	*/
327	u64 rmdir_ino;
328	u64 rmdir_gen;
329	bool orphanized;
330	};
331
332	struct orphan_dir_info {
333	struct rb_node node;
334	u64 ino;
335	u64 gen;
336	u64 last_dir_index_offset;
337	u64 dir_high_seq_ino;
338	};
339
340	struct name_cache_entry {
341	/*
342	* The key in the entry is an inode number, and the generation matches
343	* the inode's generation.
344	*/
345	struct btrfs_lru_cache_entry entry;
346	u64 parent_ino;
347	u64 parent_gen;
348	int ret;
349	int need_later_update;
350	int name_len;
351	char name[];
352	};
353
354	/ See the comment at lru_cache.h about struct btrfs_lru_cache_entry. /
355	static_assert(offsetof(struct name_cache_entry, entry) == `0`);
356
357	#define ADVANCE 1
358	#define ADVANCE_ONLY_NEXT -1
359
/* Outcome of comparing an item between the two trees of a send. */
enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,		/* reported as "new" */
	BTRFS_COMPARE_TREE_DELETED,	/* reported as "deleted" */
	BTRFS_COMPARE_TREE_CHANGED,	/* reported as "updated" */
	BTRFS_COMPARE_TREE_SAME,	/* reported as "unchanged" */
};
366
367	__cold
368	static void inconsistent_snapshot_error(struct send_ctx *sctx,
369	enum btrfs_compare_tree_result result,
370	const char *what)
371	{
372	const char *result_string;
373
374	switch (result) {
375	case BTRFS_COMPARE_TREE_NEW:
376	result_string = "new";
377	break;
378	case BTRFS_COMPARE_TREE_DELETED:
379	result_string = "deleted";
380	break;
381	case BTRFS_COMPARE_TREE_CHANGED:
382	result_string = "updated";
383	break;
384	case BTRFS_COMPARE_TREE_SAME:
385	ASSERT(`0`);
386	result_string = "unchanged";
387	break;
388	default:
389	ASSERT(`0`);
390	result_string = "unexpected";
391	}
392
393	btrfs_err(sctx->send_root->fs_info,
394	"Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
395	result_string, what, sctx->cmp_key->objectid,
396	sctx->send_root->root_key.objectid,
397	(sctx->parent_root ?
398	sctx->parent_root->root_key.objectid : `0`));
399	}
400
401	__maybe_unused
402	static bool proto_cmd_ok(const struct send_ctx sctx, int* cmd)
403	{
404	switch (sctx->proto) {
405	case `1`: return cmd <= BTRFS_SEND_C_MAX_V1;
406	case `2`: return cmd <= BTRFS_SEND_C_MAX_V2;
407	case `3`: return cmd <= BTRFS_SEND_C_MAX_V3;
408	default: return false;
409	}
410	}
411
412	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
413
414	static struct waiting_dir_move *
415	get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
416
417	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
418
419	static int need_send_hole(struct send_ctx *sctx)
420	{
421	return (sctx->parent_root && !sctx->cur_inode_new &&
422	!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
423	S_ISREG(sctx->cur_inode_mode));
424	}
425
426	static void fs_path_reset(struct fs_path *p)
427	{
428	if (p->reversed) {
429	p->start = p->buf + p->buf_len - `1`;
430	p->end = p->start;
431	*p->start = `0`;
432	} else {
433	p->start = p->buf;
434	p->end = p->start;
435	*p->start = `0`;
436	}
437	}
438
439	static struct fs_path fs_path_alloc(void*)
440	{
441	struct fs_path *p;
442
443	p = kmalloc(size: sizeof(*p), GFP_KERNEL);
444	if (!p)
445	return NULL;
446	p->reversed = `0`;
447	p->buf = p->inline_buf;
448	p->buf_len = FS_PATH_INLINE_SIZE;
449	fs_path_reset(p);
450	return p;
451	}
452
453	static struct fs_path fs_path_alloc_reversed(void*)
454	{
455	struct fs_path *p;
456
457	p = fs_path_alloc();
458	if (!p)
459	return NULL;
460	p->reversed = `1`;
461	fs_path_reset(p);
462	return p;
463	}
464
465	static void fs_path_free(struct fs_path *p)
466	{
467	if (!p)
468	return;
469	if (p->buf != p->inline_buf)
470	kfree(objp: p->buf);
471	kfree(objp: p);
472	}
473
474	static int fs_path_len(struct fs_path *p)
475	{
476	return p->end - p->start;
477	}
478
479	static int fs_path_ensure_buf(struct fs_path p, int* len)
480	{
481	char *tmp_buf;
482	int path_len;
483	int old_buf_len;
484
485	len++;
486
487	if (p->buf_len >= len)
488	return `0`;
489
490	if (len > PATH_MAX) {
491	WARN_ON(`1`);
492	return -ENOMEM;
493	}
494
495	path_len = p->end - p->start;
496	old_buf_len = p->buf_len;
497
498	/*
499	* Allocate to the next largest kmalloc bucket size, to let
500	* the fast path happen most of the time.
501	*/
502	len = kmalloc_size_roundup(size: len);
503	/*
504	* First time the inline_buf does not suffice
505	*/
506	if (p->buf == p->inline_buf) {
507	tmp_buf = kmalloc(size: len, GFP_KERNEL);
508	if (tmp_buf)
509	memcpy(tmp_buf, p->buf, old_buf_len);
510	} else {
511	tmp_buf = krealloc(objp: p->buf, new_size: len, GFP_KERNEL);
512	}
513	if (!tmp_buf)
514	return -ENOMEM;
515	p->buf = tmp_buf;
516	p->buf_len = len;
517
518	if (p->reversed) {
519	tmp_buf = p->buf + old_buf_len - path_len - `1`;
520	p->end = p->buf + p->buf_len - `1`;
521	p->start = p->end - path_len;
522	memmove(p->start, tmp_buf, path_len + `1`);
523	} else {
524	p->start = p->buf;
525	p->end = p->start + path_len;
526	}
527	return `0`;
528	}
529
530	static int fs_path_prepare_for_add(struct fs_path p, int* name_len,
531	char **prepared)
532	{
533	int ret;
534	int new_len;
535
536	new_len = p->end - p->start + name_len;
537	if (p->start != p->end)
538	new_len++;
539	ret = fs_path_ensure_buf(p, len: new_len);
540	if (ret < `0`)
541	goto out;
542
543	if (p->reversed) {
544	if (p->start != p->end)
545	*--p->start = `'/'`;
546	p->start -= name_len;
547	*prepared = p->start;
548	} else {
549	if (p->start != p->end)
550	*p->end++ = `'/'`;
551	*prepared = p->end;
552	p->end += name_len;
553	*p->end = `0`;
554	}
555
556	out:
557	return ret;
558	}
559
/*
 * Add the element @name of @name_len bytes to path @p.
 * Returns 0 on success or a negative errno.
 */
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		return ret;
	memcpy(prepared, name, name_len);

	return ret;
}
573
574	static int fs_path_add_path(struct fs_path p, struct* fs_path *p2)
575	{
576	int ret;
577	char *prepared;
578
579	ret = fs_path_prepare_for_add(p, name_len: p2->end - p2->start, prepared: &prepared);
580	if (ret < `0`)
581	goto out;
582	memcpy(prepared, p2->start, p2->end - p2->start);
583
584	out:
585	return ret;
586	}
587
/*
 * Add an element to path @p whose @len bytes are read from extent buffer @eb
 * starting at offset @off.
 * Returns 0 on success or a negative errno.
 */
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		return ret;

	read_extent_buffer(eb, prepared, off, len);

	return ret;
}
604
605	static int fs_path_copy(struct fs_path p, struct* fs_path *from)
606	{
607	p->reversed = from->reversed;
608	fs_path_reset(p);
609
610	return fs_path_add_path(p, p2: from);
611	}
612
613	static void fs_path_unreverse(struct fs_path *p)
614	{
615	char *tmp;
616	int len;
617
618	if (!p->reversed)
619	return;
620
621	tmp = p->start;
622	len = p->end - p->start;
623	p->start = p->buf;
624	p->end = p->start + len;
625	memmove(p->start, tmp, len + `1`);
626	p->reversed = `0`;
627	}
628
629	static struct btrfs_path alloc_path_for_send(void*)
630	{
631	struct btrfs_path *path;
632
633	path = btrfs_alloc_path();
634	if (!path)
635	return NULL;
636	path->search_commit_root = `1`;
637	path->skip_locking = `1`;
638	path->need_commit_sem = `1`;
639	return path;
640	}
641
642	static int write_buf(struct file filp, const* void buf, u32 len, loff_t off)
643	{
644	int ret;
645	u32 pos = `0`;
646
647	while (pos < len) {
648	ret = kernel_write(filp, buf + pos, len - pos, off);
649	if (ret < `0`)
650	return ret;
651	if (ret == `0`)
652	return -EIO;
653	pos += ret;
654	}
655
656	return `0`;
657	}
658
659	static int tlv_put(struct send_ctx sctx, u16 attr, const* void data, int* len)
660	{
661	struct btrfs_tlv_header *hdr;
662	int total_len = sizeof(*hdr) + len;
663	int left = sctx->send_max_size - sctx->send_size;
664
665	if (WARN_ON_ONCE(sctx->put_data))
666	return -EINVAL;
667
668	if (unlikely(left < total_len))
669	return -EOVERFLOW;
670
671	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
672	put_unaligned_le16(val: attr, p: &hdr->tlv_type);
673	put_unaligned_le16(val: len, p: &hdr->tlv_len);
674	memcpy(hdr + `1`, data, len);
675	sctx->send_size += total_len;
676
677	return `0`;
678	}
679
680	#define TLV_PUT_DEFINE_INT(bits) \
681	static int tlv_put_u##bits(struct send_ctx *sctx, \
682	u##bits attr, u##bits value) \
683	{ \
684	__le##bits __tmp = cpu_to_le##bits(value); \
685	return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
686	}
687
688	TLV_PUT_DEFINE_INT(`8`)
689	TLV_PUT_DEFINE_INT(`32`)
690	TLV_PUT_DEFINE_INT(`64`)
691
692	static int tlv_put_string(struct send_ctx *sctx, u16 attr,
693	const char str, int* len)
694	{
695	if (len == -`1`)
696	len = strlen(str);
697	return tlv_put(sctx, attr, data: str, len);
698	}
699
700	static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
701	const u8 *uuid)
702	{
703	return tlv_put(sctx, attr, data: uuid, BTRFS_UUID_SIZE);
704	}
705
706	static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
707	struct extent_buffer *eb,
708	struct btrfs_timespec *ts)
709	{
710	struct btrfs_timespec bts;
711	read_extent_buffer(eb, dst: &bts, start: (unsigned long)ts, len: sizeof(bts));
712	return tlv_put(sctx, attr, data: &bts, len: sizeof(bts));
713	}
714
715
/*
 * Convenience wrappers around the tlv_put*() helpers. They all assign the
 * result to a local variable named 'ret' and jump to a label named
 * 'tlv_put_failure' on error, so both must exist in the calling function.
 */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
/*
 * NOTE(review): no TLV_PUT_DEFINE_INT(16) instantiation is visible near these
 * macros, so TLV_PUT_U16 only compiles if tlv_put_u16() is defined elsewhere.
 */
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
/* @p is a struct fs_path pointer; it is parenthesized to accept any expression. */
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
			(p)->end - (p)->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
759
760	static int send_header(struct send_ctx *sctx)
761	{
762	struct btrfs_stream_header hdr;
763
764	strcpy(p: hdr.magic, BTRFS_SEND_STREAM_MAGIC);
765	hdr.version = cpu_to_le32(sctx->proto);
766	return write_buf(filp: sctx->send_filp, buf: &hdr, len: sizeof(hdr),
767	off: &sctx->send_off);
768	}
769
770	/*
771	* For each command/item we want to send to userspace, we call this function.
772	*/
773	static int begin_cmd(struct send_ctx sctx, int* cmd)
774	{
775	struct btrfs_cmd_header *hdr;
776
777	if (WARN_ON(!sctx->send_buf))
778	return -EINVAL;
779
780	BUG_ON(sctx->send_size);
781
782	sctx->send_size += sizeof(*hdr);
783	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
784	put_unaligned_le16(val: cmd, p: &hdr->cmd);
785
786	return `0`;
787	}
788
789	static int send_cmd(struct send_ctx *sctx)
790	{
791	int ret;
792	struct btrfs_cmd_header *hdr;
793	u32 crc;
794
795	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
796	put_unaligned_le32(val: sctx->send_size - sizeof(*hdr), p: &hdr->len);
797	put_unaligned_le32(val: `0`, p: &hdr->crc);
798
799	crc = crc32c(crc: `0`, address: (unsigned char *)sctx->send_buf, length: sctx->send_size);
800	put_unaligned_le32(val: crc, p: &hdr->crc);
801
802	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
803	off: &sctx->send_off);
804
805	sctx->send_size = `0`;
806	sctx->put_data = false;
807
808	return ret;
809	}
810
811	/*
812	* Sends a move instruction to user space
813	*/
814	static int send_rename(struct send_ctx *sctx,
815	struct fs_path from, struct* fs_path *to)
816	{
817	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
818	int ret;
819
820	btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
821
822	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RENAME);
823	if (ret < `0`)
824	goto out;
825
826	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
827	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
828
829	ret = send_cmd(sctx);
830
831	tlv_put_failure:
832	out:
833	return ret;
834	}
835
836	/*
837	* Sends a link instruction to user space
838	*/
839	static int send_link(struct send_ctx *sctx,
840	struct fs_path path, struct* fs_path *lnk)
841	{
842	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
843	int ret;
844
845	btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
846
847	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_LINK);
848	if (ret < `0`)
849	goto out;
850
851	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
852	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
853
854	ret = send_cmd(sctx);
855
856	tlv_put_failure:
857	out:
858	return ret;
859	}
860
861	/*
862	* Sends an unlink instruction to user space
863	*/
864	static int send_unlink(struct send_ctx sctx, struct* fs_path *path)
865	{
866	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
867	int ret;
868
869	btrfs_debug(fs_info, "send_unlink %s", path->start);
870
871	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UNLINK);
872	if (ret < `0`)
873	goto out;
874
875	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
876
877	ret = send_cmd(sctx);
878
879	tlv_put_failure:
880	out:
881	return ret;
882	}
883
884	/*
885	* Sends a rmdir instruction to user space
886	*/
887	static int send_rmdir(struct send_ctx sctx, struct* fs_path *path)
888	{
889	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
890	int ret;
891
892	btrfs_debug(fs_info, "send_rmdir %s", path->start);
893
894	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_RMDIR);
895	if (ret < `0`)
896	goto out;
897
898	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
899
900	ret = send_cmd(sctx);
901
902	tlv_put_failure:
903	out:
904	return ret;
905	}
906
907	struct btrfs_inode_info {
908	u64 size;
909	u64 gen;
910	u64 mode;
911	u64 uid;
912	u64 gid;
913	u64 rdev;
914	u64 fileattr;
915	u64 nlink;
916	};
917
918	/*
919	* Helper function to retrieve some fields from an inode item.
920	*/
921	static int get_inode_info(struct btrfs_root *root, u64 ino,
922	struct btrfs_inode_info *info)
923	{
924	int ret;
925	struct btrfs_path *path;
926	struct btrfs_inode_item *ii;
927	struct btrfs_key key;
928
929	path = alloc_path_for_send();
930	if (!path)
931	return -ENOMEM;
932
933	key.objectid = ino;
934	key.type = BTRFS_INODE_ITEM_KEY;
935	key.offset = `0`;
936	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
937	if (ret) {
938	if (ret > `0`)
939	ret = -ENOENT;
940	goto out;
941	}
942
943	if (!info)
944	goto out;
945
946	ii = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
947	struct btrfs_inode_item);
948	info->size = btrfs_inode_size(eb: path->nodes[`0`], s: ii);
949	info->gen = btrfs_inode_generation(eb: path->nodes[`0`], s: ii);
950	info->mode = btrfs_inode_mode(eb: path->nodes[`0`], s: ii);
951	info->uid = btrfs_inode_uid(eb: path->nodes[`0`], s: ii);
952	info->gid = btrfs_inode_gid(eb: path->nodes[`0`], s: ii);
953	info->rdev = btrfs_inode_rdev(eb: path->nodes[`0`], s: ii);
954	info->nlink = btrfs_inode_nlink(eb: path->nodes[`0`], s: ii);
955	/*
956	* Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
957	* otherwise logically split to 32/32 parts.
958	*/
959	info->fileattr = btrfs_inode_flags(eb: path->nodes[`0`], s: ii);
960
961	out:
962	btrfs_free_path(p: path);
963	return ret;
964	}
965
966	static int get_inode_gen(struct btrfs_root root, u64 ino, u64 gen)
967	{
968	int ret;
969	struct btrfs_inode_info info = { `0` };
970
971	ASSERT(gen);
972
973	ret = get_inode_info(root, ino, info: &info);
974	*gen = info.gen;
975	return ret;
976	}
977
978	typedef int (iterate_inode_ref_t)(int* num, u64 dir, int index,
979	struct fs_path *p,
980	void *ctx);
981
982	/*
983	* Helper function to iterate the entries in ONE btrfs_inode_ref or
984	* btrfs_inode_extref.
985	* The iterate callback may return a non zero value to stop iteration. This can
986	* be a negative value for error codes or 1 to simply stop it.
987	*
988	* path must point to the INODE_REF or INODE_EXTREF when called.
989	*/
990	static int iterate_inode_ref(struct btrfs_root root, struct* btrfs_path *path,
991	struct btrfs_key found_key, int* resolve,
992	iterate_inode_ref_t iterate, void *ctx)
993	{
994	struct extent_buffer *eb = path->nodes[`0`];
995	struct btrfs_inode_ref *iref;
996	struct btrfs_inode_extref *extref;
997	struct btrfs_path *tmp_path;
998	struct fs_path *p;
999	u32 cur = `0`;
1000	u32 total;
1001	int slot = path->slots[`0`];
1002	u32 name_len;
1003	char *start;
1004	int ret = `0`;
1005	int num = `0`;
1006	int index;
1007	u64 dir;
1008	unsigned long name_off;
1009	unsigned long elem_size;
1010	unsigned long ptr;
1011
1012	p = fs_path_alloc_reversed();
1013	if (!p)
1014	return -ENOMEM;
1015
1016	tmp_path = alloc_path_for_send();
1017	if (!tmp_path) {
1018	fs_path_free(p);
1019	return -ENOMEM;
1020	}
1021
1022
1023	if (found_key->type == BTRFS_INODE_REF_KEY) {
1024	ptr = (unsigned long)btrfs_item_ptr(eb, slot,
1025	struct btrfs_inode_ref);
1026	total = btrfs_item_size(eb, slot);
1027	elem_size = sizeof(*iref);
1028	} else {
1029	ptr = btrfs_item_ptr_offset(eb, slot);
1030	total = btrfs_item_size(eb, slot);
1031	elem_size = sizeof(*extref);
1032	}
1033
1034	while (cur < total) {
1035	fs_path_reset(p);
1036
1037	if (found_key->type == BTRFS_INODE_REF_KEY) {
1038	iref = (struct btrfs_inode_ref *)(ptr + cur);
1039	name_len = btrfs_inode_ref_name_len(eb, s: iref);
1040	name_off = (unsigned long)(iref + `1`);
1041	index = btrfs_inode_ref_index(eb, s: iref);
1042	dir = found_key->offset;
1043	} else {
1044	extref = (struct btrfs_inode_extref *)(ptr + cur);
1045	name_len = btrfs_inode_extref_name_len(eb, s: extref);
1046	name_off = (unsigned long)&extref->name;
1047	index = btrfs_inode_extref_index(eb, s: extref);
1048	dir = btrfs_inode_extref_parent(eb, s: extref);
1049	}
1050
1051	if (resolve) {
1052	start = btrfs_ref_to_path(fs_root: root, path: tmp_path, name_len,
1053	name_off, eb_in: eb, parent: dir,
1054	dest: p->buf, size: p->buf_len);
1055	if (IS_ERR(ptr: start)) {
1056	ret = PTR_ERR(ptr: start);
1057	goto out;
1058	}
1059	if (start < p->buf) {
1060	/ overflow , try again with larger buffer /
1061	ret = fs_path_ensure_buf(p,
1062	len: p->buf_len + p->buf - start);
1063	if (ret < `0`)
1064	goto out;
1065	start = btrfs_ref_to_path(fs_root: root, path: tmp_path,
1066	name_len, name_off,
1067	eb_in: eb, parent: dir,
1068	dest: p->buf, size: p->buf_len);
1069	if (IS_ERR(ptr: start)) {
1070	ret = PTR_ERR(ptr: start);
1071	goto out;
1072	}
1073	BUG_ON(start < p->buf);
1074	}
1075	p->start = start;
1076	} else {
1077	ret = fs_path_add_from_extent_buffer(p, eb, off: name_off,
1078	len: name_len);
1079	if (ret < `0`)
1080	goto out;
1081	}
1082
1083	cur += elem_size + name_len;
1084	ret = iterate(num, dir, index, p, ctx);
1085	if (ret)
1086	goto out;
1087	num++;
1088	}
1089
1090	out:
1091	btrfs_free_path(p: tmp_path);
1092	fs_path_free(p);
1093	return ret;
1094	}
1095
/*
 * Callback of iterate_dir_item(): @num is the zero based position of the
 * entry, @di_key the key stored in it, @name/@name_len and @data/@data_len
 * the name and data bytes of the entry, and @ctx the caller's context pointer.
 */
typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  void *ctx);
1100
1101	/*
1102	* Helper function to iterate the entries in ONE btrfs_dir_item.
1103	* The iterate callback may return a non zero value to stop iteration. This can
1104	* be a negative value for error codes or 1 to simply stop it.
1105	*
1106	* path must point to the dir item when called.
1107	*/
1108	static int iterate_dir_item(struct btrfs_root root, struct* btrfs_path *path,
1109	iterate_dir_item_t iterate, void *ctx)
1110	{
1111	int ret = `0`;
1112	struct extent_buffer *eb;
1113	struct btrfs_dir_item *di;
1114	struct btrfs_key di_key;
1115	char *buf = NULL;
1116	int buf_len;
1117	u32 name_len;
1118	u32 data_len;
1119	u32 cur;
1120	u32 len;
1121	u32 total;
1122	int slot;
1123	int num;
1124
1125	/*
1126	* Start with a small buffer (1 page). If later we end up needing more
1127	* space, which can happen for xattrs on a fs with a leaf size greater
1128	* then the page size, attempt to increase the buffer. Typically xattr
1129	* values are small.
1130	*/
1131	buf_len = PATH_MAX;
1132	buf = kmalloc(size: buf_len, GFP_KERNEL);
1133	if (!buf) {
1134	ret = -ENOMEM;
1135	goto out;
1136	}
1137
1138	eb = path->nodes[`0`];
1139	slot = path->slots[`0`];
1140	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1141	cur = `0`;
1142	len = `0`;
1143	total = btrfs_item_size(eb, slot);
1144
1145	num = `0`;
1146	while (cur < total) {
1147	name_len = btrfs_dir_name_len(eb, s: di);
1148	data_len = btrfs_dir_data_len(eb, s: di);
1149	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
1150
1151	if (btrfs_dir_ftype(eb, item: di) == BTRFS_FT_XATTR) {
1152	if (name_len > XATTR_NAME_MAX) {
1153	ret = -ENAMETOOLONG;
1154	goto out;
1155	}
1156	if (name_len + data_len >
1157	BTRFS_MAX_XATTR_SIZE(info: root->fs_info)) {
1158	ret = -E2BIG;
1159	goto out;
1160	}
1161	} else {
1162	/*
1163	* Path too long
1164	*/
1165	if (name_len + data_len > PATH_MAX) {
1166	ret = -ENAMETOOLONG;
1167	goto out;
1168	}
1169	}
1170
1171	if (name_len + data_len > buf_len) {
1172	buf_len = name_len + data_len;
1173	if (is_vmalloc_addr(x: buf)) {
1174	vfree(addr: buf);
1175	buf = NULL;
1176	} else {
1177	char *tmp = krealloc(objp: buf, new_size: buf_len,
1178	GFP_KERNEL \| __GFP_NOWARN);
1179
1180	if (!tmp)
1181	kfree(objp: buf);
1182	buf = tmp;
1183	}
1184	if (!buf) {
1185	buf = kvmalloc(size: buf_len, GFP_KERNEL);
1186	if (!buf) {
1187	ret = -ENOMEM;
1188	goto out;
1189	}
1190	}
1191	}
1192
1193	read_extent_buffer(eb, dst: buf, start: (unsigned long)(di + `1`),
1194	len: name_len + data_len);
1195
1196	len = sizeof(*di) + name_len + data_len;
1197	di = (struct btrfs_dir_item )((char* *)di + len);
1198	cur += len;
1199
1200	ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1201	data_len, ctx);
1202	if (ret < `0`)
1203	goto out;
1204	if (ret) {
1205	ret = `0`;
1206	goto out;
1207	}
1208
1209	num++;
1210	}
1211
1212	out:
1213	kvfree(addr: buf);
1214	return ret;
1215	}
1216
1217	static int __copy_first_ref(int num, u64 dir, int index,
1218	struct fs_path p, void* *ctx)
1219	{
1220	int ret;
1221	struct fs_path *pt = ctx;
1222
1223	ret = fs_path_copy(p: pt, from: p);
1224	if (ret < `0`)
1225	return ret;
1226
1227	/ we want the first only /
1228	return `1`;
1229	}
1230
1231	/*
1232	* Retrieve the first path of an inode. If an inode has more then one
1233	* ref/hardlink, this is ignored.
1234	*/
1235	static int get_inode_path(struct btrfs_root *root,
1236	u64 ino, struct fs_path *path)
1237	{
1238	int ret;
1239	struct btrfs_key key, found_key;
1240	struct btrfs_path *p;
1241
1242	p = alloc_path_for_send();
1243	if (!p)
1244	return -ENOMEM;
1245
1246	fs_path_reset(p: path);
1247
1248	key.objectid = ino;
1249	key.type = BTRFS_INODE_REF_KEY;
1250	key.offset = `0`;
1251
1252	ret = btrfs_search_slot_for_read(root, key: &key, p, find_higher: `1`, return_any: `0`);
1253	if (ret < `0`)
1254	goto out;
1255	if (ret) {
1256	ret = `1`;
1257	goto out;
1258	}
1259	btrfs_item_key_to_cpu(eb: p->nodes[`0`], cpu_key: &found_key, nr: p->slots[`0`]);
1260	if (found_key.objectid != ino \|\|
1261	(found_key.type != BTRFS_INODE_REF_KEY &&
1262	found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1263	ret = -ENOENT;
1264	goto out;
1265	}
1266
1267	ret = iterate_inode_ref(root, path: p, found_key: &found_key, resolve: `1`,
1268	iterate: __copy_first_ref, ctx: path);
1269	if (ret < `0`)
1270	goto out;
1271	ret = `0`;
1272
1273	out:
1274	btrfs_free_path(p);
1275	return ret;
1276	}
1277
1278	struct backref_ctx {
1279	struct send_ctx *sctx;
1280
1281	/ number of total found references /
1282	u64 found;
1283
1284	/*
1285	* used for clones found in send_root. clones found behind cur_objectid
1286	* and cur_offset are not considered as allowed clones.
1287	*/
1288	u64 cur_objectid;
1289	u64 cur_offset;
1290
1291	/ may be truncated in case it's the last extent in a file /
1292	u64 extent_len;
1293
1294	/ The bytenr the file extent item we are processing refers to. /
1295	u64 bytenr;
1296	/ The owner (root id) of the data backref for the current extent. /
1297	u64 backref_owner;
1298	/ The offset of the data backref for the current extent. /
1299	u64 backref_offset;
1300	};
1301
1302	static int __clone_root_cmp_bsearch(const void key, const* void *elt)
1303	{
1304	u64 root = (u64)(uintptr_t)key;
1305	const struct clone_root *cr = elt;
1306
1307	if (root < cr->root->root_key.objectid)
1308	return -`1`;
1309	if (root > cr->root->root_key.objectid)
1310	return `1`;
1311	return `0`;
1312	}
1313
1314	static int __clone_root_cmp_sort(const void e1, const* void *e2)
1315	{
1316	const struct clone_root *cr1 = e1;
1317	const struct clone_root *cr2 = e2;
1318
1319	if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
1320	return -`1`;
1321	if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
1322	return `1`;
1323	return `0`;
1324	}
1325
1326	/*
1327	* Called for every backref that is found for the current extent.
1328	* Results are collected in sctx->clone_roots->ino/offset.
1329	*/
1330	static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
1331	void *ctx_)
1332	{
1333	struct backref_ctx *bctx = ctx_;
1334	struct clone_root *clone_root;
1335
1336	/ First check if the root is in the list of accepted clone sources /
1337	clone_root = bsearch(key: (void *)(uintptr_t)root_id, base: bctx->sctx->clone_roots,
1338	num: bctx->sctx->clone_roots_cnt,
1339	size: sizeof(struct clone_root),
1340	cmp: __clone_root_cmp_bsearch);
1341	if (!clone_root)
1342	return `0`;
1343
1344	/ This is our own reference, bail out as we can't clone from it. /
1345	if (clone_root->root == bctx->sctx->send_root &&
1346	ino == bctx->cur_objectid &&
1347	offset == bctx->cur_offset)
1348	return `0`;
1349
1350	/*
1351	* Make sure we don't consider clones from send_root that are
1352	* behind the current inode/offset.
1353	*/
1354	if (clone_root->root == bctx->sctx->send_root) {
1355	/*
1356	* If the source inode was not yet processed we can't issue a
1357	* clone operation, as the source extent does not exist yet at
1358	* the destination of the stream.
1359	*/
1360	if (ino > bctx->cur_objectid)
1361	return `0`;
1362	/*
1363	* We clone from the inode currently being sent as long as the
1364	* source extent is already processed, otherwise we could try
1365	* to clone from an extent that does not exist yet at the
1366	* destination of the stream.
1367	*/
1368	if (ino == bctx->cur_objectid &&
1369	offset + bctx->extent_len >
1370	bctx->sctx->cur_inode_next_write_offset)
1371	return `0`;
1372	}
1373
1374	bctx->found++;
1375	clone_root->found_ref = true;
1376
1377	/*
1378	* If the given backref refers to a file extent item with a larger
1379	* number of bytes than what we found before, use the new one so that
1380	* we clone more optimally and end up doing less writes and getting
1381	* less exclusive, non-shared extents at the destination.
1382	*/
1383	if (num_bytes > clone_root->num_bytes) {
1384	clone_root->ino = ino;
1385	clone_root->offset = offset;
1386	clone_root->num_bytes = num_bytes;
1387
1388	/*
1389	* Found a perfect candidate, so there's no need to continue
1390	* backref walking.
1391	*/
1392	if (num_bytes >= bctx->extent_len)
1393	return BTRFS_ITERATE_EXTENT_INODES_STOP;
1394	}
1395
1396	return `0`;
1397	}
1398
1399	static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
1400	const u64 *root_ids_ret, int* *root_count_ret)
1401	{
1402	struct backref_ctx *bctx = ctx;
1403	struct send_ctx *sctx = bctx->sctx;
1404	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1405	const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
1406	struct btrfs_lru_cache_entry *raw_entry;
1407	struct backref_cache_entry *entry;
1408
1409	if (btrfs_lru_cache_size(cache: &sctx->backref_cache) == `0`)
1410	return false;
1411
1412	/*
1413	* If relocation happened since we first filled the cache, then we must
1414	* empty the cache and can not use it, because even though we operate on
1415	* read-only roots, their leaves and nodes may have been reallocated and
1416	* now be used for different nodes/leaves of the same tree or some other
1417	* tree.
1418	*
1419	* We are called from iterate_extent_inodes() while either holding a
1420	* transaction handle or holding fs_info->commit_root_sem, so no need
1421	* to take any lock here.
1422	*/
1423	if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
1424	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
1425	return false;
1426	}
1427
1428	raw_entry = btrfs_lru_cache_lookup(cache: &sctx->backref_cache, key, gen: `0`);
1429	if (!raw_entry)
1430	return false;
1431
1432	entry = container_of(raw_entry, struct backref_cache_entry, entry);
1433	*root_ids_ret = entry->root_ids;
1434	*root_count_ret = entry->num_roots;
1435
1436	return true;
1437	}
1438
1439	static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
1440	void *ctx)
1441	{
1442	struct backref_ctx *bctx = ctx;
1443	struct send_ctx *sctx = bctx->sctx;
1444	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1445	struct backref_cache_entry *new_entry;
1446	struct ulist_iterator uiter;
1447	struct ulist_node *node;
1448	int ret;
1449
1450	/*
1451	* We're called while holding a transaction handle or while holding
1452	* fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
1453	* NOFS allocation.
1454	*/
1455	new_entry = kmalloc(size: sizeof(struct backref_cache_entry), GFP_NOFS);
1456	/ No worries, cache is optional. /
1457	if (!new_entry)
1458	return;
1459
1460	new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
1461	new_entry->entry.gen = `0`;
1462	new_entry->num_roots = `0`;
1463	ULIST_ITER_INIT(&uiter);
1464	while ((node = ulist_next(ulist: root_ids, uiter: &uiter)) != NULL) {
1465	const u64 root_id = node->val;
1466	struct clone_root *root;
1467
1468	root = bsearch(key: (void *)(uintptr_t)root_id, base: sctx->clone_roots,
1469	num: sctx->clone_roots_cnt, size: sizeof(struct clone_root),
1470	cmp: __clone_root_cmp_bsearch);
1471	if (!root)
1472	continue;
1473
1474	/ Too many roots, just exit, no worries as caching is optional. /
1475	if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
1476	kfree(objp: new_entry);
1477	return;
1478	}
1479
1480	new_entry->root_ids[new_entry->num_roots] = root_id;
1481	new_entry->num_roots++;
1482	}
1483
1484	/*
1485	* We may have not added any roots to the new cache entry, which means
1486	* none of the roots is part of the list of roots from which we are
1487	* allowed to clone. Cache the new entry as it's still useful to avoid
1488	* backref walking to determine which roots have a path to the leaf.
1489	*
1490	* Also use GFP_NOFS because we're called while holding a transaction
1491	* handle or while holding fs_info->commit_root_sem.
1492	*/
1493	ret = btrfs_lru_cache_store(cache: &sctx->backref_cache, new_entry: &new_entry->entry,
1494	GFP_NOFS);
1495	ASSERT(ret == `0` \|\| ret == -ENOMEM);
1496	if (ret) {
1497	/ Caching is optional, no worries. /
1498	kfree(objp: new_entry);
1499	return;
1500	}
1501
1502	/*
1503	* We are called from iterate_extent_inodes() while either holding a
1504	* transaction handle or holding fs_info->commit_root_sem, so no need
1505	* to take any lock here.
1506	*/
1507	if (btrfs_lru_cache_size(cache: &sctx->backref_cache) == `1`)
1508	sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
1509	}
1510
1511	static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
1512	const struct extent_buffer leaf, void* *ctx)
1513	{
1514	const u64 refs = btrfs_extent_refs(eb: leaf, s: ei);
1515	const struct backref_ctx *bctx = ctx;
1516	const struct send_ctx *sctx = bctx->sctx;
1517
1518	if (bytenr == bctx->bytenr) {
1519	const u64 flags = btrfs_extent_flags(eb: leaf, s: ei);
1520
1521	if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
1522	return -EUCLEAN;
1523
1524	/*
1525	* If we have only one reference and only the send root as a
1526	* clone source - meaning no clone roots were given in the
1527	* struct btrfs_ioctl_send_args passed to the send ioctl - then
1528	* it's our reference and there's no point in doing backref
1529	* walking which is expensive, so exit early.
1530	*/
1531	if (refs == `1` && sctx->clone_roots_cnt == `1`)
1532	return -ENOENT;
1533	}
1534
1535	/*
1536	* Backreference walking (iterate_extent_inodes() below) is currently
1537	* too expensive when an extent has a large number of references, both
1538	* in time spent and used memory. So for now just fallback to write
1539	* operations instead of clone operations when an extent has more than
1540	* a certain amount of references.
1541	*/
1542	if (refs > SEND_MAX_EXTENT_REFS)
1543	return -ENOENT;
1544
1545	return `0`;
1546	}
1547
1548	static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
1549	{
1550	const struct backref_ctx *bctx = ctx;
1551
1552	if (ino == bctx->cur_objectid &&
1553	root == bctx->backref_owner &&
1554	offset == bctx->backref_offset)
1555	return true;
1556
1557	return false;
1558	}
1559
1560	/*
1561	* Given an inode, offset and extent item, it finds a good clone for a clone
1562	* instruction. Returns -ENOENT when none could be found. The function makes
1563	* sure that the returned clone is usable at the point where sending is at the
1564	* moment. This means, that no clones are accepted which lie behind the current
1565	* inode+offset.
1566	*
1567	* path must point to the extent item when called.
1568	*/
1569	static int find_extent_clone(struct send_ctx *sctx,
1570	struct btrfs_path *path,
1571	u64 ino, u64 data_offset,
1572	u64 ino_size,
1573	struct clone_root **found)
1574	{
1575	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1576	int ret;
1577	int extent_type;
1578	u64 logical;
1579	u64 disk_byte;
1580	u64 num_bytes;
1581	struct btrfs_file_extent_item *fi;
1582	struct extent_buffer *eb = path->nodes[`0`];
1583	struct backref_ctx backref_ctx = { `0` };
1584	struct btrfs_backref_walk_ctx backref_walk_ctx = { `0` };
1585	struct clone_root *cur_clone_root;
1586	int compressed;
1587	u32 i;
1588
1589	/*
1590	* With fallocate we can get prealloc extents beyond the inode's i_size,
1591	* so we don't do anything here because clone operations can not clone
1592	* to a range beyond i_size without increasing the i_size of the
1593	* destination inode.
1594	*/
1595	if (data_offset >= ino_size)
1596	return `0`;
1597
1598	fi = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_file_extent_item);
1599	extent_type = btrfs_file_extent_type(eb, s: fi);
1600	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1601	return -ENOENT;
1602
1603	disk_byte = btrfs_file_extent_disk_bytenr(eb, s: fi);
1604	if (disk_byte == `0`)
1605	return -ENOENT;
1606
1607	compressed = btrfs_file_extent_compression(eb, s: fi);
1608	num_bytes = btrfs_file_extent_num_bytes(eb, s: fi);
1609	logical = disk_byte + btrfs_file_extent_offset(eb, s: fi);
1610
1611	/*
1612	* Setup the clone roots.
1613	*/
1614	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1615	cur_clone_root = sctx->clone_roots + i;
1616	cur_clone_root->ino = (u64)-`1`;
1617	cur_clone_root->offset = `0`;
1618	cur_clone_root->num_bytes = `0`;
1619	cur_clone_root->found_ref = false;
1620	}
1621
1622	backref_ctx.sctx = sctx;
1623	backref_ctx.cur_objectid = ino;
1624	backref_ctx.cur_offset = data_offset;
1625	backref_ctx.bytenr = disk_byte;
1626	/*
1627	* Use the header owner and not the send root's id, because in case of a
1628	* snapshot we can have shared subtrees.
1629	*/
1630	backref_ctx.backref_owner = btrfs_header_owner(eb);
1631	backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, s: fi);
1632
1633	/*
1634	* The last extent of a file may be too large due to page alignment.
1635	* We need to adjust extent_len in this case so that the checks in
1636	* iterate_backrefs() work.
1637	*/
1638	if (data_offset + num_bytes >= ino_size)
1639	backref_ctx.extent_len = ino_size - data_offset;
1640	else
1641	backref_ctx.extent_len = num_bytes;
1642
1643	/*
1644	* Now collect all backrefs.
1645	*/
1646	backref_walk_ctx.bytenr = disk_byte;
1647	if (compressed == BTRFS_COMPRESS_NONE)
1648	backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, s: fi);
1649	backref_walk_ctx.fs_info = fs_info;
1650	backref_walk_ctx.cache_lookup = lookup_backref_cache;
1651	backref_walk_ctx.cache_store = store_backref_cache;
1652	backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
1653	backref_walk_ctx.check_extent_item = check_extent_item;
1654	backref_walk_ctx.user_ctx = &backref_ctx;
1655
1656	/*
1657	* If have a single clone root, then it's the send root and we can tell
1658	* the backref walking code to skip our own backref and not resolve it,
1659	* since we can not use it for cloning - the source and destination
1660	* ranges can't overlap and in case the leaf is shared through a subtree
1661	* due to snapshots, we can't use those other roots since they are not
1662	* in the list of clone roots.
1663	*/
1664	if (sctx->clone_roots_cnt == `1`)
1665	backref_walk_ctx.skip_data_ref = skip_self_data_ref;
1666
1667	ret = iterate_extent_inodes(ctx: &backref_walk_ctx, search_commit_root: true, iterate: iterate_backrefs,
1668	user_ctx: &backref_ctx);
1669	if (ret < `0`)
1670	return ret;
1671
1672	down_read(sem: &fs_info->commit_root_sem);
1673	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
1674	/*
1675	* A transaction commit for a transaction in which block group
1676	* relocation was done just happened.
1677	* The disk_bytenr of the file extent item we processed is
1678	* possibly stale, referring to the extent's location before
1679	* relocation. So act as if we haven't found any clone sources
1680	* and fallback to write commands, which will read the correct
1681	* data from the new extent location. Otherwise we will fail
1682	* below because we haven't found our own back reference or we
1683	* could be getting incorrect sources in case the old extent
1684	* was already reallocated after the relocation.
1685	*/
1686	up_read(sem: &fs_info->commit_root_sem);
1687	return -ENOENT;
1688	}
1689	up_read(sem: &fs_info->commit_root_sem);
1690
1691	btrfs_debug(fs_info,
1692	"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1693	data_offset, ino, num_bytes, logical);
1694
1695	if (!backref_ctx.found) {
1696	btrfs_debug(fs_info, "no clones found");
1697	return -ENOENT;
1698	}
1699
1700	cur_clone_root = NULL;
1701	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
1702	struct clone_root *clone_root = &sctx->clone_roots[i];
1703
1704	if (!clone_root->found_ref)
1705	continue;
1706
1707	/*
1708	* Choose the root from which we can clone more bytes, to
1709	* minimize write operations and therefore have more extent
1710	* sharing at the destination (the same as in the source).
1711	*/
1712	if (!cur_clone_root \|\|
1713	clone_root->num_bytes > cur_clone_root->num_bytes) {
1714	cur_clone_root = clone_root;
1715
1716	/*
1717	* We found an optimal clone candidate (any inode from
1718	* any root is fine), so we're done.
1719	*/
1720	if (clone_root->num_bytes >= backref_ctx.extent_len)
1721	break;
1722	}
1723	}
1724
1725	if (cur_clone_root) {
1726	*found = cur_clone_root;
1727	ret = `0`;
1728	} else {
1729	ret = -ENOENT;
1730	}
1731
1732	return ret;
1733	}
1734
1735	static int read_symlink(struct btrfs_root *root,
1736	u64 ino,
1737	struct fs_path *dest)
1738	{
1739	int ret;
1740	struct btrfs_path *path;
1741	struct btrfs_key key;
1742	struct btrfs_file_extent_item *ei;
1743	u8 type;
1744	u8 compression;
1745	unsigned long off;
1746	int len;
1747
1748	path = alloc_path_for_send();
1749	if (!path)
1750	return -ENOMEM;
1751
1752	key.objectid = ino;
1753	key.type = BTRFS_EXTENT_DATA_KEY;
1754	key.offset = `0`;
1755	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
1756	if (ret < `0`)
1757	goto out;
1758	if (ret) {
1759	/*
1760	* An empty symlink inode. Can happen in rare error paths when
1761	* creating a symlink (transaction committed before the inode
1762	* eviction handler removed the symlink inode items and a crash
1763	* happened in between or the subvol was snapshoted in between).
1764	* Print an informative message to dmesg/syslog so that the user
1765	* can delete the symlink.
1766	*/
1767	btrfs_err(root->fs_info,
1768	"Found empty symlink inode %llu at root %llu",
1769	ino, root->root_key.objectid);
1770	ret = -EIO;
1771	goto out;
1772	}
1773
1774	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
1775	struct btrfs_file_extent_item);
1776	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
1777	if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
1778	ret = -EUCLEAN;
1779	btrfs_crit(root->fs_info,
1780	"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
1781	ino, btrfs_root_id(root), type);
1782	goto out;
1783	}
1784	compression = btrfs_file_extent_compression(eb: path->nodes[`0`], s: ei);
1785	if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
1786	ret = -EUCLEAN;
1787	btrfs_crit(root->fs_info,
1788	"send: found symlink extent with compression, ino %llu root %llu compression type %d",
1789	ino, btrfs_root_id(root), compression);
1790	goto out;
1791	}
1792
1793	off = btrfs_file_extent_inline_start(e: ei);
1794	len = btrfs_file_extent_ram_bytes(eb: path->nodes[`0`], s: ei);
1795
1796	ret = fs_path_add_from_extent_buffer(p: dest, eb: path->nodes[`0`], off, len);
1797
1798	out:
1799	btrfs_free_path(p: path);
1800	return ret;
1801	}
1802
1803	/*
1804	* Helper function to generate a file name that is unique in the root of
1805	* send_root and parent_root. This is used to generate names for orphan inodes.
1806	*/
1807	static int gen_unique_name(struct send_ctx *sctx,
1808	u64 ino, u64 gen,
1809	struct fs_path *dest)
1810	{
1811	int ret = `0`;
1812	struct btrfs_path *path;
1813	struct btrfs_dir_item *di;
1814	char tmp[`64`];
1815	int len;
1816	u64 idx = `0`;
1817
1818	path = alloc_path_for_send();
1819	if (!path)
1820	return -ENOMEM;
1821
1822	while (`1`) {
1823	struct fscrypt_str tmp_name;
1824
1825	len = snprintf(buf: tmp, size: sizeof(tmp), fmt: "o%llu-%llu-%llu",
1826	ino, gen, idx);
1827	ASSERT(len < sizeof(tmp));
1828	tmp_name.name = tmp;
1829	tmp_name.len = strlen(tmp);
1830
1831	di = btrfs_lookup_dir_item(NULL, root: sctx->send_root,
1832	path, BTRFS_FIRST_FREE_OBJECTID,
1833	name: &tmp_name, mod: `0`);
1834	btrfs_release_path(p: path);
1835	if (IS_ERR(ptr: di)) {
1836	ret = PTR_ERR(ptr: di);
1837	goto out;
1838	}
1839	if (di) {
1840	/ not unique, try again /
1841	idx++;
1842	continue;
1843	}
1844
1845	if (!sctx->parent_root) {
1846	/ unique /
1847	ret = `0`;
1848	break;
1849	}
1850
1851	di = btrfs_lookup_dir_item(NULL, root: sctx->parent_root,
1852	path, BTRFS_FIRST_FREE_OBJECTID,
1853	name: &tmp_name, mod: `0`);
1854	btrfs_release_path(p: path);
1855	if (IS_ERR(ptr: di)) {
1856	ret = PTR_ERR(ptr: di);
1857	goto out;
1858	}
1859	if (di) {
1860	/ not unique, try again /
1861	idx++;
1862	continue;
1863	}
1864	/ unique /
1865	break;
1866	}
1867
1868	ret = fs_path_add(p: dest, name: tmp, strlen(tmp));
1869
1870	out:
1871	btrfs_free_path(p: path);
1872	return ret;
1873	}
1874
/*
 * State of an inode (identified by number and generation) relative to the
 * progress of the send operation, as computed by get_cur_inode_state().
 *
 * "create" states: the inode with the given generation exists in the send
 * root but not in the parent root. "delete" states: it exists in the parent
 * root but not in the send root. The "did" variants are reported when the
 * inode number is below sctx->send_progress, the "will" variants otherwise.
 */
enum inode_state {
	/* Exists with the given generation in both roots. */
	inode_state_no_change,
	inode_state_will_create,
	inode_state_did_create,
	inode_state_will_delete,
	inode_state_did_delete,
};
1882
1883	static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
1884	u64 send_gen, u64 parent_gen)
1885	{
1886	int ret;
1887	int left_ret;
1888	int right_ret;
1889	u64 left_gen;
1890	u64 right_gen = `0`;
1891	struct btrfs_inode_info info;
1892
1893	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
1894	if (ret < `0` && ret != -ENOENT)
1895	goto out;
1896	left_ret = (info.nlink == `0`) ? -ENOENT : ret;
1897	left_gen = info.gen;
1898	if (send_gen)
1899	*send_gen = ((left_ret == -ENOENT) ? `0` : info.gen);
1900
1901	if (!sctx->parent_root) {
1902	right_ret = -ENOENT;
1903	} else {
1904	ret = get_inode_info(root: sctx->parent_root, ino, info: &info);
1905	if (ret < `0` && ret != -ENOENT)
1906	goto out;
1907	right_ret = (info.nlink == `0`) ? -ENOENT : ret;
1908	right_gen = info.gen;
1909	if (parent_gen)
1910	*parent_gen = ((right_ret == -ENOENT) ? `0` : info.gen);
1911	}
1912
1913	if (!left_ret && !right_ret) {
1914	if (left_gen == gen && right_gen == gen) {
1915	ret = inode_state_no_change;
1916	} else if (left_gen == gen) {
1917	if (ino < sctx->send_progress)
1918	ret = inode_state_did_create;
1919	else
1920	ret = inode_state_will_create;
1921	} else if (right_gen == gen) {
1922	if (ino < sctx->send_progress)
1923	ret = inode_state_did_delete;
1924	else
1925	ret = inode_state_will_delete;
1926	} else {
1927	ret = -ENOENT;
1928	}
1929	} else if (!left_ret) {
1930	if (left_gen == gen) {
1931	if (ino < sctx->send_progress)
1932	ret = inode_state_did_create;
1933	else
1934	ret = inode_state_will_create;
1935	} else {
1936	ret = -ENOENT;
1937	}
1938	} else if (!right_ret) {
1939	if (right_gen == gen) {
1940	if (ino < sctx->send_progress)
1941	ret = inode_state_did_delete;
1942	else
1943	ret = inode_state_will_delete;
1944	} else {
1945	ret = -ENOENT;
1946	}
1947	} else {
1948	ret = -ENOENT;
1949	}
1950
1951	out:
1952	return ret;
1953	}
1954
1955	static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
1956	u64 send_gen, u64 parent_gen)
1957	{
1958	int ret;
1959
1960	if (ino == BTRFS_FIRST_FREE_OBJECTID)
1961	return `1`;
1962
1963	ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
1964	if (ret < `0`)
1965	goto out;
1966
1967	if (ret == inode_state_no_change \|\|
1968	ret == inode_state_did_create \|\|
1969	ret == inode_state_will_delete)
1970	ret = `1`;
1971	else
1972	ret = `0`;
1973
1974	out:
1975	return ret;
1976	}
1977
1978	/*
1979	* Helper function to lookup a dir item in a dir.
1980	*/
1981	static int lookup_dir_item_inode(struct btrfs_root *root,
1982	u64 dir, const char name, int* name_len,
1983	u64 *found_inode)
1984	{
1985	int ret = `0`;
1986	struct btrfs_dir_item *di;
1987	struct btrfs_key key;
1988	struct btrfs_path *path;
1989	struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
1990
1991	path = alloc_path_for_send();
1992	if (!path)
1993	return -ENOMEM;
1994
1995	di = btrfs_lookup_dir_item(NULL, root, path, dir, name: &name_str, mod: `0`);
1996	if (IS_ERR_OR_NULL(ptr: di)) {
1997	ret = di ? PTR_ERR(ptr: di) : -ENOENT;
1998	goto out;
1999	}
2000	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &key);
2001	if (key.type == BTRFS_ROOT_ITEM_KEY) {
2002	ret = -ENOENT;
2003	goto out;
2004	}
2005	*found_inode = key.objectid;
2006
2007	out:
2008	btrfs_free_path(p: path);
2009	return ret;
2010	}
2011
2012	/*
2013	* Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
2014	* generation of the parent dir and the name of the dir entry.
2015	*/
2016	static int get_first_ref(struct btrfs_root *root, u64 ino,
2017	u64 dir, u64 dir_gen, struct fs_path *name)
2018	{
2019	int ret;
2020	struct btrfs_key key;
2021	struct btrfs_key found_key;
2022	struct btrfs_path *path;
2023	int len;
2024	u64 parent_dir;
2025
2026	path = alloc_path_for_send();
2027	if (!path)
2028	return -ENOMEM;
2029
2030	key.objectid = ino;
2031	key.type = BTRFS_INODE_REF_KEY;
2032	key.offset = `0`;
2033
2034	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `1`, return_any: `0`);
2035	if (ret < `0`)
2036	goto out;
2037	if (!ret)
2038	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &found_key,
2039	nr: path->slots[`0`]);
2040	if (ret \|\| found_key.objectid != ino \|\|
2041	(found_key.type != BTRFS_INODE_REF_KEY &&
2042	found_key.type != BTRFS_INODE_EXTREF_KEY)) {
2043	ret = -ENOENT;
2044	goto out;
2045	}
2046
2047	if (found_key.type == BTRFS_INODE_REF_KEY) {
2048	struct btrfs_inode_ref *iref;
2049	iref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2050	struct btrfs_inode_ref);
2051	len = btrfs_inode_ref_name_len(eb: path->nodes[`0`], s: iref);
2052	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
2053	off: (unsigned long)(iref + `1`),
2054	len);
2055	parent_dir = found_key.offset;
2056	} else {
2057	struct btrfs_inode_extref *extref;
2058	extref = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
2059	struct btrfs_inode_extref);
2060	len = btrfs_inode_extref_name_len(eb: path->nodes[`0`], s: extref);
2061	ret = fs_path_add_from_extent_buffer(p: name, eb: path->nodes[`0`],
2062	off: (unsigned long)&extref->name, len);
2063	parent_dir = btrfs_inode_extref_parent(eb: path->nodes[`0`], s: extref);
2064	}
2065	if (ret < `0`)
2066	goto out;
2067	btrfs_release_path(p: path);
2068
2069	if (dir_gen) {
2070	ret = get_inode_gen(root, ino: parent_dir, gen: dir_gen);
2071	if (ret < `0`)
2072	goto out;
2073	}
2074
2075	*dir = parent_dir;
2076
2077	out:
2078	btrfs_free_path(p: path);
2079	return ret;
2080	}
2081
2082	static int is_first_ref(struct btrfs_root *root,
2083	u64 ino, u64 dir,
2084	const char name, int* name_len)
2085	{
2086	int ret;
2087	struct fs_path *tmp_name;
2088	u64 tmp_dir;
2089
2090	tmp_name = fs_path_alloc();
2091	if (!tmp_name)
2092	return -ENOMEM;
2093
2094	ret = get_first_ref(root, ino, dir: &tmp_dir, NULL, name: tmp_name);
2095	if (ret < `0`)
2096	goto out;
2097
2098	if (dir != tmp_dir \|\| name_len != fs_path_len(p: tmp_name)) {
2099	ret = `0`;
2100	goto out;
2101	}
2102
2103	ret = !memcmp(p: tmp_name->start, q: name, size: name_len);
2104
2105	out:
2106	fs_path_free(p: tmp_name);
2107	return ret;
2108	}
2109
2110	/*
2111	* Used by process_recorded_refs to determine if a new ref would overwrite an
2112	* already existing ref. In case it detects an overwrite, it returns the
2113	* inode/gen in who_ino/who_gen.
2114	* When an overwrite is detected, process_recorded_refs does proper orphanizing
2115	* to make sure later references to the overwritten inode are possible.
2116	* Orphanizing is however only required for the first ref of an inode.
2117	* process_recorded_refs does an additional is_first_ref check to see if
2118	* orphanizing is really required.
2119	*/
2120	static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2121	const char name, int* name_len,
2122	u64 who_ino, u64 who_gen, u64 *who_mode)
2123	{
2124	int ret;
2125	u64 parent_root_dir_gen;
2126	u64 other_inode = `0`;
2127	struct btrfs_inode_info info;
2128
2129	if (!sctx->parent_root)
2130	return `0`;
2131
2132	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, NULL, parent_gen: &parent_root_dir_gen);
2133	if (ret <= `0`)
2134	return `0`;
2135
2136	/*
2137	* If we have a parent root we need to verify that the parent dir was
2138	* not deleted and then re-created, if it was then we have no overwrite
2139	* and we can just unlink this entry.
2140	*
2141	* @parent_root_dir_gen was set to 0 if the inode does not exist in the
2142	* parent root.
2143	*/
2144	if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
2145	parent_root_dir_gen != dir_gen)
2146	return `0`;
2147
2148	ret = lookup_dir_item_inode(root: sctx->parent_root, dir, name, name_len,
2149	found_inode: &other_inode);
2150	if (ret == -ENOENT)
2151	return `0`;
2152	else if (ret < `0`)
2153	return ret;
2154
2155	/*
2156	* Check if the overwritten ref was already processed. If yes, the ref
2157	* was already unlinked/moved, so we can safely assume that we will not
2158	* overwrite anything at this point in time.
2159	*/
2160	if (other_inode > sctx->send_progress \|\|
2161	is_waiting_for_move(sctx, ino: other_inode)) {
2162	ret = get_inode_info(root: sctx->parent_root, ino: other_inode, info: &info);
2163	if (ret < `0`)
2164	return ret;
2165
2166	*who_ino = other_inode;
2167	*who_gen = info.gen;
2168	*who_mode = info.mode;
2169	return `1`;
2170	}
2171
2172	return `0`;
2173	}
2174
2175	/*
2176	* Checks if the ref was overwritten by an already processed inode. This is
2177	* used by __get_cur_name_and_parent to find out if the ref was orphanized and
2178	* thus the orphan name needs be used.
2179	* process_recorded_refs also uses it to avoid unlinking of refs that were
2180	* overwritten.
2181	*/
2182	static int did_overwrite_ref(struct send_ctx *sctx,
2183	u64 dir, u64 dir_gen,
2184	u64 ino, u64 ino_gen,
2185	const char name, int* name_len)
2186	{
2187	int ret;
2188	u64 ow_inode;
2189	u64 ow_gen = `0`;
2190	u64 send_root_dir_gen;
2191
2192	if (!sctx->parent_root)
2193	return `0`;
2194
2195	ret = is_inode_existent(sctx, ino: dir, gen: dir_gen, send_gen: &send_root_dir_gen, NULL);
2196	if (ret <= `0`)
2197	return ret;
2198
2199	/*
2200	* @send_root_dir_gen was set to 0 if the inode does not exist in the
2201	* send root.
2202	*/
2203	if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
2204	return `0`;
2205
2206	/ check if the ref was overwritten by another ref /
2207	ret = lookup_dir_item_inode(root: sctx->send_root, dir, name, name_len,
2208	found_inode: &ow_inode);
2209	if (ret == -ENOENT) {
2210	/ was never and will never be overwritten /
2211	return `0`;
2212	} else if (ret < `0`) {
2213	return ret;
2214	}
2215
2216	if (ow_inode == ino) {
2217	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2218	if (ret < `0`)
2219	return ret;
2220
2221	/ It's the same inode, so no overwrite happened. /
2222	if (ow_gen == ino_gen)
2223	return `0`;
2224	}
2225
2226	/*
2227	* We know that it is or will be overwritten. Check this now.
2228	* The current inode being processed might have been the one that caused
2229	* inode 'ino' to be orphanized, therefore check if ow_inode matches
2230	* the current inode being processed.
2231	*/
2232	if (ow_inode < sctx->send_progress)
2233	return `1`;
2234
2235	if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
2236	if (ow_gen == `0`) {
2237	ret = get_inode_gen(root: sctx->send_root, ino: ow_inode, gen: &ow_gen);
2238	if (ret < `0`)
2239	return ret;
2240	}
2241	if (ow_gen == sctx->cur_inode_gen)
2242	return `1`;
2243	}
2244
2245	return `0`;
2246	}
2247
2248	/*
2249	* Same as did_overwrite_ref, but also checks if it is the first ref of an inode
2250	* that got overwritten. This is used by process_recorded_refs to determine
2251	* if it has to use the path as returned by get_cur_path or the orphan name.
2252	*/
2253	static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2254	{
2255	int ret = `0`;
2256	struct fs_path *name = NULL;
2257	u64 dir;
2258	u64 dir_gen;
2259
2260	if (!sctx->parent_root)
2261	goto out;
2262
2263	name = fs_path_alloc();
2264	if (!name)
2265	return -ENOMEM;
2266
2267	ret = get_first_ref(root: sctx->parent_root, ino, dir: &dir, dir_gen: &dir_gen, name);
2268	if (ret < `0`)
2269	goto out;
2270
2271	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, ino_gen: gen,
2272	name: name->start, name_len: fs_path_len(p: name));
2273
2274	out:
2275	fs_path_free(p: name);
2276	return ret;
2277	}
2278
2279	static inline struct name_cache_entry name_cache_search(struct* send_ctx *sctx,
2280	u64 ino, u64 gen)
2281	{
2282	struct btrfs_lru_cache_entry *entry;
2283
2284	entry = btrfs_lru_cache_lookup(cache: &sctx->name_cache, key: ino, gen);
2285	if (!entry)
2286	return NULL;
2287
2288	return container_of(entry, struct name_cache_entry, entry);
2289	}
2290
2291	/*
2292	* Used by get_cur_path for each ref up to the root.
2293	* Returns 0 if it succeeded.
2294	* Returns 1 if the inode is not existent or got overwritten. In that case, the
2295	* name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2296	* is returned, parent_ino/parent_gen are not guaranteed to be valid.
2297	* Returns <0 in case of error.
2298	*/
2299	static int __get_cur_name_and_parent(struct send_ctx *sctx,
2300	u64 ino, u64 gen,
2301	u64 *parent_ino,
2302	u64 *parent_gen,
2303	struct fs_path *dest)
2304	{
2305	int ret;
2306	int nce_ret;
2307	struct name_cache_entry *nce;
2308
2309	/*
2310	* First check if we already did a call to this function with the same
2311	* ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2312	* return the cached result.
2313	*/
2314	nce = name_cache_search(sctx, ino, gen);
2315	if (nce) {
2316	if (ino < sctx->send_progress && nce->need_later_update) {
2317	btrfs_lru_cache_remove(cache: &sctx->name_cache, entry: &nce->entry);
2318	nce = NULL;
2319	} else {
2320	*parent_ino = nce->parent_ino;
2321	*parent_gen = nce->parent_gen;
2322	ret = fs_path_add(p: dest, name: nce->name, name_len: nce->name_len);
2323	if (ret < `0`)
2324	goto out;
2325	ret = nce->ret;
2326	goto out;
2327	}
2328	}
2329
2330	/*
2331	* If the inode is not existent yet, add the orphan name and return 1.
2332	* This should only happen for the parent dir that we determine in
2333	* record_new_ref_if_needed().
2334	*/
2335	ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
2336	if (ret < `0`)
2337	goto out;
2338
2339	if (!ret) {
2340	ret = gen_unique_name(sctx, ino, gen, dest);
2341	if (ret < `0`)
2342	goto out;
2343	ret = `1`;
2344	goto out_cache;
2345	}
2346
2347	/*
2348	* Depending on whether the inode was already processed or not, use
2349	* send_root or parent_root for ref lookup.
2350	*/
2351	if (ino < sctx->send_progress)
2352	ret = get_first_ref(root: sctx->send_root, ino,
2353	dir: parent_ino, dir_gen: parent_gen, name: dest);
2354	else
2355	ret = get_first_ref(root: sctx->parent_root, ino,
2356	dir: parent_ino, dir_gen: parent_gen, name: dest);
2357	if (ret < `0`)
2358	goto out;
2359
2360	/*
2361	* Check if the ref was overwritten by an inode's ref that was processed
2362	* earlier. If yes, treat as orphan and return 1.
2363	*/
2364	ret = did_overwrite_ref(sctx, dir: parent_ino, dir_gen: parent_gen, ino, ino_gen: gen,
2365	name: dest->start, name_len: dest->end - dest->start);
2366	if (ret < `0`)
2367	goto out;
2368	if (ret) {
2369	fs_path_reset(p: dest);
2370	ret = gen_unique_name(sctx, ino, gen, dest);
2371	if (ret < `0`)
2372	goto out;
2373	ret = `1`;
2374	}
2375
2376	out_cache:
2377	/*
2378	* Store the result of the lookup in the name cache.
2379	*/
2380	nce = kmalloc(size: sizeof(*nce) + fs_path_len(p: dest) + `1`, GFP_KERNEL);
2381	if (!nce) {
2382	ret = -ENOMEM;
2383	goto out;
2384	}
2385
2386	nce->entry.key = ino;
2387	nce->entry.gen = gen;
2388	nce->parent_ino = *parent_ino;
2389	nce->parent_gen = *parent_gen;
2390	nce->name_len = fs_path_len(p: dest);
2391	nce->ret = ret;
2392	strcpy(p: nce->name, q: dest->start);
2393
2394	if (ino < sctx->send_progress)
2395	nce->need_later_update = `0`;
2396	else
2397	nce->need_later_update = `1`;
2398
2399	nce_ret = btrfs_lru_cache_store(cache: &sctx->name_cache, new_entry: &nce->entry, GFP_KERNEL);
2400	if (nce_ret < `0`) {
2401	kfree(objp: nce);
2402	ret = nce_ret;
2403	}
2404
2405	out:
2406	return ret;
2407	}
2408
2409	/*
2410	* Magic happens here. This function returns the first ref to an inode as it
2411	* would look like while receiving the stream at this point in time.
2412	* We walk the path up to the root. For every inode in between, we check if it
2413	* was already processed/sent. If yes, we continue with the parent as found
2414	* in send_root. If not, we continue with the parent as found in parent_root.
2415	* If we encounter an inode that was deleted at this point in time, we use the
2416	* inodes "orphan" name instead of the real name and stop. Same with new inodes
2417	* that were not created yet and overwritten inodes/refs.
2418	*
2419	* When do we have orphan inodes:
2420	* 1. When an inode is freshly created and thus no valid refs are available yet
2421	* 2. When a directory lost all it's refs (deleted) but still has dir items
2422	* inside which were not processed yet (pending for move/delete). If anyone
2423	* tried to get the path to the dir items, it would get a path inside that
2424	* orphan directory.
2425	* 3. When an inode is moved around or gets new links, it may overwrite the ref
2426	* of an unprocessed inode. If in that case the first ref would be
2427	* overwritten, the overwritten inode gets "orphanized". Later when we
2428	* process this overwritten inode, it is restored at a new place by moving
2429	* the orphan inode.
2430	*
2431	* sctx->send_progress tells this function at which point in time receiving
2432	* would be.
2433	*/
2434	static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2435	struct fs_path *dest)
2436	{
2437	int ret = `0`;
2438	struct fs_path *name = NULL;
2439	u64 parent_inode = `0`;
2440	u64 parent_gen = `0`;
2441	int stop = `0`;
2442
2443	name = fs_path_alloc();
2444	if (!name) {
2445	ret = -ENOMEM;
2446	goto out;
2447	}
2448
2449	dest->reversed = `1`;
2450	fs_path_reset(p: dest);
2451
2452	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2453	struct waiting_dir_move *wdm;
2454
2455	fs_path_reset(p: name);
2456
2457	if (is_waiting_for_rm(sctx, dir_ino: ino, gen)) {
2458	ret = gen_unique_name(sctx, ino, gen, dest: name);
2459	if (ret < `0`)
2460	goto out;
2461	ret = fs_path_add_path(p: dest, p2: name);
2462	break;
2463	}
2464
2465	wdm = get_waiting_dir_move(sctx, ino);
2466	if (wdm && wdm->orphanized) {
2467	ret = gen_unique_name(sctx, ino, gen, dest: name);
2468	stop = `1`;
2469	} else if (wdm) {
2470	ret = get_first_ref(root: sctx->parent_root, ino,
2471	dir: &parent_inode, dir_gen: &parent_gen, name);
2472	} else {
2473	ret = __get_cur_name_and_parent(sctx, ino, gen,
2474	parent_ino: &parent_inode,
2475	parent_gen: &parent_gen, dest: name);
2476	if (ret)
2477	stop = `1`;
2478	}
2479
2480	if (ret < `0`)
2481	goto out;
2482
2483	ret = fs_path_add_path(p: dest, p2: name);
2484	if (ret < `0`)
2485	goto out;
2486
2487	ino = parent_inode;
2488	gen = parent_gen;
2489	}
2490
2491	out:
2492	fs_path_free(p: name);
2493	if (!ret)
2494	fs_path_unreverse(p: dest);
2495	return ret;
2496	}
2497
2498	/*
2499	* Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2500	*/
2501	static int send_subvol_begin(struct send_ctx *sctx)
2502	{
2503	int ret;
2504	struct btrfs_root *send_root = sctx->send_root;
2505	struct btrfs_root *parent_root = sctx->parent_root;
2506	struct btrfs_path *path;
2507	struct btrfs_key key;
2508	struct btrfs_root_ref *ref;
2509	struct extent_buffer *leaf;
2510	char *name = NULL;
2511	int namelen;
2512
2513	path = btrfs_alloc_path();
2514	if (!path)
2515	return -ENOMEM;
2516
2517	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2518	if (!name) {
2519	btrfs_free_path(p: path);
2520	return -ENOMEM;
2521	}
2522
2523	key.objectid = send_root->root_key.objectid;
2524	key.type = BTRFS_ROOT_BACKREF_KEY;
2525	key.offset = `0`;
2526
2527	ret = btrfs_search_slot_for_read(root: send_root->fs_info->tree_root,
2528	key: &key, p: path, find_higher: `1`, return_any: `0`);
2529	if (ret < `0`)
2530	goto out;
2531	if (ret) {
2532	ret = -ENOENT;
2533	goto out;
2534	}
2535
2536	leaf = path->nodes[`0`];
2537	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
2538	if (key.type != BTRFS_ROOT_BACKREF_KEY \|\|
2539	key.objectid != send_root->root_key.objectid) {
2540	ret = -ENOENT;
2541	goto out;
2542	}
2543	ref = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_root_ref);
2544	namelen = btrfs_root_ref_name_len(eb: leaf, s: ref);
2545	read_extent_buffer(eb: leaf, dst: name, start: (unsigned long)(ref + `1`), len: namelen);
2546	btrfs_release_path(p: path);
2547
2548	if (parent_root) {
2549	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SNAPSHOT);
2550	if (ret < `0`)
2551	goto out;
2552	} else {
2553	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SUBVOL);
2554	if (ret < `0`)
2555	goto out;
2556	}
2557
2558	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2559
2560	if (!btrfs_is_empty_uuid(uuid: sctx->send_root->root_item.received_uuid))
2561	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2562	sctx->send_root->root_item.received_uuid);
2563	else
2564	TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2565	sctx->send_root->root_item.uuid);
2566
2567	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2568	btrfs_root_ctransid(&sctx->send_root->root_item));
2569	if (parent_root) {
2570	if (!btrfs_is_empty_uuid(uuid: parent_root->root_item.received_uuid))
2571	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2572	parent_root->root_item.received_uuid);
2573	else
2574	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2575	parent_root->root_item.uuid);
2576	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2577	btrfs_root_ctransid(&sctx->parent_root->root_item));
2578	}
2579
2580	ret = send_cmd(sctx);
2581
2582	tlv_put_failure:
2583	out:
2584	btrfs_free_path(p: path);
2585	kfree(objp: name);
2586	return ret;
2587	}
2588
2589	static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2590	{
2591	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2592	int ret = `0`;
2593	struct fs_path *p;
2594
2595	btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2596
2597	p = fs_path_alloc();
2598	if (!p)
2599	return -ENOMEM;
2600
2601	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_TRUNCATE);
2602	if (ret < `0`)
2603	goto out;
2604
2605	ret = get_cur_path(sctx, ino, gen, dest: p);
2606	if (ret < `0`)
2607	goto out;
2608	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2609	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2610
2611	ret = send_cmd(sctx);
2612
2613	tlv_put_failure:
2614	out:
2615	fs_path_free(p);
2616	return ret;
2617	}
2618
2619	static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2620	{
2621	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2622	int ret = `0`;
2623	struct fs_path *p;
2624
2625	btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2626
2627	p = fs_path_alloc();
2628	if (!p)
2629	return -ENOMEM;
2630
2631	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHMOD);
2632	if (ret < `0`)
2633	goto out;
2634
2635	ret = get_cur_path(sctx, ino, gen, dest: p);
2636	if (ret < `0`)
2637	goto out;
2638	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2639	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & `07777`);
2640
2641	ret = send_cmd(sctx);
2642
2643	tlv_put_failure:
2644	out:
2645	fs_path_free(p);
2646	return ret;
2647	}
2648
2649	static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
2650	{
2651	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2652	int ret = `0`;
2653	struct fs_path *p;
2654
2655	if (sctx->proto < `2`)
2656	return `0`;
2657
2658	btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
2659
2660	p = fs_path_alloc();
2661	if (!p)
2662	return -ENOMEM;
2663
2664	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_FILEATTR);
2665	if (ret < `0`)
2666	goto out;
2667
2668	ret = get_cur_path(sctx, ino, gen, dest: p);
2669	if (ret < `0`)
2670	goto out;
2671	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2672	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
2673
2674	ret = send_cmd(sctx);
2675
2676	tlv_put_failure:
2677	out:
2678	fs_path_free(p);
2679	return ret;
2680	}
2681
2682	static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2683	{
2684	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2685	int ret = `0`;
2686	struct fs_path *p;
2687
2688	btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2689	ino, uid, gid);
2690
2691	p = fs_path_alloc();
2692	if (!p)
2693	return -ENOMEM;
2694
2695	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CHOWN);
2696	if (ret < `0`)
2697	goto out;
2698
2699	ret = get_cur_path(sctx, ino, gen, dest: p);
2700	if (ret < `0`)
2701	goto out;
2702	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2703	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2704	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2705
2706	ret = send_cmd(sctx);
2707
2708	tlv_put_failure:
2709	out:
2710	fs_path_free(p);
2711	return ret;
2712	}
2713
2714	static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2715	{
2716	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2717	int ret = `0`;
2718	struct fs_path *p = NULL;
2719	struct btrfs_inode_item *ii;
2720	struct btrfs_path *path = NULL;
2721	struct extent_buffer *eb;
2722	struct btrfs_key key;
2723	int slot;
2724
2725	btrfs_debug(fs_info, "send_utimes %llu", ino);
2726
2727	p = fs_path_alloc();
2728	if (!p)
2729	return -ENOMEM;
2730
2731	path = alloc_path_for_send();
2732	if (!path) {
2733	ret = -ENOMEM;
2734	goto out;
2735	}
2736
2737	key.objectid = ino;
2738	key.type = BTRFS_INODE_ITEM_KEY;
2739	key.offset = `0`;
2740	ret = btrfs_search_slot(NULL, root: sctx->send_root, key: &key, p: path, ins_len: `0`, cow: `0`);
2741	if (ret > `0`)
2742	ret = -ENOENT;
2743	if (ret < `0`)
2744	goto out;
2745
2746	eb = path->nodes[`0`];
2747	slot = path->slots[`0`];
2748	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2749
2750	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UTIMES);
2751	if (ret < `0`)
2752	goto out;
2753
2754	ret = get_cur_path(sctx, ino, gen, dest: p);
2755	if (ret < `0`)
2756	goto out;
2757	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2758	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2759	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2760	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2761	if (sctx->proto >= `2`)
2762	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);
2763
2764	ret = send_cmd(sctx);
2765
2766	tlv_put_failure:
2767	out:
2768	fs_path_free(p);
2769	btrfs_free_path(p: path);
2770	return ret;
2771	}
2772
2773	/*
2774	* If the cache is full, we can't remove entries from it and do a call to
2775	* send_utimes() for each respective inode, because we might be finishing
2776	* processing an inode that is a directory and it just got renamed, and existing
2777	* entries in the cache may refer to inodes that have the directory in their
2778	* full path - in which case we would generate outdated paths (pre-rename)
2779	* for the inodes that the cache entries point to. Instead of prunning the
2780	* cache when inserting, do it after we finish processing each inode at
2781	* finish_inode_if_needed().
2782	*/
2783	static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
2784	{
2785	struct btrfs_lru_cache_entry *entry;
2786	int ret;
2787
2788	entry = btrfs_lru_cache_lookup(cache: &sctx->dir_utimes_cache, key: dir, gen);
2789	if (entry != NULL)
2790	return `0`;
2791
2792	/ Caching is optional, don't fail if we can't allocate memory. /
2793	entry = kmalloc(size: sizeof(*entry), GFP_KERNEL);
2794	if (!entry)
2795	return send_utimes(sctx, ino: dir, gen);
2796
2797	entry->key = dir;
2798	entry->gen = gen;
2799
2800	ret = btrfs_lru_cache_store(cache: &sctx->dir_utimes_cache, new_entry: entry, GFP_KERNEL);
2801	ASSERT(ret != -EEXIST);
2802	if (ret) {
2803	kfree(objp: entry);
2804	return send_utimes(sctx, ino: dir, gen);
2805	}
2806
2807	return `0`;
2808	}
2809
2810	static int trim_dir_utimes_cache(struct send_ctx *sctx)
2811	{
2812	while (btrfs_lru_cache_size(cache: &sctx->dir_utimes_cache) >
2813	SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
2814	struct btrfs_lru_cache_entry *lru;
2815	int ret;
2816
2817	lru = btrfs_lru_cache_lru_entry(cache: &sctx->dir_utimes_cache);
2818	ASSERT(lru != NULL);
2819
2820	ret = send_utimes(sctx, ino: lru->key, gen: lru->gen);
2821	if (ret)
2822	return ret;
2823
2824	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry: lru);
2825	}
2826
2827	return `0`;
2828	}
2829
2830	/*
2831	* Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2832	* a valid path yet because we did not process the refs yet. So, the inode
2833	* is created as orphan.
2834	*/
2835	static int send_create_inode(struct send_ctx *sctx, u64 ino)
2836	{
2837	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2838	int ret = `0`;
2839	struct fs_path *p;
2840	int cmd;
2841	struct btrfs_inode_info info;
2842	u64 gen;
2843	u64 mode;
2844	u64 rdev;
2845
2846	btrfs_debug(fs_info, "send_create_inode %llu", ino);
2847
2848	p = fs_path_alloc();
2849	if (!p)
2850	return -ENOMEM;
2851
2852	if (ino != sctx->cur_ino) {
2853	ret = get_inode_info(root: sctx->send_root, ino, info: &info);
2854	if (ret < `0`)
2855	goto out;
2856	gen = info.gen;
2857	mode = info.mode;
2858	rdev = info.rdev;
2859	} else {
2860	gen = sctx->cur_inode_gen;
2861	mode = sctx->cur_inode_mode;
2862	rdev = sctx->cur_inode_rdev;
2863	}
2864
2865	if (S_ISREG(mode)) {
2866	cmd = BTRFS_SEND_C_MKFILE;
2867	} else if (S_ISDIR(mode)) {
2868	cmd = BTRFS_SEND_C_MKDIR;
2869	} else if (S_ISLNK(mode)) {
2870	cmd = BTRFS_SEND_C_SYMLINK;
2871	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode)) {
2872	cmd = BTRFS_SEND_C_MKNOD;
2873	} else if (S_ISFIFO(mode)) {
2874	cmd = BTRFS_SEND_C_MKFIFO;
2875	} else if (S_ISSOCK(mode)) {
2876	cmd = BTRFS_SEND_C_MKSOCK;
2877	} else {
2878	btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2879	(int)(mode & S_IFMT));
2880	ret = -EOPNOTSUPP;
2881	goto out;
2882	}
2883
2884	ret = begin_cmd(sctx, cmd);
2885	if (ret < `0`)
2886	goto out;
2887
2888	ret = gen_unique_name(sctx, ino, gen, dest: p);
2889	if (ret < `0`)
2890	goto out;
2891
2892	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2893	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2894
2895	if (S_ISLNK(mode)) {
2896	fs_path_reset(p);
2897	ret = read_symlink(root: sctx->send_root, ino, dest: p);
2898	if (ret < `0`)
2899	goto out;
2900	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2901	} else if (S_ISCHR(mode) \|\| S_ISBLK(mode) \|\|
2902	S_ISFIFO(mode) \|\| S_ISSOCK(mode)) {
2903	TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2904	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2905	}
2906
2907	ret = send_cmd(sctx);
2908	if (ret < `0`)
2909	goto out;
2910
2911
2912	tlv_put_failure:
2913	out:
2914	fs_path_free(p);
2915	return ret;
2916	}
2917
2918	static void cache_dir_created(struct send_ctx *sctx, u64 dir)
2919	{
2920	struct btrfs_lru_cache_entry *entry;
2921	int ret;
2922
2923	/ Caching is optional, ignore any failures. /
2924	entry = kmalloc(size: sizeof(*entry), GFP_KERNEL);
2925	if (!entry)
2926	return;
2927
2928	entry->key = dir;
2929	entry->gen = `0`;
2930	ret = btrfs_lru_cache_store(cache: &sctx->dir_created_cache, new_entry: entry, GFP_KERNEL);
2931	if (ret < `0`)
2932	kfree(objp: entry);
2933	}
2934
2935	/*
2936	* We need some special handling for inodes that get processed before the parent
2937	* directory got created. See process_recorded_refs for details.
2938	* This function does the check if we already created the dir out of order.
2939	*/
2940	static int did_create_dir(struct send_ctx *sctx, u64 dir)
2941	{
2942	int ret = `0`;
2943	int iter_ret = `0`;
2944	struct btrfs_path *path = NULL;
2945	struct btrfs_key key;
2946	struct btrfs_key found_key;
2947	struct btrfs_key di_key;
2948	struct btrfs_dir_item *di;
2949
2950	if (btrfs_lru_cache_lookup(cache: &sctx->dir_created_cache, key: dir, gen: `0`))
2951	return `1`;
2952
2953	path = alloc_path_for_send();
2954	if (!path)
2955	return -ENOMEM;
2956
2957	key.objectid = dir;
2958	key.type = BTRFS_DIR_INDEX_KEY;
2959	key.offset = `0`;
2960
2961	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
2962	struct extent_buffer *eb = path->nodes[`0`];
2963
2964	if (found_key.objectid != key.objectid \|\|
2965	found_key.type != key.type) {
2966	ret = `0`;
2967	break;
2968	}
2969
2970	di = btrfs_item_ptr(eb, path->slots[`0`], struct btrfs_dir_item);
2971	btrfs_dir_item_key_to_cpu(eb, item: di, cpu_key: &di_key);
2972
2973	if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2974	di_key.objectid < sctx->send_progress) {
2975	ret = `1`;
2976	cache_dir_created(sctx, dir);
2977	break;
2978	}
2979	}
2980	/ Catch error found during iteration /
2981	if (iter_ret < `0`)
2982	ret = iter_ret;
2983
2984	btrfs_free_path(p: path);
2985	return ret;
2986	}
2987
2988	/*
2989	* Only creates the inode if it is:
2990	* 1. Not a directory
2991	* 2. Or a directory which was not created already due to out of order
2992	* directories. See did_create_dir and process_recorded_refs for details.
2993	*/
2994	static int send_create_inode_if_needed(struct send_ctx *sctx)
2995	{
2996	int ret;
2997
2998	if (S_ISDIR(sctx->cur_inode_mode)) {
2999	ret = did_create_dir(sctx, dir: sctx->cur_ino);
3000	if (ret < `0`)
3001	return ret;
3002	else if (ret > `0`)
3003	return `0`;
3004	}
3005
3006	ret = send_create_inode(sctx, ino: sctx->cur_ino);
3007
3008	if (ret == `0` && S_ISDIR(sctx->cur_inode_mode))
3009	cache_dir_created(sctx, dir: sctx->cur_ino);
3010
3011	return ret;
3012	}
3013
3014	struct recorded_ref {
3015	struct list_head list;
3016	char *name;
3017	struct fs_path *full_path;
3018	u64 dir;
3019	u64 dir_gen;
3020	int name_len;
3021	struct rb_node node;
3022	struct rb_root *root;
3023	};
3024
3025	static struct recorded_ref recorded_ref_alloc(void*)
3026	{
3027	struct recorded_ref *ref;
3028
3029	ref = kzalloc(size: sizeof(*ref), GFP_KERNEL);
3030	if (!ref)
3031	return NULL;
3032	RB_CLEAR_NODE(&ref->node);
3033	INIT_LIST_HEAD(list: &ref->list);
3034	return ref;
3035	}
3036
3037	static void recorded_ref_free(struct recorded_ref *ref)
3038	{
3039	if (!ref)
3040	return;
3041	if (!RB_EMPTY_NODE(&ref->node))
3042	rb_erase(&ref->node, ref->root);
3043	list_del(entry: &ref->list);
3044	fs_path_free(p: ref->full_path);
3045	kfree(objp: ref);
3046	}
3047
3048	static void set_ref_path(struct recorded_ref ref, struct* fs_path *path)
3049	{
3050	ref->full_path = path;
3051	ref->name = (char *)kbasename(path: ref->full_path->start);
3052	ref->name_len = ref->full_path->end - ref->name;
3053	}
3054
3055	static int dup_ref(struct recorded_ref ref, struct* list_head *list)
3056	{
3057	struct recorded_ref *new;
3058
3059	new = recorded_ref_alloc();
3060	if (!new)
3061	return -ENOMEM;
3062
3063	new->dir = ref->dir;
3064	new->dir_gen = ref->dir_gen;
3065	list_add_tail(new: &new->list, head: list);
3066	return `0`;
3067	}
3068
3069	static void __free_recorded_refs(struct list_head *head)
3070	{
3071	struct recorded_ref *cur;
3072
3073	while (!list_empty(head)) {
3074	cur = list_entry(head->next, struct recorded_ref, list);
3075	recorded_ref_free(ref: cur);
3076	}
3077	}
3078
3079	static void free_recorded_refs(struct send_ctx *sctx)
3080	{
3081	__free_recorded_refs(head: &sctx->new_refs);
3082	__free_recorded_refs(head: &sctx->deleted_refs);
3083	}
3084
3085	/*
3086	* Renames/moves a file/dir to its orphan name. Used when the first
3087	* ref of an unprocessed inode gets overwritten and for all non empty
3088	* directories.
3089	*/
3090	static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
3091	struct fs_path *path)
3092	{
3093	int ret;
3094	struct fs_path *orphan;
3095
3096	orphan = fs_path_alloc();
3097	if (!orphan)
3098	return -ENOMEM;
3099
3100	ret = gen_unique_name(sctx, ino, gen, dest: orphan);
3101	if (ret < `0`)
3102	goto out;
3103
3104	ret = send_rename(sctx, from: path, to: orphan);
3105
3106	out:
3107	fs_path_free(p: orphan);
3108	return ret;
3109	}
3110
3111	static struct orphan_dir_info add_orphan_dir_info(struct* send_ctx *sctx,
3112	u64 dir_ino, u64 dir_gen)
3113	{
3114	struct rb_node **p = &sctx->orphan_dirs.rb_node;
3115	struct rb_node *parent = NULL;
3116	struct orphan_dir_info entry, odi;
3117
3118	while (*p) {
3119	parent = *p;
3120	entry = rb_entry(parent, struct orphan_dir_info, node);
3121	if (dir_ino < entry->ino)
3122	p = &(*p)->rb_left;
3123	else if (dir_ino > entry->ino)
3124	p = &(*p)->rb_right;
3125	else if (dir_gen < entry->gen)
3126	p = &(*p)->rb_left;
3127	else if (dir_gen > entry->gen)
3128	p = &(*p)->rb_right;
3129	else
3130	return entry;
3131	}
3132
3133	odi = kmalloc(size: sizeof(*odi), GFP_KERNEL);
3134	if (!odi)
3135	return ERR_PTR(error: -ENOMEM);
3136	odi->ino = dir_ino;
3137	odi->gen = dir_gen;
3138	odi->last_dir_index_offset = `0`;
3139	odi->dir_high_seq_ino = `0`;
3140
3141	rb_link_node(node: &odi->node, parent, rb_link: p);
3142	rb_insert_color(&odi->node, &sctx->orphan_dirs);
3143	return odi;
3144	}
3145
3146	static struct orphan_dir_info get_orphan_dir_info(struct* send_ctx *sctx,
3147	u64 dir_ino, u64 gen)
3148	{
3149	struct rb_node *n = sctx->orphan_dirs.rb_node;
3150	struct orphan_dir_info *entry;
3151
3152	while (n) {
3153	entry = rb_entry(n, struct orphan_dir_info, node);
3154	if (dir_ino < entry->ino)
3155	n = n->rb_left;
3156	else if (dir_ino > entry->ino)
3157	n = n->rb_right;
3158	else if (gen < entry->gen)
3159	n = n->rb_left;
3160	else if (gen > entry->gen)
3161	n = n->rb_right;
3162	else
3163	return entry;
3164	}
3165	return NULL;
3166	}
3167
3168	static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
3169	{
3170	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
3171
3172	return odi != NULL;
3173	}
3174
3175	static void free_orphan_dir_info(struct send_ctx *sctx,
3176	struct orphan_dir_info *odi)
3177	{
3178	if (!odi)
3179	return;
3180	rb_erase(&odi->node, &sctx->orphan_dirs);
3181	kfree(objp: odi);
3182	}
3183
3184	/*
3185	* Returns 1 if a directory can be removed at this point in time.
3186	* We check this by iterating all dir items and checking if the inode behind
3187	* the dir item was already processed.
3188	*/
3189	static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
3190	{
3191	int ret = `0`;
3192	int iter_ret = `0`;
3193	struct btrfs_root *root = sctx->parent_root;
3194	struct btrfs_path *path;
3195	struct btrfs_key key;
3196	struct btrfs_key found_key;
3197	struct btrfs_key loc;
3198	struct btrfs_dir_item *di;
3199	struct orphan_dir_info *odi = NULL;
3200	u64 dir_high_seq_ino = `0`;
3201	u64 last_dir_index_offset = `0`;
3202
3203	/*
3204	* Don't try to rmdir the top/root subvolume dir.
3205	*/
3206	if (dir == BTRFS_FIRST_FREE_OBJECTID)
3207	return `0`;
3208
3209	odi = get_orphan_dir_info(sctx, dir_ino: dir, gen: dir_gen);
3210	if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
3211	return `0`;
3212
3213	path = alloc_path_for_send();
3214	if (!path)
3215	return -ENOMEM;
3216
3217	if (!odi) {
3218	/*
3219	* Find the inode number associated with the last dir index
3220	* entry. This is very likely the inode with the highest number
3221	* of all inodes that have an entry in the directory. We can
3222	* then use it to avoid future calls to can_rmdir(), when
3223	* processing inodes with a lower number, from having to search
3224	* the parent root b+tree for dir index keys.
3225	*/
3226	key.objectid = dir;
3227	key.type = BTRFS_DIR_INDEX_KEY;
3228	key.offset = (u64)-`1`;
3229
3230	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
3231	if (ret < `0`) {
3232	goto out;
3233	} else if (ret > `0`) {
3234	/ Can't happen, the root is never empty. /
3235	ASSERT(path->slots[`0`] > `0`);
3236	if (WARN_ON(path->slots[`0`] == `0`)) {
3237	ret = -EUCLEAN;
3238	goto out;
3239	}
3240	path->slots[`0`]--;
3241	}
3242
3243	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
3244	if (key.objectid != dir \|\| key.type != BTRFS_DIR_INDEX_KEY) {
3245	/ No index keys, dir can be removed. /
3246	ret = `1`;
3247	goto out;
3248	}
3249
3250	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3251	struct btrfs_dir_item);
3252	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3253	dir_high_seq_ino = loc.objectid;
3254	if (sctx->cur_ino < dir_high_seq_ino) {
3255	ret = `0`;
3256	goto out;
3257	}
3258
3259	btrfs_release_path(p: path);
3260	}
3261
3262	key.objectid = dir;
3263	key.type = BTRFS_DIR_INDEX_KEY;
3264	key.offset = (odi ? odi->last_dir_index_offset : `0`);
3265
3266	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
3267	struct waiting_dir_move *dm;
3268
3269	if (found_key.objectid != key.objectid \|\|
3270	found_key.type != key.type)
3271	break;
3272
3273	di = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
3274	struct btrfs_dir_item);
3275	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &loc);
3276
3277	dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
3278	last_dir_index_offset = found_key.offset;
3279
3280	dm = get_waiting_dir_move(sctx, ino: loc.objectid);
3281	if (dm) {
3282	dm->rmdir_ino = dir;
3283	dm->rmdir_gen = dir_gen;
3284	ret = `0`;
3285	goto out;
3286	}
3287
3288	if (loc.objectid > sctx->cur_ino) {
3289	ret = `0`;
3290	goto out;
3291	}
3292	}
3293	if (iter_ret < `0`) {
3294	ret = iter_ret;
3295	goto out;
3296	}
3297	free_orphan_dir_info(sctx, odi);
3298
3299	ret = `1`;
3300
3301	out:
3302	btrfs_free_path(p: path);
3303
3304	if (ret)
3305	return ret;
3306
3307	if (!odi) {
3308	odi = add_orphan_dir_info(sctx, dir_ino: dir, dir_gen);
3309	if (IS_ERR(ptr: odi))
3310	return PTR_ERR(ptr: odi);
3311
3312	odi->gen = dir_gen;
3313	}
3314
3315	odi->last_dir_index_offset = last_dir_index_offset;
3316	odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);
3317
3318	return `0`;
3319	}
3320
3321	static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3322	{
3323	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3324
3325	return entry != NULL;
3326	}
3327
3328	static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3329	{
3330	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3331	struct rb_node *parent = NULL;
3332	struct waiting_dir_move entry, dm;
3333
3334	dm = kmalloc(size: sizeof(*dm), GFP_KERNEL);
3335	if (!dm)
3336	return -ENOMEM;
3337	dm->ino = ino;
3338	dm->rmdir_ino = `0`;
3339	dm->rmdir_gen = `0`;
3340	dm->orphanized = orphanized;
3341
3342	while (*p) {
3343	parent = *p;
3344	entry = rb_entry(parent, struct waiting_dir_move, node);
3345	if (ino < entry->ino) {
3346	p = &(*p)->rb_left;
3347	} else if (ino > entry->ino) {
3348	p = &(*p)->rb_right;
3349	} else {
3350	kfree(objp: dm);
3351	return -EEXIST;
3352	}
3353	}
3354
3355	rb_link_node(node: &dm->node, parent, rb_link: p);
3356	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3357	return `0`;
3358	}
3359
3360	static struct waiting_dir_move *
3361	get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3362	{
3363	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3364	struct waiting_dir_move *entry;
3365
3366	while (n) {
3367	entry = rb_entry(n, struct waiting_dir_move, node);
3368	if (ino < entry->ino)
3369	n = n->rb_left;
3370	else if (ino > entry->ino)
3371	n = n->rb_right;
3372	else
3373	return entry;
3374	}
3375	return NULL;
3376	}
3377
3378	static void free_waiting_dir_move(struct send_ctx *sctx,
3379	struct waiting_dir_move *dm)
3380	{
3381	if (!dm)
3382	return;
3383	rb_erase(&dm->node, &sctx->waiting_dir_moves);
3384	kfree(objp: dm);
3385	}
3386
3387	static int add_pending_dir_move(struct send_ctx *sctx,
3388	u64 ino,
3389	u64 ino_gen,
3390	u64 parent_ino,
3391	struct list_head *new_refs,
3392	struct list_head *deleted_refs,
3393	const bool is_orphan)
3394	{
3395	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3396	struct rb_node *parent = NULL;
3397	struct pending_dir_move entry = NULL, pm;
3398	struct recorded_ref *cur;
3399	int exists = `0`;
3400	int ret;
3401
3402	pm = kmalloc(size: sizeof(*pm), GFP_KERNEL);
3403	if (!pm)
3404	return -ENOMEM;
3405	pm->parent_ino = parent_ino;
3406	pm->ino = ino;
3407	pm->gen = ino_gen;
3408	INIT_LIST_HEAD(list: &pm->list);
3409	INIT_LIST_HEAD(list: &pm->update_refs);
3410	RB_CLEAR_NODE(&pm->node);
3411
3412	while (*p) {
3413	parent = *p;
3414	entry = rb_entry(parent, struct pending_dir_move, node);
3415	if (parent_ino < entry->parent_ino) {
3416	p = &(*p)->rb_left;
3417	} else if (parent_ino > entry->parent_ino) {
3418	p = &(*p)->rb_right;
3419	} else {
3420	exists = `1`;
3421	break;
3422	}
3423	}
3424
3425	list_for_each_entry(cur, deleted_refs, list) {
3426	ret = dup_ref(ref: cur, list: &pm->update_refs);
3427	if (ret < `0`)
3428	goto out;
3429	}
3430	list_for_each_entry(cur, new_refs, list) {
3431	ret = dup_ref(ref: cur, list: &pm->update_refs);
3432	if (ret < `0`)
3433	goto out;
3434	}
3435
3436	ret = add_waiting_dir_move(sctx, ino: pm->ino, orphanized: is_orphan);
3437	if (ret)
3438	goto out;
3439
3440	if (exists) {
3441	list_add_tail(new: &pm->list, head: &entry->list);
3442	} else {
3443	rb_link_node(node: &pm->node, parent, rb_link: p);
3444	rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3445	}
3446	ret = `0`;
3447	out:
3448	if (ret) {
3449	__free_recorded_refs(head: &pm->update_refs);
3450	kfree(objp: pm);
3451	}
3452	return ret;
3453	}
3454
3455	static struct pending_dir_move get_pending_dir_moves(struct* send_ctx *sctx,
3456	u64 parent_ino)
3457	{
3458	struct rb_node *n = sctx->pending_dir_moves.rb_node;
3459	struct pending_dir_move *entry;
3460
3461	while (n) {
3462	entry = rb_entry(n, struct pending_dir_move, node);
3463	if (parent_ino < entry->parent_ino)
3464	n = n->rb_left;
3465	else if (parent_ino > entry->parent_ino)
3466	n = n->rb_right;
3467	else
3468	return entry;
3469	}
3470	return NULL;
3471	}
3472
3473	static int path_loop(struct send_ctx sctx, struct* fs_path *name,
3474	u64 ino, u64 gen, u64 *ancestor_ino)
3475	{
3476	int ret = `0`;
3477	u64 parent_inode = `0`;
3478	u64 parent_gen = `0`;
3479	u64 start_ino = ino;
3480
3481	*ancestor_ino = `0`;
3482	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3483	fs_path_reset(p: name);
3484
3485	if (is_waiting_for_rm(sctx, dir_ino: ino, gen))
3486	break;
3487	if (is_waiting_for_move(sctx, ino)) {
3488	if (*ancestor_ino == `0`)
3489	*ancestor_ino = ino;
3490	ret = get_first_ref(root: sctx->parent_root, ino,
3491	dir: &parent_inode, dir_gen: &parent_gen, name);
3492	} else {
3493	ret = __get_cur_name_and_parent(sctx, ino, gen,
3494	parent_ino: &parent_inode,
3495	parent_gen: &parent_gen, dest: name);
3496	if (ret > `0`) {
3497	ret = `0`;
3498	break;
3499	}
3500	}
3501	if (ret < `0`)
3502	break;
3503	if (parent_inode == start_ino) {
3504	ret = `1`;
3505	if (*ancestor_ino == `0`)
3506	*ancestor_ino = ino;
3507	break;
3508	}
3509	ino = parent_inode;
3510	gen = parent_gen;
3511	}
3512	return ret;
3513	}
3514
3515	static int apply_dir_move(struct send_ctx sctx, struct* pending_dir_move *pm)
3516	{
3517	struct fs_path *from_path = NULL;
3518	struct fs_path *to_path = NULL;
3519	struct fs_path *name = NULL;
3520	u64 orig_progress = sctx->send_progress;
3521	struct recorded_ref *cur;
3522	u64 parent_ino, parent_gen;
3523	struct waiting_dir_move *dm = NULL;
3524	u64 rmdir_ino = `0`;
3525	u64 rmdir_gen;
3526	u64 ancestor;
3527	bool is_orphan;
3528	int ret;
3529
3530	name = fs_path_alloc();
3531	from_path = fs_path_alloc();
3532	if (!name \|\| !from_path) {
3533	ret = -ENOMEM;
3534	goto out;
3535	}
3536
3537	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3538	ASSERT(dm);
3539	rmdir_ino = dm->rmdir_ino;
3540	rmdir_gen = dm->rmdir_gen;
3541	is_orphan = dm->orphanized;
3542	free_waiting_dir_move(sctx, dm);
3543
3544	if (is_orphan) {
3545	ret = gen_unique_name(sctx, ino: pm->ino,
3546	gen: pm->gen, dest: from_path);
3547	} else {
3548	ret = get_first_ref(root: sctx->parent_root, ino: pm->ino,
3549	dir: &parent_ino, dir_gen: &parent_gen, name);
3550	if (ret < `0`)
3551	goto out;
3552	ret = get_cur_path(sctx, ino: parent_ino, gen: parent_gen,
3553	dest: from_path);
3554	if (ret < `0`)
3555	goto out;
3556	ret = fs_path_add_path(p: from_path, p2: name);
3557	}
3558	if (ret < `0`)
3559	goto out;
3560
3561	sctx->send_progress = sctx->cur_ino + `1`;
3562	ret = path_loop(sctx, name, ino: pm->ino, gen: pm->gen, ancestor_ino: &ancestor);
3563	if (ret < `0`)
3564	goto out;
3565	if (ret) {
3566	LIST_HEAD(deleted_refs);
3567	ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3568	ret = add_pending_dir_move(sctx, ino: pm->ino, ino_gen: pm->gen, parent_ino: ancestor,
3569	new_refs: &pm->update_refs, deleted_refs: &deleted_refs,
3570	is_orphan);
3571	if (ret < `0`)
3572	goto out;
3573	if (rmdir_ino) {
3574	dm = get_waiting_dir_move(sctx, ino: pm->ino);
3575	ASSERT(dm);
3576	dm->rmdir_ino = rmdir_ino;
3577	dm->rmdir_gen = rmdir_gen;
3578	}
3579	goto out;
3580	}
3581	fs_path_reset(p: name);
3582	to_path = name;
3583	name = NULL;
3584	ret = get_cur_path(sctx, ino: pm->ino, gen: pm->gen, dest: to_path);
3585	if (ret < `0`)
3586	goto out;
3587
3588	ret = send_rename(sctx, from: from_path, to: to_path);
3589	if (ret < `0`)
3590	goto out;
3591
3592	if (rmdir_ino) {
3593	struct orphan_dir_info *odi;
3594	u64 gen;
3595
3596	odi = get_orphan_dir_info(sctx, dir_ino: rmdir_ino, gen: rmdir_gen);
3597	if (!odi) {
3598	/ already deleted /
3599	goto finish;
3600	}
3601	gen = odi->gen;
3602
3603	ret = can_rmdir(sctx, dir: rmdir_ino, dir_gen: gen);
3604	if (ret < `0`)
3605	goto out;
3606	if (!ret)
3607	goto finish;
3608
3609	name = fs_path_alloc();
3610	if (!name) {
3611	ret = -ENOMEM;
3612	goto out;
3613	}
3614	ret = get_cur_path(sctx, ino: rmdir_ino, gen, dest: name);
3615	if (ret < `0`)
3616	goto out;
3617	ret = send_rmdir(sctx, path: name);
3618	if (ret < `0`)
3619	goto out;
3620	}
3621
3622	finish:
3623	ret = cache_dir_utimes(sctx, dir: pm->ino, gen: pm->gen);
3624	if (ret < `0`)
3625	goto out;
3626
3627	/*
3628	* After rename/move, need to update the utimes of both new parent(s)
3629	* and old parent(s).
3630	*/
3631	list_for_each_entry(cur, &pm->update_refs, list) {
3632	/*
3633	* The parent inode might have been deleted in the send snapshot
3634	*/
3635	ret = get_inode_info(root: sctx->send_root, ino: cur->dir, NULL);
3636	if (ret == -ENOENT) {
3637	ret = `0`;
3638	continue;
3639	}
3640	if (ret < `0`)
3641	goto out;
3642
3643	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
3644	if (ret < `0`)
3645	goto out;
3646	}
3647
3648	out:
3649	fs_path_free(p: name);
3650	fs_path_free(p: from_path);
3651	fs_path_free(p: to_path);
3652	sctx->send_progress = orig_progress;
3653
3654	return ret;
3655	}
3656
3657	static void free_pending_move(struct send_ctx sctx, struct* pending_dir_move *m)
3658	{
3659	if (!list_empty(head: &m->list))
3660	list_del(entry: &m->list);
3661	if (!RB_EMPTY_NODE(&m->node))
3662	rb_erase(&m->node, &sctx->pending_dir_moves);
3663	__free_recorded_refs(head: &m->update_refs);
3664	kfree(objp: m);
3665	}
3666
3667	static void tail_append_pending_moves(struct send_ctx *sctx,
3668	struct pending_dir_move *moves,
3669	struct list_head *stack)
3670	{
3671	if (list_empty(head: &moves->list)) {
3672	list_add_tail(new: &moves->list, head: stack);
3673	} else {
3674	LIST_HEAD(list);
3675	list_splice_init(list: &moves->list, head: &list);
3676	list_add_tail(new: &moves->list, head: stack);
3677	list_splice_tail(list: &list, head: stack);
3678	}
3679	if (!RB_EMPTY_NODE(&moves->node)) {
3680	rb_erase(&moves->node, &sctx->pending_dir_moves);
3681	RB_CLEAR_NODE(&moves->node);
3682	}
3683	}
3684
3685	static int apply_children_dir_moves(struct send_ctx *sctx)
3686	{
3687	struct pending_dir_move *pm;
3688	LIST_HEAD(stack);
3689	u64 parent_ino = sctx->cur_ino;
3690	int ret = `0`;
3691
3692	pm = get_pending_dir_moves(sctx, parent_ino);
3693	if (!pm)
3694	return `0`;
3695
3696	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3697
3698	while (!list_empty(head: &stack)) {
3699	pm = list_first_entry(&stack, struct pending_dir_move, list);
3700	parent_ino = pm->ino;
3701	ret = apply_dir_move(sctx, pm);
3702	free_pending_move(sctx, m: pm);
3703	if (ret)
3704	goto out;
3705	pm = get_pending_dir_moves(sctx, parent_ino);
3706	if (pm)
3707	tail_append_pending_moves(sctx, moves: pm, stack: &stack);
3708	}
3709	return `0`;
3710
3711	out:
3712	while (!list_empty(head: &stack)) {
3713	pm = list_first_entry(&stack, struct pending_dir_move, list);
3714	free_pending_move(sctx, m: pm);
3715	}
3716	return ret;
3717	}
3718
3719	/*
3720	* We might need to delay a directory rename even when no ancestor directory
3721	* (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3722	* renamed. This happens when we rename a directory to the old name (the name
3723	* in the parent root) of some other unrelated directory that got its rename
3724	* delayed due to some ancestor with higher number that got renamed.
3725	*
3726	* Example:
3727	*
3728	* Parent snapshot:
3729	* . (ino 256)
3730	* \|---- a/ (ino 257)
3731	* \| \|---- file (ino 260)
3732	* \|
3733	* \|---- b/ (ino 258)
3734	* \|---- c/ (ino 259)
3735	*
3736	* Send snapshot:
3737	* . (ino 256)
3738	* \|---- a/ (ino 258)
3739	* \|---- x/ (ino 259)
3740	* \|---- y/ (ino 257)
3741	* \|----- file (ino 260)
3742	*
3743	* Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3744	* from 'a' to 'x/y' happening first, which in turn depends on the rename of
3745	* inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3746	* must issue is:
3747	*
3748	* 1 - rename 259 from 'c' to 'x'
3749	* 2 - rename 257 from 'a' to 'x/y'
3750	* 3 - rename 258 from 'b' to 'a'
3751	*
3752	* Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3753	* be done right away and < 0 on error.
3754	*/
3755	static int wait_for_dest_dir_move(struct send_ctx *sctx,
3756	struct recorded_ref *parent_ref,
3757	const bool is_orphan)
3758	{
3759	struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3760	struct btrfs_path *path;
3761	struct btrfs_key key;
3762	struct btrfs_key di_key;
3763	struct btrfs_dir_item *di;
3764	u64 left_gen;
3765	u64 right_gen;
3766	int ret = `0`;
3767	struct waiting_dir_move *wdm;
3768
3769	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3770	return `0`;
3771
3772	path = alloc_path_for_send();
3773	if (!path)
3774	return -ENOMEM;
3775
3776	key.objectid = parent_ref->dir;
3777	key.type = BTRFS_DIR_ITEM_KEY;
3778	key.offset = btrfs_name_hash(name: parent_ref->name, len: parent_ref->name_len);
3779
3780	ret = btrfs_search_slot(NULL, root: sctx->parent_root, key: &key, p: path, ins_len: `0`, cow: `0`);
3781	if (ret < `0`) {
3782	goto out;
3783	} else if (ret > `0`) {
3784	ret = `0`;
3785	goto out;
3786	}
3787
3788	di = btrfs_match_dir_item_name(fs_info, path, name: parent_ref->name,
3789	name_len: parent_ref->name_len);
3790	if (!di) {
3791	ret = `0`;
3792	goto out;
3793	}
3794	/*
3795	* di_key.objectid has the number of the inode that has a dentry in the
3796	* parent directory with the same name that sctx->cur_ino is being
3797	* renamed to. We need to check if that inode is in the send root as
3798	* well and if it is currently marked as an inode with a pending rename,
3799	* if it is, we need to delay the rename of sctx->cur_ino as well, so
3800	* that it happens after that other inode is renamed.
3801	*/
3802	btrfs_dir_item_key_to_cpu(eb: path->nodes[`0`], item: di, cpu_key: &di_key);
3803	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3804	ret = `0`;
3805	goto out;
3806	}
3807
3808	ret = get_inode_gen(root: sctx->parent_root, ino: di_key.objectid, gen: &left_gen);
3809	if (ret < `0`)
3810	goto out;
3811	ret = get_inode_gen(root: sctx->send_root, ino: di_key.objectid, gen: &right_gen);
3812	if (ret < `0`) {
3813	if (ret == -ENOENT)
3814	ret = `0`;
3815	goto out;
3816	}
3817
3818	/ Different inode, no need to delay the rename of sctx->cur_ino /
3819	if (right_gen != left_gen) {
3820	ret = `0`;
3821	goto out;
3822	}
3823
3824	wdm = get_waiting_dir_move(sctx, ino: di_key.objectid);
3825	if (wdm && !wdm->orphanized) {
3826	ret = add_pending_dir_move(sctx,
3827	ino: sctx->cur_ino,
3828	ino_gen: sctx->cur_inode_gen,
3829	parent_ino: di_key.objectid,
3830	new_refs: &sctx->new_refs,
3831	deleted_refs: &sctx->deleted_refs,
3832	is_orphan);
3833	if (!ret)
3834	ret = `1`;
3835	}
3836	out:
3837	btrfs_free_path(p: path);
3838	return ret;
3839	}
3840
3841	/*
3842	* Check if inode ino2, or any of its ancestors, is inode ino1.
3843	* Return 1 if true, 0 if false and < 0 on error.
3844	*/
3845	static int check_ino_in_path(struct btrfs_root *root,
3846	const u64 ino1,
3847	const u64 ino1_gen,
3848	const u64 ino2,
3849	const u64 ino2_gen,
3850	struct fs_path *fs_path)
3851	{
3852	u64 ino = ino2;
3853
3854	if (ino1 == ino2)
3855	return ino1_gen == ino2_gen;
3856
3857	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3858	u64 parent;
3859	u64 parent_gen;
3860	int ret;
3861
3862	fs_path_reset(p: fs_path);
3863	ret = get_first_ref(root, ino, dir: &parent, dir_gen: &parent_gen, name: fs_path);
3864	if (ret < `0`)
3865	return ret;
3866	if (parent == ino1)
3867	return parent_gen == ino1_gen;
3868	ino = parent;
3869	}
3870	return `0`;
3871	}
3872
3873	/*
3874	* Check if inode ino1 is an ancestor of inode ino2 in the given root for any
3875	* possible path (in case ino2 is not a directory and has multiple hard links).
3876	* Return 1 if true, 0 if false and < 0 on error.
3877	*/
3878	static int is_ancestor(struct btrfs_root *root,
3879	const u64 ino1,
3880	const u64 ino1_gen,
3881	const u64 ino2,
3882	struct fs_path *fs_path)
3883	{
3884	bool free_fs_path = false;
3885	int ret = `0`;
3886	int iter_ret = `0`;
3887	struct btrfs_path *path = NULL;
3888	struct btrfs_key key;
3889
3890	if (!fs_path) {
3891	fs_path = fs_path_alloc();
3892	if (!fs_path)
3893	return -ENOMEM;
3894	free_fs_path = true;
3895	}
3896
3897	path = alloc_path_for_send();
3898	if (!path) {
3899	ret = -ENOMEM;
3900	goto out;
3901	}
3902
3903	key.objectid = ino2;
3904	key.type = BTRFS_INODE_REF_KEY;
3905	key.offset = `0`;
3906
3907	btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
3908	struct extent_buffer *leaf = path->nodes[`0`];
3909	int slot = path->slots[`0`];
3910	u32 cur_offset = `0`;
3911	u32 item_size;
3912
3913	if (key.objectid != ino2)
3914	break;
3915	if (key.type != BTRFS_INODE_REF_KEY &&
3916	key.type != BTRFS_INODE_EXTREF_KEY)
3917	break;
3918
3919	item_size = btrfs_item_size(eb: leaf, slot);
3920	while (cur_offset < item_size) {
3921	u64 parent;
3922	u64 parent_gen;
3923
3924	if (key.type == BTRFS_INODE_EXTREF_KEY) {
3925	unsigned long ptr;
3926	struct btrfs_inode_extref *extref;
3927
3928	ptr = btrfs_item_ptr_offset(leaf, slot);
3929	extref = (struct btrfs_inode_extref *)
3930	(ptr + cur_offset);
3931	parent = btrfs_inode_extref_parent(eb: leaf,
3932	s: extref);
3933	cur_offset += sizeof(*extref);
3934	cur_offset += btrfs_inode_extref_name_len(eb: leaf,
3935	s: extref);
3936	} else {
3937	parent = key.offset;
3938	cur_offset = item_size;
3939	}
3940
3941	ret = get_inode_gen(root, ino: parent, gen: &parent_gen);
3942	if (ret < `0`)
3943	goto out;
3944	ret = check_ino_in_path(root, ino1, ino1_gen,
3945	ino2: parent, ino2_gen: parent_gen, fs_path);
3946	if (ret)
3947	goto out;
3948	}
3949	}
3950	ret = `0`;
3951	if (iter_ret < `0`)
3952	ret = iter_ret;
3953
3954	out:
3955	btrfs_free_path(p: path);
3956	if (free_fs_path)
3957	fs_path_free(p: fs_path);
3958	return ret;
3959	}
3960
3961	static int wait_for_parent_move(struct send_ctx *sctx,
3962	struct recorded_ref *parent_ref,
3963	const bool is_orphan)
3964	{
3965	int ret = `0`;
3966	u64 ino = parent_ref->dir;
3967	u64 ino_gen = parent_ref->dir_gen;
3968	u64 parent_ino_before, parent_ino_after;
3969	struct fs_path *path_before = NULL;
3970	struct fs_path *path_after = NULL;
3971	int len1, len2;
3972
3973	path_after = fs_path_alloc();
3974	path_before = fs_path_alloc();
3975	if (!path_after \|\| !path_before) {
3976	ret = -ENOMEM;
3977	goto out;
3978	}
3979
3980	/*
3981	* Our current directory inode may not yet be renamed/moved because some
3982	* ancestor (immediate or not) has to be renamed/moved first. So find if
3983	* such ancestor exists and make sure our own rename/move happens after
3984	* that ancestor is processed to avoid path build infinite loops (done
3985	* at get_cur_path()).
3986	*/
3987	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3988	u64 parent_ino_after_gen;
3989
3990	if (is_waiting_for_move(sctx, ino)) {
3991	/*
3992	* If the current inode is an ancestor of ino in the
3993	* parent root, we need to delay the rename of the
3994	* current inode, otherwise don't delayed the rename
3995	* because we can end up with a circular dependency
3996	* of renames, resulting in some directories never
3997	* getting the respective rename operations issued in
3998	* the send stream or getting into infinite path build
3999	* loops.
4000	*/
4001	ret = is_ancestor(root: sctx->parent_root,
4002	ino1: sctx->cur_ino, ino1_gen: sctx->cur_inode_gen,
4003	ino2: ino, fs_path: path_before);
4004	if (ret)
4005	break;
4006	}
4007
4008	fs_path_reset(p: path_before);
4009	fs_path_reset(p: path_after);
4010
4011	ret = get_first_ref(root: sctx->send_root, ino, dir: &parent_ino_after,
4012	dir_gen: &parent_ino_after_gen, name: path_after);
4013	if (ret < `0`)
4014	goto out;
4015	ret = get_first_ref(root: sctx->parent_root, ino, dir: &parent_ino_before,
4016	NULL, name: path_before);
4017	if (ret < `0` && ret != -ENOENT) {
4018	goto out;
4019	} else if (ret == -ENOENT) {
4020	ret = `0`;
4021	break;
4022	}
4023
4024	len1 = fs_path_len(p: path_before);
4025	len2 = fs_path_len(p: path_after);
4026	if (ino > sctx->cur_ino &&
4027	(parent_ino_before != parent_ino_after \|\| len1 != len2 \|\|
4028	memcmp(p: path_before->start, q: path_after->start, size: len1))) {
4029	u64 parent_ino_gen;
4030
4031	ret = get_inode_gen(root: sctx->parent_root, ino, gen: &parent_ino_gen);
4032	if (ret < `0`)
4033	goto out;
4034	if (ino_gen == parent_ino_gen) {
4035	ret = `1`;
4036	break;
4037	}
4038	}
4039	ino = parent_ino_after;
4040	ino_gen = parent_ino_after_gen;
4041	}
4042
4043	out:
4044	fs_path_free(p: path_before);
4045	fs_path_free(p: path_after);
4046
4047	if (ret == `1`) {
4048	ret = add_pending_dir_move(sctx,
4049	ino: sctx->cur_ino,
4050	ino_gen: sctx->cur_inode_gen,
4051	parent_ino: ino,
4052	new_refs: &sctx->new_refs,
4053	deleted_refs: &sctx->deleted_refs,
4054	is_orphan);
4055	if (!ret)
4056	ret = `1`;
4057	}
4058
4059	return ret;
4060	}
4061
4062	static int update_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4063	{
4064	int ret;
4065	struct fs_path *new_path;
4066
4067	/*
4068	* Our reference's name member points to its full_path member string, so
4069	* we use here a new path.
4070	*/
4071	new_path = fs_path_alloc();
4072	if (!new_path)
4073	return -ENOMEM;
4074
4075	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: new_path);
4076	if (ret < `0`) {
4077	fs_path_free(p: new_path);
4078	return ret;
4079	}
4080	ret = fs_path_add(p: new_path, name: ref->name, name_len: ref->name_len);
4081	if (ret < `0`) {
4082	fs_path_free(p: new_path);
4083	return ret;
4084	}
4085
4086	fs_path_free(p: ref->full_path);
4087	set_ref_path(ref, path: new_path);
4088
4089	return `0`;
4090	}
4091
4092	/*
4093	* When processing the new references for an inode we may orphanize an existing
4094	* directory inode because its old name conflicts with one of the new references
4095	* of the current inode. Later, when processing another new reference of our
4096	* inode, we might need to orphanize another inode, but the path we have in the
4097	* reference reflects the pre-orphanization name of the directory we previously
4098	* orphanized. For example:
4099	*
4100	* parent snapshot looks like:
4101	*
4102	* . (ino 256)
4103	* \|----- f1 (ino 257)
4104	* \|----- f2 (ino 258)
4105	* \|----- d1/ (ino 259)
4106	* \|----- d2/ (ino 260)
4107	*
4108	* send snapshot looks like:
4109	*
4110	* . (ino 256)
4111	* \|----- d1 (ino 258)
4112	* \|----- f2/ (ino 259)
4113	* \|----- f2_link/ (ino 260)
4114	* \| \|----- f1 (ino 257)
4115	* \|
4116	* \|----- d2 (ino 258)
4117	*
4118	* When processing inode 257 we compute the name for inode 259 as "d1", and we
4119	* cache it in the name cache. Later when we start processing inode 258, when
4120	* collecting all its new references we set a full path of "d1/d2" for its new
4121	* reference with name "d2". When we start processing the new references we
4122	* start by processing the new reference with name "d1", and this results in
4123	* orphanizing inode 259, since its old reference causes a conflict. Then we
4124	* move on the next new reference, with name "d2", and we find out we must
4125	* orphanize inode 260, as its old reference conflicts with ours - but for the
4126	* orphanization we use a source path corresponding to the path we stored in the
4127	* new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
4128	* receiver fail since the path component "d1/" no longer exists, it was renamed
4129	* to "o259-6-0/" when processing the previous new reference. So in this case we
4130	* must recompute the path in the new reference and use it for the new
4131	* orphanization operation.
4132	*/
4133	static int refresh_ref_path(struct send_ctx sctx, struct* recorded_ref *ref)
4134	{
4135	char *name;
4136	int ret;
4137
4138	name = kmemdup(p: ref->name, size: ref->name_len, GFP_KERNEL);
4139	if (!name)
4140	return -ENOMEM;
4141
4142	fs_path_reset(p: ref->full_path);
4143	ret = get_cur_path(sctx, ino: ref->dir, gen: ref->dir_gen, dest: ref->full_path);
4144	if (ret < `0`)
4145	goto out;
4146
4147	ret = fs_path_add(p: ref->full_path, name, name_len: ref->name_len);
4148	if (ret < `0`)
4149	goto out;
4150
4151	/ Update the reference's base name pointer. /
4152	set_ref_path(ref, path: ref->full_path);
4153	out:
4154	kfree(objp: name);
4155	return ret;
4156	}
4157
4158	/*
4159	* This does all the move/link/unlink/rmdir magic.
4160	*/
4161	static int process_recorded_refs(struct send_ctx sctx, int* *pending_move)
4162	{
4163	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4164	int ret = `0`;
4165	struct recorded_ref *cur;
4166	struct recorded_ref *cur2;
4167	LIST_HEAD(check_dirs);
4168	struct fs_path *valid_path = NULL;
4169	u64 ow_inode = `0`;
4170	u64 ow_gen;
4171	u64 ow_mode;
4172	int did_overwrite = `0`;
4173	int is_orphan = `0`;
4174	u64 last_dir_ino_rm = `0`;
4175	bool can_rename = true;
4176	bool orphanized_dir = false;
4177	bool orphanized_ancestor = false;
4178
4179	btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
4180
4181	/*
4182	* This should never happen as the root dir always has the same ref
4183	* which is always '..'
4184	*/
4185	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
4186
4187	valid_path = fs_path_alloc();
4188	if (!valid_path) {
4189	ret = -ENOMEM;
4190	goto out;
4191	}
4192
4193	/*
4194	* First, check if the first ref of the current inode was overwritten
4195	* before. If yes, we know that the current inode was already orphanized
4196	* and thus use the orphan name. If not, we can use get_cur_path to
4197	* get the path of the first ref as it would like while receiving at
4198	* this point in time.
4199	* New inodes are always orphan at the beginning, so force to use the
4200	* orphan name in this case.
4201	* The first ref is stored in valid_path and will be updated if it
4202	* gets moved around.
4203	*/
4204	if (!sctx->cur_inode_new) {
4205	ret = did_overwrite_first_ref(sctx, ino: sctx->cur_ino,
4206	gen: sctx->cur_inode_gen);
4207	if (ret < `0`)
4208	goto out;
4209	if (ret)
4210	did_overwrite = `1`;
4211	}
4212	if (sctx->cur_inode_new \|\| did_overwrite) {
4213	ret = gen_unique_name(sctx, ino: sctx->cur_ino,
4214	gen: sctx->cur_inode_gen, dest: valid_path);
4215	if (ret < `0`)
4216	goto out;
4217	is_orphan = `1`;
4218	} else {
4219	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
4220	dest: valid_path);
4221	if (ret < `0`)
4222	goto out;
4223	}
4224
4225	/*
4226	* Before doing any rename and link operations, do a first pass on the
4227	* new references to orphanize any unprocessed inodes that may have a
4228	* reference that conflicts with one of the new references of the current
4229	* inode. This needs to happen first because a new reference may conflict
4230	* with the old reference of a parent directory, so we must make sure
4231	* that the path used for link and rename commands don't use an
4232	* orphanized name when an ancestor was not yet orphanized.
4233	*
4234	* Example:
4235	*
4236	* Parent snapshot:
4237	*
4238	* . (ino 256)
4239	* \|----- testdir/ (ino 259)
4240	* \| \|----- a (ino 257)
4241	* \|
4242	* \|----- b (ino 258)
4243	*
4244	* Send snapshot:
4245	*
4246	* . (ino 256)
4247	* \|----- testdir_2/ (ino 259)
4248	* \| \|----- a (ino 260)
4249	* \|
4250	* \|----- testdir (ino 257)
4251	* \|----- b (ino 257)
4252	* \|----- b2 (ino 258)
4253	*
4254	* Processing the new reference for inode 257 with name "b" may happen
4255	* before processing the new reference with name "testdir". If so, we
4256	* must make sure that by the time we send a link command to create the
4257	* hard link "b", inode 259 was already orphanized, since the generated
4258	* path in "valid_path" already contains the orphanized name for 259.
4259	* We are processing inode 257, so only later when processing 259 we do
4260	* the rename operation to change its temporary (orphanized) name to
4261	* "testdir_2".
4262	*/
4263	list_for_each_entry(cur, &sctx->new_refs, list) {
4264	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4265	if (ret < `0`)
4266	goto out;
4267	if (ret == inode_state_will_create)
4268	continue;
4269
4270	/*
4271	* Check if this new ref would overwrite the first ref of another
4272	* unprocessed inode. If yes, orphanize the overwritten inode.
4273	* If we find an overwritten ref that is not the first ref,
4274	* simply unlink it.
4275	*/
4276	ret = will_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4277	name: cur->name, name_len: cur->name_len,
4278	who_ino: &ow_inode, who_gen: &ow_gen, who_mode: &ow_mode);
4279	if (ret < `0`)
4280	goto out;
4281	if (ret) {
4282	ret = is_first_ref(root: sctx->parent_root,
4283	ino: ow_inode, dir: cur->dir, name: cur->name,
4284	name_len: cur->name_len);
4285	if (ret < `0`)
4286	goto out;
4287	if (ret) {
4288	struct name_cache_entry *nce;
4289	struct waiting_dir_move *wdm;
4290
4291	if (orphanized_dir) {
4292	ret = refresh_ref_path(sctx, ref: cur);
4293	if (ret < `0`)
4294	goto out;
4295	}
4296
4297	ret = orphanize_inode(sctx, ino: ow_inode, gen: ow_gen,
4298	path: cur->full_path);
4299	if (ret < `0`)
4300	goto out;
4301	if (S_ISDIR(ow_mode))
4302	orphanized_dir = true;
4303
4304	/*
4305	* If ow_inode has its rename operation delayed
4306	* make sure that its orphanized name is used in
4307	* the source path when performing its rename
4308	* operation.
4309	*/
4310	wdm = get_waiting_dir_move(sctx, ino: ow_inode);
4311	if (wdm)
4312	wdm->orphanized = true;
4313
4314	/*
4315	* Make sure we clear our orphanized inode's
4316	* name from the name cache. This is because the
4317	* inode ow_inode might be an ancestor of some
4318	* other inode that will be orphanized as well
4319	* later and has an inode number greater than
4320	* sctx->send_progress. We need to prevent
4321	* future name lookups from using the old name
4322	* and get instead the orphan name.
4323	*/
4324	nce = name_cache_search(sctx, ino: ow_inode, gen: ow_gen);
4325	if (nce)
4326	btrfs_lru_cache_remove(cache: &sctx->name_cache,
4327	entry: &nce->entry);
4328
4329	/*
4330	* ow_inode might currently be an ancestor of
4331	* cur_ino, therefore compute valid_path (the
4332	* current path of cur_ino) again because it
4333	* might contain the pre-orphanization name of
4334	* ow_inode, which is no longer valid.
4335	*/
4336	ret = is_ancestor(root: sctx->parent_root,
4337	ino1: ow_inode, ino1_gen: ow_gen,
4338	ino2: sctx->cur_ino, NULL);
4339	if (ret > `0`) {
4340	orphanized_ancestor = true;
4341	fs_path_reset(p: valid_path);
4342	ret = get_cur_path(sctx, ino: sctx->cur_ino,
4343	gen: sctx->cur_inode_gen,
4344	dest: valid_path);
4345	}
4346	if (ret < `0`)
4347	goto out;
4348	} else {
4349	/*
4350	* If we previously orphanized a directory that
4351	* collided with a new reference that we already
4352	* processed, recompute the current path because
4353	* that directory may be part of the path.
4354	*/
4355	if (orphanized_dir) {
4356	ret = refresh_ref_path(sctx, ref: cur);
4357	if (ret < `0`)
4358	goto out;
4359	}
4360	ret = send_unlink(sctx, path: cur->full_path);
4361	if (ret < `0`)
4362	goto out;
4363	}
4364	}
4365
4366	}
4367
4368	list_for_each_entry(cur, &sctx->new_refs, list) {
4369	/*
4370	* We may have refs where the parent directory does not exist
4371	* yet. This happens if the parent directories inum is higher
4372	* than the current inum. To handle this case, we create the
4373	* parent directory out of order. But we need to check if this
4374	* did already happen before due to other refs in the same dir.
4375	*/
4376	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4377	if (ret < `0`)
4378	goto out;
4379	if (ret == inode_state_will_create) {
4380	ret = `0`;
4381	/*
4382	* First check if any of the current inodes refs did
4383	* already create the dir.
4384	*/
4385	list_for_each_entry(cur2, &sctx->new_refs, list) {
4386	if (cur == cur2)
4387	break;
4388	if (cur2->dir == cur->dir) {
4389	ret = `1`;
4390	break;
4391	}
4392	}
4393
4394	/*
4395	* If that did not happen, check if a previous inode
4396	* did already create the dir.
4397	*/
4398	if (!ret)
4399	ret = did_create_dir(sctx, dir: cur->dir);
4400	if (ret < `0`)
4401	goto out;
4402	if (!ret) {
4403	ret = send_create_inode(sctx, ino: cur->dir);
4404	if (ret < `0`)
4405	goto out;
4406	cache_dir_created(sctx, dir: cur->dir);
4407	}
4408	}
4409
4410	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4411	ret = wait_for_dest_dir_move(sctx, parent_ref: cur, is_orphan);
4412	if (ret < `0`)
4413	goto out;
4414	if (ret == `1`) {
4415	can_rename = false;
4416	*pending_move = `1`;
4417	}
4418	}
4419
4420	if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4421	can_rename) {
4422	ret = wait_for_parent_move(sctx, parent_ref: cur, is_orphan);
4423	if (ret < `0`)
4424	goto out;
4425	if (ret == `1`) {
4426	can_rename = false;
4427	*pending_move = `1`;
4428	}
4429	}
4430
4431	/*
4432	* link/move the ref to the new place. If we have an orphan
4433	* inode, move it and update valid_path. If not, link or move
4434	* it depending on the inode mode.
4435	*/
4436	if (is_orphan && can_rename) {
4437	ret = send_rename(sctx, from: valid_path, to: cur->full_path);
4438	if (ret < `0`)
4439	goto out;
4440	is_orphan = `0`;
4441	ret = fs_path_copy(p: valid_path, from: cur->full_path);
4442	if (ret < `0`)
4443	goto out;
4444	} else if (can_rename) {
4445	if (S_ISDIR(sctx->cur_inode_mode)) {
4446	/*
4447	* Dirs can't be linked, so move it. For moved
4448	* dirs, we always have one new and one deleted
4449	* ref. The deleted ref is ignored later.
4450	*/
4451	ret = send_rename(sctx, from: valid_path,
4452	to: cur->full_path);
4453	if (!ret)
4454	ret = fs_path_copy(p: valid_path,
4455	from: cur->full_path);
4456	if (ret < `0`)
4457	goto out;
4458	} else {
4459	/*
4460	* We might have previously orphanized an inode
4461	* which is an ancestor of our current inode,
4462	* so our reference's full path, which was
4463	* computed before any such orphanizations, must
4464	* be updated.
4465	*/
4466	if (orphanized_dir) {
4467	ret = update_ref_path(sctx, ref: cur);
4468	if (ret < `0`)
4469	goto out;
4470	}
4471	ret = send_link(sctx, path: cur->full_path,
4472	lnk: valid_path);
4473	if (ret < `0`)
4474	goto out;
4475	}
4476	}
4477	ret = dup_ref(ref: cur, list: &check_dirs);
4478	if (ret < `0`)
4479	goto out;
4480	}
4481
4482	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4483	/*
4484	* Check if we can already rmdir the directory. If not,
4485	* orphanize it. For every dir item inside that gets deleted
4486	* later, we do this check again and rmdir it then if possible.
4487	* See the use of check_dirs for more details.
4488	*/
4489	ret = can_rmdir(sctx, dir: sctx->cur_ino, dir_gen: sctx->cur_inode_gen);
4490	if (ret < `0`)
4491	goto out;
4492	if (ret) {
4493	ret = send_rmdir(sctx, path: valid_path);
4494	if (ret < `0`)
4495	goto out;
4496	} else if (!is_orphan) {
4497	ret = orphanize_inode(sctx, ino: sctx->cur_ino,
4498	gen: sctx->cur_inode_gen, path: valid_path);
4499	if (ret < `0`)
4500	goto out;
4501	is_orphan = `1`;
4502	}
4503
4504	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4505	ret = dup_ref(ref: cur, list: &check_dirs);
4506	if (ret < `0`)
4507	goto out;
4508	}
4509	} else if (S_ISDIR(sctx->cur_inode_mode) &&
4510	!list_empty(head: &sctx->deleted_refs)) {
4511	/*
4512	* We have a moved dir. Add the old parent to check_dirs
4513	*/
4514	cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4515	list);
4516	ret = dup_ref(ref: cur, list: &check_dirs);
4517	if (ret < `0`)
4518	goto out;
4519	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
4520	/*
4521	* We have a non dir inode. Go through all deleted refs and
4522	* unlink them if they were not already overwritten by other
4523	* inodes.
4524	*/
4525	list_for_each_entry(cur, &sctx->deleted_refs, list) {
4526	ret = did_overwrite_ref(sctx, dir: cur->dir, dir_gen: cur->dir_gen,
4527	ino: sctx->cur_ino, ino_gen: sctx->cur_inode_gen,
4528	name: cur->name, name_len: cur->name_len);
4529	if (ret < `0`)
4530	goto out;
4531	if (!ret) {
4532	/*
4533	* If we orphanized any ancestor before, we need
4534	* to recompute the full path for deleted names,
4535	* since any such path was computed before we
4536	* processed any references and orphanized any
4537	* ancestor inode.
4538	*/
4539	if (orphanized_ancestor) {
4540	ret = update_ref_path(sctx, ref: cur);
4541	if (ret < `0`)
4542	goto out;
4543	}
4544	ret = send_unlink(sctx, path: cur->full_path);
4545	if (ret < `0`)
4546	goto out;
4547	}
4548	ret = dup_ref(ref: cur, list: &check_dirs);
4549	if (ret < `0`)
4550	goto out;
4551	}
4552	/*
4553	* If the inode is still orphan, unlink the orphan. This may
4554	* happen when a previous inode did overwrite the first ref
4555	* of this inode and no new refs were added for the current
4556	* inode. Unlinking does not mean that the inode is deleted in
4557	* all cases. There may still be links to this inode in other
4558	* places.
4559	*/
4560	if (is_orphan) {
4561	ret = send_unlink(sctx, path: valid_path);
4562	if (ret < `0`)
4563	goto out;
4564	}
4565	}
4566
4567	/*
4568	* We did collect all parent dirs where cur_inode was once located. We
4569	* now go through all these dirs and check if they are pending for
4570	* deletion and if it's finally possible to perform the rmdir now.
4571	* We also update the inode stats of the parent dirs here.
4572	*/
4573	list_for_each_entry(cur, &check_dirs, list) {
4574	/*
4575	* In case we had refs into dirs that were not processed yet,
4576	* we don't need to do the utime and rmdir logic for these dirs.
4577	* The dir will be processed later.
4578	*/
4579	if (cur->dir > sctx->cur_ino)
4580	continue;
4581
4582	ret = get_cur_inode_state(sctx, ino: cur->dir, gen: cur->dir_gen, NULL, NULL);
4583	if (ret < `0`)
4584	goto out;
4585
4586	if (ret == inode_state_did_create \|\|
4587	ret == inode_state_no_change) {
4588	ret = cache_dir_utimes(sctx, dir: cur->dir, gen: cur->dir_gen);
4589	if (ret < `0`)
4590	goto out;
4591	} else if (ret == inode_state_did_delete &&
4592	cur->dir != last_dir_ino_rm) {
4593	ret = can_rmdir(sctx, dir: cur->dir, dir_gen: cur->dir_gen);
4594	if (ret < `0`)
4595	goto out;
4596	if (ret) {
4597	ret = get_cur_path(sctx, ino: cur->dir,
4598	gen: cur->dir_gen, dest: valid_path);
4599	if (ret < `0`)
4600	goto out;
4601	ret = send_rmdir(sctx, path: valid_path);
4602	if (ret < `0`)
4603	goto out;
4604	last_dir_ino_rm = cur->dir;
4605	}
4606	}
4607	}
4608
4609	ret = `0`;
4610
4611	out:
4612	__free_recorded_refs(head: &check_dirs);
4613	free_recorded_refs(sctx);
4614	fs_path_free(p: valid_path);
4615	return ret;
4616	}
4617
4618	static int rbtree_ref_comp(const void k, const* struct rb_node *node)
4619	{
4620	const struct recorded_ref *data = k;
4621	const struct recorded_ref ref = rb_entry(node, struct* recorded_ref, node);
4622	int result;
4623
4624	if (data->dir > ref->dir)
4625	return `1`;
4626	if (data->dir < ref->dir)
4627	return -`1`;
4628	if (data->dir_gen > ref->dir_gen)
4629	return `1`;
4630	if (data->dir_gen < ref->dir_gen)
4631	return -`1`;
4632	if (data->name_len > ref->name_len)
4633	return `1`;
4634	if (data->name_len < ref->name_len)
4635	return -`1`;
4636	result = strcmp(data->name, ref->name);
4637	if (result > `0`)
4638	return `1`;
4639	if (result < `0`)
4640	return -`1`;
4641	return `0`;
4642	}
4643
4644	static bool rbtree_ref_less(struct rb_node node, const* struct rb_node *parent)
4645	{
4646	const struct recorded_ref entry = rb_entry(node, struct* recorded_ref, node);
4647
4648	return rbtree_ref_comp(k: entry, node: parent) < `0`;
4649	}
4650
4651	static int record_ref_in_tree(struct rb_root root, struct* list_head *refs,
4652	struct fs_path *name, u64 dir, u64 dir_gen,
4653	struct send_ctx *sctx)
4654	{
4655	int ret = `0`;
4656	struct fs_path *path = NULL;
4657	struct recorded_ref *ref = NULL;
4658
4659	path = fs_path_alloc();
4660	if (!path) {
4661	ret = -ENOMEM;
4662	goto out;
4663	}
4664
4665	ref = recorded_ref_alloc();
4666	if (!ref) {
4667	ret = -ENOMEM;
4668	goto out;
4669	}
4670
4671	ret = get_cur_path(sctx, ino: dir, gen: dir_gen, dest: path);
4672	if (ret < `0`)
4673	goto out;
4674	ret = fs_path_add_path(p: path, p2: name);
4675	if (ret < `0`)
4676	goto out;
4677
4678	ref->dir = dir;
4679	ref->dir_gen = dir_gen;
4680	set_ref_path(ref, path);
4681	list_add_tail(new: &ref->list, head: refs);
4682	rb_add(node: &ref->node, tree: root, less: rbtree_ref_less);
4683	ref->root = root;
4684	out:
4685	if (ret) {
4686	if (path && (!ref \|\| !ref->full_path))
4687	fs_path_free(p: path);
4688	recorded_ref_free(ref);
4689	}
4690	return ret;
4691	}
4692
4693	static int record_new_ref_if_needed(int num, u64 dir, int index,
4694	struct fs_path name, void* *ctx)
4695	{
4696	int ret = `0`;
4697	struct send_ctx *sctx = ctx;
4698	struct rb_node *node = NULL;
4699	struct recorded_ref data;
4700	struct recorded_ref *ref;
4701	u64 dir_gen;
4702
4703	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &dir_gen);
4704	if (ret < `0`)
4705	goto out;
4706
4707	data.dir = dir;
4708	data.dir_gen = dir_gen;
4709	set_ref_path(ref: &data, path: name);
4710	node = rb_find(key: &data, tree: &sctx->rbtree_deleted_refs, cmp: rbtree_ref_comp);
4711	if (node) {
4712	ref = rb_entry(node, struct recorded_ref, node);
4713	recorded_ref_free(ref);
4714	} else {
4715	ret = record_ref_in_tree(root: &sctx->rbtree_new_refs,
4716	refs: &sctx->new_refs, name, dir, dir_gen,
4717	sctx);
4718	}
4719	out:
4720	return ret;
4721	}
4722
4723	static int record_deleted_ref_if_needed(int num, u64 dir, int index,
4724	struct fs_path name, void* *ctx)
4725	{
4726	int ret = `0`;
4727	struct send_ctx *sctx = ctx;
4728	struct rb_node *node = NULL;
4729	struct recorded_ref data;
4730	struct recorded_ref *ref;
4731	u64 dir_gen;
4732
4733	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &dir_gen);
4734	if (ret < `0`)
4735	goto out;
4736
4737	data.dir = dir;
4738	data.dir_gen = dir_gen;
4739	set_ref_path(ref: &data, path: name);
4740	node = rb_find(key: &data, tree: &sctx->rbtree_new_refs, cmp: rbtree_ref_comp);
4741	if (node) {
4742	ref = rb_entry(node, struct recorded_ref, node);
4743	recorded_ref_free(ref);
4744	} else {
4745	ret = record_ref_in_tree(root: &sctx->rbtree_deleted_refs,
4746	refs: &sctx->deleted_refs, name, dir,
4747	dir_gen, sctx);
4748	}
4749	out:
4750	return ret;
4751	}
4752
4753	static int record_new_ref(struct send_ctx *sctx)
4754	{
4755	int ret;
4756
4757	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path,
4758	found_key: sctx->cmp_key, resolve: `0`, iterate: record_new_ref_if_needed, ctx: sctx);
4759	if (ret < `0`)
4760	goto out;
4761	ret = `0`;
4762
4763	out:
4764	return ret;
4765	}
4766
4767	static int record_deleted_ref(struct send_ctx *sctx)
4768	{
4769	int ret;
4770
4771	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path,
4772	found_key: sctx->cmp_key, resolve: `0`, iterate: record_deleted_ref_if_needed,
4773	ctx: sctx);
4774	if (ret < `0`)
4775	goto out;
4776	ret = `0`;
4777
4778	out:
4779	return ret;
4780	}
4781
4782	static int record_changed_ref(struct send_ctx *sctx)
4783	{
4784	int ret = `0`;
4785
4786	ret = iterate_inode_ref(root: sctx->send_root, path: sctx->left_path,
4787	found_key: sctx->cmp_key, resolve: `0`, iterate: record_new_ref_if_needed, ctx: sctx);
4788	if (ret < `0`)
4789	goto out;
4790	ret = iterate_inode_ref(root: sctx->parent_root, path: sctx->right_path,
4791	found_key: sctx->cmp_key, resolve: `0`, iterate: record_deleted_ref_if_needed, ctx: sctx);
4792	if (ret < `0`)
4793	goto out;
4794	ret = `0`;
4795
4796	out:
4797	return ret;
4798	}
4799
4800	/*
4801	* Record and process all refs at once. Needed when an inode changes the
4802	* generation number, which means that it was deleted and recreated.
4803	*/
4804	static int process_all_refs(struct send_ctx *sctx,
4805	enum btrfs_compare_tree_result cmd)
4806	{
4807	int ret = `0`;
4808	int iter_ret = `0`;
4809	struct btrfs_root *root;
4810	struct btrfs_path *path;
4811	struct btrfs_key key;
4812	struct btrfs_key found_key;
4813	iterate_inode_ref_t cb;
4814	int pending_move = `0`;
4815
4816	path = alloc_path_for_send();
4817	if (!path)
4818	return -ENOMEM;
4819
4820	if (cmd == BTRFS_COMPARE_TREE_NEW) {
4821	root = sctx->send_root;
4822	cb = record_new_ref_if_needed;
4823	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4824	root = sctx->parent_root;
4825	cb = record_deleted_ref_if_needed;
4826	} else {
4827	btrfs_err(sctx->send_root->fs_info,
4828	"Wrong command %d in process_all_refs", cmd);
4829	ret = -EINVAL;
4830	goto out;
4831	}
4832
4833	key.objectid = sctx->cmp_key->objectid;
4834	key.type = BTRFS_INODE_REF_KEY;
4835	key.offset = `0`;
4836	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
4837	if (found_key.objectid != key.objectid \|\|
4838	(found_key.type != BTRFS_INODE_REF_KEY &&
4839	found_key.type != BTRFS_INODE_EXTREF_KEY))
4840	break;
4841
4842	ret = iterate_inode_ref(root, path, found_key: &found_key, resolve: `0`, iterate: cb, ctx: sctx);
4843	if (ret < `0`)
4844	goto out;
4845	}
4846	/ Catch error found during iteration /
4847	if (iter_ret < `0`) {
4848	ret = iter_ret;
4849	goto out;
4850	}
4851	btrfs_release_path(p: path);
4852
4853	/*
4854	* We don't actually care about pending_move as we are simply
4855	* re-creating this inode and will be rename'ing it into place once we
4856	* rename the parent directory.
4857	*/
4858	ret = process_recorded_refs(sctx, pending_move: &pending_move);
4859	out:
4860	btrfs_free_path(p: path);
4861	return ret;
4862	}
4863
4864	static int send_set_xattr(struct send_ctx *sctx,
4865	struct fs_path *path,
4866	const char name, int* name_len,
4867	const char data, int* data_len)
4868	{
4869	int ret = `0`;
4870
4871	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_SET_XATTR);
4872	if (ret < `0`)
4873	goto out;
4874
4875	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4876	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4877	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4878
4879	ret = send_cmd(sctx);
4880
4881	tlv_put_failure:
4882	out:
4883	return ret;
4884	}
4885
4886	static int send_remove_xattr(struct send_ctx *sctx,
4887	struct fs_path *path,
4888	const char name, int* name_len)
4889	{
4890	int ret = `0`;
4891
4892	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_REMOVE_XATTR);
4893	if (ret < `0`)
4894	goto out;
4895
4896	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4897	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4898
4899	ret = send_cmd(sctx);
4900
4901	tlv_put_failure:
4902	out:
4903	return ret;
4904	}
4905
4906	static int __process_new_xattr(int num, struct btrfs_key *di_key,
4907	const char name, int* name_len, const char *data,
4908	int data_len, void *ctx)
4909	{
4910	int ret;
4911	struct send_ctx *sctx = ctx;
4912	struct fs_path *p;
4913	struct posix_acl_xattr_header dummy_acl;
4914
4915	/ Capabilities are emitted by finish_inode_if_needed /
4916	if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4917	return `0`;
4918
4919	p = fs_path_alloc();
4920	if (!p)
4921	return -ENOMEM;
4922
4923	/*
4924	* This hack is needed because empty acls are stored as zero byte
4925	* data in xattrs. Problem with that is, that receiving these zero byte
4926	* acls will fail later. To fix this, we send a dummy acl list that
4927	* only contains the version number and no entries.
4928	*/
4929	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) \|\|
4930	!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4931	if (data_len == `0`) {
4932	dummy_acl.a_version =
4933	cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4934	data = (char *)&dummy_acl;
4935	data_len = sizeof(dummy_acl);
4936	}
4937	}
4938
4939	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
4940	if (ret < `0`)
4941	goto out;
4942
4943	ret = send_set_xattr(sctx, path: p, name, name_len, data, data_len);
4944
4945	out:
4946	fs_path_free(p);
4947	return ret;
4948	}
4949
4950	static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4951	const char name, int* name_len,
4952	const char data, int* data_len, void *ctx)
4953	{
4954	int ret;
4955	struct send_ctx *sctx = ctx;
4956	struct fs_path *p;
4957
4958	p = fs_path_alloc();
4959	if (!p)
4960	return -ENOMEM;
4961
4962	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
4963	if (ret < `0`)
4964	goto out;
4965
4966	ret = send_remove_xattr(sctx, path: p, name, name_len);
4967
4968	out:
4969	fs_path_free(p);
4970	return ret;
4971	}
4972
4973	static int process_new_xattr(struct send_ctx *sctx)
4974	{
4975	int ret = `0`;
4976
4977	ret = iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
4978	iterate: __process_new_xattr, ctx: sctx);
4979
4980	return ret;
4981	}
4982
4983	static int process_deleted_xattr(struct send_ctx *sctx)
4984	{
4985	return iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
4986	iterate: __process_deleted_xattr, ctx: sctx);
4987	}
4988
/*
 * Search state shared between find_xattr() and its iterator callback
 * __find_xattr().
 */
struct find_xattr_ctx {
	/* Name to look for and its length in bytes. */
	const char *name;
	int name_len;
	/* Index of the matching item as reported by the iterator, -1 if none. */
	int found_idx;
	/* kmemdup()'ed copy of the matching xattr's value and its length. */
	char *found_data;
	int found_data_len;
};
4996
4997	static int __find_xattr(int num, struct btrfs_key di_key, const* char *name,
4998	int name_len, const char data, int* data_len, void *vctx)
4999	{
5000	struct find_xattr_ctx *ctx = vctx;
5001
5002	if (name_len == ctx->name_len &&
5003	strncmp(name, ctx->name, name_len) == `0`) {
5004	ctx->found_idx = num;
5005	ctx->found_data_len = data_len;
5006	ctx->found_data = kmemdup(p: data, size: data_len, GFP_KERNEL);
5007	if (!ctx->found_data)
5008	return -ENOMEM;
5009	return `1`;
5010	}
5011	return `0`;
5012	}
5013
5014	static int find_xattr(struct btrfs_root *root,
5015	struct btrfs_path *path,
5016	struct btrfs_key *key,
5017	const char name, int* name_len,
5018	char *data, int* *data_len)
5019	{
5020	int ret;
5021	struct find_xattr_ctx ctx;
5022
5023	ctx.name = name;
5024	ctx.name_len = name_len;
5025	ctx.found_idx = -`1`;
5026	ctx.found_data = NULL;
5027	ctx.found_data_len = `0`;
5028
5029	ret = iterate_dir_item(root, path, iterate: __find_xattr, ctx: &ctx);
5030	if (ret < `0`)
5031	return ret;
5032
5033	if (ctx.found_idx == -`1`)
5034	return -ENOENT;
5035	if (data) {
5036	*data = ctx.found_data;
5037	*data_len = ctx.found_data_len;
5038	} else {
5039	kfree(objp: ctx.found_data);
5040	}
5041	return ctx.found_idx;
5042	}
5043
5044
5045	static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
5046	const char name, int* name_len,
5047	const char data, int* data_len,
5048	void *ctx)
5049	{
5050	int ret;
5051	struct send_ctx *sctx = ctx;
5052	char *found_data = NULL;
5053	int found_data_len = `0`;
5054
5055	ret = find_xattr(root: sctx->parent_root, path: sctx->right_path,
5056	key: sctx->cmp_key, name, name_len, data: &found_data,
5057	data_len: &found_data_len);
5058	if (ret == -ENOENT) {
5059	ret = __process_new_xattr(num, di_key, name, name_len, data,
5060	data_len, ctx);
5061	} else if (ret >= `0`) {
5062	if (data_len != found_data_len \|\|
5063	memcmp(p: data, q: found_data, size: data_len)) {
5064	ret = __process_new_xattr(num, di_key, name, name_len,
5065	data, data_len, ctx);
5066	} else {
5067	ret = `0`;
5068	}
5069	}
5070
5071	kfree(objp: found_data);
5072	return ret;
5073	}
5074
5075	static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
5076	const char name, int* name_len,
5077	const char data, int* data_len,
5078	void *ctx)
5079	{
5080	int ret;
5081	struct send_ctx *sctx = ctx;
5082
5083	ret = find_xattr(root: sctx->send_root, path: sctx->left_path, key: sctx->cmp_key,
5084	name, name_len, NULL, NULL);
5085	if (ret == -ENOENT)
5086	ret = __process_deleted_xattr(num, di_key, name, name_len, data,
5087	data_len, ctx);
5088	else if (ret >= `0`)
5089	ret = `0`;
5090
5091	return ret;
5092	}
5093
5094	static int process_changed_xattr(struct send_ctx *sctx)
5095	{
5096	int ret = `0`;
5097
5098	ret = iterate_dir_item(root: sctx->send_root, path: sctx->left_path,
5099	iterate: __process_changed_new_xattr, ctx: sctx);
5100	if (ret < `0`)
5101	goto out;
5102	ret = iterate_dir_item(root: sctx->parent_root, path: sctx->right_path,
5103	iterate: __process_changed_deleted_xattr, ctx: sctx);
5104
5105	out:
5106	return ret;
5107	}
5108
5109	static int process_all_new_xattrs(struct send_ctx *sctx)
5110	{
5111	int ret = `0`;
5112	int iter_ret = `0`;
5113	struct btrfs_root *root;
5114	struct btrfs_path *path;
5115	struct btrfs_key key;
5116	struct btrfs_key found_key;
5117
5118	path = alloc_path_for_send();
5119	if (!path)
5120	return -ENOMEM;
5121
5122	root = sctx->send_root;
5123
5124	key.objectid = sctx->cmp_key->objectid;
5125	key.type = BTRFS_XATTR_ITEM_KEY;
5126	key.offset = `0`;
5127	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
5128	if (found_key.objectid != key.objectid \|\|
5129	found_key.type != key.type) {
5130	ret = `0`;
5131	break;
5132	}
5133
5134	ret = iterate_dir_item(root, path, iterate: __process_new_xattr, ctx: sctx);
5135	if (ret < `0`)
5136	break;
5137	}
5138	/ Catch error found during iteration /
5139	if (iter_ret < `0`)
5140	ret = iter_ret;
5141
5142	btrfs_free_path(p: path);
5143	return ret;
5144	}
5145
5146	static int send_verity(struct send_ctx sctx, struct* fs_path *path,
5147	struct fsverity_descriptor *desc)
5148	{
5149	int ret;
5150
5151	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY);
5152	if (ret < `0`)
5153	goto out;
5154
5155	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
5156	TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
5157	le8_to_cpu(desc->hash_algorithm));
5158	TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
5159	`1U` << le8_to_cpu(desc->log_blocksize));
5160	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
5161	le8_to_cpu(desc->salt_size));
5162	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
5163	le32_to_cpu(desc->sig_size));
5164
5165	ret = send_cmd(sctx);
5166
5167	tlv_put_failure:
5168	out:
5169	return ret;
5170	}
5171
5172	static int process_verity(struct send_ctx *sctx)
5173	{
5174	int ret = `0`;
5175	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5176	struct inode *inode;
5177	struct fs_path *p;
5178
5179	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root: sctx->send_root);
5180	if (IS_ERR(ptr: inode))
5181	return PTR_ERR(ptr: inode);
5182
5183	ret = btrfs_get_verity_descriptor(inode, NULL, buf_size: `0`);
5184	if (ret < `0`)
5185	goto iput;
5186
5187	if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
5188	ret = -EMSGSIZE;
5189	goto iput;
5190	}
5191	if (!sctx->verity_descriptor) {
5192	sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
5193	GFP_KERNEL);
5194	if (!sctx->verity_descriptor) {
5195	ret = -ENOMEM;
5196	goto iput;
5197	}
5198	}
5199
5200	ret = btrfs_get_verity_descriptor(inode, buf: sctx->verity_descriptor, buf_size: ret);
5201	if (ret < `0`)
5202	goto iput;
5203
5204	p = fs_path_alloc();
5205	if (!p) {
5206	ret = -ENOMEM;
5207	goto iput;
5208	}
5209	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5210	if (ret < `0`)
5211	goto free_path;
5212
5213	ret = send_verity(sctx, path: p, desc: sctx->verity_descriptor);
5214	if (ret < `0`)
5215	goto free_path;
5216
5217	free_path:
5218	fs_path_free(p);
5219	iput:
5220	iput(inode);
5221	return ret;
5222	}
5223
5224	static inline u64 max_send_read_size(const struct send_ctx *sctx)
5225	{
5226	return sctx->send_max_size - SZ_16K;
5227	}
5228
5229	static int put_data_header(struct send_ctx *sctx, u32 len)
5230	{
5231	if (WARN_ON_ONCE(sctx->put_data))
5232	return -EINVAL;
5233	sctx->put_data = true;
5234	if (sctx->proto >= `2`) {
5235	/*
5236	* Since v2, the data attribute header doesn't include a length,
5237	* it is implicitly to the end of the command.
5238	*/
5239	if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
5240	return -EOVERFLOW;
5241	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: sctx->send_buf + sctx->send_size);
5242	sctx->send_size += sizeof(__le16);
5243	} else {
5244	struct btrfs_tlv_header *hdr;
5245
5246	if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
5247	return -EOVERFLOW;
5248	hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
5249	put_unaligned_le16(val: BTRFS_SEND_A_DATA, p: &hdr->tlv_type);
5250	put_unaligned_le16(val: len, p: &hdr->tlv_len);
5251	sctx->send_size += sizeof(*hdr);
5252	}
5253	return `0`;
5254	}
5255
5256	static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
5257	{
5258	struct btrfs_root *root = sctx->send_root;
5259	struct btrfs_fs_info *fs_info = root->fs_info;
5260	struct page *page;
5261	pgoff_t index = offset >> PAGE_SHIFT;
5262	pgoff_t last_index;
5263	unsigned pg_offset = offset_in_page(offset);
5264	int ret;
5265
5266	ret = put_data_header(sctx, len);
5267	if (ret)
5268	return ret;
5269
5270	last_index = (offset + len - `1`) >> PAGE_SHIFT;
5271
5272	while (index <= last_index) {
5273	unsigned cur_len = min_t(unsigned, len,
5274	PAGE_SIZE - pg_offset);
5275
5276	page = find_lock_page(mapping: sctx->cur_inode->i_mapping, index);
5277	if (!page) {
5278	page_cache_sync_readahead(mapping: sctx->cur_inode->i_mapping,
5279	ra: &sctx->ra, NULL, index,
5280	req_count: last_index + `1` - index);
5281
5282	page = find_or_create_page(mapping: sctx->cur_inode->i_mapping,
5283	index, GFP_KERNEL);
5284	if (!page) {
5285	ret = -ENOMEM;
5286	break;
5287	}
5288	}
5289
5290	if (PageReadahead(page))
5291	page_cache_async_readahead(mapping: sctx->cur_inode->i_mapping,
5292	ra: &sctx->ra, NULL, page_folio(page),
5293	index, req_count: last_index + `1` - index);
5294
5295	if (!PageUptodate(page)) {
5296	btrfs_read_folio(NULL, page_folio(page));
5297	lock_page(page);
5298	if (!PageUptodate(page)) {
5299	unlock_page(page);
5300	btrfs_err(fs_info,
5301	"send: IO error at offset %llu for inode %llu root %llu",
5302	page_offset(page), sctx->cur_ino,
5303	sctx->send_root->root_key.objectid);
5304	put_page(page);
5305	ret = -EIO;
5306	break;
5307	}
5308	}
5309
5310	memcpy_from_page(to: sctx->send_buf + sctx->send_size, page,
5311	offset: pg_offset, len: cur_len);
5312	unlock_page(page);
5313	put_page(page);
5314	index++;
5315	pg_offset = `0`;
5316	len -= cur_len;
5317	sctx->send_size += cur_len;
5318	}
5319
5320	return ret;
5321	}
5322
5323	/*
5324	* Read some bytes from the current inode/file and send a write command to
5325	* user space.
5326	*/
5327	static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5328	{
5329	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5330	int ret = `0`;
5331	struct fs_path *p;
5332
5333	p = fs_path_alloc();
5334	if (!p)
5335	return -ENOMEM;
5336
5337	btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5338
5339	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5340	if (ret < `0`)
5341	goto out;
5342
5343	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5344	if (ret < `0`)
5345	goto out;
5346
5347	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5348	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5349	ret = put_file_data(sctx, offset, len);
5350	if (ret < `0`)
5351	goto out;
5352
5353	ret = send_cmd(sctx);
5354
5355	tlv_put_failure:
5356	out:
5357	fs_path_free(p);
5358	return ret;
5359	}
5360
5361	/*
5362	* Send a clone command to user space.
5363	*/
5364	static int send_clone(struct send_ctx *sctx,
5365	u64 offset, u32 len,
5366	struct clone_root *clone_root)
5367	{
5368	int ret = `0`;
5369	struct fs_path *p;
5370	u64 gen;
5371
5372	btrfs_debug(sctx->send_root->fs_info,
5373	"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5374	offset, len, clone_root->root->root_key.objectid,
5375	clone_root->ino, clone_root->offset);
5376
5377	p = fs_path_alloc();
5378	if (!p)
5379	return -ENOMEM;
5380
5381	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_CLONE);
5382	if (ret < `0`)
5383	goto out;
5384
5385	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5386	if (ret < `0`)
5387	goto out;
5388
5389	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5390	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5391	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5392
5393	if (clone_root->root == sctx->send_root) {
5394	ret = get_inode_gen(root: sctx->send_root, ino: clone_root->ino, gen: &gen);
5395	if (ret < `0`)
5396	goto out;
5397	ret = get_cur_path(sctx, ino: clone_root->ino, gen, dest: p);
5398	} else {
5399	ret = get_inode_path(root: clone_root->root, ino: clone_root->ino, path: p);
5400	}
5401	if (ret < `0`)
5402	goto out;
5403
5404	/*
5405	* If the parent we're using has a received_uuid set then use that as
5406	* our clone source as that is what we will look for when doing a
5407	* receive.
5408	*
5409	* This covers the case that we create a snapshot off of a received
5410	* subvolume and then use that as the parent and try to receive on a
5411	* different host.
5412	*/
5413	if (!btrfs_is_empty_uuid(uuid: clone_root->root->root_item.received_uuid))
5414	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5415	clone_root->root->root_item.received_uuid);
5416	else
5417	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5418	clone_root->root->root_item.uuid);
5419	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5420	btrfs_root_ctransid(&clone_root->root->root_item));
5421	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5422	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5423	clone_root->offset);
5424
5425	ret = send_cmd(sctx);
5426
5427	tlv_put_failure:
5428	out:
5429	fs_path_free(p);
5430	return ret;
5431	}
5432
5433	/*
5434	* Send an update extent command to user space.
5435	*/
5436	static int send_update_extent(struct send_ctx *sctx,
5437	u64 offset, u32 len)
5438	{
5439	int ret = `0`;
5440	struct fs_path *p;
5441
5442	p = fs_path_alloc();
5443	if (!p)
5444	return -ENOMEM;
5445
5446	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_UPDATE_EXTENT);
5447	if (ret < `0`)
5448	goto out;
5449
5450	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5451	if (ret < `0`)
5452	goto out;
5453
5454	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5455	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5456	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5457
5458	ret = send_cmd(sctx);
5459
5460	tlv_put_failure:
5461	out:
5462	fs_path_free(p);
5463	return ret;
5464	}
5465
5466	static int send_hole(struct send_ctx *sctx, u64 end)
5467	{
5468	struct fs_path *p = NULL;
5469	u64 read_size = max_send_read_size(sctx);
5470	u64 offset = sctx->cur_inode_last_extent;
5471	int ret = `0`;
5472
5473	/*
5474	* A hole that starts at EOF or beyond it. Since we do not yet support
5475	* fallocate (for extent preallocation and hole punching), sending a
5476	* write of zeroes starting at EOF or beyond would later require issuing
5477	* a truncate operation which would undo the write and achieve nothing.
5478	*/
5479	if (offset >= sctx->cur_inode_size)
5480	return `0`;
5481
5482	/*
5483	* Don't go beyond the inode's i_size due to prealloc extents that start
5484	* after the i_size.
5485	*/
5486	end = min_t(u64, end, sctx->cur_inode_size);
5487
5488	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5489	return send_update_extent(sctx, offset, len: end - offset);
5490
5491	p = fs_path_alloc();
5492	if (!p)
5493	return -ENOMEM;
5494	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: p);
5495	if (ret < `0`)
5496	goto tlv_put_failure;
5497	while (offset < end) {
5498	u64 len = min(end - offset, read_size);
5499
5500	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_WRITE);
5501	if (ret < `0`)
5502	break;
5503	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5504	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5505	ret = put_data_header(sctx, len);
5506	if (ret < `0`)
5507	break;
5508	memset(sctx->send_buf + sctx->send_size, `0`, len);
5509	sctx->send_size += len;
5510	ret = send_cmd(sctx);
5511	if (ret < `0`)
5512	break;
5513	offset += len;
5514	}
5515	sctx->cur_inode_next_write_offset = offset;
5516	tlv_put_failure:
5517	fs_path_free(p);
5518	return ret;
5519	}
5520
5521	static int send_encoded_inline_extent(struct send_ctx *sctx,
5522	struct btrfs_path *path, u64 offset,
5523	u64 len)
5524	{
5525	struct btrfs_root *root = sctx->send_root;
5526	struct btrfs_fs_info *fs_info = root->fs_info;
5527	struct inode *inode;
5528	struct fs_path *fspath;
5529	struct extent_buffer *leaf = path->nodes[`0`];
5530	struct btrfs_key key;
5531	struct btrfs_file_extent_item *ei;
5532	u64 ram_bytes;
5533	size_t inline_size;
5534	int ret;
5535
5536	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root);
5537	if (IS_ERR(ptr: inode))
5538	return PTR_ERR(ptr: inode);
5539
5540	fspath = fs_path_alloc();
5541	if (!fspath) {
5542	ret = -ENOMEM;
5543	goto out;
5544	}
5545
5546	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5547	if (ret < `0`)
5548	goto out;
5549
5550	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5551	if (ret < `0`)
5552	goto out;
5553
5554	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5555	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5556	ram_bytes = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5557	inline_size = btrfs_file_extent_inline_item_len(eb: leaf, nr: path->slots[`0`]);
5558
5559	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5560	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5561	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5562	min(key.offset + ram_bytes - offset, len));
5563	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
5564	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
5565	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5566	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5567	if (ret < `0`)
5568	goto out;
5569	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5570
5571	ret = put_data_header(sctx, len: inline_size);
5572	if (ret < `0`)
5573	goto out;
5574	read_extent_buffer(eb: leaf, dst: sctx->send_buf + sctx->send_size,
5575	start: btrfs_file_extent_inline_start(e: ei), len: inline_size);
5576	sctx->send_size += inline_size;
5577
5578	ret = send_cmd(sctx);
5579
5580	tlv_put_failure:
5581	out:
5582	fs_path_free(p: fspath);
5583	iput(inode);
5584	return ret;
5585	}
5586
5587	static int send_encoded_extent(struct send_ctx sctx, struct* btrfs_path *path,
5588	u64 offset, u64 len)
5589	{
5590	struct btrfs_root *root = sctx->send_root;
5591	struct btrfs_fs_info *fs_info = root->fs_info;
5592	struct inode *inode;
5593	struct fs_path *fspath;
5594	struct extent_buffer *leaf = path->nodes[`0`];
5595	struct btrfs_key key;
5596	struct btrfs_file_extent_item *ei;
5597	u64 disk_bytenr, disk_num_bytes;
5598	u32 data_offset;
5599	struct btrfs_cmd_header *hdr;
5600	u32 crc;
5601	int ret;
5602
5603	inode = btrfs_iget(s: fs_info->sb, ino: sctx->cur_ino, root);
5604	if (IS_ERR(ptr: inode))
5605	return PTR_ERR(ptr: inode);
5606
5607	fspath = fs_path_alloc();
5608	if (!fspath) {
5609	ret = -ENOMEM;
5610	goto out;
5611	}
5612
5613	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_ENCODED_WRITE);
5614	if (ret < `0`)
5615	goto out;
5616
5617	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5618	if (ret < `0`)
5619	goto out;
5620
5621	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: path->slots[`0`]);
5622	ei = btrfs_item_ptr(leaf, path->slots[`0`], struct btrfs_file_extent_item);
5623	disk_bytenr = btrfs_file_extent_disk_bytenr(eb: leaf, s: ei);
5624	disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei);
5625
5626	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
5627	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5628	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
5629	min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
5630	len));
5631	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
5632	btrfs_file_extent_ram_bytes(leaf, ei));
5633	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
5634	offset - key.offset + btrfs_file_extent_offset(leaf, ei));
5635	ret = btrfs_encoded_io_compression_from_extent(fs_info,
5636	compress_type: btrfs_file_extent_compression(eb: leaf, s: ei));
5637	if (ret < `0`)
5638	goto out;
5639	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
5640	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, `0`);
5641
5642	ret = put_data_header(sctx, len: disk_num_bytes);
5643	if (ret < `0`)
5644	goto out;
5645
5646	/*
5647	* We want to do I/O directly into the send buffer, so get the next page
5648	* boundary in the send buffer. This means that there may be a gap
5649	* between the beginning of the command and the file data.
5650	*/
5651	data_offset = PAGE_ALIGN(sctx->send_size);
5652	if (data_offset > sctx->send_max_size \|\|
5653	sctx->send_max_size - data_offset < disk_num_bytes) {
5654	ret = -EOVERFLOW;
5655	goto out;
5656	}
5657
5658	/*
5659	* Note that send_buf is a mapping of send_buf_pages, so this is really
5660	* reading into send_buf.
5661	*/
5662	ret = btrfs_encoded_read_regular_fill_pages(inode: BTRFS_I(inode), file_offset: offset,
5663	disk_bytenr, disk_io_size: disk_num_bytes,
5664	pages: sctx->send_buf_pages +
5665	(data_offset >> PAGE_SHIFT));
5666	if (ret)
5667	goto out;
5668
5669	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
5670	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
5671	hdr->crc = `0`;
5672	crc = crc32c(crc: `0`, address: sctx->send_buf, length: sctx->send_size);
5673	crc = crc32c(crc, address: sctx->send_buf + data_offset, length: disk_num_bytes);
5674	hdr->crc = cpu_to_le32(crc);
5675
5676	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf, len: sctx->send_size,
5677	off: &sctx->send_off);
5678	if (!ret) {
5679	ret = write_buf(filp: sctx->send_filp, buf: sctx->send_buf + data_offset,
5680	len: disk_num_bytes, off: &sctx->send_off);
5681	}
5682	sctx->send_size = `0`;
5683	sctx->put_data = false;
5684
5685	tlv_put_failure:
5686	out:
5687	fs_path_free(p: fspath);
5688	iput(inode);
5689	return ret;
5690	}
5691
5692	static int send_extent_data(struct send_ctx sctx, struct* btrfs_path *path,
5693	const u64 offset, const u64 len)
5694	{
5695	const u64 end = offset + len;
5696	struct extent_buffer *leaf = path->nodes[`0`];
5697	struct btrfs_file_extent_item *ei;
5698	u64 read_size = max_send_read_size(sctx);
5699	u64 sent = `0`;
5700
5701	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5702	return send_update_extent(sctx, offset, len);
5703
5704	ei = btrfs_item_ptr(leaf, path->slots[`0`],
5705	struct btrfs_file_extent_item);
5706	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5707	btrfs_file_extent_compression(eb: leaf, s: ei) != BTRFS_COMPRESS_NONE) {
5708	bool is_inline = (btrfs_file_extent_type(eb: leaf, s: ei) ==
5709	BTRFS_FILE_EXTENT_INLINE);
5710
5711	/*
5712	* Send the compressed extent unless the compressed data is
5713	* larger than the decompressed data. This can happen if we're
5714	* not sending the entire extent, either because it has been
5715	* partially overwritten/truncated or because this is a part of
5716	* the extent that we couldn't clone in clone_range().
5717	*/
5718	if (is_inline &&
5719	btrfs_file_extent_inline_item_len(eb: leaf,
5720	nr: path->slots[`0`]) <= len) {
5721	return send_encoded_inline_extent(sctx, path, offset,
5722	len);
5723	} else if (!is_inline &&
5724	btrfs_file_extent_disk_num_bytes(eb: leaf, s: ei) <= len) {
5725	return send_encoded_extent(sctx, path, offset, len);
5726	}
5727	}
5728
5729	if (sctx->cur_inode == NULL) {
5730	struct btrfs_root *root = sctx->send_root;
5731
5732	sctx->cur_inode = btrfs_iget(s: root->fs_info->sb, ino: sctx->cur_ino, root);
5733	if (IS_ERR(ptr: sctx->cur_inode)) {
5734	int err = PTR_ERR(ptr: sctx->cur_inode);
5735
5736	sctx->cur_inode = NULL;
5737	return err;
5738	}
5739	memset(&sctx->ra, `0`, sizeof(struct file_ra_state));
5740	file_ra_state_init(ra: &sctx->ra, mapping: sctx->cur_inode->i_mapping);
5741
5742	/*
5743	* It's very likely there are no pages from this inode in the page
5744	* cache, so after reading extents and sending their data, we clean
5745	* the page cache to avoid trashing the page cache (adding pressure
5746	* to the page cache and forcing eviction of other data more useful
5747	* for applications).
5748	*
5749	* We decide if we should clean the page cache simply by checking
5750	* if the inode's mapping nrpages is 0 when we first open it, and
5751	* not by using something like filemap_range_has_page() before
5752	* reading an extent because when we ask the readahead code to
5753	* read a given file range, it may (and almost always does) read
5754	* pages from beyond that range (see the documentation for
5755	* page_cache_sync_readahead()), so it would not be reliable,
5756	* because after reading the first extent future calls to
5757	* filemap_range_has_page() would return true because the readahead
5758	* on the previous extent resulted in reading pages of the current
5759	* extent as well.
5760	*/
5761	sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == `0`);
5762	sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
5763	}
5764
5765	while (sent < len) {
5766	u64 size = min(len - sent, read_size);
5767	int ret;
5768
5769	ret = send_write(sctx, offset: offset + sent, len: size);
5770	if (ret < `0`)
5771	return ret;
5772	sent += size;
5773	}
5774
5775	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
5776	/*
5777	* Always operate only on ranges that are a multiple of the page
5778	* size. This is not only to prevent zeroing parts of a page in
5779	* the case of subpage sector size, but also to guarantee we evict
5780	* pages, as passing a range that is smaller than page size does
5781	* not evict the respective page (only zeroes part of its content).
5782	*
5783	* Always start from the end offset of the last range cleared.
5784	* This is because the readahead code may (and very often does)
5785	* reads pages beyond the range we request for readahead. So if
5786	* we have an extent layout like this:
5787	*
5788	* [ extent A ] [ extent B ] [ extent C ]
5789	*
5790	* When we ask page_cache_sync_readahead() to read extent A, it
5791	* may also trigger reads for pages of extent B. If we are doing
5792	* an incremental send and extent B has not changed between the
5793	* parent and send snapshots, some or all of its pages may end
5794	* up being read and placed in the page cache. So when truncating
5795	* the page cache we always start from the end offset of the
5796	* previously processed extent up to the end of the current
5797	* extent.
5798	*/
5799	truncate_inode_pages_range(&sctx->cur_inode->i_data,
5800	lstart: sctx->page_cache_clear_start,
5801	lend: end - `1`);
5802	sctx->page_cache_clear_start = end;
5803	}
5804
5805	return `0`;
5806	}
5807
5808	/*
5809	* Search for a capability xattr related to sctx->cur_ino. If the capability is
5810	* found, call send_set_xattr function to emit it.
5811	*
5812	* Return 0 if there isn't a capability, or when the capability was emitted
5813	* successfully, or < 0 if an error occurred.
5814	*/
5815	static int send_capabilities(struct send_ctx *sctx)
5816	{
5817	struct fs_path *fspath = NULL;
5818	struct btrfs_path *path;
5819	struct btrfs_dir_item *di;
5820	struct extent_buffer *leaf;
5821	unsigned long data_ptr;
5822	char *buf = NULL;
5823	int buf_len;
5824	int ret = `0`;
5825
5826	path = alloc_path_for_send();
5827	if (!path)
5828	return -ENOMEM;
5829
5830	di = btrfs_lookup_xattr(NULL, root: sctx->send_root, path, dir: sctx->cur_ino,
5831	XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), mod: `0`);
5832	if (!di) {
5833	/ There is no xattr for this inode /
5834	goto out;
5835	} else if (IS_ERR(ptr: di)) {
5836	ret = PTR_ERR(ptr: di);
5837	goto out;
5838	}
5839
5840	leaf = path->nodes[`0`];
5841	buf_len = btrfs_dir_data_len(eb: leaf, s: di);
5842
5843	fspath = fs_path_alloc();
5844	buf = kmalloc(size: buf_len, GFP_KERNEL);
5845	if (!fspath \|\| !buf) {
5846	ret = -ENOMEM;
5847	goto out;
5848	}
5849
5850	ret = get_cur_path(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen, dest: fspath);
5851	if (ret < `0`)
5852	goto out;
5853
5854	data_ptr = (unsigned long)(di + `1`) + btrfs_dir_name_len(eb: leaf, s: di);
5855	read_extent_buffer(eb: leaf, dst: buf, start: data_ptr, len: buf_len);
5856
5857	ret = send_set_xattr(sctx, path: fspath, XATTR_NAME_CAPS,
5858	strlen(XATTR_NAME_CAPS), data: buf, data_len: buf_len);
5859	out:
5860	kfree(objp: buf);
5861	fs_path_free(p: fspath);
5862	btrfs_free_path(p: path);
5863	return ret;
5864	}
5865
5866	static int clone_range(struct send_ctx sctx, struct* btrfs_path *dst_path,
5867	struct clone_root clone_root, const* u64 disk_byte,
5868	u64 data_offset, u64 offset, u64 len)
5869	{
5870	struct btrfs_path *path;
5871	struct btrfs_key key;
5872	int ret;
5873	struct btrfs_inode_info info;
5874	u64 clone_src_i_size = `0`;
5875
5876	/*
5877	* Prevent cloning from a zero offset with a length matching the sector
5878	* size because in some scenarios this will make the receiver fail.
5879	*
5880	* For example, if in the source filesystem the extent at offset 0
5881	* has a length of sectorsize and it was written using direct IO, then
5882	* it can never be an inline extent (even if compression is enabled).
5883	* Then this extent can be cloned in the original filesystem to a non
5884	* zero file offset, but it may not be possible to clone in the
5885	* destination filesystem because it can be inlined due to compression
5886	* on the destination filesystem (as the receiver's write operations are
5887	* always done using buffered IO). The same happens when the original
5888	* filesystem does not have compression enabled but the destination
5889	* filesystem has.
5890	*/
5891	if (clone_root->offset == `0` &&
5892	len == sctx->send_root->fs_info->sectorsize)
5893	return send_extent_data(sctx, path: dst_path, offset, len);
5894
5895	path = alloc_path_for_send();
5896	if (!path)
5897	return -ENOMEM;
5898
5899	/*
5900	* There are inodes that have extents that lie behind its i_size. Don't
5901	* accept clones from these extents.
5902	*/
5903	ret = get_inode_info(root: clone_root->root, ino: clone_root->ino, info: &info);
5904	btrfs_release_path(p: path);
5905	if (ret < `0`)
5906	goto out;
5907	clone_src_i_size = info.size;
5908
5909	/*
5910	* We can't send a clone operation for the entire range if we find
5911	* extent items in the respective range in the source file that
5912	* refer to different extents or if we find holes.
5913	* So check for that and do a mix of clone and regular write/copy
5914	* operations if needed.
5915	*
5916	* Example:
5917	*
5918	* mkfs.btrfs -f /dev/sda
5919	* mount /dev/sda /mnt
5920	* xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5921	* cp --reflink=always /mnt/foo /mnt/bar
5922	* xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5923	* btrfs subvolume snapshot -r /mnt /mnt/snap
5924	*
5925	* If when we send the snapshot and we are processing file bar (which
5926	* has a higher inode number than foo) we blindly send a clone operation
5927	* for the [0, 100K[ range from foo to bar, the receiver ends up getting
5928	* a file bar that matches the content of file foo - iow, doesn't match
5929	* the content from bar in the original filesystem.
5930	*/
5931	key.objectid = clone_root->ino;
5932	key.type = BTRFS_EXTENT_DATA_KEY;
5933	key.offset = clone_root->offset;
5934	ret = btrfs_search_slot(NULL, root: clone_root->root, key: &key, p: path, ins_len: `0`, cow: `0`);
5935	if (ret < `0`)
5936	goto out;
5937	if (ret > `0` && path->slots[`0`] > `0`) {
5938	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`] - `1`);
5939	if (key.objectid == clone_root->ino &&
5940	key.type == BTRFS_EXTENT_DATA_KEY)
5941	path->slots[`0`]--;
5942	}
5943
5944	while (true) {
5945	struct extent_buffer *leaf = path->nodes[`0`];
5946	int slot = path->slots[`0`];
5947	struct btrfs_file_extent_item *ei;
5948	u8 type;
5949	u64 ext_len;
5950	u64 clone_len;
5951	u64 clone_data_offset;
5952	bool crossed_src_i_size = false;
5953
5954	if (slot >= btrfs_header_nritems(eb: leaf)) {
5955	ret = btrfs_next_leaf(root: clone_root->root, path);
5956	if (ret < `0`)
5957	goto out;
5958	else if (ret > `0`)
5959	break;
5960	continue;
5961	}
5962
5963	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
5964
5965	/*
5966	* We might have an implicit trailing hole (NO_HOLES feature
5967	* enabled). We deal with it after leaving this loop.
5968	*/
5969	if (key.objectid != clone_root->ino \|\|
5970	key.type != BTRFS_EXTENT_DATA_KEY)
5971	break;
5972
5973	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5974	type = btrfs_file_extent_type(eb: leaf, s: ei);
5975	if (type == BTRFS_FILE_EXTENT_INLINE) {
5976	ext_len = btrfs_file_extent_ram_bytes(eb: leaf, s: ei);
5977	ext_len = PAGE_ALIGN(ext_len);
5978	} else {
5979	ext_len = btrfs_file_extent_num_bytes(eb: leaf, s: ei);
5980	}
5981
5982	if (key.offset + ext_len <= clone_root->offset)
5983	goto next;
5984
5985	if (key.offset > clone_root->offset) {
5986	/ Implicit hole, NO_HOLES feature enabled. /
5987	u64 hole_len = key.offset - clone_root->offset;
5988
5989	if (hole_len > len)
5990	hole_len = len;
5991	ret = send_extent_data(sctx, path: dst_path, offset,
5992	len: hole_len);
5993	if (ret < `0`)
5994	goto out;
5995
5996	len -= hole_len;
5997	if (len == `0`)
5998	break;
5999	offset += hole_len;
6000	clone_root->offset += hole_len;
6001	data_offset += hole_len;
6002	}
6003
6004	if (key.offset >= clone_root->offset + len)
6005	break;
6006
6007	if (key.offset >= clone_src_i_size)
6008	break;
6009
6010	if (key.offset + ext_len > clone_src_i_size) {
6011	ext_len = clone_src_i_size - key.offset;
6012	crossed_src_i_size = true;
6013	}
6014
6015	clone_data_offset = btrfs_file_extent_offset(eb: leaf, s: ei);
6016	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte) {
6017	clone_root->offset = key.offset;
6018	if (clone_data_offset < data_offset &&
6019	clone_data_offset + ext_len > data_offset) {
6020	u64 extent_offset;
6021
6022	extent_offset = data_offset - clone_data_offset;
6023	ext_len -= extent_offset;
6024	clone_data_offset += extent_offset;
6025	clone_root->offset += extent_offset;
6026	}
6027	}
6028
6029	clone_len = min_t(u64, ext_len, len);
6030
6031	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: ei) == disk_byte &&
6032	clone_data_offset == data_offset) {
6033	const u64 src_end = clone_root->offset + clone_len;
6034	const u64 sectorsize = SZ_64K;
6035
6036	/*
6037	* We can't clone the last block, when its size is not
6038	* sector size aligned, into the middle of a file. If we
6039	* do so, the receiver will get a failure (-EINVAL) when
6040	* trying to clone or will silently corrupt the data in
6041	* the destination file if it's on a kernel without the
6042	* fix introduced by commit ac765f83f1397646
6043	* ("Btrfs: fix data corruption due to cloning of eof
6044	* block).
6045	*
6046	* So issue a clone of the aligned down range plus a
6047	* regular write for the eof block, if we hit that case.
6048	*
6049	* Also, we use the maximum possible sector size, 64K,
6050	* because we don't know what's the sector size of the
6051	* filesystem that receives the stream, so we have to
6052	* assume the largest possible sector size.
6053	*/
6054	if (src_end == clone_src_i_size &&
6055	!IS_ALIGNED(src_end, sectorsize) &&
6056	offset + clone_len < sctx->cur_inode_size) {
6057	u64 slen;
6058
6059	slen = ALIGN_DOWN(src_end - clone_root->offset,
6060	sectorsize);
6061	if (slen > `0`) {
6062	ret = send_clone(sctx, offset, len: slen,
6063	clone_root);
6064	if (ret < `0`)
6065	goto out;
6066	}
6067	ret = send_extent_data(sctx, path: dst_path,
6068	offset: offset + slen,
6069	len: clone_len - slen);
6070	} else {
6071	ret = send_clone(sctx, offset, len: clone_len,
6072	clone_root);
6073	}
6074	} else if (crossed_src_i_size && clone_len < len) {
6075	/*
6076	* If we are at i_size of the clone source inode and we
6077	* can not clone from it, terminate the loop. This is
6078	* to avoid sending two write operations, one with a
6079	* length matching clone_len and the final one after
6080	* this loop with a length of len - clone_len.
6081	*
6082	* When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
6083	* was passed to the send ioctl), this helps avoid
6084	* sending an encoded write for an offset that is not
6085	* sector size aligned, in case the i_size of the source
6086	* inode is not sector size aligned. That will make the
6087	* receiver fallback to decompression of the data and
6088	* writing it using regular buffered IO, therefore while
6089	* not incorrect, it's not optimal due decompression and
6090	* possible re-compression at the receiver.
6091	*/
6092	break;
6093	} else {
6094	ret = send_extent_data(sctx, path: dst_path, offset,
6095	len: clone_len);
6096	}
6097
6098	if (ret < `0`)
6099	goto out;
6100
6101	len -= clone_len;
6102	if (len == `0`)
6103	break;
6104	offset += clone_len;
6105	clone_root->offset += clone_len;
6106
6107	/*
6108	* If we are cloning from the file we are currently processing,
6109	* and using the send root as the clone root, we must stop once
6110	* the current clone offset reaches the current eof of the file
6111	* at the receiver, otherwise we would issue an invalid clone
6112	* operation (source range going beyond eof) and cause the
6113	* receiver to fail. So if we reach the current eof, bail out
6114	* and fallback to a regular write.
6115	*/
6116	if (clone_root->root == sctx->send_root &&
6117	clone_root->ino == sctx->cur_ino &&
6118	clone_root->offset >= sctx->cur_inode_next_write_offset)
6119	break;
6120
6121	data_offset += clone_len;
6122	next:
6123	path->slots[`0`]++;
6124	}
6125
6126	if (len > `0`)
6127	ret = send_extent_data(sctx, path: dst_path, offset, len);
6128	else
6129	ret = `0`;
6130	out:
6131	btrfs_free_path(p: path);
6132	return ret;
6133	}
6134
6135	static int send_write_or_clone(struct send_ctx *sctx,
6136	struct btrfs_path *path,
6137	struct btrfs_key *key,
6138	struct clone_root *clone_root)
6139	{
6140	int ret = `0`;
6141	u64 offset = key->offset;
6142	u64 end;
6143	u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
6144
6145	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
6146	if (offset >= end)
6147	return `0`;
6148
6149	if (clone_root && IS_ALIGNED(end, bs)) {
6150	struct btrfs_file_extent_item *ei;
6151	u64 disk_byte;
6152	u64 data_offset;
6153
6154	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6155	struct btrfs_file_extent_item);
6156	disk_byte = btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei);
6157	data_offset = btrfs_file_extent_offset(eb: path->nodes[`0`], s: ei);
6158	ret = clone_range(sctx, dst_path: path, clone_root, disk_byte,
6159	data_offset, offset, len: end - offset);
6160	} else {
6161	ret = send_extent_data(sctx, path, offset, len: end - offset);
6162	}
6163	sctx->cur_inode_next_write_offset = end;
6164	return ret;
6165	}
6166
6167	static int is_extent_unchanged(struct send_ctx *sctx,
6168	struct btrfs_path *left_path,
6169	struct btrfs_key *ekey)
6170	{
6171	int ret = `0`;
6172	struct btrfs_key key;
6173	struct btrfs_path *path = NULL;
6174	struct extent_buffer *eb;
6175	int slot;
6176	struct btrfs_key found_key;
6177	struct btrfs_file_extent_item *ei;
6178	u64 left_disknr;
6179	u64 right_disknr;
6180	u64 left_offset;
6181	u64 right_offset;
6182	u64 left_offset_fixed;
6183	u64 left_len;
6184	u64 right_len;
6185	u64 left_gen;
6186	u64 right_gen;
6187	u8 left_type;
6188	u8 right_type;
6189
6190	path = alloc_path_for_send();
6191	if (!path)
6192	return -ENOMEM;
6193
6194	eb = left_path->nodes[`0`];
6195	slot = left_path->slots[`0`];
6196	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6197	left_type = btrfs_file_extent_type(eb, s: ei);
6198
6199	if (left_type != BTRFS_FILE_EXTENT_REG) {
6200	ret = `0`;
6201	goto out;
6202	}
6203	left_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6204	left_len = btrfs_file_extent_num_bytes(eb, s: ei);
6205	left_offset = btrfs_file_extent_offset(eb, s: ei);
6206	left_gen = btrfs_file_extent_generation(eb, s: ei);
6207
6208	/*
6209	* Following comments will refer to these graphics. L is the left
6210	* extents which we are checking at the moment. 1-8 are the right
6211	* extents that we iterate.
6212	*
6213	* \|-----L-----\|
6214	* \|-1-\|-2a-\|-3-\|-4-\|-5-\|-6-\|
6215	*
6216	* \|-----L-----\|
6217	* \|--1--\|-2b-\|...(same as above)
6218	*
6219	* Alternative situation. Happens on files where extents got split.
6220	* \|-----L-----\|
6221	* \|-----------7-----------\|-6-\|
6222	*
6223	* Alternative situation. Happens on files which got larger.
6224	* \|-----L-----\|
6225	* \|-8-\|
6226	* Nothing follows after 8.
6227	*/
6228
6229	key.objectid = ekey->objectid;
6230	key.type = BTRFS_EXTENT_DATA_KEY;
6231	key.offset = ekey->offset;
6232	ret = btrfs_search_slot_for_read(root: sctx->parent_root, key: &key, p: path, find_higher: `0`, return_any: `0`);
6233	if (ret < `0`)
6234	goto out;
6235	if (ret) {
6236	ret = `0`;
6237	goto out;
6238	}
6239
6240	/*
6241	* Handle special case where the right side has no extents at all.
6242	*/
6243	eb = path->nodes[`0`];
6244	slot = path->slots[`0`];
6245	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6246	if (found_key.objectid != key.objectid \|\|
6247	found_key.type != key.type) {
6248	/ If we're a hole then just pretend nothing changed /
6249	ret = (left_disknr) ? `0` : `1`;
6250	goto out;
6251	}
6252
6253	/*
6254	* We're now on 2a, 2b or 7.
6255	*/
6256	key = found_key;
6257	while (key.offset < ekey->offset + left_len) {
6258	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
6259	right_type = btrfs_file_extent_type(eb, s: ei);
6260	if (right_type != BTRFS_FILE_EXTENT_REG &&
6261	right_type != BTRFS_FILE_EXTENT_INLINE) {
6262	ret = `0`;
6263	goto out;
6264	}
6265
6266	if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6267	right_len = btrfs_file_extent_ram_bytes(eb, s: ei);
6268	right_len = PAGE_ALIGN(right_len);
6269	} else {
6270	right_len = btrfs_file_extent_num_bytes(eb, s: ei);
6271	}
6272
6273	/*
6274	* Are we at extent 8? If yes, we know the extent is changed.
6275	* This may only happen on the first iteration.
6276	*/
6277	if (found_key.offset + right_len <= ekey->offset) {
6278	/ If we're a hole just pretend nothing changed /
6279	ret = (left_disknr) ? `0` : `1`;
6280	goto out;
6281	}
6282
6283	/*
6284	* We just wanted to see if when we have an inline extent, what
6285	* follows it is a regular extent (wanted to check the above
6286	* condition for inline extents too). This should normally not
6287	* happen but it's possible for example when we have an inline
6288	* compressed extent representing data with a size matching
6289	* the page size (currently the same as sector size).
6290	*/
6291	if (right_type == BTRFS_FILE_EXTENT_INLINE) {
6292	ret = `0`;
6293	goto out;
6294	}
6295
6296	right_disknr = btrfs_file_extent_disk_bytenr(eb, s: ei);
6297	right_offset = btrfs_file_extent_offset(eb, s: ei);
6298	right_gen = btrfs_file_extent_generation(eb, s: ei);
6299
6300	left_offset_fixed = left_offset;
6301	if (key.offset < ekey->offset) {
6302	/ Fix the right offset for 2a and 7. /
6303	right_offset += ekey->offset - key.offset;
6304	} else {
6305	/ Fix the left offset for all behind 2a and 2b /
6306	left_offset_fixed += key.offset - ekey->offset;
6307	}
6308
6309	/*
6310	* Check if we have the same extent.
6311	*/
6312	if (left_disknr != right_disknr \|\|
6313	left_offset_fixed != right_offset \|\|
6314	left_gen != right_gen) {
6315	ret = `0`;
6316	goto out;
6317	}
6318
6319	/*
6320	* Go to the next extent.
6321	*/
6322	ret = btrfs_next_item(root: sctx->parent_root, p: path);
6323	if (ret < `0`)
6324	goto out;
6325	if (!ret) {
6326	eb = path->nodes[`0`];
6327	slot = path->slots[`0`];
6328	btrfs_item_key_to_cpu(eb, cpu_key: &found_key, nr: slot);
6329	}
6330	if (ret \|\| found_key.objectid != key.objectid \|\|
6331	found_key.type != key.type) {
6332	key.offset += right_len;
6333	break;
6334	}
6335	if (found_key.offset != key.offset + right_len) {
6336	ret = `0`;
6337	goto out;
6338	}
6339	key = found_key;
6340	}
6341
6342	/*
6343	* We're now behind the left extent (treat as unchanged) or at the end
6344	* of the right side (treat as changed).
6345	*/
6346	if (key.offset >= ekey->offset + left_len)
6347	ret = `1`;
6348	else
6349	ret = `0`;
6350
6351
6352	out:
6353	btrfs_free_path(p: path);
6354	return ret;
6355	}
6356
6357	static int get_last_extent(struct send_ctx *sctx, u64 offset)
6358	{
6359	struct btrfs_path *path;
6360	struct btrfs_root *root = sctx->send_root;
6361	struct btrfs_key key;
6362	int ret;
6363
6364	path = alloc_path_for_send();
6365	if (!path)
6366	return -ENOMEM;
6367
6368	sctx->cur_inode_last_extent = `0`;
6369
6370	key.objectid = sctx->cur_ino;
6371	key.type = BTRFS_EXTENT_DATA_KEY;
6372	key.offset = offset;
6373	ret = btrfs_search_slot_for_read(root, key: &key, p: path, find_higher: `0`, return_any: `1`);
6374	if (ret < `0`)
6375	goto out;
6376	ret = `0`;
6377	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
6378	if (key.objectid != sctx->cur_ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
6379	goto out;
6380
6381	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6382	out:
6383	btrfs_free_path(p: path);
6384	return ret;
6385	}
6386
6387	static int range_is_hole_in_parent(struct send_ctx *sctx,
6388	const u64 start,
6389	const u64 end)
6390	{
6391	struct btrfs_path *path;
6392	struct btrfs_key key;
6393	struct btrfs_root *root = sctx->parent_root;
6394	u64 search_start = start;
6395	int ret;
6396
6397	path = alloc_path_for_send();
6398	if (!path)
6399	return -ENOMEM;
6400
6401	key.objectid = sctx->cur_ino;
6402	key.type = BTRFS_EXTENT_DATA_KEY;
6403	key.offset = search_start;
6404	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
6405	if (ret < `0`)
6406	goto out;
6407	if (ret > `0` && path->slots[`0`] > `0`)
6408	path->slots[`0`]--;
6409
6410	while (search_start < end) {
6411	struct extent_buffer *leaf = path->nodes[`0`];
6412	int slot = path->slots[`0`];
6413	struct btrfs_file_extent_item *fi;
6414	u64 extent_end;
6415
6416	if (slot >= btrfs_header_nritems(eb: leaf)) {
6417	ret = btrfs_next_leaf(root, path);
6418	if (ret < `0`)
6419	goto out;
6420	else if (ret > `0`)
6421	break;
6422	continue;
6423	}
6424
6425	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
6426	if (key.objectid < sctx->cur_ino \|\|
6427	key.type < BTRFS_EXTENT_DATA_KEY)
6428	goto next;
6429	if (key.objectid > sctx->cur_ino \|\|
6430	key.type > BTRFS_EXTENT_DATA_KEY \|\|
6431	key.offset >= end)
6432	break;
6433
6434	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6435	extent_end = btrfs_file_extent_end(path);
6436	if (extent_end <= start)
6437	goto next;
6438	if (btrfs_file_extent_disk_bytenr(eb: leaf, s: fi) == `0`) {
6439	search_start = extent_end;
6440	goto next;
6441	}
6442	ret = `0`;
6443	goto out;
6444	next:
6445	path->slots[`0`]++;
6446	}
6447	ret = `1`;
6448	out:
6449	btrfs_free_path(p: path);
6450	return ret;
6451	}
6452
6453	static int maybe_send_hole(struct send_ctx sctx, struct* btrfs_path *path,
6454	struct btrfs_key *key)
6455	{
6456	int ret = `0`;
6457
6458	if (sctx->cur_ino != key->objectid \|\| !need_send_hole(sctx))
6459	return `0`;
6460
6461	if (sctx->cur_inode_last_extent == (u64)-`1`) {
6462	ret = get_last_extent(sctx, offset: key->offset - `1`);
6463	if (ret)
6464	return ret;
6465	}
6466
6467	if (path->slots[`0`] == `0` &&
6468	sctx->cur_inode_last_extent < key->offset) {
6469	/*
6470	* We might have skipped entire leafs that contained only
6471	* file extent items for our current inode. These leafs have
6472	* a generation number smaller (older) than the one in the
6473	* current leaf and the leaf our last extent came from, and
6474	* are located between these 2 leafs.
6475	*/
6476	ret = get_last_extent(sctx, offset: key->offset - `1`);
6477	if (ret)
6478	return ret;
6479	}
6480
6481	if (sctx->cur_inode_last_extent < key->offset) {
6482	ret = range_is_hole_in_parent(sctx,
6483	start: sctx->cur_inode_last_extent,
6484	end: key->offset);
6485	if (ret < `0`)
6486	return ret;
6487	else if (ret == `0`)
6488	ret = send_hole(sctx, end: key->offset);
6489	else
6490	ret = `0`;
6491	}
6492	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
6493	return ret;
6494	}
6495
6496	static int process_extent(struct send_ctx *sctx,
6497	struct btrfs_path *path,
6498	struct btrfs_key *key)
6499	{
6500	struct clone_root *found_clone = NULL;
6501	int ret = `0`;
6502
6503	if (S_ISLNK(sctx->cur_inode_mode))
6504	return `0`;
6505
6506	if (sctx->parent_root && !sctx->cur_inode_new) {
6507	ret = is_extent_unchanged(sctx, left_path: path, ekey: key);
6508	if (ret < `0`)
6509	goto out;
6510	if (ret) {
6511	ret = `0`;
6512	goto out_hole;
6513	}
6514	} else {
6515	struct btrfs_file_extent_item *ei;
6516	u8 type;
6517
6518	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
6519	struct btrfs_file_extent_item);
6520	type = btrfs_file_extent_type(eb: path->nodes[`0`], s: ei);
6521	if (type == BTRFS_FILE_EXTENT_PREALLOC \|\|
6522	type == BTRFS_FILE_EXTENT_REG) {
6523	/*
6524	* The send spec does not have a prealloc command yet,
6525	* so just leave a hole for prealloc'ed extents until
6526	* we have enough commands queued up to justify rev'ing
6527	* the send spec.
6528	*/
6529	if (type == BTRFS_FILE_EXTENT_PREALLOC) {
6530	ret = `0`;
6531	goto out;
6532	}
6533
6534	/ Have a hole, just skip it. /
6535	if (btrfs_file_extent_disk_bytenr(eb: path->nodes[`0`], s: ei) == `0`) {
6536	ret = `0`;
6537	goto out;
6538	}
6539	}
6540	}
6541
6542	ret = find_extent_clone(sctx, path, ino: key->objectid, data_offset: key->offset,
6543	ino_size: sctx->cur_inode_size, found: &found_clone);
6544	if (ret != -ENOENT && ret < `0`)
6545	goto out;
6546
6547	ret = send_write_or_clone(sctx, path, key, clone_root: found_clone);
6548	if (ret)
6549	goto out;
6550	out_hole:
6551	ret = maybe_send_hole(sctx, path, key);
6552	out:
6553	return ret;
6554	}
6555
6556	static int process_all_extents(struct send_ctx *sctx)
6557	{
6558	int ret = `0`;
6559	int iter_ret = `0`;
6560	struct btrfs_root *root;
6561	struct btrfs_path *path;
6562	struct btrfs_key key;
6563	struct btrfs_key found_key;
6564
6565	root = sctx->send_root;
6566	path = alloc_path_for_send();
6567	if (!path)
6568	return -ENOMEM;
6569
6570	key.objectid = sctx->cmp_key->objectid;
6571	key.type = BTRFS_EXTENT_DATA_KEY;
6572	key.offset = `0`;
6573	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
6574	if (found_key.objectid != key.objectid \|\|
6575	found_key.type != key.type) {
6576	ret = `0`;
6577	break;
6578	}
6579
6580	ret = process_extent(sctx, path, key: &found_key);
6581	if (ret < `0`)
6582	break;
6583	}
6584	/ Catch error found during iteration /
6585	if (iter_ret < `0`)
6586	ret = iter_ret;
6587
6588	btrfs_free_path(p: path);
6589	return ret;
6590	}
6591
6592	static int process_recorded_refs_if_needed(struct send_ctx sctx, int* at_end,
6593	int *pending_move,
6594	int *refs_processed)
6595	{
6596	int ret = `0`;
6597
6598	if (sctx->cur_ino == `0`)
6599	goto out;
6600	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6601	sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6602	goto out;
6603	if (list_empty(head: &sctx->new_refs) && list_empty(head: &sctx->deleted_refs))
6604	goto out;
6605
6606	ret = process_recorded_refs(sctx, pending_move);
6607	if (ret < `0`)
6608	goto out;
6609
6610	*refs_processed = `1`;
6611	out:
6612	return ret;
6613	}
6614
6615	static int finish_inode_if_needed(struct send_ctx sctx, int* at_end)
6616	{
6617	int ret = `0`;
6618	struct btrfs_inode_info info;
6619	u64 left_mode;
6620	u64 left_uid;
6621	u64 left_gid;
6622	u64 left_fileattr;
6623	u64 right_mode;
6624	u64 right_uid;
6625	u64 right_gid;
6626	u64 right_fileattr;
6627	int need_chmod = `0`;
6628	int need_chown = `0`;
6629	bool need_fileattr = false;
6630	int need_truncate = `1`;
6631	int pending_move = `0`;
6632	int refs_processed = `0`;
6633
6634	if (sctx->ignore_cur_inode)
6635	return `0`;
6636
6637	ret = process_recorded_refs_if_needed(sctx, at_end, pending_move: &pending_move,
6638	refs_processed: &refs_processed);
6639	if (ret < `0`)
6640	goto out;
6641
6642	/*
6643	* We have processed the refs and thus need to advance send_progress.
6644	* Now, calls to get_cur_xxx will take the updated refs of the current
6645	* inode into account.
6646	*
6647	* On the other hand, if our current inode is a directory and couldn't
6648	* be moved/renamed because its parent was renamed/moved too and it has
6649	* a higher inode number, we can only move/rename our current inode
6650	* after we moved/renamed its parent. Therefore in this case operate on
6651	* the old path (pre move/rename) of our current inode, and the
6652	* move/rename will be performed later.
6653	*/
6654	if (refs_processed && !pending_move)
6655	sctx->send_progress = sctx->cur_ino + `1`;
6656
6657	if (sctx->cur_ino == `0` \|\| sctx->cur_inode_deleted)
6658	goto out;
6659	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6660	goto out;
6661	ret = get_inode_info(root: sctx->send_root, ino: sctx->cur_ino, info: &info);
6662	if (ret < `0`)
6663	goto out;
6664	left_mode = info.mode;
6665	left_uid = info.uid;
6666	left_gid = info.gid;
6667	left_fileattr = info.fileattr;
6668
6669	if (!sctx->parent_root \|\| sctx->cur_inode_new) {
6670	need_chown = `1`;
6671	if (!S_ISLNK(sctx->cur_inode_mode))
6672	need_chmod = `1`;
6673	if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6674	need_truncate = `0`;
6675	} else {
6676	u64 old_size;
6677
6678	ret = get_inode_info(root: sctx->parent_root, ino: sctx->cur_ino, info: &info);
6679	if (ret < `0`)
6680	goto out;
6681	old_size = info.size;
6682	right_mode = info.mode;
6683	right_uid = info.uid;
6684	right_gid = info.gid;
6685	right_fileattr = info.fileattr;
6686
6687	if (left_uid != right_uid \|\| left_gid != right_gid)
6688	need_chown = `1`;
6689	if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6690	need_chmod = `1`;
6691	if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
6692	need_fileattr = true;
6693	if ((old_size == sctx->cur_inode_size) \|\|
6694	(sctx->cur_inode_size > old_size &&
6695	sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6696	need_truncate = `0`;
6697	}
6698
6699	if (S_ISREG(sctx->cur_inode_mode)) {
6700	if (need_send_hole(sctx)) {
6701	if (sctx->cur_inode_last_extent == (u64)-`1` \|\|
6702	sctx->cur_inode_last_extent <
6703	sctx->cur_inode_size) {
6704	ret = get_last_extent(sctx, offset: (u64)-`1`);
6705	if (ret)
6706	goto out;
6707	}
6708	if (sctx->cur_inode_last_extent <
6709	sctx->cur_inode_size) {
6710	ret = send_hole(sctx, end: sctx->cur_inode_size);
6711	if (ret)
6712	goto out;
6713	}
6714	}
6715	if (need_truncate) {
6716	ret = send_truncate(sctx, ino: sctx->cur_ino,
6717	gen: sctx->cur_inode_gen,
6718	size: sctx->cur_inode_size);
6719	if (ret < `0`)
6720	goto out;
6721	}
6722	}
6723
6724	if (need_chown) {
6725	ret = send_chown(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6726	uid: left_uid, gid: left_gid);
6727	if (ret < `0`)
6728	goto out;
6729	}
6730	if (need_chmod) {
6731	ret = send_chmod(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6732	mode: left_mode);
6733	if (ret < `0`)
6734	goto out;
6735	}
6736	if (need_fileattr) {
6737	ret = send_fileattr(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen,
6738	fileattr: left_fileattr);
6739	if (ret < `0`)
6740	goto out;
6741	}
6742
6743	if (proto_cmd_ok(sctx, cmd: BTRFS_SEND_C_ENABLE_VERITY)
6744	&& sctx->cur_inode_needs_verity) {
6745	ret = process_verity(sctx);
6746	if (ret < `0`)
6747	goto out;
6748	}
6749
6750	ret = send_capabilities(sctx);
6751	if (ret < `0`)
6752	goto out;
6753
6754	/*
6755	* If other directory inodes depended on our current directory
6756	* inode's move/rename, now do their move/rename operations.
6757	*/
6758	if (!is_waiting_for_move(sctx, ino: sctx->cur_ino)) {
6759	ret = apply_children_dir_moves(sctx);
6760	if (ret)
6761	goto out;
6762	/*
6763	* Need to send that every time, no matter if it actually
6764	* changed between the two trees as we have done changes to
6765	* the inode before. If our inode is a directory and it's
6766	* waiting to be moved/renamed, we will send its utimes when
6767	* it's moved/renamed, therefore we don't need to do it here.
6768	*/
6769	sctx->send_progress = sctx->cur_ino + `1`;
6770
6771	/*
6772	* If the current inode is a non-empty directory, delay issuing
6773	* the utimes command for it, as it's very likely we have inodes
6774	* with an higher number inside it. We want to issue the utimes
6775	* command only after adding all dentries to it.
6776	*/
6777	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > `0`)
6778	ret = cache_dir_utimes(sctx, dir: sctx->cur_ino, gen: sctx->cur_inode_gen);
6779	else
6780	ret = send_utimes(sctx, ino: sctx->cur_ino, gen: sctx->cur_inode_gen);
6781
6782	if (ret < `0`)
6783	goto out;
6784	}
6785
6786	out:
6787	if (!ret)
6788	ret = trim_dir_utimes_cache(sctx);
6789
6790	return ret;
6791	}
6792
6793	static void close_current_inode(struct send_ctx *sctx)
6794	{
6795	u64 i_size;
6796
6797	if (sctx->cur_inode == NULL)
6798	return;
6799
6800	i_size = i_size_read(inode: sctx->cur_inode);
6801
6802	/*
6803	* If we are doing an incremental send, we may have extents between the
6804	* last processed extent and the i_size that have not been processed
6805	* because they haven't changed but we may have read some of their pages
6806	* through readahead, see the comments at send_extent_data().
6807	*/
6808	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
6809	truncate_inode_pages_range(&sctx->cur_inode->i_data,
6810	lstart: sctx->page_cache_clear_start,
6811	round_up(i_size, PAGE_SIZE) - `1`);
6812
6813	iput(sctx->cur_inode);
6814	sctx->cur_inode = NULL;
6815	}
6816
6817	static int changed_inode(struct send_ctx *sctx,
6818	enum btrfs_compare_tree_result result)
6819	{
6820	int ret = `0`;
6821	struct btrfs_key *key = sctx->cmp_key;
6822	struct btrfs_inode_item *left_ii = NULL;
6823	struct btrfs_inode_item *right_ii = NULL;
6824	u64 left_gen = `0`;
6825	u64 right_gen = `0`;
6826
6827	close_current_inode(sctx);
6828
6829	sctx->cur_ino = key->objectid;
6830	sctx->cur_inode_new_gen = false;
6831	sctx->cur_inode_last_extent = (u64)-`1`;
6832	sctx->cur_inode_next_write_offset = `0`;
6833	sctx->ignore_cur_inode = false;
6834
6835	/*
6836	* Set send_progress to current inode. This will tell all get_cur_xxx
6837	* functions that the current inode's refs are not updated yet. Later,
6838	* when process_recorded_refs is finished, it is set to cur_ino + 1.
6839	*/
6840	sctx->send_progress = sctx->cur_ino;
6841
6842	if (result == BTRFS_COMPARE_TREE_NEW \|\|
6843	result == BTRFS_COMPARE_TREE_CHANGED) {
6844	left_ii = btrfs_item_ptr(sctx->left_path->nodes[`0`],
6845	sctx->left_path->slots[`0`],
6846	struct btrfs_inode_item);
6847	left_gen = btrfs_inode_generation(eb: sctx->left_path->nodes[`0`],
6848	s: left_ii);
6849	} else {
6850	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6851	sctx->right_path->slots[`0`],
6852	struct btrfs_inode_item);
6853	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6854	s: right_ii);
6855	}
6856	if (result == BTRFS_COMPARE_TREE_CHANGED) {
6857	right_ii = btrfs_item_ptr(sctx->right_path->nodes[`0`],
6858	sctx->right_path->slots[`0`],
6859	struct btrfs_inode_item);
6860
6861	right_gen = btrfs_inode_generation(eb: sctx->right_path->nodes[`0`],
6862	s: right_ii);
6863
6864	/*
6865	* The cur_ino = root dir case is special here. We can't treat
6866	* the inode as deleted+reused because it would generate a
6867	* stream that tries to delete/mkdir the root dir.
6868	*/
6869	if (left_gen != right_gen &&
6870	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6871	sctx->cur_inode_new_gen = true;
6872	}
6873
6874	/*
6875	* Normally we do not find inodes with a link count of zero (orphans)
6876	* because the most common case is to create a snapshot and use it
6877	* for a send operation. However other less common use cases involve
6878	* using a subvolume and send it after turning it to RO mode just
6879	* after deleting all hard links of a file while holding an open
6880	* file descriptor against it or turning a RO snapshot into RW mode,
6881	* keep an open file descriptor against a file, delete it and then
6882	* turn the snapshot back to RO mode before using it for a send
6883	* operation. The former is what the receiver operation does.
6884	* Therefore, if we want to send these snapshots soon after they're
6885	* received, we need to handle orphan inodes as well. Moreover, orphans
6886	* can appear not only in the send snapshot but also in the parent
6887	* snapshot. Here are several cases:
6888	*
6889	* Case 1: BTRFS_COMPARE_TREE_NEW
6890	* \| send snapshot \| action
6891	* --------------------------------
6892	* nlink \| 0 \| ignore
6893	*
6894	* Case 2: BTRFS_COMPARE_TREE_DELETED
6895	* \| parent snapshot \| action
6896	* ----------------------------------
6897	* nlink \| 0 \| as usual
6898	* Note: No unlinks will be sent because there're no paths for it.
6899	*
6900	* Case 3: BTRFS_COMPARE_TREE_CHANGED
6901	* \| \| parent snapshot \| send snapshot \| action
6902	* -----------------------------------------------------------------------
6903	* subcase 1 \| nlink \| 0 \| 0 \| ignore
6904	* subcase 2 \| nlink \| >0 \| 0 \| new_gen(deletion)
6905	* subcase 3 \| nlink \| 0 \| >0 \| new_gen(creation)
6906	*
6907	*/
6908	if (result == BTRFS_COMPARE_TREE_NEW) {
6909	if (btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii) == `0`) {
6910	sctx->ignore_cur_inode = true;
6911	goto out;
6912	}
6913	sctx->cur_inode_gen = left_gen;
6914	sctx->cur_inode_new = true;
6915	sctx->cur_inode_deleted = false;
6916	sctx->cur_inode_size = btrfs_inode_size(
6917	eb: sctx->left_path->nodes[`0`], s: left_ii);
6918	sctx->cur_inode_mode = btrfs_inode_mode(
6919	eb: sctx->left_path->nodes[`0`], s: left_ii);
6920	sctx->cur_inode_rdev = btrfs_inode_rdev(
6921	eb: sctx->left_path->nodes[`0`], s: left_ii);
6922	if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6923	ret = send_create_inode_if_needed(sctx);
6924	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
6925	sctx->cur_inode_gen = right_gen;
6926	sctx->cur_inode_new = false;
6927	sctx->cur_inode_deleted = true;
6928	sctx->cur_inode_size = btrfs_inode_size(
6929	eb: sctx->right_path->nodes[`0`], s: right_ii);
6930	sctx->cur_inode_mode = btrfs_inode_mode(
6931	eb: sctx->right_path->nodes[`0`], s: right_ii);
6932	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6933	u32 new_nlinks, old_nlinks;
6934
6935	new_nlinks = btrfs_inode_nlink(eb: sctx->left_path->nodes[`0`], s: left_ii);
6936	old_nlinks = btrfs_inode_nlink(eb: sctx->right_path->nodes[`0`], s: right_ii);
6937	if (new_nlinks == `0` && old_nlinks == `0`) {
6938	sctx->ignore_cur_inode = true;
6939	goto out;
6940	} else if (new_nlinks == `0` \|\| old_nlinks == `0`) {
6941	sctx->cur_inode_new_gen = `1`;
6942	}
6943	/*
6944	* We need to do some special handling in case the inode was
6945	* reported as changed with a changed generation number. This
6946	* means that the original inode was deleted and new inode
6947	* reused the same inum. So we have to treat the old inode as
6948	* deleted and the new one as new.
6949	*/
6950	if (sctx->cur_inode_new_gen) {
6951	/*
6952	* First, process the inode as if it was deleted.
6953	*/
6954	if (old_nlinks > `0`) {
6955	sctx->cur_inode_gen = right_gen;
6956	sctx->cur_inode_new = false;
6957	sctx->cur_inode_deleted = true;
6958	sctx->cur_inode_size = btrfs_inode_size(
6959	eb: sctx->right_path->nodes[`0`], s: right_ii);
6960	sctx->cur_inode_mode = btrfs_inode_mode(
6961	eb: sctx->right_path->nodes[`0`], s: right_ii);
6962	ret = process_all_refs(sctx,
6963	cmd: BTRFS_COMPARE_TREE_DELETED);
6964	if (ret < `0`)
6965	goto out;
6966	}
6967
6968	/*
6969	* Now process the inode as if it was new.
6970	*/
6971	if (new_nlinks > `0`) {
6972	sctx->cur_inode_gen = left_gen;
6973	sctx->cur_inode_new = true;
6974	sctx->cur_inode_deleted = false;
6975	sctx->cur_inode_size = btrfs_inode_size(
6976	eb: sctx->left_path->nodes[`0`],
6977	s: left_ii);
6978	sctx->cur_inode_mode = btrfs_inode_mode(
6979	eb: sctx->left_path->nodes[`0`],
6980	s: left_ii);
6981	sctx->cur_inode_rdev = btrfs_inode_rdev(
6982	eb: sctx->left_path->nodes[`0`],
6983	s: left_ii);
6984	ret = send_create_inode_if_needed(sctx);
6985	if (ret < `0`)
6986	goto out;
6987
6988	ret = process_all_refs(sctx, cmd: BTRFS_COMPARE_TREE_NEW);
6989	if (ret < `0`)
6990	goto out;
6991	/*
6992	* Advance send_progress now as we did not get
6993	* into process_recorded_refs_if_needed in the
6994	* new_gen case.
6995	*/
6996	sctx->send_progress = sctx->cur_ino + `1`;
6997
6998	/*
6999	* Now process all extents and xattrs of the
7000	* inode as if they were all new.
7001	*/
7002	ret = process_all_extents(sctx);
7003	if (ret < `0`)
7004	goto out;
7005	ret = process_all_new_xattrs(sctx);
7006	if (ret < `0`)
7007	goto out;
7008	}
7009	} else {
7010	sctx->cur_inode_gen = left_gen;
7011	sctx->cur_inode_new = false;
7012	sctx->cur_inode_new_gen = false;
7013	sctx->cur_inode_deleted = false;
7014	sctx->cur_inode_size = btrfs_inode_size(
7015	eb: sctx->left_path->nodes[`0`], s: left_ii);
7016	sctx->cur_inode_mode = btrfs_inode_mode(
7017	eb: sctx->left_path->nodes[`0`], s: left_ii);
7018	}
7019	}
7020
7021	out:
7022	return ret;
7023	}
7024
7025	/*
7026	* We have to process new refs before deleted refs, but compare_trees gives us
7027	* the new and deleted refs mixed. To fix this, we record the new/deleted refs
7028	* first and later process them in process_recorded_refs.
7029	* For the cur_inode_new_gen case, we skip recording completely because
7030	* changed_inode did already initiate processing of refs. The reason for this is
7031	* that in this case, compare_tree actually compares the refs of 2 different
7032	* inodes. To fix this, process_all_refs is used in changed_inode to handle all
7033	* refs of the right tree as deleted and all refs of the left tree as new.
7034	*/
7035	static int changed_ref(struct send_ctx *sctx,
7036	enum btrfs_compare_tree_result result)
7037	{
7038	int ret = `0`;
7039
7040	if (sctx->cur_ino != sctx->cmp_key->objectid) {
7041	inconsistent_snapshot_error(sctx, result, what: "reference");
7042	return -EIO;
7043	}
7044
7045	if (!sctx->cur_inode_new_gen &&
7046	sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
7047	if (result == BTRFS_COMPARE_TREE_NEW)
7048	ret = record_new_ref(sctx);
7049	else if (result == BTRFS_COMPARE_TREE_DELETED)
7050	ret = record_deleted_ref(sctx);
7051	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7052	ret = record_changed_ref(sctx);
7053	}
7054
7055	return ret;
7056	}
7057
7058	/*
7059	* Process new/deleted/changed xattrs. We skip processing in the
7060	* cur_inode_new_gen case because changed_inode did already initiate processing
7061	* of xattrs. The reason is the same as in changed_ref
7062	*/
7063	static int changed_xattr(struct send_ctx *sctx,
7064	enum btrfs_compare_tree_result result)
7065	{
7066	int ret = `0`;
7067
7068	if (sctx->cur_ino != sctx->cmp_key->objectid) {
7069	inconsistent_snapshot_error(sctx, result, what: "xattr");
7070	return -EIO;
7071	}
7072
7073	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7074	if (result == BTRFS_COMPARE_TREE_NEW)
7075	ret = process_new_xattr(sctx);
7076	else if (result == BTRFS_COMPARE_TREE_DELETED)
7077	ret = process_deleted_xattr(sctx);
7078	else if (result == BTRFS_COMPARE_TREE_CHANGED)
7079	ret = process_changed_xattr(sctx);
7080	}
7081
7082	return ret;
7083	}
7084
7085	/*
7086	* Process new/deleted/changed extents. We skip processing in the
7087	* cur_inode_new_gen case because changed_inode did already initiate processing
7088	* of extents. The reason is the same as in changed_ref
7089	*/
7090	static int changed_extent(struct send_ctx *sctx,
7091	enum btrfs_compare_tree_result result)
7092	{
7093	int ret = `0`;
7094
7095	/*
7096	* We have found an extent item that changed without the inode item
7097	* having changed. This can happen either after relocation (where the
7098	* disk_bytenr of an extent item is replaced at
7099	* relocation.c:replace_file_extents()) or after deduplication into a
7100	* file in both the parent and send snapshots (where an extent item can
7101	* get modified or replaced with a new one). Note that deduplication
7102	* updates the inode item, but it only changes the iversion (sequence
7103	* field in the inode item) of the inode, so if a file is deduplicated
7104	* the same amount of times in both the parent and send snapshots, its
7105	* iversion becomes the same in both snapshots, whence the inode item is
7106	* the same on both snapshots.
7107	*/
7108	if (sctx->cur_ino != sctx->cmp_key->objectid)
7109	return `0`;
7110
7111	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7112	if (result != BTRFS_COMPARE_TREE_DELETED)
7113	ret = process_extent(sctx, path: sctx->left_path,
7114	key: sctx->cmp_key);
7115	}
7116
7117	return ret;
7118	}
7119
7120	static int changed_verity(struct send_ctx sctx, enum* btrfs_compare_tree_result result)
7121	{
7122	int ret = `0`;
7123
7124	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
7125	if (result == BTRFS_COMPARE_TREE_NEW)
7126	sctx->cur_inode_needs_verity = true;
7127	}
7128	return ret;
7129	}
7130
7131	static int dir_changed(struct send_ctx *sctx, u64 dir)
7132	{
7133	u64 orig_gen, new_gen;
7134	int ret;
7135
7136	ret = get_inode_gen(root: sctx->send_root, ino: dir, gen: &new_gen);
7137	if (ret)
7138	return ret;
7139
7140	ret = get_inode_gen(root: sctx->parent_root, ino: dir, gen: &orig_gen);
7141	if (ret)
7142	return ret;
7143
7144	return (orig_gen != new_gen) ? `1` : `0`;
7145	}
7146
7147	static int compare_refs(struct send_ctx sctx, struct* btrfs_path *path,
7148	struct btrfs_key *key)
7149	{
7150	struct btrfs_inode_extref *extref;
7151	struct extent_buffer *leaf;
7152	u64 dirid = `0`, last_dirid = `0`;
7153	unsigned long ptr;
7154	u32 item_size;
7155	u32 cur_offset = `0`;
7156	int ref_name_len;
7157	int ret = `0`;
7158
7159	/ Easy case, just check this one dirid /
7160	if (key->type == BTRFS_INODE_REF_KEY) {
7161	dirid = key->offset;
7162
7163	ret = dir_changed(sctx, dir: dirid);
7164	goto out;
7165	}
7166
7167	leaf = path->nodes[`0`];
7168	item_size = btrfs_item_size(eb: leaf, slot: path->slots[`0`]);
7169	ptr = btrfs_item_ptr_offset(leaf, path->slots[`0`]);
7170	while (cur_offset < item_size) {
7171	extref = (struct btrfs_inode_extref *)(ptr +
7172	cur_offset);
7173	dirid = btrfs_inode_extref_parent(eb: leaf, s: extref);
7174	ref_name_len = btrfs_inode_extref_name_len(eb: leaf, s: extref);
7175	cur_offset += ref_name_len + sizeof(*extref);
7176	if (dirid == last_dirid)
7177	continue;
7178	ret = dir_changed(sctx, dir: dirid);
7179	if (ret)
7180	break;
7181	last_dirid = dirid;
7182	}
7183	out:
7184	return ret;
7185	}
7186
7187	/*
7188	* Updates compare related fields in sctx and simply forwards to the actual
7189	* changed_xxx functions.
7190	*/
7191	static int changed_cb(struct btrfs_path *left_path,
7192	struct btrfs_path *right_path,
7193	struct btrfs_key *key,
7194	enum btrfs_compare_tree_result result,
7195	struct send_ctx *sctx)
7196	{
7197	int ret = `0`;
7198
7199	/*
7200	* We can not hold the commit root semaphore here. This is because in
7201	* the case of sending and receiving to the same filesystem, using a
7202	* pipe, could result in a deadlock:
7203	*
7204	* 1) The task running send blocks on the pipe because it's full;
7205	*
7206	* 2) The task running receive, which is the only consumer of the pipe,
7207	* is waiting for a transaction commit (for example due to a space
7208	* reservation when doing a write or triggering a transaction commit
7209	* when creating a subvolume);
7210	*
7211	* 3) The transaction is waiting to write lock the commit root semaphore,
7212	* but can not acquire it since it's being held at 1).
7213	*
7214	* Down this call chain we write to the pipe through kernel_write().
7215	* The same type of problem can also happen when sending to a file that
7216	* is stored in the same filesystem - when reserving space for a write
7217	* into the file, we can trigger a transaction commit.
7218	*
7219	* Our caller has supplied us with clones of leaves from the send and
7220	* parent roots, so we're safe here from a concurrent relocation and
7221	* further reallocation of metadata extents while we are here. Below we
7222	* also assert that the leaves are clones.
7223	*/
7224	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
7225
7226	/*
7227	* We always have a send root, so left_path is never NULL. We will not
7228	* have a leaf when we have reached the end of the send root but have
7229	* not yet reached the end of the parent root.
7230	*/
7231	if (left_path->nodes[`0`])
7232	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7233	&left_path->nodes[`0`]->bflags));
7234	/*
7235	* When doing a full send we don't have a parent root, so right_path is
7236	* NULL. When doing an incremental send, we may have reached the end of
7237	* the parent root already, so we don't have a leaf at right_path.
7238	*/
7239	if (right_path && right_path->nodes[`0`])
7240	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
7241	&right_path->nodes[`0`]->bflags));
7242
7243	if (result == BTRFS_COMPARE_TREE_SAME) {
7244	if (key->type == BTRFS_INODE_REF_KEY \|\|
7245	key->type == BTRFS_INODE_EXTREF_KEY) {
7246	ret = compare_refs(sctx, path: left_path, key);
7247	if (!ret)
7248	return `0`;
7249	if (ret < `0`)
7250	return ret;
7251	} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
7252	return maybe_send_hole(sctx, path: left_path, key);
7253	} else {
7254	return `0`;
7255	}
7256	result = BTRFS_COMPARE_TREE_CHANGED;
7257	ret = `0`;
7258	}
7259
7260	sctx->left_path = left_path;
7261	sctx->right_path = right_path;
7262	sctx->cmp_key = key;
7263
7264	ret = finish_inode_if_needed(sctx, at_end: `0`);
7265	if (ret < `0`)
7266	goto out;
7267
7268	/ Ignore non-FS objects /
7269	if (key->objectid == BTRFS_FREE_INO_OBJECTID \|\|
7270	key->objectid == BTRFS_FREE_SPACE_OBJECTID)
7271	goto out;
7272
7273	if (key->type == BTRFS_INODE_ITEM_KEY) {
7274	ret = changed_inode(sctx, result);
7275	} else if (!sctx->ignore_cur_inode) {
7276	if (key->type == BTRFS_INODE_REF_KEY \|\|
7277	key->type == BTRFS_INODE_EXTREF_KEY)
7278	ret = changed_ref(sctx, result);
7279	else if (key->type == BTRFS_XATTR_ITEM_KEY)
7280	ret = changed_xattr(sctx, result);
7281	else if (key->type == BTRFS_EXTENT_DATA_KEY)
7282	ret = changed_extent(sctx, result);
7283	else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
7284	key->offset == `0`)
7285	ret = changed_verity(sctx, result);
7286	}
7287
7288	out:
7289	return ret;
7290	}
7291
7292	static int search_key_again(const struct send_ctx *sctx,
7293	struct btrfs_root *root,
7294	struct btrfs_path *path,
7295	const struct btrfs_key *key)
7296	{
7297	int ret;
7298
7299	if (!path->need_commit_sem)
7300	lockdep_assert_held_read(&root->fs_info->commit_root_sem);
7301
7302	/*
7303	* Roots used for send operations are readonly and no one can add,
7304	* update or remove keys from them, so we should be able to find our
7305	* key again. The only exception is deduplication, which can operate on
7306	* readonly roots and add, update or remove keys to/from them - but at
7307	* the moment we don't allow it to run in parallel with send.
7308	*/
7309	ret = btrfs_search_slot(NULL, root, key, p: path, ins_len: `0`, cow: `0`);
7310	ASSERT(ret <= `0`);
7311	if (ret > `0`) {
7312	btrfs_print_tree(c: path->nodes[path->lowest_level], follow: false);
7313	btrfs_err(root->fs_info,
7314	"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
7315	key->objectid, key->type, key->offset,
7316	(root == sctx->parent_root ? "parent" : "send"),
7317	root->root_key.objectid, path->lowest_level,
7318	path->slots[path->lowest_level]);
7319	return -EUCLEAN;
7320	}
7321
7322	return ret;
7323	}
7324
7325	static int full_send_tree(struct send_ctx *sctx)
7326	{
7327	int ret;
7328	struct btrfs_root *send_root = sctx->send_root;
7329	struct btrfs_key key;
7330	struct btrfs_fs_info *fs_info = send_root->fs_info;
7331	struct btrfs_path *path;
7332
7333	path = alloc_path_for_send();
7334	if (!path)
7335	return -ENOMEM;
7336	path->reada = READA_FORWARD_ALWAYS;
7337
7338	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7339	key.type = BTRFS_INODE_ITEM_KEY;
7340	key.offset = `0`;
7341
7342	down_read(sem: &fs_info->commit_root_sem);
7343	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7344	up_read(sem: &fs_info->commit_root_sem);
7345
7346	ret = btrfs_search_slot_for_read(root: send_root, key: &key, p: path, find_higher: `1`, return_any: `0`);
7347	if (ret < `0`)
7348	goto out;
7349	if (ret)
7350	goto out_finish;
7351
7352	while (`1`) {
7353	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
7354
7355	ret = changed_cb(left_path: path, NULL, key: &key,
7356	result: BTRFS_COMPARE_TREE_NEW, sctx);
7357	if (ret < `0`)
7358	goto out;
7359
7360	down_read(sem: &fs_info->commit_root_sem);
7361	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7362	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7363	up_read(sem: &fs_info->commit_root_sem);
7364	/*
7365	* A transaction used for relocating a block group was
7366	* committed or is about to finish its commit. Release
7367	* our path (leaf) and restart the search, so that we
7368	* avoid operating on any file extent items that are
7369	* stale, with a disk_bytenr that reflects a pre
7370	* relocation value. This way we avoid as much as
7371	* possible to fallback to regular writes when checking
7372	* if we can clone file ranges.
7373	*/
7374	btrfs_release_path(p: path);
7375	ret = search_key_again(sctx, root: send_root, path, key: &key);
7376	if (ret < `0`)
7377	goto out;
7378	} else {
7379	up_read(sem: &fs_info->commit_root_sem);
7380	}
7381
7382	ret = btrfs_next_item(root: send_root, p: path);
7383	if (ret < `0`)
7384	goto out;
7385	if (ret) {
7386	ret = `0`;
7387	break;
7388	}
7389	}
7390
7391	out_finish:
7392	ret = finish_inode_if_needed(sctx, at_end: `1`);
7393
7394	out:
7395	btrfs_free_path(p: path);
7396	return ret;
7397	}
7398
7399	static int replace_node_with_clone(struct btrfs_path path, int* level)
7400	{
7401	struct extent_buffer *clone;
7402
7403	clone = btrfs_clone_extent_buffer(src: path->nodes[level]);
7404	if (!clone)
7405	return -ENOMEM;
7406
7407	free_extent_buffer(eb: path->nodes[level]);
7408	path->nodes[level] = clone;
7409
7410	return `0`;
7411	}
7412
7413	static int tree_move_down(struct btrfs_path path, int* *level, u64 reada_min_gen)
7414	{
7415	struct extent_buffer *eb;
7416	struct extent_buffer parent = path->nodes[level];
7417	int slot = path->slots[*level];
7418	const int nritems = btrfs_header_nritems(eb: parent);
7419	u64 reada_max;
7420	u64 reada_done = `0`;
7421
7422	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
7423
7424	BUG_ON(*level == `0`);
7425	eb = btrfs_read_node_slot(parent, slot);
7426	if (IS_ERR(ptr: eb))
7427	return PTR_ERR(ptr: eb);
7428
7429	/*
7430	* Trigger readahead for the next leaves we will process, so that it is
7431	* very likely that when we need them they are already in memory and we
7432	* will not block on disk IO. For nodes we only do readahead for one,
7433	* since the time window between processing nodes is typically larger.
7434	*/
7435	reada_max = (*level == `1` ? SZ_128K : eb->fs_info->nodesize);
7436
7437	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
7438	if (btrfs_node_ptr_generation(eb: parent, nr: slot) > reada_min_gen) {
7439	btrfs_readahead_node_child(node: parent, slot);
7440	reada_done += eb->fs_info->nodesize;
7441	}
7442	}
7443
7444	path->nodes[*level - `1`] = eb;
7445	path->slots[*level - `1`] = `0`;
7446	(*level)--;
7447
7448	if (*level == `0`)
7449	return replace_node_with_clone(path, level: `0`);
7450
7451	return `0`;
7452	}
7453
7454	static int tree_move_next_or_upnext(struct btrfs_path *path,
7455	int level, int* root_level)
7456	{
7457	int ret = `0`;
7458	int nritems;
7459	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7460
7461	path->slots[*level]++;
7462
7463	while (path->slots[*level] >= nritems) {
7464	if (*level == root_level) {
7465	path->slots[*level] = nritems - `1`;
7466	return -`1`;
7467	}
7468
7469	/ move upnext /
7470	path->slots[*level] = `0`;
7471	free_extent_buffer(eb: path->nodes[*level]);
7472	path->nodes[*level] = NULL;
7473	(*level)++;
7474	path->slots[*level]++;
7475
7476	nritems = btrfs_header_nritems(eb: path->nodes[*level]);
7477	ret = `1`;
7478	}
7479	return ret;
7480	}
7481
7482	/*
7483	* Returns 1 if it had to move up and next. 0 is returned if it moved only next
7484	* or down.
7485	*/
7486	static int tree_advance(struct btrfs_path *path,
7487	int level, int* root_level,
7488	int allow_down,
7489	struct btrfs_key *key,
7490	u64 reada_min_gen)
7491	{
7492	int ret;
7493
7494	if (*level == `0` \|\| !allow_down) {
7495	ret = tree_move_next_or_upnext(path, level, root_level);
7496	} else {
7497	ret = tree_move_down(path, level, reada_min_gen);
7498	}
7499
7500	/*
7501	* Even if we have reached the end of a tree, ret is -1, update the key
7502	* anyway, so that in case we need to restart due to a block group
7503	* relocation, we can assert that the last key of the root node still
7504	* exists in the tree.
7505	*/
7506	if (*level == `0`)
7507	btrfs_item_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7508	nr: path->slots[*level]);
7509	else
7510	btrfs_node_key_to_cpu(eb: path->nodes[*level], cpu_key: key,
7511	nr: path->slots[*level]);
7512
7513	return ret;
7514	}
7515
7516	static int tree_compare_item(struct btrfs_path *left_path,
7517	struct btrfs_path *right_path,
7518	char *tmp_buf)
7519	{
7520	int cmp;
7521	int len1, len2;
7522	unsigned long off1, off2;
7523
7524	len1 = btrfs_item_size(eb: left_path->nodes[`0`], slot: left_path->slots[`0`]);
7525	len2 = btrfs_item_size(eb: right_path->nodes[`0`], slot: right_path->slots[`0`]);
7526	if (len1 != len2)
7527	return `1`;
7528
7529	off1 = btrfs_item_ptr_offset(left_path->nodes[`0`], left_path->slots[`0`]);
7530	off2 = btrfs_item_ptr_offset(right_path->nodes[`0`],
7531	right_path->slots[`0`]);
7532
7533	read_extent_buffer(eb: left_path->nodes[`0`], dst: tmp_buf, start: off1, len: len1);
7534
7535	cmp = memcmp_extent_buffer(eb: right_path->nodes[`0`], ptrv: tmp_buf, start: off2, len: len1);
7536	if (cmp)
7537	return `1`;
7538	return `0`;
7539	}
7540
7541	/*
7542	* A transaction used for relocating a block group was committed or is about to
7543	* finish its commit. Release our paths and restart the search, so that we are
7544	* not using stale extent buffers:
7545	*
7546	* 1) For levels > 0, we are only holding references of extent buffers, without
7547	* any locks on them, which does not prevent them from having been relocated
7548	* and reallocated after the last time we released the commit root semaphore.
7549	* The exception are the root nodes, for which we always have a clone, see
7550	* the comment at btrfs_compare_trees();
7551	*
7552	* 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
7553	* we are safe from the concurrent relocation and reallocation. However they
7554	* can have file extent items with a pre relocation disk_bytenr value, so we
7555	* restart the start from the current commit roots and clone the new leaves so
7556	* that we get the post relocation disk_bytenr values. Not doing so, could
7557	* make us clone the wrong data in case there are new extents using the old
7558	* disk_bytenr that happen to be shared.
7559	*/
7560	static int restart_after_relocation(struct btrfs_path *left_path,
7561	struct btrfs_path *right_path,
7562	const struct btrfs_key *left_key,
7563	const struct btrfs_key *right_key,
7564	int left_level,
7565	int right_level,
7566	const struct send_ctx *sctx)
7567	{
7568	int root_level;
7569	int ret;
7570
7571	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
7572
7573	btrfs_release_path(p: left_path);
7574	btrfs_release_path(p: right_path);
7575
7576	/*
7577	* Since keys can not be added or removed to/from our roots because they
7578	* are readonly and we do not allow deduplication to run in parallel
7579	* (which can add, remove or change keys), the layout of the trees should
7580	* not change.
7581	*/
7582	left_path->lowest_level = left_level;
7583	ret = search_key_again(sctx, root: sctx->send_root, path: left_path, key: left_key);
7584	if (ret < `0`)
7585	return ret;
7586
7587	right_path->lowest_level = right_level;
7588	ret = search_key_again(sctx, root: sctx->parent_root, path: right_path, key: right_key);
7589	if (ret < `0`)
7590	return ret;
7591
7592	/*
7593	* If the lowest level nodes are leaves, clone them so that they can be
7594	* safely used by changed_cb() while not under the protection of the
7595	* commit root semaphore, even if relocation and reallocation happens in
7596	* parallel.
7597	*/
7598	if (left_level == `0`) {
7599	ret = replace_node_with_clone(path: left_path, level: `0`);
7600	if (ret < `0`)
7601	return ret;
7602	}
7603
7604	if (right_level == `0`) {
7605	ret = replace_node_with_clone(path: right_path, level: `0`);
7606	if (ret < `0`)
7607	return ret;
7608	}
7609
7610	/*
7611	* Now clone the root nodes (unless they happen to be the leaves we have
7612	* already cloned). This is to protect against concurrent snapshotting of
7613	* the send and parent roots (see the comment at btrfs_compare_trees()).
7614	*/
7615	root_level = btrfs_header_level(eb: sctx->send_root->commit_root);
7616	if (root_level > `0`) {
7617	ret = replace_node_with_clone(path: left_path, level: root_level);
7618	if (ret < `0`)
7619	return ret;
7620	}
7621
7622	root_level = btrfs_header_level(eb: sctx->parent_root->commit_root);
7623	if (root_level > `0`) {
7624	ret = replace_node_with_clone(path: right_path, level: root_level);
7625	if (ret < `0`)
7626	return ret;
7627	}
7628
7629	return `0`;
7630	}
7631
7632	/*
7633	* This function compares two trees and calls the provided callback for
7634	* every changed/new/deleted item it finds.
7635	* If shared tree blocks are encountered, whole subtrees are skipped, making
7636	* the compare pretty fast on snapshotted subvolumes.
7637	*
7638	* This currently works on commit roots only. As commit roots are read only,
7639	* we don't do any locking. The commit roots are protected with transactions.
7640	* Transactions are ended and rejoined when a commit is tried in between.
7641	*
7642	* This function checks for modifications done to the trees while comparing.
7643	* If it detects a change, it aborts immediately.
7644	*/
7645	static int btrfs_compare_trees(struct btrfs_root *left_root,
7646	struct btrfs_root right_root, struct* send_ctx *sctx)
7647	{
7648	struct btrfs_fs_info *fs_info = left_root->fs_info;
7649	int ret;
7650	int cmp;
7651	struct btrfs_path *left_path = NULL;
7652	struct btrfs_path *right_path = NULL;
7653	struct btrfs_key left_key;
7654	struct btrfs_key right_key;
7655	char *tmp_buf = NULL;
7656	int left_root_level;
7657	int right_root_level;
7658	int left_level;
7659	int right_level;
7660	int left_end_reached = `0`;
7661	int right_end_reached = `0`;
7662	int advance_left = `0`;
7663	int advance_right = `0`;
7664	u64 left_blockptr;
7665	u64 right_blockptr;
7666	u64 left_gen;
7667	u64 right_gen;
7668	u64 reada_min_gen;
7669
7670	left_path = btrfs_alloc_path();
7671	if (!left_path) {
7672	ret = -ENOMEM;
7673	goto out;
7674	}
7675	right_path = btrfs_alloc_path();
7676	if (!right_path) {
7677	ret = -ENOMEM;
7678	goto out;
7679	}
7680
7681	tmp_buf = kvmalloc(size: fs_info->nodesize, GFP_KERNEL);
7682	if (!tmp_buf) {
7683	ret = -ENOMEM;
7684	goto out;
7685	}
7686
7687	left_path->search_commit_root = `1`;
7688	left_path->skip_locking = `1`;
7689	right_path->search_commit_root = `1`;
7690	right_path->skip_locking = `1`;
7691
7692	/*
7693	* Strategy: Go to the first items of both trees. Then do
7694	*
7695	* If both trees are at level 0
7696	* Compare keys of current items
7697	* If left < right treat left item as new, advance left tree
7698	* and repeat
7699	* If left > right treat right item as deleted, advance right tree
7700	* and repeat
7701	* If left == right do deep compare of items, treat as changed if
7702	* needed, advance both trees and repeat
7703	* If both trees are at the same level but not at level 0
7704	* Compare keys of current nodes/leafs
7705	* If left < right advance left tree and repeat
7706	* If left > right advance right tree and repeat
7707	* If left == right compare blockptrs of the next nodes/leafs
7708	* If they match advance both trees but stay at the same level
7709	* and repeat
7710	* If they don't match advance both trees while allowing to go
7711	* deeper and repeat
7712	* If tree levels are different
7713	* Advance the tree that needs it and repeat
7714	*
7715	* Advancing a tree means:
7716	* If we are at level 0, try to go to the next slot. If that's not
7717	* possible, go one level up and repeat. Stop when we found a level
7718	* where we could go to the next slot. We may at this point be on a
7719	* node or a leaf.
7720	*
7721	* If we are not at level 0 and not on shared tree blocks, go one
7722	* level deeper.
7723	*
7724	* If we are not at level 0 and on shared tree blocks, go one slot to
7725	* the right if possible or go up and right.
7726	*/
7727
7728	down_read(sem: &fs_info->commit_root_sem);
7729	left_level = btrfs_header_level(eb: left_root->commit_root);
7730	left_root_level = left_level;
7731	/*
7732	* We clone the root node of the send and parent roots to prevent races
7733	* with snapshot creation of these roots. Snapshot creation COWs the
7734	* root node of a tree, so after the transaction is committed the old
7735	* extent can be reallocated while this send operation is still ongoing.
7736	* So we clone them, under the commit root semaphore, to be race free.
7737	*/
7738	left_path->nodes[left_level] =
7739	btrfs_clone_extent_buffer(src: left_root->commit_root);
7740	if (!left_path->nodes[left_level]) {
7741	ret = -ENOMEM;
7742	goto out_unlock;
7743	}
7744
7745	right_level = btrfs_header_level(eb: right_root->commit_root);
7746	right_root_level = right_level;
7747	right_path->nodes[right_level] =
7748	btrfs_clone_extent_buffer(src: right_root->commit_root);
7749	if (!right_path->nodes[right_level]) {
7750	ret = -ENOMEM;
7751	goto out_unlock;
7752	}
7753	/*
7754	* Our right root is the parent root, while the left root is the "send"
7755	* root. We know that all new nodes/leaves in the left root must have
7756	* a generation greater than the right root's generation, so we trigger
7757	* readahead for those nodes and leaves of the left root, as we know we
7758	* will need to read them at some point.
7759	*/
7760	reada_min_gen = btrfs_header_generation(eb: right_root->commit_root);
7761
7762	if (left_level == `0`)
7763	btrfs_item_key_to_cpu(eb: left_path->nodes[left_level],
7764	cpu_key: &left_key, nr: left_path->slots[left_level]);
7765	else
7766	btrfs_node_key_to_cpu(eb: left_path->nodes[left_level],
7767	cpu_key: &left_key, nr: left_path->slots[left_level]);
7768	if (right_level == `0`)
7769	btrfs_item_key_to_cpu(eb: right_path->nodes[right_level],
7770	cpu_key: &right_key, nr: right_path->slots[right_level]);
7771	else
7772	btrfs_node_key_to_cpu(eb: right_path->nodes[right_level],
7773	cpu_key: &right_key, nr: right_path->slots[right_level]);
7774
7775	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7776
7777	while (`1`) {
7778	if (need_resched() \|\|
7779	rwsem_is_contended(sem: &fs_info->commit_root_sem)) {
7780	up_read(sem: &fs_info->commit_root_sem);
7781	cond_resched();
7782	down_read(sem: &fs_info->commit_root_sem);
7783	}
7784
7785	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7786	ret = restart_after_relocation(left_path, right_path,
7787	left_key: &left_key, right_key: &right_key,
7788	left_level, right_level,
7789	sctx);
7790	if (ret < `0`)
7791	goto out_unlock;
7792	sctx->last_reloc_trans = fs_info->last_reloc_trans;
7793	}
7794
7795	if (advance_left && !left_end_reached) {
7796	ret = tree_advance(path: left_path, level: &left_level,
7797	root_level: left_root_level,
7798	allow_down: advance_left != ADVANCE_ONLY_NEXT,
7799	key: &left_key, reada_min_gen);
7800	if (ret == -`1`)
7801	left_end_reached = ADVANCE;
7802	else if (ret < `0`)
7803	goto out_unlock;
7804	advance_left = `0`;
7805	}
7806	if (advance_right && !right_end_reached) {
7807	ret = tree_advance(path: right_path, level: &right_level,
7808	root_level: right_root_level,
7809	allow_down: advance_right != ADVANCE_ONLY_NEXT,
7810	key: &right_key, reada_min_gen);
7811	if (ret == -`1`)
7812	right_end_reached = ADVANCE;
7813	else if (ret < `0`)
7814	goto out_unlock;
7815	advance_right = `0`;
7816	}
7817
7818	if (left_end_reached && right_end_reached) {
7819	ret = `0`;
7820	goto out_unlock;
7821	} else if (left_end_reached) {
7822	if (right_level == `0`) {
7823	up_read(sem: &fs_info->commit_root_sem);
7824	ret = changed_cb(left_path, right_path,
7825	key: &right_key,
7826	result: BTRFS_COMPARE_TREE_DELETED,
7827	sctx);
7828	if (ret < `0`)
7829	goto out;
7830	down_read(sem: &fs_info->commit_root_sem);
7831	}
7832	advance_right = ADVANCE;
7833	continue;
7834	} else if (right_end_reached) {
7835	if (left_level == `0`) {
7836	up_read(sem: &fs_info->commit_root_sem);
7837	ret = changed_cb(left_path, right_path,
7838	key: &left_key,
7839	result: BTRFS_COMPARE_TREE_NEW,
7840	sctx);
7841	if (ret < `0`)
7842	goto out;
7843	down_read(sem: &fs_info->commit_root_sem);
7844	}
7845	advance_left = ADVANCE;
7846	continue;
7847	}
7848
7849	if (left_level == `0` && right_level == `0`) {
7850	up_read(sem: &fs_info->commit_root_sem);
7851	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7852	if (cmp < `0`) {
7853	ret = changed_cb(left_path, right_path,
7854	key: &left_key,
7855	result: BTRFS_COMPARE_TREE_NEW,
7856	sctx);
7857	advance_left = ADVANCE;
7858	} else if (cmp > `0`) {
7859	ret = changed_cb(left_path, right_path,
7860	key: &right_key,
7861	result: BTRFS_COMPARE_TREE_DELETED,
7862	sctx);
7863	advance_right = ADVANCE;
7864	} else {
7865	enum btrfs_compare_tree_result result;
7866
7867	WARN_ON(!extent_buffer_uptodate(left_path->nodes[`0`]));
7868	ret = tree_compare_item(left_path, right_path,
7869	tmp_buf);
7870	if (ret)
7871	result = BTRFS_COMPARE_TREE_CHANGED;
7872	else
7873	result = BTRFS_COMPARE_TREE_SAME;
7874	ret = changed_cb(left_path, right_path,
7875	key: &left_key, result, sctx);
7876	advance_left = ADVANCE;
7877	advance_right = ADVANCE;
7878	}
7879
7880	if (ret < `0`)
7881	goto out;
7882	down_read(sem: &fs_info->commit_root_sem);
7883	} else if (left_level == right_level) {
7884	cmp = btrfs_comp_cpu_keys(k1: &left_key, k2: &right_key);
7885	if (cmp < `0`) {
7886	advance_left = ADVANCE;
7887	} else if (cmp > `0`) {
7888	advance_right = ADVANCE;
7889	} else {
7890	left_blockptr = btrfs_node_blockptr(
7891	eb: left_path->nodes[left_level],
7892	nr: left_path->slots[left_level]);
7893	right_blockptr = btrfs_node_blockptr(
7894	eb: right_path->nodes[right_level],
7895	nr: right_path->slots[right_level]);
7896	left_gen = btrfs_node_ptr_generation(
7897	eb: left_path->nodes[left_level],
7898	nr: left_path->slots[left_level]);
7899	right_gen = btrfs_node_ptr_generation(
7900	eb: right_path->nodes[right_level],
7901	nr: right_path->slots[right_level]);
7902	if (left_blockptr == right_blockptr &&
7903	left_gen == right_gen) {
7904	/*
7905	* As we're on a shared block, don't
7906	* allow to go deeper.
7907	*/
7908	advance_left = ADVANCE_ONLY_NEXT;
7909	advance_right = ADVANCE_ONLY_NEXT;
7910	} else {
7911	advance_left = ADVANCE;
7912	advance_right = ADVANCE;
7913	}
7914	}
7915	} else if (left_level < right_level) {
7916	advance_right = ADVANCE;
7917	} else {
7918	advance_left = ADVANCE;
7919	}
7920	}
7921
7922	out_unlock:
7923	up_read(sem: &fs_info->commit_root_sem);
7924	out:
7925	btrfs_free_path(p: left_path);
7926	btrfs_free_path(p: right_path);
7927	kvfree(addr: tmp_buf);
7928	return ret;
7929	}
7930
7931	static int send_subvol(struct send_ctx *sctx)
7932	{
7933	int ret;
7934
7935	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
7936	ret = send_header(sctx);
7937	if (ret < `0`)
7938	goto out;
7939	}
7940
7941	ret = send_subvol_begin(sctx);
7942	if (ret < `0`)
7943	goto out;
7944
7945	if (sctx->parent_root) {
7946	ret = btrfs_compare_trees(left_root: sctx->send_root, right_root: sctx->parent_root, sctx);
7947	if (ret < `0`)
7948	goto out;
7949	ret = finish_inode_if_needed(sctx, at_end: `1`);
7950	if (ret < `0`)
7951	goto out;
7952	} else {
7953	ret = full_send_tree(sctx);
7954	if (ret < `0`)
7955	goto out;
7956	}
7957
7958	out:
7959	free_recorded_refs(sctx);
7960	return ret;
7961	}
7962
7963	/*
7964	* If orphan cleanup did remove any orphans from a root, it means the tree
7965	* was modified and therefore the commit root is not the same as the current
7966	* root anymore. This is a problem, because send uses the commit root and
7967	* therefore can see inode items that don't exist in the current root anymore,
7968	* and for example make calls to btrfs_iget, which will do tree lookups based
7969	* on the current root and not on the commit root. Those lookups will fail,
7970	* returning a -ESTALE error, and making send fail with that error. So make
7971	* sure a send does not see any orphans we have just removed, and that it will
7972	* see the same inodes regardless of whether a transaction commit happened
7973	* before it started (meaning that the commit root will be the same as the
7974	* current root) or not.
7975	*/
7976	static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
7977	{
7978	int i;
7979	struct btrfs_trans_handle *trans = NULL;
7980
7981	again:
7982	if (sctx->parent_root &&
7983	sctx->parent_root->node != sctx->parent_root->commit_root)
7984	goto commit_trans;
7985
7986	for (i = `0`; i < sctx->clone_roots_cnt; i++)
7987	if (sctx->clone_roots[i].root->node !=
7988	sctx->clone_roots[i].root->commit_root)
7989	goto commit_trans;
7990
7991	if (trans)
7992	return btrfs_end_transaction(trans);
7993
7994	return `0`;
7995
7996	commit_trans:
7997	/ Use any root, all fs roots will get their commit roots updated. /
7998	if (!trans) {
7999	trans = btrfs_join_transaction(root: sctx->send_root);
8000	if (IS_ERR(ptr: trans))
8001	return PTR_ERR(ptr: trans);
8002	goto again;
8003	}
8004
8005	return btrfs_commit_transaction(trans);
8006	}
8007
8008	/*
8009	* Make sure any existing dellaloc is flushed for any root used by a send
8010	* operation so that we do not miss any data and we do not race with writeback
8011	* finishing and changing a tree while send is using the tree. This could
8012	* happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
8013	* a send operation then uses the subvolume.
8014	* After flushing delalloc ensure_commit_roots_uptodate() must be called.
8015	*/
8016	static int flush_delalloc_roots(struct send_ctx *sctx)
8017	{
8018	struct btrfs_root *root = sctx->parent_root;
8019	int ret;
8020	int i;
8021
8022	if (root) {
8023	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
8024	if (ret)
8025	return ret;
8026	btrfs_wait_ordered_extents(root, U64_MAX, range_start: `0`, U64_MAX);
8027	}
8028
8029	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
8030	root = sctx->clone_roots[i].root;
8031	ret = btrfs_start_delalloc_snapshot(root, in_reclaim_context: false);
8032	if (ret)
8033	return ret;
8034	btrfs_wait_ordered_extents(root, U64_MAX, range_start: `0`, U64_MAX);
8035	}
8036
8037	return `0`;
8038	}
8039
8040	static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
8041	{
8042	spin_lock(lock: &root->root_item_lock);
8043	root->send_in_progress--;
8044	/*
8045	* Not much left to do, we don't know why it's unbalanced and
8046	* can't blindly reset it to 0.
8047	*/
8048	if (root->send_in_progress < `0`)
8049	btrfs_err(root->fs_info,
8050	"send_in_progress unbalanced %d root %llu",
8051	root->send_in_progress, root->root_key.objectid);
8052	spin_unlock(lock: &root->root_item_lock);
8053	}
8054
8055	static void dedupe_in_progress_warn(const struct btrfs_root *root)
8056	{
8057	btrfs_warn_rl(root->fs_info,
8058	"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
8059	root->root_key.objectid, root->dedupe_in_progress);
8060	}
8061
8062	long btrfs_ioctl_send(struct inode inode, struct* btrfs_ioctl_send_args *arg)
8063	{
8064	int ret = `0`;
8065	struct btrfs_root *send_root = BTRFS_I(inode)->root;
8066	struct btrfs_fs_info *fs_info = send_root->fs_info;
8067	struct btrfs_root *clone_root;
8068	struct send_ctx *sctx = NULL;
8069	u32 i;
8070	u64 *clone_sources_tmp = NULL;
8071	int clone_sources_to_rollback = `0`;
8072	size_t alloc_size;
8073	int sort_clone_roots = `0`;
8074	struct btrfs_lru_cache_entry *entry;
8075	struct btrfs_lru_cache_entry *tmp;
8076
8077	if (!capable(CAP_SYS_ADMIN))
8078	return -EPERM;
8079
8080	/*
8081	* The subvolume must remain read-only during send, protect against
8082	* making it RW. This also protects against deletion.
8083	*/
8084	spin_lock(lock: &send_root->root_item_lock);
8085	if (btrfs_root_readonly(root: send_root) && send_root->dedupe_in_progress) {
8086	dedupe_in_progress_warn(root: send_root);
8087	spin_unlock(lock: &send_root->root_item_lock);
8088	return -EAGAIN;
8089	}
8090	send_root->send_in_progress++;
8091	spin_unlock(lock: &send_root->root_item_lock);
8092
8093	/*
8094	* Userspace tools do the checks and warn the user if it's
8095	* not RO.
8096	*/
8097	if (!btrfs_root_readonly(root: send_root)) {
8098	ret = -EPERM;
8099	goto out;
8100	}
8101
8102	/*
8103	* Check that we don't overflow at later allocations, we request
8104	* clone_sources_count + 1 items, and compare to unsigned long inside
8105	* access_ok. Also set an upper limit for allocation size so this can't
8106	* easily exhaust memory. Max number of clone sources is about 200K.
8107	*/
8108	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
8109	ret = -EINVAL;
8110	goto out;
8111	}
8112
8113	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
8114	ret = -EINVAL;
8115	goto out;
8116	}
8117
8118	sctx = kzalloc(size: sizeof(struct send_ctx), GFP_KERNEL);
8119	if (!sctx) {
8120	ret = -ENOMEM;
8121	goto out;
8122	}
8123
8124	INIT_LIST_HEAD(list: &sctx->new_refs);
8125	INIT_LIST_HEAD(list: &sctx->deleted_refs);
8126
8127	btrfs_lru_cache_init(cache: &sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
8128	btrfs_lru_cache_init(cache: &sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
8129	btrfs_lru_cache_init(cache: &sctx->dir_created_cache,
8130	SEND_MAX_DIR_CREATED_CACHE_SIZE);
8131	/*
8132	* This cache is periodically trimmed to a fixed size elsewhere, see
8133	* cache_dir_utimes() and trim_dir_utimes_cache().
8134	*/
8135	btrfs_lru_cache_init(cache: &sctx->dir_utimes_cache, max_size: `0`);
8136
8137	sctx->pending_dir_moves = RB_ROOT;
8138	sctx->waiting_dir_moves = RB_ROOT;
8139	sctx->orphan_dirs = RB_ROOT;
8140	sctx->rbtree_new_refs = RB_ROOT;
8141	sctx->rbtree_deleted_refs = RB_ROOT;
8142
8143	sctx->flags = arg->flags;
8144
8145	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
8146	if (arg->version > BTRFS_SEND_STREAM_VERSION) {
8147	ret = -EPROTO;
8148	goto out;
8149	}
8150	/ Zero means "use the highest version" /
8151	sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
8152	} else {
8153	sctx->proto = `1`;
8154	}
8155	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < `2`) {
8156	ret = -EINVAL;
8157	goto out;
8158	}
8159
8160	sctx->send_filp = fget(fd: arg->send_fd);
8161	if (!sctx->send_filp) {
8162	ret = -EBADF;
8163	goto out;
8164	}
8165
8166	sctx->send_root = send_root;
8167	/*
8168	* Unlikely but possible, if the subvolume is marked for deletion but
8169	* is slow to remove the directory entry, send can still be started
8170	*/
8171	if (btrfs_root_dead(root: sctx->send_root)) {
8172	ret = -EPERM;
8173	goto out;
8174	}
8175
8176	sctx->clone_roots_cnt = arg->clone_sources_count;
8177
8178	if (sctx->proto >= `2`) {
8179	u32 send_buf_num_pages;
8180
8181	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
8182	sctx->send_buf = vmalloc(size: sctx->send_max_size);
8183	if (!sctx->send_buf) {
8184	ret = -ENOMEM;
8185	goto out;
8186	}
8187	send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
8188	sctx->send_buf_pages = kcalloc(n: send_buf_num_pages,
8189	size: sizeof(*sctx->send_buf_pages),
8190	GFP_KERNEL);
8191	if (!sctx->send_buf_pages) {
8192	ret = -ENOMEM;
8193	goto out;
8194	}
8195	for (i = `0`; i < send_buf_num_pages; i++) {
8196	sctx->send_buf_pages[i] =
8197	vmalloc_to_page(addr: sctx->send_buf + (i << PAGE_SHIFT));
8198	}
8199	} else {
8200	sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
8201	sctx->send_buf = kvmalloc(size: sctx->send_max_size, GFP_KERNEL);
8202	}
8203	if (!sctx->send_buf) {
8204	ret = -ENOMEM;
8205	goto out;
8206	}
8207
8208	sctx->clone_roots = kvcalloc(n: sizeof(*sctx->clone_roots),
8209	size: arg->clone_sources_count + `1`,
8210	GFP_KERNEL);
8211	if (!sctx->clone_roots) {
8212	ret = -ENOMEM;
8213	goto out;
8214	}
8215
8216	alloc_size = array_size(sizeof(*arg->clone_sources),
8217	arg->clone_sources_count);
8218
8219	if (arg->clone_sources_count) {
8220	clone_sources_tmp = kvmalloc(size: alloc_size, GFP_KERNEL);
8221	if (!clone_sources_tmp) {
8222	ret = -ENOMEM;
8223	goto out;
8224	}
8225
8226	ret = copy_from_user(to: clone_sources_tmp, from: arg->clone_sources,
8227	n: alloc_size);
8228	if (ret) {
8229	ret = -EFAULT;
8230	goto out;
8231	}
8232
8233	for (i = `0`; i < arg->clone_sources_count; i++) {
8234	clone_root = btrfs_get_fs_root(fs_info,
8235	objectid: clone_sources_tmp[i], check_ref: true);
8236	if (IS_ERR(ptr: clone_root)) {
8237	ret = PTR_ERR(ptr: clone_root);
8238	goto out;
8239	}
8240	spin_lock(lock: &clone_root->root_item_lock);
8241	if (!btrfs_root_readonly(root: clone_root) \|\|
8242	btrfs_root_dead(root: clone_root)) {
8243	spin_unlock(lock: &clone_root->root_item_lock);
8244	btrfs_put_root(root: clone_root);
8245	ret = -EPERM;
8246	goto out;
8247	}
8248	if (clone_root->dedupe_in_progress) {
8249	dedupe_in_progress_warn(root: clone_root);
8250	spin_unlock(lock: &clone_root->root_item_lock);
8251	btrfs_put_root(root: clone_root);
8252	ret = -EAGAIN;
8253	goto out;
8254	}
8255	clone_root->send_in_progress++;
8256	spin_unlock(lock: &clone_root->root_item_lock);
8257
8258	sctx->clone_roots[i].root = clone_root;
8259	clone_sources_to_rollback = i + `1`;
8260	}
8261	kvfree(addr: clone_sources_tmp);
8262	clone_sources_tmp = NULL;
8263	}
8264
8265	if (arg->parent_root) {
8266	sctx->parent_root = btrfs_get_fs_root(fs_info, objectid: arg->parent_root,
8267	check_ref: true);
8268	if (IS_ERR(ptr: sctx->parent_root)) {
8269	ret = PTR_ERR(ptr: sctx->parent_root);
8270	goto out;
8271	}
8272
8273	spin_lock(lock: &sctx->parent_root->root_item_lock);
8274	sctx->parent_root->send_in_progress++;
8275	if (!btrfs_root_readonly(root: sctx->parent_root) \|\|
8276	btrfs_root_dead(root: sctx->parent_root)) {
8277	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8278	ret = -EPERM;
8279	goto out;
8280	}
8281	if (sctx->parent_root->dedupe_in_progress) {
8282	dedupe_in_progress_warn(root: sctx->parent_root);
8283	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8284	ret = -EAGAIN;
8285	goto out;
8286	}
8287	spin_unlock(lock: &sctx->parent_root->root_item_lock);
8288	}
8289
8290	/*
8291	* Clones from send_root are allowed, but only if the clone source
8292	* is behind the current send position. This is checked while searching
8293	* for possible clone sources.
8294	*/
8295	sctx->clone_roots[sctx->clone_roots_cnt++].root =
8296	btrfs_grab_root(root: sctx->send_root);
8297
8298	/ We do a bsearch later /
8299	sort(base: sctx->clone_roots, num: sctx->clone_roots_cnt,
8300	size: sizeof(*sctx->clone_roots), cmp_func: __clone_root_cmp_sort,
8301	NULL);
8302	sort_clone_roots = `1`;
8303
8304	ret = flush_delalloc_roots(sctx);
8305	if (ret)
8306	goto out;
8307
8308	ret = ensure_commit_roots_uptodate(sctx);
8309	if (ret)
8310	goto out;
8311
8312	ret = send_subvol(sctx);
8313	if (ret < `0`)
8314	goto out;
8315
8316	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
8317	ret = send_utimes(sctx, ino: entry->key, gen: entry->gen);
8318	if (ret < `0`)
8319	goto out;
8320	btrfs_lru_cache_remove(cache: &sctx->dir_utimes_cache, entry);
8321	}
8322
8323	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
8324	ret = begin_cmd(sctx, cmd: BTRFS_SEND_C_END);
8325	if (ret < `0`)
8326	goto out;
8327	ret = send_cmd(sctx);
8328	if (ret < `0`)
8329	goto out;
8330	}
8331
8332	out:
8333	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
8334	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
8335	struct rb_node *n;
8336	struct pending_dir_move *pm;
8337
8338	n = rb_first(&sctx->pending_dir_moves);
8339	pm = rb_entry(n, struct pending_dir_move, node);
8340	while (!list_empty(head: &pm->list)) {
8341	struct pending_dir_move *pm2;
8342
8343	pm2 = list_first_entry(&pm->list,
8344	struct pending_dir_move, list);
8345	free_pending_move(sctx, m: pm2);
8346	}
8347	free_pending_move(sctx, m: pm);
8348	}
8349
8350	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
8351	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
8352	struct rb_node *n;
8353	struct waiting_dir_move *dm;
8354
8355	n = rb_first(&sctx->waiting_dir_moves);
8356	dm = rb_entry(n, struct waiting_dir_move, node);
8357	rb_erase(&dm->node, &sctx->waiting_dir_moves);
8358	kfree(objp: dm);
8359	}
8360
8361	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
8362	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
8363	struct rb_node *n;
8364	struct orphan_dir_info *odi;
8365
8366	n = rb_first(&sctx->orphan_dirs);
8367	odi = rb_entry(n, struct orphan_dir_info, node);
8368	free_orphan_dir_info(sctx, odi);
8369	}
8370
8371	if (sort_clone_roots) {
8372	for (i = `0`; i < sctx->clone_roots_cnt; i++) {
8373	btrfs_root_dec_send_in_progress(
8374	root: sctx->clone_roots[i].root);
8375	btrfs_put_root(root: sctx->clone_roots[i].root);
8376	}
8377	} else {
8378	for (i = `0`; sctx && i < clone_sources_to_rollback; i++) {
8379	btrfs_root_dec_send_in_progress(
8380	root: sctx->clone_roots[i].root);
8381	btrfs_put_root(root: sctx->clone_roots[i].root);
8382	}
8383
8384	btrfs_root_dec_send_in_progress(root: send_root);
8385	}
8386	if (sctx && !IS_ERR_OR_NULL(ptr: sctx->parent_root)) {
8387	btrfs_root_dec_send_in_progress(root: sctx->parent_root);
8388	btrfs_put_root(root: sctx->parent_root);
8389	}
8390
8391	kvfree(addr: clone_sources_tmp);
8392
8393	if (sctx) {
8394	if (sctx->send_filp)
8395	fput(sctx->send_filp);
8396
8397	kvfree(addr: sctx->clone_roots);
8398	kfree(objp: sctx->send_buf_pages);
8399	kvfree(addr: sctx->send_buf);
8400	kvfree(addr: sctx->verity_descriptor);
8401
8402	close_current_inode(sctx);
8403
8404	btrfs_lru_cache_clear(cache: &sctx->name_cache);
8405	btrfs_lru_cache_clear(cache: &sctx->backref_cache);
8406	btrfs_lru_cache_clear(cache: &sctx->dir_created_cache);
8407	btrfs_lru_cache_clear(cache: &sctx->dir_utimes_cache);
8408
8409	kfree(objp: sctx);
8410	}
8411
8412	return ret;
8413	}
8414

/* source code of linux/fs/btrfs/send.c */