reflink.c source code [linux/fs/btrfs/reflink.c]

1	// SPDX-License-Identifier: GPL-2.0
2
3	#include <linux/blkdev.h>
4	#include <linux/iversion.h>
5	#include "ctree.h"
6	#include "fs.h"
7	#include "messages.h"
8	#include "compression.h"
9	#include "delalloc-space.h"
10	#include "disk-io.h"
11	#include "reflink.h"
12	#include "transaction.h"
13	#include "subpage.h"
14	#include "accessors.h"
15	#include "file-item.h"
16	#include "file.h"
17	#include "super.h"
18
/*
 * Upper bound, in bytes, of the range handed to a single
 * btrfs_extent_same_range() call; btrfs_extent_same() splits larger dedupe
 * requests into chunks of this size plus a tail.
 */
#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
20
21	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
22	struct inode *inode,
23	u64 endoff,
24	const u64 destoff,
25	const u64 olen,
26	int no_time_update)
27	{
28	int ret;
29
30	inode_inc_iversion(inode);
31	if (!no_time_update) {
32	inode_set_mtime_to_ts(inode, ts: inode_set_ctime_current(inode));
33	}
34	/*
35	* We round up to the block size at eof when determining which
36	* extents to clone above, but shouldn't round up the file size.
37	*/
38	if (endoff > destoff + olen)
39	endoff = destoff + olen;
40	if (endoff > inode->i_size) {
41	i_size_write(inode, i_size: endoff);
42	btrfs_inode_safe_disk_i_size_write(inode: BTRFS_I(inode), new_i_size: `0`);
43	}
44
45	ret = btrfs_update_inode(trans, inode: BTRFS_I(inode));
46	if (ret) {
47	btrfs_abort_transaction(trans, ret);
48	btrfs_end_transaction(trans);
49	goto out;
50	}
51	ret = btrfs_end_transaction(trans);
52	out:
53	return ret;
54	}
55
56	static int copy_inline_to_page(struct btrfs_inode *inode,
57	const u64 file_offset,
58	char *inline_data,
59	const u64 size,
60	const u64 datal,
61	const u8 comp_type)
62	{
63	struct btrfs_fs_info *fs_info = inode->root->fs_info;
64	const u32 block_size = fs_info->sectorsize;
65	const u64 range_end = file_offset + block_size - `1`;
66	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(datasize: `0`);
67	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(datasize: `0`);
68	struct extent_changeset *data_reserved = NULL;
69	struct page *page = NULL;
70	struct address_space *mapping = inode->vfs_inode.i_mapping;
71	int ret;
72
73	ASSERT(IS_ALIGNED(file_offset, block_size));
74
75	/*
76	* We have flushed and locked the ranges of the source and destination
77	* inodes, we also have locked the inodes, so we are safe to do a
78	* reservation here. Also we must not do the reservation while holding
79	* a transaction open, otherwise we would deadlock.
80	*/
81	ret = btrfs_delalloc_reserve_space(inode, reserved: &data_reserved, start: file_offset,
82	len: block_size);
83	if (ret)
84	goto out;
85
86	page = find_or_create_page(mapping, index: file_offset >> PAGE_SHIFT,
87	gfp_mask: btrfs_alloc_write_mask(mapping));
88	if (!page) {
89	ret = -ENOMEM;
90	goto out_unlock;
91	}
92
93	ret = set_page_extent_mapped(page);
94	if (ret < `0`)
95	goto out_unlock;
96
97	clear_extent_bit(tree: &inode->io_tree, start: file_offset, end: range_end,
98	bits: EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG,
99	NULL);
100	ret = btrfs_set_extent_delalloc(inode, start: file_offset, end: range_end, extra_bits: `0`, NULL);
101	if (ret)
102	goto out_unlock;
103
104	/*
105	* After dirtying the page our caller will need to start a transaction,
106	* and if we are low on metadata free space, that can cause flushing of
107	* delalloc for all inodes in order to get metadata space released.
108	* However we are holding the range locked for the whole duration of
109	* the clone/dedupe operation, so we may deadlock if that happens and no
110	* other task releases enough space. So mark this inode as not being
111	* possible to flush to avoid such deadlock. We will clear that flag
112	* when we finish cloning all extents, since a transaction is started
113	* after finding each extent to clone.
114	*/
115	set_bit(nr: BTRFS_INODE_NO_DELALLOC_FLUSH, addr: &inode->runtime_flags);
116
117	if (comp_type == BTRFS_COMPRESS_NONE) {
118	memcpy_to_page(page, offset_in_page(file_offset), from: data_start,
119	len: datal);
120	} else {
121	ret = btrfs_decompress(type: comp_type, data_in: data_start, dest_page: page,
122	offset_in_page(file_offset),
123	srclen: inline_size, destlen: datal);
124	if (ret)
125	goto out_unlock;
126	flush_dcache_page(page);
127	}
128
129	/*
130	* If our inline data is smaller then the block/page size, then the
131	* remaining of the block/page is equivalent to zeroes. We had something
132	* like the following done:
133	*
134	* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
135	* $ sync # (or fsync)
136	* $ xfs_io -c "falloc 0 4K" file
137	* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
138	*
139	* So what's in the range [500, 4095] corresponds to zeroes.
140	*/
141	if (datal < block_size)
142	memzero_page(page, offset: datal, len: block_size - datal);
143
144	btrfs_folio_set_uptodate(fs_info, page_folio(page), start: file_offset, len: block_size);
145	btrfs_folio_clear_checked(fs_info, page_folio(page), start: file_offset, len: block_size);
146	btrfs_folio_set_dirty(fs_info, page_folio(page), start: file_offset, len: block_size);
147	out_unlock:
148	if (page) {
149	unlock_page(page);
150	put_page(page);
151	}
152	if (ret)
153	btrfs_delalloc_release_space(inode, reserved: data_reserved, start: file_offset,
154	len: block_size, qgroup_free: true);
155	btrfs_delalloc_release_extents(inode, num_bytes: block_size);
156	out:
157	extent_changeset_free(changeset: data_reserved);
158
159	return ret;
160	}
161
162	/*
163	* Deal with cloning of inline extents. We try to copy the inline extent from
164	* the source inode to destination inode when possible. When not possible we
165	* copy the inline extent's data into the respective page of the inode.
166	*/
167	static int clone_copy_inline_extent(struct inode *dst,
168	struct btrfs_path *path,
169	struct btrfs_key *new_key,
170	const u64 drop_start,
171	const u64 datal,
172	const u64 size,
173	const u8 comp_type,
174	char *inline_data,
175	struct btrfs_trans_handle **trans_out)
176	{
177	struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
178	struct btrfs_root *root = BTRFS_I(inode: dst)->root;
179	const u64 aligned_end = ALIGN(new_key->offset + datal,
180	fs_info->sectorsize);
181	struct btrfs_trans_handle *trans = NULL;
182	struct btrfs_drop_extents_args drop_args = { `0` };
183	int ret;
184	struct btrfs_key key;
185
186	if (new_key->offset > `0`) {
187	ret = copy_inline_to_page(inode: BTRFS_I(inode: dst), file_offset: new_key->offset,
188	inline_data, size, datal, comp_type);
189	goto out;
190	}
191
192	key.objectid = btrfs_ino(inode: BTRFS_I(inode: dst));
193	key.type = BTRFS_EXTENT_DATA_KEY;
194	key.offset = `0`;
195	ret = btrfs_search_slot(NULL, root, key: &key, p: path, ins_len: `0`, cow: `0`);
196	if (ret < `0`) {
197	return ret;
198	} else if (ret > `0`) {
199	if (path->slots[`0`] >= btrfs_header_nritems(eb: path->nodes[`0`])) {
200	ret = btrfs_next_leaf(root, path);
201	if (ret < `0`)
202	return ret;
203	else if (ret > `0`)
204	goto copy_inline_extent;
205	}
206	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key, nr: path->slots[`0`]);
207	if (key.objectid == btrfs_ino(inode: BTRFS_I(inode: dst)) &&
208	key.type == BTRFS_EXTENT_DATA_KEY) {
209	/*
210	* There's an implicit hole at file offset 0, copy the
211	* inline extent's data to the page.
212	*/
213	ASSERT(key.offset > `0`);
214	goto copy_to_page;
215	}
216	} else if (i_size_read(inode: dst) <= datal) {
217	struct btrfs_file_extent_item *ei;
218
219	ei = btrfs_item_ptr(path->nodes[`0`], path->slots[`0`],
220	struct btrfs_file_extent_item);
221	/*
222	* If it's an inline extent replace it with the source inline
223	* extent, otherwise copy the source inline extent data into
224	* the respective page at the destination inode.
225	*/
226	if (btrfs_file_extent_type(eb: path->nodes[`0`], s: ei) ==
227	BTRFS_FILE_EXTENT_INLINE)
228	goto copy_inline_extent;
229
230	goto copy_to_page;
231	}
232
233	copy_inline_extent:
234	/*
235	* We have no extent items, or we have an extent at offset 0 which may
236	* or may not be inlined. All these cases are dealt the same way.
237	*/
238	if (i_size_read(inode: dst) > datal) {
239	/*
240	* At the destination offset 0 we have either a hole, a regular
241	* extent or an inline extent larger then the one we want to
242	* clone. Deal with all these cases by copying the inline extent
243	* data into the respective page at the destination inode.
244	*/
245	goto copy_to_page;
246	}
247
248	/*
249	* Release path before starting a new transaction so we don't hold locks
250	* that would confuse lockdep.
251	*/
252	btrfs_release_path(p: path);
253	/*
254	* If we end up here it means were copy the inline extent into a leaf
255	* of the destination inode. We know we will drop or adjust at most one
256	* extent item in the destination root.
257	*
258	* 1 unit - adjusting old extent (we may have to split it)
259	* 1 unit - add new extent
260	* 1 unit - inode update
261	*/
262	trans = btrfs_start_transaction(root, num_items: `3`);
263	if (IS_ERR(ptr: trans)) {
264	ret = PTR_ERR(ptr: trans);
265	trans = NULL;
266	goto out;
267	}
268	drop_args.path = path;
269	drop_args.start = drop_start;
270	drop_args.end = aligned_end;
271	drop_args.drop_cache = true;
272	ret = btrfs_drop_extents(trans, root, inode: BTRFS_I(inode: dst), args: &drop_args);
273	if (ret)
274	goto out;
275	ret = btrfs_insert_empty_item(trans, root, path, key: new_key, data_size: size);
276	if (ret)
277	goto out;
278
279	write_extent_buffer(eb: path->nodes[`0`], src: inline_data,
280	btrfs_item_ptr_offset(path->nodes[`0`],
281	path->slots[`0`]),
282	len: size);
283	btrfs_update_inode_bytes(inode: BTRFS_I(inode: dst), add_bytes: datal, del_bytes: drop_args.bytes_found);
284	btrfs_set_inode_full_sync(inode: BTRFS_I(inode: dst));
285	ret = btrfs_inode_set_file_extent_range(inode: BTRFS_I(inode: dst), start: `0`, len: aligned_end);
286	out:
287	if (!ret && !trans) {
288	/*
289	* No transaction here means we copied the inline extent into a
290	* page of the destination inode.
291	*
292	* 1 unit to update inode item
293	*/
294	trans = btrfs_start_transaction(root, num_items: `1`);
295	if (IS_ERR(ptr: trans)) {
296	ret = PTR_ERR(ptr: trans);
297	trans = NULL;
298	}
299	}
300	if (ret && trans) {
301	btrfs_abort_transaction(trans, ret);
302	btrfs_end_transaction(trans);
303	}
304	if (!ret)
305	*trans_out = trans;
306
307	return ret;
308
309	copy_to_page:
310	/*
311	* Release our path because we don't need it anymore and also because
312	* copy_inline_to_page() needs to reserve data and metadata, which may
313	* need to flush delalloc when we are low on available space and
314	* therefore cause a deadlock if writeback of an inline extent needs to
315	* write to the same leaf or an ordered extent completion needs to write
316	* to the same leaf.
317	*/
318	btrfs_release_path(p: path);
319
320	ret = copy_inline_to_page(inode: BTRFS_I(inode: dst), file_offset: new_key->offset,
321	inline_data, size, datal, comp_type);
322	goto out;
323	}
324
325	/*
326	* Clone a range from inode file to another.
327	*
328	* @src: Inode to clone from
329	* @inode: Inode to clone to
330	* @off: Offset within source to start clone from
331	* @olen: Original length, passed by user, of range to clone
332	* @olen_aligned: Block-aligned value of olen
333	* @destoff: Offset within @inode to start clone
334	* @no_time_update: Whether to update mtime/ctime on the target inode
335	*/
336	static int btrfs_clone(struct inode src, struct* inode *inode,
337	const u64 off, const u64 olen, const u64 olen_aligned,
338	const u64 destoff, int no_time_update)
339	{
340	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
341	struct btrfs_path *path = NULL;
342	struct extent_buffer *leaf;
343	struct btrfs_trans_handle *trans;
344	char *buf = NULL;
345	struct btrfs_key key;
346	u32 nritems;
347	int slot;
348	int ret;
349	const u64 len = olen_aligned;
350	u64 last_dest_end = destoff;
351	u64 prev_extent_end = off;
352
353	ret = -ENOMEM;
354	buf = kvmalloc(size: fs_info->nodesize, GFP_KERNEL);
355	if (!buf)
356	return ret;
357
358	path = btrfs_alloc_path();
359	if (!path) {
360	kvfree(addr: buf);
361	return ret;
362	}
363
364	path->reada = READA_FORWARD;
365	/ Clone data /
366	key.objectid = btrfs_ino(inode: BTRFS_I(inode: src));
367	key.type = BTRFS_EXTENT_DATA_KEY;
368	key.offset = off;
369
370	while (`1`) {
371	struct btrfs_file_extent_item *extent;
372	u64 extent_gen;
373	int type;
374	u32 size;
375	struct btrfs_key new_key;
376	u64 disko = `0`, diskl = `0`;
377	u64 datao = `0`, datal = `0`;
378	u8 comp;
379	u64 drop_start;
380
381	/ Note the key will change type as we walk through the tree /
382	ret = btrfs_search_slot(NULL, root: BTRFS_I(inode: src)->root, key: &key, p: path,
383	ins_len: `0`, cow: `0`);
384	if (ret < `0`)
385	goto out;
386	/*
387	* First search, if no extent item that starts at offset off was
388	* found but the previous item is an extent item, it's possible
389	* it might overlap our target range, therefore process it.
390	*/
391	if (key.offset == off && ret > `0` && path->slots[`0`] > `0`) {
392	btrfs_item_key_to_cpu(eb: path->nodes[`0`], cpu_key: &key,
393	nr: path->slots[`0`] - `1`);
394	if (key.type == BTRFS_EXTENT_DATA_KEY)
395	path->slots[`0`]--;
396	}
397
398	nritems = btrfs_header_nritems(eb: path->nodes[`0`]);
399	process_slot:
400	if (path->slots[`0`] >= nritems) {
401	ret = btrfs_next_leaf(root: BTRFS_I(inode: src)->root, path);
402	if (ret < `0`)
403	goto out;
404	if (ret > `0`)
405	break;
406	nritems = btrfs_header_nritems(eb: path->nodes[`0`]);
407	}
408	leaf = path->nodes[`0`];
409	slot = path->slots[`0`];
410
411	btrfs_item_key_to_cpu(eb: leaf, cpu_key: &key, nr: slot);
412	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
413	key.objectid != btrfs_ino(inode: BTRFS_I(inode: src)))
414	break;
415
416	ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
417
418	extent = btrfs_item_ptr(leaf, slot,
419	struct btrfs_file_extent_item);
420	extent_gen = btrfs_file_extent_generation(eb: leaf, s: extent);
421	comp = btrfs_file_extent_compression(eb: leaf, s: extent);
422	type = btrfs_file_extent_type(eb: leaf, s: extent);
423	if (type == BTRFS_FILE_EXTENT_REG \|\|
424	type == BTRFS_FILE_EXTENT_PREALLOC) {
425	disko = btrfs_file_extent_disk_bytenr(eb: leaf, s: extent);
426	diskl = btrfs_file_extent_disk_num_bytes(eb: leaf, s: extent);
427	datao = btrfs_file_extent_offset(eb: leaf, s: extent);
428	datal = btrfs_file_extent_num_bytes(eb: leaf, s: extent);
429	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
430	/ Take upper bound, may be compressed /
431	datal = btrfs_file_extent_ram_bytes(eb: leaf, s: extent);
432	}
433
434	/*
435	* The first search might have left us at an extent item that
436	* ends before our target range's start, can happen if we have
437	* holes and NO_HOLES feature enabled.
438	*
439	* Subsequent searches may leave us on a file range we have
440	* processed before - this happens due to a race with ordered
441	* extent completion for a file range that is outside our source
442	* range, but that range was part of a file extent item that
443	* also covered a leading part of our source range.
444	*/
445	if (key.offset + datal <= prev_extent_end) {
446	path->slots[`0`]++;
447	goto process_slot;
448	} else if (key.offset >= off + len) {
449	break;
450	}
451
452	prev_extent_end = key.offset + datal;
453	size = btrfs_item_size(eb: leaf, slot);
454	read_extent_buffer(eb: leaf, dst: buf, btrfs_item_ptr_offset(leaf, slot),
455	len: size);
456
457	btrfs_release_path(p: path);
458
459	memcpy(&new_key, &key, sizeof(new_key));
460	new_key.objectid = btrfs_ino(inode: BTRFS_I(inode));
461	if (off <= key.offset)
462	new_key.offset = key.offset + destoff - off;
463	else
464	new_key.offset = destoff;
465
466	/*
467	* Deal with a hole that doesn't have an extent item that
468	* represents it (NO_HOLES feature enabled).
469	* This hole is either in the middle of the cloning range or at
470	* the beginning (fully overlaps it or partially overlaps it).
471	*/
472	if (new_key.offset != last_dest_end)
473	drop_start = last_dest_end;
474	else
475	drop_start = new_key.offset;
476
477	if (type == BTRFS_FILE_EXTENT_REG \|\|
478	type == BTRFS_FILE_EXTENT_PREALLOC) {
479	struct btrfs_replace_extent_info clone_info;
480
481	/*
482	* a \| --- range to clone ---\| b
483	* \| ------------- extent ------------- \|
484	*/
485
486	/ Subtract range b /
487	if (key.offset + datal > off + len)
488	datal = off + len - key.offset;
489
490	/ Subtract range a /
491	if (off > key.offset) {
492	datao += off - key.offset;
493	datal -= off - key.offset;
494	}
495
496	clone_info.disk_offset = disko;
497	clone_info.disk_len = diskl;
498	clone_info.data_offset = datao;
499	clone_info.data_len = datal;
500	clone_info.file_offset = new_key.offset;
501	clone_info.extent_buf = buf;
502	clone_info.is_new_extent = false;
503	clone_info.update_times = !no_time_update;
504	ret = btrfs_replace_file_extents(inode: BTRFS_I(inode), path,
505	start: drop_start, end: new_key.offset + datal - `1`,
506	extent_info: &clone_info, trans_out: &trans);
507	if (ret)
508	goto out;
509	} else {
510	ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
511	/*
512	* Inline extents always have to start at file offset 0
513	* and can never be bigger then the sector size. We can
514	* never clone only parts of an inline extent, since all
515	* reflink operations must start at a sector size aligned
516	* offset, and the length must be aligned too or end at
517	* the i_size (which implies the whole inlined data).
518	*/
519	ASSERT(key.offset == `0`);
520	ASSERT(datal <= fs_info->sectorsize);
521	if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) \|\|
522	WARN_ON(key.offset != `0`) \|\|
523	WARN_ON(datal > fs_info->sectorsize)) {
524	ret = -EUCLEAN;
525	goto out;
526	}
527
528	ret = clone_copy_inline_extent(dst: inode, path, new_key: &new_key,
529	drop_start, datal, size,
530	comp_type: comp, inline_data: buf, trans_out: &trans);
531	if (ret)
532	goto out;
533	}
534
535	btrfs_release_path(p: path);
536
537	/*
538	* Whenever we share an extent we update the last_reflink_trans
539	* of each inode to the current transaction. This is needed to
540	* make sure fsync does not log multiple checksum items with
541	* overlapping ranges (because some extent items might refer
542	* only to sections of the original extent). For the destination
543	* inode we do this regardless of the generation of the extents
544	* or even if they are inline extents or explicit holes, to make
545	* sure a full fsync does not skip them. For the source inode,
546	* we only need to update last_reflink_trans in case it's a new
547	* extent that is not a hole or an inline extent, to deal with
548	* the checksums problem on fsync.
549	*/
550	if (extent_gen == trans->transid && disko > `0`)
551	BTRFS_I(inode: src)->last_reflink_trans = trans->transid;
552
553	BTRFS_I(inode)->last_reflink_trans = trans->transid;
554
555	last_dest_end = ALIGN(new_key.offset + datal,
556	fs_info->sectorsize);
557	ret = clone_finish_inode_update(trans, inode, endoff: last_dest_end,
558	destoff, olen, no_time_update);
559	if (ret)
560	goto out;
561	if (new_key.offset + datal >= destoff + len)
562	break;
563
564	btrfs_release_path(p: path);
565	key.offset = prev_extent_end;
566
567	if (fatal_signal_pending(current)) {
568	ret = -EINTR;
569	goto out;
570	}
571
572	cond_resched();
573	}
574	ret = `0`;
575
576	if (last_dest_end < destoff + len) {
577	/*
578	* We have an implicit hole that fully or partially overlaps our
579	* cloning range at its end. This means that we either have the
580	* NO_HOLES feature enabled or the implicit hole happened due to
581	* mixing buffered and direct IO writes against this file.
582	*/
583	btrfs_release_path(p: path);
584
585	/*
586	* When using NO_HOLES and we are cloning a range that covers
587	* only a hole (no extents) into a range beyond the current
588	* i_size, punching a hole in the target range will not create
589	* an extent map defining a hole, because the range starts at or
590	* beyond current i_size. If the file previously had an i_size
591	* greater than the new i_size set by this clone operation, we
592	* need to make sure the next fsync is a full fsync, so that it
593	* detects and logs a hole covering a range from the current
594	* i_size to the new i_size. If the clone range covers extents,
595	* besides a hole, then we know the full sync flag was already
596	* set by previous calls to btrfs_replace_file_extents() that
597	* replaced file extent items.
598	*/
599	if (last_dest_end >= i_size_read(inode))
600	btrfs_set_inode_full_sync(inode: BTRFS_I(inode));
601
602	ret = btrfs_replace_file_extents(inode: BTRFS_I(inode), path,
603	start: last_dest_end, end: destoff + len - `1`, NULL, trans_out: &trans);
604	if (ret)
605	goto out;
606
607	ret = clone_finish_inode_update(trans, inode, endoff: destoff + len,
608	destoff, olen, no_time_update);
609	}
610
611	out:
612	btrfs_free_path(p: path);
613	kvfree(addr: buf);
614	clear_bit(nr: BTRFS_INODE_NO_DELALLOC_FLUSH, addr: &BTRFS_I(inode)->runtime_flags);
615
616	return ret;
617	}
618
619	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
620	struct inode *inode2, u64 loff2, u64 len)
621	{
622	unlock_extent(tree: &BTRFS_I(inode: inode1)->io_tree, start: loff1, end: loff1 + len - `1`, NULL);
623	unlock_extent(tree: &BTRFS_I(inode: inode2)->io_tree, start: loff2, end: loff2 + len - `1`, NULL);
624	}
625
626	static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
627	struct inode *inode2, u64 loff2, u64 len)
628	{
629	u64 range1_end = loff1 + len - `1`;
630	u64 range2_end = loff2 + len - `1`;
631
632	if (inode1 < inode2) {
633	swap(inode1, inode2);
634	swap(loff1, loff2);
635	swap(range1_end, range2_end);
636	} else if (inode1 == inode2 && loff2 < loff1) {
637	swap(loff1, loff2);
638	swap(range1_end, range2_end);
639	}
640
641	lock_extent(tree: &BTRFS_I(inode: inode1)->io_tree, start: loff1, end: range1_end, NULL);
642	lock_extent(tree: &BTRFS_I(inode: inode2)->io_tree, start: loff2, end: range2_end, NULL);
643
644	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode: inode1), start: loff1, end: range1_end);
645	btrfs_assert_inode_range_clean(inode: BTRFS_I(inode: inode2), start: loff2, end: range2_end);
646	}
647
648	static void btrfs_double_mmap_lock(struct inode inode1, struct* inode *inode2)
649	{
650	if (inode1 < inode2)
651	swap(inode1, inode2);
652	down_write(sem: &BTRFS_I(inode: inode1)->i_mmap_lock);
653	down_write_nested(sem: &BTRFS_I(inode: inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
654	}
655
656	static void btrfs_double_mmap_unlock(struct inode inode1, struct* inode *inode2)
657	{
658	up_write(sem: &BTRFS_I(inode: inode1)->i_mmap_lock);
659	up_write(sem: &BTRFS_I(inode: inode2)->i_mmap_lock);
660	}
661
662	static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
663	struct inode *dst, u64 dst_loff)
664	{
665	struct btrfs_fs_info *fs_info = BTRFS_I(inode: src)->root->fs_info;
666	const u64 bs = fs_info->sectorsize;
667	int ret;
668
669	/*
670	* Lock destination range to serialize with concurrent readahead() and
671	* source range to serialize with relocation.
672	*/
673	btrfs_double_extent_lock(inode1: src, loff1: loff, inode2: dst, loff2: dst_loff, len);
674	ret = btrfs_clone(src, inode: dst, off: loff, olen: len, ALIGN(len, bs), destoff: dst_loff, no_time_update: `1`);
675	btrfs_double_extent_unlock(inode1: src, loff1: loff, inode2: dst, loff2: dst_loff, len);
676
677	btrfs_btree_balance_dirty(fs_info);
678
679	return ret;
680	}
681
682	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
683	struct inode *dst, u64 dst_loff)
684	{
685	int ret = `0`;
686	u64 i, tail_len, chunk_count;
687	struct btrfs_root *root_dst = BTRFS_I(inode: dst)->root;
688
689	spin_lock(lock: &root_dst->root_item_lock);
690	if (root_dst->send_in_progress) {
691	btrfs_warn_rl(root_dst->fs_info,
692	"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
693	root_dst->root_key.objectid,
694	root_dst->send_in_progress);
695	spin_unlock(lock: &root_dst->root_item_lock);
696	return -EAGAIN;
697	}
698	root_dst->dedupe_in_progress++;
699	spin_unlock(lock: &root_dst->root_item_lock);
700
701	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
702	chunk_count = div_u64(dividend: olen, BTRFS_MAX_DEDUPE_LEN);
703
704	for (i = `0`; i < chunk_count; i++) {
705	ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
706	dst, dst_loff);
707	if (ret)
708	goto out;
709
710	loff += BTRFS_MAX_DEDUPE_LEN;
711	dst_loff += BTRFS_MAX_DEDUPE_LEN;
712	}
713
714	if (tail_len > `0`)
715	ret = btrfs_extent_same_range(src, loff, len: tail_len, dst, dst_loff);
716	out:
717	spin_lock(lock: &root_dst->root_item_lock);
718	root_dst->dedupe_in_progress--;
719	spin_unlock(lock: &root_dst->root_item_lock);
720
721	return ret;
722	}
723
724	static noinline int btrfs_clone_files(struct file file, struct* file *file_src,
725	u64 off, u64 olen, u64 destoff)
726	{
727	struct inode *inode = file_inode(f: file);
728	struct inode *src = file_inode(f: file_src);
729	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
730	int ret;
731	int wb_ret;
732	u64 len = olen;
733	u64 bs = fs_info->sectorsize;
734
735	/*
736	* VFS's generic_remap_file_range_prep() protects us from cloning the
737	* eof block into the middle of a file, which would result in corruption
738	* if the file size is not blocksize aligned. So we don't need to check
739	* for that case here.
740	*/
741	if (off + len == src->i_size)
742	len = ALIGN(src->i_size, bs) - off;
743
744	if (destoff > inode->i_size) {
745	const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
746
747	ret = btrfs_cont_expand(inode: BTRFS_I(inode), oldsize: inode->i_size, size: destoff);
748	if (ret)
749	return ret;
750	/*
751	* We may have truncated the last block if the inode's size is
752	* not sector size aligned, so we need to wait for writeback to
753	* complete before proceeding further, otherwise we can race
754	* with cloning and attempt to increment a reference to an
755	* extent that no longer exists (writeback completed right after
756	* we found the previous extent covering eof and before we
757	* attempted to increment its reference count).
758	*/
759	ret = btrfs_wait_ordered_range(inode, start: wb_start,
760	len: destoff - wb_start);
761	if (ret)
762	return ret;
763	}
764
765	/*
766	* Lock destination range to serialize with concurrent readahead() and
767	* source range to serialize with relocation.
768	*/
769	btrfs_double_extent_lock(inode1: src, loff1: off, inode2: inode, loff2: destoff, len);
770	ret = btrfs_clone(src, inode, off, olen, olen_aligned: len, destoff, no_time_update: `0`);
771	btrfs_double_extent_unlock(inode1: src, loff1: off, inode2: inode, loff2: destoff, len);
772
773	/*
774	* We may have copied an inline extent into a page of the destination
775	* range, so wait for writeback to complete before truncating pages
776	* from the page cache. This is a rare case.
777	*/
778	wb_ret = btrfs_wait_ordered_range(inode, start: destoff, len);
779	ret = ret ? ret : wb_ret;
780	/*
781	* Truncate page cache pages so that future reads will see the cloned
782	* data immediately and not the previous data.
783	*/
784	truncate_inode_pages_range(&inode->i_data,
785	round_down(destoff, PAGE_SIZE),
786	round_up(destoff + len, PAGE_SIZE) - `1`);
787
788	btrfs_btree_balance_dirty(fs_info);
789
790	return ret;
791	}
792
793	static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
794	struct file *file_out, loff_t pos_out,
795	loff_t len, unsigned* int remap_flags)
796	{
797	struct inode *inode_in = file_inode(f: file_in);
798	struct inode *inode_out = file_inode(f: file_out);
799	u64 bs = BTRFS_I(inode: inode_out)->root->fs_info->sectorsize;
800	u64 wb_len;
801	int ret;
802
803	if (!(remap_flags & REMAP_FILE_DEDUP)) {
804	struct btrfs_root *root_out = BTRFS_I(inode: inode_out)->root;
805
806	if (btrfs_root_readonly(root: root_out))
807	return -EROFS;
808
809	ASSERT(inode_in->i_sb == inode_out->i_sb);
810	}
811
812	/ Don't make the dst file partly checksummed /
813	if ((BTRFS_I(inode: inode_in)->flags & BTRFS_INODE_NODATASUM) !=
814	(BTRFS_I(inode: inode_out)->flags & BTRFS_INODE_NODATASUM)) {
815	return -EINVAL;
816	}
817
818	/*
819	* Now that the inodes are locked, we need to start writeback ourselves
820	* and can not rely on the writeback from the VFS's generic helper
821	* generic_remap_file_range_prep() because:
822	*
823	* 1) For compression we must call filemap_fdatawrite_range() range
824	* twice (btrfs_fdatawrite_range() does it for us), and the generic
825	* helper only calls it once;
826	*
827	* 2) filemap_fdatawrite_range(), called by the generic helper only
828	* waits for the writeback to complete, i.e. for IO to be done, and
829	* not for the ordered extents to complete. We need to wait for them
830	* to complete so that new file extent items are in the fs tree.
831	*/
832	if (*len == `0` && !(remap_flags & REMAP_FILE_DEDUP))
833	wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
834	else
835	wb_len = ALIGN(*len, bs);
836
837	/*
838	* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
839	*
840	* Btrfs' back references do not have a block level granularity, they
841	* work at the whole extent level.
842	* NOCOW buffered write without data space reserved may not be able
843	* to fall back to CoW due to lack of data space, thus could cause
844	* data loss.
845	*
846	* Here we take a shortcut by flushing the whole inode, so that all
847	* nocow write should reach disk as nocow before we increase the
848	* reference of the extent. We could do better by only flushing NOCOW
849	* data, but that needs extra accounting.
850	*
851	* Also we don't need to check ASYNC_EXTENT, as async extent will be
852	* CoWed anyway, not affecting nocow part.
853	*/
854	ret = filemap_flush(inode_in->i_mapping);
855	if (ret < `0`)
856	return ret;
857
858	ret = btrfs_wait_ordered_range(inode: inode_in, ALIGN_DOWN(pos_in, bs),
859	len: wb_len);
860	if (ret < `0`)
861	return ret;
862	ret = btrfs_wait_ordered_range(inode: inode_out, ALIGN_DOWN(pos_out, bs),
863	len: wb_len);
864	if (ret < `0`)
865	return ret;
866
867	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
868	count: len, remap_flags);
869	}
870
871	static bool file_sync_write(const struct file *file)
872	{
873	if (file->f_flags & (__O_SYNC \| O_DSYNC))
874	return true;
875	if (IS_SYNC(file_inode(file)))
876	return true;
877
878	return false;
879	}
880
881	loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
882	struct file *dst_file, loff_t destoff, loff_t len,
883	unsigned int remap_flags)
884	{
885	struct inode *src_inode = file_inode(f: src_file);
886	struct inode *dst_inode = file_inode(f: dst_file);
887	bool same_inode = dst_inode == src_inode;
888	int ret;
889
890	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
891	return -EINVAL;
892
893	if (same_inode) {
894	btrfs_inode_lock(inode: BTRFS_I(inode: src_inode), ilock_flags: BTRFS_ILOCK_MMAP);
895	} else {
896	lock_two_nondirectories(src_inode, dst_inode);
897	btrfs_double_mmap_lock(inode1: src_inode, inode2: dst_inode);
898	}
899
900	ret = btrfs_remap_file_range_prep(file_in: src_file, pos_in: off, file_out: dst_file, pos_out: destoff,
901	len: &len, remap_flags);
902	if (ret < `0` \|\| len == `0`)
903	goto out_unlock;
904
905	if (remap_flags & REMAP_FILE_DEDUP)
906	ret = btrfs_extent_same(src: src_inode, loff: off, olen: len, dst: dst_inode, dst_loff: destoff);
907	else
908	ret = btrfs_clone_files(file: dst_file, file_src: src_file, off, olen: len, destoff);
909
910	out_unlock:
911	if (same_inode) {
912	btrfs_inode_unlock(inode: BTRFS_I(inode: src_inode), ilock_flags: BTRFS_ILOCK_MMAP);
913	} else {
914	btrfs_double_mmap_unlock(inode1: src_inode, inode2: dst_inode);
915	unlock_two_nondirectories(src_inode, dst_inode);
916	}
917
918	/*
919	* If either the source or the destination file was opened with O_SYNC,
920	* O_DSYNC or has the S_SYNC attribute, fsync both the destination and
921	* source files/ranges, so that after a successful return (0) followed
922	* by a power failure results in the reflinked data to be readable from
923	* both files/ranges.
924	*/
925	if (ret == `0` && len > `0` &&
926	(file_sync_write(file: src_file) \|\| file_sync_write(file: dst_file))) {
927	ret = btrfs_sync_file(file: src_file, start: off, end: off + len - `1`, datasync: `0`);
928	if (ret == `0`)
929	ret = btrfs_sync_file(file: dst_file, start: destoff,
930	end: destoff + len - `1`, datasync: `0`);
931	}
932
933	return ret < `0` ? ret : len;
934	}
935

source code of linux/fs/btrfs/reflink.c