remap_range.c source code [linux/fs/remap_range.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	#include <linux/slab.h>
3	#include <linux/stat.h>
4	#include <linux/sched/xacct.h>
5	#include <linux/fcntl.h>
6	#include <linux/file.h>
7	#include <linux/uio.h>
8	#include <linux/fsnotify.h>
9	#include <linux/security.h>
10	#include <linux/export.h>
11	#include <linux/syscalls.h>
12	#include <linux/pagemap.h>
13	#include <linux/splice.h>
14	#include <linux/compat.h>
15	#include <linux/mount.h>
16	#include <linux/fs.h>
17	#include <linux/dax.h>
18	#include <linux/overflow.h>
19	#include "internal.h"
20
21	#include <linux/uaccess.h>
22	#include <asm/unistd.h>
23
24	/*
25	* Performs necessary checks before doing a clone.
26	*
27	* Can adjust amount of bytes to clone via @req_count argument.
28	* Returns appropriate error code that caller should return or
29	* zero in case the clone should be allowed.
30	*/
31	static int generic_remap_checks(struct file *file_in, loff_t pos_in,
32	struct file *file_out, loff_t pos_out,
33	loff_t req_count, unsigned* int remap_flags)
34	{
35	struct inode *inode_in = file_in->f_mapping->host;
36	struct inode *inode_out = file_out->f_mapping->host;
37	uint64_t count = *req_count;
38	uint64_t bcount;
39	loff_t size_in, size_out;
40	loff_t bs = inode_out->i_sb->s_blocksize;
41	int ret;
42
43	/ The start of both ranges must be aligned to an fs block. /
44	if (!IS_ALIGNED(pos_in, bs) \|\| !IS_ALIGNED(pos_out, bs))
45	return -EINVAL;
46
47	/ Ensure offsets don't wrap. /
48	if (pos_in + count < pos_in \|\| pos_out + count < pos_out)
49	return -EINVAL;
50
51	size_in = i_size_read(inode: inode_in);
52	size_out = i_size_read(inode: inode_out);
53
54	/ Dedupe requires both ranges to be within EOF. /
55	if ((remap_flags & REMAP_FILE_DEDUP) &&
56	(pos_in >= size_in \|\| pos_in + count > size_in \|\|
57	pos_out >= size_out \|\| pos_out + count > size_out))
58	return -EINVAL;
59
60	/ Ensure the infile range is within the infile. /
61	if (pos_in >= size_in)
62	return -EINVAL;
63	count = min(count, size_in - (uint64_t)pos_in);
64
65	ret = generic_write_check_limits(file: file_out, pos: pos_out, count: &count);
66	if (ret)
67	return ret;
68
69	/*
70	* If the user wanted us to link to the infile's EOF, round up to the
71	* next block boundary for this check.
72	*
73	* Otherwise, make sure the count is also block-aligned, having
74	* already confirmed the starting offsets' block alignment.
75	*/
76	if (pos_in + count == size_in &&
77	(!(remap_flags & REMAP_FILE_DEDUP) \|\| pos_out + count == size_out)) {
78	bcount = ALIGN(size_in, bs) - pos_in;
79	} else {
80	if (!IS_ALIGNED(count, bs))
81	count = ALIGN_DOWN(count, bs);
82	bcount = count;
83	}
84
85	/ Don't allow overlapped cloning within the same file. /
86	if (inode_in == inode_out &&
87	pos_out + bcount > pos_in &&
88	pos_out < pos_in + bcount)
89	return -EINVAL;
90
91	/*
92	* We shortened the request but the caller can't deal with that, so
93	* bounce the request back to userspace.
94	*/
95	if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
96	return -EINVAL;
97
98	*req_count = count;
99	return `0`;
100	}
101
102	static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
103	bool write)
104	{
105	int mask = write ? MAY_WRITE : MAY_READ;
106	loff_t tmp;
107	int ret;
108
109	if (unlikely(pos < `0` \|\| len < `0`))
110	return -EINVAL;
111
112	if (unlikely(check_add_overflow(pos, len, &tmp)))
113	return -EINVAL;
114
115	ret = security_file_permission(file, mask);
116	if (ret)
117	return ret;
118
119	return fsnotify_file_area_perm(file, perm_mask: mask, ppos: &pos, count: len);
120	}
121
122	/*
123	* Ensure that we don't remap a partial EOF block in the middle of something
124	* else. Assume that the offsets have already been checked for block
125	* alignment.
126	*
127	* For clone we only link a partial EOF block above or at the destination file's
128	* EOF. For deduplication we accept a partial EOF block only if it ends at the
129	* destination file's EOF (can not link it into the middle of a file).
130	*
131	* Shorten the request if possible.
132	*/
133	static int generic_remap_check_len(struct inode *inode_in,
134	struct inode *inode_out,
135	loff_t pos_out,
136	loff_t *len,
137	unsigned int remap_flags)
138	{
139	u64 blkmask = i_blocksize(node: inode_in) - `1`;
140	loff_t new_len = *len;
141
142	if ((*len & blkmask) == `0`)
143	return `0`;
144
145	if (pos_out + *len < i_size_read(inode: inode_out))
146	new_len &= ~blkmask;
147
148	if (new_len == *len)
149	return `0`;
150
151	if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
152	*len = new_len;
153	return `0`;
154	}
155
156	return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
157	}
158
159	/ Read a page's worth of file data into the page cache. /
160	static struct folio vfs_dedupe_get_folio(struct* file *file, loff_t pos)
161	{
162	return read_mapping_folio(mapping: file->f_mapping, index: pos >> PAGE_SHIFT, file);
163	}
164
165	/*
166	* Lock two folios, ensuring that we lock in offset order if the folios
167	* are from the same file.
168	*/
169	static void vfs_lock_two_folios(struct folio folio1, struct* folio *folio2)
170	{
171	/ Always lock in order of increasing index. /
172	if (folio1->index > folio2->index)
173	swap(folio1, folio2);
174
175	folio_lock(folio: folio1);
176	if (folio1 != folio2)
177	folio_lock(folio: folio2);
178	}
179
/* Unlock two folios, being careful not to unlock the same folio twice. */
static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
	folio_unlock(folio1);
	if (folio1 != folio2)
		folio_unlock(folio2);
}
187
188	/*
189	* Compare extents of two files to see if they are the same.
190	* Caller must have locked both inodes to prevent write races.
191	*/
192	static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
193	struct file *dest, loff_t dstoff,
194	loff_t len, bool *is_same)
195	{
196	bool same = true;
197	int error = -EINVAL;
198
199	while (len) {
200	struct folio src_folio, dst_folio;
201	void src_addr, dst_addr;
202	loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
203	PAGE_SIZE - offset_in_page(dstoff));
204
205	cmp_len = min(cmp_len, len);
206	if (cmp_len <= `0`)
207	goto out_error;
208
209	src_folio = vfs_dedupe_get_folio(file: src, pos: srcoff);
210	if (IS_ERR(ptr: src_folio)) {
211	error = PTR_ERR(ptr: src_folio);
212	goto out_error;
213	}
214	dst_folio = vfs_dedupe_get_folio(file: dest, pos: dstoff);
215	if (IS_ERR(ptr: dst_folio)) {
216	error = PTR_ERR(ptr: dst_folio);
217	folio_put(folio: src_folio);
218	goto out_error;
219	}
220
221	vfs_lock_two_folios(folio1: src_folio, folio2: dst_folio);
222
223	/*
224	* Now that we've locked both folios, make sure they're still
225	* mapped to the file data we're interested in. If not,
226	* someone is invalidating pages on us and we lose.
227	*/
228	if (!folio_test_uptodate(folio: src_folio) \|\| !folio_test_uptodate(folio: dst_folio) \|\|
229	src_folio->mapping != src->f_mapping \|\|
230	dst_folio->mapping != dest->f_mapping) {
231	same = false;
232	goto unlock;
233	}
234
235	src_addr = kmap_local_folio(folio: src_folio,
236	offset_in_folio(src_folio, srcoff));
237	dst_addr = kmap_local_folio(folio: dst_folio,
238	offset_in_folio(dst_folio, dstoff));
239
240	flush_dcache_folio(folio: src_folio);
241	flush_dcache_folio(folio: dst_folio);
242
243	if (memcmp(p: src_addr, q: dst_addr, size: cmp_len))
244	same = false;
245
246	kunmap_local(dst_addr);
247	kunmap_local(src_addr);
248	unlock:
249	vfs_unlock_two_folios(folio1: src_folio, folio2: dst_folio);
250	folio_put(folio: dst_folio);
251	folio_put(folio: src_folio);
252
253	if (!same)
254	break;
255
256	srcoff += cmp_len;
257	dstoff += cmp_len;
258	len -= cmp_len;
259	}
260
261	*is_same = same;
262	return `0`;
263
264	out_error:
265	return error;
266	}
267
268	/*
269	* Check that the two inodes are eligible for cloning, the ranges make
270	* sense, and then flush all dirty data. Caller must ensure that the
271	* inodes have been locked against any other modifications.
272	*
273	* If there's an error, then the usual negative error code is returned.
274	* Otherwise returns 0 with *len set to the request length.
275	*/
276	int
277	__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
278	struct file *file_out, loff_t pos_out,
279	loff_t len, unsigned* int remap_flags,
280	const struct iomap_ops *dax_read_ops)
281	{
282	struct inode *inode_in = file_inode(f: file_in);
283	struct inode *inode_out = file_inode(f: file_out);
284	bool same_inode = (inode_in == inode_out);
285	int ret;
286
287	/ Don't touch certain kinds of inodes /
288	if (IS_IMMUTABLE(inode_out))
289	return -EPERM;
290
291	if (IS_SWAPFILE(inode_in) \|\| IS_SWAPFILE(inode_out))
292	return -ETXTBSY;
293
294	/ Don't reflink dirs, pipes, sockets... /
295	if (S_ISDIR(inode_in->i_mode) \|\| S_ISDIR(inode_out->i_mode))
296	return -EISDIR;
297	if (!S_ISREG(inode_in->i_mode) \|\| !S_ISREG(inode_out->i_mode))
298	return -EINVAL;
299
300	/ Zero length dedupe exits immediately; reflink goes to EOF. /
301	if (*len == `0`) {
302	loff_t isize = i_size_read(inode: inode_in);
303
304	if ((remap_flags & REMAP_FILE_DEDUP) \|\| pos_in == isize)
305	return `0`;
306	if (pos_in > isize)
307	return -EINVAL;
308	*len = isize - pos_in;
309	if (*len == `0`)
310	return `0`;
311	}
312
313	/ Check that we don't violate system file offset limits. /
314	ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, req_count: len,
315	remap_flags);
316	if (ret \|\| *len == `0`)
317	return ret;
318
319	/ Wait for the completion of any pending IOs on both files /
320	inode_dio_wait(inode: inode_in);
321	if (!same_inode)
322	inode_dio_wait(inode: inode_out);
323
324	ret = filemap_write_and_wait_range(mapping: inode_in->i_mapping,
325	lstart: pos_in, lend: pos_in + *len - `1`);
326	if (ret)
327	return ret;
328
329	ret = filemap_write_and_wait_range(mapping: inode_out->i_mapping,
330	lstart: pos_out, lend: pos_out + *len - `1`);
331	if (ret)
332	return ret;
333
334	/*
335	* Check that the extents are the same.
336	*/
337	if (remap_flags & REMAP_FILE_DEDUP) {
338	bool is_same = false;
339
340	if (!IS_DAX(inode_in))
341	ret = vfs_dedupe_file_range_compare(src: file_in, srcoff: pos_in,
342	dest: file_out, dstoff: pos_out, len: *len, is_same: &is_same);
343	else if (dax_read_ops)
344	ret = dax_dedupe_file_range_compare(src: inode_in, srcoff: pos_in,
345	dest: inode_out, destoff: pos_out, len: *len, is_same: &is_same,
346	ops: dax_read_ops);
347	else
348	return -EINVAL;
349	if (ret)
350	return ret;
351	if (!is_same)
352	return -EBADE;
353	}
354
355	ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
356	remap_flags);
357	if (ret \|\| *len == `0`)
358	return ret;
359
360	/ If can't alter the file contents, we're done. /
361	if (!(remap_flags & REMAP_FILE_DEDUP))
362	ret = file_modified(file: file_out);
363
364	return ret;
365	}
366
367	int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
368	struct file *file_out, loff_t pos_out,
369	loff_t len, unsigned* int remap_flags)
370	{
371	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
372	pos_out, len, remap_flags, NULL);
373	}
374	EXPORT_SYMBOL(generic_remap_file_range_prep);
375
376	loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
377	struct file *file_out, loff_t pos_out,
378	loff_t len, unsigned int remap_flags)
379	{
380	loff_t ret;
381
382	WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
383
384	if (file_inode(f: file_in)->i_sb != file_inode(f: file_out)->i_sb)
385	return -EXDEV;
386
387	ret = generic_file_rw_checks(file_in, file_out);
388	if (ret < `0`)
389	return ret;
390
391	if (!file_in->f_op->remap_file_range)
392	return -EOPNOTSUPP;
393
394	ret = remap_verify_area(file: file_in, pos: pos_in, len, write: false);
395	if (ret)
396	return ret;
397
398	ret = remap_verify_area(file: file_out, pos: pos_out, len, write: true);
399	if (ret)
400	return ret;
401
402	file_start_write(file: file_out);
403	ret = file_in->f_op->remap_file_range(file_in, pos_in,
404	file_out, pos_out, len, remap_flags);
405	file_end_write(file: file_out);
406	if (ret < `0`)
407	return ret;
408
409	fsnotify_access(file: file_in);
410	fsnotify_modify(file: file_out);
411	return ret;
412	}
413	EXPORT_SYMBOL(vfs_clone_file_range);
414
415	/ Check whether we are allowed to dedupe the destination file /
416	static bool may_dedupe_file(struct file *file)
417	{
418	struct mnt_idmap *idmap = file_mnt_idmap(file);
419	struct inode *inode = file_inode(f: file);
420
421	if (capable(CAP_SYS_ADMIN))
422	return true;
423	if (file->f_mode & FMODE_WRITE)
424	return true;
425	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), current_fsuid()))
426	return true;
427	if (!inode_permission(idmap, inode, MAY_WRITE))
428	return true;
429	return false;
430	}
431
432	loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
433	struct file *dst_file, loff_t dst_pos,
434	loff_t len, unsigned int remap_flags)
435	{
436	loff_t ret;
437
438	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP \|
439	REMAP_FILE_CAN_SHORTEN));
440
441	/*
442	* This is redundant if called from vfs_dedupe_file_range(), but other
443	* callers need it and it's not performance sesitive...
444	*/
445	ret = remap_verify_area(file: src_file, pos: src_pos, len, write: false);
446	if (ret)
447	return ret;
448
449	ret = remap_verify_area(file: dst_file, pos: dst_pos, len, write: true);
450	if (ret)
451	return ret;
452
453	/*
454	* This needs to be called after remap_verify_area() because of
455	* sb_start_write() and before may_dedupe_file() because the mount's
456	* MAY_WRITE need to be checked with mnt_get_write_access_file() held.
457	*/
458	ret = mnt_want_write_file(file: dst_file);
459	if (ret)
460	return ret;
461
462	ret = -EPERM;
463	if (!may_dedupe_file(file: dst_file))
464	goto out_drop_write;
465
466	ret = -EXDEV;
467	if (file_inode(f: src_file)->i_sb != file_inode(f: dst_file)->i_sb)
468	goto out_drop_write;
469
470	ret = -EISDIR;
471	if (S_ISDIR(file_inode(dst_file)->i_mode))
472	goto out_drop_write;
473
474	ret = -EINVAL;
475	if (!dst_file->f_op->remap_file_range)
476	goto out_drop_write;
477
478	if (len == `0`) {
479	ret = `0`;
480	goto out_drop_write;
481	}
482
483	ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
484	dst_pos, len, remap_flags \| REMAP_FILE_DEDUP);
485	out_drop_write:
486	mnt_drop_write_file(file: dst_file);
487
488	return ret;
489	}
490	EXPORT_SYMBOL(vfs_dedupe_file_range_one);
491
492	int vfs_dedupe_file_range(struct file file, struct* file_dedupe_range *same)
493	{
494	struct file_dedupe_range_info *info;
495	struct inode *src = file_inode(f: file);
496	u64 off;
497	u64 len;
498	int i;
499	int ret;
500	u16 count = same->dest_count;
501	loff_t deduped;
502
503	if (!(file->f_mode & FMODE_READ))
504	return -EINVAL;
505
506	if (same->reserved1 \|\| same->reserved2)
507	return -EINVAL;
508
509	off = same->src_offset;
510	len = same->src_length;
511
512	if (S_ISDIR(src->i_mode))
513	return -EISDIR;
514
515	if (!S_ISREG(src->i_mode))
516	return -EINVAL;
517
518	if (!file->f_op->remap_file_range)
519	return -EOPNOTSUPP;
520
521	ret = remap_verify_area(file, pos: off, len, write: false);
522	if (ret < `0`)
523	return ret;
524	ret = `0`;
525
526	if (off + len > i_size_read(inode: src))
527	return -EINVAL;
528
529	/ Arbitrary 1G limit on a single dedupe request, can be raised. /
530	len = min_t(u64, len, `1` << `30`);
531
532	/ pre-format output fields to sane values /
533	for (i = `0`; i < count; i++) {
534	same->info[i].bytes_deduped = `0ULL`;
535	same->info[i].status = FILE_DEDUPE_RANGE_SAME;
536	}
537
538	for (i = `0`, info = same->info; i < count; i++, info++) {
539	struct fd dst_fd = fdget(fd: info->dest_fd);
540	struct file *dst_file = dst_fd.file;
541
542	if (!dst_file) {
543	info->status = -EBADF;
544	goto next_loop;
545	}
546
547	if (info->reserved) {
548	info->status = -EINVAL;
549	goto next_fdput;
550	}
551
552	deduped = vfs_dedupe_file_range_one(file, off, dst_file,
553	info->dest_offset, len,
554	REMAP_FILE_CAN_SHORTEN);
555	if (deduped == -EBADE)
556	info->status = FILE_DEDUPE_RANGE_DIFFERS;
557	else if (deduped < `0`)
558	info->status = deduped;
559	else
560	info->bytes_deduped = len;
561
562	next_fdput:
563	fdput(fd: dst_fd);
564	next_loop:
565	if (fatal_signal_pending(current))
566	break;
567	}
568	return ret;
569	}
570	EXPORT_SYMBOL(vfs_dedupe_file_range);
571

source code of linux/fs/remap_range.c