file.c source code [linux/fs/ceph/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3	#include <linux/ceph/striper.h>
4
5	#include <linux/module.h>
6	#include <linux/sched.h>
7	#include <linux/slab.h>
8	#include <linux/file.h>
9	#include <linux/mount.h>
10	#include <linux/namei.h>
11	#include <linux/writeback.h>
12	#include <linux/falloc.h>
13	#include <linux/iversion.h>
14	#include <linux/ktime.h>
15
16	#include "super.h"
17	#include "mds_client.h"
18	#include "cache.h"
19	#include "io.h"
20	#include "metric.h"
21
22	static __le32 ceph_flags_sys2wire(u32 flags)
23	{
24	u32 wire_flags = `0`;
25
26	switch (flags & O_ACCMODE) {
27	case O_RDONLY:
28	wire_flags \|= CEPH_O_RDONLY;
29	break;
30	case O_WRONLY:
31	wire_flags \|= CEPH_O_WRONLY;
32	break;
33	case O_RDWR:
34	wire_flags \|= CEPH_O_RDWR;
35	break;
36	}
37
38	flags &= ~O_ACCMODE;
39
40	#define ceph_sys2wire(a) if (flags & a) { wire_flags \|= CEPH_##a; flags &= ~a; }
41
42	ceph_sys2wire(O_CREAT);
43	ceph_sys2wire(O_EXCL);
44	ceph_sys2wire(O_TRUNC);
45	ceph_sys2wire(O_DIRECTORY);
46	ceph_sys2wire(O_NOFOLLOW);
47
48	#undef ceph_sys2wire
49
50	if (flags)
51	dout("unused open flags: %x\n", flags);
52
53	return cpu_to_le32(wire_flags);
54	}
55
56	/*
57	* Ceph file operations
58	*
59	* Implement basic open/close functionality, and implement
60	* read/write.
61	*
62	* We implement three modes of file I/O:
63	* - buffered uses the generic_file_aio_{read,write} helpers
64	*
65	* - synchronous is used when there is multi-client read/write
66	* sharing, avoids the page cache, and synchronously waits for an
67	* ack from the OSD.
68	*
69	* - direct io takes the variant of the sync path that references
70	* user pages directly.
71	*
72	* fsync() flushes and waits on dirty pages, but just queues metadata
73	* for writeback: since the MDS can recover size and mtime there is no
74	* need to wait for MDS acknowledgement.
75	*/
76
77	/*
78	* How many pages to get in one call to iov_iter_get_pages(). This
79	* determines the size of the on-stack array used as a buffer.
80	*/
81	#define ITER_GET_BVECS_PAGES 64
82
83	static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
84	struct bio_vec *bvecs)
85	{
86	size_t size = `0`;
87	int bvec_idx = `0`;
88
89	if (maxsize > iov_iter_count(i: iter))
90	maxsize = iov_iter_count(i: iter);
91
92	while (size < maxsize) {
93	struct page *pages[ITER_GET_BVECS_PAGES];
94	ssize_t bytes;
95	size_t start;
96	int idx = `0`;
97
98	bytes = iov_iter_get_pages2(i: iter, pages, maxsize: maxsize - size,
99	ITER_GET_BVECS_PAGES, start: &start);
100	if (bytes < `0`)
101	return size ?: bytes;
102
103	size += bytes;
104
105	for ( ; bytes; idx++, bvec_idx++) {
106	int len = min_t(int, bytes, PAGE_SIZE - start);
107
108	bvec_set_page(bv: &bvecs[bvec_idx], page: pages[idx], len, offset: start);
109	bytes -= len;
110	start = `0`;
111	}
112	}
113
114	return size;
115	}
116
117	/*
118	* iov_iter_get_pages() only considers one iov_iter segment, no matter
119	* what maxsize or maxpages are given. For ITER_BVEC that is a single
120	* page.
121	*
122	* Attempt to get up to @maxsize bytes worth of pages from @iter.
123	* Return the number of bytes in the created bio_vec array, or an error.
124	*/
125	static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
126	struct bio_vec *bvecs, int* *num_bvecs)
127	{
128	struct bio_vec *bv;
129	size_t orig_count = iov_iter_count(i: iter);
130	ssize_t bytes;
131	int npages;
132
133	iov_iter_truncate(i: iter, count: maxsize);
134	npages = iov_iter_npages(i: iter, INT_MAX);
135	iov_iter_reexpand(i: iter, count: orig_count);
136
137	/*
138	* __iter_get_bvecs() may populate only part of the array -- zero it
139	* out.
140	*/
141	bv = kvmalloc_array(n: npages, size: sizeof(*bv), GFP_KERNEL \| __GFP_ZERO);
142	if (!bv)
143	return -ENOMEM;
144
145	bytes = __iter_get_bvecs(iter, maxsize, bvecs: bv);
146	if (bytes < `0`) {
147	/*
148	* No pages were pinned -- just free the array.
149	*/
150	kvfree(addr: bv);
151	return bytes;
152	}
153
154	*bvecs = bv;
155	*num_bvecs = npages;
156	return bytes;
157	}
158
159	static void put_bvecs(struct bio_vec bvecs, int* num_bvecs, bool should_dirty)
160	{
161	int i;
162
163	for (i = `0`; i < num_bvecs; i++) {
164	if (bvecs[i].bv_page) {
165	if (should_dirty)
166	set_page_dirty_lock(bvecs[i].bv_page);
167	put_page(page: bvecs[i].bv_page);
168	}
169	}
170	kvfree(addr: bvecs);
171	}
172
173	/*
174	* Prepare an open request. Preallocate ceph_cap to avoid an
175	* inopportune ENOMEM later.
176	*/
177	static struct ceph_mds_request *
178	prepare_open_request(struct super_block sb, int* flags, int create_mode)
179	{
180	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
181	struct ceph_mds_request *req;
182	int want_auth = USE_ANY_MDS;
183	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
184
185	if (flags & (O_WRONLY\|O_RDWR\|O_CREAT\|O_TRUNC))
186	want_auth = USE_AUTH_MDS;
187
188	req = ceph_mdsc_create_request(mdsc, op, mode: want_auth);
189	if (IS_ERR(ptr: req))
190	goto out;
191	req->r_fmode = ceph_flags_to_mode(flags);
192	req->r_args.open.flags = ceph_flags_sys2wire(flags);
193	req->r_args.open.mode = cpu_to_le32(create_mode);
194	out:
195	return req;
196	}
197
198	static int ceph_init_file_info(struct inode inode, struct* file *file,
199	int fmode, bool isdir)
200	{
201	struct ceph_inode_info *ci = ceph_inode(inode);
202	struct ceph_mount_options *opt =
203	ceph_inode_to_client(inode: &ci->netfs.inode)->mount_options;
204	struct ceph_file_info *fi;
205	int ret;
206
207	dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
208	inode->i_mode, isdir ? "dir" : "regular");
209	BUG_ON(inode->i_fop->release != ceph_release);
210
211	if (isdir) {
212	struct ceph_dir_file_info *dfi =
213	kmem_cache_zalloc(k: ceph_dir_file_cachep, GFP_KERNEL);
214	if (!dfi)
215	return -ENOMEM;
216
217	file->private_data = dfi;
218	fi = &dfi->file_info;
219	dfi->next_offset = `2`;
220	dfi->readdir_cache_idx = -`1`;
221	} else {
222	fi = kmem_cache_zalloc(k: ceph_file_cachep, GFP_KERNEL);
223	if (!fi)
224	return -ENOMEM;
225
226	if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
227	fi->flags \|= CEPH_F_SYNC;
228
229	file->private_data = fi;
230	}
231
232	ceph_get_fmode(ci, mode: fmode, count: `1`);
233	fi->fmode = fmode;
234
235	spin_lock_init(&fi->rw_contexts_lock);
236	INIT_LIST_HEAD(list: &fi->rw_contexts);
237	fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
238
239	if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) {
240	ret = ceph_uninline_data(file);
241	if (ret < `0`)
242	goto error;
243	}
244
245	return `0`;
246
247	error:
248	ceph_fscache_unuse_cookie(inode, update: file->f_mode & FMODE_WRITE);
249	ceph_put_fmode(ci, mode: fi->fmode, count: `1`);
250	kmem_cache_free(s: ceph_file_cachep, objp: fi);
251	/ wake up anyone waiting for caps on this inode /
252	wake_up_all(&ci->i_cap_wq);
253	return ret;
254	}
255
256	/*
257	* initialize private struct file data.
258	* if we fail, clean up by dropping fmode reference on the ceph_inode
259	*/
260	static int ceph_init_file(struct inode inode, struct* file file, int* fmode)
261	{
262	int ret = `0`;
263
264	switch (inode->i_mode & S_IFMT) {
265	case S_IFREG:
266	ceph_fscache_use_cookie(inode, will_modify: file->f_mode & FMODE_WRITE);
267	fallthrough;
268	case S_IFDIR:
269	ret = ceph_init_file_info(inode, file, fmode,
270	S_ISDIR(inode->i_mode));
271	break;
272
273	case S_IFLNK:
274	dout("init_file %p %p 0%o (symlink)\n", inode, file,
275	inode->i_mode);
276	break;
277
278	default:
279	dout("init_file %p %p 0%o (special)\n", inode, file,
280	inode->i_mode);
281	/*
282	* we need to drop the open ref now, since we don't
283	* have .release set to ceph_release.
284	*/
285	BUG_ON(inode->i_fop->release == ceph_release);
286
287	/ call the proper open fop /
288	ret = inode->i_fop->open(inode, file);
289	}
290	return ret;
291	}
292
293	/*
294	* try renew caps after session gets killed.
295	*/
296	int ceph_renew_caps(struct inode inode, int* fmode)
297	{
298	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
299	struct ceph_inode_info *ci = ceph_inode(inode);
300	struct ceph_mds_request *req;
301	int err, flags, wanted;
302
303	spin_lock(lock: &ci->i_ceph_lock);
304	__ceph_touch_fmode(ci, mdsc, fmode);
305	wanted = __ceph_caps_file_wanted(ci);
306	if (__ceph_is_any_real_caps(ci) &&
307	(!(wanted & CEPH_CAP_ANY_WR) \|\| ci->i_auth_cap)) {
308	int issued = __ceph_caps_issued(ci, NULL);
309	spin_unlock(lock: &ci->i_ceph_lock);
310	dout("renew caps %p want %s issued %s updating mds_wanted\n",
311	inode, ceph_cap_string(wanted), ceph_cap_string(issued));
312	ceph_check_caps(ci, flags: `0`);
313	return `0`;
314	}
315	spin_unlock(lock: &ci->i_ceph_lock);
316
317	flags = `0`;
318	if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
319	flags = O_RDWR;
320	else if (wanted & CEPH_CAP_FILE_RD)
321	flags = O_RDONLY;
322	else if (wanted & CEPH_CAP_FILE_WR)
323	flags = O_WRONLY;
324	#ifdef O_LAZY
325	if (wanted & CEPH_CAP_FILE_LAZYIO)
326	flags \|= O_LAZY;
327	#endif
328
329	req = prepare_open_request(sb: inode->i_sb, flags, create_mode: `0`);
330	if (IS_ERR(ptr: req)) {
331	err = PTR_ERR(ptr: req);
332	goto out;
333	}
334
335	req->r_inode = inode;
336	ihold(inode);
337	req->r_num_caps = `1`;
338
339	err = ceph_mdsc_do_request(mdsc, NULL, req);
340	ceph_mdsc_put_request(req);
341	out:
342	dout("renew caps %p open result=%d\n", inode, err);
343	return err < `0` ? err : `0`;
344	}
345
346	/*
347	* If we already have the requisite capabilities, we can satisfy
348	* the open request locally (no need to request new caps from the
349	* MDS). We do, however, need to inform the MDS (asynchronously)
350	* if our wanted caps set expands.
351	*/
352	int ceph_open(struct inode inode, struct* file *file)
353	{
354	struct ceph_inode_info *ci = ceph_inode(inode);
355	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: inode->i_sb);
356	struct ceph_mds_client *mdsc = fsc->mdsc;
357	struct ceph_mds_request *req;
358	struct ceph_file_info *fi = file->private_data;
359	int err;
360	int flags, fmode, wanted;
361
362	if (fi) {
363	dout("open file %p is already opened\n", file);
364	return `0`;
365	}
366
367	/ filter out O_CREAT\|O_EXCL; vfs did that already. yuck. /
368	flags = file->f_flags & ~(O_CREAT\|O_EXCL);
369	if (S_ISDIR(inode->i_mode)) {
370	flags = O_DIRECTORY; / mds likes to know /
371	} else if (S_ISREG(inode->i_mode)) {
372	err = fscrypt_file_open(inode, filp: file);
373	if (err)
374	return err;
375	}
376
377	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
378	ceph_vinop(inode), file, flags, file->f_flags);
379	fmode = ceph_flags_to_mode(flags);
380	wanted = ceph_caps_for_mode(mode: fmode);
381
382	/ snapped files are read-only /
383	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
384	return -EROFS;
385
386	/ trivially open snapdir /
387	if (ceph_snap(inode) == CEPH_SNAPDIR) {
388	return ceph_init_file(inode, file, fmode);
389	}
390
391	/*
392	* No need to block if we have caps on the auth MDS (for
393	* write) or any MDS (for read). Update wanted set
394	* asynchronously.
395	*/
396	spin_lock(lock: &ci->i_ceph_lock);
397	if (__ceph_is_any_real_caps(ci) &&
398	(((fmode & CEPH_FILE_MODE_WR) == `0`) \|\| ci->i_auth_cap)) {
399	int mds_wanted = __ceph_caps_mds_wanted(ci, check: true);
400	int issued = __ceph_caps_issued(ci, NULL);
401
402	dout("open %p fmode %d want %s issued %s using existing\n",
403	inode, fmode, ceph_cap_string(wanted),
404	ceph_cap_string(issued));
405	__ceph_touch_fmode(ci, mdsc, fmode);
406	spin_unlock(lock: &ci->i_ceph_lock);
407
408	/ adjust wanted? /
409	if ((issued & wanted) != wanted &&
410	(mds_wanted & wanted) != wanted &&
411	ceph_snap(inode) != CEPH_SNAPDIR)
412	ceph_check_caps(ci, flags: `0`);
413
414	return ceph_init_file(inode, file, fmode);
415	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
416	(ci->i_snap_caps & wanted) == wanted) {
417	__ceph_touch_fmode(ci, mdsc, fmode);
418	spin_unlock(lock: &ci->i_ceph_lock);
419	return ceph_init_file(inode, file, fmode);
420	}
421
422	spin_unlock(lock: &ci->i_ceph_lock);
423
424	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
425	req = prepare_open_request(sb: inode->i_sb, flags, create_mode: `0`);
426	if (IS_ERR(ptr: req)) {
427	err = PTR_ERR(ptr: req);
428	goto out;
429	}
430	req->r_inode = inode;
431	ihold(inode);
432
433	req->r_num_caps = `1`;
434	err = ceph_mdsc_do_request(mdsc, NULL, req);
435	if (!err)
436	err = ceph_init_file(inode, file, fmode: req->r_fmode);
437	ceph_mdsc_put_request(req);
438	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
439	out:
440	return err;
441	}
442
443	/ Clone the layout from a synchronous create, if the dir now has Dc caps /
444	static void
445	cache_file_layout(struct inode dst, struct* inode *src)
446	{
447	struct ceph_inode_info *cdst = ceph_inode(inode: dst);
448	struct ceph_inode_info *csrc = ceph_inode(inode: src);
449
450	spin_lock(lock: &cdst->i_ceph_lock);
451	if ((__ceph_caps_issued(ci: cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
452	!ceph_file_layout_is_valid(layout: &cdst->i_cached_layout)) {
453	memcpy(&cdst->i_cached_layout, &csrc->i_layout,
454	sizeof(cdst->i_cached_layout));
455	rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
456	ceph_try_get_string(csrc->i_layout.pool_ns));
457	}
458	spin_unlock(lock: &cdst->i_ceph_lock);
459	}
460
461	/*
462	* Try to set up an async create. We need caps, a file layout, and inode number,
463	* and either a lease on the dentry or complete dir info. If any of those
464	* criteria are not satisfied, then return false and the caller can go
465	* synchronous.
466	*/
467	static int try_prep_async_create(struct inode dir, struct* dentry *dentry,
468	struct ceph_file_layout lo, u64 pino)
469	{
470	struct ceph_inode_info *ci = ceph_inode(inode: dir);
471	struct ceph_dentry_info *di = ceph_dentry(dentry);
472	int got = `0`, want = CEPH_CAP_FILE_EXCL \| CEPH_CAP_DIR_CREATE;
473	u64 ino;
474
475	spin_lock(lock: &ci->i_ceph_lock);
476	/ No auth cap means no chance for Dc caps /
477	if (!ci->i_auth_cap)
478	goto no_async;
479
480	/ Any delegated inos? /
481	if (xa_empty(xa: &ci->i_auth_cap->session->s_delegated_inos))
482	goto no_async;
483
484	if (!ceph_file_layout_is_valid(layout: &ci->i_cached_layout))
485	goto no_async;
486
487	if ((__ceph_caps_issued(ci, NULL) & want) != want)
488	goto no_async;
489
490	if (d_in_lookup(dentry)) {
491	if (!__ceph_dir_is_complete(ci))
492	goto no_async;
493	spin_lock(lock: &dentry->d_lock);
494	di->lease_shared_gen = atomic_read(v: &ci->i_shared_gen);
495	spin_unlock(lock: &dentry->d_lock);
496	} else if (atomic_read(v: &ci->i_shared_gen) !=
497	READ_ONCE(di->lease_shared_gen)) {
498	goto no_async;
499	}
500
501	ino = ceph_get_deleg_ino(session: ci->i_auth_cap->session);
502	if (!ino)
503	goto no_async;
504
505	*pino = ino;
506	ceph_take_cap_refs(ci, caps: want, snap_rwsem_locked: false);
507	memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
508	rcu_assign_pointer(lo->pool_ns,
509	ceph_try_get_string(ci->i_cached_layout.pool_ns));
510	got = want;
511	no_async:
512	spin_unlock(lock: &ci->i_ceph_lock);
513	return got;
514	}
515
516	static void restore_deleg_ino(struct inode *dir, u64 ino)
517	{
518	struct ceph_inode_info *ci = ceph_inode(inode: dir);
519	struct ceph_mds_session *s = NULL;
520
521	spin_lock(lock: &ci->i_ceph_lock);
522	if (ci->i_auth_cap)
523	s = ceph_get_mds_session(s: ci->i_auth_cap->session);
524	spin_unlock(lock: &ci->i_ceph_lock);
525	if (s) {
526	int err = ceph_restore_deleg_ino(session: s, ino);
527	if (err)
528	pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
529	ino, err);
530	ceph_put_mds_session(s);
531	}
532	}
533
534	static void wake_async_create_waiters(struct inode *inode,
535	struct ceph_mds_session *session)
536	{
537	struct ceph_inode_info *ci = ceph_inode(inode);
538	bool check_cap = false;
539
540	spin_lock(lock: &ci->i_ceph_lock);
541	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
542	ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
543	wake_up_bit(word: &ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
544
545	if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
546	ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
547	check_cap = true;
548	}
549	}
550	ceph_kick_flushing_inode_caps(session, ci);
551	spin_unlock(lock: &ci->i_ceph_lock);
552
553	if (check_cap)
554	ceph_check_caps(ci, CHECK_CAPS_FLUSH);
555	}
556
557	static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
558	struct ceph_mds_request *req)
559	{
560	struct dentry *dentry = req->r_dentry;
561	struct inode *dinode = d_inode(dentry);
562	struct inode *tinode = req->r_target_inode;
563	int result = req->r_err ? req->r_err :
564	le32_to_cpu(req->r_reply_info.head->result);
565
566	WARN_ON_ONCE(dinode && tinode && dinode != tinode);
567
568	/ MDS changed -- caller must resubmit /
569	if (result == -EJUKEBOX)
570	goto out;
571
572	mapping_set_error(mapping: req->r_parent->i_mapping, error: result);
573
574	if (result) {
575	int pathlen = `0`;
576	u64 base = `0`;
577	char *path = ceph_mdsc_build_path(dentry: req->r_dentry, plen: &pathlen,
578	base: &base, for_wire: `0`);
579
580	pr_warn("async create failure path=(%llx)%s result=%d!\n",
581	base, IS_ERR(path) ? "<<bad>>" : path, result);
582	ceph_mdsc_free_path(path, len: pathlen);
583
584	ceph_dir_clear_complete(inode: req->r_parent);
585	if (!d_unhashed(dentry))
586	d_drop(dentry);
587
588	if (dinode) {
589	mapping_set_error(mapping: dinode->i_mapping, error: result);
590	ceph_inode_shutdown(inode: dinode);
591	wake_async_create_waiters(inode: dinode, session: req->r_session);
592	}
593	}
594
595	if (tinode) {
596	u64 ino = ceph_vino(inode: tinode).ino;
597
598	if (req->r_deleg_ino != ino)
599	pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
600	__func__, req->r_err, req->r_deleg_ino, ino);
601
602	mapping_set_error(mapping: tinode->i_mapping, error: result);
603	wake_async_create_waiters(inode: tinode, session: req->r_session);
604	} else if (!result) {
605	pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
606	req->r_deleg_ino);
607	}
608	out:
609	ceph_mdsc_release_dir_caps(req);
610	}
611
612	static int ceph_finish_async_create(struct inode dir, struct* inode *inode,
613	struct dentry *dentry,
614	struct file *file, umode_t mode,
615	struct ceph_mds_request *req,
616	struct ceph_acl_sec_ctx *as_ctx,
617	struct ceph_file_layout *lo)
618	{
619	int ret;
620	char xattr_buf[`4`];
621	struct ceph_mds_reply_inode in = { };
622	struct ceph_mds_reply_info_in iinfo = { .in = &in };
623	struct ceph_inode_info *ci = ceph_inode(inode: dir);
624	struct ceph_dentry_info *di = ceph_dentry(dentry);
625	struct timespec64 now;
626	struct ceph_string *pool_ns;
627	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
628	struct ceph_vino vino = { .ino = req->r_deleg_ino,
629	.snap = CEPH_NOSNAP };
630
631	ktime_get_real_ts64(tv: &now);
632
633	iinfo.inline_version = CEPH_INLINE_NONE;
634	iinfo.change_attr = `1`;
635	ceph_encode_timespec64(tv: &iinfo.btime, ts: &now);
636
637	if (req->r_pagelist) {
638	iinfo.xattr_len = req->r_pagelist->length;
639	iinfo.xattr_data = req->r_pagelist->mapped_tail;
640	} else {
641	/ fake it /
642	iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
643	iinfo.xattr_data = xattr_buf;
644	memset(iinfo.xattr_data, `0`, iinfo.xattr_len);
645	}
646
647	in.ino = cpu_to_le64(vino.ino);
648	in.snapid = cpu_to_le64(CEPH_NOSNAP);
649	in.version = cpu_to_le64(`1`); // ???
650	in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
651	in.cap.cap_id = cpu_to_le64(`1`);
652	in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
653	in.cap.flags = CEPH_CAP_FLAG_AUTH;
654	in.ctime = in.mtime = in.atime = iinfo.btime;
655	in.truncate_seq = cpu_to_le32(`1`);
656	in.truncate_size = cpu_to_le64(-`1ULL`);
657	in.xattr_version = cpu_to_le64(`1`);
658	in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
659	if (dir->i_mode & S_ISGID) {
660	in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid));
661
662	/ Directories always inherit the setgid bit. /
663	if (S_ISDIR(mode))
664	mode \|= S_ISGID;
665	} else {
666	in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
667	}
668	in.mode = cpu_to_le32((u32)mode);
669
670	in.nlink = cpu_to_le32(`1`);
671	in.max_size = cpu_to_le64(lo->stripe_unit);
672
673	ceph_file_layout_to_legacy(fl: lo, legacy: &in.layout);
674	/ lo is private, so pool_ns can't change /
675	pool_ns = rcu_dereference_raw(lo->pool_ns);
676	if (pool_ns) {
677	iinfo.pool_ns_len = pool_ns->len;
678	iinfo.pool_ns_data = pool_ns->str;
679	}
680
681	down_read(sem: &mdsc->snap_rwsem);
682	ret = ceph_fill_inode(inode, NULL, iinfo: &iinfo, NULL, session: req->r_session,
683	cap_fmode: req->r_fmode, NULL);
684	up_read(sem: &mdsc->snap_rwsem);
685	if (ret) {
686	dout("%s failed to fill inode: %d\n", __func__, ret);
687	ceph_dir_clear_complete(inode: dir);
688	if (!d_unhashed(dentry))
689	d_drop(dentry);
690	discard_new_inode(inode);
691	} else {
692	struct dentry *dn;
693
694	dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
695	vino.ino, ceph_ino(dir), dentry->d_name.name);
696	ceph_dir_clear_ordered(inode: dir);
697	ceph_init_inode_acls(inode, as_ctx);
698	if (inode->i_state & I_NEW) {
699	/*
700	* If it's not I_NEW, then someone created this before
701	* we got here. Assume the server is aware of it at
702	* that point and don't worry about setting
703	* CEPH_I_ASYNC_CREATE.
704	*/
705	ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
706	unlock_new_inode(inode);
707	}
708	if (d_in_lookup(dentry) \|\| d_really_is_negative(dentry)) {
709	if (!d_unhashed(dentry))
710	d_drop(dentry);
711	dn = d_splice_alias(inode, dentry);
712	WARN_ON_ONCE(dn && dn != dentry);
713	}
714	file->f_mode \|= FMODE_CREATED;
715	ret = finish_open(file, dentry, open: ceph_open);
716	}
717
718	spin_lock(lock: &dentry->d_lock);
719	di->flags &= ~CEPH_DENTRY_ASYNC_CREATE;
720	wake_up_bit(word: &di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT);
721	spin_unlock(lock: &dentry->d_lock);
722
723	return ret;
724	}
725
726	/*
727	* Do a lookup + open with a single request. If we get a non-existent
728	* file or symlink, return 1 so the VFS can retry.
729	*/
730	int ceph_atomic_open(struct inode dir, struct* dentry *dentry,
731	struct file file, unsigned* flags, umode_t mode)
732	{
733	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dir->i_sb);
734	struct ceph_mds_client *mdsc = fsc->mdsc;
735	struct ceph_mds_request *req;
736	struct inode *new_inode = NULL;
737	struct dentry *dn;
738	struct ceph_acl_sec_ctx as_ctx = {};
739	bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
740	int mask;
741	int err;
742
743	dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
744	dir, dentry, dentry,
745	d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
746
747	if (dentry->d_name.len > NAME_MAX)
748	return -ENAMETOOLONG;
749
750	err = ceph_wait_on_conflict_unlink(dentry);
751	if (err)
752	return err;
753	/*
754	* Do not truncate the file, since atomic_open is called before the
755	* permission check. The caller will do the truncation afterward.
756	*/
757	flags &= ~O_TRUNC;
758
759	retry:
760	if (flags & O_CREAT) {
761	if (ceph_quota_is_max_files_exceeded(inode: dir))
762	return -EDQUOT;
763
764	new_inode = ceph_new_inode(dir, dentry, mode: &mode, as_ctx: &as_ctx);
765	if (IS_ERR(ptr: new_inode)) {
766	err = PTR_ERR(ptr: new_inode);
767	goto out_ctx;
768	}
769	/ Async create can't handle more than a page of xattrs /
770	if (as_ctx.pagelist &&
771	!list_is_singular(head: &as_ctx.pagelist->head))
772	try_async = false;
773	} else if (!d_in_lookup(dentry)) {
774	/ If it's not being looked up, it's negative /
775	return -ENOENT;
776	}
777
778	/ do the open /
779	req = prepare_open_request(sb: dir->i_sb, flags, create_mode: mode);
780	if (IS_ERR(ptr: req)) {
781	err = PTR_ERR(ptr: req);
782	goto out_ctx;
783	}
784	req->r_dentry = dget(dentry);
785	req->r_num_caps = `2`;
786	mask = CEPH_STAT_CAP_INODE \| CEPH_CAP_AUTH_SHARED;
787	if (ceph_security_xattr_wanted(in: dir))
788	mask \|= CEPH_CAP_XATTR_SHARED;
789	req->r_args.open.mask = cpu_to_le32(mask);
790	req->r_parent = dir;
791	ihold(inode: dir);
792	if (IS_ENCRYPTED(dir)) {
793	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
794	err = fscrypt_prepare_lookup_partial(dir, dentry);
795	if (err < `0`)
796	goto out_req;
797	}
798
799	if (flags & O_CREAT) {
800	struct ceph_file_layout lo;
801
802	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_AUTH_EXCL \|
803	CEPH_CAP_XATTR_EXCL;
804	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
805
806	ceph_as_ctx_to_req(req, as_ctx: &as_ctx);
807
808	if (try_async && (req->r_dir_caps =
809	try_prep_async_create(dir, dentry, lo: &lo,
810	pino: &req->r_deleg_ino))) {
811	struct ceph_vino vino = { .ino = req->r_deleg_ino,
812	.snap = CEPH_NOSNAP };
813	struct ceph_dentry_info *di = ceph_dentry(dentry);
814
815	set_bit(CEPH_MDS_R_ASYNC, addr: &req->r_req_flags);
816	req->r_args.open.flags \|= cpu_to_le32(CEPH_O_EXCL);
817	req->r_callback = ceph_async_create_cb;
818
819	/ Hash inode before RPC /
820	new_inode = ceph_get_inode(sb: dir->i_sb, vino, newino: new_inode);
821	if (IS_ERR(ptr: new_inode)) {
822	err = PTR_ERR(ptr: new_inode);
823	new_inode = NULL;
824	goto out_req;
825	}
826	WARN_ON_ONCE(!(new_inode->i_state & I_NEW));
827
828	spin_lock(lock: &dentry->d_lock);
829	di->flags \|= CEPH_DENTRY_ASYNC_CREATE;
830	spin_unlock(lock: &dentry->d_lock);
831
832	err = ceph_mdsc_submit_request(mdsc, dir, req);
833	if (!err) {
834	err = ceph_finish_async_create(dir, inode: new_inode,
835	dentry, file,
836	mode, req,
837	as_ctx: &as_ctx, lo: &lo);
838	new_inode = NULL;
839	} else if (err == -EJUKEBOX) {
840	restore_deleg_ino(dir, ino: req->r_deleg_ino);
841	ceph_mdsc_put_request(req);
842	discard_new_inode(new_inode);
843	ceph_release_acl_sec_ctx(as_ctx: &as_ctx);
844	memset(&as_ctx, `0`, sizeof(as_ctx));
845	new_inode = NULL;
846	try_async = false;
847	ceph_put_string(rcu_dereference_raw(lo.pool_ns));
848	goto retry;
849	}
850	ceph_put_string(rcu_dereference_raw(lo.pool_ns));
851	goto out_req;
852	}
853	}
854
855	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
856	req->r_new_inode = new_inode;
857	new_inode = NULL;
858	err = ceph_mdsc_do_request(mdsc, dir: (flags & O_CREAT) ? dir : NULL, req);
859	if (err == -ENOENT) {
860	dentry = ceph_handle_snapdir(req, dentry);
861	if (IS_ERR(ptr: dentry)) {
862	err = PTR_ERR(ptr: dentry);
863	goto out_req;
864	}
865	err = `0`;
866	}
867
868	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
869	err = ceph_handle_notrace_create(dir, dentry);
870
871	if (d_in_lookup(dentry)) {
872	dn = ceph_finish_lookup(req, dentry, err);
873	if (IS_ERR(ptr: dn))
874	err = PTR_ERR(ptr: dn);
875	} else {
876	/ we were given a hashed negative dentry /
877	dn = NULL;
878	}
879	if (err)
880	goto out_req;
881	if (dn \|\| d_really_is_negative(dentry) \|\| d_is_symlink(dentry)) {
882	/ make vfs retry on splice, ENOENT, or symlink /
883	dout("atomic_open finish_no_open on dn %p\n", dn);
884	err = finish_no_open(file, dentry: dn);
885	} else {
886	if (IS_ENCRYPTED(dir) &&
887	!fscrypt_has_permitted_context(parent: dir, child: d_inode(dentry))) {
888	pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
889	ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
890	goto out_req;
891	}
892
893	dout("atomic_open finish_open on dn %p\n", dn);
894	if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
895	struct inode *newino = d_inode(dentry);
896
897	cache_file_layout(dst: dir, src: newino);
898	ceph_init_inode_acls(inode: newino, as_ctx: &as_ctx);
899	file->f_mode \|= FMODE_CREATED;
900	}
901	err = finish_open(file, dentry, open: ceph_open);
902	}
903	out_req:
904	ceph_mdsc_put_request(req);
905	iput(new_inode);
906	out_ctx:
907	ceph_release_acl_sec_ctx(as_ctx: &as_ctx);
908	dout("atomic_open result=%d\n", err);
909	return err;
910	}
911
912	int ceph_release(struct inode inode, struct* file *file)
913	{
914	struct ceph_inode_info *ci = ceph_inode(inode);
915
916	if (S_ISDIR(inode->i_mode)) {
917	struct ceph_dir_file_info *dfi = file->private_data;
918	dout("release inode %p dir file %p\n", inode, file);
919	WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
920
921	ceph_put_fmode(ci, mode: dfi->file_info.fmode, count: `1`);
922
923	if (dfi->last_readdir)
924	ceph_mdsc_put_request(req: dfi->last_readdir);
925	kfree(objp: dfi->last_name);
926	kfree(objp: dfi->dir_info);
927	kmem_cache_free(s: ceph_dir_file_cachep, objp: dfi);
928	} else {
929	struct ceph_file_info *fi = file->private_data;
930	dout("release inode %p regular file %p\n", inode, file);
931	WARN_ON(!list_empty(&fi->rw_contexts));
932
933	ceph_fscache_unuse_cookie(inode, update: file->f_mode & FMODE_WRITE);
934	ceph_put_fmode(ci, mode: fi->fmode, count: `1`);
935
936	kmem_cache_free(s: ceph_file_cachep, objp: fi);
937	}
938
939	/ wake up anyone waiting for caps on this inode /
940	wake_up_all(&ci->i_cap_wq);
941	return `0`;
942	}
943
/*
 * Small integer codes; presumably the values passed back through the
 * retry_op argument of the sync read path -- TODO confirm against users.
 */
enum {
	HAVE_RETRIED = 1,
	CHECK_EOF = 2,
	READ_INLINE = 3,
};
949
950	/*
951	* Completely synchronous read and write methods. Direct from __user
952	* buffer to osd, or directly to user pages (if O_DIRECT).
953	*
954	* If the read spans object boundary, just do multiple reads. (That's not
955	* atomic, but good enough for now.)
956	*
957	* If we get a short result from the OSD, check against i_size; we need to
958	* only return a short read to the caller if we hit EOF.
959	*/
960	ssize_t __ceph_sync_read(struct inode inode, loff_t ki_pos,
961	struct iov_iter to, int* *retry_op,
962	u64 *last_objver)
963	{
964	struct ceph_inode_info *ci = ceph_inode(inode);
965	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
966	struct ceph_osd_client *osdc = &fsc->client->osdc;
967	ssize_t ret;
968	u64 off = *ki_pos;
969	u64 len = iov_iter_count(i: to);
970	u64 i_size = i_size_read(inode);
971	bool sparse = IS_ENCRYPTED(inode) \|\| ceph_test_mount_opt(fsc, SPARSEREAD);
972	u64 objver = `0`;
973
974	dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
975
976	if (ceph_inode_is_shutdown(inode))
977	return -EIO;
978
979	if (!len)
980	return `0`;
981	/*
982	* flush any page cache pages in this range. this
983	* will make concurrent normal and sync io slow,
984	* but it will at least behave sensibly when they are
985	* in sequence.
986	*/
987	ret = filemap_write_and_wait_range(mapping: inode->i_mapping,
988	lstart: off, lend: off + len - `1`);
989	if (ret < `0`)
990	return ret;
991
992	ret = `0`;
993	while ((len = iov_iter_count(i: to)) > `0`) {
994	struct ceph_osd_request *req;
995	struct page **pages;
996	int num_pages;
997	size_t page_off;
998	bool more;
999	int idx;
1000	size_t left;
1001	struct ceph_osd_req_op *op;
1002	u64 read_off = off;
1003	u64 read_len = len;
1004
1005	/ determine new offset/length if encrypted /
1006	ceph_fscrypt_adjust_off_and_len(inode, off: &read_off, len: &read_len);
1007
1008	dout("sync_read orig %llu~%llu reading %llu~%llu",
1009	off, len, read_off, read_len);
1010
1011	req = ceph_osdc_new_request(osdc, layout: &ci->i_layout,
1012	vino: ci->i_vino, offset: read_off, len: &read_len, which: `0`, num_ops: `1`,
1013	opcode: sparse ? CEPH_OSD_OP_SPARSE_READ :
1014	CEPH_OSD_OP_READ,
1015	flags: CEPH_OSD_FLAG_READ,
1016	NULL, truncate_seq: ci->i_truncate_seq,
1017	truncate_size: ci->i_truncate_size, use_mempool: false);
1018	if (IS_ERR(ptr: req)) {
1019	ret = PTR_ERR(ptr: req);
1020	break;
1021	}
1022
1023	/ adjust len downward if the request truncated the len /
1024	if (off + len > read_off + read_len)
1025	len = read_off + read_len - off;
1026	more = len < iov_iter_count(i: to);
1027
1028	num_pages = calc_pages_for(off: read_off, len: read_len);
1029	page_off = offset_in_page(off);
1030	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1031	if (IS_ERR(ptr: pages)) {
1032	ceph_osdc_put_request(req);
1033	ret = PTR_ERR(ptr: pages);
1034	break;
1035	}
1036
1037	osd_req_op_extent_osd_data_pages(req, which: `0`, pages, length: read_len,
1038	offset_in_page(read_off),
1039	pages_from_pool: false, own_pages: false);
1040
1041	op = &req->r_ops[`0`];
1042	if (sparse) {
1043	ret = ceph_alloc_sparse_ext_map(op);
1044	if (ret) {
1045	ceph_osdc_put_request(req);
1046	break;
1047	}
1048	}
1049
1050	ceph_osdc_start_request(osdc, req);
1051	ret = ceph_osdc_wait_request(osdc, req);
1052
1053	ceph_update_read_metrics(m: &fsc->mdsc->metric,
1054	r_start: req->r_start_latency,
1055	r_end: req->r_end_latency,
1056	size: read_len, rc: ret);
1057
1058	if (ret > `0`)
1059	objver = req->r_version;
1060
1061	i_size = i_size_read(inode);
1062	dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
1063	off, len, ret, i_size, (more ? " MORE" : ""));
1064
1065	/ Fix it to go to end of extent map /
1066	if (sparse && ret >= `0`)
1067	ret = ceph_sparse_ext_map_end(op);
1068	else if (ret == -ENOENT)
1069	ret = `0`;
1070
1071	if (ret > `0` && IS_ENCRYPTED(inode)) {
1072	int fret;
1073
1074	fret = ceph_fscrypt_decrypt_extents(inode, page: pages,
1075	off: read_off, map: op->extent.sparse_ext,
1076	ext_cnt: op->extent.sparse_ext_cnt);
1077	if (fret < `0`) {
1078	ret = fret;
1079	ceph_osdc_put_request(req);
1080	break;
1081	}
1082
1083	/ account for any partial block at the beginning /
1084	fret -= (off - read_off);
1085
1086	/*
1087	* Short read after big offset adjustment?
1088	* Nothing is usable, just call it a zero
1089	* len read.
1090	*/
1091	fret = max(fret, `0`);
1092
1093	/ account for partial block at the end /
1094	ret = min_t(ssize_t, fret, len);
1095	}
1096
1097	ceph_osdc_put_request(req);
1098
1099	/ Short read but not EOF? Zero out the remainder. /
1100	if (ret >= `0` && ret < len && (off + ret < i_size)) {
1101	int zlen = min(len - ret, i_size - off - ret);
1102	int zoff = page_off + ret;
1103
1104	dout("sync_read zero gap %llu~%llu\n",
1105	off + ret, off + ret + zlen);
1106	ceph_zero_page_vector_range(off: zoff, len: zlen, pages);
1107	ret += zlen;
1108	}
1109
1110	idx = `0`;
1111	left = ret > `0` ? ret : `0`;
1112	while (left > `0`) {
1113	size_t plen, copied;
1114
1115	plen = min_t(size_t, left, PAGE_SIZE - page_off);
1116	SetPageUptodate(pages[idx]);
1117	copied = copy_page_to_iter(page: pages[idx++],
1118	offset: page_off, bytes: plen, i: to);
1119	off += copied;
1120	left -= copied;
1121	page_off = `0`;
1122	if (copied < plen) {
1123	ret = -EFAULT;
1124	break;
1125	}
1126	}
1127	ceph_release_page_vector(pages, num_pages);
1128
1129	if (ret < `0`) {
1130	if (ret == -EBLOCKLISTED)
1131	fsc->blocklisted = true;
1132	break;
1133	}
1134
1135	if (off >= i_size \|\| !more)
1136	break;
1137	}
1138
1139	if (ret > `0`) {
1140	if (off > *ki_pos) {
1141	if (off >= i_size) {
1142	*retry_op = CHECK_EOF;
1143	ret = i_size - *ki_pos;
1144	*ki_pos = i_size;
1145	} else {
1146	ret = off - *ki_pos;
1147	*ki_pos = off;
1148	}
1149	}
1150
1151	if (last_objver)
1152	*last_objver = objver;
1153	}
1154	dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
1155	return ret;
1156	}
1157
1158	static ssize_t ceph_sync_read(struct kiocb iocb, struct* iov_iter *to,
1159	int *retry_op)
1160	{
1161	struct file *file = iocb->ki_filp;
1162	struct inode *inode = file_inode(f: file);
1163
1164	dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos,
1165	iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
1166
1167	return __ceph_sync_read(inode, ki_pos: &iocb->ki_pos, to, retry_op, NULL);
1168	}
1169
1170	struct ceph_aio_request {
1171	struct kiocb *iocb;
1172	size_t total_len;
1173	bool write;
1174	bool should_dirty;
1175	int error;
1176	struct list_head osd_reqs;
1177	unsigned num_reqs;
1178	atomic_t pending_reqs;
1179	struct timespec64 mtime;
1180	struct ceph_cap_flush *prealloc_cf;
1181	};
1182
1183	struct ceph_aio_work {
1184	struct work_struct work;
1185	struct ceph_osd_request *req;
1186	};
1187
1188	static void ceph_aio_retry_work(struct work_struct *work);
1189
1190	static void ceph_aio_complete(struct inode *inode,
1191	struct ceph_aio_request *aio_req)
1192	{
1193	struct ceph_inode_info *ci = ceph_inode(inode);
1194	int ret;
1195
1196	if (!atomic_dec_and_test(v: &aio_req->pending_reqs))
1197	return;
1198
1199	if (aio_req->iocb->ki_flags & IOCB_DIRECT)
1200	inode_dio_end(inode);
1201
1202	ret = aio_req->error;
1203	if (!ret)
1204	ret = aio_req->total_len;
1205
1206	dout("ceph_aio_complete %p rc %d\n", inode, ret);
1207
1208	if (ret >= `0` && aio_req->write) {
1209	int dirty;
1210
1211	loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
1212	if (endoff > i_size_read(inode)) {
1213	if (ceph_inode_set_size(inode, size: endoff))
1214	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
1215	}
1216
1217	spin_lock(lock: &ci->i_ceph_lock);
1218	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
1219	pcf: &aio_req->prealloc_cf);
1220	spin_unlock(lock: &ci->i_ceph_lock);
1221	if (dirty)
1222	__mark_inode_dirty(inode, dirty);
1223
1224	}
1225
1226	ceph_put_cap_refs(ci, had: (aio_req->write ? CEPH_CAP_FILE_WR :
1227	CEPH_CAP_FILE_RD));
1228
1229	aio_req->iocb->ki_complete(aio_req->iocb, ret);
1230
1231	ceph_free_cap_flush(cf: aio_req->prealloc_cf);
1232	kfree(objp: aio_req);
1233	}
1234
1235	static void ceph_aio_complete_req(struct ceph_osd_request *req)
1236	{
1237	int rc = req->r_result;
1238	struct inode *inode = req->r_inode;
1239	struct ceph_aio_request *aio_req = req->r_priv;
1240	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(osd_req: req, which: `0`);
1241	struct ceph_osd_req_op *op = &req->r_ops[`0`];
1242	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(sb: inode->i_sb)->metric;
1243	unsigned int len = osd_data->bvec_pos.iter.bi_size;
1244	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
1245
1246	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
1247	BUG_ON(!osd_data->num_bvecs);
1248
1249	dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len);
1250
1251	if (rc == -EOLDSNAPC) {
1252	struct ceph_aio_work *aio_work;
1253	BUG_ON(!aio_req->write);
1254
1255	aio_work = kmalloc(size: sizeof(*aio_work), GFP_NOFS);
1256	if (aio_work) {
1257	INIT_WORK(&aio_work->work, ceph_aio_retry_work);
1258	aio_work->req = req;
1259	queue_work(wq: ceph_inode_to_client(inode)->inode_wq,
1260	work: &aio_work->work);
1261	return;
1262	}
1263	rc = -ENOMEM;
1264	} else if (!aio_req->write) {
1265	if (sparse && rc >= `0`)
1266	rc = ceph_sparse_ext_map_end(op);
1267	if (rc == -ENOENT)
1268	rc = `0`;
1269	if (rc >= `0` && len > rc) {
1270	struct iov_iter i;
1271	int zlen = len - rc;
1272
1273	/*
1274	* If read is satisfied by single OSD request,
1275	* it can pass EOF. Otherwise read is within
1276	* i_size.
1277	*/
1278	if (aio_req->num_reqs == `1`) {
1279	loff_t i_size = i_size_read(inode);
1280	loff_t endoff = aio_req->iocb->ki_pos + rc;
1281	if (endoff < i_size)
1282	zlen = min_t(size_t, zlen,
1283	i_size - endoff);
1284	aio_req->total_len = rc + zlen;
1285	}
1286
1287	iov_iter_bvec(i: &i, ITER_DEST, bvec: osd_data->bvec_pos.bvecs,
1288	nr_segs: osd_data->num_bvecs, count: len);
1289	iov_iter_advance(i: &i, bytes: rc);
1290	iov_iter_zero(bytes: zlen, &i);
1291	}
1292	}
1293
1294	/ r_start_latency == 0 means the request was not submitted /
1295	if (req->r_start_latency) {
1296	if (aio_req->write)
1297	ceph_update_write_metrics(m: metric, r_start: req->r_start_latency,
1298	r_end: req->r_end_latency, size: len, rc);
1299	else
1300	ceph_update_read_metrics(m: metric, r_start: req->r_start_latency,
1301	r_end: req->r_end_latency, size: len, rc);
1302	}
1303
1304	put_bvecs(bvecs: osd_data->bvec_pos.bvecs, num_bvecs: osd_data->num_bvecs,
1305	should_dirty: aio_req->should_dirty);
1306	ceph_osdc_put_request(req);
1307
1308	if (rc < `0`)
1309	cmpxchg(&aio_req->error, `0`, rc);
1310
1311	ceph_aio_complete(inode, aio_req);
1312	return;
1313	}
1314
1315	static void ceph_aio_retry_work(struct work_struct *work)
1316	{
1317	struct ceph_aio_work *aio_work =
1318	container_of(work, struct ceph_aio_work, work);
1319	struct ceph_osd_request *orig_req = aio_work->req;
1320	struct ceph_aio_request *aio_req = orig_req->r_priv;
1321	struct inode *inode = orig_req->r_inode;
1322	struct ceph_inode_info *ci = ceph_inode(inode);
1323	struct ceph_snap_context *snapc;
1324	struct ceph_osd_request *req;
1325	int ret;
1326
1327	spin_lock(lock: &ci->i_ceph_lock);
1328	if (__ceph_have_pending_cap_snap(ci)) {
1329	struct ceph_cap_snap *capsnap =
1330	list_last_entry(&ci->i_cap_snaps,
1331	struct ceph_cap_snap,
1332	ci_item);
1333	snapc = ceph_get_snap_context(sc: capsnap->context);
1334	} else {
1335	BUG_ON(!ci->i_head_snapc);
1336	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
1337	}
1338	spin_unlock(lock: &ci->i_ceph_lock);
1339
1340	req = ceph_osdc_alloc_request(osdc: orig_req->r_osdc, snapc, num_ops: `1`,
1341	use_mempool: false, GFP_NOFS);
1342	if (!req) {
1343	ret = -ENOMEM;
1344	req = orig_req;
1345	goto out;
1346	}
1347
1348	req->r_flags = / CEPH_OSD_FLAG_ORDERSNAP \| / CEPH_OSD_FLAG_WRITE;
1349	ceph_oloc_copy(dest: &req->r_base_oloc, src: &orig_req->r_base_oloc);
1350	ceph_oid_copy(dest: &req->r_base_oid, src: &orig_req->r_base_oid);
1351
1352	req->r_ops[`0`] = orig_req->r_ops[`0`];
1353
1354	req->r_mtime = aio_req->mtime;
1355	req->r_data_offset = req->r_ops[`0`].extent.offset;
1356
1357	ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
1358	if (ret) {
1359	ceph_osdc_put_request(req);
1360	req = orig_req;
1361	goto out;
1362	}
1363
1364	ceph_osdc_put_request(req: orig_req);
1365
1366	req->r_callback = ceph_aio_complete_req;
1367	req->r_inode = inode;
1368	req->r_priv = aio_req;
1369
1370	ceph_osdc_start_request(osdc: req->r_osdc, req);
1371	out:
1372	if (ret < `0`) {
1373	req->r_result = ret;
1374	ceph_aio_complete_req(req);
1375	}
1376
1377	ceph_put_snap_context(sc: snapc);
1378	kfree(objp: aio_work);
1379	}
1380
1381	static ssize_t
1382	ceph_direct_read_write(struct kiocb iocb, struct* iov_iter *iter,
1383	struct ceph_snap_context *snapc,
1384	struct ceph_cap_flush **pcf)
1385	{
1386	struct file *file = iocb->ki_filp;
1387	struct inode *inode = file_inode(f: file);
1388	struct ceph_inode_info *ci = ceph_inode(inode);
1389	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1390	struct ceph_client_metric *metric = &fsc->mdsc->metric;
1391	struct ceph_vino vino;
1392	struct ceph_osd_request *req;
1393	struct bio_vec *bvecs;
1394	struct ceph_aio_request *aio_req = NULL;
1395	int num_pages = `0`;
1396	int flags;
1397	int ret = `0`;
1398	struct timespec64 mtime = current_time(inode);
1399	size_t count = iov_iter_count(i: iter);
1400	loff_t pos = iocb->ki_pos;
1401	bool write = iov_iter_rw(i: iter) == WRITE;
1402	bool should_dirty = !write && user_backed_iter(i: iter);
1403	bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
1404
1405	if (write && ceph_snap(inode: file_inode(f: file)) != CEPH_NOSNAP)
1406	return -EROFS;
1407
1408	dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
1409	(write ? "write" : "read"), file, pos, (unsigned)count,
1410	snapc, snapc ? snapc->seq : `0`);
1411
1412	if (write) {
1413	int ret2;
1414
1415	ceph_fscache_invalidate(inode, dio_write: true);
1416
1417	ret2 = invalidate_inode_pages2_range(mapping: inode->i_mapping,
1418	start: pos >> PAGE_SHIFT,
1419	end: (pos + count - `1`) >> PAGE_SHIFT);
1420	if (ret2 < `0`)
1421	dout("invalidate_inode_pages2_range returned %d\n", ret2);
1422
1423	flags = / CEPH_OSD_FLAG_ORDERSNAP \| / CEPH_OSD_FLAG_WRITE;
1424	} else {
1425	flags = CEPH_OSD_FLAG_READ;
1426	}
1427
1428	while (iov_iter_count(i: iter) > `0`) {
1429	u64 size = iov_iter_count(i: iter);
1430	ssize_t len;
1431	struct ceph_osd_req_op *op;
1432	int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
1433
1434	if (write)
1435	size = min_t(u64, size, fsc->mount_options->wsize);
1436	else
1437	size = min_t(u64, size, fsc->mount_options->rsize);
1438
1439	vino = ceph_vino(inode);
1440	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
1441	vino, offset: pos, len: &size, which: `0`,
1442	num_ops: `1`,
1443	opcode: write ? CEPH_OSD_OP_WRITE : readop,
1444	flags, snapc,
1445	truncate_seq: ci->i_truncate_seq,
1446	truncate_size: ci->i_truncate_size,
1447	use_mempool: false);
1448	if (IS_ERR(ptr: req)) {
1449	ret = PTR_ERR(ptr: req);
1450	break;
1451	}
1452
1453	len = iter_get_bvecs_alloc(iter, maxsize: size, bvecs: &bvecs, num_bvecs: &num_pages);
1454	if (len < `0`) {
1455	ceph_osdc_put_request(req);
1456	ret = len;
1457	break;
1458	}
1459	if (len != size)
1460	osd_req_op_extent_update(osd_req: req, which: `0`, length: len);
1461
1462	/*
1463	* To simplify error handling, allow AIO when IO within i_size
1464	* or IO can be satisfied by single OSD request.
1465	*/
1466	if (pos == iocb->ki_pos && !is_sync_kiocb(kiocb: iocb) &&
1467	(len == count \|\| pos + count <= i_size_read(inode))) {
1468	aio_req = kzalloc(size: sizeof(*aio_req), GFP_KERNEL);
1469	if (aio_req) {
1470	aio_req->iocb = iocb;
1471	aio_req->write = write;
1472	aio_req->should_dirty = should_dirty;
1473	INIT_LIST_HEAD(list: &aio_req->osd_reqs);
1474	if (write) {
1475	aio_req->mtime = mtime;
1476	swap(aio_req->prealloc_cf, *pcf);
1477	}
1478	}
1479	/ ignore error /
1480	}
1481
1482	if (write) {
1483	/*
1484	* throw out any page cache pages in this range. this
1485	* may block.
1486	*/
1487	truncate_inode_pages_range(inode->i_mapping, lstart: pos,
1488	PAGE_ALIGN(pos + len) - `1`);
1489
1490	req->r_mtime = mtime;
1491	}
1492
1493	osd_req_op_extent_osd_data_bvecs(osd_req: req, which: `0`, bvecs, num_bvecs: num_pages, bytes: len);
1494	op = &req->r_ops[`0`];
1495	if (sparse) {
1496	ret = ceph_alloc_sparse_ext_map(op);
1497	if (ret) {
1498	ceph_osdc_put_request(req);
1499	break;
1500	}
1501	}
1502
1503	if (aio_req) {
1504	aio_req->total_len += len;
1505	aio_req->num_reqs++;
1506	atomic_inc(v: &aio_req->pending_reqs);
1507
1508	req->r_callback = ceph_aio_complete_req;
1509	req->r_inode = inode;
1510	req->r_priv = aio_req;
1511	list_add_tail(new: &req->r_private_item, head: &aio_req->osd_reqs);
1512
1513	pos += len;
1514	continue;
1515	}
1516
1517	ceph_osdc_start_request(osdc: req->r_osdc, req);
1518	ret = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
1519
1520	if (write)
1521	ceph_update_write_metrics(m: metric, r_start: req->r_start_latency,
1522	r_end: req->r_end_latency, size: len, rc: ret);
1523	else
1524	ceph_update_read_metrics(m: metric, r_start: req->r_start_latency,
1525	r_end: req->r_end_latency, size: len, rc: ret);
1526
1527	size = i_size_read(inode);
1528	if (!write) {
1529	if (sparse && ret >= `0`)
1530	ret = ceph_sparse_ext_map_end(op);
1531	else if (ret == -ENOENT)
1532	ret = `0`;
1533
1534	if (ret >= `0` && ret < len && pos + ret < size) {
1535	struct iov_iter i;
1536	int zlen = min_t(size_t, len - ret,
1537	size - pos - ret);
1538
1539	iov_iter_bvec(i: &i, ITER_DEST, bvec: bvecs, nr_segs: num_pages, count: len);
1540	iov_iter_advance(i: &i, bytes: ret);
1541	iov_iter_zero(bytes: zlen, &i);
1542	ret += zlen;
1543	}
1544	if (ret >= `0`)
1545	len = ret;
1546	}
1547
1548	put_bvecs(bvecs, num_bvecs: num_pages, should_dirty);
1549	ceph_osdc_put_request(req);
1550	if (ret < `0`)
1551	break;
1552
1553	pos += len;
1554	if (!write && pos >= size)
1555	break;
1556
1557	if (write && pos > size) {
1558	if (ceph_inode_set_size(inode, size: pos))
1559	ceph_check_caps(ci: ceph_inode(inode),
1560	CHECK_CAPS_AUTHONLY);
1561	}
1562	}
1563
1564	if (aio_req) {
1565	LIST_HEAD(osd_reqs);
1566
1567	if (aio_req->num_reqs == `0`) {
1568	kfree(objp: aio_req);
1569	return ret;
1570	}
1571
1572	ceph_get_cap_refs(ci, caps: write ? CEPH_CAP_FILE_WR :
1573	CEPH_CAP_FILE_RD);
1574
1575	list_splice(list: &aio_req->osd_reqs, head: &osd_reqs);
1576	inode_dio_begin(inode);
1577	while (!list_empty(head: &osd_reqs)) {
1578	req = list_first_entry(&osd_reqs,
1579	struct ceph_osd_request,
1580	r_private_item);
1581	list_del_init(entry: &req->r_private_item);
1582	if (ret >= `0`)
1583	ceph_osdc_start_request(osdc: req->r_osdc, req);
1584	if (ret < `0`) {
1585	req->r_result = ret;
1586	ceph_aio_complete_req(req);
1587	}
1588	}
1589	return -EIOCBQUEUED;
1590	}
1591
1592	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
1593	ret = pos - iocb->ki_pos;
1594	iocb->ki_pos = pos;
1595	}
1596	return ret;
1597	}
1598
1599	/*
1600	* Synchronous write, straight from __user pointer or user pages.
1601	*
1602	* If write spans object boundary, just do multiple writes. (For a
1603	* correct atomic write, we should e.g. take write locks on all
1604	* objects, rollback on failure, etc.)
1605	*/
1606	static ssize_t
1607	ceph_sync_write(struct kiocb iocb, struct* iov_iter *from, loff_t pos,
1608	struct ceph_snap_context *snapc)
1609	{
1610	struct file *file = iocb->ki_filp;
1611	struct inode *inode = file_inode(f: file);
1612	struct ceph_inode_info *ci = ceph_inode(inode);
1613	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1614	struct ceph_osd_client *osdc = &fsc->client->osdc;
1615	struct ceph_osd_request *req;
1616	struct page **pages;
1617	u64 len;
1618	int num_pages;
1619	int written = `0`;
1620	int ret;
1621	bool check_caps = false;
1622	struct timespec64 mtime = current_time(inode);
1623	size_t count = iov_iter_count(i: from);
1624
1625	if (ceph_snap(inode: file_inode(f: file)) != CEPH_NOSNAP)
1626	return -EROFS;
1627
1628	dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
1629	file, pos, (unsigned)count, snapc, snapc->seq);
1630
1631	ret = filemap_write_and_wait_range(mapping: inode->i_mapping,
1632	lstart: pos, lend: pos + count - `1`);
1633	if (ret < `0`)
1634	return ret;
1635
1636	ceph_fscache_invalidate(inode, dio_write: false);
1637
1638	while ((len = iov_iter_count(i: from)) > `0`) {
1639	size_t left;
1640	int n;
1641	u64 write_pos = pos;
1642	u64 write_len = len;
1643	u64 objnum, objoff;
1644	u32 xlen;
1645	u64 assert_ver = `0`;
1646	bool rmw;
1647	bool first, last;
1648	struct iov_iter saved_iter = *from;
1649	size_t off;
1650
1651	ceph_fscrypt_adjust_off_and_len(inode, off: &write_pos, len: &write_len);
1652
1653	/ clamp the length to the end of first object /
1654	ceph_calc_file_object_mapping(l: &ci->i_layout, off: write_pos,
1655	len: write_len, objno: &objnum, objoff: &objoff,
1656	xlen: &xlen);
1657	write_len = xlen;
1658
1659	/ adjust len downward if it goes beyond current object /
1660	if (pos + len > write_pos + write_len)
1661	len = write_pos + write_len - pos;
1662
1663	/*
1664	* If we had to adjust the length or position to align with a
1665	* crypto block, then we must do a read/modify/write cycle. We
1666	* use a version assertion to redrive the thing if something
1667	* changes in between.
1668	*/
1669	first = pos != write_pos;
1670	last = (pos + len) != (write_pos + write_len);
1671	rmw = first \|\| last;
1672
1673	dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
1674	ci->i_vino.ino, pos, len, write_pos, write_len,
1675	rmw ? "" : "no ");
1676
1677	/*
1678	* The data is emplaced into the page as it would be if it were
1679	* in an array of pagecache pages.
1680	*/
1681	num_pages = calc_pages_for(off: write_pos, len: write_len);
1682	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1683	if (IS_ERR(ptr: pages)) {
1684	ret = PTR_ERR(ptr: pages);
1685	break;
1686	}
1687
1688	/ Do we need to preload the pages? /
1689	if (rmw) {
1690	u64 first_pos = write_pos;
1691	u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
1692	u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
1693	struct ceph_osd_req_op *op;
1694
1695	/ We should only need to do this for encrypted inodes /
1696	WARN_ON_ONCE(!IS_ENCRYPTED(inode));
1697
1698	/ No need to do two reads if first and last blocks are same /
1699	if (first && last_pos == first_pos)
1700	last = false;
1701
1702	/*
1703	* Allocate a read request for one or two extents,
1704	* depending on how the request was aligned.
1705	*/
1706	req = ceph_osdc_new_request(osdc, layout: &ci->i_layout,
1707	vino: ci->i_vino, offset: first ? first_pos : last_pos,
1708	len: &read_len, which: `0`, num_ops: (first && last) ? `2` : `1`,
1709	opcode: CEPH_OSD_OP_SPARSE_READ, flags: CEPH_OSD_FLAG_READ,
1710	NULL, truncate_seq: ci->i_truncate_seq,
1711	truncate_size: ci->i_truncate_size, use_mempool: false);
1712	if (IS_ERR(ptr: req)) {
1713	ceph_release_page_vector(pages, num_pages);
1714	ret = PTR_ERR(ptr: req);
1715	break;
1716	}
1717
1718	/ Something is misaligned! /
1719	if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
1720	ceph_osdc_put_request(req);
1721	ceph_release_page_vector(pages, num_pages);
1722	ret = -EIO;
1723	break;
1724	}
1725
1726	/ Add extent for first block? /
1727	op = &req->r_ops[`0`];
1728
1729	if (first) {
1730	osd_req_op_extent_osd_data_pages(req, which: `0`, pages,
1731	CEPH_FSCRYPT_BLOCK_SIZE,
1732	offset_in_page(first_pos),
1733	pages_from_pool: false, own_pages: false);
1734	/ We only expect a single extent here /
1735	ret = __ceph_alloc_sparse_ext_map(op, cnt: `1`);
1736	if (ret) {
1737	ceph_osdc_put_request(req);
1738	ceph_release_page_vector(pages, num_pages);
1739	break;
1740	}
1741	}
1742
1743	/ Add extent for last block /
1744	if (last) {
1745	/ Init the other extent if first extent has been used /
1746	if (first) {
1747	op = &req->r_ops[`1`];
1748	osd_req_op_extent_init(osd_req: req, which: `1`,
1749	opcode: CEPH_OSD_OP_SPARSE_READ,
1750	offset: last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
1751	truncate_size: ci->i_truncate_size,
1752	truncate_seq: ci->i_truncate_seq);
1753	}
1754
1755	ret = __ceph_alloc_sparse_ext_map(op, cnt: `1`);
1756	if (ret) {
1757	ceph_osdc_put_request(req);
1758	ceph_release_page_vector(pages, num_pages);
1759	break;
1760	}
1761
1762	osd_req_op_extent_osd_data_pages(req, which: first ? `1` : `0`,
1763	pages: &pages[num_pages - `1`],
1764	CEPH_FSCRYPT_BLOCK_SIZE,
1765	offset_in_page(last_pos),
1766	pages_from_pool: false, own_pages: false);
1767	}
1768
1769	ceph_osdc_start_request(osdc, req);
1770	ret = ceph_osdc_wait_request(osdc, req);
1771
1772	/ FIXME: length field is wrong if there are 2 extents /
1773	ceph_update_read_metrics(m: &fsc->mdsc->metric,
1774	r_start: req->r_start_latency,
1775	r_end: req->r_end_latency,
1776	size: read_len, rc: ret);
1777
1778	/ Ok if object is not already present /
1779	if (ret == -ENOENT) {
1780	/*
1781	* If there is no object, then we can't assert
1782	* on its version. Set it to 0, and we'll use an
1783	* exclusive create instead.
1784	*/
1785	ceph_osdc_put_request(req);
1786	ret = `0`;
1787
1788	/*
1789	* zero out the soon-to-be uncopied parts of the
1790	* first and last pages.
1791	*/
1792	if (first)
1793	zero_user_segment(page: pages[`0`], start: `0`,
1794	offset_in_page(first_pos));
1795	if (last)
1796	zero_user_segment(page: pages[num_pages - `1`],
1797	offset_in_page(last_pos),
1798	PAGE_SIZE);
1799	} else {
1800	if (ret < `0`) {
1801	ceph_osdc_put_request(req);
1802	ceph_release_page_vector(pages, num_pages);
1803	break;
1804	}
1805
1806	op = &req->r_ops[`0`];
1807	if (op->extent.sparse_ext_cnt == `0`) {
1808	if (first)
1809	zero_user_segment(page: pages[`0`], start: `0`,
1810	offset_in_page(first_pos));
1811	else
1812	zero_user_segment(page: pages[num_pages - `1`],
1813	offset_in_page(last_pos),
1814	PAGE_SIZE);
1815	} else if (op->extent.sparse_ext_cnt != `1` \|\|
1816	ceph_sparse_ext_map_end(op) !=
1817	CEPH_FSCRYPT_BLOCK_SIZE) {
1818	ret = -EIO;
1819	ceph_osdc_put_request(req);
1820	ceph_release_page_vector(pages, num_pages);
1821	break;
1822	}
1823
1824	if (first && last) {
1825	op = &req->r_ops[`1`];
1826	if (op->extent.sparse_ext_cnt == `0`) {
1827	zero_user_segment(page: pages[num_pages - `1`],
1828	offset_in_page(last_pos),
1829	PAGE_SIZE);
1830	} else if (op->extent.sparse_ext_cnt != `1` \|\|
1831	ceph_sparse_ext_map_end(op) !=
1832	CEPH_FSCRYPT_BLOCK_SIZE) {
1833	ret = -EIO;
1834	ceph_osdc_put_request(req);
1835	ceph_release_page_vector(pages, num_pages);
1836	break;
1837	}
1838	}
1839
1840	/ Grab assert version. It must be non-zero. /
1841	assert_ver = req->r_version;
1842	WARN_ON_ONCE(ret > `0` && assert_ver == `0`);
1843
1844	ceph_osdc_put_request(req);
1845	if (first) {
1846	ret = ceph_fscrypt_decrypt_block_inplace(inode,
1847	page: pages[`0`], CEPH_FSCRYPT_BLOCK_SIZE,
1848	offset_in_page(first_pos),
1849	lblk_num: first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
1850	if (ret < `0`) {
1851	ceph_release_page_vector(pages, num_pages);
1852	break;
1853	}
1854	}
1855	if (last) {
1856	ret = ceph_fscrypt_decrypt_block_inplace(inode,
1857	page: pages[num_pages - `1`],
1858	CEPH_FSCRYPT_BLOCK_SIZE,
1859	offset_in_page(last_pos),
1860	lblk_num: last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
1861	if (ret < `0`) {
1862	ceph_release_page_vector(pages, num_pages);
1863	break;
1864	}
1865	}
1866	}
1867	}
1868
1869	left = len;
1870	off = offset_in_page(pos);
1871	for (n = `0`; n < num_pages; n++) {
1872	size_t plen = min_t(size_t, left, PAGE_SIZE - off);
1873
1874	/ copy the data /
1875	ret = copy_page_from_iter(page: pages[n], offset: off, bytes: plen, i: from);
1876	if (ret != plen) {
1877	ret = -EFAULT;
1878	break;
1879	}
1880	off = `0`;
1881	left -= ret;
1882	}
1883	if (ret < `0`) {
1884	dout("sync_write write failed with %d\n", ret);
1885	ceph_release_page_vector(pages, num_pages);
1886	break;
1887	}
1888
1889	if (IS_ENCRYPTED(inode)) {
1890	ret = ceph_fscrypt_encrypt_pages(inode, page: pages,
1891	off: write_pos, len: write_len,
1892	GFP_KERNEL);
1893	if (ret < `0`) {
1894	dout("encryption failed with %d\n", ret);
1895	ceph_release_page_vector(pages, num_pages);
1896	break;
1897	}
1898	}
1899
1900	req = ceph_osdc_new_request(osdc, layout: &ci->i_layout,
1901	vino: ci->i_vino, offset: write_pos, len: &write_len,
1902	which: rmw ? `1` : `0`, num_ops: rmw ? `2` : `1`,
1903	opcode: CEPH_OSD_OP_WRITE,
1904	flags: CEPH_OSD_FLAG_WRITE,
1905	snapc, truncate_seq: ci->i_truncate_seq,
1906	truncate_size: ci->i_truncate_size, use_mempool: false);
1907	if (IS_ERR(ptr: req)) {
1908	ret = PTR_ERR(ptr: req);
1909	ceph_release_page_vector(pages, num_pages);
1910	break;
1911	}
1912
1913	dout("sync_write write op %lld~%llu\n", write_pos, write_len);
1914	osd_req_op_extent_osd_data_pages(req, which: rmw ? `1` : `0`, pages, length: write_len,
1915	offset_in_page(write_pos), pages_from_pool: false,
1916	own_pages: true);
1917	req->r_inode = inode;
1918	req->r_mtime = mtime;
1919
1920	/ Set up the assertion /
1921	if (rmw) {
1922	/*
1923	* Set up the assertion. If we don't have a version
1924	* number, then the object doesn't exist yet. Use an
1925	* exclusive create instead of a version assertion in
1926	* that case.
1927	*/
1928	if (assert_ver) {
1929	osd_req_op_init(osd_req: req, which: `0`, opcode: CEPH_OSD_OP_ASSERT_VER, flags: `0`);
1930	req->r_ops[`0`].assert_ver.ver = assert_ver;
1931	} else {
1932	osd_req_op_init(osd_req: req, which: `0`, opcode: CEPH_OSD_OP_CREATE,
1933	flags: CEPH_OSD_OP_FLAG_EXCL);
1934	}
1935	}
1936
1937	ceph_osdc_start_request(osdc, req);
1938	ret = ceph_osdc_wait_request(osdc, req);
1939
1940	ceph_update_write_metrics(m: &fsc->mdsc->metric, r_start: req->r_start_latency,
1941	r_end: req->r_end_latency, size: len, rc: ret);
1942	ceph_osdc_put_request(req);
1943	if (ret != `0`) {
1944	dout("sync_write osd write returned %d\n", ret);
1945	/ Version changed! Must re-do the rmw cycle /
1946	if ((assert_ver && (ret == -ERANGE \|\| ret == -EOVERFLOW)) \|\|
1947	(!assert_ver && ret == -EEXIST)) {
1948	/ We should only ever see this on a rmw /
1949	WARN_ON_ONCE(!rmw);
1950
1951	/ The version should never go backward /
1952	WARN_ON_ONCE(ret == -EOVERFLOW);
1953
1954	*from = saved_iter;
1955
1956	/ FIXME: limit number of times we loop? /
1957	continue;
1958	}
1959	ceph_set_error_write(ci);
1960	break;
1961	}
1962
1963	ceph_clear_error_write(ci);
1964
1965	/*
1966	* We successfully wrote to a range of the file. Declare
1967	* that region of the pagecache invalid.
1968	*/
1969	ret = invalidate_inode_pages2_range(
1970	mapping: inode->i_mapping,
1971	start: pos >> PAGE_SHIFT,
1972	end: (pos + len - `1`) >> PAGE_SHIFT);
1973	if (ret < `0`) {
1974	dout("invalidate_inode_pages2_range returned %d\n",
1975	ret);
1976	ret = `0`;
1977	}
1978	pos += len;
1979	written += len;
1980	dout("sync_write written %d\n", written);
1981	if (pos > i_size_read(inode)) {
1982	check_caps = ceph_inode_set_size(inode, size: pos);
1983	if (check_caps)
1984	ceph_check_caps(ci: ceph_inode(inode),
1985	CHECK_CAPS_AUTHONLY);
1986	}
1987
1988	}
1989
1990	if (ret != -EOLDSNAPC && written > `0`) {
1991	ret = written;
1992	iocb->ki_pos = pos;
1993	}
1994	dout("sync_write returning %d\n", ret);
1995	return ret;
1996	}
1997
1998	/*
1999	* Wrap generic_file_aio_read with checks for cap bits on the inode.
2000	* Atomically grab references, so that those bits are not released
2001	* back to the MDS mid-read.
2002	*
2003	* Hmm, the sync read case isn't actually async... should it be?
2004	*/
2005	static ssize_t ceph_read_iter(struct kiocb iocb, struct* iov_iter *to)
2006	{
2007	struct file *filp = iocb->ki_filp;
2008	struct ceph_file_info *fi = filp->private_data;
2009	size_t len = iov_iter_count(i: to);
2010	struct inode *inode = file_inode(f: filp);
2011	struct ceph_inode_info *ci = ceph_inode(inode);
2012	bool direct_lock = iocb->ki_flags & IOCB_DIRECT;
2013	ssize_t ret;
2014	int want = `0`, got = `0`;
2015	int retry_op = `0`, read = `0`;
2016
2017	again:
2018	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
2019	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
2020
2021	if (ceph_inode_is_shutdown(inode))
2022	return -ESTALE;
2023
2024	if (direct_lock)
2025	ceph_start_io_direct(inode);
2026	else
2027	ceph_start_io_read(inode);
2028
2029	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
2030	want \|= CEPH_CAP_FILE_CACHE;
2031	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2032	want \|= CEPH_CAP_FILE_LAZYIO;
2033
2034	ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, endoff: -`1`, got: &got);
2035	if (ret < `0`) {
2036	if (direct_lock)
2037	ceph_end_io_direct(inode);
2038	else
2039	ceph_end_io_read(inode);
2040	return ret;
2041	}
2042
2043	if ((got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == `0` \|\|
2044	(iocb->ki_flags & IOCB_DIRECT) \|\|
2045	(fi->flags & CEPH_F_SYNC)) {
2046
2047	dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
2048	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
2049	ceph_cap_string(got));
2050
2051	if (!ceph_has_inline_data(ci)) {
2052	if (!retry_op &&
2053	(iocb->ki_flags & IOCB_DIRECT) &&
2054	!IS_ENCRYPTED(inode)) {
2055	ret = ceph_direct_read_write(iocb, iter: to,
2056	NULL, NULL);
2057	if (ret >= `0` && ret < len)
2058	retry_op = CHECK_EOF;
2059	} else {
2060	ret = ceph_sync_read(iocb, to, retry_op: &retry_op);
2061	}
2062	} else {
2063	retry_op = READ_INLINE;
2064	}
2065	} else {
2066	CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
2067	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
2068	inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
2069	ceph_cap_string(got));
2070	ceph_add_rw_context(cf: fi, ctx: &rw_ctx);
2071	ret = generic_file_read_iter(iocb, to);
2072	ceph_del_rw_context(cf: fi, ctx: &rw_ctx);
2073	}
2074
2075	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
2076	inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
2077	ceph_put_cap_refs(ci, had: got);
2078
2079	if (direct_lock)
2080	ceph_end_io_direct(inode);
2081	else
2082	ceph_end_io_read(inode);
2083
2084	if (retry_op > HAVE_RETRIED && ret >= `0`) {
2085	int statret;
2086	struct page *page = NULL;
2087	loff_t i_size;
2088	if (retry_op == READ_INLINE) {
2089	page = __page_cache_alloc(GFP_KERNEL);
2090	if (!page)
2091	return -ENOMEM;
2092	}
2093
2094	statret = __ceph_do_getattr(inode, locked_page: page,
2095	CEPH_STAT_CAP_INLINE_DATA, force: !!page);
2096	if (statret < `0`) {
2097	if (page)
2098	__free_page(page);
2099	if (statret == -ENODATA) {
2100	BUG_ON(retry_op != READ_INLINE);
2101	goto again;
2102	}
2103	return statret;
2104	}
2105
2106	i_size = i_size_read(inode);
2107	if (retry_op == READ_INLINE) {
2108	BUG_ON(ret > `0` \|\| read > `0`);
2109	if (iocb->ki_pos < i_size &&
2110	iocb->ki_pos < PAGE_SIZE) {
2111	loff_t end = min_t(loff_t, i_size,
2112	iocb->ki_pos + len);
2113	end = min_t(loff_t, end, PAGE_SIZE);
2114	if (statret < end)
2115	zero_user_segment(page, start: statret, end);
2116	ret = copy_page_to_iter(page,
2117	offset: iocb->ki_pos & ~PAGE_MASK,
2118	bytes: end - iocb->ki_pos, i: to);
2119	iocb->ki_pos += ret;
2120	read += ret;
2121	}
2122	if (iocb->ki_pos < i_size && read < len) {
2123	size_t zlen = min_t(size_t, len - read,
2124	i_size - iocb->ki_pos);
2125	ret = iov_iter_zero(bytes: zlen, to);
2126	iocb->ki_pos += ret;
2127	read += ret;
2128	}
2129	__free_pages(page, order: `0`);
2130	return read;
2131	}
2132
2133	/ hit EOF or hole? /
2134	if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
2135	ret < len) {
2136	dout("sync_read hit hole, ppos %lld < size %lld"
2137	", reading more\n", iocb->ki_pos, i_size);
2138
2139	read += ret;
2140	len -= ret;
2141	retry_op = HAVE_RETRIED;
2142	goto again;
2143	}
2144	}
2145
2146	if (ret >= `0`)
2147	ret += read;
2148
2149	return ret;
2150	}
2151
2152	/*
2153	* Wrap filemap_splice_read with checks for cap bits on the inode.
2154	* Atomically grab references, so that those bits are not released
2155	* back to the MDS mid-read.
2156	*/
2157	static ssize_t ceph_splice_read(struct file in, loff_t ppos,
2158	struct pipe_inode_info *pipe,
2159	size_t len, unsigned int flags)
2160	{
2161	struct ceph_file_info *fi = in->private_data;
2162	struct inode *inode = file_inode(f: in);
2163	struct ceph_inode_info *ci = ceph_inode(inode);
2164	ssize_t ret;
2165	int want = `0`, got = `0`;
2166	CEPH_DEFINE_RW_CONTEXT(rw_ctx, `0`);
2167
2168	dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n",
2169	inode, ceph_vinop(inode), *ppos, len, inode);
2170
2171	if (ceph_inode_is_shutdown(inode))
2172	return -ESTALE;
2173
2174	if (ceph_has_inline_data(ci) \|\|
2175	(fi->flags & CEPH_F_SYNC))
2176	return copy_splice_read(in, ppos, pipe, len, flags);
2177
2178	ceph_start_io_read(inode);
2179
2180	want = CEPH_CAP_FILE_CACHE;
2181	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2182	want \|= CEPH_CAP_FILE_LAZYIO;
2183
2184	ret = ceph_get_caps(filp: in, CEPH_CAP_FILE_RD, want, endoff: -`1`, got: &got);
2185	if (ret < `0`)
2186	goto out_end;
2187
2188	if ((got & (CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO)) == `0`) {
2189	dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n",
2190	inode, ceph_vinop(inode), *ppos, len,
2191	ceph_cap_string(got));
2192
2193	ceph_put_cap_refs(ci, had: got);
2194	ceph_end_io_read(inode);
2195	return copy_splice_read(in, ppos, pipe, len, flags);
2196	}
2197
2198	dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n",
2199	inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got));
2200
2201	rw_ctx.caps = got;
2202	ceph_add_rw_context(cf: fi, ctx: &rw_ctx);
2203	ret = filemap_splice_read(in, ppos, pipe, len, flags);
2204	ceph_del_rw_context(cf: fi, ctx: &rw_ctx);
2205
2206	dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n",
2207	inode, ceph_vinop(inode), ceph_cap_string(got), ret);
2208
2209	ceph_put_cap_refs(ci, had: got);
2210	out_end:
2211	ceph_end_io_read(inode);
2212	return ret;
2213	}
2214
2215	/*
2216	* Take cap references to avoid releasing caps to MDS mid-write.
2217	*
2218	* If we are synchronous, and write with an old snap context, the OSD
2219	* may return EOLDSNAPC. In that case, retry the write.. _after_
2220	* dropping our cap refs and allowing the pending snap to logically
2221	* complete _before_ this write occurs.
2222	*
2223	* If we are near ENOSPC, write synchronously.
2224	*/
2225	static ssize_t ceph_write_iter(struct kiocb iocb, struct* iov_iter *from)
2226	{
2227	struct file *file = iocb->ki_filp;
2228	struct ceph_file_info *fi = file->private_data;
2229	struct inode *inode = file_inode(f: file);
2230	struct ceph_inode_info *ci = ceph_inode(inode);
2231	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2232	struct ceph_osd_client *osdc = &fsc->client->osdc;
2233	struct ceph_cap_flush *prealloc_cf;
2234	ssize_t count, written = `0`;
2235	int err, want = `0`, got;
2236	bool direct_lock = false;
2237	u32 map_flags;
2238	u64 pool_flags;
2239	loff_t pos;
2240	loff_t limit = max(i_size_read(inode), fsc->max_file_size);
2241
2242	if (ceph_inode_is_shutdown(inode))
2243	return -ESTALE;
2244
2245	if (ceph_snap(inode) != CEPH_NOSNAP)
2246	return -EROFS;
2247
2248	prealloc_cf = ceph_alloc_cap_flush();
2249	if (!prealloc_cf)
2250	return -ENOMEM;
2251
2252	if ((iocb->ki_flags & (IOCB_DIRECT \| IOCB_APPEND)) == IOCB_DIRECT)
2253	direct_lock = true;
2254
2255	retry_snap:
2256	if (direct_lock)
2257	ceph_start_io_direct(inode);
2258	else
2259	ceph_start_io_write(inode);
2260
2261	if (iocb->ki_flags & IOCB_APPEND) {
2262	err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, force: false);
2263	if (err < `0`)
2264	goto out;
2265	}
2266
2267	err = generic_write_checks(iocb, from);
2268	if (err <= `0`)
2269	goto out;
2270
2271	pos = iocb->ki_pos;
2272	if (unlikely(pos >= limit)) {
2273	err = -EFBIG;
2274	goto out;
2275	} else {
2276	iov_iter_truncate(i: from, count: limit - pos);
2277	}
2278
2279	count = iov_iter_count(i: from);
2280	if (ceph_quota_is_max_bytes_exceeded(inode, newlen: pos + count)) {
2281	err = -EDQUOT;
2282	goto out;
2283	}
2284
2285	down_read(sem: &osdc->lock);
2286	map_flags = osdc->osdmap->flags;
2287	pool_flags = ceph_pg_pool_flags(map: osdc->osdmap, id: ci->i_layout.pool_id);
2288	up_read(sem: &osdc->lock);
2289	if ((map_flags & CEPH_OSDMAP_FULL) \|\|
2290	(pool_flags & CEPH_POOL_FLAG_FULL)) {
2291	err = -ENOSPC;
2292	goto out;
2293	}
2294
2295	err = file_remove_privs(file);
2296	if (err)
2297	goto out;
2298
2299	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
2300	inode, ceph_vinop(inode), pos, count, i_size_read(inode));
2301	if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
2302	want \|= CEPH_CAP_FILE_BUFFER;
2303	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2304	want \|= CEPH_CAP_FILE_LAZYIO;
2305	got = `0`;
2306	err = ceph_get_caps(filp: file, CEPH_CAP_FILE_WR, want, endoff: pos + count, got: &got);
2307	if (err < `0`)
2308	goto out;
2309
2310	err = file_update_time(file);
2311	if (err)
2312	goto out_caps;
2313
2314	inode_inc_iversion_raw(inode);
2315
2316	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
2317	inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
2318
2319	if ((got & (CEPH_CAP_FILE_BUFFER\|CEPH_CAP_FILE_LAZYIO)) == `0` \|\|
2320	(iocb->ki_flags & IOCB_DIRECT) \|\| (fi->flags & CEPH_F_SYNC) \|\|
2321	(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
2322	struct ceph_snap_context *snapc;
2323	struct iov_iter data;
2324
2325	spin_lock(lock: &ci->i_ceph_lock);
2326	if (__ceph_have_pending_cap_snap(ci)) {
2327	struct ceph_cap_snap *capsnap =
2328	list_last_entry(&ci->i_cap_snaps,
2329	struct ceph_cap_snap,
2330	ci_item);
2331	snapc = ceph_get_snap_context(sc: capsnap->context);
2332	} else {
2333	BUG_ON(!ci->i_head_snapc);
2334	snapc = ceph_get_snap_context(sc: ci->i_head_snapc);
2335	}
2336	spin_unlock(lock: &ci->i_ceph_lock);
2337
2338	/ we might need to revert back to that point /
2339	data = *from;
2340	if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode))
2341	written = ceph_direct_read_write(iocb, iter: &data, snapc,
2342	pcf: &prealloc_cf);
2343	else
2344	written = ceph_sync_write(iocb, from: &data, pos, snapc);
2345	if (direct_lock)
2346	ceph_end_io_direct(inode);
2347	else
2348	ceph_end_io_write(inode);
2349	if (written > `0`)
2350	iov_iter_advance(i: from, bytes: written);
2351	ceph_put_snap_context(sc: snapc);
2352	} else {
2353	/*
2354	* No need to acquire the i_truncate_mutex. Because
2355	* the MDS revokes Fwb caps before sending truncate
2356	* message to us. We can't get Fwb cap while there
2357	* are pending vmtruncate. So write and vmtruncate
2358	* can not run at the same time
2359	*/
2360	written = generic_perform_write(iocb, from);
2361	ceph_end_io_write(inode);
2362	}
2363
2364	if (written >= `0`) {
2365	int dirty;
2366
2367	spin_lock(lock: &ci->i_ceph_lock);
2368	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
2369	pcf: &prealloc_cf);
2370	spin_unlock(lock: &ci->i_ceph_lock);
2371	if (dirty)
2372	__mark_inode_dirty(inode, dirty);
2373	if (ceph_quota_is_max_bytes_approaching(inode, newlen: iocb->ki_pos))
2374	ceph_check_caps(ci, CHECK_CAPS_FLUSH);
2375	}
2376
2377	dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
2378	inode, ceph_vinop(inode), pos, (unsigned)count,
2379	ceph_cap_string(got));
2380	ceph_put_cap_refs(ci, had: got);
2381
2382	if (written == -EOLDSNAPC) {
2383	dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
2384	inode, ceph_vinop(inode), pos, (unsigned)count);
2385	goto retry_snap;
2386	}
2387
2388	if (written >= `0`) {
2389	if ((map_flags & CEPH_OSDMAP_NEARFULL) \|\|
2390	(pool_flags & CEPH_POOL_FLAG_NEARFULL))
2391	iocb->ki_flags \|= IOCB_DSYNC;
2392	written = generic_write_sync(iocb, count: written);
2393	}
2394
2395	goto out_unlocked;
2396	out_caps:
2397	ceph_put_cap_refs(ci, had: got);
2398	out:
2399	if (direct_lock)
2400	ceph_end_io_direct(inode);
2401	else
2402	ceph_end_io_write(inode);
2403	out_unlocked:
2404	ceph_free_cap_flush(cf: prealloc_cf);
2405	return written ? written : err;
2406	}
2407
2408	/*
2409	* llseek. be sure to verify file size on SEEK_END.
2410	*/
2411	static loff_t ceph_llseek(struct file file, loff_t offset, int* whence)
2412	{
2413	if (whence == SEEK_END \|\| whence == SEEK_DATA \|\| whence == SEEK_HOLE) {
2414	struct inode *inode = file_inode(f: file);
2415	int ret;
2416
2417	ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, force: false);
2418	if (ret < `0`)
2419	return ret;
2420	}
2421	return generic_file_llseek(file, offset, whence);
2422	}
2423
2424	static inline void ceph_zero_partial_page(
2425	struct inode inode, loff_t offset, unsigned* size)
2426	{
2427	struct page *page;
2428	pgoff_t index = offset >> PAGE_SHIFT;
2429
2430	page = find_lock_page(mapping: inode->i_mapping, index);
2431	if (page) {
2432	wait_on_page_writeback(page);
2433	zero_user(page, start: offset & (PAGE_SIZE - `1`), size);
2434	unlock_page(page);
2435	put_page(page);
2436	}
2437	}
2438
2439	static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
2440	loff_t length)
2441	{
2442	loff_t nearly = round_up(offset, PAGE_SIZE);
2443	if (offset < nearly) {
2444	loff_t size = nearly - offset;
2445	if (length < size)
2446	size = length;
2447	ceph_zero_partial_page(inode, offset, size);
2448	offset += size;
2449	length -= size;
2450	}
2451	if (length >= PAGE_SIZE) {
2452	loff_t size = round_down(length, PAGE_SIZE);
2453	truncate_pagecache_range(inode, offset, end: offset + size - `1`);
2454	offset += size;
2455	length -= size;
2456	}
2457	if (length)
2458	ceph_zero_partial_page(inode, offset, size: length);
2459	}
2460
2461	static int ceph_zero_partial_object(struct inode *inode,
2462	loff_t offset, loff_t *length)
2463	{
2464	struct ceph_inode_info *ci = ceph_inode(inode);
2465	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2466	struct ceph_osd_request *req;
2467	int ret = `0`;
2468	loff_t zero = `0`;
2469	int op;
2470
2471	if (ceph_inode_is_shutdown(inode))
2472	return -EIO;
2473
2474	if (!length) {
2475	op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE;
2476	length = &zero;
2477	} else {
2478	op = CEPH_OSD_OP_ZERO;
2479	}
2480
2481	req = ceph_osdc_new_request(&fsc->client->osdc, layout: &ci->i_layout,
2482	vino: ceph_vino(inode),
2483	offset, len: length,
2484	which: `0`, num_ops: `1`, opcode: op,
2485	flags: CEPH_OSD_FLAG_WRITE,
2486	NULL, truncate_seq: `0`, truncate_size: `0`, use_mempool: false);
2487	if (IS_ERR(ptr: req)) {
2488	ret = PTR_ERR(ptr: req);
2489	goto out;
2490	}
2491
2492	req->r_mtime = inode_get_mtime(inode);
2493	ceph_osdc_start_request(osdc: &fsc->client->osdc, req);
2494	ret = ceph_osdc_wait_request(osdc: &fsc->client->osdc, req);
2495	if (ret == -ENOENT)
2496	ret = `0`;
2497	ceph_osdc_put_request(req);
2498
2499	out:
2500	return ret;
2501	}
2502
2503	static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
2504	{
2505	int ret = `0`;
2506	struct ceph_inode_info *ci = ceph_inode(inode);
2507	s32 stripe_unit = ci->i_layout.stripe_unit;
2508	s32 stripe_count = ci->i_layout.stripe_count;
2509	s32 object_size = ci->i_layout.object_size;
2510	u64 object_set_size = object_size * stripe_count;
2511	u64 nearly, t;
2512
2513	/ round offset up to next period boundary /
2514	nearly = offset + object_set_size - `1`;
2515	t = nearly;
2516	nearly -= do_div(t, object_set_size);
2517
2518	while (length && offset < nearly) {
2519	loff_t size = length;
2520	ret = ceph_zero_partial_object(inode, offset, length: &size);
2521	if (ret < `0`)
2522	return ret;
2523	offset += size;
2524	length -= size;
2525	}
2526	while (length >= object_set_size) {
2527	int i;
2528	loff_t pos = offset;
2529	for (i = `0`; i < stripe_count; ++i) {
2530	ret = ceph_zero_partial_object(inode, offset: pos, NULL);
2531	if (ret < `0`)
2532	return ret;
2533	pos += stripe_unit;
2534	}
2535	offset += object_set_size;
2536	length -= object_set_size;
2537	}
2538	while (length) {
2539	loff_t size = length;
2540	ret = ceph_zero_partial_object(inode, offset, length: &size);
2541	if (ret < `0`)
2542	return ret;
2543	offset += size;
2544	length -= size;
2545	}
2546	return ret;
2547	}
2548
2549	static long ceph_fallocate(struct file file, int* mode,
2550	loff_t offset, loff_t length)
2551	{
2552	struct ceph_file_info *fi = file->private_data;
2553	struct inode *inode = file_inode(f: file);
2554	struct ceph_inode_info *ci = ceph_inode(inode);
2555	struct ceph_cap_flush *prealloc_cf;
2556	int want, got = `0`;
2557	int dirty;
2558	int ret = `0`;
2559	loff_t endoff = `0`;
2560	loff_t size;
2561
2562	dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__,
2563	inode, ceph_vinop(inode), mode, offset, length);
2564
2565	if (mode != (FALLOC_FL_KEEP_SIZE \| FALLOC_FL_PUNCH_HOLE))
2566	return -EOPNOTSUPP;
2567
2568	if (!S_ISREG(inode->i_mode))
2569	return -EOPNOTSUPP;
2570
2571	if (IS_ENCRYPTED(inode))
2572	return -EOPNOTSUPP;
2573
2574	prealloc_cf = ceph_alloc_cap_flush();
2575	if (!prealloc_cf)
2576	return -ENOMEM;
2577
2578	inode_lock(inode);
2579
2580	if (ceph_snap(inode) != CEPH_NOSNAP) {
2581	ret = -EROFS;
2582	goto unlock;
2583	}
2584
2585	size = i_size_read(inode);
2586
2587	/ Are we punching a hole beyond EOF? /
2588	if (offset >= size)
2589	goto unlock;
2590	if ((offset + length) > size)
2591	length = size - offset;
2592
2593	if (fi->fmode & CEPH_FILE_MODE_LAZY)
2594	want = CEPH_CAP_FILE_BUFFER \| CEPH_CAP_FILE_LAZYIO;
2595	else
2596	want = CEPH_CAP_FILE_BUFFER;
2597
2598	ret = ceph_get_caps(filp: file, CEPH_CAP_FILE_WR, want, endoff, got: &got);
2599	if (ret < `0`)
2600	goto unlock;
2601
2602	ret = file_modified(file);
2603	if (ret)
2604	goto put_caps;
2605
2606	filemap_invalidate_lock(mapping: inode->i_mapping);
2607	ceph_fscache_invalidate(inode, dio_write: false);
2608	ceph_zero_pagecache_range(inode, offset, length);
2609	ret = ceph_zero_objects(inode, offset, length);
2610
2611	if (!ret) {
2612	spin_lock(lock: &ci->i_ceph_lock);
2613	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
2614	pcf: &prealloc_cf);
2615	spin_unlock(lock: &ci->i_ceph_lock);
2616	if (dirty)
2617	__mark_inode_dirty(inode, dirty);
2618	}
2619	filemap_invalidate_unlock(mapping: inode->i_mapping);
2620
2621	put_caps:
2622	ceph_put_cap_refs(ci, had: got);
2623	unlock:
2624	inode_unlock(inode);
2625	ceph_free_cap_flush(cf: prealloc_cf);
2626	return ret;
2627	}
2628
2629	/*
2630	* This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for
2631	* src_ci. Two attempts are made to obtain both caps, and an error is return if
2632	* this fails; zero is returned on success.
2633	*/
2634	static int get_rd_wr_caps(struct file src_filp, int* *src_got,
2635	struct file *dst_filp,
2636	loff_t dst_endoff, int *dst_got)
2637	{
2638	int ret = `0`;
2639	bool retrying = false;
2640
2641	retry_caps:
2642	ret = ceph_get_caps(filp: dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
2643	endoff: dst_endoff, got: dst_got);
2644	if (ret < `0`)
2645	return ret;
2646
2647	/*
2648	* Since we're already holding the FILE_WR capability for the dst file,
2649	* we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
2650	* retry dance instead to try to get both capabilities.
2651	*/
2652	ret = ceph_try_get_caps(inode: file_inode(f: src_filp),
2653	CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
2654	nonblock: false, got: src_got);
2655	if (ret <= `0`) {
2656	/ Start by dropping dst_ci caps and getting src_ci caps /
2657	ceph_put_cap_refs(ci: ceph_inode(inode: file_inode(f: dst_filp)), had: *dst_got);
2658	if (retrying) {
2659	if (!ret)
2660	/ ceph_try_get_caps masks EAGAIN /
2661	ret = -EAGAIN;
2662	return ret;
2663	}
2664	ret = ceph_get_caps(filp: src_filp, CEPH_CAP_FILE_RD,
2665	CEPH_CAP_FILE_SHARED, endoff: -`1`, got: src_got);
2666	if (ret < `0`)
2667	return ret;
2668	/... drop src_ci caps too, and retry /
2669	ceph_put_cap_refs(ci: ceph_inode(inode: file_inode(f: src_filp)), had: *src_got);
2670	retrying = true;
2671	goto retry_caps;
2672	}
2673	return ret;
2674	}
2675
/*
 * Release the cap references in @src_got on @src_ci and in @dst_got on
 * @dst_ci.
 */
static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got,
			   struct ceph_inode_info *dst_ci, int dst_got)
{
	ceph_put_cap_refs(src_ci, src_got);
	ceph_put_cap_refs(dst_ci, dst_got);
}
2682
2683	/*
2684	* This function does several size-related checks, returning an error if:
2685	* - source file is smaller than off+len
2686	* - destination file size is not OK (inode_newsize_ok())
2687	* - max bytes quotas is exceeded
2688	*/
2689	static int is_file_size_ok(struct inode src_inode, struct* inode *dst_inode,
2690	loff_t src_off, loff_t dst_off, size_t len)
2691	{
2692	loff_t size, endoff;
2693
2694	size = i_size_read(inode: src_inode);
2695	/*
2696	* Don't copy beyond source file EOF. Instead of simply setting length
2697	* to (size - src_off), just drop to VFS default implementation, as the
2698	* local i_size may be stale due to other clients writing to the source
2699	* inode.
2700	*/
2701	if (src_off + len > size) {
2702	dout("Copy beyond EOF (%llu + %zu > %llu)\n",
2703	src_off, len, size);
2704	return -EOPNOTSUPP;
2705	}
2706	size = i_size_read(inode: dst_inode);
2707
2708	endoff = dst_off + len;
2709	if (inode_newsize_ok(dst_inode, offset: endoff))
2710	return -EOPNOTSUPP;
2711
2712	if (ceph_quota_is_max_bytes_exceeded(inode: dst_inode, newlen: endoff))
2713	return -EDQUOT;
2714
2715	return `0`;
2716	}
2717
2718	static struct ceph_osd_request *
2719	ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc,
2720	u64 src_snapid,
2721	struct ceph_object_id *src_oid,
2722	struct ceph_object_locator *src_oloc,
2723	struct ceph_object_id *dst_oid,
2724	struct ceph_object_locator *dst_oloc,
2725	u32 truncate_seq, u64 truncate_size)
2726	{
2727	struct ceph_osd_request *req;
2728	int ret;
2729	u32 src_fadvise_flags =
2730	CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL \|
2731	CEPH_OSD_OP_FLAG_FADVISE_NOCACHE;
2732	u32 dst_fadvise_flags =
2733	CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL \|
2734	CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
2735
2736	req = ceph_osdc_alloc_request(osdc, NULL, num_ops: `1`, use_mempool: false, GFP_KERNEL);
2737	if (!req)
2738	return ERR_PTR(error: -ENOMEM);
2739
2740	req->r_flags = CEPH_OSD_FLAG_WRITE;
2741
2742	ceph_oloc_copy(dest: &req->r_t.base_oloc, src: dst_oloc);
2743	ceph_oid_copy(dest: &req->r_t.base_oid, src: dst_oid);
2744
2745	ret = osd_req_op_copy_from_init(req, src_snapid, src_version: `0`,
2746	src_oid, src_oloc,
2747	src_fadvise_flags,
2748	dst_fadvise_flags,
2749	truncate_seq,
2750	truncate_size,
2751	copy_from_flags: CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
2752	if (ret)
2753	goto out;
2754
2755	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
2756	if (ret)
2757	goto out;
2758
2759	return req;
2760
2761	out:
2762	ceph_osdc_put_request(req);
2763	return ERR_PTR(error: ret);
2764	}
2765
2766	static ssize_t ceph_do_objects_copy(struct ceph_inode_info src_ci, u64 src_off,
2767	struct ceph_inode_info dst_ci, u64 dst_off,
2768	struct ceph_fs_client *fsc,
2769	size_t len, unsigned int flags)
2770	{
2771	struct ceph_object_locator src_oloc, dst_oloc;
2772	struct ceph_object_id src_oid, dst_oid;
2773	struct ceph_osd_client *osdc;
2774	struct ceph_osd_request *req;
2775	size_t bytes = `0`;
2776	u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
2777	u32 src_objlen, dst_objlen;
2778	u32 object_size = src_ci->i_layout.object_size;
2779	int ret;
2780
2781	src_oloc.pool = src_ci->i_layout.pool_id;
2782	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
2783	dst_oloc.pool = dst_ci->i_layout.pool_id;
2784	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
2785	osdc = &fsc->client->osdc;
2786
2787	while (len >= object_size) {
2788	ceph_calc_file_object_mapping(l: &src_ci->i_layout, off: *src_off,
2789	len: object_size, objno: &src_objnum,
2790	objoff: &src_objoff, xlen: &src_objlen);
2791	ceph_calc_file_object_mapping(l: &dst_ci->i_layout, off: *dst_off,
2792	len: object_size, objno: &dst_objnum,
2793	objoff: &dst_objoff, xlen: &dst_objlen);
2794	ceph_oid_init(oid: &src_oid);
2795	ceph_oid_printf(oid: &src_oid, fmt: "%llx.%08llx",
2796	src_ci->i_vino.ino, src_objnum);
2797	ceph_oid_init(oid: &dst_oid);
2798	ceph_oid_printf(oid: &dst_oid, fmt: "%llx.%08llx",
2799	dst_ci->i_vino.ino, dst_objnum);
2800	/ Do an object remote copy /
2801	req = ceph_alloc_copyfrom_request(osdc, src_snapid: src_ci->i_vino.snap,
2802	src_oid: &src_oid, src_oloc: &src_oloc,
2803	dst_oid: &dst_oid, dst_oloc: &dst_oloc,
2804	truncate_seq: dst_ci->i_truncate_seq,
2805	truncate_size: dst_ci->i_truncate_size);
2806	if (IS_ERR(ptr: req))
2807	ret = PTR_ERR(ptr: req);
2808	else {
2809	ceph_osdc_start_request(osdc, req);
2810	ret = ceph_osdc_wait_request(osdc, req);
2811	ceph_update_copyfrom_metrics(m: &fsc->mdsc->metric,
2812	r_start: req->r_start_latency,
2813	r_end: req->r_end_latency,
2814	size: object_size, rc: ret);
2815	ceph_osdc_put_request(req);
2816	}
2817	if (ret) {
2818	if (ret == -EOPNOTSUPP) {
2819	fsc->have_copy_from2 = false;
2820	pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
2821	}
2822	dout("ceph_osdc_copy_from returned %d\n", ret);
2823	if (!bytes)
2824	bytes = ret;
2825	goto out;
2826	}
2827	len -= object_size;
2828	bytes += object_size;
2829	*src_off += object_size;
2830	*dst_off += object_size;
2831	}
2832
2833	out:
2834	ceph_oloc_destroy(oloc: &src_oloc);
2835	ceph_oloc_destroy(oloc: &dst_oloc);
2836	return bytes;
2837	}
2838
2839	static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2840	struct file *dst_file, loff_t dst_off,
2841	size_t len, unsigned int flags)
2842	{
2843	struct inode *src_inode = file_inode(f: src_file);
2844	struct inode *dst_inode = file_inode(f: dst_file);
2845	struct ceph_inode_info *src_ci = ceph_inode(inode: src_inode);
2846	struct ceph_inode_info *dst_ci = ceph_inode(inode: dst_inode);
2847	struct ceph_cap_flush *prealloc_cf;
2848	struct ceph_fs_client *src_fsc = ceph_inode_to_client(inode: src_inode);
2849	loff_t size;
2850	ssize_t ret = -EIO, bytes;
2851	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
2852	u32 src_objlen, dst_objlen;
2853	int src_got = `0`, dst_got = `0`, err, dirty;
2854
2855	if (src_inode->i_sb != dst_inode->i_sb) {
2856	struct ceph_fs_client *dst_fsc = ceph_inode_to_client(inode: dst_inode);
2857
2858	if (ceph_fsid_compare(a: &src_fsc->client->fsid,
2859	b: &dst_fsc->client->fsid)) {
2860	dout("Copying files across clusters: src: %pU dst: %pU\n",
2861	&src_fsc->client->fsid, &dst_fsc->client->fsid);
2862	return -EXDEV;
2863	}
2864	}
2865	if (ceph_snap(inode: dst_inode) != CEPH_NOSNAP)
2866	return -EROFS;
2867
2868	/*
2869	* Some of the checks below will return -EOPNOTSUPP, which will force a
2870	* fallback to the default VFS copy_file_range implementation. This is
2871	* desirable in several cases (for ex, the 'len' is smaller than the
2872	* size of the objects, or in cases where that would be more
2873	* efficient).
2874	*/
2875
2876	if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
2877	return -EOPNOTSUPP;
2878
2879	if (!src_fsc->have_copy_from2)
2880	return -EOPNOTSUPP;
2881
2882	/*
2883	* Striped file layouts require that we copy partial objects, but the
2884	* OSD copy-from operation only supports full-object copies. Limit
2885	* this to non-striped file layouts for now.
2886	*/
2887	if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) \|\|
2888	(src_ci->i_layout.stripe_count != `1`) \|\|
2889	(dst_ci->i_layout.stripe_count != `1`) \|\|
2890	(src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
2891	dout("Invalid src/dst files layout\n");
2892	return -EOPNOTSUPP;
2893	}
2894
2895	/ Every encrypted inode gets its own key, so we can't offload them /
2896	if (IS_ENCRYPTED(src_inode) \|\| IS_ENCRYPTED(dst_inode))
2897	return -EOPNOTSUPP;
2898
2899	if (len < src_ci->i_layout.object_size)
2900	return -EOPNOTSUPP; / no remote copy will be done /
2901
2902	prealloc_cf = ceph_alloc_cap_flush();
2903	if (!prealloc_cf)
2904	return -ENOMEM;
2905
2906	/ Start by sync'ing the source and destination files /
2907	ret = file_write_and_wait_range(file: src_file, start: src_off, end: (src_off + len));
2908	if (ret < `0`) {
2909	dout("failed to write src file (%zd)\n", ret);
2910	goto out;
2911	}
2912	ret = file_write_and_wait_range(file: dst_file, start: dst_off, end: (dst_off + len));
2913	if (ret < `0`) {
2914	dout("failed to write dst file (%zd)\n", ret);
2915	goto out;
2916	}
2917
2918	/*
2919	* We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other
2920	* clients may have dirty data in their caches. And OSDs know nothing
2921	* about caps, so they can't safely do the remote object copies.
2922	*/
2923	err = get_rd_wr_caps(src_filp: src_file, src_got: &src_got,
2924	dst_filp: dst_file, dst_endoff: (dst_off + len), dst_got: &dst_got);
2925	if (err < `0`) {
2926	dout("get_rd_wr_caps returned %d\n", err);
2927	ret = -EOPNOTSUPP;
2928	goto out;
2929	}
2930
2931	ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len);
2932	if (ret < `0`)
2933	goto out_caps;
2934
2935	/ Drop dst file cached pages /
2936	ceph_fscache_invalidate(inode: dst_inode, dio_write: false);
2937	ret = invalidate_inode_pages2_range(mapping: dst_inode->i_mapping,
2938	start: dst_off >> PAGE_SHIFT,
2939	end: (dst_off + len) >> PAGE_SHIFT);
2940	if (ret < `0`) {
2941	dout("Failed to invalidate inode pages (%zd)\n", ret);
2942	ret = `0`; / XXX /
2943	}
2944	ceph_calc_file_object_mapping(l: &src_ci->i_layout, off: src_off,
2945	len: src_ci->i_layout.object_size,
2946	objno: &src_objnum, objoff: &src_objoff, xlen: &src_objlen);
2947	ceph_calc_file_object_mapping(l: &dst_ci->i_layout, off: dst_off,
2948	len: dst_ci->i_layout.object_size,
2949	objno: &dst_objnum, objoff: &dst_objoff, xlen: &dst_objlen);
2950	/ object-level offsets need to the same /
2951	if (src_objoff != dst_objoff) {
2952	ret = -EOPNOTSUPP;
2953	goto out_caps;
2954	}
2955
2956	/*
2957	* Do a manual copy if the object offset isn't object aligned.
2958	* 'src_objlen' contains the bytes left until the end of the object,
2959	* starting at the src_off
2960	*/
2961	if (src_objoff) {
2962	dout("Initial partial copy of %u bytes\n", src_objlen);
2963
2964	/*
2965	* we need to temporarily drop all caps as we'll be calling
2966	* {read,write}_iter, which will get caps again.
2967	*/
2968	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
2969	ret = do_splice_direct(in: src_file, ppos: &src_off, out: dst_file,
2970	opos: &dst_off, len: src_objlen, flags);
2971	/ Abort on short copies or on error /
2972	if (ret < (long)src_objlen) {
2973	dout("Failed partial copy (%zd)\n", ret);
2974	goto out;
2975	}
2976	len -= ret;
2977	err = get_rd_wr_caps(src_filp: src_file, src_got: &src_got,
2978	dst_filp: dst_file, dst_endoff: (dst_off + len), dst_got: &dst_got);
2979	if (err < `0`)
2980	goto out;
2981	err = is_file_size_ok(src_inode, dst_inode,
2982	src_off, dst_off, len);
2983	if (err < `0`)
2984	goto out_caps;
2985	}
2986
2987	size = i_size_read(inode: dst_inode);
2988	bytes = ceph_do_objects_copy(src_ci, src_off: &src_off, dst_ci, dst_off: &dst_off,
2989	fsc: src_fsc, len, flags);
2990	if (bytes <= `0`) {
2991	if (!ret)
2992	ret = bytes;
2993	goto out_caps;
2994	}
2995	dout("Copied %zu bytes out of %zu\n", bytes, len);
2996	len -= bytes;
2997	ret += bytes;
2998
2999	file_update_time(file: dst_file);
3000	inode_inc_iversion_raw(inode: dst_inode);
3001
3002	if (dst_off > size) {
3003	/ Let the MDS know about dst file size change /
3004	if (ceph_inode_set_size(inode: dst_inode, size: dst_off) \|\|
3005	ceph_quota_is_max_bytes_approaching(inode: dst_inode, newlen: dst_off))
3006	ceph_check_caps(ci: dst_ci, CHECK_CAPS_AUTHONLY \| CHECK_CAPS_FLUSH);
3007	}
3008	/ Mark Fw dirty /
3009	spin_lock(lock: &dst_ci->i_ceph_lock);
3010	dirty = __ceph_mark_dirty_caps(ci: dst_ci, CEPH_CAP_FILE_WR, pcf: &prealloc_cf);
3011	spin_unlock(lock: &dst_ci->i_ceph_lock);
3012	if (dirty)
3013	__mark_inode_dirty(dst_inode, dirty);
3014
3015	out_caps:
3016	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
3017
3018	/*
3019	* Do the final manual copy if we still have some bytes left, unless
3020	* there were errors in remote object copies (len >= object_size).
3021	*/
3022	if (len && (len < src_ci->i_layout.object_size)) {
3023	dout("Final partial copy of %zu bytes\n", len);
3024	bytes = do_splice_direct(in: src_file, ppos: &src_off, out: dst_file,
3025	opos: &dst_off, len, flags);
3026	if (bytes > `0`)
3027	ret += bytes;
3028	else
3029	dout("Failed partial copy (%zd)\n", bytes);
3030	}
3031
3032	out:
3033	ceph_free_cap_flush(cf: prealloc_cf);
3034
3035	return ret;
3036	}
3037
3038	static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
3039	struct file *dst_file, loff_t dst_off,
3040	size_t len, unsigned int flags)
3041	{
3042	ssize_t ret;
3043
3044	ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
3045	len, flags);
3046
3047	if (ret == -EOPNOTSUPP \|\| ret == -EXDEV)
3048	ret = generic_copy_file_range(file_in: src_file, pos_in: src_off, file_out: dst_file,
3049	pos_out: dst_off, len, flags);
3050	return ret;
3051	}
3052
/*
 * File operations table for ceph files.  It wires the ceph-specific
 * handlers defined in this file (and elsewhere in the ceph client)
 * together with a few generic helpers.
 */
const struct file_operations ceph_file_fops = {
	.open = ceph_open,
	.release = ceph_release,
	.llseek = ceph_llseek,
	.read_iter = ceph_read_iter,
	.write_iter = ceph_write_iter,
	.mmap = ceph_mmap,
	.fsync = ceph_fsync,
	.lock = ceph_lock,
	.setlease = simple_nosetlease,
	.flock = ceph_flock,
	.splice_read = ceph_splice_read,
	.splice_write = iter_file_splice_write,
	.unlocked_ioctl = ceph_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.fallocate	= ceph_fallocate,
	.copy_file_range = ceph_copy_file_range,
};
3071

source code of linux/fs/ceph/file.c