inode.c source code [linux/fs/ceph/inode.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/module.h>
5	#include <linux/fs.h>
6	#include <linux/slab.h>
7	#include <linux/string.h>
8	#include <linux/uaccess.h>
9	#include <linux/kernel.h>
10	#include <linux/writeback.h>
11	#include <linux/vmalloc.h>
12	#include <linux/xattr.h>
13	#include <linux/posix_acl.h>
14	#include <linux/random.h>
15	#include <linux/sort.h>
16	#include <linux/iversion.h>
17	#include <linux/fscrypt.h>
18
19	#include "super.h"
20	#include "mds_client.h"
21	#include "cache.h"
22	#include "crypto.h"
23	#include <linux/ceph/decode.h>
24
25	/*
26	* Ceph inode operations
27	*
28	* Implement basic inode helpers (get, alloc) and inode ops (getattr,
29	* setattr, etc.), xattr helpers, and helpers for assimilating
30	* metadata returned by the MDS into our cache.
31	*
32	* Also define helpers for doing asynchronous writeback, invalidation,
33	* and truncation for the benefit of those who can't afford to block
34	* (typically because they are in the message handler path).
35	*/
36
37	static const struct inode_operations ceph_symlink_iops;
38	static const struct inode_operations ceph_encrypted_symlink_iops;
39
40	static void ceph_inode_work(struct work_struct *work);
41
42	/*
43	* find or create an inode, given the ceph ino number
44	*/
45	static int ceph_set_ino_cb(struct inode inode, void* *data)
46	{
47	struct ceph_inode_info *ci = ceph_inode(inode);
48	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
49
50	ci->i_vino = (struct* ceph_vino *)data;
51	inode->i_ino = ceph_vino_to_ino_t(vino: ci->i_vino);
52	inode_set_iversion_raw(inode, val: `0`);
53	percpu_counter_inc(fbc: &mdsc->metric.total_inodes);
54
55	return `0`;
56	}
57
58	/**
59	* ceph_new_inode - allocate a new inode in advance of an expected create
60	* @dir: parent directory for new inode
61	* @dentry: dentry that may eventually point to new inode
62	* @mode: mode of new inode
63	* @as_ctx: pointer to inherited security context
64	*
65	* Allocate a new inode in advance of an operation to create a new inode.
66	* This allocates the inode and sets up the acl_sec_ctx with appropriate
67	* info for the new inode.
68	*
69	* Returns a pointer to the new inode or an ERR_PTR.
70	*/
71	struct inode ceph_new_inode(struct* inode dir, struct* dentry *dentry,
72	umode_t mode, struct* ceph_acl_sec_ctx *as_ctx)
73	{
74	int err;
75	struct inode *inode;
76
77	inode = new_inode(sb: dir->i_sb);
78	if (!inode)
79	return ERR_PTR(error: -ENOMEM);
80
81	if (!S_ISLNK(*mode)) {
82	err = ceph_pre_init_acls(dir, mode, as_ctx);
83	if (err < `0`)
84	goto out_err;
85	}
86
87	inode->i_state = `0`;
88	inode->i_mode = *mode;
89
90	err = ceph_security_init_secctx(dentry, mode: *mode, ctx: as_ctx);
91	if (err < `0`)
92	goto out_err;
93
94	/*
95	* We'll skip setting fscrypt context for snapshots, leaving that for
96	* the handle_reply().
97	*/
98	if (ceph_snap(inode: dir) != CEPH_SNAPDIR) {
99	err = ceph_fscrypt_prepare_context(dir, inode, as: as_ctx);
100	if (err)
101	goto out_err;
102	}
103
104	return inode;
105	out_err:
106	iput(inode);
107	return ERR_PTR(error: err);
108	}
109
110	void ceph_as_ctx_to_req(struct ceph_mds_request *req,
111	struct ceph_acl_sec_ctx *as_ctx)
112	{
113	if (as_ctx->pagelist) {
114	req->r_pagelist = as_ctx->pagelist;
115	as_ctx->pagelist = NULL;
116	}
117	ceph_fscrypt_as_ctx_to_req(req, as: as_ctx);
118	}
119
120	/**
121	* ceph_get_inode - find or create/hash a new inode
122	* @sb: superblock to search and allocate in
123	* @vino: vino to search for
124	* @newino: optional new inode to insert if one isn't found (may be NULL)
125	*
126	* Search for or insert a new inode into the hash for the given vino, and
127	* return a reference to it. If new is non-NULL, its reference is consumed.
128	*/
129	struct inode ceph_get_inode(struct* super_block sb, struct* ceph_vino vino,
130	struct inode *newino)
131	{
132	struct inode *inode;
133
134	if (ceph_vino_is_reserved(vino))
135	return ERR_PTR(error: -EREMOTEIO);
136
137	if (newino) {
138	inode = inode_insert5(inode: newino, hashval: (unsigned long)vino.ino,
139	test: ceph_ino_compare, set: ceph_set_ino_cb, data: &vino);
140	if (inode != newino)
141	iput(newino);
142	} else {
143	inode = iget5_locked(sb, (unsigned long)vino.ino,
144	test: ceph_ino_compare, set: ceph_set_ino_cb, &vino);
145	}
146
147	if (!inode) {
148	dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
149	return ERR_PTR(error: -ENOMEM);
150	}
151
152	dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
153	ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
154	return inode;
155	}
156
157	/*
158	* get/constuct snapdir inode for a given directory
159	*/
160	struct inode ceph_get_snapdir(struct* inode *parent)
161	{
162	struct ceph_vino vino = {
163	.ino = ceph_ino(inode: parent),
164	.snap = CEPH_SNAPDIR,
165	};
166	struct inode *inode = ceph_get_inode(sb: parent->i_sb, vino, NULL);
167	struct ceph_inode_info *ci = ceph_inode(inode);
168	int ret = -ENOTDIR;
169
170	if (IS_ERR(ptr: inode))
171	return inode;
172
173	if (!S_ISDIR(parent->i_mode)) {
174	pr_warn_once("bad snapdir parent type (mode=0%o)\n",
175	parent->i_mode);
176	goto err;
177	}
178
179	if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
180	pr_warn_once("bad snapdir inode type (mode=0%o)\n",
181	inode->i_mode);
182	goto err;
183	}
184
185	inode->i_mode = parent->i_mode;
186	inode->i_uid = parent->i_uid;
187	inode->i_gid = parent->i_gid;
188	inode_set_mtime_to_ts(inode, ts: inode_get_mtime(inode: parent));
189	inode_set_ctime_to_ts(inode, ts: inode_get_ctime(inode: parent));
190	inode_set_atime_to_ts(inode, ts: inode_get_atime(inode: parent));
191	ci->i_rbytes = `0`;
192	ci->i_btime = ceph_inode(inode: parent)->i_btime;
193
194	#ifdef CONFIG_FS_ENCRYPTION
195	/ if encrypted, just borrow fscrypt_auth from parent /
196	if (IS_ENCRYPTED(parent)) {
197	struct ceph_inode_info *pci = ceph_inode(inode: parent);
198
199	ci->fscrypt_auth = kmemdup(p: pci->fscrypt_auth,
200	size: pci->fscrypt_auth_len,
201	GFP_KERNEL);
202	if (ci->fscrypt_auth) {
203	inode->i_flags \|= S_ENCRYPTED;
204	ci->fscrypt_auth_len = pci->fscrypt_auth_len;
205	} else {
206	dout("Failed to alloc snapdir fscrypt_auth\n");
207	ret = -ENOMEM;
208	goto err;
209	}
210	}
211	#endif
212	if (inode->i_state & I_NEW) {
213	inode->i_op = &ceph_snapdir_iops;
214	inode->i_fop = &ceph_snapdir_fops;
215	ci->i_snap_caps = CEPH_CAP_PIN; / so we can open /
216	unlock_new_inode(inode);
217	}
218
219	return inode;
220	err:
221	if ((inode->i_state & I_NEW))
222	discard_new_inode(inode);
223	else
224	iput(inode);
225	return ERR_PTR(error: ret);
226	}
227
228	const struct inode_operations ceph_file_iops = {
229	.permission = ceph_permission,
230	.setattr = ceph_setattr,
231	.getattr = ceph_getattr,
232	.listxattr = ceph_listxattr,
233	.get_inode_acl = ceph_get_acl,
234	.set_acl = ceph_set_acl,
235	};
236
237
238	/*
239	* We use a 'frag tree' to keep track of the MDS's directory fragments
240	* for a given inode (usually there is just a single fragment). We
241	* need to know when a child frag is delegated to a new MDS, or when
242	* it is flagged as replicated, so we can direct our requests
243	* accordingly.
244	*/
245
246	/*
247	* find/create a frag in the tree
248	*/
249	static struct ceph_inode_frag __get_or_create_frag(struct* ceph_inode_info *ci,
250	u32 f)
251	{
252	struct rb_node **p;
253	struct rb_node *parent = NULL;
254	struct ceph_inode_frag *frag;
255	int c;
256
257	p = &ci->i_fragtree.rb_node;
258	while (*p) {
259	parent = *p;
260	frag = rb_entry(parent, struct ceph_inode_frag, node);
261	c = ceph_frag_compare(a: f, b: frag->frag);
262	if (c < `0`)
263	p = &(*p)->rb_left;
264	else if (c > `0`)
265	p = &(*p)->rb_right;
266	else
267	return frag;
268	}
269
270	frag = kmalloc(size: sizeof(*frag), GFP_NOFS);
271	if (!frag)
272	return ERR_PTR(error: -ENOMEM);
273
274	frag->frag = f;
275	frag->split_by = `0`;
276	frag->mds = -`1`;
277	frag->ndist = `0`;
278
279	rb_link_node(node: &frag->node, parent, rb_link: p);
280	rb_insert_color(&frag->node, &ci->i_fragtree);
281
282	dout("get_or_create_frag added %llx.%llx frag %x\n",
283	ceph_vinop(&ci->netfs.inode), f);
284	return frag;
285	}
286
287	/*
288	* find a specific frag @f
289	*/
290	struct ceph_inode_frag __ceph_find_frag(struct* ceph_inode_info *ci, u32 f)
291	{
292	struct rb_node *n = ci->i_fragtree.rb_node;
293
294	while (n) {
295	struct ceph_inode_frag *frag =
296	rb_entry(n, struct ceph_inode_frag, node);
297	int c = ceph_frag_compare(a: f, b: frag->frag);
298	if (c < `0`)
299	n = n->rb_left;
300	else if (c > `0`)
301	n = n->rb_right;
302	else
303	return frag;
304	}
305	return NULL;
306	}
307
308	/*
309	* Choose frag containing the given value @v. If @pfrag is
310	* specified, copy the frag delegation info to the caller if
311	* it is present.
312	*/
313	static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
314	struct ceph_inode_frag pfrag, int* *found)
315	{
316	u32 t = ceph_frag_make(b: `0`, v: `0`);
317	struct ceph_inode_frag *frag;
318	unsigned nway, i;
319	u32 n;
320
321	if (found)
322	*found = `0`;
323
324	while (`1`) {
325	WARN_ON(!ceph_frag_contains_value(t, v));
326	frag = __ceph_find_frag(ci, f: t);
327	if (!frag)
328	break; / t is a leaf /
329	if (frag->split_by == `0`) {
330	if (pfrag)
331	memcpy(pfrag, frag, sizeof(*pfrag));
332	if (found)
333	*found = `1`;
334	break;
335	}
336
337	/ choose child /
338	nway = `1` << frag->split_by;
339	dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
340	frag->split_by, nway);
341	for (i = `0`; i < nway; i++) {
342	n = ceph_frag_make_child(f: t, by: frag->split_by, i);
343	if (ceph_frag_contains_value(f: n, v)) {
344	t = n;
345	break;
346	}
347	}
348	BUG_ON(i == nway);
349	}
350	dout("choose_frag(%x) = %x\n", v, t);
351
352	return t;
353	}
354
355	u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
356	struct ceph_inode_frag pfrag, int* *found)
357	{
358	u32 ret;
359	mutex_lock(&ci->i_fragtree_mutex);
360	ret = __ceph_choose_frag(ci, v, pfrag, found);
361	mutex_unlock(lock: &ci->i_fragtree_mutex);
362	return ret;
363	}
364
365	/*
366	* Process dirfrag (delegation) info from the mds. Include leaf
367	* fragment in tree ONLY if ndist > 0. Otherwise, only
368	* branches/splits are included in i_fragtree)
369	*/
370	static int ceph_fill_dirfrag(struct inode *inode,
371	struct ceph_mds_reply_dirfrag *dirinfo)
372	{
373	struct ceph_inode_info *ci = ceph_inode(inode);
374	struct ceph_inode_frag *frag;
375	u32 id = le32_to_cpu(dirinfo->frag);
376	int mds = le32_to_cpu(dirinfo->auth);
377	int ndist = le32_to_cpu(dirinfo->ndist);
378	int diri_auth = -`1`;
379	int i;
380	int err = `0`;
381
382	spin_lock(lock: &ci->i_ceph_lock);
383	if (ci->i_auth_cap)
384	diri_auth = ci->i_auth_cap->mds;
385	spin_unlock(lock: &ci->i_ceph_lock);
386
387	if (mds == -`1`) / CDIR_AUTH_PARENT /
388	mds = diri_auth;
389
390	mutex_lock(&ci->i_fragtree_mutex);
391	if (ndist == `0` && mds == diri_auth) {
392	/ no delegation info needed. /
393	frag = __ceph_find_frag(ci, f: id);
394	if (!frag)
395	goto out;
396	if (frag->split_by == `0`) {
397	/ tree leaf, remove /
398	dout("fill_dirfrag removed %llx.%llx frag %x"
399	" (no ref)\n", ceph_vinop(inode), id);
400	rb_erase(&frag->node, &ci->i_fragtree);
401	kfree(objp: frag);
402	} else {
403	/ tree branch, keep and clear /
404	dout("fill_dirfrag cleared %llx.%llx frag %x"
405	" referral\n", ceph_vinop(inode), id);
406	frag->mds = -`1`;
407	frag->ndist = `0`;
408	}
409	goto out;
410	}
411
412
413	/ find/add this frag to store mds delegation info /
414	frag = __get_or_create_frag(ci, f: id);
415	if (IS_ERR(ptr: frag)) {
416	/ this is not the end of the world; we can continue*
417	with bad/inaccurate delegation info /*
418	pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
419	ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
420	err = -ENOMEM;
421	goto out;
422	}
423
424	frag->mds = mds;
425	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
426	for (i = `0`; i < frag->ndist; i++)
427	frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
428	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
429	ceph_vinop(inode), frag->frag, frag->ndist);
430
431	out:
432	mutex_unlock(lock: &ci->i_fragtree_mutex);
433	return err;
434	}
435
436	static int frag_tree_split_cmp(const void l, const* void *r)
437	{
438	struct ceph_frag_tree_split ls = (struct* ceph_frag_tree_split*)l;
439	struct ceph_frag_tree_split rs = (struct* ceph_frag_tree_split*)r;
440	return ceph_frag_compare(le32_to_cpu(ls->frag),
441	le32_to_cpu(rs->frag));
442	}
443
444	static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
445	{
446	if (!frag)
447	return f == ceph_frag_make(b: `0`, v: `0`);
448	if (ceph_frag_bits(f) != ceph_frag_bits(f: frag->frag) + frag->split_by)
449	return false;
450	return ceph_frag_contains_value(f: frag->frag, v: ceph_frag_value(f));
451	}
452
453	static int ceph_fill_fragtree(struct inode *inode,
454	struct ceph_frag_tree_head *fragtree,
455	struct ceph_mds_reply_dirfrag *dirinfo)
456	{
457	struct ceph_inode_info *ci = ceph_inode(inode);
458	struct ceph_inode_frag frag, prev_frag = NULL;
459	struct rb_node *rb_node;
460	unsigned i, split_by, nsplits;
461	u32 id;
462	bool update = false;
463
464	mutex_lock(&ci->i_fragtree_mutex);
465	nsplits = le32_to_cpu(fragtree->nsplits);
466	if (nsplits != ci->i_fragtree_nsplits) {
467	update = true;
468	} else if (nsplits) {
469	i = get_random_u32_below(ceil: nsplits);
470	id = le32_to_cpu(fragtree->splits[i].frag);
471	if (!__ceph_find_frag(ci, f: id))
472	update = true;
473	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
474	rb_node = rb_first(&ci->i_fragtree);
475	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
476	if (frag->frag != ceph_frag_make(b: `0`, v: `0`) \|\| rb_next(rb_node))
477	update = true;
478	}
479	if (!update && dirinfo) {
480	id = le32_to_cpu(dirinfo->frag);
481	if (id != __ceph_choose_frag(ci, v: id, NULL, NULL))
482	update = true;
483	}
484	if (!update)
485	goto out_unlock;
486
487	if (nsplits > `1`) {
488	sort(base: fragtree->splits, num: nsplits, size: sizeof(fragtree->splits[`0`]),
489	cmp_func: frag_tree_split_cmp, NULL);
490	}
491
492	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
493	rb_node = rb_first(&ci->i_fragtree);
494	for (i = `0`; i < nsplits; i++) {
495	id = le32_to_cpu(fragtree->splits[i].frag);
496	split_by = le32_to_cpu(fragtree->splits[i].by);
497	if (split_by == `0` \|\| ceph_frag_bits(f: id) + split_by > `24`) {
498	pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
499	"frag %x split by %d\n", ceph_vinop(inode),
500	i, nsplits, id, split_by);
501	continue;
502	}
503	frag = NULL;
504	while (rb_node) {
505	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
506	if (ceph_frag_compare(a: frag->frag, b: id) >= `0`) {
507	if (frag->frag != id)
508	frag = NULL;
509	else
510	rb_node = rb_next(rb_node);
511	break;
512	}
513	rb_node = rb_next(rb_node);
514	/ delete stale split/leaf node /
515	if (frag->split_by > `0` \|\|
516	!is_frag_child(f: frag->frag, frag: prev_frag)) {
517	rb_erase(&frag->node, &ci->i_fragtree);
518	if (frag->split_by > `0`)
519	ci->i_fragtree_nsplits--;
520	kfree(objp: frag);
521	}
522	frag = NULL;
523	}
524	if (!frag) {
525	frag = __get_or_create_frag(ci, f: id);
526	if (IS_ERR(ptr: frag))
527	continue;
528	}
529	if (frag->split_by == `0`)
530	ci->i_fragtree_nsplits++;
531	frag->split_by = split_by;
532	dout(" frag %x split by %d\n", frag->frag, frag->split_by);
533	prev_frag = frag;
534	}
535	while (rb_node) {
536	frag = rb_entry(rb_node, struct ceph_inode_frag, node);
537	rb_node = rb_next(rb_node);
538	/ delete stale split/leaf node /
539	if (frag->split_by > `0` \|\|
540	!is_frag_child(f: frag->frag, frag: prev_frag)) {
541	rb_erase(&frag->node, &ci->i_fragtree);
542	if (frag->split_by > `0`)
543	ci->i_fragtree_nsplits--;
544	kfree(objp: frag);
545	}
546	}
547	out_unlock:
548	mutex_unlock(lock: &ci->i_fragtree_mutex);
549	return `0`;
550	}
551
552	/*
553	* initialize a newly allocated inode.
554	*/
555	struct inode ceph_alloc_inode(struct* super_block *sb)
556	{
557	struct ceph_inode_info *ci;
558	int i;
559
560	ci = alloc_inode_sb(sb, cache: ceph_inode_cachep, GFP_NOFS);
561	if (!ci)
562	return NULL;
563
564	dout("alloc_inode %p\n", &ci->netfs.inode);
565
566	/ Set parameters for the netfs library /
567	netfs_inode_init(ctx: &ci->netfs, ops: &ceph_netfs_ops);
568
569	spin_lock_init(&ci->i_ceph_lock);
570
571	ci->i_version = `0`;
572	ci->i_inline_version = `0`;
573	ci->i_time_warp_seq = `0`;
574	ci->i_ceph_flags = `0`;
575	atomic64_set(v: &ci->i_ordered_count, i: `1`);
576	atomic64_set(v: &ci->i_release_count, i: `1`);
577	atomic64_set(v: &ci->i_complete_seq[`0`], i: `0`);
578	atomic64_set(v: &ci->i_complete_seq[`1`], i: `0`);
579	ci->i_symlink = NULL;
580
581	ci->i_max_bytes = `0`;
582	ci->i_max_files = `0`;
583
584	memset(&ci->i_dir_layout, `0`, sizeof(ci->i_dir_layout));
585	memset(&ci->i_cached_layout, `0`, sizeof(ci->i_cached_layout));
586	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
587
588	ci->i_fragtree = RB_ROOT;
589	mutex_init(&ci->i_fragtree_mutex);
590
591	ci->i_xattrs.blob = NULL;
592	ci->i_xattrs.prealloc_blob = NULL;
593	ci->i_xattrs.dirty = false;
594	ci->i_xattrs.index = RB_ROOT;
595	ci->i_xattrs.count = `0`;
596	ci->i_xattrs.names_size = `0`;
597	ci->i_xattrs.vals_size = `0`;
598	ci->i_xattrs.version = `0`;
599	ci->i_xattrs.index_version = `0`;
600
601	ci->i_caps = RB_ROOT;
602	ci->i_auth_cap = NULL;
603	ci->i_dirty_caps = `0`;
604	ci->i_flushing_caps = `0`;
605	INIT_LIST_HEAD(list: &ci->i_dirty_item);
606	INIT_LIST_HEAD(list: &ci->i_flushing_item);
607	ci->i_prealloc_cap_flush = NULL;
608	INIT_LIST_HEAD(list: &ci->i_cap_flush_list);
609	init_waitqueue_head(&ci->i_cap_wq);
610	ci->i_hold_caps_max = `0`;
611	INIT_LIST_HEAD(list: &ci->i_cap_delay_list);
612	INIT_LIST_HEAD(list: &ci->i_cap_snaps);
613	ci->i_head_snapc = NULL;
614	ci->i_snap_caps = `0`;
615
616	ci->i_last_rd = ci->i_last_wr = jiffies - `3600` * HZ;
617	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++)
618	ci->i_nr_by_mode[i] = `0`;
619
620	mutex_init(&ci->i_truncate_mutex);
621	ci->i_truncate_seq = `0`;
622	ci->i_truncate_size = `0`;
623	ci->i_truncate_pending = `0`;
624	ci->i_truncate_pagecache_size = `0`;
625
626	ci->i_max_size = `0`;
627	ci->i_reported_size = `0`;
628	ci->i_wanted_max_size = `0`;
629	ci->i_requested_max_size = `0`;
630
631	ci->i_pin_ref = `0`;
632	ci->i_rd_ref = `0`;
633	ci->i_rdcache_ref = `0`;
634	ci->i_wr_ref = `0`;
635	ci->i_wb_ref = `0`;
636	ci->i_fx_ref = `0`;
637	ci->i_wrbuffer_ref = `0`;
638	ci->i_wrbuffer_ref_head = `0`;
639	atomic_set(v: &ci->i_filelock_ref, i: `0`);
640	atomic_set(v: &ci->i_shared_gen, i: `1`);
641	ci->i_rdcache_gen = `0`;
642	ci->i_rdcache_revoking = `0`;
643
644	INIT_LIST_HEAD(list: &ci->i_unsafe_dirops);
645	INIT_LIST_HEAD(list: &ci->i_unsafe_iops);
646	spin_lock_init(&ci->i_unsafe_lock);
647
648	ci->i_snap_realm = NULL;
649	INIT_LIST_HEAD(list: &ci->i_snap_realm_item);
650	INIT_LIST_HEAD(list: &ci->i_snap_flush_item);
651
652	INIT_WORK(&ci->i_work, ceph_inode_work);
653	ci->i_work_mask = `0`;
654	memset(&ci->i_btime, `'\0'`, sizeof(ci->i_btime));
655	#ifdef CONFIG_FS_ENCRYPTION
656	ci->fscrypt_auth = NULL;
657	ci->fscrypt_auth_len = `0`;
658	#endif
659	return &ci->netfs.inode;
660	}
661
662	void ceph_free_inode(struct inode *inode)
663	{
664	struct ceph_inode_info *ci = ceph_inode(inode);
665
666	kfree(objp: ci->i_symlink);
667	#ifdef CONFIG_FS_ENCRYPTION
668	kfree(objp: ci->fscrypt_auth);
669	#endif
670	fscrypt_free_inode(inode);
671	kmem_cache_free(s: ceph_inode_cachep, objp: ci);
672	}
673
674	void ceph_evict_inode(struct inode *inode)
675	{
676	struct ceph_inode_info *ci = ceph_inode(inode);
677	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
678	struct ceph_inode_frag *frag;
679	struct rb_node *n;
680
681	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
682
683	percpu_counter_dec(fbc: &mdsc->metric.total_inodes);
684
685	truncate_inode_pages_final(&inode->i_data);
686	if (inode->i_state & I_PINNING_FSCACHE_WB)
687	ceph_fscache_unuse_cookie(inode, update: true);
688	clear_inode(inode);
689
690	ceph_fscache_unregister_inode_cookie(ci);
691	fscrypt_put_encryption_info(inode);
692
693	__ceph_remove_caps(ci);
694
695	if (__ceph_has_quota(ci, which: QUOTA_GET_ANY))
696	ceph_adjust_quota_realms_count(inode, inc: false);
697
698	/*
699	* we may still have a snap_realm reference if there are stray
700	* caps in i_snap_caps.
701	*/
702	if (ci->i_snap_realm) {
703	if (ceph_snap(inode) == CEPH_NOSNAP) {
704	dout(" dropping residual ref to snap realm %p\n",
705	ci->i_snap_realm);
706	ceph_change_snap_realm(inode, NULL);
707	} else {
708	ceph_put_snapid_map(mdsc, sm: ci->i_snapid_map);
709	ci->i_snap_realm = NULL;
710	}
711	}
712
713	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
714	frag = rb_entry(n, struct ceph_inode_frag, node);
715	rb_erase(n, &ci->i_fragtree);
716	kfree(objp: frag);
717	}
718	ci->i_fragtree_nsplits = `0`;
719
720	__ceph_destroy_xattrs(ci);
721	if (ci->i_xattrs.blob)
722	ceph_buffer_put(b: ci->i_xattrs.blob);
723	if (ci->i_xattrs.prealloc_blob)
724	ceph_buffer_put(b: ci->i_xattrs.prealloc_blob);
725
726	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
727	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
728	}
729
730	static inline blkcnt_t calc_inode_blocks(u64 size)
731	{
732	return (size + (`1`<<`9`) - `1`) >> `9`;
733	}
734
735	/*
736	* Helpers to fill in size, ctime, mtime, and atime. We have to be
737	* careful because either the client or MDS may have more up to date
738	* info, depending on which capabilities are held, and whether
739	* time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
740	* and size are monotonically increasing, except when utimes() or
741	* truncate() increments the corresponding _seq values.)
742	*/
743	int ceph_fill_file_size(struct inode inode, int* issued,
744	u32 truncate_seq, u64 truncate_size, u64 size)
745	{
746	struct ceph_inode_info *ci = ceph_inode(inode);
747	int queue_trunc = `0`;
748	loff_t isize = i_size_read(inode);
749
750	if (ceph_seq_cmp(a: truncate_seq, b: ci->i_truncate_seq) > `0` \|\|
751	(truncate_seq == ci->i_truncate_seq && size > isize)) {
752	dout("size %lld -> %llu\n", isize, size);
753	if (size > `0` && S_ISDIR(inode->i_mode)) {
754	pr_err("fill_file_size non-zero size for directory\n");
755	size = `0`;
756	}
757	i_size_write(inode, i_size: size);
758	inode->i_blocks = calc_inode_blocks(size);
759	/*
760	* If we're expanding, then we should be able to just update
761	* the existing cookie.
762	*/
763	if (size > isize)
764	ceph_fscache_update(inode);
765	ci->i_reported_size = size;
766	if (truncate_seq != ci->i_truncate_seq) {
767	dout("%s truncate_seq %u -> %u\n", __func__,
768	ci->i_truncate_seq, truncate_seq);
769	ci->i_truncate_seq = truncate_seq;
770
771	/ the MDS should have revoked these caps /
772	WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD \|
773	CEPH_CAP_FILE_LAZYIO));
774	/*
775	* If we hold relevant caps, or in the case where we're
776	* not the only client referencing this file and we
777	* don't hold those caps, then we need to check whether
778	* the file is either opened or mmaped
779	*/
780	if ((issued & (CEPH_CAP_FILE_CACHE\|
781	CEPH_CAP_FILE_BUFFER)) \|\|
782	mapping_mapped(mapping: inode->i_mapping) \|\|
783	__ceph_is_file_opened(ci)) {
784	ci->i_truncate_pending++;
785	queue_trunc = `1`;
786	}
787	}
788	}
789
790	/*
791	* It's possible that the new sizes of the two consecutive
792	* size truncations will be in the same fscrypt last block,
793	* and we need to truncate the corresponding page caches
794	* anyway.
795	*/
796	if (ceph_seq_cmp(a: truncate_seq, b: ci->i_truncate_seq) >= `0`) {
797	dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
798	ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
799
800	ci->i_truncate_size = truncate_size;
801
802	if (IS_ENCRYPTED(inode)) {
803	dout("%s truncate_pagecache_size %lld -> %llu\n",
804	__func__, ci->i_truncate_pagecache_size, size);
805	ci->i_truncate_pagecache_size = size;
806	} else {
807	ci->i_truncate_pagecache_size = truncate_size;
808	}
809	}
810	return queue_trunc;
811	}
812
813	void ceph_fill_file_time(struct inode inode, int* issued,
814	u64 time_warp_seq, struct timespec64 *ctime,
815	struct timespec64 mtime, struct* timespec64 *atime)
816	{
817	struct ceph_inode_info *ci = ceph_inode(inode);
818	struct timespec64 ictime = inode_get_ctime(inode);
819	int warn = `0`;
820
821	if (issued & (CEPH_CAP_FILE_EXCL\|
822	CEPH_CAP_FILE_WR\|
823	CEPH_CAP_FILE_BUFFER\|
824	CEPH_CAP_AUTH_EXCL\|
825	CEPH_CAP_XATTR_EXCL)) {
826	if (ci->i_version == `0` \|\|
827	timespec64_compare(lhs: ctime, rhs: &ictime) > `0`) {
828	dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
829	ictime.tv_sec, ictime.tv_nsec,
830	ctime->tv_sec, ctime->tv_nsec);
831	inode_set_ctime_to_ts(inode, ts: *ctime);
832	}
833	if (ci->i_version == `0` \|\|
834	ceph_seq_cmp(a: time_warp_seq, b: ci->i_time_warp_seq) > `0`) {
835	/ the MDS did a utimes() /
836	dout("mtime %lld.%09ld -> %lld.%09ld "
837	"tw %d -> %d\n",
838	inode_get_mtime_sec(inode),
839	inode_get_mtime_nsec(inode),
840	mtime->tv_sec, mtime->tv_nsec,
841	ci->i_time_warp_seq, (int)time_warp_seq);
842
843	inode_set_mtime_to_ts(inode, ts: *mtime);
844	inode_set_atime_to_ts(inode, ts: *atime);
845	ci->i_time_warp_seq = time_warp_seq;
846	} else if (time_warp_seq == ci->i_time_warp_seq) {
847	struct timespec64 ts;
848
849	/ nobody did utimes(); take the max /
850	ts = inode_get_mtime(inode);
851	if (timespec64_compare(lhs: mtime, rhs: &ts) > `0`) {
852	dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
853	ts.tv_sec, ts.tv_nsec,
854	mtime->tv_sec, mtime->tv_nsec);
855	inode_set_mtime_to_ts(inode, ts: *mtime);
856	}
857	ts = inode_get_atime(inode);
858	if (timespec64_compare(lhs: atime, rhs: &ts) > `0`) {
859	dout("atime %lld.%09ld -> %lld.%09ld inc\n",
860	ts.tv_sec, ts.tv_nsec,
861	atime->tv_sec, atime->tv_nsec);
862	inode_set_atime_to_ts(inode, ts: *atime);
863	}
864	} else if (issued & CEPH_CAP_FILE_EXCL) {
865	/ we did a utimes(); ignore mds values /
866	} else {
867	warn = `1`;
868	}
869	} else {
870	/ we have no write\|excl caps; whatever the MDS says is true /
871	if (ceph_seq_cmp(a: time_warp_seq, b: ci->i_time_warp_seq) >= `0`) {
872	inode_set_ctime_to_ts(inode, ts: *ctime);
873	inode_set_mtime_to_ts(inode, ts: *mtime);
874	inode_set_atime_to_ts(inode, ts: *atime);
875	ci->i_time_warp_seq = time_warp_seq;
876	} else {
877	warn = `1`;
878	}
879	}
880	if (warn) / time_warp_seq shouldn't go backwards /
881	dout("%p mds time_warp_seq %llu < %u\n",
882	inode, time_warp_seq, ci->i_time_warp_seq);
883	}
884
885	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
886	static int decode_encrypted_symlink(const char encsym, int* enclen, u8 **decsym)
887	{
888	int declen;
889	u8 *sym;
890
891	sym = kmalloc(size: enclen + `1`, GFP_NOFS);
892	if (!sym)
893	return -ENOMEM;
894
895	declen = ceph_base64_decode(src: encsym, srclen: enclen, dst: sym);
896	if (declen < `0`) {
897	pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
898	__func__, declen, enclen, encsym);
899	kfree(objp: sym);
900	return -EIO;
901	}
902	sym[declen + `1`] = `'\0'`;
903	*decsym = sym;
904	return declen;
905	}
906	#else
907	static int decode_encrypted_symlink(const char encsym, int* symlen, u8 **decsym)
908	{
909	return -EOPNOTSUPP;
910	}
911	#endif
912
913	/*
914	* Populate an inode based on info from mds. May be called on new or
915	* existing inodes.
916	*/
917	int ceph_fill_inode(struct inode inode, struct* page *locked_page,
918	struct ceph_mds_reply_info_in *iinfo,
919	struct ceph_mds_reply_dirfrag *dirinfo,
920	struct ceph_mds_session session, int* cap_fmode,
921	struct ceph_cap_reservation *caps_reservation)
922	{
923	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
924	struct ceph_mds_reply_inode *info = iinfo->in;
925	struct ceph_inode_info *ci = ceph_inode(inode);
926	int issued, new_issued, info_caps;
927	struct timespec64 mtime, atime, ctime;
928	struct ceph_buffer *xattr_blob = NULL;
929	struct ceph_buffer *old_blob = NULL;
930	struct ceph_string *pool_ns = NULL;
931	struct ceph_cap *new_cap = NULL;
932	int err = `0`;
933	bool wake = false;
934	bool queue_trunc = false;
935	bool new_version = false;
936	bool fill_inline = false;
937	umode_t mode = le32_to_cpu(info->mode);
938	dev_t rdev = le32_to_cpu(info->rdev);
939
940	lockdep_assert_held(&mdsc->snap_rwsem);
941
942	dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
943	inode, ceph_vinop(inode), le64_to_cpu(info->version),
944	ci->i_version);
945
946	/ Once I_NEW is cleared, we can't change type or dev numbers /
947	if (inode->i_state & I_NEW) {
948	inode->i_mode = mode;
949	} else {
950	if (inode_wrong_type(inode, mode)) {
951	pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
952	ceph_vinop(inode), inode->i_mode, mode);
953	return -ESTALE;
954	}
955
956	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && inode->i_rdev != rdev) {
957	pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
958	ceph_vinop(inode), MAJOR(inode->i_rdev),
959	MINOR(inode->i_rdev), MAJOR(rdev),
960	MINOR(rdev));
961	return -ESTALE;
962	}
963	}
964
965	info_caps = le32_to_cpu(info->cap.caps);
966
967	/ prealloc new cap struct /
968	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
969	new_cap = ceph_get_cap(mdsc, ctx: caps_reservation);
970	if (!new_cap)
971	return -ENOMEM;
972	}
973
974	/*
975	* prealloc xattr data, if it looks like we'll need it. only
976	* if len > 4 (meaning there are actually xattrs; the first 4
977	* bytes are the xattr count).
978	*/
979	if (iinfo->xattr_len > `4`) {
980	xattr_blob = ceph_buffer_new(len: iinfo->xattr_len, GFP_NOFS);
981	if (!xattr_blob)
982	pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
983	iinfo->xattr_len);
984	}
985
986	if (iinfo->pool_ns_len > `0`)
987	pool_ns = ceph_find_or_create_string(str: iinfo->pool_ns_data,
988	len: iinfo->pool_ns_len);
989
990	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
991	ci->i_snapid_map = ceph_get_snapid_map(mdsc, snap: ceph_snap(inode));
992
993	spin_lock(lock: &ci->i_ceph_lock);
994
995	/*
996	* provided version will be odd if inode value is projected,
997	* even if stable. skip the update if we have newer stable
998	* info (ours>=theirs, e.g. due to racing mds replies), unless
999	* we are getting projected (unstable) info (in which case the
1000	* version is odd, and we want ours>theirs).
1001	* us them
1002	* 2 2 skip
1003	* 3 2 skip
1004	* 3 3 update
1005	*/
1006	if (ci->i_version == `0` \|\|
1007	((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1008	le64_to_cpu(info->version) > (ci->i_version & ~`1`)))
1009	new_version = true;
1010
1011	/ Update change_attribute /
1012	inode_set_max_iversion_raw(inode, val: iinfo->change_attr);
1013
1014	__ceph_caps_issued(ci, implemented: &issued);
1015	issued \|= __ceph_caps_dirty(ci);
1016	new_issued = ~issued & info_caps;
1017
1018	__ceph_update_quota(ci, max_bytes: iinfo->max_bytes, max_files: iinfo->max_files);
1019
1020	#ifdef CONFIG_FS_ENCRYPTION
1021	if (iinfo->fscrypt_auth_len &&
1022	((inode->i_state & I_NEW) \|\| (ci->fscrypt_auth_len == `0`))) {
1023	kfree(objp: ci->fscrypt_auth);
1024	ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
1025	ci->fscrypt_auth = iinfo->fscrypt_auth;
1026	iinfo->fscrypt_auth = NULL;
1027	iinfo->fscrypt_auth_len = `0`;
1028	inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
1029	}
1030	#endif
1031
1032	if ((new_version \|\| (new_issued & CEPH_CAP_AUTH_SHARED)) &&
1033	(issued & CEPH_CAP_AUTH_EXCL) == `0`) {
1034	inode->i_mode = mode;
1035	inode->i_uid = make_kuid(from: &init_user_ns, le32_to_cpu(info->uid));
1036	inode->i_gid = make_kgid(from: &init_user_ns, le32_to_cpu(info->gid));
1037	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
1038	from_kuid(&init_user_ns, inode->i_uid),
1039	from_kgid(&init_user_ns, inode->i_gid));
1040	ceph_decode_timespec64(ts: &ci->i_btime, tv: &iinfo->btime);
1041	ceph_decode_timespec64(ts: &ci->i_snap_btime, tv: &iinfo->snap_btime);
1042	}
1043
1044	/ directories have fl_stripe_unit set to zero /
1045	if (IS_ENCRYPTED(inode))
1046	inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
1047	else if (le32_to_cpu(info->layout.fl_stripe_unit))
1048	inode->i_blkbits =
1049	fls(le32_to_cpu(info->layout.fl_stripe_unit)) - `1`;
1050	else
1051	inode->i_blkbits = CEPH_BLOCK_SHIFT;
1052
1053	if ((new_version \|\| (new_issued & CEPH_CAP_LINK_SHARED)) &&
1054	(issued & CEPH_CAP_LINK_EXCL) == `0`)
1055	set_nlink(inode, le32_to_cpu(info->nlink));
1056
1057	if (new_version \|\| (new_issued & CEPH_CAP_ANY_RD)) {
1058	/ be careful with mtime, atime, size /
1059	ceph_decode_timespec64(ts: &atime, tv: &info->atime);
1060	ceph_decode_timespec64(ts: &mtime, tv: &info->mtime);
1061	ceph_decode_timespec64(ts: &ctime, tv: &info->ctime);
1062	ceph_fill_file_time(inode, issued,
1063	le32_to_cpu(info->time_warp_seq),
1064	ctime: &ctime, mtime: &mtime, atime: &atime);
1065	}
1066
1067	if (new_version \|\| (info_caps & CEPH_CAP_FILE_SHARED)) {
1068	ci->i_files = le64_to_cpu(info->files);
1069	ci->i_subdirs = le64_to_cpu(info->subdirs);
1070	}
1071
1072	if (new_version \|\|
1073	(new_issued & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR))) {
1074	u64 size = le64_to_cpu(info->size);
1075	s64 old_pool = ci->i_layout.pool_id;
1076	struct ceph_string *old_ns;
1077
1078	ceph_file_layout_from_legacy(fl: &ci->i_layout, legacy: &info->layout);
1079	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
1080	lockdep_is_held(&ci->i_ceph_lock));
1081	rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
1082
1083	if (ci->i_layout.pool_id != old_pool \|\| pool_ns != old_ns)
1084	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
1085
1086	pool_ns = old_ns;
1087
1088	if (IS_ENCRYPTED(inode) && size &&
1089	iinfo->fscrypt_file_len == sizeof(__le64)) {
1090	u64 fsize = __le64_to_cpu((__le64 )iinfo->fscrypt_file);
1091
1092	if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
1093	size = fsize;
1094	} else {
1095	pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
1096	info->size, size);
1097	}
1098	}
1099
1100	queue_trunc = ceph_fill_file_size(inode, issued,
1101	le32_to_cpu(info->truncate_seq),
1102	le64_to_cpu(info->truncate_size),
1103	size);
1104	/ only update max_size on auth cap /
1105	if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1106	ci->i_max_size != le64_to_cpu(info->max_size)) {
1107	dout("max_size %lld -> %llu\n", ci->i_max_size,
1108	le64_to_cpu(info->max_size));
1109	ci->i_max_size = le64_to_cpu(info->max_size);
1110	}
1111	}
1112
1113	/ layout and rstat are not tracked by capability, update them if*
1114	* the inode info is from auth mds */
1115	if (new_version \|\| (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
1116	if (S_ISDIR(inode->i_mode)) {
1117	ci->i_dir_layout = iinfo->dir_layout;
1118	ci->i_rbytes = le64_to_cpu(info->rbytes);
1119	ci->i_rfiles = le64_to_cpu(info->rfiles);
1120	ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
1121	ci->i_dir_pin = iinfo->dir_pin;
1122	ci->i_rsnaps = iinfo->rsnaps;
1123	ceph_decode_timespec64(ts: &ci->i_rctime, tv: &info->rctime);
1124	}
1125	}
1126
1127	/ xattrs /
1128	/ note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. /
1129	if ((ci->i_xattrs.version == `0` \|\| !(issued & CEPH_CAP_XATTR_EXCL)) &&
1130	le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
1131	if (ci->i_xattrs.blob)
1132	old_blob = ci->i_xattrs.blob;
1133	ci->i_xattrs.blob = xattr_blob;
1134	if (xattr_blob)
1135	memcpy(ci->i_xattrs.blob->vec.iov_base,
1136	iinfo->xattr_data, iinfo->xattr_len);
1137	ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
1138	ceph_forget_all_cached_acls(inode);
1139	ceph_security_invalidate_secctx(inode);
1140	xattr_blob = NULL;
1141	}
1142
1143	/ finally update i_version /
1144	if (le64_to_cpu(info->version) > ci->i_version)
1145	ci->i_version = le64_to_cpu(info->version);
1146
1147	inode->i_mapping->a_ops = &ceph_aops;
1148
1149	switch (inode->i_mode & S_IFMT) {
1150	case S_IFIFO:
1151	case S_IFBLK:
1152	case S_IFCHR:
1153	case S_IFSOCK:
1154	inode->i_blkbits = PAGE_SHIFT;
1155	init_special_inode(inode, inode->i_mode, rdev);
1156	inode->i_op = &ceph_file_iops;
1157	break;
1158	case S_IFREG:
1159	inode->i_op = &ceph_file_iops;
1160	inode->i_fop = &ceph_file_fops;
1161	break;
1162	case S_IFLNK:
1163	if (!ci->i_symlink) {
1164	u32 symlen = iinfo->symlink_len;
1165	char *sym;
1166
1167	spin_unlock(lock: &ci->i_ceph_lock);
1168
1169	if (IS_ENCRYPTED(inode)) {
1170	if (symlen != i_size_read(inode))
1171	pr_err("%s %llx.%llx BAD symlink size %lld\n",
1172	__func__, ceph_vinop(inode),
1173	i_size_read(inode));
1174
1175	err = decode_encrypted_symlink(encsym: iinfo->symlink,
1176	enclen: symlen, decsym: (u8 **)&sym);
1177	if (err < `0`) {
1178	pr_err("%s decoding encrypted symlink failed: %d\n",
1179	__func__, err);
1180	goto out;
1181	}
1182	symlen = err;
1183	i_size_write(inode, i_size: symlen);
1184	inode->i_blocks = calc_inode_blocks(size: symlen);
1185	} else {
1186	if (symlen != i_size_read(inode)) {
1187	pr_err("%s %llx.%llx BAD symlink size %lld\n",
1188	__func__, ceph_vinop(inode),
1189	i_size_read(inode));
1190	i_size_write(inode, i_size: symlen);
1191	inode->i_blocks = calc_inode_blocks(size: symlen);
1192	}
1193
1194	err = -ENOMEM;
1195	sym = kstrndup(s: iinfo->symlink, len: symlen, GFP_NOFS);
1196	if (!sym)
1197	goto out;
1198	}
1199
1200	spin_lock(lock: &ci->i_ceph_lock);
1201	if (!ci->i_symlink)
1202	ci->i_symlink = sym;
1203	else
1204	kfree(objp: sym); / lost a race /
1205	}
1206
1207	if (IS_ENCRYPTED(inode)) {
1208	/*
1209	* Encrypted symlinks need to be decrypted before we can
1210	* cache their targets in i_link. Don't touch it here.
1211	*/
1212	inode->i_op = &ceph_encrypted_symlink_iops;
1213	} else {
1214	inode->i_link = ci->i_symlink;
1215	inode->i_op = &ceph_symlink_iops;
1216	}
1217	break;
1218	case S_IFDIR:
1219	inode->i_op = &ceph_dir_iops;
1220	inode->i_fop = &ceph_dir_fops;
1221	break;
1222	default:
1223	pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
1224	ceph_vinop(inode), inode->i_mode);
1225	}
1226
1227	/ were we issued a capability? /
1228	if (info_caps) {
1229	if (ceph_snap(inode) == CEPH_NOSNAP) {
1230	ceph_add_cap(inode, session,
1231	le64_to_cpu(info->cap.cap_id),
1232	issued: info_caps,
1233	le32_to_cpu(info->cap.wanted),
1234	le32_to_cpu(info->cap.seq),
1235	le32_to_cpu(info->cap.mseq),
1236	le64_to_cpu(info->cap.realm),
1237	flags: info->cap.flags, new_cap: &new_cap);
1238
1239	/ set dir completion flag? /
1240	if (S_ISDIR(inode->i_mode) &&
1241	ci->i_files == `0` && ci->i_subdirs == `0` &&
1242	(info_caps & CEPH_CAP_FILE_SHARED) &&
1243	(issued & CEPH_CAP_FILE_EXCL) == `0` &&
1244	!__ceph_dir_is_complete(ci)) {
1245	dout(" marking %p complete (empty)\n", inode);
1246	i_size_write(inode, i_size: `0`);
1247	__ceph_dir_set_complete(ci,
1248	release_count: atomic64_read(v: &ci->i_release_count),
1249	ordered_count: atomic64_read(v: &ci->i_ordered_count));
1250	}
1251
1252	wake = true;
1253	} else {
1254	dout(" %p got snap_caps %s\n", inode,
1255	ceph_cap_string(info_caps));
1256	ci->i_snap_caps \|= info_caps;
1257	}
1258	}
1259
1260	if (iinfo->inline_version > `0` &&
1261	iinfo->inline_version >= ci->i_inline_version) {
1262	int cache_caps = CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO;
1263	ci->i_inline_version = iinfo->inline_version;
1264	if (ceph_has_inline_data(ci) &&
1265	(locked_page \|\| (info_caps & cache_caps)))
1266	fill_inline = true;
1267	}
1268
1269	if (cap_fmode >= `0`) {
1270	if (!info_caps)
1271	pr_warn("mds issued no caps on %llx.%llx\n",
1272	ceph_vinop(inode));
1273	__ceph_touch_fmode(ci, mdsc, fmode: cap_fmode);
1274	}
1275
1276	spin_unlock(lock: &ci->i_ceph_lock);
1277
1278	ceph_fscache_register_inode_cookie(inode);
1279
1280	if (fill_inline)
1281	ceph_fill_inline_data(inode, locked_page,
1282	data: iinfo->inline_data, len: iinfo->inline_len);
1283
1284	if (wake)
1285	wake_up_all(&ci->i_cap_wq);
1286
1287	/ queue truncate if we saw i_size decrease /
1288	if (queue_trunc)
1289	ceph_queue_vmtruncate(inode);
1290
1291	/ populate frag tree /
1292	if (S_ISDIR(inode->i_mode))
1293	ceph_fill_fragtree(inode, fragtree: &info->fragtree, dirinfo);
1294
1295	/ update delegation info? /
1296	if (dirinfo)
1297	ceph_fill_dirfrag(inode, dirinfo);
1298
1299	err = `0`;
1300	out:
1301	if (new_cap)
1302	ceph_put_cap(mdsc, cap: new_cap);
1303	ceph_buffer_put(b: old_blob);
1304	ceph_buffer_put(b: xattr_blob);
1305	ceph_put_string(str: pool_ns);
1306	return err;
1307	}
1308
1309	/*
1310	* caller should hold session s_mutex and dentry->d_lock.
1311	*/
1312	static void __update_dentry_lease(struct inode dir, struct* dentry *dentry,
1313	struct ceph_mds_reply_lease *lease,
1314	struct ceph_mds_session *session,
1315	unsigned long from_time,
1316	struct ceph_mds_session **old_lease_session)
1317	{
1318	struct ceph_dentry_info *di = ceph_dentry(dentry);
1319	unsigned mask = le16_to_cpu(lease->mask);
1320	long unsigned duration = le32_to_cpu(lease->duration_ms);
1321	long unsigned ttl = from_time + (duration * HZ) / `1000`;
1322	long unsigned half_ttl = from_time + (duration * HZ / `2`) / `1000`;
1323
1324	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
1325	dentry, duration, ttl);
1326
1327	/ only track leases on regular dentries /
1328	if (ceph_snap(inode: dir) != CEPH_NOSNAP)
1329	return;
1330
1331	if (mask & CEPH_LEASE_PRIMARY_LINK)
1332	di->flags \|= CEPH_DENTRY_PRIMARY_LINK;
1333	else
1334	di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1335
1336	di->lease_shared_gen = atomic_read(v: &ceph_inode(inode: dir)->i_shared_gen);
1337	if (!(mask & CEPH_LEASE_VALID)) {
1338	__ceph_dentry_dir_lease_touch(di);
1339	return;
1340	}
1341
1342	if (di->lease_gen == atomic_read(v: &session->s_cap_gen) &&
1343	time_before(ttl, di->time))
1344	return; / we already have a newer lease. /
1345
1346	if (di->lease_session && di->lease_session != session) {
1347	*old_lease_session = di->lease_session;
1348	di->lease_session = NULL;
1349	}
1350
1351	if (!di->lease_session)
1352	di->lease_session = ceph_get_mds_session(s: session);
1353	di->lease_gen = atomic_read(v: &session->s_cap_gen);
1354	di->lease_seq = le32_to_cpu(lease->seq);
1355	di->lease_renew_after = half_ttl;
1356	di->lease_renew_from = `0`;
1357	di->time = ttl;
1358
1359	__ceph_dentry_lease_touch(di);
1360	}
1361
1362	static inline void update_dentry_lease(struct inode dir, struct* dentry *dentry,
1363	struct ceph_mds_reply_lease *lease,
1364	struct ceph_mds_session *session,
1365	unsigned long from_time)
1366	{
1367	struct ceph_mds_session *old_lease_session = NULL;
1368	spin_lock(lock: &dentry->d_lock);
1369	__update_dentry_lease(dir, dentry, lease, session, from_time,
1370	old_lease_session: &old_lease_session);
1371	spin_unlock(lock: &dentry->d_lock);
1372	ceph_put_mds_session(s: old_lease_session);
1373	}
1374
1375	/*
1376	* update dentry lease without having parent inode locked
1377	*/
1378	static void update_dentry_lease_careful(struct dentry *dentry,
1379	struct ceph_mds_reply_lease *lease,
1380	struct ceph_mds_session *session,
1381	unsigned long from_time,
1382	char *dname, u32 dname_len,
1383	struct ceph_vino *pdvino,
1384	struct ceph_vino *ptvino)
1385
1386	{
1387	struct inode *dir;
1388	struct ceph_mds_session *old_lease_session = NULL;
1389
1390	spin_lock(lock: &dentry->d_lock);
1391	/ make sure dentry's name matches target /
1392	if (dentry->d_name.len != dname_len \|\|
1393	memcmp(p: dentry->d_name.name, q: dname, size: dname_len))
1394	goto out_unlock;
1395
1396	dir = d_inode(dentry: dentry->d_parent);
1397	/ make sure parent matches dvino /
1398	if (!ceph_ino_compare(inode: dir, data: pdvino))
1399	goto out_unlock;
1400
1401	/ make sure dentry's inode matches target. NULL ptvino means that*
1402	* we expect a negative dentry */
1403	if (ptvino) {
1404	if (d_really_is_negative(dentry))
1405	goto out_unlock;
1406	if (!ceph_ino_compare(inode: d_inode(dentry), data: ptvino))
1407	goto out_unlock;
1408	} else {
1409	if (d_really_is_positive(dentry))
1410	goto out_unlock;
1411	}
1412
1413	__update_dentry_lease(dir, dentry, lease, session,
1414	from_time, old_lease_session: &old_lease_session);
1415	out_unlock:
1416	spin_unlock(lock: &dentry->d_lock);
1417	ceph_put_mds_session(s: old_lease_session);
1418	}
1419
1420	/*
1421	* splice a dentry to an inode.
1422	* caller must hold directory i_rwsem for this to be safe.
1423	*/
1424	static int splice_dentry(struct dentry pdn, struct** inode *in)
1425	{
1426	struct dentry dn = pdn;
1427	struct dentry *realdn;
1428
1429	BUG_ON(d_inode(dn));
1430
1431	if (S_ISDIR(in->i_mode)) {
1432	/ If inode is directory, d_splice_alias() below will remove*
1433	* 'realdn' from its origin parent. We need to ensure that
1434	* origin parent's readdir cache will not reference 'realdn'
1435	*/
1436	realdn = d_find_any_alias(inode: in);
1437	if (realdn) {
1438	struct ceph_dentry_info *di = ceph_dentry(dentry: realdn);
1439	spin_lock(lock: &realdn->d_lock);
1440
1441	realdn->d_op->d_prune(realdn);
1442
1443	di->time = jiffies;
1444	di->lease_shared_gen = `0`;
1445	di->offset = `0`;
1446
1447	spin_unlock(lock: &realdn->d_lock);
1448	dput(realdn);
1449	}
1450	}
1451
1452	/ dn must be unhashed /
1453	if (!d_unhashed(dentry: dn))
1454	d_drop(dentry: dn);
1455	realdn = d_splice_alias(in, dn);
1456	if (IS_ERR(ptr: realdn)) {
1457	pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
1458	PTR_ERR(realdn), dn, in, ceph_vinop(in));
1459	return PTR_ERR(ptr: realdn);
1460	}
1461
1462	if (realdn) {
1463	dout("dn %p (%d) spliced with %p (%d) "
1464	"inode %p ino %llx.%llx\n",
1465	dn, d_count(dn),
1466	realdn, d_count(realdn),
1467	d_inode(realdn), ceph_vinop(d_inode(realdn)));
1468	dput(dn);
1469	*pdn = realdn;
1470	} else {
1471	BUG_ON(!ceph_dentry(dn));
1472	dout("dn %p attached to %p ino %llx.%llx\n",
1473	dn, d_inode(dn), ceph_vinop(d_inode(dn)));
1474	}
1475	return `0`;
1476	}
1477
1478	/*
1479	* Incorporate results into the local cache. This is either just
1480	* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
1481	* after a lookup).
1482	*
1483	* A reply may contain
1484	* a directory inode along with a dentry.
1485	* and/or a target inode
1486	*
1487	* Called with snap_rwsem (read).
1488	*/
1489	int ceph_fill_trace(struct super_block sb, struct* ceph_mds_request *req)
1490	{
1491	struct ceph_mds_session *session = req->r_session;
1492	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1493	struct inode *in = NULL;
1494	struct ceph_vino tvino, dvino;
1495	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
1496	int err = `0`;
1497
1498	dout("fill_trace %p is_dentry %d is_target %d\n", req,
1499	rinfo->head->is_dentry, rinfo->head->is_target);
1500
1501	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
1502	dout("fill_trace reply is empty!\n");
1503	if (rinfo->head->result == `0` && req->r_parent)
1504	ceph_invalidate_dir_request(req);
1505	return `0`;
1506	}
1507
1508	if (rinfo->head->is_dentry) {
1509	struct inode *dir = req->r_parent;
1510
1511	if (dir) {
1512	err = ceph_fill_inode(inode: dir, NULL, iinfo: &rinfo->diri,
1513	dirinfo: rinfo->dirfrag, session, cap_fmode: -`1`,
1514	caps_reservation: &req->r_caps_reservation);
1515	if (err < `0`)
1516	goto done;
1517	} else {
1518	WARN_ON_ONCE(`1`);
1519	}
1520
1521	if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
1522	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1523	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1524	bool is_nokey = false;
1525	struct qstr dname;
1526	struct dentry dn, parent;
1527	struct fscrypt_str oname = FSTR_INIT(NULL, `0`);
1528	struct ceph_fname fname = { .dir = dir,
1529	.name = rinfo->dname,
1530	.ctext = rinfo->altname,
1531	.name_len = rinfo->dname_len,
1532	.ctext_len = rinfo->altname_len };
1533
1534	BUG_ON(!rinfo->head->is_target);
1535	BUG_ON(req->r_dentry);
1536
1537	parent = d_find_any_alias(inode: dir);
1538	BUG_ON(!parent);
1539
1540	err = ceph_fname_alloc_buffer(parent: dir, fname: &oname);
1541	if (err < `0`) {
1542	dput(parent);
1543	goto done;
1544	}
1545
1546	err = ceph_fname_to_usr(fname: &fname, NULL, oname: &oname, is_nokey: &is_nokey);
1547	if (err < `0`) {
1548	dput(parent);
1549	ceph_fname_free_buffer(parent: dir, fname: &oname);
1550	goto done;
1551	}
1552	dname.name = oname.name;
1553	dname.len = oname.len;
1554	dname.hash = full_name_hash(salt: parent, dname.name, dname.len);
1555	tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1556	tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1557	retry_lookup:
1558	dn = d_lookup(parent, &dname);
1559	dout("d_lookup on parent=%p name=%.*s got %p\n",
1560	parent, dname.len, dname.name, dn);
1561
1562	if (!dn) {
1563	dn = d_alloc(parent, &dname);
1564	dout("d_alloc %p '%.*s' = %p\n", parent,
1565	dname.len, dname.name, dn);
1566	if (!dn) {
1567	dput(parent);
1568	ceph_fname_free_buffer(parent: dir, fname: &oname);
1569	err = -ENOMEM;
1570	goto done;
1571	}
1572	if (is_nokey) {
1573	spin_lock(lock: &dn->d_lock);
1574	dn->d_flags \|= DCACHE_NOKEY_NAME;
1575	spin_unlock(lock: &dn->d_lock);
1576	}
1577	err = `0`;
1578	} else if (d_really_is_positive(dentry: dn) &&
1579	(ceph_ino(inode: d_inode(dentry: dn)) != tvino.ino \|\|
1580	ceph_snap(inode: d_inode(dentry: dn)) != tvino.snap)) {
1581	dout(" dn %p points to wrong inode %p\n",
1582	dn, d_inode(dn));
1583	ceph_dir_clear_ordered(inode: dir);
1584	d_delete(dn);
1585	dput(dn);
1586	goto retry_lookup;
1587	}
1588	ceph_fname_free_buffer(parent: dir, fname: &oname);
1589
1590	req->r_dentry = dn;
1591	dput(parent);
1592	}
1593	}
1594
1595	if (rinfo->head->is_target) {
1596	/ Should be filled in by handle_reply /
1597	BUG_ON(!req->r_target_inode);
1598
1599	in = req->r_target_inode;
1600	err = ceph_fill_inode(inode: in, locked_page: req->r_locked_page, iinfo: &rinfo->targeti,
1601	NULL, session,
1602	cap_fmode: (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1603	!test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
1604	rinfo->head->result == `0`) ? req->r_fmode : -`1`,
1605	caps_reservation: &req->r_caps_reservation);
1606	if (err < `0`) {
1607	pr_err("ceph_fill_inode badness %p %llx.%llx\n",
1608	in, ceph_vinop(in));
1609	req->r_target_inode = NULL;
1610	if (in->i_state & I_NEW)
1611	discard_new_inode(in);
1612	else
1613	iput(in);
1614	goto done;
1615	}
1616	if (in->i_state & I_NEW)
1617	unlock_new_inode(in);
1618	}
1619
1620	/*
1621	* ignore null lease/binding on snapdir ENOENT, or else we
1622	* will have trouble splicing in the virtual snapdir later
1623	*/
1624	if (rinfo->head->is_dentry &&
1625	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
1626	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1627	(rinfo->head->is_target \|\| strncmp(req->r_dentry->d_name.name,
1628	fsc->mount_options->snapdir_name,
1629	req->r_dentry->d_name.len))) {
1630	/*
1631	* lookup link rename : null -> possibly existing inode
1632	* mknod symlink mkdir : null -> new inode
1633	* unlink : linked -> null
1634	*/
1635	struct inode *dir = req->r_parent;
1636	struct dentry *dn = req->r_dentry;
1637	bool have_dir_cap, have_lease;
1638
1639	BUG_ON(!dn);
1640	BUG_ON(!dir);
1641	BUG_ON(d_inode(dn->d_parent) != dir);
1642
1643	dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1644	dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1645
1646	BUG_ON(ceph_ino(dir) != dvino.ino);
1647	BUG_ON(ceph_snap(dir) != dvino.snap);
1648
1649	/ do we have a lease on the whole dir? /
1650	have_dir_cap =
1651	(le32_to_cpu(rinfo->diri.in->cap.caps) &
1652	CEPH_CAP_FILE_SHARED);
1653
1654	/ do we have a dn lease? /
1655	have_lease = have_dir_cap \|\|
1656	le32_to_cpu(rinfo->dlease->duration_ms);
1657	if (!have_lease)
1658	dout("fill_trace no dentry lease or dir cap\n");
1659
1660	/ rename? /
1661	if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1662	struct inode *olddir = req->r_old_dentry_dir;
1663	BUG_ON(!olddir);
1664
1665	dout(" src %p '%pd' dst %p '%pd'\n",
1666	req->r_old_dentry,
1667	req->r_old_dentry,
1668	dn, dn);
1669	dout("fill_trace doing d_move %p -> %p\n",
1670	req->r_old_dentry, dn);
1671
1672	/ d_move screws up sibling dentries' offsets /
1673	ceph_dir_clear_ordered(inode: dir);
1674	ceph_dir_clear_ordered(inode: olddir);
1675
1676	d_move(req->r_old_dentry, dn);
1677	dout(" src %p '%pd' dst %p '%pd'\n",
1678	req->r_old_dentry,
1679	req->r_old_dentry,
1680	dn, dn);
1681
1682	/ ensure target dentry is invalidated, despite*
1683	rehashing bug in vfs_rename_dir /*
1684	ceph_invalidate_dentry_lease(dentry: dn);
1685
1686	dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1687	ceph_dentry(req->r_old_dentry)->offset);
1688
1689	/ swap r_dentry and r_old_dentry in case that*
1690	* splice_dentry() gets called later. This is safe
1691	* because no other place will use them */
1692	req->r_dentry = req->r_old_dentry;
1693	req->r_old_dentry = dn;
1694	dn = req->r_dentry;
1695	}
1696
1697	/ null dentry? /
1698	if (!rinfo->head->is_target) {
1699	dout("fill_trace null dentry\n");
1700	if (d_really_is_positive(dentry: dn)) {
1701	dout("d_delete %p\n", dn);
1702	ceph_dir_clear_ordered(inode: dir);
1703	d_delete(dn);
1704	} else if (have_lease) {
1705	if (d_unhashed(dentry: dn))
1706	d_add(dn, NULL);
1707	}
1708
1709	if (!d_unhashed(dentry: dn) && have_lease)
1710	update_dentry_lease(dir, dentry: dn,
1711	lease: rinfo->dlease, session,
1712	from_time: req->r_request_started);
1713	goto done;
1714	}
1715
1716	/ attach proper inode /
1717	if (d_really_is_negative(dentry: dn)) {
1718	ceph_dir_clear_ordered(inode: dir);
1719	ihold(inode: in);
1720	err = splice_dentry(pdn: &req->r_dentry, in);
1721	if (err < `0`)
1722	goto done;
1723	dn = req->r_dentry; / may have spliced /
1724	} else if (d_really_is_positive(dentry: dn) && d_inode(dentry: dn) != in) {
1725	dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1726	dn, d_inode(dn), ceph_vinop(d_inode(dn)),
1727	ceph_vinop(in));
1728	d_invalidate(dn);
1729	have_lease = false;
1730	}
1731
1732	if (have_lease) {
1733	update_dentry_lease(dir, dentry: dn,
1734	lease: rinfo->dlease, session,
1735	from_time: req->r_request_started);
1736	}
1737	dout(" final dn %p\n", dn);
1738	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP \|\|
1739	req->r_op == CEPH_MDS_OP_MKSNAP) &&
1740	test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
1741	!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
1742	struct inode *dir = req->r_parent;
1743
1744	/ fill out a snapdir LOOKUPSNAP dentry /
1745	BUG_ON(!dir);
1746	BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1747	BUG_ON(!req->r_dentry);
1748	dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
1749	ceph_dir_clear_ordered(inode: dir);
1750	ihold(inode: in);
1751	err = splice_dentry(pdn: &req->r_dentry, in);
1752	if (err < `0`)
1753	goto done;
1754	} else if (rinfo->head->is_dentry && req->r_dentry) {
1755	/ parent inode is not locked, be carefull /
1756	struct ceph_vino *ptvino = NULL;
1757	dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
1758	dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
1759	if (rinfo->head->is_target) {
1760	tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1761	tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1762	ptvino = &tvino;
1763	}
1764	update_dentry_lease_careful(dentry: req->r_dentry, lease: rinfo->dlease,
1765	session, from_time: req->r_request_started,
1766	dname: rinfo->dname, dname_len: rinfo->dname_len,
1767	pdvino: &dvino, ptvino);
1768	}
1769	done:
1770	dout("fill_trace done err=%d\n", err);
1771	return err;
1772	}
1773
1774	/*
1775	* Prepopulate our cache with readdir results, leases, etc.
1776	*/
1777	static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1778	struct ceph_mds_session *session)
1779	{
1780	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1781	int i, err = `0`;
1782
1783	for (i = `0`; i < rinfo->dir_nr; i++) {
1784	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1785	struct ceph_vino vino;
1786	struct inode *in;
1787	int rc;
1788
1789	vino.ino = le64_to_cpu(rde->inode.in->ino);
1790	vino.snap = le64_to_cpu(rde->inode.in->snapid);
1791
1792	in = ceph_get_inode(sb: req->r_dentry->d_sb, vino, NULL);
1793	if (IS_ERR(ptr: in)) {
1794	err = PTR_ERR(ptr: in);
1795	dout("new_inode badness got %d\n", err);
1796	continue;
1797	}
1798	rc = ceph_fill_inode(inode: in, NULL, iinfo: &rde->inode, NULL, session,
1799	cap_fmode: -`1`, caps_reservation: &req->r_caps_reservation);
1800	if (rc < `0`) {
1801	pr_err("ceph_fill_inode badness on %p got %d\n",
1802	in, rc);
1803	err = rc;
1804	if (in->i_state & I_NEW) {
1805	ihold(inode: in);
1806	discard_new_inode(in);
1807	}
1808	} else if (in->i_state & I_NEW) {
1809	unlock_new_inode(in);
1810	}
1811
1812	iput(in);
1813	}
1814
1815	return err;
1816	}
1817
1818	void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
1819	{
1820	if (ctl->page) {
1821	kunmap(page: ctl->page);
1822	put_page(page: ctl->page);
1823	ctl->page = NULL;
1824	}
1825	}
1826
1827	static int fill_readdir_cache(struct inode dir, struct* dentry *dn,
1828	struct ceph_readdir_cache_control *ctl,
1829	struct ceph_mds_request *req)
1830	{
1831	struct ceph_inode_info *ci = ceph_inode(inode: dir);
1832	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
1833	unsigned idx = ctl->index % nsize;
1834	pgoff_t pgoff = ctl->index / nsize;
1835
1836	if (!ctl->page \|\| pgoff != page_index(page: ctl->page)) {
1837	ceph_readdir_cache_release(ctl);
1838	if (idx == `0`)
1839	ctl->page = grab_cache_page(mapping: &dir->i_data, index: pgoff);
1840	else
1841	ctl->page = find_lock_page(mapping: &dir->i_data, index: pgoff);
1842	if (!ctl->page) {
1843	ctl->index = -`1`;
1844	return idx == `0` ? -ENOMEM : `0`;
1845	}
1846	/ reading/filling the cache are serialized by*
1847	* i_rwsem, no need to use page lock */
1848	unlock_page(page: ctl->page);
1849	ctl->dentries = kmap(page: ctl->page);
1850	if (idx == `0`)
1851	memset(ctl->dentries, `0`, PAGE_SIZE);
1852	}
1853
1854	if (req->r_dir_release_cnt == atomic64_read(v: &ci->i_release_count) &&
1855	req->r_dir_ordered_cnt == atomic64_read(v: &ci->i_ordered_count)) {
1856	dout("readdir cache dn %p idx %d\n", dn, ctl->index);
1857	ctl->dentries[idx] = dn;
1858	ctl->index++;
1859	} else {
1860	dout("disable readdir cache\n");
1861	ctl->index = -`1`;
1862	}
1863	return `0`;
1864	}
1865
1866	int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1867	struct ceph_mds_session *session)
1868	{
1869	struct dentry *parent = req->r_dentry;
1870	struct inode *inode = d_inode(dentry: parent);
1871	struct ceph_inode_info *ci = ceph_inode(inode);
1872	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1873	struct qstr dname;
1874	struct dentry *dn;
1875	struct inode *in;
1876	int err = `0`, skipped = `0`, ret, i;
1877	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
1878	u32 last_hash = `0`;
1879	u32 fpos_offset;
1880	struct ceph_readdir_cache_control cache_ctl = {};
1881
1882	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
1883	return readdir_prepopulate_inodes_only(req, session);
1884
1885	if (rinfo->hash_order) {
1886	if (req->r_path2) {
1887	last_hash = ceph_str_hash(type: ci->i_dir_layout.dl_dir_hash,
1888	s: req->r_path2,
1889	strlen(req->r_path2));
1890	last_hash = ceph_frag_value(f: last_hash);
1891	} else if (rinfo->offset_hash) {
1892	/ mds understands offset_hash /
1893	WARN_ON_ONCE(req->r_readdir_offset != `2`);
1894	last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
1895	}
1896	}
1897
1898	if (rinfo->dir_dir &&
1899	le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1900	dout("readdir_prepopulate got new frag %x -> %x\n",
1901	frag, le32_to_cpu(rinfo->dir_dir->frag));
1902	frag = le32_to_cpu(rinfo->dir_dir->frag);
1903	if (!rinfo->hash_order)
1904	req->r_readdir_offset = `2`;
1905	}
1906
1907	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1908	dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1909	rinfo->dir_nr, parent);
1910	} else {
1911	dout("readdir_prepopulate %d items under dn %p\n",
1912	rinfo->dir_nr, parent);
1913	if (rinfo->dir_dir)
1914	ceph_fill_dirfrag(inode: d_inode(dentry: parent), dirinfo: rinfo->dir_dir);
1915
1916	if (ceph_frag_is_leftmost(f: frag) &&
1917	req->r_readdir_offset == `2` &&
1918	!(rinfo->hash_order && last_hash)) {
1919	/ note dir version at start of readdir so we can*
1920	* tell if any dentries get dropped */
1921	req->r_dir_release_cnt =
1922	atomic64_read(v: &ci->i_release_count);
1923	req->r_dir_ordered_cnt =
1924	atomic64_read(v: &ci->i_ordered_count);
1925	req->r_readdir_cache_idx = `0`;
1926	}
1927	}
1928
1929	cache_ctl.index = req->r_readdir_cache_idx;
1930	fpos_offset = req->r_readdir_offset;
1931
1932	/ FIXME: release caps/leases if error occurs /
1933	for (i = `0`; i < rinfo->dir_nr; i++) {
1934	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1935	struct ceph_vino tvino;
1936
1937	dname.name = rde->name;
1938	dname.len = rde->name_len;
1939	dname.hash = full_name_hash(salt: parent, dname.name, dname.len);
1940
1941	tvino.ino = le64_to_cpu(rde->inode.in->ino);
1942	tvino.snap = le64_to_cpu(rde->inode.in->snapid);
1943
1944	if (rinfo->hash_order) {
1945	u32 hash = ceph_frag_value(f: rde->raw_hash);
1946	if (hash != last_hash)
1947	fpos_offset = `2`;
1948	last_hash = hash;
1949	rde->offset = ceph_make_fpos(high: hash, off: fpos_offset++, hash_order: true);
1950	} else {
1951	rde->offset = ceph_make_fpos(high: frag, off: fpos_offset++, hash_order: false);
1952	}
1953
1954	retry_lookup:
1955	dn = d_lookup(parent, &dname);
1956	dout("d_lookup on parent=%p name=%.*s got %p\n",
1957	parent, dname.len, dname.name, dn);
1958
1959	if (!dn) {
1960	dn = d_alloc(parent, &dname);
1961	dout("d_alloc %p '%.*s' = %p\n", parent,
1962	dname.len, dname.name, dn);
1963	if (!dn) {
1964	dout("d_alloc badness\n");
1965	err = -ENOMEM;
1966	goto out;
1967	}
1968	if (rde->is_nokey) {
1969	spin_lock(lock: &dn->d_lock);
1970	dn->d_flags \|= DCACHE_NOKEY_NAME;
1971	spin_unlock(lock: &dn->d_lock);
1972	}
1973	} else if (d_really_is_positive(dentry: dn) &&
1974	(ceph_ino(inode: d_inode(dentry: dn)) != tvino.ino \|\|
1975	ceph_snap(inode: d_inode(dentry: dn)) != tvino.snap)) {
1976	struct ceph_dentry_info *di = ceph_dentry(dentry: dn);
1977	dout(" dn %p points to wrong inode %p\n",
1978	dn, d_inode(dn));
1979
1980	spin_lock(lock: &dn->d_lock);
1981	if (di->offset > `0` &&
1982	di->lease_shared_gen ==
1983	atomic_read(v: &ci->i_shared_gen)) {
1984	__ceph_dir_clear_ordered(ci);
1985	di->offset = `0`;
1986	}
1987	spin_unlock(lock: &dn->d_lock);
1988
1989	d_delete(dn);
1990	dput(dn);
1991	goto retry_lookup;
1992	}
1993
1994	/ inode /
1995	if (d_really_is_positive(dentry: dn)) {
1996	in = d_inode(dentry: dn);
1997	} else {
1998	in = ceph_get_inode(sb: parent->d_sb, vino: tvino, NULL);
1999	if (IS_ERR(ptr: in)) {
2000	dout("new_inode badness\n");
2001	d_drop(dentry: dn);
2002	dput(dn);
2003	err = PTR_ERR(ptr: in);
2004	goto out;
2005	}
2006	}
2007
2008	ret = ceph_fill_inode(inode: in, NULL, iinfo: &rde->inode, NULL, session,
2009	cap_fmode: -`1`, caps_reservation: &req->r_caps_reservation);
2010	if (ret < `0`) {
2011	pr_err("ceph_fill_inode badness on %p\n", in);
2012	if (d_really_is_negative(dentry: dn)) {
2013	if (in->i_state & I_NEW) {
2014	ihold(inode: in);
2015	discard_new_inode(in);
2016	}
2017	iput(in);
2018	}
2019	d_drop(dentry: dn);
2020	err = ret;
2021	goto next_item;
2022	}
2023	if (in->i_state & I_NEW)
2024	unlock_new_inode(in);
2025
2026	if (d_really_is_negative(dentry: dn)) {
2027	if (ceph_security_xattr_deadlock(in)) {
2028	dout(" skip splicing dn %p to inode %p"
2029	" (security xattr deadlock)\n", dn, in);
2030	iput(in);
2031	skipped++;
2032	goto next_item;
2033	}
2034
2035	err = splice_dentry(pdn: &dn, in);
2036	if (err < `0`)
2037	goto next_item;
2038	}
2039
2040	ceph_dentry(dentry: dn)->offset = rde->offset;
2041
2042	update_dentry_lease(dir: d_inode(dentry: parent), dentry: dn,
2043	lease: rde->lease, session: req->r_session,
2044	from_time: req->r_request_started);
2045
2046	if (err == `0` && skipped == `0` && cache_ctl.index >= `0`) {
2047	ret = fill_readdir_cache(dir: d_inode(dentry: parent), dn,
2048	ctl: &cache_ctl, req);
2049	if (ret < `0`)
2050	err = ret;
2051	}
2052	next_item:
2053	dput(dn);
2054	}
2055	out:
2056	if (err == `0` && skipped == `0`) {
2057	set_bit(CEPH_MDS_R_DID_PREPOPULATE, addr: &req->r_req_flags);
2058	req->r_readdir_cache_idx = cache_ctl.index;
2059	}
2060	ceph_readdir_cache_release(ctl: &cache_ctl);
2061	dout("readdir_prepopulate done\n");
2062	return err;
2063	}
2064
2065	bool ceph_inode_set_size(struct inode *inode, loff_t size)
2066	{
2067	struct ceph_inode_info *ci = ceph_inode(inode);
2068	bool ret;
2069
2070	spin_lock(lock: &ci->i_ceph_lock);
2071	dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
2072	i_size_write(inode, i_size: size);
2073	ceph_fscache_update(inode);
2074	inode->i_blocks = calc_inode_blocks(size);
2075
2076	ret = __ceph_should_report_size(ci);
2077
2078	spin_unlock(lock: &ci->i_ceph_lock);
2079
2080	return ret;
2081	}
2082
2083	void ceph_queue_inode_work(struct inode inode, int* work_bit)
2084	{
2085	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2086	struct ceph_inode_info *ci = ceph_inode(inode);
2087	set_bit(nr: work_bit, addr: &ci->i_work_mask);
2088
2089	ihold(inode);
2090	if (queue_work(wq: fsc->inode_wq, work: &ci->i_work)) {
2091	dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
2092	} else {
2093	dout("queue_inode_work %p already queued, mask=%lx\n",
2094	inode, ci->i_work_mask);
2095	iput(inode);
2096	}
2097	}
2098
2099	static void ceph_do_invalidate_pages(struct inode *inode)
2100	{
2101	struct ceph_inode_info *ci = ceph_inode(inode);
2102	u32 orig_gen;
2103	int check = `0`;
2104
2105	ceph_fscache_invalidate(inode, dio_write: false);
2106
2107	mutex_lock(&ci->i_truncate_mutex);
2108
2109	if (ceph_inode_is_shutdown(inode)) {
2110	pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
2111	__func__, ceph_vinop(inode));
2112	mapping_set_error(mapping: inode->i_mapping, error: -EIO);
2113	truncate_pagecache(inode, new: `0`);
2114	mutex_unlock(lock: &ci->i_truncate_mutex);
2115	goto out;
2116	}
2117
2118	spin_lock(lock: &ci->i_ceph_lock);
2119	dout("invalidate_pages %p gen %d revoking %d\n", inode,
2120	ci->i_rdcache_gen, ci->i_rdcache_revoking);
2121	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2122	if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2123	check = `1`;
2124	spin_unlock(lock: &ci->i_ceph_lock);
2125	mutex_unlock(lock: &ci->i_truncate_mutex);
2126	goto out;
2127	}
2128	orig_gen = ci->i_rdcache_gen;
2129	spin_unlock(lock: &ci->i_ceph_lock);
2130
2131	if (invalidate_inode_pages2(mapping: inode->i_mapping) < `0`) {
2132	pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
2133	ceph_vinop(inode));
2134	}
2135
2136	spin_lock(lock: &ci->i_ceph_lock);
2137	if (orig_gen == ci->i_rdcache_gen &&
2138	orig_gen == ci->i_rdcache_revoking) {
2139	dout("invalidate_pages %p gen %d successful\n", inode,
2140	ci->i_rdcache_gen);
2141	ci->i_rdcache_revoking--;
2142	check = `1`;
2143	} else {
2144	dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
2145	inode, orig_gen, ci->i_rdcache_gen,
2146	ci->i_rdcache_revoking);
2147	if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
2148	check = `1`;
2149	}
2150	spin_unlock(lock: &ci->i_ceph_lock);
2151	mutex_unlock(lock: &ci->i_truncate_mutex);
2152	out:
2153	if (check)
2154	ceph_check_caps(ci, flags: `0`);
2155	}
2156
2157	/*
2158	* Make sure any pending truncation is applied before doing anything
2159	* that may depend on it.
2160	*/
2161	void __ceph_do_pending_vmtruncate(struct inode *inode)
2162	{
2163	struct ceph_inode_info *ci = ceph_inode(inode);
2164	u64 to;
2165	int wrbuffer_refs, finish = `0`;
2166
2167	mutex_lock(&ci->i_truncate_mutex);
2168	retry:
2169	spin_lock(lock: &ci->i_ceph_lock);
2170	if (ci->i_truncate_pending == `0`) {
2171	dout("%s %p none pending\n", __func__, inode);
2172	spin_unlock(lock: &ci->i_ceph_lock);
2173	mutex_unlock(lock: &ci->i_truncate_mutex);
2174	return;
2175	}
2176
2177	/*
2178	* make sure any dirty snapped pages are flushed before we
2179	* possibly truncate them.. so write AND block!
2180	*/
2181	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
2182	spin_unlock(lock: &ci->i_ceph_lock);
2183	dout("%s %p flushing snaps first\n", __func__, inode);
2184	filemap_write_and_wait_range(mapping: &inode->i_data, lstart: `0`,
2185	lend: inode->i_sb->s_maxbytes);
2186	goto retry;
2187	}
2188
2189	/ there should be no reader or writer /
2190	WARN_ON_ONCE(ci->i_rd_ref \|\| ci->i_wr_ref);
2191
2192	to = ci->i_truncate_pagecache_size;
2193	wrbuffer_refs = ci->i_wrbuffer_ref;
2194	dout("%s %p (%d) to %lld\n", __func__, inode,
2195	ci->i_truncate_pending, to);
2196	spin_unlock(lock: &ci->i_ceph_lock);
2197
2198	ceph_fscache_resize(inode, to);
2199	truncate_pagecache(inode, new: to);
2200
2201	spin_lock(lock: &ci->i_ceph_lock);
2202	if (to == ci->i_truncate_pagecache_size) {
2203	ci->i_truncate_pending = `0`;
2204	finish = `1`;
2205	}
2206	spin_unlock(lock: &ci->i_ceph_lock);
2207	if (!finish)
2208	goto retry;
2209
2210	mutex_unlock(lock: &ci->i_truncate_mutex);
2211
2212	if (wrbuffer_refs == `0`)
2213	ceph_check_caps(ci, flags: `0`);
2214
2215	wake_up_all(&ci->i_cap_wq);
2216	}
2217
2218	static void ceph_inode_work(struct work_struct *work)
2219	{
2220	struct ceph_inode_info ci = container_of(work, struct* ceph_inode_info,
2221	i_work);
2222	struct inode *inode = &ci->netfs.inode;
2223
2224	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, addr: &ci->i_work_mask)) {
2225	dout("writeback %p\n", inode);
2226	filemap_fdatawrite(&inode->i_data);
2227	}
2228	if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, addr: &ci->i_work_mask))
2229	ceph_do_invalidate_pages(inode);
2230
2231	if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, addr: &ci->i_work_mask))
2232	__ceph_do_pending_vmtruncate(inode);
2233
2234	if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, addr: &ci->i_work_mask))
2235	ceph_check_caps(ci, flags: `0`);
2236
2237	if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, addr: &ci->i_work_mask))
2238	ceph_flush_snaps(ci, NULL);
2239
2240	iput(inode);
2241	}
2242
2243	static const char ceph_encrypted_get_link(struct* dentry *dentry,
2244	struct inode *inode,
2245	struct delayed_call *done)
2246	{
2247	struct ceph_inode_info *ci = ceph_inode(inode);
2248
2249	if (!dentry)
2250	return ERR_PTR(error: -ECHILD);
2251
2252	return fscrypt_get_symlink(inode, caddr: ci->i_symlink, max_size: i_size_read(inode),
2253	done);
2254	}
2255
2256	static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
2257	const struct path *path,
2258	struct kstat *stat, u32 request_mask,
2259	unsigned int query_flags)
2260	{
2261	int ret;
2262
2263	ret = ceph_getattr(idmap, path, stat, request_mask, flags: query_flags);
2264	if (ret)
2265	return ret;
2266	return fscrypt_symlink_getattr(path, stat);
2267	}
2268
2269	/*
2270	* symlinks
2271	*/
2272	static const struct inode_operations ceph_symlink_iops = {
2273	.get_link = simple_get_link,
2274	.setattr = ceph_setattr,
2275	.getattr = ceph_getattr,
2276	.listxattr = ceph_listxattr,
2277	};
2278
2279	static const struct inode_operations ceph_encrypted_symlink_iops = {
2280	.get_link = ceph_encrypted_get_link,
2281	.setattr = ceph_setattr,
2282	.getattr = ceph_encrypted_symlink_getattr,
2283	.listxattr = ceph_listxattr,
2284	};
2285
2286	/*
2287	* Transfer the encrypted last block to the MDS and the MDS
2288	* will help update it when truncating a smaller size.
2289	*
2290	* We don't support a PAGE_SIZE that is smaller than the
2291	* CEPH_FSCRYPT_BLOCK_SIZE.
2292	*/
2293	static int fill_fscrypt_truncate(struct inode *inode,
2294	struct ceph_mds_request *req,
2295	struct iattr *attr)
2296	{
2297	struct ceph_inode_info *ci = ceph_inode(inode);
2298	int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
2299	loff_t pos, orig_pos = round_down(attr->ia_size,
2300	CEPH_FSCRYPT_BLOCK_SIZE);
2301	u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
2302	struct ceph_pagelist *pagelist = NULL;
2303	struct kvec iov = {`0`};
2304	struct iov_iter iter;
2305	struct page *page = NULL;
2306	struct ceph_fscrypt_truncate_size_header header;
2307	int retry_op = `0`;
2308	int len = CEPH_FSCRYPT_BLOCK_SIZE;
2309	loff_t i_size = i_size_read(inode);
2310	int got, ret, issued;
2311	u64 objver;
2312
2313	ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, want: `0`, endoff: -`1`, got: &got);
2314	if (ret < `0`)
2315	return ret;
2316
2317	issued = __ceph_caps_issued(ci, NULL);
2318
2319	dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
2320	i_size, attr->ia_size, ceph_cap_string(got),
2321	ceph_cap_string(issued));
2322
2323	/ Try to writeback the dirty pagecaches /
2324	if (issued & (CEPH_CAP_FILE_BUFFER)) {
2325	loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - `1`;
2326
2327	ret = filemap_write_and_wait_range(mapping: inode->i_mapping,
2328	lstart: orig_pos, lend);
2329	if (ret < `0`)
2330	goto out;
2331	}
2332
2333	page = __page_cache_alloc(GFP_KERNEL);
2334	if (page == NULL) {
2335	ret = -ENOMEM;
2336	goto out;
2337	}
2338
2339	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
2340	if (!pagelist) {
2341	ret = -ENOMEM;
2342	goto out;
2343	}
2344
2345	iov.iov_base = kmap_local_page(page);
2346	iov.iov_len = len;
2347	iov_iter_kvec(i: &iter, READ, kvec: &iov, nr_segs: `1`, count: len);
2348
2349	pos = orig_pos;
2350	ret = __ceph_sync_read(inode, ki_pos: &pos, to: &iter, retry_op: &retry_op, last_objver: &objver);
2351	if (ret < `0`)
2352	goto out;
2353
2354	/ Insert the header first /
2355	header.ver = `1`;
2356	header.compat = `1`;
2357	header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
2358
2359	/*
2360	* Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
2361	* because in MDS it may need this to do the truncate.
2362	*/
2363	header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
2364
2365	/*
2366	* If we hit a hole here, we should just skip filling
2367	* the fscrypt for the request, because once the fscrypt
2368	* is enabled, the file will be split into many blocks
2369	* with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
2370	* has a hole, the hole size should be multiple of block
2371	* size.
2372	*
2373	* If the Rados object doesn't exist, it will be set to 0.
2374	*/
2375	if (!objver) {
2376	dout("%s hit hole, ppos %lld < size %lld\n", __func__,
2377	pos, i_size);
2378
2379	header.data_len = cpu_to_le32(`8` + `8` + `4`);
2380	header.file_offset = `0`;
2381	ret = `0`;
2382	} else {
2383	header.data_len = cpu_to_le32(`8` + `8` + `4` + CEPH_FSCRYPT_BLOCK_SIZE);
2384	header.file_offset = cpu_to_le64(orig_pos);
2385
2386	dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
2387	boff, CEPH_FSCRYPT_BLOCK_SIZE);
2388
2389	/ truncate and zero out the extra contents for the last block /
2390	memset(iov.iov_base + boff, `0`, PAGE_SIZE - boff);
2391
2392	/ encrypt the last block /
2393	ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
2394	CEPH_FSCRYPT_BLOCK_SIZE,
2395	offs: `0`, lblk_num: block,
2396	GFP_KERNEL);
2397	if (ret)
2398	goto out;
2399	}
2400
2401	/ Insert the header /
2402	ret = ceph_pagelist_append(pl: pagelist, d: &header, l: sizeof(header));
2403	if (ret)
2404	goto out;
2405
2406	if (header.block_size) {
2407	/ Append the last block contents to pagelist /
2408	ret = ceph_pagelist_append(pl: pagelist, d: iov.iov_base,
2409	CEPH_FSCRYPT_BLOCK_SIZE);
2410	if (ret)
2411	goto out;
2412	}
2413	req->r_pagelist = pagelist;
2414	out:
2415	dout("%s %p size dropping cap refs on %s\n", __func__,
2416	inode, ceph_cap_string(got));
2417	ceph_put_cap_refs(ci, had: got);
2418	if (iov.iov_base)
2419	kunmap_local(iov.iov_base);
2420	if (page)
2421	__free_pages(page, order: `0`);
2422	if (ret && pagelist)
2423	ceph_pagelist_release(pl: pagelist);
2424	return ret;
2425	}
2426
2427	int __ceph_setattr(struct inode inode, struct* iattr *attr,
2428	struct ceph_iattr *cia)
2429	{
2430	struct ceph_inode_info *ci = ceph_inode(inode);
2431	unsigned int ia_valid = attr->ia_valid;
2432	struct ceph_mds_request *req;
2433	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
2434	struct ceph_cap_flush *prealloc_cf;
2435	loff_t isize = i_size_read(inode);
2436	int issued;
2437	int release = `0`, dirtied = `0`;
2438	int mask = `0`;
2439	int err = `0`;
2440	int inode_dirty_flags = `0`;
2441	bool lock_snap_rwsem = false;
2442	bool fill_fscrypt;
2443	int truncate_retry = `20`; / The RMW will take around 50ms /
2444
2445	retry:
2446	prealloc_cf = ceph_alloc_cap_flush();
2447	if (!prealloc_cf)
2448	return -ENOMEM;
2449
2450	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_SETATTR,
2451	mode: USE_AUTH_MDS);
2452	if (IS_ERR(ptr: req)) {
2453	ceph_free_cap_flush(cf: prealloc_cf);
2454	return PTR_ERR(ptr: req);
2455	}
2456
2457	fill_fscrypt = false;
2458	spin_lock(lock: &ci->i_ceph_lock);
2459	issued = __ceph_caps_issued(ci, NULL);
2460
2461	if (!ci->i_head_snapc &&
2462	(issued & (CEPH_CAP_ANY_EXCL \| CEPH_CAP_FILE_WR))) {
2463	lock_snap_rwsem = true;
2464	if (!down_read_trylock(sem: &mdsc->snap_rwsem)) {
2465	spin_unlock(lock: &ci->i_ceph_lock);
2466	down_read(sem: &mdsc->snap_rwsem);
2467	spin_lock(lock: &ci->i_ceph_lock);
2468	issued = __ceph_caps_issued(ci, NULL);
2469	}
2470	}
2471
2472	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
2473	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
2474	if (cia && cia->fscrypt_auth) {
2475	u32 len = ceph_fscrypt_auth_len(fa: cia->fscrypt_auth);
2476
2477	if (len > sizeof(*cia->fscrypt_auth)) {
2478	err = -EINVAL;
2479	spin_unlock(lock: &ci->i_ceph_lock);
2480	goto out;
2481	}
2482
2483	dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
2484	ceph_vinop(inode), ci->fscrypt_auth_len, len);
2485
2486	/ It should never be re-set once set /
2487	WARN_ON_ONCE(ci->fscrypt_auth);
2488
2489	if (issued & CEPH_CAP_AUTH_EXCL) {
2490	dirtied \|= CEPH_CAP_AUTH_EXCL;
2491	kfree(objp: ci->fscrypt_auth);
2492	ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
2493	ci->fscrypt_auth_len = len;
2494	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2495	ci->fscrypt_auth_len != len \|\|
2496	memcmp(p: ci->fscrypt_auth, q: cia->fscrypt_auth, size: len)) {
2497	req->r_fscrypt_auth = cia->fscrypt_auth;
2498	mask \|= CEPH_SETATTR_FSCRYPT_AUTH;
2499	release \|= CEPH_CAP_AUTH_SHARED;
2500	}
2501	cia->fscrypt_auth = NULL;
2502	}
2503	#else
2504	if (cia && cia->fscrypt_auth) {
2505	err = -EINVAL;
2506	spin_unlock(&ci->i_ceph_lock);
2507	goto out;
2508	}
2509	#endif /* CONFIG_FS_ENCRYPTION */
2510
2511	if (ia_valid & ATTR_UID) {
2512	dout("setattr %p uid %d -> %d\n", inode,
2513	from_kuid(&init_user_ns, inode->i_uid),
2514	from_kuid(&init_user_ns, attr->ia_uid));
2515	if (issued & CEPH_CAP_AUTH_EXCL) {
2516	inode->i_uid = attr->ia_uid;
2517	dirtied \|= CEPH_CAP_AUTH_EXCL;
2518	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2519	!uid_eq(left: attr->ia_uid, right: inode->i_uid)) {
2520	req->r_args.setattr.uid = cpu_to_le32(
2521	from_kuid(&init_user_ns, attr->ia_uid));
2522	mask \|= CEPH_SETATTR_UID;
2523	release \|= CEPH_CAP_AUTH_SHARED;
2524	}
2525	}
2526	if (ia_valid & ATTR_GID) {
2527	dout("setattr %p gid %d -> %d\n", inode,
2528	from_kgid(&init_user_ns, inode->i_gid),
2529	from_kgid(&init_user_ns, attr->ia_gid));
2530	if (issued & CEPH_CAP_AUTH_EXCL) {
2531	inode->i_gid = attr->ia_gid;
2532	dirtied \|= CEPH_CAP_AUTH_EXCL;
2533	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2534	!gid_eq(left: attr->ia_gid, right: inode->i_gid)) {
2535	req->r_args.setattr.gid = cpu_to_le32(
2536	from_kgid(&init_user_ns, attr->ia_gid));
2537	mask \|= CEPH_SETATTR_GID;
2538	release \|= CEPH_CAP_AUTH_SHARED;
2539	}
2540	}
2541	if (ia_valid & ATTR_MODE) {
2542	dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
2543	attr->ia_mode);
2544	if (issued & CEPH_CAP_AUTH_EXCL) {
2545	inode->i_mode = attr->ia_mode;
2546	dirtied \|= CEPH_CAP_AUTH_EXCL;
2547	} else if ((issued & CEPH_CAP_AUTH_SHARED) == `0` \|\|
2548	attr->ia_mode != inode->i_mode) {
2549	inode->i_mode = attr->ia_mode;
2550	req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
2551	mask \|= CEPH_SETATTR_MODE;
2552	release \|= CEPH_CAP_AUTH_SHARED;
2553	}
2554	}
2555
2556	if (ia_valid & ATTR_ATIME) {
2557	struct timespec64 atime = inode_get_atime(inode);
2558
2559	dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
2560	atime.tv_sec, atime.tv_nsec,
2561	attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
2562	if (issued & CEPH_CAP_FILE_EXCL) {
2563	ci->i_time_warp_seq++;
2564	inode_set_atime_to_ts(inode, ts: attr->ia_atime);
2565	dirtied \|= CEPH_CAP_FILE_EXCL;
2566	} else if ((issued & CEPH_CAP_FILE_WR) &&
2567	timespec64_compare(lhs: &atime,
2568	rhs: &attr->ia_atime) < `0`) {
2569	inode_set_atime_to_ts(inode, ts: attr->ia_atime);
2570	dirtied \|= CEPH_CAP_FILE_WR;
2571	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2572	!timespec64_equal(a: &atime, b: &attr->ia_atime)) {
2573	ceph_encode_timespec64(tv: &req->r_args.setattr.atime,
2574	ts: &attr->ia_atime);
2575	mask \|= CEPH_SETATTR_ATIME;
2576	release \|= CEPH_CAP_FILE_SHARED \|
2577	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2578	}
2579	}
2580	if (ia_valid & ATTR_SIZE) {
2581	dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
2582	/*
2583	* Only when the new size is smaller and not aligned to
2584	* CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
2585	*/
2586	if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
2587	(attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
2588	mask \|= CEPH_SETATTR_SIZE;
2589	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2590	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2591	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
2592	mask \|= CEPH_SETATTR_FSCRYPT_FILE;
2593	req->r_args.setattr.size =
2594	cpu_to_le64(round_up(attr->ia_size,
2595	CEPH_FSCRYPT_BLOCK_SIZE));
2596	req->r_args.setattr.old_size =
2597	cpu_to_le64(round_up(isize,
2598	CEPH_FSCRYPT_BLOCK_SIZE));
2599	req->r_fscrypt_file = attr->ia_size;
2600	fill_fscrypt = true;
2601	} else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
2602	if (attr->ia_size > isize) {
2603	i_size_write(inode, i_size: attr->ia_size);
2604	inode->i_blocks = calc_inode_blocks(size: attr->ia_size);
2605	ci->i_reported_size = attr->ia_size;
2606	dirtied \|= CEPH_CAP_FILE_EXCL;
2607	ia_valid \|= ATTR_MTIME;
2608	}
2609	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2610	attr->ia_size != isize) {
2611	mask \|= CEPH_SETATTR_SIZE;
2612	release \|= CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2613	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2614	if (IS_ENCRYPTED(inode) && attr->ia_size) {
2615	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
2616	mask \|= CEPH_SETATTR_FSCRYPT_FILE;
2617	req->r_args.setattr.size =
2618	cpu_to_le64(round_up(attr->ia_size,
2619	CEPH_FSCRYPT_BLOCK_SIZE));
2620	req->r_args.setattr.old_size =
2621	cpu_to_le64(round_up(isize,
2622	CEPH_FSCRYPT_BLOCK_SIZE));
2623	req->r_fscrypt_file = attr->ia_size;
2624	} else {
2625	req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2626	req->r_args.setattr.old_size = cpu_to_le64(isize);
2627	req->r_fscrypt_file = `0`;
2628	}
2629	}
2630	}
2631	if (ia_valid & ATTR_MTIME) {
2632	struct timespec64 mtime = inode_get_mtime(inode);
2633
2634	dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
2635	mtime.tv_sec, mtime.tv_nsec,
2636	attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
2637	if (issued & CEPH_CAP_FILE_EXCL) {
2638	ci->i_time_warp_seq++;
2639	inode_set_mtime_to_ts(inode, ts: attr->ia_mtime);
2640	dirtied \|= CEPH_CAP_FILE_EXCL;
2641	} else if ((issued & CEPH_CAP_FILE_WR) &&
2642	timespec64_compare(lhs: &mtime, rhs: &attr->ia_mtime) < `0`) {
2643	inode_set_mtime_to_ts(inode, ts: attr->ia_mtime);
2644	dirtied \|= CEPH_CAP_FILE_WR;
2645	} else if ((issued & CEPH_CAP_FILE_SHARED) == `0` \|\|
2646	!timespec64_equal(a: &mtime, b: &attr->ia_mtime)) {
2647	ceph_encode_timespec64(tv: &req->r_args.setattr.mtime,
2648	ts: &attr->ia_mtime);
2649	mask \|= CEPH_SETATTR_MTIME;
2650	release \|= CEPH_CAP_FILE_SHARED \|
2651	CEPH_CAP_FILE_RD \| CEPH_CAP_FILE_WR;
2652	}
2653	}
2654
2655	/ these do nothing /
2656	if (ia_valid & ATTR_CTIME) {
2657	bool only = (ia_valid & (ATTR_SIZE\|ATTR_MTIME\|ATTR_ATIME\|
2658	ATTR_MODE\|ATTR_UID\|ATTR_GID)) == `0`;
2659	dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
2660	inode_get_ctime_sec(inode),
2661	inode_get_ctime_nsec(inode),
2662	attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
2663	only ? "ctime only" : "ignored");
2664	if (only) {
2665	/*
2666	* if kernel wants to dirty ctime but nothing else,
2667	* we need to choose a cap to dirty under, or do
2668	* a almost-no-op setattr
2669	*/
2670	if (issued & CEPH_CAP_AUTH_EXCL)
2671	dirtied \|= CEPH_CAP_AUTH_EXCL;
2672	else if (issued & CEPH_CAP_FILE_EXCL)
2673	dirtied \|= CEPH_CAP_FILE_EXCL;
2674	else if (issued & CEPH_CAP_XATTR_EXCL)
2675	dirtied \|= CEPH_CAP_XATTR_EXCL;
2676	else
2677	mask \|= CEPH_SETATTR_CTIME;
2678	}
2679	}
2680	if (ia_valid & ATTR_FILE)
2681	dout("setattr %p ATTR_FILE ... hrm!\n", inode);
2682
2683	if (dirtied) {
2684	inode_dirty_flags = __ceph_mark_dirty_caps(ci, mask: dirtied,
2685	pcf: &prealloc_cf);
2686	inode_set_ctime_to_ts(inode, ts: attr->ia_ctime);
2687	inode_inc_iversion_raw(inode);
2688	}
2689
2690	release &= issued;
2691	spin_unlock(lock: &ci->i_ceph_lock);
2692	if (lock_snap_rwsem) {
2693	up_read(sem: &mdsc->snap_rwsem);
2694	lock_snap_rwsem = false;
2695	}
2696
2697	if (inode_dirty_flags)
2698	__mark_inode_dirty(inode, inode_dirty_flags);
2699
2700	if (mask) {
2701	req->r_inode = inode;
2702	ihold(inode);
2703	req->r_inode_drop = release;
2704	req->r_args.setattr.mask = cpu_to_le32(mask);
2705	req->r_num_caps = `1`;
2706	req->r_stamp = attr->ia_ctime;
2707	if (fill_fscrypt) {
2708	err = fill_fscrypt_truncate(inode, req, attr);
2709	if (err)
2710	goto out;
2711	}
2712
2713	/*
2714	* The truncate request will return -EAGAIN when the
2715	* last block has been updated just before the MDS
2716	* successfully gets the xlock for the FILE lock. To
2717	* avoid corrupting the file contents we need to retry
2718	* it.
2719	*/
2720	err = ceph_mdsc_do_request(mdsc, NULL, req);
2721	if (err == -EAGAIN && truncate_retry--) {
2722	dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
2723	inode, err, ceph_cap_string(dirtied), mask);
2724	ceph_mdsc_put_request(req);
2725	ceph_free_cap_flush(cf: prealloc_cf);
2726	goto retry;
2727	}
2728	}
2729	out:
2730	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
2731	ceph_cap_string(dirtied), mask);
2732
2733	ceph_mdsc_put_request(req);
2734	ceph_free_cap_flush(cf: prealloc_cf);
2735
2736	if (err >= `0` && (mask & CEPH_SETATTR_SIZE))
2737	__ceph_do_pending_vmtruncate(inode);
2738
2739	return err;
2740	}
2741
2742	/*
2743	* setattr
2744	*/
2745	int ceph_setattr(struct mnt_idmap idmap, struct* dentry *dentry,
2746	struct iattr *attr)
2747	{
2748	struct inode *inode = d_inode(dentry);
2749	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2750	int err;
2751
2752	if (ceph_snap(inode) != CEPH_NOSNAP)
2753	return -EROFS;
2754
2755	if (ceph_inode_is_shutdown(inode))
2756	return -ESTALE;
2757
2758	err = fscrypt_prepare_setattr(dentry, attr);
2759	if (err)
2760	return err;
2761
2762	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
2763	if (err != `0`)
2764	return err;
2765
2766	if ((attr->ia_valid & ATTR_SIZE) &&
2767	attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
2768	return -EFBIG;
2769
2770	if ((attr->ia_valid & ATTR_SIZE) &&
2771	ceph_quota_is_max_bytes_exceeded(inode, newlen: attr->ia_size))
2772	return -EDQUOT;
2773
2774	err = __ceph_setattr(inode, attr, NULL);
2775
2776	if (err >= `0` && (attr->ia_valid & ATTR_MODE))
2777	err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
2778
2779	return err;
2780	}
2781
2782	int ceph_try_to_choose_auth_mds(struct inode inode, int* mask)
2783	{
2784	int issued = ceph_caps_issued(ci: ceph_inode(inode));
2785
2786	/*
2787	* If any 'x' caps is issued we can just choose the auth MDS
2788	* instead of the random replica MDSes. Because only when the
2789	* Locker is in LOCK_EXEC state will the loner client could
2790	* get the 'x' caps. And if we send the getattr requests to
2791	* any replica MDS it must auth pin and tries to rdlock from
2792	* the auth MDS, and then the auth MDS need to do the Locker
2793	* state transition to LOCK_SYNC. And after that the lock state
2794	* will change back.
2795	*
2796	* This cost much when doing the Locker state transition and
2797	* usually will need to revoke caps from clients.
2798	*
2799	* And for the 'Xs' caps for getxattr we will also choose the
2800	* auth MDS, because the MDS side code is buggy due to setxattr
2801	* won't notify the replica MDSes when the values changed and
2802	* the replica MDS will return the old values. Though we will
2803	* fix it in MDS code, but this still makes sense for old ceph.
2804	*/
2805	if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
2806	\|\| (mask & (CEPH_STAT_RSTAT \| CEPH_STAT_CAP_XATTR)))
2807	return USE_AUTH_MDS;
2808	else
2809	return USE_ANY_MDS;
2810	}
2811
2812	/*
2813	* Verify that we have a lease on the given mask. If not,
2814	* do a getattr against an mds.
2815	*/
2816	int __ceph_do_getattr(struct inode inode, struct* page *locked_page,
2817	int mask, bool force)
2818	{
2819	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: inode->i_sb);
2820	struct ceph_mds_client *mdsc = fsc->mdsc;
2821	struct ceph_mds_request *req;
2822	int mode;
2823	int err;
2824
2825	if (ceph_snap(inode) == CEPH_SNAPDIR) {
2826	dout("do_getattr inode %p SNAPDIR\n", inode);
2827	return `0`;
2828	}
2829
2830	dout("do_getattr inode %p mask %s mode 0%o\n",
2831	inode, ceph_cap_string(mask), inode->i_mode);
2832	if (!force && ceph_caps_issued_mask_metric(ci: ceph_inode(inode), mask, touch: `1`))
2833	return `0`;
2834
2835	mode = ceph_try_to_choose_auth_mds(inode, mask);
2836	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETATTR, mode);
2837	if (IS_ERR(ptr: req))
2838	return PTR_ERR(ptr: req);
2839	req->r_inode = inode;
2840	ihold(inode);
2841	req->r_num_caps = `1`;
2842	req->r_args.getattr.mask = cpu_to_le32(mask);
2843	req->r_locked_page = locked_page;
2844	err = ceph_mdsc_do_request(mdsc, NULL, req);
2845	if (locked_page && err == `0`) {
2846	u64 inline_version = req->r_reply_info.targeti.inline_version;
2847	if (inline_version == `0`) {
2848	/ the reply is supposed to contain inline data /
2849	err = -EINVAL;
2850	} else if (inline_version == CEPH_INLINE_NONE \|\|
2851	inline_version == `1`) {
2852	err = -ENODATA;
2853	} else {
2854	err = req->r_reply_info.targeti.inline_len;
2855	}
2856	}
2857	ceph_mdsc_put_request(req);
2858	dout("do_getattr result=%d\n", err);
2859	return err;
2860	}
2861
2862	int ceph_do_getvxattr(struct inode inode, const* char name, void* *value,
2863	size_t size)
2864	{
2865	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: inode->i_sb);
2866	struct ceph_mds_client *mdsc = fsc->mdsc;
2867	struct ceph_mds_request *req;
2868	int mode = USE_AUTH_MDS;
2869	int err;
2870	char *xattr_value;
2871	size_t xattr_value_len;
2872
2873	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_GETVXATTR, mode);
2874	if (IS_ERR(ptr: req)) {
2875	err = -ENOMEM;
2876	goto out;
2877	}
2878
2879	req->r_feature_needed = CEPHFS_FEATURE_OP_GETVXATTR;
2880	req->r_path2 = kstrdup(s: name, GFP_NOFS);
2881	if (!req->r_path2) {
2882	err = -ENOMEM;
2883	goto put;
2884	}
2885
2886	ihold(inode);
2887	req->r_inode = inode;
2888	err = ceph_mdsc_do_request(mdsc, NULL, req);
2889	if (err < `0`)
2890	goto put;
2891
2892	xattr_value = req->r_reply_info.xattr_info.xattr_value;
2893	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
2894
2895	dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
2896
2897	err = (int)xattr_value_len;
2898	if (size == `0`)
2899	goto put;
2900
2901	if (xattr_value_len > size) {
2902	err = -ERANGE;
2903	goto put;
2904	}
2905
2906	memcpy(value, xattr_value, xattr_value_len);
2907	put:
2908	ceph_mdsc_put_request(req);
2909	out:
2910	dout("do_getvxattr result=%d\n", err);
2911	return err;
2912	}
2913
2914
2915	/*
2916	* Check inode permissions. We verify we have a valid value for
2917	* the AUTH cap, then call the generic handler.
2918	*/
2919	int ceph_permission(struct mnt_idmap idmap, struct* inode *inode,
2920	int mask)
2921	{
2922	int err;
2923
2924	if (mask & MAY_NOT_BLOCK)
2925	return -ECHILD;
2926
2927	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, force: false);
2928
2929	if (!err)
2930	err = generic_permission(&nop_mnt_idmap, inode, mask);
2931	return err;
2932	}
2933
2934	/ Craft a mask of needed caps given a set of requested statx attrs. /
2935	static int statx_to_caps(u32 want, umode_t mode)
2936	{
2937	int mask = `0`;
2938
2939	if (want & (STATX_MODE\|STATX_UID\|STATX_GID\|STATX_CTIME\|STATX_BTIME\|STATX_CHANGE_COOKIE))
2940	mask \|= CEPH_CAP_AUTH_SHARED;
2941
2942	if (want & (STATX_NLINK\|STATX_CTIME\|STATX_CHANGE_COOKIE)) {
2943	/*
2944	* The link count for directories depends on inode->i_subdirs,
2945	* and that is only updated when Fs caps are held.
2946	*/
2947	if (S_ISDIR(mode))
2948	mask \|= CEPH_CAP_FILE_SHARED;
2949	else
2950	mask \|= CEPH_CAP_LINK_SHARED;
2951	}
2952
2953	if (want & (STATX_ATIME\|STATX_MTIME\|STATX_CTIME\|STATX_SIZE\|STATX_BLOCKS\|STATX_CHANGE_COOKIE))
2954	mask \|= CEPH_CAP_FILE_SHARED;
2955
2956	if (want & (STATX_CTIME\|STATX_CHANGE_COOKIE))
2957	mask \|= CEPH_CAP_XATTR_SHARED;
2958
2959	return mask;
2960	}
2961
2962	/*
2963	* Get all the attributes. If we have sufficient caps for the requested attrs,
2964	* then we can avoid talking to the MDS at all.
2965	*/
2966	int ceph_getattr(struct mnt_idmap idmap, const* struct path *path,
2967	struct kstat stat, u32 request_mask, unsigned* int flags)
2968	{
2969	struct inode *inode = d_inode(dentry: path->dentry);
2970	struct super_block *sb = inode->i_sb;
2971	struct ceph_inode_info *ci = ceph_inode(inode);
2972	u32 valid_mask = STATX_BASIC_STATS;
2973	int err = `0`;
2974
2975	if (ceph_inode_is_shutdown(inode))
2976	return -ESTALE;
2977
2978	/ Skip the getattr altogether if we're asked not to sync /
2979	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
2980	err = ceph_do_getattr(inode,
2981	mask: statx_to_caps(want: request_mask, mode: inode->i_mode),
2982	force: flags & AT_STATX_FORCE_SYNC);
2983	if (err)
2984	return err;
2985	}
2986
2987	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
2988	stat->ino = ceph_present_inode(inode);
2989
2990	/*
2991	* btime on newly-allocated inodes is 0, so if this is still set to
2992	* that, then assume that it's not valid.
2993	*/
2994	if (ci->i_btime.tv_sec \|\| ci->i_btime.tv_nsec) {
2995	stat->btime = ci->i_btime;
2996	valid_mask \|= STATX_BTIME;
2997	}
2998
2999	if (request_mask & STATX_CHANGE_COOKIE) {
3000	stat->change_cookie = inode_peek_iversion_raw(inode);
3001	valid_mask \|= STATX_CHANGE_COOKIE;
3002	}
3003
3004	if (ceph_snap(inode) == CEPH_NOSNAP)
3005	stat->dev = sb->s_dev;
3006	else
3007	stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : `0`;
3008
3009	if (S_ISDIR(inode->i_mode)) {
3010	if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) {
3011	stat->size = ci->i_rbytes;
3012	} else if (ceph_snap(inode) == CEPH_SNAPDIR) {
3013	struct ceph_inode_info *pci;
3014	struct ceph_snap_realm *realm;
3015	struct inode *parent;
3016
3017	parent = ceph_lookup_inode(sb, ino: ceph_ino(inode));
3018	if (IS_ERR(ptr: parent))
3019	return PTR_ERR(ptr: parent);
3020
3021	pci = ceph_inode(inode: parent);
3022	spin_lock(lock: &pci->i_ceph_lock);
3023	realm = pci->i_snap_realm;
3024	if (realm)
3025	stat->size = realm->num_snaps;
3026	else
3027	stat->size = `0`;
3028	spin_unlock(lock: &pci->i_ceph_lock);
3029	iput(parent);
3030	} else {
3031	stat->size = ci->i_files + ci->i_subdirs;
3032	}
3033	stat->blocks = `0`;
3034	stat->blksize = `65536`;
3035	/*
3036	* Some applications rely on the number of st_nlink
3037	* value on directories to be either 0 (if unlinked)
3038	* or 2 + number of subdirectories.
3039	*/
3040	if (stat->nlink == `1`)
3041	/ '.' + '..' + subdirs /
3042	stat->nlink = `1` + `1` + ci->i_subdirs;
3043	}
3044
3045	stat->attributes \|= STATX_ATTR_CHANGE_MONOTONIC;
3046	if (IS_ENCRYPTED(inode))
3047	stat->attributes \|= STATX_ATTR_ENCRYPTED;
3048	stat->attributes_mask \|= (STATX_ATTR_CHANGE_MONOTONIC \|
3049	STATX_ATTR_ENCRYPTED);
3050
3051	stat->result_mask = request_mask & valid_mask;
3052	return err;
3053	}
3054
3055	void ceph_inode_shutdown(struct inode *inode)
3056	{
3057	struct ceph_inode_info *ci = ceph_inode(inode);
3058	struct rb_node *p;
3059	int iputs = `0`;
3060	bool invalidate = false;
3061
3062	spin_lock(lock: &ci->i_ceph_lock);
3063	ci->i_ceph_flags \|= CEPH_I_SHUTDOWN;
3064	p = rb_first(&ci->i_caps);
3065	while (p) {
3066	struct ceph_cap cap = rb_entry(p, struct* ceph_cap, ci_node);
3067
3068	p = rb_next(p);
3069	iputs += ceph_purge_inode_cap(inode, cap, invalidate: &invalidate);
3070	}
3071	spin_unlock(lock: &ci->i_ceph_lock);
3072
3073	if (invalidate)
3074	ceph_queue_invalidate(inode);
3075	while (iputs--)
3076	iput(inode);
3077	}
3078

source code of linux/fs/ceph/inode.c