dir.c source code [linux/fs/ceph/dir.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/spinlock.h>
5	#include <linux/namei.h>
6	#include <linux/slab.h>
7	#include <linux/sched.h>
8	#include <linux/xattr.h>
9
10	#include "super.h"
11	#include "mds_client.h"
12	#include "crypto.h"
13
14	/*
15	* Directory operations: readdir, lookup, create, link, unlink,
16	* rename, etc.
17	*/
18
19	/*
20	* Ceph MDS operations are specified in terms of a base ino and
21	* relative path. Thus, the client can specify an operation on a
22	* specific inode (e.g., a getattr due to fstat(2)), or as a path
23	* relative to, say, the root directory.
24	*
25	* Normally, we limit ourselves to strict inode ops (no path component)
26	* or dentry operations (a single path component relative to an ino). The
27	* exception to this is open_root_dentry(), which will open the mount
28	* point by name.
29	*/
30
31	const struct dentry_operations ceph_dentry_ops;
32
33	static bool __dentry_lease_is_valid(struct ceph_dentry_info *di);
34	static int __dir_lease_try_check(const struct dentry *dentry);
35
36	/*
37	* Initialize ceph dentry state.
38	*/
39	static int ceph_d_init(struct dentry *dentry)
40	{
41	struct ceph_dentry_info *di;
42	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dentry->d_sb);
43
44	di = kmem_cache_zalloc(k: ceph_dentry_cachep, GFP_KERNEL);
45	if (!di)
46	return -ENOMEM; / oh well /
47
48	di->dentry = dentry;
49	di->lease_session = NULL;
50	di->time = jiffies;
51	dentry->d_fsdata = di;
52	INIT_LIST_HEAD(list: &di->lease_list);
53
54	atomic64_inc(v: &mdsc->metric.total_dentries);
55
56	return `0`;
57	}
58
59	/*
60	* for f_pos for readdir:
61	* - hash order:
62	* (0xff << 52) \| ((24 bits hash) << 28) \|
63	* (the nth entry has hash collision);
64	* - frag+name order;
65	* ((frag value) << 28) \| (the nth entry in frag);
66	*/
67	#define OFFSET_BITS 28
68	#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
69	#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
70	loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
71	{
72	loff_t fpos = ((loff_t)high << `28`) \| (loff_t)off;
73	if (hash_order)
74	fpos \|= HASH_ORDER;
75	return fpos;
76	}
77
78	static bool is_hash_order(loff_t p)
79	{
80	return (p & HASH_ORDER) == HASH_ORDER;
81	}
82
83	static unsigned fpos_frag(loff_t p)
84	{
85	return p >> OFFSET_BITS;
86	}
87
88	static unsigned fpos_hash(loff_t p)
89	{
90	return ceph_frag_value(f: fpos_frag(p));
91	}
92
93	static unsigned fpos_off(loff_t p)
94	{
95	return p & OFFSET_MASK;
96	}
97
98	static int fpos_cmp(loff_t l, loff_t r)
99	{
100	int v = ceph_frag_compare(a: fpos_frag(p: l), b: fpos_frag(p: r));
101	if (v)
102	return v;
103	return (int)(fpos_off(p: l) - fpos_off(p: r));
104	}
105
106	/*
107	* make note of the last dentry we read, so we can
108	* continue at the same lexicographical point,
109	* regardless of what dir changes take place on the
110	* server.
111	*/
112	static int note_last_dentry(struct ceph_dir_file_info dfi, const* char *name,
113	int len, unsigned next_offset)
114	{
115	char *buf = kmalloc(size: len+`1`, GFP_KERNEL);
116	if (!buf)
117	return -ENOMEM;
118	kfree(objp: dfi->last_name);
119	dfi->last_name = buf;
120	memcpy(dfi->last_name, name, len);
121	dfi->last_name[len] = `0`;
122	dfi->next_offset = next_offset;
123	dout("note_last_dentry '%s'\n", dfi->last_name);
124	return `0`;
125	}
126
127
128	static struct dentry *
129	__dcache_find_get_entry(struct dentry *parent, u64 idx,
130	struct ceph_readdir_cache_control *cache_ctl)
131	{
132	struct inode *dir = d_inode(dentry: parent);
133	struct dentry *dentry;
134	unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - `1`;
135	loff_t ptr_pos = idx * sizeof(struct dentry *);
136	pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
137
138	if (ptr_pos >= i_size_read(inode: dir))
139	return NULL;
140
141	if (!cache_ctl->page \|\| ptr_pgoff != page_index(page: cache_ctl->page)) {
142	ceph_readdir_cache_release(ctl: cache_ctl);
143	cache_ctl->page = find_lock_page(mapping: &dir->i_data, index: ptr_pgoff);
144	if (!cache_ctl->page) {
145	dout(" page %lu not found\n", ptr_pgoff);
146	return ERR_PTR(error: -EAGAIN);
147	}
148	/ reading/filling the cache are serialized by*
149	i_rwsem, no need to use page lock /*
150	unlock_page(page: cache_ctl->page);
151	cache_ctl->dentries = kmap(page: cache_ctl->page);
152	}
153
154	cache_ctl->index = idx & idx_mask;
155
156	rcu_read_lock();
157	spin_lock(lock: &parent->d_lock);
158	/ check i_size again here, because empty directory can be*
159	* marked as complete while not holding the i_rwsem. */
160	if (ceph_dir_is_complete_ordered(inode: dir) && ptr_pos < i_size_read(inode: dir))
161	dentry = cache_ctl->dentries[cache_ctl->index];
162	else
163	dentry = NULL;
164	spin_unlock(lock: &parent->d_lock);
165	if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
166	dentry = NULL;
167	rcu_read_unlock();
168	return dentry ? : ERR_PTR(error: -EAGAIN);
169	}
170
171	/*
172	* When possible, we try to satisfy a readdir by peeking at the
173	* dcache. We make this work by carefully ordering dentries on
174	* d_child when we initially get results back from the MDS, and
175	* falling back to a "normal" sync readdir if any dentries in the dir
176	* are dropped.
177	*
178	* Complete dir indicates that we have all dentries in the dir. It is
179	* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
180	* the MDS if/when the directory is modified).
181	*/
182	static int __dcache_readdir(struct file file, struct* dir_context *ctx,
183	int shared_gen)
184	{
185	struct ceph_dir_file_info *dfi = file->private_data;
186	struct dentry *parent = file->f_path.dentry;
187	struct inode *dir = d_inode(dentry: parent);
188	struct dentry dentry, last = NULL;
189	struct ceph_dentry_info *di;
190	struct ceph_readdir_cache_control cache_ctl = {};
191	u64 idx = `0`;
192	int err = `0`;
193
194	dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos);
195
196	/ search start position /
197	if (ctx->pos > `2`) {
198	u64 count = div_u64(dividend: i_size_read(inode: dir), divisor: sizeof(struct dentry *));
199	while (count > `0`) {
200	u64 step = count >> `1`;
201	dentry = __dcache_find_get_entry(parent, idx: idx + step,
202	cache_ctl: &cache_ctl);
203	if (!dentry) {
204	/ use linar search /
205	idx = `0`;
206	break;
207	}
208	if (IS_ERR(ptr: dentry)) {
209	err = PTR_ERR(ptr: dentry);
210	goto out;
211	}
212	di = ceph_dentry(dentry);
213	spin_lock(lock: &dentry->d_lock);
214	if (fpos_cmp(l: di->offset, r: ctx->pos) < `0`) {
215	idx += step + `1`;
216	count -= step + `1`;
217	} else {
218	count = step;
219	}
220	spin_unlock(lock: &dentry->d_lock);
221	dput(dentry);
222	}
223
224	dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
225	}
226
227
228	for (;;) {
229	bool emit_dentry = false;
230	dentry = __dcache_find_get_entry(parent, idx: idx++, cache_ctl: &cache_ctl);
231	if (!dentry) {
232	dfi->file_info.flags \|= CEPH_F_ATEND;
233	err = `0`;
234	break;
235	}
236	if (IS_ERR(ptr: dentry)) {
237	err = PTR_ERR(ptr: dentry);
238	goto out;
239	}
240
241	spin_lock(lock: &dentry->d_lock);
242	di = ceph_dentry(dentry);
243	if (d_unhashed(dentry) \|\|
244	d_really_is_negative(dentry) \|\|
245	di->lease_shared_gen != shared_gen \|\|
246	((dentry->d_flags & DCACHE_NOKEY_NAME) &&
247	fscrypt_has_encryption_key(inode: dir))) {
248	spin_unlock(lock: &dentry->d_lock);
249	dput(dentry);
250	err = -EAGAIN;
251	goto out;
252	}
253	if (fpos_cmp(l: ctx->pos, r: di->offset) <= `0`) {
254	__ceph_dentry_dir_lease_touch(di);
255	emit_dentry = true;
256	}
257	spin_unlock(lock: &dentry->d_lock);
258
259	if (emit_dentry) {
260	dout(" %llx dentry %p %pd %p\n", di->offset,
261	dentry, dentry, d_inode(dentry));
262	ctx->pos = di->offset;
263	if (!dir_emit(ctx, name: dentry->d_name.name,
264	namelen: dentry->d_name.len, ino: ceph_present_inode(inode: d_inode(dentry)),
265	type: d_inode(dentry)->i_mode >> `12`)) {
266	dput(dentry);
267	err = `0`;
268	break;
269	}
270	ctx->pos++;
271
272	if (last)
273	dput(last);
274	last = dentry;
275	} else {
276	dput(dentry);
277	}
278	}
279	out:
280	ceph_readdir_cache_release(ctl: &cache_ctl);
281	if (last) {
282	int ret;
283	di = ceph_dentry(dentry: last);
284	ret = note_last_dentry(dfi, name: last->d_name.name, len: last->d_name.len,
285	next_offset: fpos_off(p: di->offset) + `1`);
286	if (ret < `0`)
287	err = ret;
288	dput(last);
289	/ last_name no longer match cache index /
290	if (dfi->readdir_cache_idx >= `0`) {
291	dfi->readdir_cache_idx = -`1`;
292	dfi->dir_release_count = `0`;
293	}
294	}
295	return err;
296	}
297
298	static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos)
299	{
300	if (!dfi->last_readdir)
301	return true;
302	if (is_hash_order(p: pos))
303	return !ceph_frag_contains_value(f: dfi->frag, v: fpos_hash(p: pos));
304	else
305	return dfi->frag != fpos_frag(p: pos);
306	}
307
308	static int ceph_readdir(struct file file, struct* dir_context *ctx)
309	{
310	struct ceph_dir_file_info *dfi = file->private_data;
311	struct inode *inode = file_inode(f: file);
312	struct ceph_inode_info *ci = ceph_inode(inode);
313	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
314	struct ceph_mds_client *mdsc = fsc->mdsc;
315	int i;
316	int err;
317	unsigned frag = -`1`;
318	struct ceph_mds_reply_info_parsed *rinfo;
319
320	dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
321	if (dfi->file_info.flags & CEPH_F_ATEND)
322	return `0`;
323
324	/ always start with . and .. /
325	if (ctx->pos == `0`) {
326	dout("readdir off 0 -> '.'\n");
327	if (!dir_emit(ctx, name: ".", namelen: `1`, ino: ceph_present_inode(inode),
328	type: inode->i_mode >> `12`))
329	return `0`;
330	ctx->pos = `1`;
331	}
332	if (ctx->pos == `1`) {
333	u64 ino;
334	struct dentry *dentry = file->f_path.dentry;
335
336	spin_lock(lock: &dentry->d_lock);
337	ino = ceph_present_inode(inode: dentry->d_parent->d_inode);
338	spin_unlock(lock: &dentry->d_lock);
339
340	dout("readdir off 1 -> '..'\n");
341	if (!dir_emit(ctx, name: "..", namelen: `2`, ino, type: inode->i_mode >> `12`))
342	return `0`;
343	ctx->pos = `2`;
344	}
345
346	err = ceph_fscrypt_prepare_readdir(dir: inode);
347	if (err < `0`)
348	return err;
349
350	spin_lock(lock: &ci->i_ceph_lock);
351	/ request Fx cap. if have Fx, we don't need to release Fs cap*
352	* for later create/unlink. */
353	__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
354	/ can we use the dcache? /
355	if (ceph_test_mount_opt(fsc, DCACHE) &&
356	!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
357	ceph_snap(inode) != CEPH_SNAPDIR &&
358	__ceph_dir_is_complete_ordered(ci) &&
359	__ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, t: `1`)) {
360	int shared_gen = atomic_read(v: &ci->i_shared_gen);
361
362	spin_unlock(lock: &ci->i_ceph_lock);
363	err = __dcache_readdir(file, ctx, shared_gen);
364	if (err != -EAGAIN)
365	return err;
366	} else {
367	spin_unlock(lock: &ci->i_ceph_lock);
368	}
369
370	/ proceed with a normal readdir /
371	more:
372	/ do we have the correct frag content buffered? /
373	if (need_send_readdir(dfi, pos: ctx->pos)) {
374	struct ceph_mds_request *req;
375	int op = ceph_snap(inode) == CEPH_SNAPDIR ?
376	CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
377
378	/ discard old result, if any /
379	if (dfi->last_readdir) {
380	ceph_mdsc_put_request(req: dfi->last_readdir);
381	dfi->last_readdir = NULL;
382	}
383
384	if (is_hash_order(p: ctx->pos)) {
385	/ fragtree isn't always accurate. choose frag*
386	* based on previous reply when possible. */
387	if (frag == (unsigned)-`1`)
388	frag = ceph_choose_frag(ci, v: fpos_hash(p: ctx->pos),
389	NULL, NULL);
390	} else {
391	frag = fpos_frag(p: ctx->pos);
392	}
393
394	dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
395	ceph_vinop(inode), frag, dfi->last_name);
396	req = ceph_mdsc_create_request(mdsc, op, mode: USE_AUTH_MDS);
397	if (IS_ERR(ptr: req))
398	return PTR_ERR(ptr: req);
399
400	err = ceph_alloc_readdir_reply_buffer(req, dir: inode);
401	if (err) {
402	ceph_mdsc_put_request(req);
403	return err;
404	}
405	/ hints to request -> mds selection code /
406	req->r_direct_mode = USE_AUTH_MDS;
407	if (op == CEPH_MDS_OP_READDIR) {
408	req->r_direct_hash = ceph_frag_value(f: frag);
409	__set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
410	req->r_inode_drop = CEPH_CAP_FILE_EXCL;
411	}
412	if (dfi->last_name) {
413	struct qstr d_name = { .name = dfi->last_name,
414	.len = strlen(dfi->last_name) };
415
416	req->r_path2 = kzalloc(NAME_MAX + `1`, GFP_KERNEL);
417	if (!req->r_path2) {
418	ceph_mdsc_put_request(req);
419	return -ENOMEM;
420	}
421
422	err = ceph_encode_encrypted_dname(parent: inode, d_name: &d_name,
423	buf: req->r_path2);
424	if (err < `0`) {
425	ceph_mdsc_put_request(req);
426	return err;
427	}
428	} else if (is_hash_order(p: ctx->pos)) {
429	req->r_args.readdir.offset_hash =
430	cpu_to_le32(fpos_hash(ctx->pos));
431	}
432
433	req->r_dir_release_cnt = dfi->dir_release_count;
434	req->r_dir_ordered_cnt = dfi->dir_ordered_count;
435	req->r_readdir_cache_idx = dfi->readdir_cache_idx;
436	req->r_readdir_offset = dfi->next_offset;
437	req->r_args.readdir.frag = cpu_to_le32(frag);
438	req->r_args.readdir.flags =
439	cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
440
441	req->r_inode = inode;
442	ihold(inode);
443	req->r_dentry = dget(dentry: file->f_path.dentry);
444	err = ceph_mdsc_do_request(mdsc, NULL, req);
445	if (err < `0`) {
446	ceph_mdsc_put_request(req);
447	return err;
448	}
449	dout("readdir got and parsed readdir result=%d on "
450	"frag %x, end=%d, complete=%d, hash_order=%d\n",
451	err, frag,
452	(int)req->r_reply_info.dir_end,
453	(int)req->r_reply_info.dir_complete,
454	(int)req->r_reply_info.hash_order);
455
456	rinfo = &req->r_reply_info;
457	if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
458	frag = le32_to_cpu(rinfo->dir_dir->frag);
459	if (!rinfo->hash_order) {
460	dfi->next_offset = req->r_readdir_offset;
461	/ adjust ctx->pos to beginning of frag /
462	ctx->pos = ceph_make_fpos(high: frag,
463	off: dfi->next_offset,
464	hash_order: false);
465	}
466	}
467
468	dfi->frag = frag;
469	dfi->last_readdir = req;
470
471	if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) {
472	dfi->readdir_cache_idx = req->r_readdir_cache_idx;
473	if (dfi->readdir_cache_idx < `0`) {
474	/ preclude from marking dir ordered /
475	dfi->dir_ordered_count = `0`;
476	} else if (ceph_frag_is_leftmost(f: frag) &&
477	dfi->next_offset == `2`) {
478	/ note dir version at start of readdir so*
479	* we can tell if any dentries get dropped */
480	dfi->dir_release_count = req->r_dir_release_cnt;
481	dfi->dir_ordered_count = req->r_dir_ordered_cnt;
482	}
483	} else {
484	dout("readdir !did_prepopulate\n");
485	/ disable readdir cache /
486	dfi->readdir_cache_idx = -`1`;
487	/ preclude from marking dir complete /
488	dfi->dir_release_count = `0`;
489	}
490
491	/ note next offset and last dentry name /
492	if (rinfo->dir_nr > `0`) {
493	struct ceph_mds_reply_dir_entry *rde =
494	rinfo->dir_entries + (rinfo->dir_nr-`1`);
495	unsigned next_offset = req->r_reply_info.dir_end ?
496	`2` : (fpos_off(p: rde->offset) + `1`);
497	err = note_last_dentry(dfi, name: rde->name, len: rde->name_len,
498	next_offset);
499	if (err) {
500	ceph_mdsc_put_request(req: dfi->last_readdir);
501	dfi->last_readdir = NULL;
502	return err;
503	}
504	} else if (req->r_reply_info.dir_end) {
505	dfi->next_offset = `2`;
506	/ keep last name /
507	}
508	}
509
510	rinfo = &dfi->last_readdir->r_reply_info;
511	dout("readdir frag %x num %d pos %llx chunk first %llx\n",
512	dfi->frag, rinfo->dir_nr, ctx->pos,
513	rinfo->dir_nr ? rinfo->dir_entries[`0`].offset : `0LL`);
514
515	i = `0`;
516	/ search start position /
517	if (rinfo->dir_nr > `0`) {
518	int step, nr = rinfo->dir_nr;
519	while (nr > `0`) {
520	step = nr >> `1`;
521	if (rinfo->dir_entries[i + step].offset < ctx->pos) {
522	i += step + `1`;
523	nr -= step + `1`;
524	} else {
525	nr = step;
526	}
527	}
528	}
529	for (; i < rinfo->dir_nr; i++) {
530	struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
531
532	if (rde->offset < ctx->pos) {
533	pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
534	__func__, rde->offset, ctx->pos);
535	return -EIO;
536	}
537
538	if (WARN_ON_ONCE(!rde->inode.in))
539	return -EIO;
540
541	ctx->pos = rde->offset;
542	dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
543	i, rinfo->dir_nr, ctx->pos,
544	rde->name_len, rde->name, &rde->inode.in);
545
546	if (!dir_emit(ctx, name: rde->name, namelen: rde->name_len,
547	ino: ceph_present_ino(sb: inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
548	le32_to_cpu(rde->inode.in->mode) >> `12`)) {
549	/*
550	* NOTE: Here no need to put the 'dfi->last_readdir',
551	* because when dir_emit stops us it's most likely
552	* doesn't have enough memory, etc. So for next readdir
553	* it will continue.
554	*/
555	dout("filldir stopping us...\n");
556	return `0`;
557	}
558
559	/ Reset the lengths to their original allocated vals /
560	ctx->pos++;
561	}
562
563	ceph_mdsc_put_request(req: dfi->last_readdir);
564	dfi->last_readdir = NULL;
565
566	if (dfi->next_offset > `2`) {
567	frag = dfi->frag;
568	goto more;
569	}
570
571	/ more frags? /
572	if (!ceph_frag_is_rightmost(f: dfi->frag)) {
573	frag = ceph_frag_next(f: dfi->frag);
574	if (is_hash_order(p: ctx->pos)) {
575	loff_t new_pos = ceph_make_fpos(high: ceph_frag_value(f: frag),
576	off: dfi->next_offset, hash_order: true);
577	if (new_pos > ctx->pos)
578	ctx->pos = new_pos;
579	/ keep last_name /
580	} else {
581	ctx->pos = ceph_make_fpos(high: frag, off: dfi->next_offset,
582	hash_order: false);
583	kfree(objp: dfi->last_name);
584	dfi->last_name = NULL;
585	}
586	dout("readdir next frag is %x\n", frag);
587	goto more;
588	}
589	dfi->file_info.flags \|= CEPH_F_ATEND;
590
591	/*
592	* if dir_release_count still matches the dir, no dentries
593	* were released during the whole readdir, and we should have
594	* the complete dir contents in our cache.
595	*/
596	if (atomic64_read(v: &ci->i_release_count) ==
597	dfi->dir_release_count) {
598	spin_lock(lock: &ci->i_ceph_lock);
599	if (dfi->dir_ordered_count ==
600	atomic64_read(v: &ci->i_ordered_count)) {
601	dout(" marking %p complete and ordered\n", inode);
602	/ use i_size to track number of entries in*
603	* readdir cache */
604	BUG_ON(dfi->readdir_cache_idx < `0`);
605	i_size_write(inode, i_size: dfi->readdir_cache_idx *
606	sizeof(struct dentry*));
607	} else {
608	dout(" marking %p complete\n", inode);
609	}
610	__ceph_dir_set_complete(ci, release_count: dfi->dir_release_count,
611	ordered_count: dfi->dir_ordered_count);
612	spin_unlock(lock: &ci->i_ceph_lock);
613	}
614	dout("readdir %p file %p done.\n", inode, file);
615	return `0`;
616	}
617
618	static void reset_readdir(struct ceph_dir_file_info *dfi)
619	{
620	if (dfi->last_readdir) {
621	ceph_mdsc_put_request(req: dfi->last_readdir);
622	dfi->last_readdir = NULL;
623	}
624	kfree(objp: dfi->last_name);
625	dfi->last_name = NULL;
626	dfi->dir_release_count = `0`;
627	dfi->readdir_cache_idx = -`1`;
628	dfi->next_offset = `2`; / compensate for . and .. /
629	dfi->file_info.flags &= ~CEPH_F_ATEND;
630	}
631
632	/*
633	* discard buffered readdir content on seekdir(0), or seek to new frag,
634	* or seek prior to current chunk
635	*/
636	static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos)
637	{
638	struct ceph_mds_reply_info_parsed *rinfo;
639	loff_t chunk_offset;
640	if (new_pos == `0`)
641	return true;
642	if (is_hash_order(p: new_pos)) {
643	/ no need to reset last_name for a forward seek when*
644	* dentries are sotred in hash order */
645	} else if (dfi->frag != fpos_frag(p: new_pos)) {
646	return true;
647	}
648	rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL;
649	if (!rinfo \|\| !rinfo->dir_nr)
650	return true;
651	chunk_offset = rinfo->dir_entries[`0`].offset;
652	return new_pos < chunk_offset \|\|
653	is_hash_order(p: new_pos) != is_hash_order(p: chunk_offset);
654	}
655
656	static loff_t ceph_dir_llseek(struct file file, loff_t offset, int* whence)
657	{
658	struct ceph_dir_file_info *dfi = file->private_data;
659	struct inode *inode = file->f_mapping->host;
660	loff_t retval;
661
662	inode_lock(inode);
663	retval = -EINVAL;
664	switch (whence) {
665	case SEEK_CUR:
666	offset += file->f_pos;
667	break;
668	case SEEK_SET:
669	break;
670	case SEEK_END:
671	retval = -EOPNOTSUPP;
672	goto out;
673	default:
674	goto out;
675	}
676
677	if (offset >= `0`) {
678	if (need_reset_readdir(dfi, new_pos: offset)) {
679	dout("dir_llseek dropping %p content\n", file);
680	reset_readdir(dfi);
681	} else if (is_hash_order(p: offset) && offset > file->f_pos) {
682	/ for hash offset, we don't know if a forward seek*
683	* is within same frag */
684	dfi->dir_release_count = `0`;
685	dfi->readdir_cache_idx = -`1`;
686	}
687
688	if (offset != file->f_pos) {
689	file->f_pos = offset;
690	file->f_version = `0`;
691	dfi->file_info.flags &= ~CEPH_F_ATEND;
692	}
693	retval = offset;
694	}
695	out:
696	inode_unlock(inode);
697	return retval;
698	}
699
700	/*
701	* Handle lookups for the hidden .snap directory.
702	*/
703	struct dentry ceph_handle_snapdir(struct* ceph_mds_request *req,
704	struct dentry *dentry)
705	{
706	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dentry->d_sb);
707	struct inode parent = d_inode(dentry: dentry->d_parent); /* we hold i_rwsem /
708
709	/ .snap dir? /
710	if (ceph_snap(inode: parent) == CEPH_NOSNAP &&
711	strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == `0`) {
712	struct dentry *res;
713	struct inode *inode = ceph_get_snapdir(parent);
714
715	res = d_splice_alias(inode, dentry);
716	dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n",
717	dentry, dentry, inode, res);
718	if (res)
719	dentry = res;
720	}
721	return dentry;
722	}
723
724	/*
725	* Figure out final result of a lookup/open request.
726	*
727	* Mainly, make sure we return the final req->r_dentry (if it already
728	* existed) in place of the original VFS-provided dentry when they
729	* differ.
730	*
731	* Gracefully handle the case where the MDS replies with -ENOENT and
732	* no trace (which it may do, at its discretion, e.g., if it doesn't
733	* care to issue a lease on the negative dentry).
734	*/
735	struct dentry ceph_finish_lookup(struct* ceph_mds_request *req,
736	struct dentry dentry, int* err)
737	{
738	if (err == -ENOENT) {
739	/ no trace? /
740	err = `0`;
741	if (!req->r_reply_info.head->is_dentry) {
742	dout("ENOENT and no trace, dentry %p inode %p\n",
743	dentry, d_inode(dentry));
744	if (d_really_is_positive(dentry)) {
745	d_drop(dentry);
746	err = -ENOENT;
747	} else {
748	d_add(dentry, NULL);
749	}
750	}
751	}
752	if (err)
753	dentry = ERR_PTR(error: err);
754	else if (dentry != req->r_dentry)
755	dentry = dget(dentry: req->r_dentry); / we got spliced /
756	else
757	dentry = NULL;
758	return dentry;
759	}
760
761	static bool is_root_ceph_dentry(struct inode inode, struct* dentry *dentry)
762	{
763	return ceph_ino(inode) == CEPH_INO_ROOT &&
764	strncmp(dentry->d_name.name, ".ceph", `5`) == `0`;
765	}
766
767	/*
768	* Look up a single dir entry. If there is a lookup intent, inform
769	* the MDS so that it gets our 'caps wanted' value in a single op.
770	*/
771	static struct dentry ceph_lookup(struct* inode dir, struct* dentry *dentry,
772	unsigned int flags)
773	{
774	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dir->i_sb);
775	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
776	struct ceph_mds_request *req;
777	int op;
778	int mask;
779	int err;
780
781	dout("lookup %p dentry %p '%pd'\n",
782	dir, dentry, dentry);
783
784	if (dentry->d_name.len > NAME_MAX)
785	return ERR_PTR(error: -ENAMETOOLONG);
786
787	if (IS_ENCRYPTED(dir)) {
788	bool had_key = fscrypt_has_encryption_key(inode: dir);
789
790	err = fscrypt_prepare_lookup_partial(dir, dentry);
791	if (err < `0`)
792	return ERR_PTR(error: err);
793
794	/ mark directory as incomplete if it has been unlocked /
795	if (!had_key && fscrypt_has_encryption_key(inode: dir))
796	ceph_dir_clear_complete(inode: dir);
797	}
798
799	/ can we conclude ENOENT locally? /
800	if (d_really_is_negative(dentry)) {
801	struct ceph_inode_info *ci = ceph_inode(inode: dir);
802	struct ceph_dentry_info *di = ceph_dentry(dentry);
803
804	spin_lock(lock: &ci->i_ceph_lock);
805	dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
806	if (strncmp(dentry->d_name.name,
807	fsc->mount_options->snapdir_name,
808	dentry->d_name.len) &&
809	!is_root_ceph_dentry(inode: dir, dentry) &&
810	ceph_test_mount_opt(fsc, DCACHE) &&
811	__ceph_dir_is_complete(ci) &&
812	__ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, t: `1`)) {
813	__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
814	spin_unlock(lock: &ci->i_ceph_lock);
815	dout(" dir %p complete, -ENOENT\n", dir);
816	d_add(dentry, NULL);
817	di->lease_shared_gen = atomic_read(v: &ci->i_shared_gen);
818	return NULL;
819	}
820	spin_unlock(lock: &ci->i_ceph_lock);
821	}
822
823	op = ceph_snap(inode: dir) == CEPH_SNAPDIR ?
824	CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
825	req = ceph_mdsc_create_request(mdsc, op, mode: USE_ANY_MDS);
826	if (IS_ERR(ptr: req))
827	return ERR_CAST(ptr: req);
828	req->r_dentry = dget(dentry);
829	req->r_num_caps = `2`;
830
831	mask = CEPH_STAT_CAP_INODE \| CEPH_CAP_AUTH_SHARED;
832	if (ceph_security_xattr_wanted(in: dir))
833	mask \|= CEPH_CAP_XATTR_SHARED;
834	req->r_args.getattr.mask = cpu_to_le32(mask);
835
836	ihold(inode: dir);
837	req->r_parent = dir;
838	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
839	err = ceph_mdsc_do_request(mdsc, NULL, req);
840	if (err == -ENOENT) {
841	struct dentry *res;
842
843	res = ceph_handle_snapdir(req, dentry);
844	if (IS_ERR(ptr: res)) {
845	err = PTR_ERR(ptr: res);
846	} else {
847	dentry = res;
848	err = `0`;
849	}
850	}
851	dentry = ceph_finish_lookup(req, dentry, err);
852	ceph_mdsc_put_request(req); / will dput(dentry) /
853	dout("lookup result=%p\n", dentry);
854	return dentry;
855	}
856
857	/*
858	* If we do a create but get no trace back from the MDS, follow up with
859	* a lookup (the VFS expects us to link up the provided dentry).
860	*/
861	int ceph_handle_notrace_create(struct inode dir, struct* dentry *dentry)
862	{
863	struct dentry *result = ceph_lookup(dir, dentry, flags: `0`);
864
865	if (result && !IS_ERR(ptr: result)) {
866	/*
867	* We created the item, then did a lookup, and found
868	* it was already linked to another inode we already
869	* had in our cache (and thus got spliced). To not
870	* confuse VFS (especially when inode is a directory),
871	* we don't link our dentry to that inode, return an
872	* error instead.
873	*
874	* This event should be rare and it happens only when
875	* we talk to old MDS. Recent MDS does not send traceless
876	* reply for request that creates new inode.
877	*/
878	d_drop(dentry: result);
879	return -ESTALE;
880	}
881	return PTR_ERR(ptr: result);
882	}
883
884	static int ceph_mknod(struct mnt_idmap idmap, struct* inode *dir,
885	struct dentry *dentry, umode_t mode, dev_t rdev)
886	{
887	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
888	struct ceph_mds_request *req;
889	struct ceph_acl_sec_ctx as_ctx = {};
890	int err;
891
892	if (ceph_snap(inode: dir) != CEPH_NOSNAP)
893	return -EROFS;
894
895	err = ceph_wait_on_conflict_unlink(dentry);
896	if (err)
897	return err;
898
899	if (ceph_quota_is_max_files_exceeded(inode: dir)) {
900	err = -EDQUOT;
901	goto out;
902	}
903
904	dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
905	dir, dentry, mode, rdev);
906	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_MKNOD, mode: USE_AUTH_MDS);
907	if (IS_ERR(ptr: req)) {
908	err = PTR_ERR(ptr: req);
909	goto out;
910	}
911
912	req->r_new_inode = ceph_new_inode(dir, dentry, mode: &mode, as_ctx: &as_ctx);
913	if (IS_ERR(ptr: req->r_new_inode)) {
914	err = PTR_ERR(ptr: req->r_new_inode);
915	req->r_new_inode = NULL;
916	goto out_req;
917	}
918
919	if (S_ISREG(mode) && IS_ENCRYPTED(dir))
920	set_bit(CEPH_MDS_R_FSCRYPT_FILE, addr: &req->r_req_flags);
921
922	req->r_dentry = dget(dentry);
923	req->r_num_caps = `2`;
924	req->r_parent = dir;
925	ihold(inode: dir);
926	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
927	req->r_args.mknod.mode = cpu_to_le32(mode);
928	req->r_args.mknod.rdev = cpu_to_le32(rdev);
929	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_AUTH_EXCL \|
930	CEPH_CAP_XATTR_EXCL;
931	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
932
933	ceph_as_ctx_to_req(req, as_ctx: &as_ctx);
934
935	err = ceph_mdsc_do_request(mdsc, dir, req);
936	if (!err && !req->r_reply_info.head->is_dentry)
937	err = ceph_handle_notrace_create(dir, dentry);
938	out_req:
939	ceph_mdsc_put_request(req);
940	out:
941	if (!err)
942	ceph_init_inode_acls(inode: d_inode(dentry), as_ctx: &as_ctx);
943	else
944	d_drop(dentry);
945	ceph_release_acl_sec_ctx(as_ctx: &as_ctx);
946	return err;
947	}
948
949	static int ceph_create(struct mnt_idmap idmap, struct* inode *dir,
950	struct dentry *dentry, umode_t mode, bool excl)
951	{
952	return ceph_mknod(idmap, dir, dentry, mode, rdev: `0`);
953	}
954
955	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
956	static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
957	const char *dest)
958	{
959	int err;
960	int len = strlen(dest);
961	struct fscrypt_str osd_link = FSTR_INIT(NULL, `0`);
962
963	err = fscrypt_prepare_symlink(dir: req->r_parent, target: dest, len, PATH_MAX,
964	disk_link: &osd_link);
965	if (err)
966	goto out;
967
968	err = fscrypt_encrypt_symlink(inode: req->r_new_inode, target: dest, len, disk_link: &osd_link);
969	if (err)
970	goto out;
971
972	req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + `1`, GFP_KERNEL);
973	if (!req->r_path2) {
974	err = -ENOMEM;
975	goto out;
976	}
977
978	len = ceph_base64_encode(src: osd_link.name, srclen: osd_link.len, dst: req->r_path2);
979	req->r_path2[len] = `'\0'`;
980	out:
981	fscrypt_fname_free_buffer(crypto_str: &osd_link);
982	return err;
983	}
984	#else
985	static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
986	const char *dest)
987	{
988	return -EOPNOTSUPP;
989	}
990	#endif
991
992	static int ceph_symlink(struct mnt_idmap idmap, struct* inode *dir,
993	struct dentry dentry, const* char *dest)
994	{
995	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
996	struct ceph_mds_request *req;
997	struct ceph_acl_sec_ctx as_ctx = {};
998	umode_t mode = S_IFLNK \| `0777`;
999	int err;
1000
1001	if (ceph_snap(inode: dir) != CEPH_NOSNAP)
1002	return -EROFS;
1003
1004	err = ceph_wait_on_conflict_unlink(dentry);
1005	if (err)
1006	return err;
1007
1008	if (ceph_quota_is_max_files_exceeded(inode: dir)) {
1009	err = -EDQUOT;
1010	goto out;
1011	}
1012
1013	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
1014	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_SYMLINK, mode: USE_AUTH_MDS);
1015	if (IS_ERR(ptr: req)) {
1016	err = PTR_ERR(ptr: req);
1017	goto out;
1018	}
1019
1020	req->r_new_inode = ceph_new_inode(dir, dentry, mode: &mode, as_ctx: &as_ctx);
1021	if (IS_ERR(ptr: req->r_new_inode)) {
1022	err = PTR_ERR(ptr: req->r_new_inode);
1023	req->r_new_inode = NULL;
1024	goto out_req;
1025	}
1026
1027	req->r_parent = dir;
1028	ihold(inode: dir);
1029
1030	if (IS_ENCRYPTED(req->r_new_inode)) {
1031	err = prep_encrypted_symlink_target(req, dest);
1032	if (err)
1033	goto out_req;
1034	} else {
1035	req->r_path2 = kstrdup(s: dest, GFP_KERNEL);
1036	if (!req->r_path2) {
1037	err = -ENOMEM;
1038	goto out_req;
1039	}
1040	}
1041
1042	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
1043	req->r_dentry = dget(dentry);
1044	req->r_num_caps = `2`;
1045	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_AUTH_EXCL \|
1046	CEPH_CAP_XATTR_EXCL;
1047	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1048
1049	ceph_as_ctx_to_req(req, as_ctx: &as_ctx);
1050
1051	err = ceph_mdsc_do_request(mdsc, dir, req);
1052	if (!err && !req->r_reply_info.head->is_dentry)
1053	err = ceph_handle_notrace_create(dir, dentry);
1054	out_req:
1055	ceph_mdsc_put_request(req);
1056	out:
1057	if (err)
1058	d_drop(dentry);
1059	ceph_release_acl_sec_ctx(as_ctx: &as_ctx);
1060	return err;
1061	}
1062
1063	static int ceph_mkdir(struct mnt_idmap idmap, struct* inode *dir,
1064	struct dentry *dentry, umode_t mode)
1065	{
1066	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
1067	struct ceph_mds_request *req;
1068	struct ceph_acl_sec_ctx as_ctx = {};
1069	int err;
1070	int op;
1071
1072	err = ceph_wait_on_conflict_unlink(dentry);
1073	if (err)
1074	return err;
1075
1076	if (ceph_snap(inode: dir) == CEPH_SNAPDIR) {
1077	/ mkdir .snap/foo is a MKSNAP /
1078	op = CEPH_MDS_OP_MKSNAP;
1079	dout("mksnap dir %p snap '%pd' dn %p\n", dir,
1080	dentry, dentry);
1081	} else if (ceph_snap(inode: dir) == CEPH_NOSNAP) {
1082	dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
1083	op = CEPH_MDS_OP_MKDIR;
1084	} else {
1085	err = -EROFS;
1086	goto out;
1087	}
1088
1089	if (op == CEPH_MDS_OP_MKDIR &&
1090	ceph_quota_is_max_files_exceeded(inode: dir)) {
1091	err = -EDQUOT;
1092	goto out;
1093	}
1094	if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
1095	!fscrypt_has_encryption_key(inode: dir)) {
1096	err = -ENOKEY;
1097	goto out;
1098	}
1099
1100
1101	req = ceph_mdsc_create_request(mdsc, op, mode: USE_AUTH_MDS);
1102	if (IS_ERR(ptr: req)) {
1103	err = PTR_ERR(ptr: req);
1104	goto out;
1105	}
1106
1107	mode \|= S_IFDIR;
1108	req->r_new_inode = ceph_new_inode(dir, dentry, mode: &mode, as_ctx: &as_ctx);
1109	if (IS_ERR(ptr: req->r_new_inode)) {
1110	err = PTR_ERR(ptr: req->r_new_inode);
1111	req->r_new_inode = NULL;
1112	goto out_req;
1113	}
1114
1115	req->r_dentry = dget(dentry);
1116	req->r_num_caps = `2`;
1117	req->r_parent = dir;
1118	ihold(inode: dir);
1119	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
1120	req->r_args.mkdir.mode = cpu_to_le32(mode);
1121	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_AUTH_EXCL \|
1122	CEPH_CAP_XATTR_EXCL;
1123	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1124
1125	ceph_as_ctx_to_req(req, as_ctx: &as_ctx);
1126
1127	err = ceph_mdsc_do_request(mdsc, dir, req);
1128	if (!err &&
1129	!req->r_reply_info.head->is_target &&
1130	!req->r_reply_info.head->is_dentry)
1131	err = ceph_handle_notrace_create(dir, dentry);
1132	out_req:
1133	ceph_mdsc_put_request(req);
1134	out:
1135	if (!err)
1136	ceph_init_inode_acls(inode: d_inode(dentry), as_ctx: &as_ctx);
1137	else
1138	d_drop(dentry);
1139	ceph_release_acl_sec_ctx(as_ctx: &as_ctx);
1140	return err;
1141	}
1142
1143	static int ceph_link(struct dentry old_dentry, struct* inode *dir,
1144	struct dentry *dentry)
1145	{
1146	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: dir->i_sb);
1147	struct ceph_mds_request *req;
1148	int err;
1149
1150	if (dentry->d_flags & DCACHE_DISCONNECTED)
1151	return -EINVAL;
1152
1153	err = ceph_wait_on_conflict_unlink(dentry);
1154	if (err)
1155	return err;
1156
1157	if (ceph_snap(inode: dir) != CEPH_NOSNAP)
1158	return -EROFS;
1159
1160	err = fscrypt_prepare_link(old_dentry, dir, dentry);
1161	if (err)
1162	return err;
1163
1164	dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
1165	dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
1166	req = ceph_mdsc_create_request(mdsc, op: CEPH_MDS_OP_LINK, mode: USE_AUTH_MDS);
1167	if (IS_ERR(ptr: req)) {
1168	d_drop(dentry);
1169	return PTR_ERR(ptr: req);
1170	}
1171	req->r_dentry = dget(dentry);
1172	req->r_num_caps = `2`;
1173	req->r_old_dentry = dget(dentry: old_dentry);
1174	/*
1175	* The old_dentry maybe a DCACHE_DISCONNECTED dentry, then we
1176	* will just pass the ino# to MDSs.
1177	*/
1178	if (old_dentry->d_flags & DCACHE_DISCONNECTED)
1179	req->r_ino2 = ceph_vino(inode: d_inode(dentry: old_dentry));
1180	req->r_parent = dir;
1181	ihold(inode: dir);
1182	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
1183	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_XATTR_EXCL;
1184	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1185	/ release LINK_SHARED on source inode (mds will lock it) /
1186	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL;
1187	err = ceph_mdsc_do_request(mdsc, dir, req);
1188	if (err) {
1189	d_drop(dentry);
1190	} else if (!req->r_reply_info.head->is_dentry) {
1191	ihold(inode: d_inode(dentry: old_dentry));
1192	d_instantiate(dentry, d_inode(dentry: old_dentry));
1193	}
1194	ceph_mdsc_put_request(req);
1195	return err;
1196	}
1197
1198	static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
1199	struct ceph_mds_request *req)
1200	{
1201	struct dentry *dentry = req->r_dentry;
1202	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dentry->d_sb);
1203	struct ceph_dentry_info *di = ceph_dentry(dentry);
1204	int result = req->r_err ? req->r_err :
1205	le32_to_cpu(req->r_reply_info.head->result);
1206
1207	if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
1208	pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
1209	__func__, dentry, dentry);
1210
1211	spin_lock(lock: &fsc->async_unlink_conflict_lock);
1212	hash_del_rcu(node: &di->hnode);
1213	spin_unlock(lock: &fsc->async_unlink_conflict_lock);
1214
1215	spin_lock(lock: &dentry->d_lock);
1216	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
1217	wake_up_bit(word: &di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
1218	spin_unlock(lock: &dentry->d_lock);
1219
1220	synchronize_rcu();
1221
1222	if (result == -EJUKEBOX)
1223	goto out;
1224
1225	/ If op failed, mark everyone involved for errors /
1226	if (result) {
1227	int pathlen = `0`;
1228	u64 base = `0`;
1229	char *path = ceph_mdsc_build_path(dentry, plen: &pathlen,
1230	base: &base, for_wire: `0`);
1231
1232	/ mark error on parent + clear complete /
1233	mapping_set_error(mapping: req->r_parent->i_mapping, error: result);
1234	ceph_dir_clear_complete(inode: req->r_parent);
1235
1236	/ drop the dentry -- we don't know its status /
1237	if (!d_unhashed(dentry))
1238	d_drop(dentry);
1239
1240	/ mark inode itself for an error (since metadata is bogus) /
1241	mapping_set_error(mapping: req->r_old_inode->i_mapping, error: result);
1242
1243	pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
1244	base, IS_ERR(path) ? "<<bad>>" : path, result);
1245	ceph_mdsc_free_path(path, len: pathlen);
1246	}
1247	out:
1248	iput(req->r_old_inode);
1249	ceph_mdsc_release_dir_caps(req);
1250	}
1251
1252	static int get_caps_for_async_unlink(struct inode dir, struct* dentry *dentry)
1253	{
1254	struct ceph_inode_info *ci = ceph_inode(inode: dir);
1255	struct ceph_dentry_info *di;
1256	int got = `0`, want = CEPH_CAP_FILE_EXCL \| CEPH_CAP_DIR_UNLINK;
1257
1258	spin_lock(lock: &ci->i_ceph_lock);
1259	if ((__ceph_caps_issued(ci, NULL) & want) == want) {
1260	ceph_take_cap_refs(ci, caps: want, snap_rwsem_locked: false);
1261	got = want;
1262	}
1263	spin_unlock(lock: &ci->i_ceph_lock);
1264
1265	/ If we didn't get anything, return 0 /
1266	if (!got)
1267	return `0`;
1268
1269	spin_lock(lock: &dentry->d_lock);
1270	di = ceph_dentry(dentry);
1271	/*
1272	* - We are holding Fx, which implies Fs caps.
1273	* - Only support async unlink for primary linkage
1274	*/
1275	if (atomic_read(v: &ci->i_shared_gen) != di->lease_shared_gen \|\|
1276	!(di->flags & CEPH_DENTRY_PRIMARY_LINK))
1277	want = `0`;
1278	spin_unlock(lock: &dentry->d_lock);
1279
1280	/ Do we still want what we've got? /
1281	if (want == got)
1282	return got;
1283
1284	ceph_put_cap_refs(ci, had: got);
1285	return `0`;
1286	}
1287
1288	/*
1289	* rmdir and unlink are differ only by the metadata op code
1290	*/
1291	static int ceph_unlink(struct inode dir, struct* dentry *dentry)
1292	{
1293	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dir->i_sb);
1294	struct ceph_mds_client *mdsc = fsc->mdsc;
1295	struct inode *inode = d_inode(dentry);
1296	struct ceph_mds_request *req;
1297	bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
1298	int err = -EROFS;
1299	int op;
1300
1301	if (ceph_snap(inode: dir) == CEPH_SNAPDIR) {
1302	/ rmdir .snap/foo is RMSNAP /
1303	dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry);
1304	op = CEPH_MDS_OP_RMSNAP;
1305	} else if (ceph_snap(inode: dir) == CEPH_NOSNAP) {
1306	dout("unlink/rmdir dir %p dn %p inode %p\n",
1307	dir, dentry, inode);
1308	op = d_is_dir(dentry) ?
1309	CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
1310	} else
1311	goto out;
1312	retry:
1313	req = ceph_mdsc_create_request(mdsc, op, mode: USE_AUTH_MDS);
1314	if (IS_ERR(ptr: req)) {
1315	err = PTR_ERR(ptr: req);
1316	goto out;
1317	}
1318	req->r_dentry = dget(dentry);
1319	req->r_num_caps = `2`;
1320	req->r_parent = dir;
1321	ihold(inode: dir);
1322	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_XATTR_EXCL;
1323	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1324	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
1325
1326	if (try_async && op == CEPH_MDS_OP_UNLINK &&
1327	(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
1328	struct ceph_dentry_info *di = ceph_dentry(dentry);
1329
1330	dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
1331	dentry->d_name.len, dentry->d_name.name,
1332	ceph_cap_string(req->r_dir_caps));
1333	set_bit(CEPH_MDS_R_ASYNC, addr: &req->r_req_flags);
1334	req->r_callback = ceph_async_unlink_cb;
1335	req->r_old_inode = d_inode(dentry);
1336	ihold(inode: req->r_old_inode);
1337
1338	spin_lock(lock: &dentry->d_lock);
1339	di->flags \|= CEPH_DENTRY_ASYNC_UNLINK;
1340	spin_unlock(lock: &dentry->d_lock);
1341
1342	spin_lock(lock: &fsc->async_unlink_conflict_lock);
1343	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
1344	dentry->d_name.hash);
1345	spin_unlock(lock: &fsc->async_unlink_conflict_lock);
1346
1347	err = ceph_mdsc_submit_request(mdsc, dir, req);
1348	if (!err) {
1349	/*
1350	* We have enough caps, so we assume that the unlink
1351	* will succeed. Fix up the target inode and dcache.
1352	*/
1353	drop_nlink(inode);
1354	d_delete(dentry);
1355	} else {
1356	spin_lock(lock: &fsc->async_unlink_conflict_lock);
1357	hash_del_rcu(node: &di->hnode);
1358	spin_unlock(lock: &fsc->async_unlink_conflict_lock);
1359
1360	spin_lock(lock: &dentry->d_lock);
1361	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
1362	spin_unlock(lock: &dentry->d_lock);
1363
1364	if (err == -EJUKEBOX) {
1365	try_async = false;
1366	ceph_mdsc_put_request(req);
1367	goto retry;
1368	}
1369	}
1370	} else {
1371	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
1372	err = ceph_mdsc_do_request(mdsc, dir, req);
1373	if (!err && !req->r_reply_info.head->is_dentry)
1374	d_delete(dentry);
1375	}
1376
1377	ceph_mdsc_put_request(req);
1378	out:
1379	return err;
1380	}
1381
1382	static int ceph_rename(struct mnt_idmap idmap, struct* inode *old_dir,
1383	struct dentry old_dentry, struct* inode *new_dir,
1384	struct dentry new_dentry, unsigned* int flags)
1385	{
1386	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: old_dir->i_sb);
1387	struct ceph_mds_request *req;
1388	int op = CEPH_MDS_OP_RENAME;
1389	int err;
1390
1391	if (flags)
1392	return -EINVAL;
1393
1394	if (ceph_snap(inode: old_dir) != ceph_snap(inode: new_dir))
1395	return -EXDEV;
1396	if (ceph_snap(inode: old_dir) != CEPH_NOSNAP) {
1397	if (old_dir == new_dir && ceph_snap(inode: old_dir) == CEPH_SNAPDIR)
1398	op = CEPH_MDS_OP_RENAMESNAP;
1399	else
1400	return -EROFS;
1401	}
1402	/ don't allow cross-quota renames /
1403	if ((old_dir != new_dir) &&
1404	(!ceph_quota_is_same_realm(old: old_dir, new: new_dir)))
1405	return -EXDEV;
1406
1407	err = ceph_wait_on_conflict_unlink(dentry: new_dentry);
1408	if (err)
1409	return err;
1410
1411	err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
1412	flags);
1413	if (err)
1414	return err;
1415
1416	dout("rename dir %p dentry %p to dir %p dentry %p\n",
1417	old_dir, old_dentry, new_dir, new_dentry);
1418	req = ceph_mdsc_create_request(mdsc, op, mode: USE_AUTH_MDS);
1419	if (IS_ERR(ptr: req))
1420	return PTR_ERR(ptr: req);
1421	ihold(inode: old_dir);
1422	req->r_dentry = dget(dentry: new_dentry);
1423	req->r_num_caps = `2`;
1424	req->r_old_dentry = dget(dentry: old_dentry);
1425	req->r_old_dentry_dir = old_dir;
1426	req->r_parent = new_dir;
1427	ihold(inode: new_dir);
1428	set_bit(CEPH_MDS_R_PARENT_LOCKED, addr: &req->r_req_flags);
1429	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_XATTR_EXCL;
1430	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
1431	req->r_dentry_drop = CEPH_CAP_FILE_SHARED \| CEPH_CAP_XATTR_EXCL;
1432	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
1433	/ release LINK_RDCACHE on source inode (mds will lock it) /
1434	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL;
1435	if (d_really_is_positive(dentry: new_dentry)) {
1436	req->r_inode_drop =
1437	ceph_drop_caps_for_unlink(inode: d_inode(dentry: new_dentry));
1438	}
1439	err = ceph_mdsc_do_request(mdsc, dir: old_dir, req);
1440	if (!err && !req->r_reply_info.head->is_dentry) {
1441	/*
1442	* Normally d_move() is done by fill_trace (called by
1443	* do_request, above). If there is no trace, we need
1444	* to do it here.
1445	*/
1446	d_move(old_dentry, new_dentry);
1447	}
1448	ceph_mdsc_put_request(req);
1449	return err;
1450	}
1451
1452	/*
1453	* Move dentry to tail of mdsc->dentry_leases list when lease is updated.
1454	* Leases at front of the list will expire first. (Assume all leases have
1455	* similar duration)
1456	*
1457	* Called under dentry->d_lock.
1458	*/
1459	void __ceph_dentry_lease_touch(struct ceph_dentry_info *di)
1460	{
1461	struct dentry *dn = di->dentry;
1462	struct ceph_mds_client *mdsc;
1463
1464	dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn);
1465
1466	di->flags \|= CEPH_DENTRY_LEASE_LIST;
1467	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
1468	di->flags \|= CEPH_DENTRY_REFERENCED;
1469	return;
1470	}
1471
1472	mdsc = ceph_sb_to_client(sb: dn->d_sb)->mdsc;
1473	spin_lock(lock: &mdsc->dentry_list_lock);
1474	list_move_tail(list: &di->lease_list, head: &mdsc->dentry_leases);
1475	spin_unlock(lock: &mdsc->dentry_list_lock);
1476	}
1477
1478	static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc,
1479	struct ceph_dentry_info *di)
1480	{
1481	di->flags &= ~(CEPH_DENTRY_LEASE_LIST \| CEPH_DENTRY_REFERENCED);
1482	di->lease_gen = `0`;
1483	di->time = jiffies;
1484	list_move_tail(list: &di->lease_list, head: &mdsc->dentry_dir_leases);
1485	}
1486
1487	/*
1488	* When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases
1489	* list if it's not in the list, otherwise set 'referenced' flag.
1490	*
1491	* Called under dentry->d_lock.
1492	*/
1493	void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
1494	{
1495	struct dentry *dn = di->dentry;
1496	struct ceph_mds_client *mdsc;
1497
1498	dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
1499	di, dn, dn, di->offset);
1500
1501	if (!list_empty(head: &di->lease_list)) {
1502	if (di->flags & CEPH_DENTRY_LEASE_LIST) {
1503	/ don't remove dentry from dentry lease list*
1504	* if its lease is valid */
1505	if (__dentry_lease_is_valid(di))
1506	return;
1507	} else {
1508	di->flags \|= CEPH_DENTRY_REFERENCED;
1509	return;
1510	}
1511	}
1512
1513	if (di->flags & CEPH_DENTRY_SHRINK_LIST) {
1514	di->flags \|= CEPH_DENTRY_REFERENCED;
1515	di->flags &= ~CEPH_DENTRY_LEASE_LIST;
1516	return;
1517	}
1518
1519	mdsc = ceph_sb_to_client(sb: dn->d_sb)->mdsc;
1520	spin_lock(lock: &mdsc->dentry_list_lock);
1521	__dentry_dir_lease_touch(mdsc, di),
1522	spin_unlock(lock: &mdsc->dentry_list_lock);
1523	}
1524
1525	static void __dentry_lease_unlist(struct ceph_dentry_info *di)
1526	{
1527	struct ceph_mds_client *mdsc;
1528	if (di->flags & CEPH_DENTRY_SHRINK_LIST)
1529	return;
1530	if (list_empty(head: &di->lease_list))
1531	return;
1532
1533	mdsc = ceph_sb_to_client(sb: di->dentry->d_sb)->mdsc;
1534	spin_lock(lock: &mdsc->dentry_list_lock);
1535	list_del_init(entry: &di->lease_list);
1536	spin_unlock(lock: &mdsc->dentry_list_lock);
1537	}
1538
/*
 * Verdicts returned by the lease-walk check callbacks.  DELETE, TOUCH and
 * STOP are distinct bits that __dentry_leases_walk() tests with '&';
 * KEEP is the absence of all of them.
 */
enum {
	KEEP	= 0,	/* leave the entry where it is */
	DELETE	= 1,	/* lease is stale: unlist or dispose of the dentry */
	TOUCH	= 2,	/* move the entry to the tail of the dir lease list */
	STOP	= 4,	/* end the walk after this entry */
};
1545
/* Parameters and scan budget shared between a lease walk and its callback. */
struct ceph_lease_walk_control {
	/* true: walk mdsc->dentry_dir_leases; false: mdsc->dentry_leases */
	bool dir_lease;
	/* read by __dir_lease_check(): when false it only returns TOUCH */
	bool expire_dir_lease;
	/* remaining entries the walk may visit; decremented per entry */
	unsigned long nr_to_scan;
	/* jiffies added to di->time before a dir lease is considered old */
	unsigned long dir_lease_ttl;
};
1552
1553	static unsigned long
1554	__dentry_leases_walk(struct ceph_mds_client *mdsc,
1555	struct ceph_lease_walk_control *lwc,
1556	int (check)(struct* dentry, void**))
1557	{
1558	struct ceph_dentry_info di, tmp;
1559	struct dentry dentry, last = NULL;
1560	struct list_head* list;
1561	LIST_HEAD(dispose);
1562	unsigned long freed = `0`;
1563	int ret = `0`;
1564
1565	list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases;
1566	spin_lock(lock: &mdsc->dentry_list_lock);
1567	list_for_each_entry_safe(di, tmp, list, lease_list) {
1568	if (!lwc->nr_to_scan)
1569	break;
1570	--lwc->nr_to_scan;
1571
1572	dentry = di->dentry;
1573	if (last == dentry)
1574	break;
1575
1576	if (!spin_trylock(lock: &dentry->d_lock))
1577	continue;
1578
1579	if (__lockref_is_dead(l: &dentry->d_lockref)) {
1580	list_del_init(entry: &di->lease_list);
1581	goto next;
1582	}
1583
1584	ret = check(dentry, lwc);
1585	if (ret & TOUCH) {
1586	/ move it into tail of dir lease list /
1587	__dentry_dir_lease_touch(mdsc, di);
1588	if (!last)
1589	last = dentry;
1590	}
1591	if (ret & DELETE) {
1592	/ stale lease /
1593	di->flags &= ~CEPH_DENTRY_REFERENCED;
1594	if (dentry->d_lockref.count > `0`) {
1595	/ update_dentry_lease() will re-add*
1596	* it to lease list, or
1597	* ceph_d_delete() will return 1 when
1598	* last reference is dropped */
1599	list_del_init(entry: &di->lease_list);
1600	} else {
1601	di->flags \|= CEPH_DENTRY_SHRINK_LIST;
1602	list_move_tail(list: &di->lease_list, head: &dispose);
1603	dget_dlock(dentry);
1604	}
1605	}
1606	next:
1607	spin_unlock(lock: &dentry->d_lock);
1608	if (ret & STOP)
1609	break;
1610	}
1611	spin_unlock(lock: &mdsc->dentry_list_lock);
1612
1613	while (!list_empty(head: &dispose)) {
1614	di = list_first_entry(&dispose, struct ceph_dentry_info,
1615	lease_list);
1616	dentry = di->dentry;
1617	spin_lock(lock: &dentry->d_lock);
1618
1619	list_del_init(entry: &di->lease_list);
1620	di->flags &= ~CEPH_DENTRY_SHRINK_LIST;
1621	if (di->flags & CEPH_DENTRY_REFERENCED) {
1622	spin_lock(lock: &mdsc->dentry_list_lock);
1623	if (di->flags & CEPH_DENTRY_LEASE_LIST) {
1624	list_add_tail(new: &di->lease_list,
1625	head: &mdsc->dentry_leases);
1626	} else {
1627	__dentry_dir_lease_touch(mdsc, di);
1628	}
1629	spin_unlock(lock: &mdsc->dentry_list_lock);
1630	} else {
1631	freed++;
1632	}
1633
1634	spin_unlock(lock: &dentry->d_lock);
1635	/ ceph_d_delete() does the trick /
1636	dput(dentry);
1637	}
1638	return freed;
1639	}
1640
1641	static int __dentry_lease_check(struct dentry dentry, void* *arg)
1642	{
1643	struct ceph_dentry_info *di = ceph_dentry(dentry);
1644	int ret;
1645
1646	if (__dentry_lease_is_valid(di))
1647	return STOP;
1648	ret = __dir_lease_try_check(dentry);
1649	if (ret == -EBUSY)
1650	return KEEP;
1651	if (ret > `0`)
1652	return TOUCH;
1653	return DELETE;
1654	}
1655
1656	static int __dir_lease_check(struct dentry dentry, void* *arg)
1657	{
1658	struct ceph_lease_walk_control *lwc = arg;
1659	struct ceph_dentry_info *di = ceph_dentry(dentry);
1660
1661	int ret = __dir_lease_try_check(dentry);
1662	if (ret == -EBUSY)
1663	return KEEP;
1664	if (ret > `0`) {
1665	if (time_before(jiffies, di->time + lwc->dir_lease_ttl))
1666	return STOP;
1667	/ Move dentry to tail of dir lease list if we don't want*
1668	* to delete it. So dentries in the list are checked in a
1669	* round robin manner */
1670	if (!lwc->expire_dir_lease)
1671	return TOUCH;
1672	if (dentry->d_lockref.count > `0` \|\|
1673	(di->flags & CEPH_DENTRY_REFERENCED))
1674	return TOUCH;
1675	/ invalidate dir lease /
1676	di->lease_shared_gen = `0`;
1677	}
1678	return DELETE;
1679	}
1680
1681	int ceph_trim_dentries(struct ceph_mds_client *mdsc)
1682	{
1683	struct ceph_lease_walk_control lwc;
1684	unsigned long count;
1685	unsigned long freed;
1686
1687	spin_lock(lock: &mdsc->caps_list_lock);
1688	if (mdsc->caps_use_max > `0` &&
1689	mdsc->caps_use_count > mdsc->caps_use_max)
1690	count = mdsc->caps_use_count - mdsc->caps_use_max;
1691	else
1692	count = `0`;
1693	spin_unlock(lock: &mdsc->caps_list_lock);
1694
1695	lwc.dir_lease = false;
1696	lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * `2`;
1697	freed = __dentry_leases_walk(mdsc, lwc: &lwc, check: __dentry_lease_check);
1698	if (!lwc.nr_to_scan) / more invalid leases /
1699	return -EAGAIN;
1700
1701	if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE)
1702	lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE;
1703
1704	lwc.dir_lease = true;
1705	lwc.expire_dir_lease = freed < count;
1706	lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
1707	freed +=__dentry_leases_walk(mdsc, lwc: &lwc, check: __dir_lease_check);
1708	if (!lwc.nr_to_scan) / more to check /
1709	return -EAGAIN;
1710
1711	return freed > `0` ? `1` : `0`;
1712	}
1713
1714	/*
1715	* Ensure a dentry lease will no longer revalidate.
1716	*/
1717	void ceph_invalidate_dentry_lease(struct dentry *dentry)
1718	{
1719	struct ceph_dentry_info *di = ceph_dentry(dentry);
1720	spin_lock(lock: &dentry->d_lock);
1721	di->time = jiffies;
1722	di->lease_shared_gen = `0`;
1723	di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
1724	__dentry_lease_unlist(di);
1725	spin_unlock(lock: &dentry->d_lock);
1726	}
1727
1728	/*
1729	* Check if dentry lease is valid. If not, delete the lease. Try to
1730	* renew if the least is more than half up.
1731	*/
1732	static bool __dentry_lease_is_valid(struct ceph_dentry_info *di)
1733	{
1734	struct ceph_mds_session *session;
1735
1736	if (!di->lease_gen)
1737	return false;
1738
1739	session = di->lease_session;
1740	if (session) {
1741	u32 gen;
1742	unsigned long ttl;
1743
1744	gen = atomic_read(v: &session->s_cap_gen);
1745	ttl = session->s_cap_ttl;
1746
1747	if (di->lease_gen == gen &&
1748	time_before(jiffies, ttl) &&
1749	time_before(jiffies, di->time))
1750	return true;
1751	}
1752	di->lease_gen = `0`;
1753	return false;
1754	}
1755
1756	static int dentry_lease_is_valid(struct dentry dentry, unsigned* int flags)
1757	{
1758	struct ceph_dentry_info *di;
1759	struct ceph_mds_session *session = NULL;
1760	u32 seq = `0`;
1761	int valid = `0`;
1762
1763	spin_lock(lock: &dentry->d_lock);
1764	di = ceph_dentry(dentry);
1765	if (di && __dentry_lease_is_valid(di)) {
1766	valid = `1`;
1767
1768	if (di->lease_renew_after &&
1769	time_after(jiffies, di->lease_renew_after)) {
1770	/*
1771	* We should renew. If we're in RCU walk mode
1772	* though, we can't do that so just return
1773	* -ECHILD.
1774	*/
1775	if (flags & LOOKUP_RCU) {
1776	valid = -ECHILD;
1777	} else {
1778	session = ceph_get_mds_session(s: di->lease_session);
1779	seq = di->lease_seq;
1780	di->lease_renew_after = `0`;
1781	di->lease_renew_from = jiffies;
1782	}
1783	}
1784	}
1785	spin_unlock(lock: &dentry->d_lock);
1786
1787	if (session) {
1788	ceph_mdsc_lease_send_msg(session, dentry,
1789	CEPH_MDS_LEASE_RENEW, seq);
1790	ceph_put_mds_session(s: session);
1791	}
1792	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
1793	return valid;
1794	}
1795
1796	/*
1797	* Called under dentry->d_lock.
1798	*/
1799	static int __dir_lease_try_check(const struct dentry *dentry)
1800	{
1801	struct ceph_dentry_info *di = ceph_dentry(dentry);
1802	struct inode *dir;
1803	struct ceph_inode_info *ci;
1804	int valid = `0`;
1805
1806	if (!di->lease_shared_gen)
1807	return `0`;
1808	if (IS_ROOT(dentry))
1809	return `0`;
1810
1811	dir = d_inode(dentry: dentry->d_parent);
1812	ci = ceph_inode(inode: dir);
1813
1814	if (spin_trylock(lock: &ci->i_ceph_lock)) {
1815	if (atomic_read(v: &ci->i_shared_gen) == di->lease_shared_gen &&
1816	__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, t: `0`))
1817	valid = `1`;
1818	spin_unlock(lock: &ci->i_ceph_lock);
1819	} else {
1820	valid = -EBUSY;
1821	}
1822
1823	if (!valid)
1824	di->lease_shared_gen = `0`;
1825	return valid;
1826	}
1827
1828	/*
1829	* Check if directory-wide content lease/cap is valid.
1830	*/
1831	static int dir_lease_is_valid(struct inode dir, struct* dentry *dentry,
1832	struct ceph_mds_client *mdsc)
1833	{
1834	struct ceph_inode_info *ci = ceph_inode(inode: dir);
1835	int valid;
1836	int shared_gen;
1837
1838	spin_lock(lock: &ci->i_ceph_lock);
1839	valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, t: `1`);
1840	if (valid) {
1841	__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
1842	shared_gen = atomic_read(v: &ci->i_shared_gen);
1843	}
1844	spin_unlock(lock: &ci->i_ceph_lock);
1845	if (valid) {
1846	struct ceph_dentry_info *di;
1847	spin_lock(lock: &dentry->d_lock);
1848	di = ceph_dentry(dentry);
1849	if (dir == d_inode(dentry: dentry->d_parent) &&
1850	di && di->lease_shared_gen == shared_gen)
1851	__ceph_dentry_dir_lease_touch(di);
1852	else
1853	valid = `0`;
1854	spin_unlock(lock: &dentry->d_lock);
1855	}
1856	dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n",
1857	dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid);
1858	return valid;
1859	}
1860
1861	/*
1862	* Check if cached dentry can be trusted.
1863	*/
1864	static int ceph_d_revalidate(struct dentry dentry, unsigned* int flags)
1865	{
1866	int valid = `0`;
1867	struct dentry *parent;
1868	struct inode dir, inode;
1869	struct ceph_mds_client *mdsc;
1870
1871	valid = fscrypt_d_revalidate(dentry, flags);
1872	if (valid <= `0`)
1873	return valid;
1874
1875	if (flags & LOOKUP_RCU) {
1876	parent = READ_ONCE(dentry->d_parent);
1877	dir = d_inode_rcu(dentry: parent);
1878	if (!dir)
1879	return -ECHILD;
1880	inode = d_inode_rcu(dentry);
1881	} else {
1882	parent = dget_parent(dentry);
1883	dir = d_inode(dentry: parent);
1884	inode = d_inode(dentry);
1885	}
1886
1887	dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
1888	dentry, inode, ceph_dentry(dentry)->offset,
1889	!!(dentry->d_flags & DCACHE_NOKEY_NAME));
1890
1891	mdsc = ceph_sb_to_client(sb: dir->i_sb)->mdsc;
1892
1893	/ always trust cached snapped dentries, snapdir dentry /
1894	if (ceph_snap(inode: dir) != CEPH_NOSNAP) {
1895	dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
1896	dentry, inode);
1897	valid = `1`;
1898	} else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1899	valid = `1`;
1900	} else {
1901	valid = dentry_lease_is_valid(dentry, flags);
1902	if (valid == -ECHILD)
1903	return valid;
1904	if (valid \|\| dir_lease_is_valid(dir, dentry, mdsc)) {
1905	if (inode)
1906	valid = ceph_is_any_caps(inode);
1907	else
1908	valid = `1`;
1909	}
1910	}
1911
1912	if (!valid) {
1913	struct ceph_mds_request *req;
1914	int op, err;
1915	u32 mask;
1916
1917	if (flags & LOOKUP_RCU)
1918	return -ECHILD;
1919
1920	percpu_counter_inc(fbc: &mdsc->metric.d_lease_mis);
1921
1922	op = ceph_snap(inode: dir) == CEPH_SNAPDIR ?
1923	CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
1924	req = ceph_mdsc_create_request(mdsc, op, mode: USE_ANY_MDS);
1925	if (!IS_ERR(ptr: req)) {
1926	req->r_dentry = dget(dentry);
1927	req->r_num_caps = `2`;
1928	req->r_parent = dir;
1929	ihold(inode: dir);
1930
1931	mask = CEPH_STAT_CAP_INODE \| CEPH_CAP_AUTH_SHARED;
1932	if (ceph_security_xattr_wanted(in: dir))
1933	mask \|= CEPH_CAP_XATTR_SHARED;
1934	req->r_args.getattr.mask = cpu_to_le32(mask);
1935
1936	err = ceph_mdsc_do_request(mdsc, NULL, req);
1937	switch (err) {
1938	case `0`:
1939	if (d_really_is_positive(dentry) &&
1940	d_inode(dentry) == req->r_target_inode)
1941	valid = `1`;
1942	break;
1943	case -ENOENT:
1944	if (d_really_is_negative(dentry))
1945	valid = `1`;
1946	fallthrough;
1947	default:
1948	break;
1949	}
1950	ceph_mdsc_put_request(req);
1951	dout("d_revalidate %p lookup result=%d\n",
1952	dentry, err);
1953	}
1954	} else {
1955	percpu_counter_inc(fbc: &mdsc->metric.d_lease_hit);
1956	}
1957
1958	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1959	if (!valid)
1960	ceph_dir_clear_complete(inode: dir);
1961
1962	if (!(flags & LOOKUP_RCU))
1963	dput(parent);
1964	return valid;
1965	}
1966
1967	/*
1968	* Delete unused dentry that doesn't have valid lease
1969	*
1970	* Called under dentry->d_lock.
1971	*/
1972	static int ceph_d_delete(const struct dentry *dentry)
1973	{
1974	struct ceph_dentry_info *di;
1975
1976	/ won't release caps /
1977	if (d_really_is_negative(dentry))
1978	return `0`;
1979	if (ceph_snap(inode: d_inode(dentry)) != CEPH_NOSNAP)
1980	return `0`;
1981	/ vaild lease? /
1982	di = ceph_dentry(dentry);
1983	if (di) {
1984	if (__dentry_lease_is_valid(di))
1985	return `0`;
1986	if (__dir_lease_try_check(dentry))
1987	return `0`;
1988	}
1989	return `1`;
1990	}
1991
1992	/*
1993	* Release our ceph_dentry_info.
1994	*/
1995	static void ceph_d_release(struct dentry *dentry)
1996	{
1997	struct ceph_dentry_info *di = ceph_dentry(dentry);
1998	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: dentry->d_sb);
1999
2000	dout("d_release %p\n", dentry);
2001
2002	atomic64_dec(v: &fsc->mdsc->metric.total_dentries);
2003
2004	spin_lock(lock: &dentry->d_lock);
2005	__dentry_lease_unlist(di);
2006	dentry->d_fsdata = NULL;
2007	spin_unlock(lock: &dentry->d_lock);
2008
2009	ceph_put_mds_session(s: di->lease_session);
2010	kmem_cache_free(s: ceph_dentry_cachep, objp: di);
2011	}
2012
2013	/*
2014	* When the VFS prunes a dentry from the cache, we need to clear the
2015	* complete flag on the parent directory.
2016	*
2017	* Called under dentry->d_lock.
2018	*/
2019	static void ceph_d_prune(struct dentry *dentry)
2020	{
2021	struct ceph_inode_info *dir_ci;
2022	struct ceph_dentry_info *di;
2023
2024	dout("ceph_d_prune %pd %p\n", dentry, dentry);
2025
2026	/ do we have a valid parent? /
2027	if (IS_ROOT(dentry))
2028	return;
2029
2030	/ we hold d_lock, so d_parent is stable /
2031	dir_ci = ceph_inode(inode: d_inode(dentry: dentry->d_parent));
2032	if (dir_ci->i_vino.snap == CEPH_SNAPDIR)
2033	return;
2034
2035	/ who calls d_delete() should also disable dcache readdir /
2036	if (d_really_is_negative(dentry))
2037	return;
2038
2039	/ d_fsdata does not get cleared until d_release /
2040	if (!d_unhashed(dentry)) {
2041	__ceph_dir_clear_complete(ci: dir_ci);
2042	return;
2043	}
2044
2045	/ Disable dcache readdir just in case that someone called d_drop()*
2046	* or d_invalidate(), but MDS didn't revoke CEPH_CAP_FILE_SHARED
2047	* properly (dcache readdir is still enabled) */
2048	di = ceph_dentry(dentry);
2049	if (di->offset > `0` &&
2050	di->lease_shared_gen == atomic_read(v: &dir_ci->i_shared_gen))
2051	__ceph_dir_clear_ordered(ci: dir_ci);
2052	}
2053
2054	/*
2055	* read() on a dir. This weird interface hack only works if mounted
2056	* with '-o dirstat'.
2057	*/
2058	static ssize_t ceph_read_dir(struct file file, char* __user *buf, size_t size,
2059	loff_t *ppos)
2060	{
2061	struct ceph_dir_file_info *dfi = file->private_data;
2062	struct inode *inode = file_inode(f: file);
2063	struct ceph_inode_info *ci = ceph_inode(inode);
2064	int left;
2065	const int bufsize = `1024`;
2066
2067	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
2068	return -EISDIR;
2069
2070	if (!dfi->dir_info) {
2071	dfi->dir_info = kmalloc(size: bufsize, GFP_KERNEL);
2072	if (!dfi->dir_info)
2073	return -ENOMEM;
2074	dfi->dir_info_len =
2075	snprintf(buf: dfi->dir_info, size: bufsize,
2076	fmt: "entries: %20lld\n"
2077	" files: %20lld\n"
2078	" subdirs: %20lld\n"
2079	"rentries: %20lld\n"
2080	" rfiles: %20lld\n"
2081	" rsubdirs: %20lld\n"
2082	"rbytes: %20lld\n"
2083	"rctime: %10lld.%09ld\n",
2084	ci->i_files + ci->i_subdirs,
2085	ci->i_files,
2086	ci->i_subdirs,
2087	ci->i_rfiles + ci->i_rsubdirs,
2088	ci->i_rfiles,
2089	ci->i_rsubdirs,
2090	ci->i_rbytes,
2091	ci->i_rctime.tv_sec,
2092	ci->i_rctime.tv_nsec);
2093	}
2094
2095	if (*ppos >= dfi->dir_info_len)
2096	return `0`;
2097	size = min_t(unsigned, size, dfi->dir_info_len-*ppos);
2098	left = copy_to_user(to: buf, from: dfi->dir_info + *ppos, n: size);
2099	if (left == size)
2100	return -EFAULT;
2101	*ppos += (size - left);
2102	return size - left;
2103	}
2104
2105
2106
2107	/*
2108	* Return name hash for a given dentry. This is dependent on
2109	* the parent directory's hash function.
2110	*/
2111	unsigned ceph_dentry_hash(struct inode dir, struct* dentry *dn)
2112	{
2113	struct ceph_inode_info *dci = ceph_inode(inode: dir);
2114	unsigned hash;
2115
2116	switch (dci->i_dir_layout.dl_dir_hash) {
2117	case `0`: / for backward compat /
2118	case CEPH_STR_HASH_LINUX:
2119	return dn->d_name.hash;
2120
2121	default:
2122	spin_lock(lock: &dn->d_lock);
2123	hash = ceph_str_hash(type: dci->i_dir_layout.dl_dir_hash,
2124	s: dn->d_name.name, len: dn->d_name.len);
2125	spin_unlock(lock: &dn->d_lock);
2126	return hash;
2127	}
2128	}
2129
2130	WRAP_DIR_ITER(ceph_readdir) // FIXME!
2131	const struct file_operations ceph_dir_fops = {
2132	.read = ceph_read_dir,
2133	.iterate_shared = shared_ceph_readdir,
2134	.llseek = ceph_dir_llseek,
2135	.open = ceph_open,
2136	.release = ceph_release,
2137	.unlocked_ioctl = ceph_ioctl,
2138	.compat_ioctl = compat_ptr_ioctl,
2139	.fsync = ceph_fsync,
2140	.lock = ceph_lock,
2141	.flock = ceph_flock,
2142	};
2143
2144	const struct file_operations ceph_snapdir_fops = {
2145	.iterate_shared = shared_ceph_readdir,
2146	.llseek = ceph_dir_llseek,
2147	.open = ceph_open,
2148	.release = ceph_release,
2149	};
2150
2151	const struct inode_operations ceph_dir_iops = {
2152	.lookup = ceph_lookup,
2153	.permission = ceph_permission,
2154	.getattr = ceph_getattr,
2155	.setattr = ceph_setattr,
2156	.listxattr = ceph_listxattr,
2157	.get_inode_acl = ceph_get_acl,
2158	.set_acl = ceph_set_acl,
2159	.mknod = ceph_mknod,
2160	.symlink = ceph_symlink,
2161	.mkdir = ceph_mkdir,
2162	.link = ceph_link,
2163	.unlink = ceph_unlink,
2164	.rmdir = ceph_unlink,
2165	.rename = ceph_rename,
2166	.create = ceph_create,
2167	.atomic_open = ceph_atomic_open,
2168	};
2169
2170	const struct inode_operations ceph_snapdir_iops = {
2171	.lookup = ceph_lookup,
2172	.permission = ceph_permission,
2173	.getattr = ceph_getattr,
2174	.mkdir = ceph_mkdir,
2175	.rmdir = ceph_unlink,
2176	.rename = ceph_rename,
2177	};
2178
2179	const struct dentry_operations ceph_dentry_ops = {
2180	.d_revalidate = ceph_d_revalidate,
2181	.d_delete = ceph_d_delete,
2182	.d_release = ceph_d_release,
2183	.d_prune = ceph_d_prune,
2184	.d_init = ceph_d_init,
2185	};
2186

source code of linux/fs/ceph/dir.c