caps.c source code [linux/fs/ceph/caps.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/ceph/ceph_debug.h>
3
4	#include <linux/fs.h>
5	#include <linux/kernel.h>
6	#include <linux/sched/signal.h>
7	#include <linux/slab.h>
8	#include <linux/vmalloc.h>
9	#include <linux/wait.h>
10	#include <linux/writeback.h>
11	#include <linux/iversion.h>
12	#include <linux/filelock.h>
13
14	#include "super.h"
15	#include "mds_client.h"
16	#include "cache.h"
17	#include "crypto.h"
18	#include <linux/ceph/decode.h>
19	#include <linux/ceph/messenger.h>
20
/*
 * Capability management
 *
 * The Ceph metadata servers control client access to inode metadata
 * and file data by issuing capabilities, granting clients permission
 * to read and/or write both inode fields and file data to OSDs
 * (storage nodes).  Each capability consists of a set of bits
 * indicating which operations are allowed.
 *
 * If the client holds a *_SHARED cap, the client has a coherent value
 * that can be safely read from the cached inode.
 *
 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
 * client is allowed to change inode attributes (e.g., file size,
 * mtime), note its dirty state in the ceph_cap, and asynchronously
 * flush that metadata change to the MDS.
 *
 * In the event of a conflicting operation (perhaps by another
 * client), the MDS will revoke the conflicting client capabilities.
 *
 * In order for a client to cache an inode, it must hold a capability
 * with at least one MDS server.  When inodes are released, release
 * notifications are batched and periodically sent en masse to the MDS
 * cluster to release server state.
 */
46
47	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
48	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
49	struct ceph_mds_session *session,
50	struct ceph_inode_info *ci,
51	u64 oldest_flush_tid);
52
/*
 * Generate readable cap strings for debugging output.
 *
 * cap_str[] is a small ring of string buffers handed out round-robin by
 * ceph_cap_string(); last_cap_str is the index of the next slot to use
 * and is protected by cap_str_lock.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];
static DEFINE_SPINLOCK(cap_str_lock);
static int last_cap_str;
60
61	static char gcap_string(char* s, int* c)
62	{
63	if (c & CEPH_CAP_GSHARED)
64	*s++ = `'s'`;
65	if (c & CEPH_CAP_GEXCL)
66	*s++ = `'x'`;
67	if (c & CEPH_CAP_GCACHE)
68	*s++ = `'c'`;
69	if (c & CEPH_CAP_GRD)
70	*s++ = `'r'`;
71	if (c & CEPH_CAP_GWR)
72	*s++ = `'w'`;
73	if (c & CEPH_CAP_GBUFFER)
74	*s++ = `'b'`;
75	if (c & CEPH_CAP_GWREXTEND)
76	*s++ = `'a'`;
77	if (c & CEPH_CAP_GLAZYIO)
78	*s++ = `'l'`;
79	return s;
80	}
81
82	const char ceph_cap_string(int* caps)
83	{
84	int i;
85	char *s;
86	int c;
87
88	spin_lock(lock: &cap_str_lock);
89	i = last_cap_str++;
90	if (last_cap_str == MAX_CAP_STR)
91	last_cap_str = `0`;
92	spin_unlock(lock: &cap_str_lock);
93
94	s = cap_str[i];
95
96	if (caps & CEPH_CAP_PIN)
97	*s++ = `'p'`;
98
99	c = (caps >> CEPH_CAP_SAUTH) & `3`;
100	if (c) {
101	*s++ = `'A'`;
102	s = gcap_string(s, c);
103	}
104
105	c = (caps >> CEPH_CAP_SLINK) & `3`;
106	if (c) {
107	*s++ = `'L'`;
108	s = gcap_string(s, c);
109	}
110
111	c = (caps >> CEPH_CAP_SXATTR) & `3`;
112	if (c) {
113	*s++ = `'X'`;
114	s = gcap_string(s, c);
115	}
116
117	c = caps >> CEPH_CAP_SFILE;
118	if (c) {
119	*s++ = `'F'`;
120	s = gcap_string(s, c);
121	}
122
123	if (s == cap_str[i])
124	*s++ = `'-'`;
125	*s = `0`;
126	return cap_str[i];
127	}
128
129	void ceph_caps_init(struct ceph_mds_client *mdsc)
130	{
131	INIT_LIST_HEAD(list: &mdsc->caps_list);
132	spin_lock_init(&mdsc->caps_list_lock);
133	}
134
135	void ceph_caps_finalize(struct ceph_mds_client *mdsc)
136	{
137	struct ceph_cap *cap;
138
139	spin_lock(lock: &mdsc->caps_list_lock);
140	while (!list_empty(head: &mdsc->caps_list)) {
141	cap = list_first_entry(&mdsc->caps_list,
142	struct ceph_cap, caps_item);
143	list_del(entry: &cap->caps_item);
144	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
145	}
146	mdsc->caps_total_count = `0`;
147	mdsc->caps_avail_count = `0`;
148	mdsc->caps_use_count = `0`;
149	mdsc->caps_reserve_count = `0`;
150	mdsc->caps_min_count = `0`;
151	spin_unlock(lock: &mdsc->caps_list_lock);
152	}
153
154	void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
155	struct ceph_mount_options *fsopt)
156	{
157	spin_lock(lock: &mdsc->caps_list_lock);
158	mdsc->caps_min_count = fsopt->max_readdir;
159	if (mdsc->caps_min_count < `1024`)
160	mdsc->caps_min_count = `1024`;
161	mdsc->caps_use_max = fsopt->caps_max;
162	if (mdsc->caps_use_max > `0` &&
163	mdsc->caps_use_max < mdsc->caps_min_count)
164	mdsc->caps_use_max = mdsc->caps_min_count;
165	spin_unlock(lock: &mdsc->caps_list_lock);
166	}
167
168	static void __ceph_unreserve_caps(struct ceph_mds_client mdsc, int* nr_caps)
169	{
170	struct ceph_cap *cap;
171	int i;
172
173	if (nr_caps) {
174	BUG_ON(mdsc->caps_reserve_count < nr_caps);
175	mdsc->caps_reserve_count -= nr_caps;
176	if (mdsc->caps_avail_count >=
177	mdsc->caps_reserve_count + mdsc->caps_min_count) {
178	mdsc->caps_total_count -= nr_caps;
179	for (i = `0`; i < nr_caps; i++) {
180	cap = list_first_entry(&mdsc->caps_list,
181	struct ceph_cap, caps_item);
182	list_del(entry: &cap->caps_item);
183	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
184	}
185	} else {
186	mdsc->caps_avail_count += nr_caps;
187	}
188
189	dout("%s: caps %d = %d used + %d resv + %d avail\n",
190	__func__,
191	mdsc->caps_total_count, mdsc->caps_use_count,
192	mdsc->caps_reserve_count, mdsc->caps_avail_count);
193	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
194	mdsc->caps_reserve_count +
195	mdsc->caps_avail_count);
196	}
197	}
198
199	/*
200	* Called under mdsc->mutex.
201	*/
202	int ceph_reserve_caps(struct ceph_mds_client *mdsc,
203	struct ceph_cap_reservation ctx, int* need)
204	{
205	int i, j;
206	struct ceph_cap *cap;
207	int have;
208	int alloc = `0`;
209	int max_caps;
210	int err = `0`;
211	bool trimmed = false;
212	struct ceph_mds_session *s;
213	LIST_HEAD(newcaps);
214
215	dout("reserve caps ctx=%p need=%d\n", ctx, need);
216
217	/ first reserve any caps that are already allocated /
218	spin_lock(lock: &mdsc->caps_list_lock);
219	if (mdsc->caps_avail_count >= need)
220	have = need;
221	else
222	have = mdsc->caps_avail_count;
223	mdsc->caps_avail_count -= have;
224	mdsc->caps_reserve_count += have;
225	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
226	mdsc->caps_reserve_count +
227	mdsc->caps_avail_count);
228	spin_unlock(lock: &mdsc->caps_list_lock);
229
230	for (i = have; i < need; ) {
231	cap = kmem_cache_alloc(cachep: ceph_cap_cachep, GFP_NOFS);
232	if (cap) {
233	list_add(new: &cap->caps_item, head: &newcaps);
234	alloc++;
235	i++;
236	continue;
237	}
238
239	if (!trimmed) {
240	for (j = `0`; j < mdsc->max_sessions; j++) {
241	s = __ceph_lookup_mds_session(mdsc, mds: j);
242	if (!s)
243	continue;
244	mutex_unlock(lock: &mdsc->mutex);
245
246	mutex_lock(&s->s_mutex);
247	max_caps = s->s_nr_caps - (need - i);
248	ceph_trim_caps(mdsc, session: s, max_caps);
249	mutex_unlock(lock: &s->s_mutex);
250
251	ceph_put_mds_session(s);
252	mutex_lock(&mdsc->mutex);
253	}
254	trimmed = true;
255
256	spin_lock(lock: &mdsc->caps_list_lock);
257	if (mdsc->caps_avail_count) {
258	int more_have;
259	if (mdsc->caps_avail_count >= need - i)
260	more_have = need - i;
261	else
262	more_have = mdsc->caps_avail_count;
263
264	i += more_have;
265	have += more_have;
266	mdsc->caps_avail_count -= more_have;
267	mdsc->caps_reserve_count += more_have;
268
269	}
270	spin_unlock(lock: &mdsc->caps_list_lock);
271
272	continue;
273	}
274
275	pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
276	ctx, need, have + alloc);
277	err = -ENOMEM;
278	break;
279	}
280
281	if (!err) {
282	BUG_ON(have + alloc != need);
283	ctx->count = need;
284	ctx->used = `0`;
285	}
286
287	spin_lock(lock: &mdsc->caps_list_lock);
288	mdsc->caps_total_count += alloc;
289	mdsc->caps_reserve_count += alloc;
290	list_splice(list: &newcaps, head: &mdsc->caps_list);
291
292	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
293	mdsc->caps_reserve_count +
294	mdsc->caps_avail_count);
295
296	if (err)
297	__ceph_unreserve_caps(mdsc, nr_caps: have + alloc);
298
299	spin_unlock(lock: &mdsc->caps_list_lock);
300
301	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
302	ctx, mdsc->caps_total_count, mdsc->caps_use_count,
303	mdsc->caps_reserve_count, mdsc->caps_avail_count);
304	return err;
305	}
306
307	void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
308	struct ceph_cap_reservation *ctx)
309	{
310	bool reclaim = false;
311	if (!ctx->count)
312	return;
313
314	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
315	spin_lock(lock: &mdsc->caps_list_lock);
316	__ceph_unreserve_caps(mdsc, nr_caps: ctx->count);
317	ctx->count = `0`;
318
319	if (mdsc->caps_use_max > `0` &&
320	mdsc->caps_use_count > mdsc->caps_use_max)
321	reclaim = true;
322	spin_unlock(lock: &mdsc->caps_list_lock);
323
324	if (reclaim)
325	ceph_reclaim_caps_nr(mdsc, nr: ctx->used);
326	}
327
328	struct ceph_cap ceph_get_cap(struct* ceph_mds_client *mdsc,
329	struct ceph_cap_reservation *ctx)
330	{
331	struct ceph_cap *cap = NULL;
332
333	/ temporary, until we do something about cap import/export /
334	if (!ctx) {
335	cap = kmem_cache_alloc(cachep: ceph_cap_cachep, GFP_NOFS);
336	if (cap) {
337	spin_lock(lock: &mdsc->caps_list_lock);
338	mdsc->caps_use_count++;
339	mdsc->caps_total_count++;
340	spin_unlock(lock: &mdsc->caps_list_lock);
341	} else {
342	spin_lock(lock: &mdsc->caps_list_lock);
343	if (mdsc->caps_avail_count) {
344	BUG_ON(list_empty(&mdsc->caps_list));
345
346	mdsc->caps_avail_count--;
347	mdsc->caps_use_count++;
348	cap = list_first_entry(&mdsc->caps_list,
349	struct ceph_cap, caps_item);
350	list_del(entry: &cap->caps_item);
351
352	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
353	mdsc->caps_reserve_count + mdsc->caps_avail_count);
354	}
355	spin_unlock(lock: &mdsc->caps_list_lock);
356	}
357
358	return cap;
359	}
360
361	spin_lock(lock: &mdsc->caps_list_lock);
362	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
363	ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
364	mdsc->caps_reserve_count, mdsc->caps_avail_count);
365	BUG_ON(!ctx->count);
366	BUG_ON(ctx->count > mdsc->caps_reserve_count);
367	BUG_ON(list_empty(&mdsc->caps_list));
368
369	ctx->count--;
370	ctx->used++;
371	mdsc->caps_reserve_count--;
372	mdsc->caps_use_count++;
373
374	cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
375	list_del(entry: &cap->caps_item);
376
377	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
378	mdsc->caps_reserve_count + mdsc->caps_avail_count);
379	spin_unlock(lock: &mdsc->caps_list_lock);
380	return cap;
381	}
382
383	void ceph_put_cap(struct ceph_mds_client mdsc, struct* ceph_cap *cap)
384	{
385	spin_lock(lock: &mdsc->caps_list_lock);
386	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
387	cap, mdsc->caps_total_count, mdsc->caps_use_count,
388	mdsc->caps_reserve_count, mdsc->caps_avail_count);
389	mdsc->caps_use_count--;
390	/*
391	* Keep some preallocated caps around (ceph_min_count), to
392	* avoid lots of free/alloc churn.
393	*/
394	if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
395	mdsc->caps_min_count) {
396	mdsc->caps_total_count--;
397	kmem_cache_free(s: ceph_cap_cachep, objp: cap);
398	} else {
399	mdsc->caps_avail_count++;
400	list_add(new: &cap->caps_item, head: &mdsc->caps_list);
401	}
402
403	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
404	mdsc->caps_reserve_count + mdsc->caps_avail_count);
405	spin_unlock(lock: &mdsc->caps_list_lock);
406	}
407
408	void ceph_reservation_status(struct ceph_fs_client *fsc,
409	int total, int* avail, int* used, int* *reserved,
410	int *min)
411	{
412	struct ceph_mds_client *mdsc = fsc->mdsc;
413
414	spin_lock(lock: &mdsc->caps_list_lock);
415
416	if (total)
417	*total = mdsc->caps_total_count;
418	if (avail)
419	*avail = mdsc->caps_avail_count;
420	if (used)
421	*used = mdsc->caps_use_count;
422	if (reserved)
423	*reserved = mdsc->caps_reserve_count;
424	if (min)
425	*min = mdsc->caps_min_count;
426
427	spin_unlock(lock: &mdsc->caps_list_lock);
428	}
429
430	/*
431	* Find ceph_cap for given mds, if any.
432	*
433	* Called with i_ceph_lock held.
434	*/
435	struct ceph_cap __get_cap_for_mds(struct* ceph_inode_info ci, int* mds)
436	{
437	struct ceph_cap *cap;
438	struct rb_node *n = ci->i_caps.rb_node;
439
440	while (n) {
441	cap = rb_entry(n, struct ceph_cap, ci_node);
442	if (mds < cap->mds)
443	n = n->rb_left;
444	else if (mds > cap->mds)
445	n = n->rb_right;
446	else
447	return cap;
448	}
449	return NULL;
450	}
451
452	struct ceph_cap ceph_get_cap_for_mds(struct* ceph_inode_info ci, int* mds)
453	{
454	struct ceph_cap *cap;
455
456	spin_lock(lock: &ci->i_ceph_lock);
457	cap = __get_cap_for_mds(ci, mds);
458	spin_unlock(lock: &ci->i_ceph_lock);
459	return cap;
460	}
461
462	/*
463	* Called under i_ceph_lock.
464	*/
465	static void __insert_cap_node(struct ceph_inode_info *ci,
466	struct ceph_cap *new)
467	{
468	struct rb_node **p = &ci->i_caps.rb_node;
469	struct rb_node *parent = NULL;
470	struct ceph_cap *cap = NULL;
471
472	while (*p) {
473	parent = *p;
474	cap = rb_entry(parent, struct ceph_cap, ci_node);
475	if (new->mds < cap->mds)
476	p = &(*p)->rb_left;
477	else if (new->mds > cap->mds)
478	p = &(*p)->rb_right;
479	else
480	BUG();
481	}
482
483	rb_link_node(node: &new->ci_node, parent, rb_link: p);
484	rb_insert_color(&new->ci_node, &ci->i_caps);
485	}
486
487	/*
488	* (re)set cap hold timeouts, which control the delayed release
489	* of unused caps back to the MDS. Should be called on cap use.
490	*/
491	static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
492	struct ceph_inode_info *ci)
493	{
494	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
495	ci->i_hold_caps_max = round_jiffies(j: jiffies +
496	opt->caps_wanted_delay_max * HZ);
497	dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
498	ci->i_hold_caps_max - jiffies);
499	}
500
501	/*
502	* (Re)queue cap at the end of the delayed cap release list.
503	*
504	* If I_FLUSH is set, leave the inode at the front of the list.
505	*
506	* Caller holds i_ceph_lock
507	* -> we take mdsc->cap_delay_lock
508	*/
509	static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
510	struct ceph_inode_info *ci)
511	{
512	dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
513	ci->i_ceph_flags, ci->i_hold_caps_max);
514	if (!mdsc->stopping) {
515	spin_lock(lock: &mdsc->cap_delay_lock);
516	if (!list_empty(head: &ci->i_cap_delay_list)) {
517	if (ci->i_ceph_flags & CEPH_I_FLUSH)
518	goto no_change;
519	list_del_init(entry: &ci->i_cap_delay_list);
520	}
521	__cap_set_timeouts(mdsc, ci);
522	list_add_tail(new: &ci->i_cap_delay_list, head: &mdsc->cap_delay_list);
523	no_change:
524	spin_unlock(lock: &mdsc->cap_delay_lock);
525	}
526	}
527
528	/*
529	* Queue an inode for immediate writeback. Mark inode with I_FLUSH,
530	* indicating we should send a cap message to flush dirty metadata
531	* asap, and move to the front of the delayed cap list.
532	*/
533	static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
534	struct ceph_inode_info *ci)
535	{
536	dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
537	spin_lock(lock: &mdsc->cap_delay_lock);
538	ci->i_ceph_flags \|= CEPH_I_FLUSH;
539	if (!list_empty(head: &ci->i_cap_delay_list))
540	list_del_init(entry: &ci->i_cap_delay_list);
541	list_add(new: &ci->i_cap_delay_list, head: &mdsc->cap_delay_list);
542	spin_unlock(lock: &mdsc->cap_delay_lock);
543	}
544
545	/*
546	* Cancel delayed work on cap.
547	*
548	* Caller must hold i_ceph_lock.
549	*/
550	static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
551	struct ceph_inode_info *ci)
552	{
553	dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
554	if (list_empty(head: &ci->i_cap_delay_list))
555	return;
556	spin_lock(lock: &mdsc->cap_delay_lock);
557	list_del_init(entry: &ci->i_cap_delay_list);
558	spin_unlock(lock: &mdsc->cap_delay_lock);
559	}
560
561	/ Common issue checks for add_cap, handle_cap_grant. /
562	static void __check_cap_issue(struct ceph_inode_info ci, struct* ceph_cap *cap,
563	unsigned issued)
564	{
565	unsigned had = __ceph_caps_issued(ci, NULL);
566
567	lockdep_assert_held(&ci->i_ceph_lock);
568
569	/*
570	* Each time we receive FILE_CACHE anew, we increment
571	* i_rdcache_gen.
572	*/
573	if (S_ISREG(ci->netfs.inode.i_mode) &&
574	(issued & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
575	(had & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) == `0`) {
576	ci->i_rdcache_gen++;
577	}
578
579	/*
580	* If FILE_SHARED is newly issued, mark dir not complete. We don't
581	* know what happened to this directory while we didn't have the cap.
582	* If FILE_SHARED is being revoked, also mark dir not complete. It
583	* stops on-going cached readdir.
584	*/
585	if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
586	if (issued & CEPH_CAP_FILE_SHARED)
587	atomic_inc(v: &ci->i_shared_gen);
588	if (S_ISDIR(ci->netfs.inode.i_mode)) {
589	dout(" marking %p NOT complete\n", &ci->netfs.inode);
590	__ceph_dir_clear_complete(ci);
591	}
592	}
593
594	/ Wipe saved layout if we're losing DIR_CREATE caps /
595	if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
596	!(issued & CEPH_CAP_DIR_CREATE)) {
597	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
598	memset(&ci->i_cached_layout, `0`, sizeof(ci->i_cached_layout));
599	}
600	}
601
602	/**
603	* change_auth_cap_ses - move inode to appropriate lists when auth caps change
604	* @ci: inode to be moved
605	* @session: new auth caps session
606	*/
607	void change_auth_cap_ses(struct ceph_inode_info *ci,
608	struct ceph_mds_session *session)
609	{
610	lockdep_assert_held(&ci->i_ceph_lock);
611
612	if (list_empty(head: &ci->i_dirty_item) && list_empty(head: &ci->i_flushing_item))
613	return;
614
615	spin_lock(lock: &session->s_mdsc->cap_dirty_lock);
616	if (!list_empty(head: &ci->i_dirty_item))
617	list_move(list: &ci->i_dirty_item, head: &session->s_cap_dirty);
618	if (!list_empty(head: &ci->i_flushing_item))
619	list_move_tail(list: &ci->i_flushing_item, head: &session->s_cap_flushing);
620	spin_unlock(lock: &session->s_mdsc->cap_dirty_lock);
621	}
622
623	/*
624	* Add a capability under the given MDS session.
625	*
626	* Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
627	*
628	* @fmode is the open file mode, if we are opening a file, otherwise
629	* it is < 0. (This is so we can atomically add the cap and add an
630	* open file reference to it.)
631	*/
632	void ceph_add_cap(struct inode *inode,
633	struct ceph_mds_session *session, u64 cap_id,
634	unsigned issued, unsigned wanted,
635	unsigned seq, unsigned mseq, u64 realmino, int flags,
636	struct ceph_cap **new_cap)
637	{
638	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
639	struct ceph_inode_info *ci = ceph_inode(inode);
640	struct ceph_cap *cap;
641	int mds = session->s_mds;
642	int actual_wanted;
643	u32 gen;
644
645	lockdep_assert_held(&ci->i_ceph_lock);
646
647	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
648	session->s_mds, cap_id, ceph_cap_string(issued), seq);
649
650	gen = atomic_read(v: &session->s_cap_gen);
651
652	cap = __get_cap_for_mds(ci, mds);
653	if (!cap) {
654	cap = *new_cap;
655	*new_cap = NULL;
656
657	cap->issued = `0`;
658	cap->implemented = `0`;
659	cap->mds = mds;
660	cap->mds_wanted = `0`;
661	cap->mseq = `0`;
662
663	cap->ci = ci;
664	__insert_cap_node(ci, new: cap);
665
666	/ add to session cap list /
667	cap->session = session;
668	spin_lock(lock: &session->s_cap_lock);
669	list_add_tail(new: &cap->session_caps, head: &session->s_caps);
670	session->s_nr_caps++;
671	atomic64_inc(v: &mdsc->metric.total_caps);
672	spin_unlock(lock: &session->s_cap_lock);
673	} else {
674	spin_lock(lock: &session->s_cap_lock);
675	list_move_tail(list: &cap->session_caps, head: &session->s_caps);
676	spin_unlock(lock: &session->s_cap_lock);
677
678	if (cap->cap_gen < gen)
679	cap->issued = cap->implemented = CEPH_CAP_PIN;
680
681	/*
682	* auth mds of the inode changed. we received the cap export
683	* message, but still haven't received the cap import message.
684	* handle_cap_export() updated the new auth MDS' cap.
685	*
686	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
687	* a message that was send before the cap import message. So
688	* don't remove caps.
689	*/
690	if (ceph_seq_cmp(a: seq, b: cap->seq) <= `0`) {
691	WARN_ON(cap != ci->i_auth_cap);
692	WARN_ON(cap->cap_id != cap_id);
693	seq = cap->seq;
694	mseq = cap->mseq;
695	issued \|= cap->issued;
696	flags \|= CEPH_CAP_FLAG_AUTH;
697	}
698	}
699
700	if (!ci->i_snap_realm \|\|
701	((flags & CEPH_CAP_FLAG_AUTH) &&
702	realmino != (u64)-`1` && ci->i_snap_realm->ino != realmino)) {
703	/*
704	* add this inode to the appropriate snap realm
705	*/
706	struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
707	ino: realmino);
708	if (realm)
709	ceph_change_snap_realm(inode, realm);
710	else
711	WARN(`1`, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
712	__func__, realmino, ci->i_vino.ino,
713	ci->i_snap_realm ? ci->i_snap_realm->ino : `0`);
714	}
715
716	__check_cap_issue(ci, cap, issued);
717
718	/*
719	* If we are issued caps we don't want, or the mds' wanted
720	* value appears to be off, queue a check so we'll release
721	* later and/or update the mds wanted value.
722	*/
723	actual_wanted = __ceph_caps_wanted(ci);
724	if ((wanted & ~actual_wanted) \|\|
725	(issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
726	dout(" issued %s, mds wanted %s, actual %s, queueing\n",
727	ceph_cap_string(issued), ceph_cap_string(wanted),
728	ceph_cap_string(actual_wanted));
729	__cap_delay_requeue(mdsc, ci);
730	}
731
732	if (flags & CEPH_CAP_FLAG_AUTH) {
733	if (!ci->i_auth_cap \|\|
734	ceph_seq_cmp(a: ci->i_auth_cap->mseq, b: mseq) < `0`) {
735	if (ci->i_auth_cap &&
736	ci->i_auth_cap->session != cap->session)
737	change_auth_cap_ses(ci, session: cap->session);
738	ci->i_auth_cap = cap;
739	cap->mds_wanted = wanted;
740	}
741	} else {
742	WARN_ON(ci->i_auth_cap == cap);
743	}
744
745	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
746	inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
747	ceph_cap_string(issued\|cap->issued), seq, mds);
748	cap->cap_id = cap_id;
749	cap->issued = issued;
750	cap->implemented \|= issued;
751	if (ceph_seq_cmp(a: mseq, b: cap->mseq) > `0`)
752	cap->mds_wanted = wanted;
753	else
754	cap->mds_wanted \|= wanted;
755	cap->seq = seq;
756	cap->issue_seq = seq;
757	cap->mseq = mseq;
758	cap->cap_gen = gen;
759	wake_up_all(&ci->i_cap_wq);
760	}
761
762	/*
763	* Return true if cap has not timed out and belongs to the current
764	* generation of the MDS session (i.e. has not gone 'stale' due to
765	* us losing touch with the mds).
766	*/
767	static int __cap_is_valid(struct ceph_cap *cap)
768	{
769	unsigned long ttl;
770	u32 gen;
771
772	gen = atomic_read(v: &cap->session->s_cap_gen);
773	ttl = cap->session->s_cap_ttl;
774
775	if (cap->cap_gen < gen \|\| time_after_eq(jiffies, ttl)) {
776	dout("__cap_is_valid %p cap %p issued %s "
777	"but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
778	cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
779	return `0`;
780	}
781
782	return `1`;
783	}
784
785	/*
786	* Return set of valid cap bits issued to us. Note that caps time
787	* out, and may be invalidated in bulk if the client session times out
788	* and session->s_cap_gen is bumped.
789	*/
790	int __ceph_caps_issued(struct ceph_inode_info ci, int* *implemented)
791	{
792	int have = ci->i_snap_caps;
793	struct ceph_cap *cap;
794	struct rb_node *p;
795
796	if (implemented)
797	*implemented = `0`;
798	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
799	cap = rb_entry(p, struct ceph_cap, ci_node);
800	if (!__cap_is_valid(cap))
801	continue;
802	dout("__ceph_caps_issued %p cap %p issued %s\n",
803	&ci->netfs.inode, cap, ceph_cap_string(cap->issued));
804	have \|= cap->issued;
805	if (implemented)
806	*implemented \|= cap->implemented;
807	}
808	/*
809	* exclude caps issued by non-auth MDS, but are been revoking
810	* by the auth MDS. The non-auth MDS should be revoking/exporting
811	* these caps, but the message is delayed.
812	*/
813	if (ci->i_auth_cap) {
814	cap = ci->i_auth_cap;
815	have &= ~cap->implemented \| cap->issued;
816	}
817	return have;
818	}
819
820	/*
821	* Get cap bits issued by caps other than @ocap
822	*/
823	int __ceph_caps_issued_other(struct ceph_inode_info ci, struct* ceph_cap *ocap)
824	{
825	int have = ci->i_snap_caps;
826	struct ceph_cap *cap;
827	struct rb_node *p;
828
829	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
830	cap = rb_entry(p, struct ceph_cap, ci_node);
831	if (cap == ocap)
832	continue;
833	if (!__cap_is_valid(cap))
834	continue;
835	have \|= cap->issued;
836	}
837	return have;
838	}
839
840	/*
841	* Move a cap to the end of the LRU (oldest caps at list head, newest
842	* at list tail).
843	*/
844	static void __touch_cap(struct ceph_cap *cap)
845	{
846	struct ceph_mds_session *s = cap->session;
847
848	spin_lock(lock: &s->s_cap_lock);
849	if (!s->s_cap_iterator) {
850	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
851	s->s_mds);
852	list_move_tail(list: &cap->session_caps, head: &s->s_caps);
853	} else {
854	dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
855	&cap->ci->netfs.inode, cap, s->s_mds);
856	}
857	spin_unlock(lock: &s->s_cap_lock);
858	}
859
860	/*
861	* Check if we hold the given mask. If so, move the cap(s) to the
862	* front of their respective LRUs. (This is the preferred way for
863	* callers to check for caps they want.)
864	*/
865	int __ceph_caps_issued_mask(struct ceph_inode_info ci, int* mask, int touch)
866	{
867	struct ceph_cap *cap;
868	struct rb_node *p;
869	int have = ci->i_snap_caps;
870
871	if ((have & mask) == mask) {
872	dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
873	" (mask %s)\n", ceph_ino(&ci->netfs.inode),
874	ceph_cap_string(have),
875	ceph_cap_string(mask));
876	return `1`;
877	}
878
879	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
880	cap = rb_entry(p, struct ceph_cap, ci_node);
881	if (!__cap_is_valid(cap))
882	continue;
883	if ((cap->issued & mask) == mask) {
884	dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
885	" (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
886	ceph_cap_string(cap->issued),
887	ceph_cap_string(mask));
888	if (touch)
889	__touch_cap(cap);
890	return `1`;
891	}
892
893	/ does a combination of caps satisfy mask? /
894	have \|= cap->issued;
895	if ((have & mask) == mask) {
896	dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
897	" (mask %s)\n", ceph_ino(&ci->netfs.inode),
898	ceph_cap_string(cap->issued),
899	ceph_cap_string(mask));
900	if (touch) {
901	struct rb_node *q;
902
903	/ touch this + preceding caps /
904	__touch_cap(cap);
905	for (q = rb_first(&ci->i_caps); q != p;
906	q = rb_next(q)) {
907	cap = rb_entry(q, struct ceph_cap,
908	ci_node);
909	if (!__cap_is_valid(cap))
910	continue;
911	if (cap->issued & mask)
912	__touch_cap(cap);
913	}
914	}
915	return `1`;
916	}
917	}
918
919	return `0`;
920	}
921
922	int __ceph_caps_issued_mask_metric(struct ceph_inode_info ci, int* mask,
923	int touch)
924	{
925	struct ceph_fs_client *fsc = ceph_sb_to_client(sb: ci->netfs.inode.i_sb);
926	int r;
927
928	r = __ceph_caps_issued_mask(ci, mask, touch);
929	if (r)
930	ceph_update_cap_hit(m: &fsc->mdsc->metric);
931	else
932	ceph_update_cap_mis(m: &fsc->mdsc->metric);
933	return r;
934	}
935
936	/*
937	* Return true if mask caps are currently being revoked by an MDS.
938	*/
939	int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
940	struct ceph_cap ocap, int* mask)
941	{
942	struct ceph_cap *cap;
943	struct rb_node *p;
944
945	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
946	cap = rb_entry(p, struct ceph_cap, ci_node);
947	if (cap != ocap &&
948	(cap->implemented & ~cap->issued & mask))
949	return `1`;
950	}
951	return `0`;
952	}
953
954	int ceph_caps_revoking(struct ceph_inode_info ci, int* mask)
955	{
956	struct inode *inode = &ci->netfs.inode;
957	int ret;
958
959	spin_lock(lock: &ci->i_ceph_lock);
960	ret = __ceph_caps_revoking_other(ci, NULL, mask);
961	spin_unlock(lock: &ci->i_ceph_lock);
962	dout("ceph_caps_revoking %p %s = %d\n", inode,
963	ceph_cap_string(mask), ret);
964	return ret;
965	}
966
967	int __ceph_caps_used(struct ceph_inode_info *ci)
968	{
969	int used = `0`;
970	if (ci->i_pin_ref)
971	used \|= CEPH_CAP_PIN;
972	if (ci->i_rd_ref)
973	used \|= CEPH_CAP_FILE_RD;
974	if (ci->i_rdcache_ref \|\|
975	(S_ISREG(ci->netfs.inode.i_mode) &&
976	ci->netfs.inode.i_data.nrpages))
977	used \|= CEPH_CAP_FILE_CACHE;
978	if (ci->i_wr_ref)
979	used \|= CEPH_CAP_FILE_WR;
980	if (ci->i_wb_ref \|\| ci->i_wrbuffer_ref)
981	used \|= CEPH_CAP_FILE_BUFFER;
982	if (ci->i_fx_ref)
983	used \|= CEPH_CAP_FILE_EXCL;
984	return used;
985	}
986
987	#define FMODE_WAIT_BIAS 1000
988
989	/*
990	* wanted, by virtue of open file modes
991	*/
992	int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
993	{
994	const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
995	const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
996	const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
997	const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
998	struct ceph_mount_options *opt =
999	ceph_inode_to_client(inode: &ci->netfs.inode)->mount_options;
1000	unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
1001	unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
1002
1003	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1004	int want = `0`;
1005
1006	/ use used_cutoff here, to keep dir's wanted caps longer /
1007	if (ci->i_nr_by_mode[RD_SHIFT] > `0` \|\|
1008	time_after(ci->i_last_rd, used_cutoff))
1009	want \|= CEPH_CAP_ANY_SHARED;
1010
1011	if (ci->i_nr_by_mode[WR_SHIFT] > `0` \|\|
1012	time_after(ci->i_last_wr, used_cutoff)) {
1013	want \|= CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
1014	if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1015	want \|= CEPH_CAP_ANY_DIR_OPS;
1016	}
1017
1018	if (want \|\| ci->i_nr_by_mode[PIN_SHIFT] > `0`)
1019	want \|= CEPH_CAP_PIN;
1020
1021	return want;
1022	} else {
1023	int bits = `0`;
1024
1025	if (ci->i_nr_by_mode[RD_SHIFT] > `0`) {
1026	if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS \|\|
1027	time_after(ci->i_last_rd, used_cutoff))
1028	bits \|= `1` << RD_SHIFT;
1029	} else if (time_after(ci->i_last_rd, idle_cutoff)) {
1030	bits \|= `1` << RD_SHIFT;
1031	}
1032
1033	if (ci->i_nr_by_mode[WR_SHIFT] > `0`) {
1034	if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS \|\|
1035	time_after(ci->i_last_wr, used_cutoff))
1036	bits \|= `1` << WR_SHIFT;
1037	} else if (time_after(ci->i_last_wr, idle_cutoff)) {
1038	bits \|= `1` << WR_SHIFT;
1039	}
1040
1041	/ check lazyio only when read/write is wanted /
1042	if ((bits & (CEPH_FILE_MODE_RDWR << `1`)) &&
1043	ci->i_nr_by_mode[LAZY_SHIFT] > `0`)
1044	bits \|= `1` << LAZY_SHIFT;
1045
1046	return bits ? ceph_caps_for_mode(mode: bits >> `1`) : `0`;
1047	}
1048	}
1049
1050	/*
1051	* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1052	*/
1053	int __ceph_caps_wanted(struct ceph_inode_info *ci)
1054	{
1055	int w = __ceph_caps_file_wanted(ci) \| __ceph_caps_used(ci);
1056	if (S_ISDIR(ci->netfs.inode.i_mode)) {
1057	/ we want EXCL if holding caps of dir ops /
1058	if (w & CEPH_CAP_ANY_DIR_OPS)
1059	w \|= CEPH_CAP_FILE_EXCL;
1060	} else {
1061	/ we want EXCL if dirty data /
1062	if (w & CEPH_CAP_FILE_BUFFER)
1063	w \|= CEPH_CAP_FILE_EXCL;
1064	}
1065	return w;
1066	}
1067
1068	/*
1069	* Return caps we have registered with the MDS(s) as 'wanted'.
1070	*/
1071	int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
1072	{
1073	struct ceph_cap *cap;
1074	struct rb_node *p;
1075	int mds_wanted = `0`;
1076
1077	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1078	cap = rb_entry(p, struct ceph_cap, ci_node);
1079	if (check && !__cap_is_valid(cap))
1080	continue;
1081	if (cap == ci->i_auth_cap)
1082	mds_wanted \|= cap->mds_wanted;
1083	else
1084	mds_wanted \|= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
1085	}
1086	return mds_wanted;
1087	}
1088
1089	int ceph_is_any_caps(struct inode *inode)
1090	{
1091	struct ceph_inode_info *ci = ceph_inode(inode);
1092	int ret;
1093
1094	spin_lock(lock: &ci->i_ceph_lock);
1095	ret = __ceph_is_any_real_caps(ci);
1096	spin_unlock(lock: &ci->i_ceph_lock);
1097
1098	return ret;
1099	}
1100
1101	/*
1102	* Remove a cap. Take steps to deal with a racing iterate_session_caps.
1103	*
1104	* caller should hold i_ceph_lock.
1105	* caller will not hold session s_mutex if called from destroy_inode.
1106	*/
1107	void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
1108	{
1109	struct ceph_mds_session *session = cap->session;
1110	struct ceph_inode_info *ci = cap->ci;
1111	struct ceph_mds_client *mdsc;
1112	int removed = `0`;
1113
1114	/ 'ci' being NULL means the remove have already occurred /
1115	if (!ci) {
1116	dout("%s: cap inode is NULL\n", __func__);
1117	return;
1118	}
1119
1120	lockdep_assert_held(&ci->i_ceph_lock);
1121
1122	dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
1123
1124	mdsc = ceph_inode_to_client(inode: &ci->netfs.inode)->mdsc;
1125
1126	/ remove from inode's cap rbtree, and clear auth cap /
1127	rb_erase(&cap->ci_node, &ci->i_caps);
1128	if (ci->i_auth_cap == cap)
1129	ci->i_auth_cap = NULL;
1130
1131	/ remove from session list /
1132	spin_lock(lock: &session->s_cap_lock);
1133	if (session->s_cap_iterator == cap) {
1134	/ not yet, we are iterating over this very cap /
1135	dout("__ceph_remove_cap delaying %p removal from session %p\n",
1136	cap, cap->session);
1137	} else {
1138	list_del_init(entry: &cap->session_caps);
1139	session->s_nr_caps--;
1140	atomic64_dec(v: &mdsc->metric.total_caps);
1141	cap->session = NULL;
1142	removed = `1`;
1143	}
1144	/ protect backpointer with s_cap_lock: see iterate_session_caps /
1145	cap->ci = NULL;
1146
1147	/*
1148	* s_cap_reconnect is protected by s_cap_lock. no one changes
1149	* s_cap_gen while session is in the reconnect state.
1150	*/
1151	if (queue_release &&
1152	(!session->s_cap_reconnect \|\|
1153	cap->cap_gen == atomic_read(v: &session->s_cap_gen))) {
1154	cap->queue_release = `1`;
1155	if (removed) {
1156	__ceph_queue_cap_release(session, cap);
1157	removed = `0`;
1158	}
1159	} else {
1160	cap->queue_release = `0`;
1161	}
1162	cap->cap_ino = ci->i_vino.ino;
1163
1164	spin_unlock(lock: &session->s_cap_lock);
1165
1166	if (removed)
1167	ceph_put_cap(mdsc, cap);
1168
1169	if (!__ceph_is_any_real_caps(ci)) {
1170	/ when reconnect denied, we remove session caps forcibly,*
1171	* i_wr_ref can be non-zero. If there are ongoing write,
1172	* keep i_snap_realm.
1173	*/
1174	if (ci->i_wr_ref == `0` && ci->i_snap_realm)
1175	ceph_change_snap_realm(inode: &ci->netfs.inode, NULL);
1176
1177	__cap_delay_cancel(mdsc, ci);
1178	}
1179	}
1180
1181	void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
1182	{
1183	struct ceph_inode_info *ci = cap->ci;
1184	struct ceph_fs_client *fsc;
1185
1186	/ 'ci' being NULL means the remove have already occurred /
1187	if (!ci) {
1188	dout("%s: cap inode is NULL\n", __func__);
1189	return;
1190	}
1191
1192	lockdep_assert_held(&ci->i_ceph_lock);
1193
1194	fsc = ceph_inode_to_client(inode: &ci->netfs.inode);
1195	WARN_ON_ONCE(ci->i_auth_cap == cap &&
1196	!list_empty(&ci->i_dirty_item) &&
1197	!fsc->blocklisted &&
1198	!ceph_inode_is_shutdown(&ci->netfs.inode));
1199
1200	__ceph_remove_cap(cap, queue_release);
1201	}
1202
1203	struct cap_msg_args {
1204	struct ceph_mds_session *session;
1205	u64 ino, cid, follows;
1206	u64 flush_tid, oldest_flush_tid, size, max_size;
1207	u64 xattr_version;
1208	u64 change_attr;
1209	struct ceph_buffer *xattr_buf;
1210	struct ceph_buffer *old_xattr_buf;
1211	struct timespec64 atime, mtime, ctime, btime;
1212	int op, caps, wanted, dirty;
1213	u32 seq, issue_seq, mseq, time_warp_seq;
1214	u32 flags;
1215	kuid_t uid;
1216	kgid_t gid;
1217	umode_t mode;
1218	bool inline_data;
1219	bool wake;
1220	bool encrypted;
1221	u32 fscrypt_auth_len;
1222	u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
1223	};
1224
1225	/ Marshal up the cap msg to the MDS /
1226	static void encode_cap_msg(struct ceph_msg msg, struct* cap_msg_args *arg)
1227	{
1228	struct ceph_mds_caps *fc;
1229	void *p;
1230	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
1231
1232	dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
1233	__func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
1234	ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
1235	ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
1236	arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
1237	arg->size, arg->max_size, arg->xattr_version,
1238	arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : `0`);
1239
1240	msg->hdr.version = cpu_to_le16(`12`);
1241	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
1242
1243	fc = msg->front.iov_base;
1244	memset(fc, `0`, sizeof(*fc));
1245
1246	fc->cap_id = cpu_to_le64(arg->cid);
1247	fc->op = cpu_to_le32(arg->op);
1248	fc->seq = cpu_to_le32(arg->seq);
1249	fc->issue_seq = cpu_to_le32(arg->issue_seq);
1250	fc->migrate_seq = cpu_to_le32(arg->mseq);
1251	fc->caps = cpu_to_le32(arg->caps);
1252	fc->wanted = cpu_to_le32(arg->wanted);
1253	fc->dirty = cpu_to_le32(arg->dirty);
1254	fc->ino = cpu_to_le64(arg->ino);
1255	fc->snap_follows = cpu_to_le64(arg->follows);
1256
1257	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1258	if (arg->encrypted)
1259	fc->size = cpu_to_le64(round_up(arg->size,
1260	CEPH_FSCRYPT_BLOCK_SIZE));
1261	else
1262	#endif
1263	fc->size = cpu_to_le64(arg->size);
1264	fc->max_size = cpu_to_le64(arg->max_size);
1265	ceph_encode_timespec64(tv: &fc->mtime, ts: &arg->mtime);
1266	ceph_encode_timespec64(tv: &fc->atime, ts: &arg->atime);
1267	ceph_encode_timespec64(tv: &fc->ctime, ts: &arg->ctime);
1268	fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
1269
1270	fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
1271	fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
1272	fc->mode = cpu_to_le32(arg->mode);
1273
1274	fc->xattr_version = cpu_to_le64(arg->xattr_version);
1275	if (arg->xattr_buf) {
1276	msg->middle = ceph_buffer_get(b: arg->xattr_buf);
1277	fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1278	msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1279	}
1280
1281	p = fc + `1`;
1282	/ flock buffer size (version 2) /
1283	ceph_encode_32(p: &p, v: `0`);
1284	/ inline version (version 4) /
1285	ceph_encode_64(p: &p, v: arg->inline_data ? `0` : CEPH_INLINE_NONE);
1286	/ inline data size /
1287	ceph_encode_32(p: &p, v: `0`);
1288	/*
1289	* osd_epoch_barrier (version 5)
1290	* The epoch_barrier is protected osdc->lock, so READ_ONCE here in
1291	* case it was recently changed
1292	*/
1293	ceph_encode_32(p: &p, READ_ONCE(osdc->epoch_barrier));
1294	/ oldest_flush_tid (version 6) /
1295	ceph_encode_64(p: &p, v: arg->oldest_flush_tid);
1296
1297	/*
1298	* caller_uid/caller_gid (version 7)
1299	*
1300	* Currently, we don't properly track which caller dirtied the caps
1301	* last, and force a flush of them when there is a conflict. For now,
1302	* just set this to 0:0, to emulate how the MDS has worked up to now.
1303	*/
1304	ceph_encode_32(p: &p, v: `0`);
1305	ceph_encode_32(p: &p, v: `0`);
1306
1307	/ pool namespace (version 8) (mds always ignores this) /
1308	ceph_encode_32(p: &p, v: `0`);
1309
1310	/ btime and change_attr (version 9) /
1311	ceph_encode_timespec64(tv: p, ts: &arg->btime);
1312	p += sizeof(struct ceph_timespec);
1313	ceph_encode_64(p: &p, v: arg->change_attr);
1314
1315	/ Advisory flags (version 10) /
1316	ceph_encode_32(p: &p, v: arg->flags);
1317
1318	/ dirstats (version 11) - these are r/o on the client /
1319	ceph_encode_64(p: &p, v: `0`);
1320	ceph_encode_64(p: &p, v: `0`);
1321
1322	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1323	/*
1324	* fscrypt_auth and fscrypt_file (version 12)
1325	*
1326	* fscrypt_auth holds the crypto context (if any). fscrypt_file
1327	* tracks the real i_size as an __le64 field (and we use a rounded-up
1328	* i_size in the traditional size field).
1329	*/
1330	ceph_encode_32(p: &p, v: arg->fscrypt_auth_len);
1331	ceph_encode_copy(p: &p, s: arg->fscrypt_auth, len: arg->fscrypt_auth_len);
1332	ceph_encode_32(p: &p, v: sizeof(__le64));
1333	ceph_encode_64(p: &p, v: arg->size);
1334	#else /* CONFIG_FS_ENCRYPTION */
1335	ceph_encode_32(&p, `0`);
1336	ceph_encode_32(&p, `0`);
1337	#endif /* CONFIG_FS_ENCRYPTION */
1338	}
1339
1340	/*
1341	* Queue cap releases when an inode is dropped from our cache.
1342	*/
1343	void __ceph_remove_caps(struct ceph_inode_info *ci)
1344	{
1345	struct rb_node *p;
1346
1347	/ lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)*
1348	* may call __ceph_caps_issued_mask() on a freeing inode. */
1349	spin_lock(lock: &ci->i_ceph_lock);
1350	p = rb_first(&ci->i_caps);
1351	while (p) {
1352	struct ceph_cap cap = rb_entry(p, struct* ceph_cap, ci_node);
1353	p = rb_next(p);
1354	ceph_remove_cap(cap, queue_release: true);
1355	}
1356	spin_unlock(lock: &ci->i_ceph_lock);
1357	}
1358
1359	/*
1360	* Prepare to send a cap message to an MDS. Update the cap state, and populate
1361	* the arg struct with the parameters that will need to be sent. This should
1362	* be done under the i_ceph_lock to guard against changes to cap state.
1363	*
1364	* Make note of max_size reported/requested from mds, revoked caps
1365	* that have now been implemented.
1366	*/
1367	static void __prep_cap(struct cap_msg_args arg, struct* ceph_cap *cap,
1368	int op, int flags, int used, int want, int retain,
1369	int flushing, u64 flush_tid, u64 oldest_flush_tid)
1370	{
1371	struct ceph_inode_info *ci = cap->ci;
1372	struct inode *inode = &ci->netfs.inode;
1373	int held, revoking;
1374
1375	lockdep_assert_held(&ci->i_ceph_lock);
1376
1377	held = cap->issued \| cap->implemented;
1378	revoking = cap->implemented & ~cap->issued;
1379	retain &= ~revoking;
1380
1381	dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
1382	__func__, inode, cap, cap->session,
1383	ceph_cap_string(held), ceph_cap_string(held & retain),
1384	ceph_cap_string(revoking));
1385	BUG_ON((retain & CEPH_CAP_PIN) == `0`);
1386
1387	ci->i_ceph_flags &= ~CEPH_I_FLUSH;
1388
1389	cap->issued &= retain; / drop bits we don't want /
1390	/*
1391	* Wake up any waiters on wanted -> needed transition. This is due to
1392	* the weird transition from buffered to sync IO... we need to flush
1393	* dirty pages _before_ allowing sync writes to avoid reordering.
1394	*/
1395	arg->wake = cap->implemented & ~cap->issued;
1396	cap->implemented &= cap->issued \| used;
1397	cap->mds_wanted = want;
1398
1399	arg->session = cap->session;
1400	arg->ino = ceph_vino(inode).ino;
1401	arg->cid = cap->cap_id;
1402	arg->follows = flushing ? ci->i_head_snapc->seq : `0`;
1403	arg->flush_tid = flush_tid;
1404	arg->oldest_flush_tid = oldest_flush_tid;
1405	arg->size = i_size_read(inode);
1406	ci->i_reported_size = arg->size;
1407	arg->max_size = ci->i_wanted_max_size;
1408	if (cap == ci->i_auth_cap) {
1409	if (want & CEPH_CAP_ANY_FILE_WR)
1410	ci->i_requested_max_size = arg->max_size;
1411	else
1412	ci->i_requested_max_size = `0`;
1413	}
1414
1415	if (flushing & CEPH_CAP_XATTR_EXCL) {
1416	arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1417	arg->xattr_version = ci->i_xattrs.version;
1418	arg->xattr_buf = ci->i_xattrs.blob;
1419	} else {
1420	arg->xattr_buf = NULL;
1421	arg->old_xattr_buf = NULL;
1422	}
1423
1424	arg->mtime = inode_get_mtime(inode);
1425	arg->atime = inode_get_atime(inode);
1426	arg->ctime = inode_get_ctime(inode);
1427	arg->btime = ci->i_btime;
1428	arg->change_attr = inode_peek_iversion_raw(inode);
1429
1430	arg->op = op;
1431	arg->caps = cap->implemented;
1432	arg->wanted = want;
1433	arg->dirty = flushing;
1434
1435	arg->seq = cap->seq;
1436	arg->issue_seq = cap->issue_seq;
1437	arg->mseq = cap->mseq;
1438	arg->time_warp_seq = ci->i_time_warp_seq;
1439
1440	arg->uid = inode->i_uid;
1441	arg->gid = inode->i_gid;
1442	arg->mode = inode->i_mode;
1443
1444	arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
1445	if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1446	!list_empty(head: &ci->i_cap_snaps)) {
1447	struct ceph_cap_snap *capsnap;
1448	list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1449	if (capsnap->cap_flush.tid)
1450	break;
1451	if (capsnap->need_flush) {
1452	flags \|= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1453	break;
1454	}
1455	}
1456	}
1457	arg->flags = flags;
1458	arg->encrypted = IS_ENCRYPTED(inode);
1459	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1460	if (ci->fscrypt_auth_len &&
1461	WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
1462	/ Don't set this if it's too big /
1463	arg->fscrypt_auth_len = `0`;
1464	} else {
1465	arg->fscrypt_auth_len = ci->fscrypt_auth_len;
1466	memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
1467	min_t(size_t, ci->fscrypt_auth_len,
1468	sizeof(arg->fscrypt_auth)));
1469	}
1470	#endif /* CONFIG_FS_ENCRYPTION */
1471	}
1472
1473	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
1474	#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
1475	4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
1476
1477	static inline int cap_msg_size(struct cap_msg_args *arg)
1478	{
1479	return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
1480	}
1481	#else
1482	#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
1483	4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
1484
1485	static inline int cap_msg_size(struct cap_msg_args *arg)
1486	{
1487	return CAP_MSG_FIXED_FIELDS;
1488	}
1489	#endif /* CONFIG_FS_ENCRYPTION */
1490
1491	/*
1492	* Send a cap msg on the given inode.
1493	*
1494	* Caller should hold snap_rwsem (read), s_mutex.
1495	*/
1496	static void __send_cap(struct cap_msg_args arg, struct* ceph_inode_info *ci)
1497	{
1498	struct ceph_msg *msg;
1499	struct inode *inode = &ci->netfs.inode;
1500
1501	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, front_len: cap_msg_size(arg), GFP_NOFS,
1502	can_fail: false);
1503	if (!msg) {
1504	pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
1505	ceph_vinop(inode), ceph_cap_string(arg->dirty),
1506	arg->flush_tid);
1507	spin_lock(lock: &ci->i_ceph_lock);
1508	__cap_delay_requeue(mdsc: arg->session->s_mdsc, ci);
1509	spin_unlock(lock: &ci->i_ceph_lock);
1510	return;
1511	}
1512
1513	encode_cap_msg(msg, arg);
1514	ceph_con_send(con: &arg->session->s_con, msg);
1515	ceph_buffer_put(b: arg->old_xattr_buf);
1516	if (arg->wake)
1517	wake_up_all(&ci->i_cap_wq);
1518	}
1519
1520	static inline int __send_flush_snap(struct inode *inode,
1521	struct ceph_mds_session *session,
1522	struct ceph_cap_snap *capsnap,
1523	u32 mseq, u64 oldest_flush_tid)
1524	{
1525	struct cap_msg_args arg;
1526	struct ceph_msg *msg;
1527
1528	arg.session = session;
1529	arg.ino = ceph_vino(inode).ino;
1530	arg.cid = `0`;
1531	arg.follows = capsnap->follows;
1532	arg.flush_tid = capsnap->cap_flush.tid;
1533	arg.oldest_flush_tid = oldest_flush_tid;
1534
1535	arg.size = capsnap->size;
1536	arg.max_size = `0`;
1537	arg.xattr_version = capsnap->xattr_version;
1538	arg.xattr_buf = capsnap->xattr_blob;
1539	arg.old_xattr_buf = NULL;
1540
1541	arg.atime = capsnap->atime;
1542	arg.mtime = capsnap->mtime;
1543	arg.ctime = capsnap->ctime;
1544	arg.btime = capsnap->btime;
1545	arg.change_attr = capsnap->change_attr;
1546
1547	arg.op = CEPH_CAP_OP_FLUSHSNAP;
1548	arg.caps = capsnap->issued;
1549	arg.wanted = `0`;
1550	arg.dirty = capsnap->dirty;
1551
1552	arg.seq = `0`;
1553	arg.issue_seq = `0`;
1554	arg.mseq = mseq;
1555	arg.time_warp_seq = capsnap->time_warp_seq;
1556
1557	arg.uid = capsnap->uid;
1558	arg.gid = capsnap->gid;
1559	arg.mode = capsnap->mode;
1560
1561	arg.inline_data = capsnap->inline_data;
1562	arg.flags = `0`;
1563	arg.wake = false;
1564	arg.encrypted = IS_ENCRYPTED(inode);
1565
1566	/ No fscrypt_auth changes from a capsnap./
1567	arg.fscrypt_auth_len = `0`;
1568
1569	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, front_len: cap_msg_size(arg: &arg),
1570	GFP_NOFS, can_fail: false);
1571	if (!msg)
1572	return -ENOMEM;
1573
1574	encode_cap_msg(msg, arg: &arg);
1575	ceph_con_send(con: &arg.session->s_con, msg);
1576	return `0`;
1577	}
1578
1579	/*
1580	* When a snapshot is taken, clients accumulate dirty metadata on
1581	* inodes with capabilities in ceph_cap_snaps to describe the file
1582	* state at the time the snapshot was taken. This must be flushed
1583	* asynchronously back to the MDS once sync writes complete and dirty
1584	* data is written out.
1585	*
1586	* Called under i_ceph_lock.
1587	*/
1588	static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1589	struct ceph_mds_session *session)
1590	__releases(ci->i_ceph_lock)
1591	__acquires(ci->i_ceph_lock)
1592	{
1593	struct inode *inode = &ci->netfs.inode;
1594	struct ceph_mds_client *mdsc = session->s_mdsc;
1595	struct ceph_cap_snap *capsnap;
1596	u64 oldest_flush_tid = `0`;
1597	u64 first_tid = `1`, last_tid = `0`;
1598
1599	dout("__flush_snaps %p session %p\n", inode, session);
1600
1601	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1602	/*
1603	* we need to wait for sync writes to complete and for dirty
1604	* pages to be written out.
1605	*/
1606	if (capsnap->dirty_pages \|\| capsnap->writing)
1607	break;
1608
1609	/ should be removed by ceph_try_drop_cap_snap() /
1610	BUG_ON(!capsnap->need_flush);
1611
1612	/ only flush each capsnap once /
1613	if (capsnap->cap_flush.tid > `0`) {
1614	dout(" already flushed %p, skipping\n", capsnap);
1615	continue;
1616	}
1617
1618	spin_lock(lock: &mdsc->cap_dirty_lock);
1619	capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1620	list_add_tail(new: &capsnap->cap_flush.g_list,
1621	head: &mdsc->cap_flush_list);
1622	if (oldest_flush_tid == `0`)
1623	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1624	if (list_empty(head: &ci->i_flushing_item)) {
1625	list_add_tail(new: &ci->i_flushing_item,
1626	head: &session->s_cap_flushing);
1627	}
1628	spin_unlock(lock: &mdsc->cap_dirty_lock);
1629
1630	list_add_tail(new: &capsnap->cap_flush.i_list,
1631	head: &ci->i_cap_flush_list);
1632
1633	if (first_tid == `1`)
1634	first_tid = capsnap->cap_flush.tid;
1635	last_tid = capsnap->cap_flush.tid;
1636	}
1637
1638	ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1639
1640	while (first_tid <= last_tid) {
1641	struct ceph_cap *cap = ci->i_auth_cap;
1642	struct ceph_cap_flush cf = NULL, iter;
1643	int ret;
1644
1645	if (!(cap && cap->session == session)) {
1646	dout("__flush_snaps %p auth cap %p not mds%d, "
1647	"stop\n", inode, cap, session->s_mds);
1648	break;
1649	}
1650
1651	ret = -ENOENT;
1652	list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
1653	if (iter->tid >= first_tid) {
1654	cf = iter;
1655	ret = `0`;
1656	break;
1657	}
1658	}
1659	if (ret < `0`)
1660	break;
1661
1662	first_tid = cf->tid + `1`;
1663
1664	capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
1665	refcount_inc(r: &capsnap->nref);
1666	spin_unlock(lock: &ci->i_ceph_lock);
1667
1668	dout("__flush_snaps %p capsnap %p tid %llu %s\n",
1669	inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
1670
1671	ret = __send_flush_snap(inode, session, capsnap, mseq: cap->mseq,
1672	oldest_flush_tid);
1673	if (ret < `0`) {
1674	pr_err("__flush_snaps: error sending cap flushsnap, "
1675	"ino (%llx.%llx) tid %llu follows %llu\n",
1676	ceph_vinop(inode), cf->tid, capsnap->follows);
1677	}
1678
1679	ceph_put_cap_snap(capsnap);
1680	spin_lock(lock: &ci->i_ceph_lock);
1681	}
1682	}
1683
1684	void ceph_flush_snaps(struct ceph_inode_info *ci,
1685	struct ceph_mds_session **psession)
1686	{
1687	struct inode *inode = &ci->netfs.inode;
1688	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1689	struct ceph_mds_session *session = NULL;
1690	bool need_put = false;
1691	int mds;
1692
1693	dout("ceph_flush_snaps %p\n", inode);
1694	if (psession)
1695	session = *psession;
1696	retry:
1697	spin_lock(lock: &ci->i_ceph_lock);
1698	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
1699	dout(" no capsnap needs flush, doing nothing\n");
1700	goto out;
1701	}
1702	if (!ci->i_auth_cap) {
1703	dout(" no auth cap (migrating?), doing nothing\n");
1704	goto out;
1705	}
1706
1707	mds = ci->i_auth_cap->session->s_mds;
1708	if (session && session->s_mds != mds) {
1709	dout(" oops, wrong session %p mutex\n", session);
1710	ceph_put_mds_session(s: session);
1711	session = NULL;
1712	}
1713	if (!session) {
1714	spin_unlock(lock: &ci->i_ceph_lock);
1715	mutex_lock(&mdsc->mutex);
1716	session = __ceph_lookup_mds_session(mdsc, mds);
1717	mutex_unlock(lock: &mdsc->mutex);
1718	goto retry;
1719	}
1720
1721	// make sure flushsnap messages are sent in proper order.
1722	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
1723	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
1724
1725	__ceph_flush_snaps(ci, session);
1726	out:
1727	spin_unlock(lock: &ci->i_ceph_lock);
1728
1729	if (psession)
1730	*psession = session;
1731	else
1732	ceph_put_mds_session(s: session);
1733	/ we flushed them all; remove this inode from the queue /
1734	spin_lock(lock: &mdsc->snap_flush_lock);
1735	if (!list_empty(head: &ci->i_snap_flush_item))
1736	need_put = true;
1737	list_del_init(entry: &ci->i_snap_flush_item);
1738	spin_unlock(lock: &mdsc->snap_flush_lock);
1739
1740	if (need_put)
1741	iput(inode);
1742	}
1743
1744	/*
1745	* Mark caps dirty. If inode is newly dirty, return the dirty flags.
1746	* Caller is then responsible for calling __mark_inode_dirty with the
1747	* returned flags value.
1748	*/
1749	int __ceph_mark_dirty_caps(struct ceph_inode_info ci, int* mask,
1750	struct ceph_cap_flush **pcf)
1751	{
1752	struct ceph_mds_client *mdsc =
1753	ceph_sb_to_client(sb: ci->netfs.inode.i_sb)->mdsc;
1754	struct inode *inode = &ci->netfs.inode;
1755	int was = ci->i_dirty_caps;
1756	int dirty = `0`;
1757
1758	lockdep_assert_held(&ci->i_ceph_lock);
1759
1760	if (!ci->i_auth_cap) {
1761	pr_warn("__mark_dirty_caps %p %llx mask %s, "
1762	"but no auth cap (session was closed?)\n",
1763	inode, ceph_ino(inode), ceph_cap_string(mask));
1764	return `0`;
1765	}
1766
1767	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
1768	ceph_cap_string(mask), ceph_cap_string(was),
1769	ceph_cap_string(was \| mask));
1770	ci->i_dirty_caps \|= mask;
1771	if (was == `0`) {
1772	struct ceph_mds_session *session = ci->i_auth_cap->session;
1773
1774	WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1775	swap(ci->i_prealloc_cap_flush, *pcf);
1776
1777	if (!ci->i_head_snapc) {
1778	WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
1779	ci->i_head_snapc = ceph_get_snap_context(
1780	sc: ci->i_snap_realm->cached_context);
1781	}
1782	dout(" inode %p now dirty snapc %p auth cap %p\n",
1783	&ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
1784	BUG_ON(!list_empty(&ci->i_dirty_item));
1785	spin_lock(lock: &mdsc->cap_dirty_lock);
1786	list_add(new: &ci->i_dirty_item, head: &session->s_cap_dirty);
1787	spin_unlock(lock: &mdsc->cap_dirty_lock);
1788	if (ci->i_flushing_caps == `0`) {
1789	ihold(inode);
1790	dirty \|= I_DIRTY_SYNC;
1791	}
1792	} else {
1793	WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
1794	}
1795	BUG_ON(list_empty(&ci->i_dirty_item));
1796	if (((was \| ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1797	(mask & CEPH_CAP_FILE_BUFFER))
1798	dirty \|= I_DIRTY_DATASYNC;
1799	__cap_delay_requeue(mdsc, ci);
1800	return dirty;
1801	}
1802
1803	struct ceph_cap_flush ceph_alloc_cap_flush(void*)
1804	{
1805	struct ceph_cap_flush *cf;
1806
1807	cf = kmem_cache_alloc(cachep: ceph_cap_flush_cachep, GFP_KERNEL);
1808	if (!cf)
1809	return NULL;
1810
1811	cf->is_capsnap = false;
1812	return cf;
1813	}
1814
1815	void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1816	{
1817	if (cf)
1818	kmem_cache_free(s: ceph_cap_flush_cachep, objp: cf);
1819	}
1820
1821	static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1822	{
1823	if (!list_empty(head: &mdsc->cap_flush_list)) {
1824	struct ceph_cap_flush *cf =
1825	list_first_entry(&mdsc->cap_flush_list,
1826	struct ceph_cap_flush, g_list);
1827	return cf->tid;
1828	}
1829	return `0`;
1830	}
1831
1832	/*
1833	* Remove cap_flush from the mdsc's or inode's flushing cap list.
1834	* Return true if caller needs to wake up flush waiters.
1835	*/
1836	static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1837	struct ceph_cap_flush *cf)
1838	{
1839	struct ceph_cap_flush *prev;
1840	bool wake = cf->wake;
1841
1842	if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1843	prev = list_prev_entry(cf, g_list);
1844	prev->wake = true;
1845	wake = false;
1846	}
1847	list_del_init(entry: &cf->g_list);
1848	return wake;
1849	}
1850
1851	static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1852	struct ceph_cap_flush *cf)
1853	{
1854	struct ceph_cap_flush *prev;
1855	bool wake = cf->wake;
1856
1857	if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1858	prev = list_prev_entry(cf, i_list);
1859	prev->wake = true;
1860	wake = false;
1861	}
1862	list_del_init(entry: &cf->i_list);
1863	return wake;
1864	}
1865
1866	/*
1867	* Add dirty inode to the flushing list. Assigned a seq number so we
1868	* can wait for caps to flush without starving.
1869	*
1870	* Called under i_ceph_lock. Returns the flush tid.
1871	*/
1872	static u64 __mark_caps_flushing(struct inode *inode,
1873	struct ceph_mds_session *session, bool wake,
1874	u64 *oldest_flush_tid)
1875	{
1876	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
1877	struct ceph_inode_info *ci = ceph_inode(inode);
1878	struct ceph_cap_flush *cf = NULL;
1879	int flushing;
1880
1881	lockdep_assert_held(&ci->i_ceph_lock);
1882	BUG_ON(ci->i_dirty_caps == `0`);
1883	BUG_ON(list_empty(&ci->i_dirty_item));
1884	BUG_ON(!ci->i_prealloc_cap_flush);
1885
1886	flushing = ci->i_dirty_caps;
1887	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1888	ceph_cap_string(flushing),
1889	ceph_cap_string(ci->i_flushing_caps),
1890	ceph_cap_string(ci->i_flushing_caps \| flushing));
1891	ci->i_flushing_caps \|= flushing;
1892	ci->i_dirty_caps = `0`;
1893	dout(" inode %p now !dirty\n", inode);
1894
1895	swap(cf, ci->i_prealloc_cap_flush);
1896	cf->caps = flushing;
1897	cf->wake = wake;
1898
1899	spin_lock(lock: &mdsc->cap_dirty_lock);
1900	list_del_init(entry: &ci->i_dirty_item);
1901
1902	cf->tid = ++mdsc->last_cap_flush_tid;
1903	list_add_tail(new: &cf->g_list, head: &mdsc->cap_flush_list);
1904	*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
1905
1906	if (list_empty(head: &ci->i_flushing_item)) {
1907	list_add_tail(new: &ci->i_flushing_item, head: &session->s_cap_flushing);
1908	mdsc->num_cap_flushing++;
1909	}
1910	spin_unlock(lock: &mdsc->cap_dirty_lock);
1911
1912	list_add_tail(new: &cf->i_list, head: &ci->i_cap_flush_list);
1913
1914	return cf->tid;
1915	}
1916
1917	/*
1918	* try to invalidate mapping pages without blocking.
1919	*/
1920	static int try_nonblocking_invalidate(struct inode *inode)
1921	__releases(ci->i_ceph_lock)
1922	__acquires(ci->i_ceph_lock)
1923	{
1924	struct ceph_inode_info *ci = ceph_inode(inode);
1925	u32 invalidating_gen = ci->i_rdcache_gen;
1926
1927	spin_unlock(lock: &ci->i_ceph_lock);
1928	ceph_fscache_invalidate(inode, dio_write: false);
1929	invalidate_mapping_pages(mapping: &inode->i_data, start: `0`, end: -`1`);
1930	spin_lock(lock: &ci->i_ceph_lock);
1931
1932	if (inode->i_data.nrpages == `0` &&
1933	invalidating_gen == ci->i_rdcache_gen) {
1934	/ success. /
1935	dout("try_nonblocking_invalidate %p success\n", inode);
1936	/ save any racing async invalidate some trouble /
1937	ci->i_rdcache_revoking = ci->i_rdcache_gen - `1`;
1938	return `0`;
1939	}
1940	dout("try_nonblocking_invalidate %p failed\n", inode);
1941	return -`1`;
1942	}
1943
1944	bool __ceph_should_report_size(struct ceph_inode_info *ci)
1945	{
1946	loff_t size = i_size_read(inode: &ci->netfs.inode);
1947	/ mds will adjust max size according to the reported size /
1948	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
1949	return false;
1950	if (size >= ci->i_max_size)
1951	return true;
1952	/ half of previous max_size increment has been used /
1953	if (ci->i_max_size > ci->i_reported_size &&
1954	(size << `1`) >= ci->i_max_size + ci->i_reported_size)
1955	return true;
1956	return false;
1957	}
1958
1959	/*
1960	* Swiss army knife function to examine currently used and wanted
1961	* versus held caps. Release, flush, ack revoked caps to mds as
1962	* appropriate.
1963	*
1964	* CHECK_CAPS_AUTHONLY - we should only check the auth cap
1965	* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1966	* further delay.
1967	*/
1968	void ceph_check_caps(struct ceph_inode_info ci, int* flags)
1969	{
1970	struct inode *inode = &ci->netfs.inode;
1971	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: inode->i_sb);
1972	struct ceph_cap *cap;
1973	u64 flush_tid, oldest_flush_tid;
1974	int file_wanted, used, cap_used;
1975	int issued, implemented, want, retain, revoking, flushing = `0`;
1976	int mds = -`1`; / keep track of how far we've gone through i_caps list*
1977	to avoid an infinite loop on retry /*
1978	struct rb_node *p;
1979	bool queue_invalidate = false;
1980	bool tried_invalidate = false;
1981	bool queue_writeback = false;
1982	struct ceph_mds_session *session = NULL;
1983
1984	spin_lock(lock: &ci->i_ceph_lock);
1985	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
1986	ci->i_ceph_flags \|= CEPH_I_ASYNC_CHECK_CAPS;
1987
1988	/ Don't send messages until we get async create reply /
1989	spin_unlock(lock: &ci->i_ceph_lock);
1990	return;
1991	}
1992
1993	if (ci->i_ceph_flags & CEPH_I_FLUSH)
1994	flags \|= CHECK_CAPS_FLUSH;
1995	retry:
1996	/ Caps wanted by virtue of active open files. /
1997	file_wanted = __ceph_caps_file_wanted(ci);
1998
1999	/ Caps which have active references against them /
2000	used = __ceph_caps_used(ci);
2001
2002	/*
2003	* "issued" represents the current caps that the MDS wants us to have.
2004	* "implemented" is the set that we have been granted, and includes the
2005	* ones that have not yet been returned to the MDS (the "revoking" set,
2006	* usually because they have outstanding references).
2007	*/
2008	issued = __ceph_caps_issued(ci, implemented: &implemented);
2009	revoking = implemented & ~issued;
2010
2011	want = file_wanted;
2012
2013	/ The ones we currently want to retain (may be adjusted below) /
2014	retain = file_wanted \| used \| CEPH_CAP_PIN;
2015	if (!mdsc->stopping && inode->i_nlink > `0`) {
2016	if (file_wanted) {
2017	retain \|= CEPH_CAP_ANY; / be greedy /
2018	} else if (S_ISDIR(inode->i_mode) &&
2019	(issued & CEPH_CAP_FILE_SHARED) &&
2020	__ceph_dir_is_complete(ci)) {
2021	/*
2022	* If a directory is complete, we want to keep
2023	* the exclusive cap. So that MDS does not end up
2024	* revoking the shared cap on every create/unlink
2025	* operation.
2026	*/
2027	if (IS_RDONLY(inode)) {
2028	want = CEPH_CAP_ANY_SHARED;
2029	} else {
2030	want \|= CEPH_CAP_ANY_SHARED \| CEPH_CAP_FILE_EXCL;
2031	}
2032	retain \|= want;
2033	} else {
2034
2035	retain \|= CEPH_CAP_ANY_SHARED;
2036	/*
2037	* keep RD only if we didn't have the file open RW,
2038	* because then the mds would revoke it anyway to
2039	* journal max_size=0.
2040	*/
2041	if (ci->i_max_size == `0`)
2042	retain \|= CEPH_CAP_ANY_RD;
2043	}
2044	}
2045
2046	dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s"
2047	" issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode),
2048	ceph_cap_string(file_wanted),
2049	ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
2050	ceph_cap_string(ci->i_flushing_caps),
2051	ceph_cap_string(issued), ceph_cap_string(revoking),
2052	ceph_cap_string(retain),
2053	(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
2054	(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "",
2055	(flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : "");
2056
2057	/*
2058	* If we no longer need to hold onto old our caps, and we may
2059	* have cached pages, but don't want them, then try to invalidate.
2060	* If we fail, it's because pages are locked.... try again later.
2061	*/
2062	if ((!(flags & CHECK_CAPS_NOINVAL) \|\| mdsc->stopping) &&
2063	S_ISREG(inode->i_mode) &&
2064	!(ci->i_wb_ref \|\| ci->i_wrbuffer_ref) && / no dirty pages... /
2065	inode->i_data.nrpages && / have cached pages /
2066	(revoking & (CEPH_CAP_FILE_CACHE\|
2067	CEPH_CAP_FILE_LAZYIO)) && / or revoking cache /
2068	!tried_invalidate) {
2069	dout("check_caps trying to invalidate on %llx.%llx\n",
2070	ceph_vinop(inode));
2071	if (try_nonblocking_invalidate(inode) < `0`) {
2072	dout("check_caps queuing invalidate\n");
2073	queue_invalidate = true;
2074	ci->i_rdcache_revoking = ci->i_rdcache_gen;
2075	}
2076	tried_invalidate = true;
2077	goto retry;
2078	}
2079
2080	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2081	int mflags = `0`;
2082	struct cap_msg_args arg;
2083
2084	cap = rb_entry(p, struct ceph_cap, ci_node);
2085
2086	/ avoid looping forever /
2087	if (mds >= cap->mds \|\|
2088	((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
2089	continue;
2090
2091	/*
2092	* If we have an auth cap, we don't need to consider any
2093	* overlapping caps as used.
2094	*/
2095	cap_used = used;
2096	if (ci->i_auth_cap && cap != ci->i_auth_cap)
2097	cap_used &= ~ci->i_auth_cap->issued;
2098
2099	revoking = cap->implemented & ~cap->issued;
2100	dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
2101	cap->mds, cap, ceph_cap_string(cap_used),
2102	ceph_cap_string(cap->issued),
2103	ceph_cap_string(cap->implemented),
2104	ceph_cap_string(revoking));
2105
2106	if (cap == ci->i_auth_cap &&
2107	(cap->issued & CEPH_CAP_FILE_WR)) {
2108	/ request larger max_size from MDS? /
2109	if (ci->i_wanted_max_size > ci->i_max_size &&
2110	ci->i_wanted_max_size > ci->i_requested_max_size) {
2111	dout("requesting new max_size\n");
2112	goto ack;
2113	}
2114
2115	/ approaching file_max? /
2116	if (__ceph_should_report_size(ci)) {
2117	dout("i_size approaching max_size\n");
2118	goto ack;
2119	}
2120	}
2121	/ flush anything dirty? /
2122	if (cap == ci->i_auth_cap) {
2123	if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
2124	dout("flushing dirty caps\n");
2125	goto ack;
2126	}
2127	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
2128	dout("flushing snap caps\n");
2129	goto ack;
2130	}
2131	}
2132
2133	/ completed revocation? going down and there are no caps? /
2134	if (revoking) {
2135	if ((revoking & cap_used) == `0`) {
2136	dout("completed revocation of %s\n",
2137	ceph_cap_string(cap->implemented & ~cap->issued));
2138	goto ack;
2139	}
2140
2141	/*
2142	* If the "i_wrbuffer_ref" was increased by mmap or generic
2143	* cache write just before the ceph_check_caps() is called,
2144	* the Fb capability revoking will fail this time. Then we
2145	* must wait for the BDI's delayed work to flush the dirty
2146	* pages and to release the "i_wrbuffer_ref", which will cost
2147	* at most 5 seconds. That means the MDS needs to wait at
2148	* most 5 seconds to finished the Fb capability's revocation.
2149	*
2150	* Let's queue a writeback for it.
2151	*/
2152	if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
2153	(revoking & CEPH_CAP_FILE_BUFFER))
2154	queue_writeback = true;
2155	}
2156
2157	/ want more caps from mds? /
2158	if (want & ~cap->mds_wanted) {
2159	if (want & ~(cap->mds_wanted \| cap->issued))
2160	goto ack;
2161	if (!__cap_is_valid(cap))
2162	goto ack;
2163	}
2164
2165	/ things we might delay /
2166	if ((cap->issued & ~retain) == `0`)
2167	continue; / nope, all good /
2168
2169	ack:
2170	ceph_put_mds_session(s: session);
2171	session = ceph_get_mds_session(s: cap->session);
2172
2173	/ kick flushing and flush snaps before sending normal*
2174	* cap message */
2175	if (cap == ci->i_auth_cap &&
2176	(ci->i_ceph_flags &
2177	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS))) {
2178	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2179	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
2180	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2181	__ceph_flush_snaps(ci, session);
2182
2183	goto retry;
2184	}
2185
2186	if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2187	flushing = ci->i_dirty_caps;
2188	flush_tid = __mark_caps_flushing(inode, session, wake: false,
2189	oldest_flush_tid: &oldest_flush_tid);
2190	if (flags & CHECK_CAPS_FLUSH &&
2191	list_empty(head: &session->s_cap_dirty))
2192	mflags \|= CEPH_CLIENT_CAPS_SYNC;
2193	} else {
2194	flushing = `0`;
2195	flush_tid = `0`;
2196	spin_lock(lock: &mdsc->cap_dirty_lock);
2197	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2198	spin_unlock(lock: &mdsc->cap_dirty_lock);
2199	}
2200
2201	mds = cap->mds; / remember mds, so we don't repeat /
2202
2203	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_UPDATE, flags: mflags, used: cap_used,
2204	want, retain, flushing, flush_tid, oldest_flush_tid);
2205
2206	spin_unlock(lock: &ci->i_ceph_lock);
2207	__send_cap(arg: &arg, ci);
2208	spin_lock(lock: &ci->i_ceph_lock);
2209
2210	goto retry; / retake i_ceph_lock and restart our cap scan. /
2211	}
2212
2213	/ periodically re-calculate caps wanted by open files /
2214	if (__ceph_is_any_real_caps(ci) &&
2215	list_empty(head: &ci->i_cap_delay_list) &&
2216	(file_wanted & ~CEPH_CAP_PIN) &&
2217	!(used & (CEPH_CAP_FILE_RD \| CEPH_CAP_ANY_FILE_WR))) {
2218	__cap_delay_requeue(mdsc, ci);
2219	}
2220
2221	spin_unlock(lock: &ci->i_ceph_lock);
2222
2223	ceph_put_mds_session(s: session);
2224	if (queue_writeback)
2225	ceph_queue_writeback(inode);
2226	if (queue_invalidate)
2227	ceph_queue_invalidate(inode);
2228	}
2229
2230	/*
2231	* Try to flush dirty caps back to the auth mds.
2232	*/
2233	static int try_flush_caps(struct inode inode, u64 ptid)
2234	{
2235	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
2236	struct ceph_inode_info *ci = ceph_inode(inode);
2237	int flushing = `0`;
2238	u64 flush_tid = `0`, oldest_flush_tid = `0`;
2239
2240	spin_lock(lock: &ci->i_ceph_lock);
2241	retry_locked:
2242	if (ci->i_dirty_caps && ci->i_auth_cap) {
2243	struct ceph_cap *cap = ci->i_auth_cap;
2244	struct cap_msg_args arg;
2245	struct ceph_mds_session *session = cap->session;
2246
2247	if (session->s_state < CEPH_MDS_SESSION_OPEN) {
2248	spin_unlock(lock: &ci->i_ceph_lock);
2249	goto out;
2250	}
2251
2252	if (ci->i_ceph_flags &
2253	(CEPH_I_KICK_FLUSH \| CEPH_I_FLUSH_SNAPS)) {
2254	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2255	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid: `0`);
2256	if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2257	__ceph_flush_snaps(ci, session);
2258	goto retry_locked;
2259	}
2260
2261	flushing = ci->i_dirty_caps;
2262	flush_tid = __mark_caps_flushing(inode, session, wake: true,
2263	oldest_flush_tid: &oldest_flush_tid);
2264
2265	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
2266	used: __ceph_caps_used(ci), want: __ceph_caps_wanted(ci),
2267	retain: (cap->issued \| cap->implemented),
2268	flushing, flush_tid, oldest_flush_tid);
2269	spin_unlock(lock: &ci->i_ceph_lock);
2270
2271	__send_cap(arg: &arg, ci);
2272	} else {
2273	if (!list_empty(head: &ci->i_cap_flush_list)) {
2274	struct ceph_cap_flush *cf =
2275	list_last_entry(&ci->i_cap_flush_list,
2276	struct ceph_cap_flush, i_list);
2277	cf->wake = true;
2278	flush_tid = cf->tid;
2279	}
2280	flushing = ci->i_flushing_caps;
2281	spin_unlock(lock: &ci->i_ceph_lock);
2282	}
2283	out:
2284	*ptid = flush_tid;
2285	return flushing;
2286	}
2287
2288	/*
2289	* Return true if we've flushed caps through the given flush_tid.
2290	*/
2291	static int caps_are_flushed(struct inode *inode, u64 flush_tid)
2292	{
2293	struct ceph_inode_info *ci = ceph_inode(inode);
2294	int ret = `1`;
2295
2296	spin_lock(lock: &ci->i_ceph_lock);
2297	if (!list_empty(head: &ci->i_cap_flush_list)) {
2298	struct ceph_cap_flush * cf =
2299	list_first_entry(&ci->i_cap_flush_list,
2300	struct ceph_cap_flush, i_list);
2301	if (cf->tid <= flush_tid)
2302	ret = `0`;
2303	}
2304	spin_unlock(lock: &ci->i_ceph_lock);
2305	return ret;
2306	}
2307
2308	/*
2309	* flush the mdlog and wait for any unsafe requests to complete.
2310	*/
2311	static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
2312	{
2313	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
2314	struct ceph_inode_info *ci = ceph_inode(inode);
2315	struct ceph_mds_request req1 = NULL, req2 = NULL;
2316	int ret, err = `0`;
2317
2318	spin_lock(lock: &ci->i_unsafe_lock);
2319	if (S_ISDIR(inode->i_mode) && !list_empty(head: &ci->i_unsafe_dirops)) {
2320	req1 = list_last_entry(&ci->i_unsafe_dirops,
2321	struct ceph_mds_request,
2322	r_unsafe_dir_item);
2323	ceph_mdsc_get_request(req: req1);
2324	}
2325	if (!list_empty(head: &ci->i_unsafe_iops)) {
2326	req2 = list_last_entry(&ci->i_unsafe_iops,
2327	struct ceph_mds_request,
2328	r_unsafe_target_item);
2329	ceph_mdsc_get_request(req: req2);
2330	}
2331	spin_unlock(lock: &ci->i_unsafe_lock);
2332
2333	/*
2334	* Trigger to flush the journal logs in all the relevant MDSes
2335	* manually, or in the worst case we must wait at most 5 seconds
2336	* to wait the journal logs to be flushed by the MDSes periodically.
2337	*/
2338	if (req1 \|\| req2) {
2339	struct ceph_mds_request *req;
2340	struct ceph_mds_session **sessions;
2341	struct ceph_mds_session *s;
2342	unsigned int max_sessions;
2343	int i;
2344
2345	mutex_lock(&mdsc->mutex);
2346	max_sessions = mdsc->max_sessions;
2347
2348	sessions = kcalloc(n: max_sessions, size: sizeof(s), GFP_KERNEL);
2349	if (!sessions) {
2350	mutex_unlock(lock: &mdsc->mutex);
2351	err = -ENOMEM;
2352	goto out;
2353	}
2354
2355	spin_lock(lock: &ci->i_unsafe_lock);
2356	if (req1) {
2357	list_for_each_entry(req, &ci->i_unsafe_dirops,
2358	r_unsafe_dir_item) {
2359	s = req->r_session;
2360	if (!s)
2361	continue;
2362	if (!sessions[s->s_mds]) {
2363	s = ceph_get_mds_session(s);
2364	sessions[s->s_mds] = s;
2365	}
2366	}
2367	}
2368	if (req2) {
2369	list_for_each_entry(req, &ci->i_unsafe_iops,
2370	r_unsafe_target_item) {
2371	s = req->r_session;
2372	if (!s)
2373	continue;
2374	if (!sessions[s->s_mds]) {
2375	s = ceph_get_mds_session(s);
2376	sessions[s->s_mds] = s;
2377	}
2378	}
2379	}
2380	spin_unlock(lock: &ci->i_unsafe_lock);
2381
2382	/ the auth MDS /
2383	spin_lock(lock: &ci->i_ceph_lock);
2384	if (ci->i_auth_cap) {
2385	s = ci->i_auth_cap->session;
2386	if (!sessions[s->s_mds])
2387	sessions[s->s_mds] = ceph_get_mds_session(s);
2388	}
2389	spin_unlock(lock: &ci->i_ceph_lock);
2390	mutex_unlock(lock: &mdsc->mutex);
2391
2392	/ send flush mdlog request to MDSes /
2393	for (i = `0`; i < max_sessions; i++) {
2394	s = sessions[i];
2395	if (s) {
2396	send_flush_mdlog(s);
2397	ceph_put_mds_session(s);
2398	}
2399	}
2400	kfree(objp: sessions);
2401	}
2402
2403	dout("%s %p wait on tid %llu %llu\n", __func__,
2404	inode, req1 ? req1->r_tid : `0ULL`, req2 ? req2->r_tid : `0ULL`);
2405	if (req1) {
2406	ret = !wait_for_completion_timeout(x: &req1->r_safe_completion,
2407	timeout: ceph_timeout_jiffies(timeout: req1->r_timeout));
2408	if (ret)
2409	err = -EIO;
2410	}
2411	if (req2) {
2412	ret = !wait_for_completion_timeout(x: &req2->r_safe_completion,
2413	timeout: ceph_timeout_jiffies(timeout: req2->r_timeout));
2414	if (ret)
2415	err = -EIO;
2416	}
2417
2418	out:
2419	if (req1)
2420	ceph_mdsc_put_request(req: req1);
2421	if (req2)
2422	ceph_mdsc_put_request(req: req2);
2423	return err;
2424	}
2425
2426	int ceph_fsync(struct file file, loff_t start, loff_t end, int* datasync)
2427	{
2428	struct inode *inode = file->f_mapping->host;
2429	struct ceph_inode_info *ci = ceph_inode(inode);
2430	u64 flush_tid;
2431	int ret, err;
2432	int dirty;
2433
2434	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
2435
2436	ret = file_write_and_wait_range(file, start, end);
2437	if (datasync)
2438	goto out;
2439
2440	ret = ceph_wait_on_async_create(inode);
2441	if (ret)
2442	goto out;
2443
2444	dirty = try_flush_caps(inode, ptid: &flush_tid);
2445	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
2446
2447	err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
2448
2449	/*
2450	* only wait on non-file metadata writeback (the mds
2451	* can recover size and mtime, so we don't need to
2452	* wait for that)
2453	*/
2454	if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2455	err = wait_event_interruptible(ci->i_cap_wq,
2456	caps_are_flushed(inode, flush_tid));
2457	}
2458
2459	if (err < `0`)
2460	ret = err;
2461
2462	err = file_check_and_advance_wb_err(file);
2463	if (err < `0`)
2464	ret = err;
2465	out:
2466	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
2467	return ret;
2468	}
2469
2470	/*
2471	* Flush any dirty caps back to the mds. If we aren't asked to wait,
2472	* queue inode for flush but don't do so immediately, because we can
2473	* get by with fewer MDS messages if we wait for data writeback to
2474	* complete first.
2475	*/
2476	int ceph_write_inode(struct inode inode, struct* writeback_control *wbc)
2477	{
2478	struct ceph_inode_info *ci = ceph_inode(inode);
2479	u64 flush_tid;
2480	int err = `0`;
2481	int dirty;
2482	int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
2483
2484	dout("write_inode %p wait=%d\n", inode, wait);
2485	ceph_fscache_unpin_writeback(inode, wbc);
2486	if (wait) {
2487	err = ceph_wait_on_async_create(inode);
2488	if (err)
2489	return err;
2490	dirty = try_flush_caps(inode, ptid: &flush_tid);
2491	if (dirty)
2492	err = wait_event_interruptible(ci->i_cap_wq,
2493	caps_are_flushed(inode, flush_tid));
2494	} else {
2495	struct ceph_mds_client *mdsc =
2496	ceph_sb_to_client(sb: inode->i_sb)->mdsc;
2497
2498	spin_lock(lock: &ci->i_ceph_lock);
2499	if (__ceph_caps_dirty(ci))
2500	__cap_delay_requeue_front(mdsc, ci);
2501	spin_unlock(lock: &ci->i_ceph_lock);
2502	}
2503	return err;
2504	}
2505
2506	static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2507	struct ceph_mds_session *session,
2508	struct ceph_inode_info *ci,
2509	u64 oldest_flush_tid)
2510	__releases(ci->i_ceph_lock)
2511	__acquires(ci->i_ceph_lock)
2512	{
2513	struct inode *inode = &ci->netfs.inode;
2514	struct ceph_cap *cap;
2515	struct ceph_cap_flush *cf;
2516	int ret;
2517	u64 first_tid = `0`;
2518	u64 last_snap_flush = `0`;
2519
2520	/ Don't do anything until create reply comes in /
2521	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
2522	return;
2523
2524	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2525
2526	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2527	if (cf->is_capsnap) {
2528	last_snap_flush = cf->tid;
2529	break;
2530	}
2531	}
2532
2533	list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
2534	if (cf->tid < first_tid)
2535	continue;
2536
2537	cap = ci->i_auth_cap;
2538	if (!(cap && cap->session == session)) {
2539	pr_err("%p auth cap %p not mds%d ???\n",
2540	inode, cap, session->s_mds);
2541	break;
2542	}
2543
2544	first_tid = cf->tid + `1`;
2545
2546	if (!cf->is_capsnap) {
2547	struct cap_msg_args arg;
2548
2549	dout("kick_flushing_caps %p cap %p tid %llu %s\n",
2550	inode, cap, cf->tid, ceph_cap_string(cf->caps));
2551	__prep_cap(arg: &arg, cap, op: CEPH_CAP_OP_FLUSH,
2552	flags: (cf->tid < last_snap_flush ?
2553	CEPH_CLIENT_CAPS_PENDING_CAPSNAP : `0`),
2554	used: __ceph_caps_used(ci),
2555	want: __ceph_caps_wanted(ci),
2556	retain: (cap->issued \| cap->implemented),
2557	flushing: cf->caps, flush_tid: cf->tid, oldest_flush_tid);
2558	spin_unlock(lock: &ci->i_ceph_lock);
2559	__send_cap(arg: &arg, ci);
2560	} else {
2561	struct ceph_cap_snap *capsnap =
2562	container_of(cf, struct ceph_cap_snap,
2563	cap_flush);
2564	dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
2565	inode, capsnap, cf->tid,
2566	ceph_cap_string(capsnap->dirty));
2567
2568	refcount_inc(r: &capsnap->nref);
2569	spin_unlock(lock: &ci->i_ceph_lock);
2570
2571	ret = __send_flush_snap(inode, session, capsnap, mseq: cap->mseq,
2572	oldest_flush_tid);
2573	if (ret < `0`) {
2574	pr_err("kick_flushing_caps: error sending "
2575	"cap flushsnap, ino (%llx.%llx) "
2576	"tid %llu follows %llu\n",
2577	ceph_vinop(inode), cf->tid,
2578	capsnap->follows);
2579	}
2580
2581	ceph_put_cap_snap(capsnap);
2582	}
2583
2584	spin_lock(lock: &ci->i_ceph_lock);
2585	}
2586	}
2587
2588	void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2589	struct ceph_mds_session *session)
2590	{
2591	struct ceph_inode_info *ci;
2592	struct ceph_cap *cap;
2593	u64 oldest_flush_tid;
2594
2595	dout("early_kick_flushing_caps mds%d\n", session->s_mds);
2596
2597	spin_lock(lock: &mdsc->cap_dirty_lock);
2598	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2599	spin_unlock(lock: &mdsc->cap_dirty_lock);
2600
2601	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2602	spin_lock(lock: &ci->i_ceph_lock);
2603	cap = ci->i_auth_cap;
2604	if (!(cap && cap->session == session)) {
2605	pr_err("%p auth cap %p not mds%d ???\n",
2606	&ci->netfs.inode, cap, session->s_mds);
2607	spin_unlock(lock: &ci->i_ceph_lock);
2608	continue;
2609	}
2610
2611
2612	/*
2613	* if flushing caps were revoked, we re-send the cap flush
2614	* in client reconnect stage. This guarantees MDS * processes
2615	* the cap flush message before issuing the flushing caps to
2616	* other client.
2617	*/
2618	if ((cap->issued & ci->i_flushing_caps) !=
2619	ci->i_flushing_caps) {
2620	/ encode_caps_cb() also will reset these sequence*
2621	* numbers. make sure sequence numbers in cap flush
2622	* message match later reconnect message */
2623	cap->seq = `0`;
2624	cap->issue_seq = `0`;
2625	cap->mseq = `0`;
2626	__kick_flushing_caps(mdsc, session, ci,
2627	oldest_flush_tid);
2628	} else {
2629	ci->i_ceph_flags \|= CEPH_I_KICK_FLUSH;
2630	}
2631
2632	spin_unlock(lock: &ci->i_ceph_lock);
2633	}
2634	}
2635
2636	void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2637	struct ceph_mds_session *session)
2638	{
2639	struct ceph_inode_info *ci;
2640	struct ceph_cap *cap;
2641	u64 oldest_flush_tid;
2642
2643	lockdep_assert_held(&session->s_mutex);
2644
2645	dout("kick_flushing_caps mds%d\n", session->s_mds);
2646
2647	spin_lock(lock: &mdsc->cap_dirty_lock);
2648	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2649	spin_unlock(lock: &mdsc->cap_dirty_lock);
2650
2651	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2652	spin_lock(lock: &ci->i_ceph_lock);
2653	cap = ci->i_auth_cap;
2654	if (!(cap && cap->session == session)) {
2655	pr_err("%p auth cap %p not mds%d ???\n",
2656	&ci->netfs.inode, cap, session->s_mds);
2657	spin_unlock(lock: &ci->i_ceph_lock);
2658	continue;
2659	}
2660	if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
2661	__kick_flushing_caps(mdsc, session, ci,
2662	oldest_flush_tid);
2663	}
2664	spin_unlock(lock: &ci->i_ceph_lock);
2665	}
2666	}
2667
2668	void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2669	struct ceph_inode_info *ci)
2670	{
2671	struct ceph_mds_client *mdsc = session->s_mdsc;
2672	struct ceph_cap *cap = ci->i_auth_cap;
2673
2674	lockdep_assert_held(&ci->i_ceph_lock);
2675
2676	dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
2677	ceph_cap_string(ci->i_flushing_caps));
2678
2679	if (!list_empty(head: &ci->i_cap_flush_list)) {
2680	u64 oldest_flush_tid;
2681	spin_lock(lock: &mdsc->cap_dirty_lock);
2682	list_move_tail(list: &ci->i_flushing_item,
2683	head: &cap->session->s_cap_flushing);
2684	oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2685	spin_unlock(lock: &mdsc->cap_dirty_lock);
2686
2687	__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
2688	}
2689	}
2690
2691
2692	/*
2693	* Take references to capabilities we hold, so that we don't release
2694	* them to the MDS prematurely.
2695	*/
2696	void ceph_take_cap_refs(struct ceph_inode_info ci, int* got,
2697	bool snap_rwsem_locked)
2698	{
2699	lockdep_assert_held(&ci->i_ceph_lock);
2700
2701	if (got & CEPH_CAP_PIN)
2702	ci->i_pin_ref++;
2703	if (got & CEPH_CAP_FILE_RD)
2704	ci->i_rd_ref++;
2705	if (got & CEPH_CAP_FILE_CACHE)
2706	ci->i_rdcache_ref++;
2707	if (got & CEPH_CAP_FILE_EXCL)
2708	ci->i_fx_ref++;
2709	if (got & CEPH_CAP_FILE_WR) {
2710	if (ci->i_wr_ref == `0` && !ci->i_head_snapc) {
2711	BUG_ON(!snap_rwsem_locked);
2712	ci->i_head_snapc = ceph_get_snap_context(
2713	sc: ci->i_snap_realm->cached_context);
2714	}
2715	ci->i_wr_ref++;
2716	}
2717	if (got & CEPH_CAP_FILE_BUFFER) {
2718	if (ci->i_wb_ref == `0`)
2719	ihold(inode: &ci->netfs.inode);
2720	ci->i_wb_ref++;
2721	dout("%s %p wb %d -> %d (?)\n", __func__,
2722	&ci->netfs.inode, ci->i_wb_ref-`1`, ci->i_wb_ref);
2723	}
2724	}
2725
2726	/*
2727	* Try to grab cap references. Specify those refs we @want, and the
2728	* minimal set we @need. Also include the larger offset we are writing
2729	* to (when applicable), and check against max_size here as well.
2730	* Note that caller is responsible for ensuring max_size increases are
2731	* requested from the MDS.
2732	*
2733	* Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
2734	* or a negative error code. There are 3 special error codes:
2735	* -EAGAIN: need to sleep but non-blocking is specified
2736	* -EFBIG: ask caller to call check_max_size() and try again.
2737	* -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
2738	*/
/* Extra flag bits passed in the "flags" argument of try_get_cap_refs(). */
enum {
	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
	NON_BLOCKING	= (1 << 8),
	CHECK_FILELOCK	= (1 << 9),
};
2744
2745	static int try_get_cap_refs(struct inode inode, int* need, int want,
2746	loff_t endoff, int flags, int *got)
2747	{
2748	struct ceph_inode_info *ci = ceph_inode(inode);
2749	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2750	int ret = `0`;
2751	int have, implemented;
2752	bool snap_rwsem_locked = false;
2753
2754	dout("get_cap_refs %p need %s want %s\n", inode,
2755	ceph_cap_string(need), ceph_cap_string(want));
2756
2757	again:
2758	spin_lock(lock: &ci->i_ceph_lock);
2759
2760	if ((flags & CHECK_FILELOCK) &&
2761	(ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2762	dout("try_get_cap_refs %p error filelock\n", inode);
2763	ret = -EIO;
2764	goto out_unlock;
2765	}
2766
2767	/ finish pending truncate /
2768	while (ci->i_truncate_pending) {
2769	spin_unlock(lock: &ci->i_ceph_lock);
2770	if (snap_rwsem_locked) {
2771	up_read(sem: &mdsc->snap_rwsem);
2772	snap_rwsem_locked = false;
2773	}
2774	__ceph_do_pending_vmtruncate(inode);
2775	spin_lock(lock: &ci->i_ceph_lock);
2776	}
2777
2778	have = __ceph_caps_issued(ci, implemented: &implemented);
2779
2780	if (have & need & CEPH_CAP_FILE_WR) {
2781	if (endoff >= `0` && endoff > (loff_t)ci->i_max_size) {
2782	dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2783	inode, endoff, ci->i_max_size);
2784	if (endoff > ci->i_requested_max_size)
2785	ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
2786	goto out_unlock;
2787	}
2788	/*
2789	* If a sync write is in progress, we must wait, so that we
2790	* can get a final snapshot value for size+mtime.
2791	*/
2792	if (__ceph_have_pending_cap_snap(ci)) {
2793	dout("get_cap_refs %p cap_snap_pending\n", inode);
2794	goto out_unlock;
2795	}
2796	}
2797
2798	if ((have & need) == need) {
2799	/*
2800	* Look at (implemented & ~have & not) so that we keep waiting
2801	* on transition from wanted -> needed caps. This is needed
2802	* for WRBUFFER\|WR -> WR to avoid a new WR sync write from
2803	* going before a prior buffered writeback happens.
2804	*
2805	* For RDCACHE\|RD -> RD, there is not need to wait and we can
2806	* just exclude the revoking caps and force to sync read.
2807	*/
2808	int not = want & ~(have & need);
2809	int revoking = implemented & ~have;
2810	int exclude = revoking & not;
2811	dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2812	inode, ceph_cap_string(have), ceph_cap_string(not),
2813	ceph_cap_string(revoking));
2814	if (!exclude \|\| !(exclude & CEPH_CAP_FILE_BUFFER)) {
2815	if (!snap_rwsem_locked &&
2816	!ci->i_head_snapc &&
2817	(need & CEPH_CAP_FILE_WR)) {
2818	if (!down_read_trylock(sem: &mdsc->snap_rwsem)) {
2819	/*
2820	* we can not call down_read() when
2821	* task isn't in TASK_RUNNING state
2822	*/
2823	if (flags & NON_BLOCKING) {
2824	ret = -EAGAIN;
2825	goto out_unlock;
2826	}
2827
2828	spin_unlock(lock: &ci->i_ceph_lock);
2829	down_read(sem: &mdsc->snap_rwsem);
2830	snap_rwsem_locked = true;
2831	goto again;
2832	}
2833	snap_rwsem_locked = true;
2834	}
2835	if ((have & want) == want)
2836	*got = need \| (want & ~exclude);
2837	else
2838	*got = need;
2839	ceph_take_cap_refs(ci, got: *got, snap_rwsem_locked: true);
2840	ret = `1`;
2841	}
2842	} else {
2843	int session_readonly = false;
2844	int mds_wanted;
2845	if (ci->i_auth_cap &&
2846	(need & (CEPH_CAP_FILE_WR \| CEPH_CAP_FILE_EXCL))) {
2847	struct ceph_mds_session *s = ci->i_auth_cap->session;
2848	spin_lock(lock: &s->s_cap_lock);
2849	session_readonly = s->s_readonly;
2850	spin_unlock(lock: &s->s_cap_lock);
2851	}
2852	if (session_readonly) {
2853	dout("get_cap_refs %p need %s but mds%d readonly\n",
2854	inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2855	ret = -EROFS;
2856	goto out_unlock;
2857	}
2858
2859	if (ceph_inode_is_shutdown(inode)) {
2860	dout("get_cap_refs %p inode is shutdown\n", inode);
2861	ret = -ESTALE;
2862	goto out_unlock;
2863	}
2864	mds_wanted = __ceph_caps_mds_wanted(ci, check: false);
2865	if (need & ~mds_wanted) {
2866	dout("get_cap_refs %p need %s > mds_wanted %s\n",
2867	inode, ceph_cap_string(need),
2868	ceph_cap_string(mds_wanted));
2869	ret = -EUCLEAN;
2870	goto out_unlock;
2871	}
2872
2873	dout("get_cap_refs %p have %s need %s\n", inode,
2874	ceph_cap_string(have), ceph_cap_string(need));
2875	}
2876	out_unlock:
2877
2878	__ceph_touch_fmode(ci, mdsc, fmode: flags);
2879
2880	spin_unlock(lock: &ci->i_ceph_lock);
2881	if (snap_rwsem_locked)
2882	up_read(sem: &mdsc->snap_rwsem);
2883
2884	if (!ret)
2885	ceph_update_cap_mis(m: &mdsc->metric);
2886	else if (ret == `1`)
2887	ceph_update_cap_hit(m: &mdsc->metric);
2888
2889	dout("get_cap_refs %p ret %d got %s\n", inode,
2890	ret, ceph_cap_string(*got));
2891	return ret;
2892	}
2893
2894	/*
2895	* Check the offset we are writing up to against our current
2896	* max_size. If necessary, tell the MDS we want to write to
2897	* a larger offset.
2898	*/
2899	static void check_max_size(struct inode *inode, loff_t endoff)
2900	{
2901	struct ceph_inode_info *ci = ceph_inode(inode);
2902	int check = `0`;
2903
2904	/ do we need to explicitly request a larger max_size? /
2905	spin_lock(lock: &ci->i_ceph_lock);
2906	if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
2907	dout("write %p at large endoff %llu, req max_size\n",
2908	inode, endoff);
2909	ci->i_wanted_max_size = endoff;
2910	}
2911	/ duplicate ceph_check_caps()'s logic /
2912	if (ci->i_auth_cap &&
2913	(ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2914	ci->i_wanted_max_size > ci->i_max_size &&
2915	ci->i_wanted_max_size > ci->i_requested_max_size)
2916	check = `1`;
2917	spin_unlock(lock: &ci->i_ceph_lock);
2918	if (check)
2919	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
2920	}
2921
2922	static inline int get_used_fmode(int caps)
2923	{
2924	int fmode = `0`;
2925	if (caps & CEPH_CAP_FILE_RD)
2926	fmode \|= CEPH_FILE_MODE_RD;
2927	if (caps & CEPH_CAP_FILE_WR)
2928	fmode \|= CEPH_FILE_MODE_WR;
2929	return fmode;
2930	}
2931
2932	int ceph_try_get_caps(struct inode inode, int* need, int want,
2933	bool nonblock, int *got)
2934	{
2935	int ret, flags;
2936
2937	BUG_ON(need & ~CEPH_CAP_FILE_RD);
2938	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE \| CEPH_CAP_FILE_LAZYIO \|
2939	CEPH_CAP_FILE_SHARED \| CEPH_CAP_FILE_EXCL \|
2940	CEPH_CAP_ANY_DIR_OPS));
2941	if (need) {
2942	ret = ceph_pool_perm_check(inode, need);
2943	if (ret < `0`)
2944	return ret;
2945	}
2946
2947	flags = get_used_fmode(caps: need \| want);
2948	if (nonblock)
2949	flags \|= NON_BLOCKING;
2950
2951	ret = try_get_cap_refs(inode, need, want, endoff: `0`, flags, got);
2952	/ three special error codes /
2953	if (ret == -EAGAIN \|\| ret == -EFBIG \|\| ret == -EUCLEAN)
2954	ret = `0`;
2955	return ret;
2956	}
2957
2958	/*
2959	* Wait for caps, and take cap references. If we can't get a WR cap
2960	* due to a small max_size, make sure we check_max_size (and possibly
2961	* ask the mds) so we don't get hung up indefinitely.
2962	*/
2963	int __ceph_get_caps(struct inode inode, struct* ceph_file_info fi, int* need,
2964	int want, loff_t endoff, int *got)
2965	{
2966	struct ceph_inode_info *ci = ceph_inode(inode);
2967	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2968	int ret, _got, flags;
2969
2970	ret = ceph_pool_perm_check(inode, need);
2971	if (ret < `0`)
2972	return ret;
2973
2974	if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
2975	fi->filp_gen != READ_ONCE(fsc->filp_gen))
2976	return -EBADF;
2977
2978	flags = get_used_fmode(caps: need \| want);
2979
2980	while (true) {
2981	flags &= CEPH_FILE_MODE_MASK;
2982	if (vfs_inode_has_locks(inode))
2983	flags \|= CHECK_FILELOCK;
2984	_got = `0`;
2985	ret = try_get_cap_refs(inode, need, want, endoff,
2986	flags, got: &_got);
2987	WARN_ON_ONCE(ret == -EAGAIN);
2988	if (!ret) {
2989	struct ceph_mds_client *mdsc = fsc->mdsc;
2990	struct cap_wait cw;
2991	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2992
2993	cw.ino = ceph_ino(inode);
2994	cw.tgid = current->tgid;
2995	cw.need = need;
2996	cw.want = want;
2997
2998	spin_lock(lock: &mdsc->caps_list_lock);
2999	list_add(new: &cw.list, head: &mdsc->cap_wait_list);
3000	spin_unlock(lock: &mdsc->caps_list_lock);
3001
3002	/ make sure used fmode not timeout /
3003	ceph_get_fmode(ci, mode: flags, FMODE_WAIT_BIAS);
3004	add_wait_queue(wq_head: &ci->i_cap_wq, wq_entry: &wait);
3005
3006	flags \|= NON_BLOCKING;
3007	while (!(ret = try_get_cap_refs(inode, need, want,
3008	endoff, flags, got: &_got))) {
3009	if (signal_pending(current)) {
3010	ret = -ERESTARTSYS;
3011	break;
3012	}
3013	wait_woken(wq_entry: &wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3014	}
3015
3016	remove_wait_queue(wq_head: &ci->i_cap_wq, wq_entry: &wait);
3017	ceph_put_fmode(ci, mode: flags, FMODE_WAIT_BIAS);
3018
3019	spin_lock(lock: &mdsc->caps_list_lock);
3020	list_del(entry: &cw.list);
3021	spin_unlock(lock: &mdsc->caps_list_lock);
3022
3023	if (ret == -EAGAIN)
3024	continue;
3025	}
3026
3027	if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
3028	fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
3029	if (ret >= `0` && _got)
3030	ceph_put_cap_refs(ci, had: _got);
3031	return -EBADF;
3032	}
3033
3034	if (ret < `0`) {
3035	if (ret == -EFBIG \|\| ret == -EUCLEAN) {
3036	int ret2 = ceph_wait_on_async_create(inode);
3037	if (ret2 < `0`)
3038	return ret2;
3039	}
3040	if (ret == -EFBIG) {
3041	check_max_size(inode, endoff);
3042	continue;
3043	}
3044	if (ret == -EUCLEAN) {
3045	/ session was killed, try renew caps /
3046	ret = ceph_renew_caps(inode, fmode: flags);
3047	if (ret == `0`)
3048	continue;
3049	}
3050	return ret;
3051	}
3052
3053	if (S_ISREG(ci->netfs.inode.i_mode) &&
3054	ceph_has_inline_data(ci) &&
3055	(_got & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)) &&
3056	i_size_read(inode) > `0`) {
3057	struct page *page =
3058	find_get_page(mapping: inode->i_mapping, offset: `0`);
3059	if (page) {
3060	bool uptodate = PageUptodate(page);
3061
3062	put_page(page);
3063	if (uptodate)
3064	break;
3065	}
3066	/*
3067	* drop cap refs first because getattr while
3068	* holding * caps refs can cause deadlock.
3069	*/
3070	ceph_put_cap_refs(ci, had: _got);
3071	_got = `0`;
3072
3073	/*
3074	* getattr request will bring inline data into
3075	* page cache
3076	*/
3077	ret = __ceph_do_getattr(inode, NULL,
3078	CEPH_STAT_CAP_INLINE_DATA,
3079	force: true);
3080	if (ret < `0`)
3081	return ret;
3082	continue;
3083	}
3084	break;
3085	}
3086	*got = _got;
3087	return `0`;
3088	}
3089
3090	int ceph_get_caps(struct file filp, int* need, int want, loff_t endoff,
3091	int *got)
3092	{
3093	struct ceph_file_info *fi = filp->private_data;
3094	struct inode *inode = file_inode(f: filp);
3095
3096	return __ceph_get_caps(inode, fi, need, want, endoff, got);
3097	}
3098
3099	/*
3100	* Take cap refs. Caller must already know we hold at least one ref
3101	* on the caps in question or we don't know this is safe.
3102	*/
3103	void ceph_get_cap_refs(struct ceph_inode_info ci, int* caps)
3104	{
3105	spin_lock(lock: &ci->i_ceph_lock);
3106	ceph_take_cap_refs(ci, got: caps, snap_rwsem_locked: false);
3107	spin_unlock(lock: &ci->i_ceph_lock);
3108	}
3109
3110
3111	/*
3112	* drop cap_snap that is not associated with any snapshot.
3113	* we don't need to send FLUSHSNAP message for it.
3114	*/
3115	static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
3116	struct ceph_cap_snap *capsnap)
3117	{
3118	if (!capsnap->need_flush &&
3119	!capsnap->writing && !capsnap->dirty_pages) {
3120	dout("dropping cap_snap %p follows %llu\n",
3121	capsnap, capsnap->follows);
3122	BUG_ON(capsnap->cap_flush.tid > `0`);
3123	ceph_put_snap_context(sc: capsnap->context);
3124	if (!list_is_last(list: &capsnap->ci_item, head: &ci->i_cap_snaps))
3125	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
3126
3127	list_del(entry: &capsnap->ci_item);
3128	ceph_put_cap_snap(capsnap);
3129	return `1`;
3130	}
3131	return `0`;
3132	}
3133
/*
 * What __ceph_put_cap_refs() does after dropping the references:
 *
 * PUT_CAP_REFS_SYNC     - call ceph_check_caps()/ceph_flush_snaps() directly
 * PUT_CAP_REFS_NO_CHECK - do neither
 * PUT_CAP_REFS_ASYNC    - queue the cap check / snap flush instead
 */
enum put_cap_refs_mode {
	PUT_CAP_REFS_SYNC = 0,
	PUT_CAP_REFS_NO_CHECK,
	PUT_CAP_REFS_ASYNC,
};
3139
3140	/*
3141	* Release cap refs.
3142	*
3143	* If we released the last ref on any given cap, call ceph_check_caps
3144	* to release (or schedule a release).
3145	*
3146	* If we are releasing a WR cap (from a sync write), finalize any affected
3147	* cap_snap, and wake up any waiters.
3148	*/
3149	static void __ceph_put_cap_refs(struct ceph_inode_info ci, int* had,
3150	enum put_cap_refs_mode mode)
3151	{
3152	struct inode *inode = &ci->netfs.inode;
3153	int last = `0`, put = `0`, flushsnaps = `0`, wake = `0`;
3154	bool check_flushsnaps = false;
3155
3156	spin_lock(lock: &ci->i_ceph_lock);
3157	if (had & CEPH_CAP_PIN)
3158	--ci->i_pin_ref;
3159	if (had & CEPH_CAP_FILE_RD)
3160	if (--ci->i_rd_ref == `0`)
3161	last++;
3162	if (had & CEPH_CAP_FILE_CACHE)
3163	if (--ci->i_rdcache_ref == `0`)
3164	last++;
3165	if (had & CEPH_CAP_FILE_EXCL)
3166	if (--ci->i_fx_ref == `0`)
3167	last++;
3168	if (had & CEPH_CAP_FILE_BUFFER) {
3169	if (--ci->i_wb_ref == `0`) {
3170	last++;
3171	/ put the ref held by ceph_take_cap_refs() /
3172	put++;
3173	check_flushsnaps = true;
3174	}
3175	dout("put_cap_refs %p wb %d -> %d (?)\n",
3176	inode, ci->i_wb_ref+`1`, ci->i_wb_ref);
3177	}
3178	if (had & CEPH_CAP_FILE_WR) {
3179	if (--ci->i_wr_ref == `0`) {
3180	/*
3181	* The Fb caps will always be took and released
3182	* together with the Fw caps.
3183	*/
3184	WARN_ON_ONCE(ci->i_wb_ref);
3185
3186	last++;
3187	check_flushsnaps = true;
3188	if (ci->i_wrbuffer_ref_head == `0` &&
3189	ci->i_dirty_caps == `0` &&
3190	ci->i_flushing_caps == `0`) {
3191	BUG_ON(!ci->i_head_snapc);
3192	ceph_put_snap_context(sc: ci->i_head_snapc);
3193	ci->i_head_snapc = NULL;
3194	}
3195	/ see comment in __ceph_remove_cap() /
3196	if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
3197	ceph_change_snap_realm(inode, NULL);
3198	}
3199	}
3200	if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
3201	struct ceph_cap_snap *capsnap =
3202	list_last_entry(&ci->i_cap_snaps,
3203	struct ceph_cap_snap,
3204	ci_item);
3205
3206	capsnap->writing = `0`;
3207	if (ceph_try_drop_cap_snap(ci, capsnap))
3208	/ put the ref held by ceph_queue_cap_snap() /
3209	put++;
3210	else if (__ceph_finish_cap_snap(ci, capsnap))
3211	flushsnaps = `1`;
3212	wake = `1`;
3213	}
3214	spin_unlock(lock: &ci->i_ceph_lock);
3215
3216	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
3217	last ? " last" : "", put ? " put" : "");
3218
3219	switch (mode) {
3220	case PUT_CAP_REFS_SYNC:
3221	if (last)
3222	ceph_check_caps(ci, flags: `0`);
3223	else if (flushsnaps)
3224	ceph_flush_snaps(ci, NULL);
3225	break;
3226	case PUT_CAP_REFS_ASYNC:
3227	if (last)
3228	ceph_queue_check_caps(inode);
3229	else if (flushsnaps)
3230	ceph_queue_flush_snaps(inode);
3231	break;
3232	default:
3233	break;
3234	}
3235	if (wake)
3236	wake_up_all(&ci->i_cap_wq);
3237	while (put-- > `0`)
3238	iput(inode);
3239	}
3240
3241	void ceph_put_cap_refs(struct ceph_inode_info ci, int* had)
3242	{
3243	__ceph_put_cap_refs(ci, had, mode: PUT_CAP_REFS_SYNC);
3244	}
3245
3246	void ceph_put_cap_refs_async(struct ceph_inode_info ci, int* had)
3247	{
3248	__ceph_put_cap_refs(ci, had, mode: PUT_CAP_REFS_ASYNC);
3249	}
3250
3251	void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info ci, int* had)
3252	{
3253	__ceph_put_cap_refs(ci, had, mode: PUT_CAP_REFS_NO_CHECK);
3254	}
3255
3256	/*
3257	* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3258	* context. Adjust per-snap dirty page accounting as appropriate.
3259	* Once all dirty data for a cap_snap is flushed, flush snapped file
3260	* metadata back to the MDS. If we dropped the last ref, call
3261	* ceph_check_caps.
3262	*/
3263	void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info ci, int* nr,
3264	struct ceph_snap_context *snapc)
3265	{
3266	struct inode *inode = &ci->netfs.inode;
3267	struct ceph_cap_snap capsnap = NULL, iter;
3268	int put = `0`;
3269	bool last = false;
3270	bool flush_snaps = false;
3271	bool complete_capsnap = false;
3272
3273	spin_lock(lock: &ci->i_ceph_lock);
3274	ci->i_wrbuffer_ref -= nr;
3275	if (ci->i_wrbuffer_ref == `0`) {
3276	last = true;
3277	put++;
3278	}
3279
3280	if (ci->i_head_snapc == snapc) {
3281	ci->i_wrbuffer_ref_head -= nr;
3282	if (ci->i_wrbuffer_ref_head == `0` &&
3283	ci->i_wr_ref == `0` &&
3284	ci->i_dirty_caps == `0` &&
3285	ci->i_flushing_caps == `0`) {
3286	BUG_ON(!ci->i_head_snapc);
3287	ceph_put_snap_context(sc: ci->i_head_snapc);
3288	ci->i_head_snapc = NULL;
3289	}
3290	dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
3291	inode,
3292	ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
3293	ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
3294	last ? " LAST" : "");
3295	} else {
3296	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3297	if (iter->context == snapc) {
3298	capsnap = iter;
3299	break;
3300	}
3301	}
3302
3303	if (!capsnap) {
3304	/*
3305	* The capsnap should already be removed when removing
3306	* auth cap in the case of a forced unmount.
3307	*/
3308	WARN_ON_ONCE(ci->i_auth_cap);
3309	goto unlock;
3310	}
3311
3312	capsnap->dirty_pages -= nr;
3313	if (capsnap->dirty_pages == `0`) {
3314	complete_capsnap = true;
3315	if (!capsnap->writing) {
3316	if (ceph_try_drop_cap_snap(ci, capsnap)) {
3317	put++;
3318	} else {
3319	ci->i_ceph_flags \|= CEPH_I_FLUSH_SNAPS;
3320	flush_snaps = true;
3321	}
3322	}
3323	}
3324	dout("put_wrbuffer_cap_refs on %p cap_snap %p "
3325	" snap %lld %d/%d -> %d/%d %s%s\n",
3326	inode, capsnap, capsnap->context->seq,
3327	ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3328	ci->i_wrbuffer_ref, capsnap->dirty_pages,
3329	last ? " (wrbuffer last)" : "",
3330	complete_capsnap ? " (complete capsnap)" : "");
3331	}
3332
3333	unlock:
3334	spin_unlock(lock: &ci->i_ceph_lock);
3335
3336	if (last) {
3337	ceph_check_caps(ci, flags: `0`);
3338	} else if (flush_snaps) {
3339	ceph_flush_snaps(ci, NULL);
3340	}
3341	if (complete_capsnap)
3342	wake_up_all(&ci->i_cap_wq);
3343	while (put-- > `0`) {
3344	iput(inode);
3345	}
3346	}
3347
3348	/*
3349	* Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
3350	*/
3351	static void invalidate_aliases(struct inode *inode)
3352	{
3353	struct dentry dn, prev = NULL;
3354
3355	dout("invalidate_aliases inode %p\n", inode);
3356	d_prune_aliases(inode);
3357	/*
3358	* For non-directory inode, d_find_alias() only returns
3359	* hashed dentry. After calling d_invalidate(), the
3360	* dentry becomes unhashed.
3361	*
3362	* For directory inode, d_find_alias() can return
3363	* unhashed dentry. But directory inode should have
3364	* one alias at most.
3365	*/
3366	while ((dn = d_find_alias(inode))) {
3367	if (dn == prev) {
3368	dput(dn);
3369	break;
3370	}
3371	d_invalidate(dn);
3372	if (prev)
3373	dput(prev);
3374	prev = dn;
3375	}
3376	if (prev)
3377	dput(prev);
3378	}
3379
3380	struct cap_extra_info {
3381	struct ceph_string *pool_ns;
3382	/ inline data /
3383	u64 inline_version;
3384	void *inline_data;
3385	u32 inline_len;
3386	/ dirstat /
3387	bool dirstat_valid;
3388	u64 nfiles;
3389	u64 nsubdirs;
3390	u64 change_attr;
3391	/ currently issued /
3392	int issued;
3393	struct timespec64 btime;
3394	u8 *fscrypt_auth;
3395	u32 fscrypt_auth_len;
3396	u64 fscrypt_file_size;
3397	};
3398
3399	/*
3400	* Handle a cap GRANT message from the MDS. (Note that a GRANT may
3401	* actually be a revocation if it specifies a smaller cap set.)
3402	*
3403	* caller holds s_mutex and i_ceph_lock, we drop both.
3404	*/
3405	static void handle_cap_grant(struct inode *inode,
3406	struct ceph_mds_session *session,
3407	struct ceph_cap *cap,
3408	struct ceph_mds_caps *grant,
3409	struct ceph_buffer *xattr_buf,
3410	struct cap_extra_info *extra_info)
3411	__releases(ci->i_ceph_lock)
3412	__releases(session->s_mdsc->snap_rwsem)
3413	{
3414	struct ceph_inode_info *ci = ceph_inode(inode);
3415	int seq = le32_to_cpu(grant->seq);
3416	int newcaps = le32_to_cpu(grant->caps);
3417	int used, wanted, dirty;
3418	u64 size = le64_to_cpu(grant->size);
3419	u64 max_size = le64_to_cpu(grant->max_size);
3420	unsigned char check_caps = `0`;
3421	bool was_stale = cap->cap_gen < atomic_read(v: &session->s_cap_gen);
3422	bool wake = false;
3423	bool writeback = false;
3424	bool queue_trunc = false;
3425	bool queue_invalidate = false;
3426	bool deleted_inode = false;
3427	bool fill_inline = false;
3428
3429	/*
3430	* If there is at least one crypto block then we'll trust
3431	* fscrypt_file_size. If the real length of the file is 0, then
3432	* ignore it (it has probably been truncated down to 0 by the MDS).
3433	*/
3434	if (IS_ENCRYPTED(inode) && size)
3435	size = extra_info->fscrypt_file_size;
3436
3437	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
3438	inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
3439	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
3440	i_size_read(inode));
3441
3442
3443	/*
3444	* If CACHE is being revoked, and we have no dirty buffers,
3445	* try to invalidate (once). (If there are dirty buffers, we
3446	* will invalidate _after_ writeback.)
3447	*/
3448	if (S_ISREG(inode->i_mode) && / don't invalidate readdir cache /
3449	((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
3450	(newcaps & CEPH_CAP_FILE_LAZYIO) == `0` &&
3451	!(ci->i_wrbuffer_ref \|\| ci->i_wb_ref)) {
3452	if (try_nonblocking_invalidate(inode)) {
3453	/ there were locked pages.. invalidate later*
3454	in a separate thread. /*
3455	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
3456	queue_invalidate = true;
3457	ci->i_rdcache_revoking = ci->i_rdcache_gen;
3458	}
3459	}
3460	}
3461
3462	if (was_stale)
3463	cap->issued = cap->implemented = CEPH_CAP_PIN;
3464
3465	/*
3466	* auth mds of the inode changed. we received the cap export message,
3467	* but still haven't received the cap import message. handle_cap_export
3468	* updated the new auth MDS' cap.
3469	*
3470	* "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3471	* that was sent before the cap import message. So don't remove caps.
3472	*/
3473	if (ceph_seq_cmp(a: seq, b: cap->seq) <= `0`) {
3474	WARN_ON(cap != ci->i_auth_cap);
3475	WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3476	seq = cap->seq;
3477	newcaps \|= cap->issued;
3478	}
3479
3480	/ side effects now are allowed /
3481	cap->cap_gen = atomic_read(v: &session->s_cap_gen);
3482	cap->seq = seq;
3483
3484	__check_cap_issue(ci, cap, issued: newcaps);
3485
3486	inode_set_max_iversion_raw(inode, val: extra_info->change_attr);
3487
3488	if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
3489	(extra_info->issued & CEPH_CAP_AUTH_EXCL) == `0`) {
3490	umode_t mode = le32_to_cpu(grant->mode);
3491
3492	if (inode_wrong_type(inode, mode))
3493	pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
3494	ceph_vinop(inode), inode->i_mode, mode);
3495	else
3496	inode->i_mode = mode;
3497	inode->i_uid = make_kuid(from: &init_user_ns, le32_to_cpu(grant->uid));
3498	inode->i_gid = make_kgid(from: &init_user_ns, le32_to_cpu(grant->gid));
3499	ci->i_btime = extra_info->btime;
3500	dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
3501	from_kuid(&init_user_ns, inode->i_uid),
3502	from_kgid(&init_user_ns, inode->i_gid));
3503	#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
3504	if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len \|\|
3505	memcmp(p: ci->fscrypt_auth, q: extra_info->fscrypt_auth,
3506	size: ci->fscrypt_auth_len))
3507	pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
3508	__func__, ci->fscrypt_auth_len,
3509	extra_info->fscrypt_auth_len);
3510	#endif
3511	}
3512
3513	if ((newcaps & CEPH_CAP_LINK_SHARED) &&
3514	(extra_info->issued & CEPH_CAP_LINK_EXCL) == `0`) {
3515	set_nlink(inode, le32_to_cpu(grant->nlink));
3516	if (inode->i_nlink == `0`)
3517	deleted_inode = true;
3518	}
3519
3520	if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == `0` &&
3521	grant->xattr_len) {
3522	int len = le32_to_cpu(grant->xattr_len);
3523	u64 version = le64_to_cpu(grant->xattr_version);
3524
3525	if (version > ci->i_xattrs.version) {
3526	dout(" got new xattrs v%llu on %p len %d\n",
3527	version, inode, len);
3528	if (ci->i_xattrs.blob)
3529	ceph_buffer_put(b: ci->i_xattrs.blob);
3530	ci->i_xattrs.blob = ceph_buffer_get(b: xattr_buf);
3531	ci->i_xattrs.version = version;
3532	ceph_forget_all_cached_acls(inode);
3533	ceph_security_invalidate_secctx(inode);
3534	}
3535	}
3536
3537	if (newcaps & CEPH_CAP_ANY_RD) {
3538	struct timespec64 mtime, atime, ctime;
3539	/ ctime/mtime/atime? /
3540	ceph_decode_timespec64(ts: &mtime, tv: &grant->mtime);
3541	ceph_decode_timespec64(ts: &atime, tv: &grant->atime);
3542	ceph_decode_timespec64(ts: &ctime, tv: &grant->ctime);
3543	ceph_fill_file_time(inode, issued: extra_info->issued,
3544	le32_to_cpu(grant->time_warp_seq),
3545	ctime: &ctime, mtime: &mtime, atime: &atime);
3546	}
3547
3548	if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
3549	ci->i_files = extra_info->nfiles;
3550	ci->i_subdirs = extra_info->nsubdirs;
3551	}
3552
3553	if (newcaps & (CEPH_CAP_ANY_FILE_RD \| CEPH_CAP_ANY_FILE_WR)) {
3554	/ file layout may have changed /
3555	s64 old_pool = ci->i_layout.pool_id;
3556	struct ceph_string *old_ns;
3557
3558	ceph_file_layout_from_legacy(fl: &ci->i_layout, legacy: &grant->layout);
3559	old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3560	lockdep_is_held(&ci->i_ceph_lock));
3561	rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
3562
3563	if (ci->i_layout.pool_id != old_pool \|\|
3564	extra_info->pool_ns != old_ns)
3565	ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
3566
3567	extra_info->pool_ns = old_ns;
3568
3569	/ size/truncate_seq? /
3570	queue_trunc = ceph_fill_file_size(inode, issued: extra_info->issued,
3571	le32_to_cpu(grant->truncate_seq),
3572	le64_to_cpu(grant->truncate_size),
3573	size);
3574	}
3575
3576	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
3577	if (max_size != ci->i_max_size) {
3578	dout("max_size %lld -> %llu\n",
3579	ci->i_max_size, max_size);
3580	ci->i_max_size = max_size;
3581	if (max_size >= ci->i_wanted_max_size) {
3582	ci->i_wanted_max_size = `0`; / reset /
3583	ci->i_requested_max_size = `0`;
3584	}
3585	wake = true;
3586	}
3587	}
3588
3589	/ check cap bits /
3590	wanted = __ceph_caps_wanted(ci);
3591	used = __ceph_caps_used(ci);
3592	dirty = __ceph_caps_dirty(ci);
3593	dout(" my wanted = %s, used = %s, dirty %s\n",
3594	ceph_cap_string(wanted),
3595	ceph_cap_string(used),
3596	ceph_cap_string(dirty));
3597
3598	if ((was_stale \|\| le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3599	(wanted & ~(cap->mds_wanted \| newcaps))) {
3600	/*
3601	* If mds is importing cap, prior cap messages that update
3602	* 'wanted' may get dropped by mds (migrate seq mismatch).
3603	*
3604	* We don't send cap message to update 'wanted' if what we
3605	* want are already issued. If mds revokes caps, cap message
3606	* that releases caps also tells mds what we want. But if
3607	* caps got revoked by mds forcedly (session stale). We may
3608	* haven't told mds what we want.
3609	*/
3610	check_caps = `1`;
3611	}
3612
3613	/ revocation, grant, or no-op? /
3614	if (cap->issued & ~newcaps) {
3615	int revoking = cap->issued & ~newcaps;
3616
3617	dout("revocation: %s -> %s (revoking %s)\n",
3618	ceph_cap_string(cap->issued),
3619	ceph_cap_string(newcaps),
3620	ceph_cap_string(revoking));
3621	if (S_ISREG(inode->i_mode) &&
3622	(revoking & used & CEPH_CAP_FILE_BUFFER))
3623	writeback = true; / initiate writeback; will delay ack /
3624	else if (queue_invalidate &&
3625	revoking == CEPH_CAP_FILE_CACHE &&
3626	(newcaps & CEPH_CAP_FILE_LAZYIO) == `0`)
3627	; / do nothing yet, invalidation will be queued /
3628	else if (cap == ci->i_auth_cap)
3629	check_caps = `1`; / check auth cap only /
3630	else
3631	check_caps = `2`; / check all caps /
3632	/ If there is new caps, try to wake up the waiters /
3633	if (~cap->issued & newcaps)
3634	wake = true;
3635	cap->issued = newcaps;
3636	cap->implemented \|= newcaps;
3637	} else if (cap->issued == newcaps) {
3638	dout("caps unchanged: %s -> %s\n",
3639	ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
3640	} else {
3641	dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
3642	ceph_cap_string(newcaps));
3643	/ non-auth MDS is revoking the newly grant caps ? /
3644	if (cap == ci->i_auth_cap &&
3645	__ceph_caps_revoking_other(ci, ocap: cap, mask: newcaps))
3646	check_caps = `2`;
3647
3648	cap->issued = newcaps;
3649	cap->implemented \|= newcaps; / add bits only, to*
3650	* avoid stepping on a
3651	* pending revocation */
3652	wake = true;
3653	}
3654	BUG_ON(cap->issued & ~cap->implemented);
3655
3656	/ don't let check_caps skip sending a response to MDS for revoke msgs /
3657	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
3658	cap->mds_wanted = `0`;
3659	if (cap == ci->i_auth_cap)
3660	check_caps = `1`; / check auth cap only /
3661	else
3662	check_caps = `2`; / check all caps /
3663	}
3664
3665	if (extra_info->inline_version > `0` &&
3666	extra_info->inline_version >= ci->i_inline_version) {
3667	ci->i_inline_version = extra_info->inline_version;
3668	if (ci->i_inline_version != CEPH_INLINE_NONE &&
3669	(newcaps & (CEPH_CAP_FILE_CACHE\|CEPH_CAP_FILE_LAZYIO)))
3670	fill_inline = true;
3671	}
3672
3673	if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
3674	if (ci->i_auth_cap == cap) {
3675	if (newcaps & ~extra_info->issued)
3676	wake = true;
3677
3678	if (ci->i_requested_max_size > max_size \|\|
3679	!(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
3680	/ re-request max_size if necessary /
3681	ci->i_requested_max_size = `0`;
3682	wake = true;
3683	}
3684
3685	ceph_kick_flushing_inode_caps(session, ci);
3686	}
3687	up_read(sem: &session->s_mdsc->snap_rwsem);
3688	}
3689	spin_unlock(lock: &ci->i_ceph_lock);
3690
3691	if (fill_inline)
3692	ceph_fill_inline_data(inode, NULL, data: extra_info->inline_data,
3693	len: extra_info->inline_len);
3694
3695	if (queue_trunc)
3696	ceph_queue_vmtruncate(inode);
3697
3698	if (writeback)
3699	/*
3700	* queue inode for writeback: we can't actually call
3701	* filemap_write_and_wait, etc. from message handler
3702	* context.
3703	*/
3704	ceph_queue_writeback(inode);
3705	if (queue_invalidate)
3706	ceph_queue_invalidate(inode);
3707	if (deleted_inode)
3708	invalidate_aliases(inode);
3709	if (wake)
3710	wake_up_all(&ci->i_cap_wq);
3711
3712	mutex_unlock(lock: &session->s_mutex);
3713	if (check_caps == `1`)
3714	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY \| CHECK_CAPS_NOINVAL);
3715	else if (check_caps == `2`)
3716	ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
3717	}
3718
3719	/*
3720	* Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3721	* MDS has been safely committed.
3722	*/
3723	static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
3724	struct ceph_mds_caps *m,
3725	struct ceph_mds_session *session,
3726	struct ceph_cap *cap)
3727	__releases(ci->i_ceph_lock)
3728	{
3729	struct ceph_inode_info *ci = ceph_inode(inode);
3730	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
3731	struct ceph_cap_flush cf, tmp_cf;
3732	LIST_HEAD(to_remove);
3733	unsigned seq = le32_to_cpu(m->seq);
3734	int dirty = le32_to_cpu(m->dirty);
3735	int cleaned = `0`;
3736	bool drop = false;
3737	bool wake_ci = false;
3738	bool wake_mdsc = false;
3739
3740	list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
3741	/ Is this the one that was flushed? /
3742	if (cf->tid == flush_tid)
3743	cleaned = cf->caps;
3744
3745	/ Is this a capsnap? /
3746	if (cf->is_capsnap)
3747	continue;
3748
3749	if (cf->tid <= flush_tid) {
3750	/*
3751	* An earlier or current tid. The FLUSH_ACK should
3752	* represent a superset of this flush's caps.
3753	*/
3754	wake_ci \|= __detach_cap_flush_from_ci(ci, cf);
3755	list_add_tail(new: &cf->i_list, head: &to_remove);
3756	} else {
3757	/*
3758	* This is a later one. Any caps in it are still dirty
3759	* so don't count them as cleaned.
3760	*/
3761	cleaned &= ~cf->caps;
3762	if (!cleaned)
3763	break;
3764	}
3765	}
3766
3767	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
3768	" flushing %s -> %s\n",
3769	inode, session->s_mds, seq, ceph_cap_string(dirty),
3770	ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
3771	ceph_cap_string(ci->i_flushing_caps & ~cleaned));
3772
3773	if (list_empty(head: &to_remove) && !cleaned)
3774	goto out;
3775
3776	ci->i_flushing_caps &= ~cleaned;
3777
3778	spin_lock(lock: &mdsc->cap_dirty_lock);
3779
3780	list_for_each_entry(cf, &to_remove, i_list)
3781	wake_mdsc \|= __detach_cap_flush_from_mdsc(mdsc, cf);
3782
3783	if (ci->i_flushing_caps == `0`) {
3784	if (list_empty(head: &ci->i_cap_flush_list)) {
3785	list_del_init(entry: &ci->i_flushing_item);
3786	if (!list_empty(head: &session->s_cap_flushing)) {
3787	dout(" mds%d still flushing cap on %p\n",
3788	session->s_mds,
3789	&list_first_entry(&session->s_cap_flushing,
3790	struct ceph_inode_info,
3791	i_flushing_item)->netfs.inode);
3792	}
3793	}
3794	mdsc->num_cap_flushing--;
3795	dout(" inode %p now !flushing\n", inode);
3796
3797	if (ci->i_dirty_caps == `0`) {
3798	dout(" inode %p now clean\n", inode);
3799	BUG_ON(!list_empty(&ci->i_dirty_item));
3800	drop = true;
3801	if (ci->i_wr_ref == `0` &&
3802	ci->i_wrbuffer_ref_head == `0`) {
3803	BUG_ON(!ci->i_head_snapc);
3804	ceph_put_snap_context(sc: ci->i_head_snapc);
3805	ci->i_head_snapc = NULL;
3806	}
3807	} else {
3808	BUG_ON(list_empty(&ci->i_dirty_item));
3809	}
3810	}
3811	spin_unlock(lock: &mdsc->cap_dirty_lock);
3812
3813	out:
3814	spin_unlock(lock: &ci->i_ceph_lock);
3815
3816	while (!list_empty(head: &to_remove)) {
3817	cf = list_first_entry(&to_remove,
3818	struct ceph_cap_flush, i_list);
3819	list_del_init(entry: &cf->i_list);
3820	if (!cf->is_capsnap)
3821	ceph_free_cap_flush(cf);
3822	}
3823
3824	if (wake_ci)
3825	wake_up_all(&ci->i_cap_wq);
3826	if (wake_mdsc)
3827	wake_up_all(&mdsc->cap_flushing_wq);
3828	if (drop)
3829	iput(inode);
3830	}
3831
3832	void __ceph_remove_capsnap(struct inode inode, struct* ceph_cap_snap *capsnap,
3833	bool wake_ci, bool wake_mdsc)
3834	{
3835	struct ceph_inode_info *ci = ceph_inode(inode);
3836	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
3837	bool ret;
3838
3839	lockdep_assert_held(&ci->i_ceph_lock);
3840
3841	dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);
3842
3843	list_del_init(entry: &capsnap->ci_item);
3844	ret = __detach_cap_flush_from_ci(ci, cf: &capsnap->cap_flush);
3845	if (wake_ci)
3846	*wake_ci = ret;
3847
3848	spin_lock(lock: &mdsc->cap_dirty_lock);
3849	if (list_empty(head: &ci->i_cap_flush_list))
3850	list_del_init(entry: &ci->i_flushing_item);
3851
3852	ret = __detach_cap_flush_from_mdsc(mdsc, cf: &capsnap->cap_flush);
3853	if (wake_mdsc)
3854	*wake_mdsc = ret;
3855	spin_unlock(lock: &mdsc->cap_dirty_lock);
3856	}
3857
3858	void ceph_remove_capsnap(struct inode inode, struct* ceph_cap_snap *capsnap,
3859	bool wake_ci, bool wake_mdsc)
3860	{
3861	struct ceph_inode_info *ci = ceph_inode(inode);
3862
3863	lockdep_assert_held(&ci->i_ceph_lock);
3864
3865	WARN_ON_ONCE(capsnap->dirty_pages \|\| capsnap->writing);
3866	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
3867	}
3868
3869	/*
3870	* Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
3871	* throw away our cap_snap.
3872	*
3873	* Caller hold s_mutex.
3874	*/
3875	static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
3876	struct ceph_mds_caps *m,
3877	struct ceph_mds_session *session)
3878	{
3879	struct ceph_inode_info *ci = ceph_inode(inode);
3880	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb: inode->i_sb)->mdsc;
3881	u64 follows = le64_to_cpu(m->snap_follows);
3882	struct ceph_cap_snap capsnap = NULL, iter;
3883	bool wake_ci = false;
3884	bool wake_mdsc = false;
3885
3886	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3887	inode, ci, session->s_mds, follows);
3888
3889	spin_lock(lock: &ci->i_ceph_lock);
3890	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
3891	if (iter->follows == follows) {
3892	if (iter->cap_flush.tid != flush_tid) {
3893	dout(" cap_snap %p follows %lld tid %lld !="
3894	" %lld\n", iter, follows,
3895	flush_tid, iter->cap_flush.tid);
3896	break;
3897	}
3898	capsnap = iter;
3899	break;
3900	} else {
3901	dout(" skipping cap_snap %p follows %lld\n",
3902	iter, iter->follows);
3903	}
3904	}
3905	if (capsnap)
3906	ceph_remove_capsnap(inode, capsnap, wake_ci: &wake_ci, wake_mdsc: &wake_mdsc);
3907	spin_unlock(lock: &ci->i_ceph_lock);
3908
3909	if (capsnap) {
3910	ceph_put_snap_context(sc: capsnap->context);
3911	ceph_put_cap_snap(capsnap);
3912	if (wake_ci)
3913	wake_up_all(&ci->i_cap_wq);
3914	if (wake_mdsc)
3915	wake_up_all(&mdsc->cap_flushing_wq);
3916	iput(inode);
3917	}
3918	}
3919
3920	/*
3921	* Handle TRUNC from MDS, indicating file truncation.
3922	*
3923	* caller hold s_mutex.
3924	*/
3925	static bool handle_cap_trunc(struct inode *inode,
3926	struct ceph_mds_caps *trunc,
3927	struct ceph_mds_session *session,
3928	struct cap_extra_info *extra_info)
3929	{
3930	struct ceph_inode_info *ci = ceph_inode(inode);
3931	int mds = session->s_mds;
3932	int seq = le32_to_cpu(trunc->seq);
3933	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
3934	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
3935	u64 size = le64_to_cpu(trunc->size);
3936	int implemented = `0`;
3937	int dirty = __ceph_caps_dirty(ci);
3938	int issued = __ceph_caps_issued(ci: ceph_inode(inode), implemented: &implemented);
3939	bool queue_trunc = false;
3940
3941	lockdep_assert_held(&ci->i_ceph_lock);
3942
3943	issued \|= implemented \| dirty;
3944
3945	/*
3946	* If there is at least one crypto block then we'll trust
3947	* fscrypt_file_size. If the real length of the file is 0, then
3948	* ignore it (it has probably been truncated down to 0 by the MDS).
3949	*/
3950	if (IS_ENCRYPTED(inode) && size)
3951	size = extra_info->fscrypt_file_size;
3952
3953	dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
3954	__func__, inode, mds, seq, truncate_size, truncate_seq);
3955	queue_trunc = ceph_fill_file_size(inode, issued,
3956	truncate_seq, truncate_size, size);
3957	return queue_trunc;
3958	}
3959
3960	/*
3961	* Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
3962	* different one. If we are the most recent migration we've seen (as
3963	* indicated by mseq), make note of the migrating cap bits for the
3964	* duration (until we see the corresponding IMPORT).
3965	*
3966	* caller holds s_mutex
3967	*/
3968	static void handle_cap_export(struct inode inode, struct* ceph_mds_caps *ex,
3969	struct ceph_mds_cap_peer *ph,
3970	struct ceph_mds_session *session)
3971	{
3972	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
3973	struct ceph_mds_session *tsession = NULL;
3974	struct ceph_cap cap, tcap, *new_cap = NULL;
3975	struct ceph_inode_info *ci = ceph_inode(inode);
3976	u64 t_cap_id;
3977	unsigned mseq = le32_to_cpu(ex->migrate_seq);
3978	unsigned t_seq, t_mseq;
3979	int target, issued;
3980	int mds = session->s_mds;
3981
3982	if (ph) {
3983	t_cap_id = le64_to_cpu(ph->cap_id);
3984	t_seq = le32_to_cpu(ph->seq);
3985	t_mseq = le32_to_cpu(ph->mseq);
3986	target = le32_to_cpu(ph->mds);
3987	} else {
3988	t_cap_id = t_seq = t_mseq = `0`;
3989	target = -`1`;
3990	}
3991
3992	dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
3993	inode, ci, mds, mseq, target);
3994	retry:
3995	down_read(sem: &mdsc->snap_rwsem);
3996	spin_lock(lock: &ci->i_ceph_lock);
3997	cap = __get_cap_for_mds(ci, mds);
3998	if (!cap \|\| cap->cap_id != le64_to_cpu(ex->cap_id))
3999	goto out_unlock;
4000
4001	if (target < `0`) {
4002	ceph_remove_cap(cap, queue_release: false);
4003	goto out_unlock;
4004	}
4005
4006	/*
4007	* now we know we haven't received the cap import message yet
4008	* because the exported cap still exist.
4009	*/
4010
4011	issued = cap->issued;
4012	if (issued != cap->implemented)
4013	pr_err_ratelimited("handle_cap_export: issued != implemented: "
4014	"ino (%llx.%llx) mds%d seq %d mseq %d "
4015	"issued %s implemented %s\n",
4016	ceph_vinop(inode), mds, cap->seq, cap->mseq,
4017	ceph_cap_string(issued),
4018	ceph_cap_string(cap->implemented));
4019
4020
4021	tcap = __get_cap_for_mds(ci, mds: target);
4022	if (tcap) {
4023	/ already have caps from the target /
4024	if (tcap->cap_id == t_cap_id &&
4025	ceph_seq_cmp(a: tcap->seq, b: t_seq) < `0`) {
4026	dout(" updating import cap %p mds%d\n", tcap, target);
4027	tcap->cap_id = t_cap_id;
4028	tcap->seq = t_seq - `1`;
4029	tcap->issue_seq = t_seq - `1`;
4030	tcap->issued \|= issued;
4031	tcap->implemented \|= issued;
4032	if (cap == ci->i_auth_cap) {
4033	ci->i_auth_cap = tcap;
4034	change_auth_cap_ses(ci, session: tcap->session);
4035	}
4036	}
4037	ceph_remove_cap(cap, queue_release: false);
4038	goto out_unlock;
4039	} else if (tsession) {
4040	/ add placeholder for the export tagert /
4041	int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : `0`;
4042	tcap = new_cap;
4043	ceph_add_cap(inode, session: tsession, cap_id: t_cap_id, issued, wanted: `0`,
4044	seq: t_seq - `1`, mseq: t_mseq, realmino: (u64)-`1`, flags: flag, new_cap: &new_cap);
4045
4046	if (!list_empty(head: &ci->i_cap_flush_list) &&
4047	ci->i_auth_cap == tcap) {
4048	spin_lock(lock: &mdsc->cap_dirty_lock);
4049	list_move_tail(list: &ci->i_flushing_item,
4050	head: &tcap->session->s_cap_flushing);
4051	spin_unlock(lock: &mdsc->cap_dirty_lock);
4052	}
4053
4054	ceph_remove_cap(cap, queue_release: false);
4055	goto out_unlock;
4056	}
4057
4058	spin_unlock(lock: &ci->i_ceph_lock);
4059	up_read(sem: &mdsc->snap_rwsem);
4060	mutex_unlock(lock: &session->s_mutex);
4061
4062	/ open target session /
4063	tsession = ceph_mdsc_open_export_target_session(mdsc, target);
4064	if (!IS_ERR(ptr: tsession)) {
4065	if (mds > target) {
4066	mutex_lock(&session->s_mutex);
4067	mutex_lock_nested(lock: &tsession->s_mutex,
4068	SINGLE_DEPTH_NESTING);
4069	} else {
4070	mutex_lock(&tsession->s_mutex);
4071	mutex_lock_nested(lock: &session->s_mutex,
4072	SINGLE_DEPTH_NESTING);
4073	}
4074	new_cap = ceph_get_cap(mdsc, NULL);
4075	} else {
4076	WARN_ON(`1`);
4077	tsession = NULL;
4078	target = -`1`;
4079	mutex_lock(&session->s_mutex);
4080	}
4081	goto retry;
4082
4083	out_unlock:
4084	spin_unlock(lock: &ci->i_ceph_lock);
4085	up_read(sem: &mdsc->snap_rwsem);
4086	mutex_unlock(lock: &session->s_mutex);
4087	if (tsession) {
4088	mutex_unlock(lock: &tsession->s_mutex);
4089	ceph_put_mds_session(s: tsession);
4090	}
4091	if (new_cap)
4092	ceph_put_cap(mdsc, cap: new_cap);
4093	}
4094
4095	/*
4096	* Handle cap IMPORT.
4097	*
4098	* caller holds s_mutex. acquires i_ceph_lock
4099	*/
4100	static void handle_cap_import(struct ceph_mds_client *mdsc,
4101	struct inode inode, struct* ceph_mds_caps *im,
4102	struct ceph_mds_cap_peer *ph,
4103	struct ceph_mds_session *session,
4104	struct ceph_cap *target_cap, int* *old_issued)
4105	{
4106	struct ceph_inode_info *ci = ceph_inode(inode);
4107	struct ceph_cap cap, ocap, *new_cap = NULL;
4108	int mds = session->s_mds;
4109	int issued;
4110	unsigned caps = le32_to_cpu(im->caps);
4111	unsigned wanted = le32_to_cpu(im->wanted);
4112	unsigned seq = le32_to_cpu(im->seq);
4113	unsigned mseq = le32_to_cpu(im->migrate_seq);
4114	u64 realmino = le64_to_cpu(im->realm);
4115	u64 cap_id = le64_to_cpu(im->cap_id);
4116	u64 p_cap_id;
4117	int peer;
4118
4119	if (ph) {
4120	p_cap_id = le64_to_cpu(ph->cap_id);
4121	peer = le32_to_cpu(ph->mds);
4122	} else {
4123	p_cap_id = `0`;
4124	peer = -`1`;
4125	}
4126
4127	dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
4128	inode, ci, mds, mseq, peer);
4129	retry:
4130	cap = __get_cap_for_mds(ci, mds);
4131	if (!cap) {
4132	if (!new_cap) {
4133	spin_unlock(lock: &ci->i_ceph_lock);
4134	new_cap = ceph_get_cap(mdsc, NULL);
4135	spin_lock(lock: &ci->i_ceph_lock);
4136	goto retry;
4137	}
4138	cap = new_cap;
4139	} else {
4140	if (new_cap) {
4141	ceph_put_cap(mdsc, cap: new_cap);
4142	new_cap = NULL;
4143	}
4144	}
4145
4146	__ceph_caps_issued(ci, implemented: &issued);
4147	issued \|= __ceph_caps_dirty(ci);
4148
4149	ceph_add_cap(inode, session, cap_id, issued: caps, wanted, seq, mseq,
4150	realmino, CEPH_CAP_FLAG_AUTH, new_cap: &new_cap);
4151
4152	ocap = peer >= `0` ? __get_cap_for_mds(ci, mds: peer) : NULL;
4153	if (ocap && ocap->cap_id == p_cap_id) {
4154	dout(" remove export cap %p mds%d flags %d\n",
4155	ocap, peer, ph->flags);
4156	if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
4157	(ocap->seq != le32_to_cpu(ph->seq) \|\|
4158	ocap->mseq != le32_to_cpu(ph->mseq))) {
4159	pr_err_ratelimited("handle_cap_import: "
4160	"mismatched seq/mseq: ino (%llx.%llx) "
4161	"mds%d seq %d mseq %d importer mds%d "
4162	"has peer seq %d mseq %d\n",
4163	ceph_vinop(inode), peer, ocap->seq,
4164	ocap->mseq, mds, le32_to_cpu(ph->seq),
4165	le32_to_cpu(ph->mseq));
4166	}
4167	ceph_remove_cap(cap: ocap, queue_release: (ph->flags & CEPH_CAP_FLAG_RELEASE));
4168	}
4169
4170	*old_issued = issued;
4171	*target_cap = cap;
4172	}
4173
4174	#ifdef CONFIG_FS_ENCRYPTION
4175	static int parse_fscrypt_fields(void *p, void* *end,
4176	struct cap_extra_info *extra)
4177	{
4178	u32 len;
4179
4180	ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
4181	if (extra->fscrypt_auth_len) {
4182	ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
4183	extra->fscrypt_auth = kmalloc(size: extra->fscrypt_auth_len,
4184	GFP_KERNEL);
4185	if (!extra->fscrypt_auth)
4186	return -ENOMEM;
4187	ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
4188	extra->fscrypt_auth_len, bad);
4189	}
4190
4191	ceph_decode_32_safe(p, end, len, bad);
4192	if (len >= sizeof(u64)) {
4193	ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
4194	len -= sizeof(u64);
4195	}
4196	ceph_decode_skip_n(p, end, len, bad);
4197	return `0`;
4198	bad:
4199	return -EIO;
4200	}
4201	#else
4202	static int parse_fscrypt_fields(void *p, void* *end,
4203	struct cap_extra_info *extra)
4204	{
4205	u32 len;
4206
4207	/ Don't care about these fields unless we're encryption-capable /
4208	ceph_decode_32_safe(p, end, len, bad);
4209	if (len)
4210	ceph_decode_skip_n(p, end, len, bad);
4211	ceph_decode_32_safe(p, end, len, bad);
4212	if (len)
4213	ceph_decode_skip_n(p, end, len, bad);
4214	return `0`;
4215	bad:
4216	return -EIO;
4217	}
4218	#endif
4219
4220	/*
4221	* Handle a caps message from the MDS.
4222	*
4223	* Identify the appropriate session, inode, and call the right handler
4224	* based on the cap op.
4225	*/
4226	void ceph_handle_caps(struct ceph_mds_session *session,
4227	struct ceph_msg *msg)
4228	{
4229	struct ceph_mds_client *mdsc = session->s_mdsc;
4230	struct inode *inode;
4231	struct ceph_inode_info *ci;
4232	struct ceph_cap *cap;
4233	struct ceph_mds_caps *h;
4234	struct ceph_mds_cap_peer *peer = NULL;
4235	struct ceph_snap_realm *realm = NULL;
4236	int op;
4237	int msg_version = le16_to_cpu(msg->hdr.version);
4238	u32 seq, mseq;
4239	struct ceph_vino vino;
4240	void *snaptrace;
4241	size_t snaptrace_len;
4242	void p, end;
4243	struct cap_extra_info extra_info = {};
4244	bool queue_trunc;
4245	bool close_sessions = false;
4246	bool do_cap_release = false;
4247
4248	dout("handle_caps from mds%d\n", session->s_mds);
4249
4250	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
4251	return;
4252
4253	/ decode /
4254	end = msg->front.iov_base + msg->front.iov_len;
4255	if (msg->front.iov_len < sizeof(*h))
4256	goto bad;
4257	h = msg->front.iov_base;
4258	op = le32_to_cpu(h->op);
4259	vino.ino = le64_to_cpu(h->ino);
4260	vino.snap = CEPH_NOSNAP;
4261	seq = le32_to_cpu(h->seq);
4262	mseq = le32_to_cpu(h->migrate_seq);
4263
4264	snaptrace = h + `1`;
4265	snaptrace_len = le32_to_cpu(h->snap_trace_len);
4266	p = snaptrace + snaptrace_len;
4267
4268	if (msg_version >= `2`) {
4269	u32 flock_len;
4270	ceph_decode_32_safe(&p, end, flock_len, bad);
4271	if (p + flock_len > end)
4272	goto bad;
4273	p += flock_len;
4274	}
4275
4276	if (msg_version >= `3`) {
4277	if (op == CEPH_CAP_OP_IMPORT) {
4278	if (p + sizeof(*peer) > end)
4279	goto bad;
4280	peer = p;
4281	p += sizeof(*peer);
4282	} else if (op == CEPH_CAP_OP_EXPORT) {
4283	/ recorded in unused fields /
4284	peer = (void *)&h->size;
4285	}
4286	}
4287
4288	if (msg_version >= `4`) {
4289	ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
4290	ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
4291	if (p + extra_info.inline_len > end)
4292	goto bad;
4293	extra_info.inline_data = p;
4294	p += extra_info.inline_len;
4295	}
4296
4297	if (msg_version >= `5`) {
4298	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
4299	u32 epoch_barrier;
4300
4301	ceph_decode_32_safe(&p, end, epoch_barrier, bad);
4302	ceph_osdc_update_epoch_barrier(osdc, eb: epoch_barrier);
4303	}
4304
4305	if (msg_version >= `8`) {
4306	u32 pool_ns_len;
4307
4308	/ version >= 6 /
4309	ceph_decode_skip_64(&p, end, bad); // flush_tid
4310	/ version >= 7 /
4311	ceph_decode_skip_32(&p, end, bad); // caller_uid
4312	ceph_decode_skip_32(&p, end, bad); // caller_gid
4313	/ version >= 8 /
4314	ceph_decode_32_safe(&p, end, pool_ns_len, bad);
4315	if (pool_ns_len > `0`) {
4316	ceph_decode_need(&p, end, pool_ns_len, bad);
4317	extra_info.pool_ns =
4318	ceph_find_or_create_string(str: p, len: pool_ns_len);
4319	p += pool_ns_len;
4320	}
4321	}
4322
4323	if (msg_version >= `9`) {
4324	struct ceph_timespec *btime;
4325
4326	if (p + sizeof(*btime) > end)
4327	goto bad;
4328	btime = p;
4329	ceph_decode_timespec64(ts: &extra_info.btime, tv: btime);
4330	p += sizeof(*btime);
4331	ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
4332	}
4333
4334	if (msg_version >= `11`) {
4335	/ version >= 10 /
4336	ceph_decode_skip_32(&p, end, bad); // flags
4337	/ version >= 11 /
4338	extra_info.dirstat_valid = true;
4339	ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
4340	ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
4341	}
4342
4343	if (msg_version >= `12`) {
4344	if (parse_fscrypt_fields(p: &p, end, extra: &extra_info))
4345	goto bad;
4346	}
4347
4348	/ lookup ino /
4349	inode = ceph_find_inode(sb: mdsc->fsc->sb, vino);
4350	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
4351	vino.snap, inode);
4352
4353	mutex_lock(&session->s_mutex);
4354	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
4355	(unsigned)seq);
4356
4357	if (!inode) {
4358	dout(" i don't have ino %llx\n", vino.ino);
4359
4360	switch (op) {
4361	case CEPH_CAP_OP_IMPORT:
4362	case CEPH_CAP_OP_REVOKE:
4363	case CEPH_CAP_OP_GRANT:
4364	do_cap_release = true;
4365	break;
4366	default:
4367	break;
4368	}
4369	goto flush_cap_releases;
4370	}
4371	ci = ceph_inode(inode);
4372
4373	/ these will work even if we don't have a cap yet /
4374	switch (op) {
4375	case CEPH_CAP_OP_FLUSHSNAP_ACK:
4376	handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4377	m: h, session);
4378	goto done;
4379
4380	case CEPH_CAP_OP_EXPORT:
4381	handle_cap_export(inode, ex: h, ph: peer, session);
4382	goto done_unlocked;
4383
4384	case CEPH_CAP_OP_IMPORT:
4385	realm = NULL;
4386	if (snaptrace_len) {
4387	down_write(sem: &mdsc->snap_rwsem);
4388	if (ceph_update_snap_trace(m: mdsc, p: snaptrace,
4389	e: snaptrace + snaptrace_len,
4390	deletion: false, realm_ret: &realm)) {
4391	up_write(sem: &mdsc->snap_rwsem);
4392	close_sessions = true;
4393	goto done;
4394	}
4395	downgrade_write(sem: &mdsc->snap_rwsem);
4396	} else {
4397	down_read(sem: &mdsc->snap_rwsem);
4398	}
4399	spin_lock(lock: &ci->i_ceph_lock);
4400	handle_cap_import(mdsc, inode, im: h, ph: peer, session,
4401	target_cap: &cap, old_issued: &extra_info.issued);
4402	handle_cap_grant(inode, session, cap,
4403	grant: h, xattr_buf: msg->middle, extra_info: &extra_info);
4404	if (realm)
4405	ceph_put_snap_realm(mdsc, realm);
4406	goto done_unlocked;
4407	}
4408
4409	/ the rest require a cap /
4410	spin_lock(lock: &ci->i_ceph_lock);
4411	cap = __get_cap_for_mds(ci: ceph_inode(inode), mds: session->s_mds);
4412	if (!cap) {
4413	dout(" no cap on %p ino %llx.%llx from mds%d\n",
4414	inode, ceph_ino(inode), ceph_snap(inode),
4415	session->s_mds);
4416	spin_unlock(lock: &ci->i_ceph_lock);
4417	switch (op) {
4418	case CEPH_CAP_OP_REVOKE:
4419	case CEPH_CAP_OP_GRANT:
4420	do_cap_release = true;
4421	break;
4422	default:
4423	break;
4424	}
4425	goto flush_cap_releases;
4426	}
4427
4428	/ note that each of these drops i_ceph_lock for us /
4429	switch (op) {
4430	case CEPH_CAP_OP_REVOKE:
4431	case CEPH_CAP_OP_GRANT:
4432	__ceph_caps_issued(ci, implemented: &extra_info.issued);
4433	extra_info.issued \|= __ceph_caps_dirty(ci);
4434	handle_cap_grant(inode, session, cap,
4435	grant: h, xattr_buf: msg->middle, extra_info: &extra_info);
4436	goto done_unlocked;
4437
4438	case CEPH_CAP_OP_FLUSH_ACK:
4439	handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4440	m: h, session, cap);
4441	break;
4442
4443	case CEPH_CAP_OP_TRUNC:
4444	queue_trunc = handle_cap_trunc(inode, trunc: h, session,
4445	extra_info: &extra_info);
4446	spin_unlock(lock: &ci->i_ceph_lock);
4447	if (queue_trunc)
4448	ceph_queue_vmtruncate(inode);
4449	break;
4450
4451	default:
4452	spin_unlock(lock: &ci->i_ceph_lock);
4453	pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
4454	ceph_cap_op_name(op));
4455	}
4456
4457	done:
4458	mutex_unlock(lock: &session->s_mutex);
4459	done_unlocked:
4460	iput(inode);
4461	out:
4462	ceph_dec_mds_stopping_blocker(mdsc);
4463
4464	ceph_put_string(str: extra_info.pool_ns);
4465
4466	/ Defer closing the sessions after s_mutex lock being released /
4467	if (close_sessions)
4468	ceph_mdsc_close_sessions(mdsc);
4469
4470	kfree(objp: extra_info.fscrypt_auth);
4471	return;
4472
4473	flush_cap_releases:
4474	/*
4475	* send any cap release message to try to move things
4476	* along for the mds (who clearly thinks we still have this
4477	* cap).
4478	*/
4479	if (do_cap_release) {
4480	cap = ceph_get_cap(mdsc, NULL);
4481	cap->cap_ino = vino.ino;
4482	cap->queue_release = `1`;
4483	cap->cap_id = le64_to_cpu(h->cap_id);
4484	cap->mseq = mseq;
4485	cap->seq = seq;
4486	cap->issue_seq = seq;
4487	spin_lock(lock: &session->s_cap_lock);
4488	__ceph_queue_cap_release(session, cap);
4489	spin_unlock(lock: &session->s_cap_lock);
4490	}
4491	ceph_flush_cap_releases(mdsc, session);
4492	goto done;
4493
4494	bad:
4495	pr_err("ceph_handle_caps: corrupt message\n");
4496	ceph_msg_dump(msg);
4497	goto out;
4498	}
4499
4500	/*
4501	* Delayed work handler to process end of delayed cap release LRU list.
4502	*
4503	* If new caps are added to the list while processing it, these won't get
4504	* processed in this run. In this case, the ci->i_hold_caps_max will be
4505	* returned so that the work can be scheduled accordingly.
4506	*/
4507	unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4508	{
4509	struct inode *inode;
4510	struct ceph_inode_info *ci;
4511	struct ceph_mount_options *opt = mdsc->fsc->mount_options;
4512	unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
4513	unsigned long loop_start = jiffies;
4514	unsigned long delay = `0`;
4515
4516	dout("check_delayed_caps\n");
4517	spin_lock(lock: &mdsc->cap_delay_lock);
4518	while (!list_empty(head: &mdsc->cap_delay_list)) {
4519	ci = list_first_entry(&mdsc->cap_delay_list,
4520	struct ceph_inode_info,
4521	i_cap_delay_list);
4522	if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
4523	dout("%s caps added recently. Exiting loop", __func__);
4524	delay = ci->i_hold_caps_max;
4525	break;
4526	}
4527	if ((ci->i_ceph_flags & CEPH_I_FLUSH) == `0` &&
4528	time_before(jiffies, ci->i_hold_caps_max))
4529	break;
4530	list_del_init(entry: &ci->i_cap_delay_list);
4531
4532	inode = igrab(&ci->netfs.inode);
4533	if (inode) {
4534	spin_unlock(lock: &mdsc->cap_delay_lock);
4535	dout("check_delayed_caps on %p\n", inode);
4536	ceph_check_caps(ci, flags: `0`);
4537	iput(inode);
4538	spin_lock(lock: &mdsc->cap_delay_lock);
4539	}
4540	}
4541	spin_unlock(lock: &mdsc->cap_delay_lock);
4542
4543	return delay;
4544	}
4545
4546	/*
4547	* Flush all dirty caps to the mds
4548	*/
4549	static void flush_dirty_session_caps(struct ceph_mds_session *s)
4550	{
4551	struct ceph_mds_client *mdsc = s->s_mdsc;
4552	struct ceph_inode_info *ci;
4553	struct inode *inode;
4554
4555	dout("flush_dirty_caps\n");
4556	spin_lock(lock: &mdsc->cap_dirty_lock);
4557	while (!list_empty(head: &s->s_cap_dirty)) {
4558	ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
4559	i_dirty_item);
4560	inode = &ci->netfs.inode;
4561	ihold(inode);
4562	dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
4563	spin_unlock(lock: &mdsc->cap_dirty_lock);
4564	ceph_wait_on_async_create(inode);
4565	ceph_check_caps(ci, CHECK_CAPS_FLUSH);
4566	iput(inode);
4567	spin_lock(lock: &mdsc->cap_dirty_lock);
4568	}
4569	spin_unlock(lock: &mdsc->cap_dirty_lock);
4570	dout("flush_dirty_caps done\n");
4571	}
4572
4573	void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4574	{
4575	ceph_mdsc_iterate_sessions(mdsc, cb: flush_dirty_session_caps, check_state: true);
4576	}
4577
4578	void __ceph_touch_fmode(struct ceph_inode_info *ci,
4579	struct ceph_mds_client mdsc, int* fmode)
4580	{
4581	unsigned long now = jiffies;
4582	if (fmode & CEPH_FILE_MODE_RD)
4583	ci->i_last_rd = now;
4584	if (fmode & CEPH_FILE_MODE_WR)
4585	ci->i_last_wr = now;
4586	/ queue periodic check /
4587	if (fmode &&
4588	__ceph_is_any_real_caps(ci) &&
4589	list_empty(head: &ci->i_cap_delay_list))
4590	__cap_delay_requeue(mdsc, ci);
4591	}
4592
4593	void ceph_get_fmode(struct ceph_inode_info ci, int* fmode, int count)
4594	{
4595	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: ci->netfs.inode.i_sb);
4596	int bits = (fmode << `1`) \| `1`;
4597	bool already_opened = false;
4598	int i;
4599
4600	if (count == `1`)
4601	atomic64_inc(v: &mdsc->metric.opened_files);
4602
4603	spin_lock(lock: &ci->i_ceph_lock);
4604	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++) {
4605	/*
4606	* If any of the mode ref is larger than 0,
4607	* that means it has been already opened by
4608	* others. Just skip checking the PIN ref.
4609	*/
4610	if (i && ci->i_nr_by_mode[i])
4611	already_opened = true;
4612
4613	if (bits & (`1` << i))
4614	ci->i_nr_by_mode[i] += count;
4615	}
4616
4617	if (!already_opened)
4618	percpu_counter_inc(fbc: &mdsc->metric.opened_inodes);
4619	spin_unlock(lock: &ci->i_ceph_lock);
4620	}
4621
4622	/*
4623	* Drop open file reference. If we were the last open file,
4624	* we may need to release capabilities to the MDS (or schedule
4625	* their delayed release).
4626	*/
4627	void ceph_put_fmode(struct ceph_inode_info ci, int* fmode, int count)
4628	{
4629	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb: ci->netfs.inode.i_sb);
4630	int bits = (fmode << `1`) \| `1`;
4631	bool is_closed = true;
4632	int i;
4633
4634	if (count == `1`)
4635	atomic64_dec(v: &mdsc->metric.opened_files);
4636
4637	spin_lock(lock: &ci->i_ceph_lock);
4638	for (i = `0`; i < CEPH_FILE_MODE_BITS; i++) {
4639	if (bits & (`1` << i)) {
4640	BUG_ON(ci->i_nr_by_mode[i] < count);
4641	ci->i_nr_by_mode[i] -= count;
4642	}
4643
4644	/*
4645	* If any of the mode ref is not 0 after
4646	* decreased, that means it is still opened
4647	* by others. Just skip checking the PIN ref.
4648	*/
4649	if (i && ci->i_nr_by_mode[i])
4650	is_closed = false;
4651	}
4652
4653	if (is_closed)
4654	percpu_counter_dec(fbc: &mdsc->metric.opened_inodes);
4655	spin_unlock(lock: &ci->i_ceph_lock);
4656	}
4657
4658	/*
4659	* For a soon-to-be unlinked file, drop the LINK caps. If it
4660	* looks like the link count will hit 0, drop any other caps (other
4661	* than PIN) we don't specifically want (due to the file still being
4662	* open).
4663	*/
4664	int ceph_drop_caps_for_unlink(struct inode *inode)
4665	{
4666	struct ceph_inode_info *ci = ceph_inode(inode);
4667	int drop = CEPH_CAP_LINK_SHARED \| CEPH_CAP_LINK_EXCL;
4668
4669	spin_lock(lock: &ci->i_ceph_lock);
4670	if (inode->i_nlink == `1`) {
4671	drop \|= ~(__ceph_caps_wanted(ci) \| CEPH_CAP_PIN);
4672
4673	if (__ceph_caps_dirty(ci)) {
4674	struct ceph_mds_client *mdsc =
4675	ceph_inode_to_client(inode)->mdsc;
4676	__cap_delay_requeue_front(mdsc, ci);
4677	}
4678	}
4679	spin_unlock(lock: &ci->i_ceph_lock);
4680	return drop;
4681	}
4682
4683	/*
4684	* Helpers for embedding cap and dentry lease releases into mds
4685	* requests.
4686	*
4687	* @force is used by dentry_release (below) to force inclusion of a
4688	* record for the directory inode, even when there aren't any caps to
4689	* drop.
4690	*/
4691	int ceph_encode_inode_release(void p, struct** inode *inode,
4692	int mds, int drop, int unless, int force)
4693	{
4694	struct ceph_inode_info *ci = ceph_inode(inode);
4695	struct ceph_cap *cap;
4696	struct ceph_mds_request_release rel = p;
4697	int used, dirty;
4698	int ret = `0`;
4699
4700	spin_lock(lock: &ci->i_ceph_lock);
4701	used = __ceph_caps_used(ci);
4702	dirty = __ceph_caps_dirty(ci);
4703
4704	dout("encode_inode_release %p mds%d used\|dirty %s drop %s unless %s\n",
4705	inode, mds, ceph_cap_string(used\|dirty), ceph_cap_string(drop),
4706	ceph_cap_string(unless));
4707
4708	/ only drop unused, clean caps /
4709	drop &= ~(used \| dirty);
4710
4711	cap = __get_cap_for_mds(ci, mds);
4712	if (cap && __cap_is_valid(cap)) {
4713	unless &= cap->issued;
4714	if (unless) {
4715	if (unless & CEPH_CAP_AUTH_EXCL)
4716	drop &= ~CEPH_CAP_AUTH_SHARED;
4717	if (unless & CEPH_CAP_LINK_EXCL)
4718	drop &= ~CEPH_CAP_LINK_SHARED;
4719	if (unless & CEPH_CAP_XATTR_EXCL)
4720	drop &= ~CEPH_CAP_XATTR_SHARED;
4721	if (unless & CEPH_CAP_FILE_EXCL)
4722	drop &= ~CEPH_CAP_FILE_SHARED;
4723	}
4724
4725	if (force \|\| (cap->issued & drop)) {
4726	if (cap->issued & drop) {
4727	int wanted = __ceph_caps_wanted(ci);
4728	dout("encode_inode_release %p cap %p "
4729	"%s -> %s, wanted %s -> %s\n", inode, cap,
4730	ceph_cap_string(cap->issued),
4731	ceph_cap_string(cap->issued & ~drop),
4732	ceph_cap_string(cap->mds_wanted),
4733	ceph_cap_string(wanted));
4734
4735	cap->issued &= ~drop;
4736	cap->implemented &= ~drop;
4737	cap->mds_wanted = wanted;
4738	if (cap == ci->i_auth_cap &&
4739	!(wanted & CEPH_CAP_ANY_FILE_WR))
4740	ci->i_requested_max_size = `0`;
4741	} else {
4742	dout("encode_inode_release %p cap %p %s"
4743	" (force)\n", inode, cap,
4744	ceph_cap_string(cap->issued));
4745	}
4746
4747	rel->ino = cpu_to_le64(ceph_ino(inode));
4748	rel->cap_id = cpu_to_le64(cap->cap_id);
4749	rel->seq = cpu_to_le32(cap->seq);
4750	rel->issue_seq = cpu_to_le32(cap->issue_seq);
4751	rel->mseq = cpu_to_le32(cap->mseq);
4752	rel->caps = cpu_to_le32(cap->implemented);
4753	rel->wanted = cpu_to_le32(cap->mds_wanted);
4754	rel->dname_len = `0`;
4755	rel->dname_seq = `0`;
4756	p += sizeof(rel);
4757	ret = `1`;
4758	} else {
4759	dout("encode_inode_release %p cap %p %s (noop)\n",
4760	inode, cap, ceph_cap_string(cap->issued));
4761	}
4762	}
4763	spin_unlock(lock: &ci->i_ceph_lock);
4764	return ret;
4765	}
4766
4767	/**
4768	* ceph_encode_dentry_release - encode a dentry release into an outgoing request
4769	* @p: outgoing request buffer
4770	* @dentry: dentry to release
4771	* @dir: dir to release it from
4772	* @mds: mds that we're speaking to
4773	* @drop: caps being dropped
4774	* @unless: unless we have these caps
4775	*
4776	* Encode a dentry release into an outgoing request buffer. Returns 1 if the
4777	* thing was released, or a negative error code otherwise.
4778	*/
4779	int ceph_encode_dentry_release(void p, struct** dentry *dentry,
4780	struct inode *dir,
4781	int mds, int drop, int unless)
4782	{
4783	struct dentry *parent = NULL;
4784	struct ceph_mds_request_release rel = p;
4785	struct ceph_dentry_info *di = ceph_dentry(dentry);
4786	int force = `0`;
4787	int ret;
4788
4789	/*
4790	* force an record for the directory caps if we have a dentry lease.
4791	* this is racy (can't take i_ceph_lock and d_lock together), but it
4792	* doesn't have to be perfect; the mds will revoke anything we don't
4793	* release.
4794	*/
4795	spin_lock(lock: &dentry->d_lock);
4796	if (di->lease_session && di->lease_session->s_mds == mds)
4797	force = `1`;
4798	if (!dir) {
4799	parent = dget(dentry: dentry->d_parent);
4800	dir = d_inode(dentry: parent);
4801	}
4802	spin_unlock(lock: &dentry->d_lock);
4803
4804	ret = ceph_encode_inode_release(p, inode: dir, mds, drop, unless, force);
4805	dput(parent);
4806
4807	spin_lock(lock: &dentry->d_lock);
4808	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
4809	dout("encode_dentry_release %p mds%d seq %d\n",
4810	dentry, mds, (int)di->lease_seq);
4811	rel->dname_seq = cpu_to_le32(di->lease_seq);
4812	__ceph_mdsc_drop_dentry_lease(dentry);
4813	spin_unlock(lock: &dentry->d_lock);
4814	if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(inode: dir)) {
4815	int ret2 = ceph_encode_encrypted_fname(parent: dir, dentry, buf: *p);
4816
4817	if (ret2 < `0`)
4818	return ret2;
4819
4820	rel->dname_len = cpu_to_le32(ret2);
4821	*p += ret2;
4822	} else {
4823	rel->dname_len = cpu_to_le32(dentry->d_name.len);
4824	memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4825	*p += dentry->d_name.len;
4826	}
4827	} else {
4828	spin_unlock(lock: &dentry->d_lock);
4829	}
4830	return ret;
4831	}
4832
4833	static int remove_capsnaps(struct ceph_mds_client mdsc, struct* inode *inode)
4834	{
4835	struct ceph_inode_info *ci = ceph_inode(inode);
4836	struct ceph_cap_snap *capsnap;
4837	int capsnap_release = `0`;
4838
4839	lockdep_assert_held(&ci->i_ceph_lock);
4840
4841	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
4842
4843	while (!list_empty(head: &ci->i_cap_snaps)) {
4844	capsnap = list_first_entry(&ci->i_cap_snaps,
4845	struct ceph_cap_snap, ci_item);
4846	__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
4847	ceph_put_snap_context(sc: capsnap->context);
4848	ceph_put_cap_snap(capsnap);
4849	capsnap_release++;
4850	}
4851	wake_up_all(&ci->i_cap_wq);
4852	wake_up_all(&mdsc->cap_flushing_wq);
4853	return capsnap_release;
4854	}
4855
4856	int ceph_purge_inode_cap(struct inode inode, struct* ceph_cap cap, bool invalidate)
4857	{
4858	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
4859	struct ceph_mds_client *mdsc = fsc->mdsc;
4860	struct ceph_inode_info *ci = ceph_inode(inode);
4861	bool is_auth;
4862	bool dirty_dropped = false;
4863	int iputs = `0`;
4864
4865	lockdep_assert_held(&ci->i_ceph_lock);
4866
4867	dout("removing cap %p, ci is %p, inode is %p\n",
4868	cap, ci, &ci->netfs.inode);
4869
4870	is_auth = (cap == ci->i_auth_cap);
4871	__ceph_remove_cap(cap, queue_release: false);
4872	if (is_auth) {
4873	struct ceph_cap_flush *cf;
4874
4875	if (ceph_inode_is_shutdown(inode)) {
4876	if (inode->i_data.nrpages > `0`)
4877	*invalidate = true;
4878	if (ci->i_wrbuffer_ref > `0`)
4879	mapping_set_error(mapping: &inode->i_data, error: -EIO);
4880	}
4881
4882	spin_lock(lock: &mdsc->cap_dirty_lock);
4883
4884	/ trash all of the cap flushes for this inode /
4885	while (!list_empty(head: &ci->i_cap_flush_list)) {
4886	cf = list_first_entry(&ci->i_cap_flush_list,
4887	struct ceph_cap_flush, i_list);
4888	list_del_init(entry: &cf->g_list);
4889	list_del_init(entry: &cf->i_list);
4890	if (!cf->is_capsnap)
4891	ceph_free_cap_flush(cf);
4892	}
4893
4894	if (!list_empty(head: &ci->i_dirty_item)) {
4895	pr_warn_ratelimited(
4896	" dropping dirty %s state for %p %lld\n",
4897	ceph_cap_string(ci->i_dirty_caps),
4898	inode, ceph_ino(inode));
4899	ci->i_dirty_caps = `0`;
4900	list_del_init(entry: &ci->i_dirty_item);
4901	dirty_dropped = true;
4902	}
4903	if (!list_empty(head: &ci->i_flushing_item)) {
4904	pr_warn_ratelimited(
4905	" dropping dirty+flushing %s state for %p %lld\n",
4906	ceph_cap_string(ci->i_flushing_caps),
4907	inode, ceph_ino(inode));
4908	ci->i_flushing_caps = `0`;
4909	list_del_init(entry: &ci->i_flushing_item);
4910	mdsc->num_cap_flushing--;
4911	dirty_dropped = true;
4912	}
4913	spin_unlock(lock: &mdsc->cap_dirty_lock);
4914
4915	if (dirty_dropped) {
4916	mapping_set_error(mapping: inode->i_mapping, error: -EIO);
4917
4918	if (ci->i_wrbuffer_ref_head == `0` &&
4919	ci->i_wr_ref == `0` &&
4920	ci->i_dirty_caps == `0` &&
4921	ci->i_flushing_caps == `0`) {
4922	ceph_put_snap_context(sc: ci->i_head_snapc);
4923	ci->i_head_snapc = NULL;
4924	}
4925	}
4926
4927	if (atomic_read(v: &ci->i_filelock_ref) > `0`) {
4928	/ make further file lock syscall return -EIO /
4929	ci->i_ceph_flags \|= CEPH_I_ERROR_FILELOCK;
4930	pr_warn_ratelimited(" dropping file locks for %p %lld\n",
4931	inode, ceph_ino(inode));
4932	}
4933
4934	if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
4935	cf = ci->i_prealloc_cap_flush;
4936	ci->i_prealloc_cap_flush = NULL;
4937	if (!cf->is_capsnap)
4938	ceph_free_cap_flush(cf);
4939	}
4940
4941	if (!list_empty(head: &ci->i_cap_snaps))
4942	iputs = remove_capsnaps(mdsc, inode);
4943	}
4944	if (dirty_dropped)
4945	++iputs;
4946	return iputs;
4947	}
4948

source code of linux/fs/ceph/caps.c