// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"

#include <linux/iversion.h>

/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};
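
/*
 * Because each goal value equals its radix tree tag, the walk code can pass
 * the goal straight into a tag-based lookup.  Illustrative excerpt of what
 * xfs_icwalk_ag() does further down in this file:
 *
 *	nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
 *			(void **)batch, first_index, XFS_LOOKUP_BATCH, goal);
 */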

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)
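
/*
 * Note: the split between the public flags (XFS_ICWALK_FLAGS_VALID, defined
 * in xfs_icache.h) and the private flags above is enforced at compile time
 * by the BUILD_BUG_ON() at the bottom of xfs_icwalk().
 */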

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = alloc_inode_sb(mp->m_super, xfs_inode_cache,
			GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_cache, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode or i_state! */
	VFS_I(ip)->i_mode = 0;
	VFS_I(ip)->i_state = 0;
	mapping_set_large_folios(VFS_I(ip)->i_mapping);

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_cowfp = NULL;
	memset(&ip->i_af, 0, sizeof(ip->i_af));
	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
	ip->i_nblocks = 0;
	ip->i_forkoff = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);
	ip->i_next_unlinked = NULLAGINO;
	ip->i_prev_unlinked = 0;

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	xfs_ifork_zap_attr(ip);

	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_cache, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state.  The ip->i_flags_lock provides the barrier against
	 * lookup races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and
 * there isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}
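
/*
 * Worked example (assuming the documented default of xfs_syncd_centisecs =
 * 3000, i.e. 30 seconds): 3000 / 6 * 10 = 5000ms, so background reclaim
 * reschedules itself at one sixth of the syncd period, every 5 seconds.
 */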

/*
 * Background scanning to trim preallocated space.  This is queued based on the
 * 'speculative_prealloc_lifetime' tunable (5m by default).
 */
static inline void
xfs_blockgc_queue(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	if (!xfs_is_blockgc_enabled(mp))
		return;

	rcu_read_lock();
	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
				   &pag->pag_blockgc_work,
				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
	rcu_read_unlock();
}
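
/*
 * With the default xfs_blockgc_secs of 300, the delay above works out to
 * 300 * 1000 = 300000ms, matching the 5 minute lifetime noted above.
 */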

/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
}

/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure.  This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally.  Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	mapping_set_large_folios(inode->i_mapping);
	return error;
}

/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return -EAGAIN;

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode.  If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/* Make all pending inactivation work start immediately. */
static bool
xfs_inodegc_queue_all(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			ret = false;

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
			ret = true;
		}
	}

	return ret;
}

/* Wait for all queued work and collect errors */
static int
xfs_inodegc_wait_all(
	struct xfs_mount	*mp)
{
	int			cpu;
	int			error = 0;

	flush_workqueue(mp->m_inodegc_wq);
	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		struct xfs_inodegc	*gc;

		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (gc->error && !error)
			error = gc->error;
		gc->error = 0;
	}

	return error;
}
/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet.  The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock.  For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	wait_on_inode to wait for these flags to be cleared
	 *	instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error == -EAGAIN)
			goto out_skip;
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock.  We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}

static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_has_v3inodes(mp) &&
	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
		VFS_I(ip)->i_generation = get_random_u32();
	} else {
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path.  Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_init_new_inode will
	 * handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
	    error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
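
/*
 * Illustrative sketch (not part of this file) of how a metadata operation
 * typically consumes xfs_iget(); "do_something" is a hypothetical stand-in
 * for the caller's real work:
 *
 *	struct xfs_inode	*ip;
 *	int			error;
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	error = do_something(ip);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);
 *	return error;
 */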

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget().  In both cases, the inode will have XFS_IRECLAIM set.  If the
 * inode has been fully recycled by the time we get the i_flags_lock,
 * XFS_IRECLAIMABLE will not be set.  Hence we need to check for both these
 * flag conditions to avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided.  Those races should be resolved only after we
 * have ensured that we are able to reclaim this inode and the world can see
 * that we are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	/*
	 * Check for log shutdown because aborting the inode can move the log
	 * tail and corrupt in memory state.  This is fine if the log is shut
	 * down, but if the log is still active and only the mount is shut down
	 * then the in-memory log tail movement caused by the abort can be
	 * incorrectly propagated to disk.
	 */
	if (xlog_is_shutdown(ip->i_mount->m_log)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_shutdown_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
	trace_xfs_inode_reclaiming(ip);

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here.  By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/* Reclaim sick inodes if we're unmounting or the fs went down. */
static inline bool
xfs_want_reclaim_sick(
	struct xfs_mount	*mp)
{
	return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
	       xfs_is_shutdown(mp);
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= 0,
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim.  We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here.  We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	unsigned long		nr_to_scan)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	long			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}
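
/*
 * Note: elsewhere in the tree (xfs_super.c, not shown here) this pair backs
 * the superblock shrinker: xfs_fs_nr_cached_objects() reports
 * xfs_reclaim_inodes_count(), and xfs_fs_free_cached_objects() feeds the
 * shrinker's scan count into xfs_reclaim_inodes_nr().
 */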

STATIC bool
xfs_icwalk_match_id(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid != icw->icw_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_icwalk_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid == icw->icw_prid)
		return true;

	return false;
}
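
/*
 * To see the difference between the two filters: with both the UID and GID
 * flags set, xfs_icwalk_match_id() selects only inodes owned by icw_uid AND
 * icw_gid (an intersection), while xfs_icwalk_match_id_union() selects
 * inodes owned by icw_uid OR icw_gid.  The union form is what the
 * low-quota-space scan in xfs_blockgc_free_dquots() below relies on.
 */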

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @icw?  The inode is eligible if @icw is null or
 * if the predicate functions match.
 */
static bool
xfs_icwalk_match(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			match;

	if (!icw)
		return true;

	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
		match = xfs_icwalk_match_id_union(ip, icw);
	else
		match = xfs_icwalk_match_id(ip, icw);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
	    XFS_ISIZE(ip) < icw->icw_min_file_size)
		return false;

	return true;
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time.  It kicks itself every
 * few seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct	*work)
{
	struct xfs_mount	*mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;
	/*
	 * If the mapping is dirty, the operation can block and wait for some
	 * time.  Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (xfs_can_free_eofblocks(ip, false))
		return xfs_free_eofblocks(ip);

	/* inode could be preallocated or append-only */
	trace_xfs_inode_free_eofblocks_invalid(ip);
	xfs_inode_clear_eofblocks_tag(ip);
	return 0;
}

static void
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & iflag)
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= iflag;
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}

static void
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~iflag;
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
	spin_unlock(&ip->i_flags_lock);

	if (!clear_tag)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;
	int			ret = 0;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_MMAPLOCK_EXCL;

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}

void
xfs_inode_set_cowblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}

void
xfs_inode_clear_cowblocks_tag(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_stop(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (!xfs_clear_blockgc_enabled(mp))
		return;

	for_each_perag(mp, agno, pag)
		cancel_delayed_work_sync(&pag->pag_blockgc_work);
	trace_xfs_blockgc_stop(mp, __return_address);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_start(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (xfs_set_blockgc_enabled(mp))
		return;

	trace_xfs_blockgc_start(mp, __return_address);
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		xfs_blockgc_queue(pag);
}

/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
					 XFS_NEED_INACTIVE | \
					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
/*
 * Decide if the given @ip is eligible for garbage collection of speculative
 * preallocations, and grab it if so.  Returns true if it's ready to go or
 * false if we should just ignore it.
 */
static bool
xfs_blockgc_igrab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (xfs_is_shutdown(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		lockflags = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
	if (error)
		goto unlock;

	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
unlock:
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	xfs_irele(ip);
	return error;
}

/* Background worker that trims preallocated space. */
void
xfs_blockgc_worker(
	struct work_struct	*work)
{
	struct xfs_perag	*pag = container_of(to_delayed_work(work),
					struct xfs_perag, pag_blockgc_work);
	struct xfs_mount	*mp = pag->pag_mount;
	int			error;

	trace_xfs_blockgc_worker(mp, __return_address);

	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
	if (error)
		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
				pag->pag_agno, error);
	xfs_blockgc_queue(pag);
}

/*
 * Try to free space in the filesystem by purging inactive inodes, eofblocks
 * and cowblocks.
 */
int
xfs_blockgc_free_space(
	struct xfs_mount	*mp,
	struct xfs_icwalk	*icw)
{
	int			error;

	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);

	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
	if (error)
		return error;

	return xfs_inodegc_flush(mp);
}

/*
 * Reclaim all the free space that we can by scheduling the background blockgc
 * and inodegc workers immediately and waiting for them all to clear.
 */
int
xfs_blockgc_flush_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	trace_xfs_blockgc_flush_all(mp, __return_address);

	/*
	 * For each blockgc worker, move its queue time up to now.  If it
	 * wasn't queued, it will not be requeued.  Then flush whatever's
	 * left.
	 */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
				&pag->pag_blockgc_work, 0);

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		flush_delayed_work(&pag->pag_blockgc_work);

	return xfs_inodegc_flush(mp);
}

/*
 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
 * quota caused an allocation failure, so we make a best effort by including
 * each quota under low free space conditions (less than 1% free space) in the
 * scan.
 *
 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
 * MMAPLOCK.
 */
int
xfs_blockgc_free_dquots(
	struct xfs_mount	*mp,
	struct xfs_dquot	*udqp,
	struct xfs_dquot	*gdqp,
	struct xfs_dquot	*pdqp,
	unsigned int		iwalk_flags)
{
	struct xfs_icwalk	icw = {0};
	bool			do_work = false;

	if (!udqp && !gdqp && !pdqp)
		return 0;

	/*
	 * Run a scan to free blocks using the union filter to cover all
	 * applicable quotas in a single scan.
	 */
	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;

	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
		do_work = true;
	}

	if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
		do_work = true;
	}

	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
		icw.icw_prid = pdqp->q_id;
		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
		do_work = true;
	}

	if (!do_work)
		return 0;

	return xfs_blockgc_free_space(mp, &icw);
}

/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
	struct xfs_inode	*ip,
	unsigned int		iwalk_flags)
{
	return xfs_blockgc_free_dquots(ip->i_mount,
			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
}
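
/*
 * Sketch of the intended usage pattern (the actual retry loops live in the
 * transaction and writeback paths elsewhere in the tree): when an allocation
 * fails with ENOSPC or EDQUOT, retry once after pruning speculative
 * preallocations for the quotas attached to the inode.  "some_allocation"
 * is a hypothetical stand-in:
 *
 *	error = some_allocation(ip);
 *	if (error == -ENOSPC || error == -EDQUOT) {
 *		xfs_blockgc_free_quota(ip, 0);
 *		error = some_allocation(ip);	// retry once
 *	}
 */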

/* XFS Inode Cache Walking Code */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum.  The batch size is a trade off between
 * lookup reduction and stack usage.  This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32
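
/*
 * Concretely, the batch array in xfs_icwalk_ag() below is XFS_LOOKUP_BATCH
 * inode pointers on the stack: 32 * 8 = 256 bytes on a 64-bit build, small
 * enough to be safe in the memory reclaim path.
 */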

/*
 * Decide if we want to grab this inode in anticipation of doing work towards
 * the goal.
 */
static inline bool
xfs_icwalk_igrab(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		return xfs_blockgc_igrab(ip);
	case XFS_ICWALK_RECLAIM:
		return xfs_reclaim_igrab(ip, icw);
	default:
		return false;
	}
}

/*
 * Process an inode.  Each processing function must handle any state changes
 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
 */
static inline int
xfs_icwalk_process_inode(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	struct xfs_icwalk	*icw)
{
	int			error = 0;

	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		error = xfs_blockgc_scan_inode(ip, icw);
		break;
	case XFS_ICWALK_RECLAIM:
		xfs_reclaim_inode(ip, pag);
		break;
	}
	return error;
}

/*
 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
 * process them in some manner.
 */
static int
xfs_icwalk_ag(
	struct xfs_perag	*pag,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	if (goal == XFS_ICWALK_RECLAIM)
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
	else
		first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
				(void **) batch, first_index,
				XFS_LOOKUP_BATCH, goal);
		if (!nr_found) {
			done = true;
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock.  If we found
		 * nothing, nr_found == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_icwalk_igrab(goal, ip, icw))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup.  Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG.  It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
1692 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1693 continue;
1694 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1695 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1696 done = true;
1697 }
1698
1699 /* unlock now we've grabbed the inodes. */
1700 rcu_read_unlock();
1701
1702 for (i = 0; i < nr_found; i++) {
1703 if (!batch[i])
1704 continue;
1705 error = xfs_icwalk_process_inode(goal, ip: batch[i], pag,
1706 icw);
1707 if (error == -EAGAIN) {
1708 skipped++;
1709 continue;
1710 }
1711 if (error && last_error != -EFSCORRUPTED)
1712 last_error = error;
1713 }
1714
1715 /* bail out if the filesystem is corrupted. */
1716 if (error == -EFSCORRUPTED)
1717 break;
1718
1719 cond_resched();
1720
1721 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
1722 icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
1723 if (icw->icw_scan_limit <= 0)
1724 break;
1725 }
1726 } while (nr_found && !done);
1727
1728 if (goal == XFS_ICWALK_RECLAIM) {
1729 if (done)
1730 first_index = 0;
1731 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
1732 }
1733
1734 if (skipped) {
1735 delay(ticks: 1);
1736 goto restart;
1737 }
1738 return last_error;
1739}
1740
1741/* Walk all incore inodes to achieve a given goal. */
1742static int
1743xfs_icwalk(
1744 struct xfs_mount *mp,
1745 enum xfs_icwalk_goal goal,
1746 struct xfs_icwalk *icw)
1747{
1748 struct xfs_perag *pag;
1749 int error = 0;
1750 int last_error = 0;
1751 xfs_agnumber_t agno;
1752
1753 for_each_perag_tag(mp, agno, pag, goal) {
1754 error = xfs_icwalk_ag(pag, goal, icw);
1755 if (error) {
1756 last_error = error;
1757 if (error == -EFSCORRUPTED) {
1758 xfs_perag_rele(pag);
1759 break;
1760 }
1761 }
1762 }
1763 return last_error;
1764 BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1765}
1766
1767#ifdef DEBUG
1768static void
1769xfs_check_delalloc(
1770 struct xfs_inode *ip,
1771 int whichfork)
1772{
1773 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
1774 struct xfs_bmbt_irec got;
1775 struct xfs_iext_cursor icur;
1776
1777 if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
1778 return;
1779 do {
1780 if (isnullstartblock(got.br_startblock)) {
1781 xfs_warn(ip->i_mount,
1782 "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
1783 ip->i_ino,
1784 whichfork == XFS_DATA_FORK ? "data" : "cow",
1785 got.br_startoff, got.br_blockcount);
1786 }
1787 } while (xfs_iext_next_extent(ifp, &icur, &got));
1788}
1789#else
1790#define xfs_check_delalloc(ip, whichfork) do { } while (0)
1791#endif
1792
1793/* Schedule the inode for reclaim. */
1794static void
1795xfs_inodegc_set_reclaimable(
1796 struct xfs_inode *ip)
1797{
1798 struct xfs_mount *mp = ip->i_mount;
1799 struct xfs_perag *pag;
1800
1801 if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
1802 xfs_check_delalloc(ip, XFS_DATA_FORK);
1803 xfs_check_delalloc(ip, XFS_COW_FORK);
1804 ASSERT(0);
1805 }
1806
1807 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1808 spin_lock(lock: &pag->pag_ici_lock);
1809 spin_lock(lock: &ip->i_flags_lock);
1810
1811 trace_xfs_inode_set_reclaimable(ip);
1812 ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
1813 ip->i_flags |= XFS_IRECLAIMABLE;
1814 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
1815 XFS_ICI_RECLAIM_TAG);
1816
1817 spin_unlock(lock: &ip->i_flags_lock);
1818 spin_unlock(lock: &pag->pag_ici_lock);
1819 xfs_perag_put(pag);
1820}
1821
1822/*
1823 * Free all speculative preallocations and possibly even the inode itself.
1824 * This is the last chance to make changes to an otherwise unreferenced file
1825 * before incore reclamation happens.
1826 */
1827static int
1828xfs_inodegc_inactivate(
1829 struct xfs_inode *ip)
1830{
1831 int error;
1832
1833 trace_xfs_inode_inactivating(ip);
1834 error = xfs_inactive(ip);
1835 xfs_inodegc_set_reclaimable(ip);
1836 return error;
1837
1838}
1839
1840void
1841xfs_inodegc_worker(
1842 struct work_struct *work)
1843{
1844 struct xfs_inodegc *gc = container_of(to_delayed_work(work),
1845 struct xfs_inodegc, work);
1846 struct llist_node *node = llist_del_all(head: &gc->list);
1847 struct xfs_inode *ip, *n;
1848 struct xfs_mount *mp = gc->mp;
1849 unsigned int nofs_flag;
1850
1851 /*
1852 * Clear the cpu mask bit and ensure that we have seen the latest
1853 * update of the gc structure associated with this CPU. This matches
1854 * with the release semantics used when setting the cpumask bit in
1855 * xfs_inodegc_queue.
1856 */
1857 cpumask_clear_cpu(cpu: gc->cpu, dstp: &mp->m_inodegc_cpumask);
1858 smp_mb__after_atomic();
1859
1860 WRITE_ONCE(gc->items, 0);
1861
1862 if (!node)
1863 return;
1864
1865 /*
1866 * We can allocate memory here while doing writeback on behalf of
1867 * memory reclaim. To avoid memory allocation deadlocks set the
1868 * task-wide nofs context for the following operations.
1869 */
1870 nofs_flag = memalloc_nofs_save();
1871
1872 ip = llist_entry(node, struct xfs_inode, i_gclist);
1873 trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
1874
1875 WRITE_ONCE(gc->shrinker_hits, 0);
1876 llist_for_each_entry_safe(ip, n, node, i_gclist) {
1877 int error;
1878
1879 xfs_iflags_set(ip, XFS_INACTIVATING);
1880 error = xfs_inodegc_inactivate(ip);
1881 if (error && !gc->error)
1882 gc->error = error;
1883 }
1884
1885 memalloc_nofs_restore(flags: nofs_flag);
1886}
1887
1888/*
1889 * Expedite all pending inodegc work to run immediately. This does not wait for
1890 * completion of the work.
1891 */
1892void
1893xfs_inodegc_push(
1894 struct xfs_mount *mp)
1895{
1896 if (!xfs_is_inodegc_enabled(mp))
1897 return;
1898 trace_xfs_inodegc_push(mp, __return_address);
1899 xfs_inodegc_queue_all(mp);
1900}
1901
1902/*
1903 * Force all currently queued inode inactivation work to run immediately and
1904 * wait for the work to finish.
1905 */
1906int
1907xfs_inodegc_flush(
1908 struct xfs_mount *mp)
1909{
1910 xfs_inodegc_push(mp);
1911 trace_xfs_inodegc_flush(mp, __return_address);
1912 return xfs_inodegc_wait_all(mp);
1913}
1914
1915/*
1916 * Flush all the pending work and then disable the inode inactivation background
1917 * workers and wait for them to stop. Caller must hold sb->s_umount to
1918 * coordinate changes in the inodegc_enabled state.
1919 */
1920void
1921xfs_inodegc_stop(
1922 struct xfs_mount *mp)
1923{
1924 bool rerun;
1925
1926 if (!xfs_clear_inodegc_enabled(mp))
1927 return;
1928
1929 /*
1930 * Drain all pending inodegc work, including inodes that could be
1931 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
1932 * threads that sample the inodegc state just prior to us clearing it.
1933 * The inodegc flag state prevents new threads from queuing more
1934 * inodes, so we queue pending work items and flush the workqueue until
1935 * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
1936 * here because it does not allow other unserialized mechanisms to
1937 * reschedule inodegc work while this draining is in progress.
1938 */
1939 xfs_inodegc_queue_all(mp);
1940 do {
1941 flush_workqueue(mp->m_inodegc_wq);
1942 rerun = xfs_inodegc_queue_all(mp);
1943 } while (rerun);
1944
1945 trace_xfs_inodegc_stop(mp, __return_address);
1946}
1947
1948/*
1949 * Enable the inode inactivation background workers and schedule deferred inode
1950 * inactivation work if there is any. Caller must hold sb->s_umount to
1951 * coordinate changes in the inodegc_enabled state.
1952 */
1953void
1954xfs_inodegc_start(
1955 struct xfs_mount *mp)
1956{
1957 if (xfs_set_inodegc_enabled(mp))
1958 return;
1959
1960 trace_xfs_inodegc_start(mp, __return_address);
1961 xfs_inodegc_queue_all(mp);
1962}
1963
#ifdef CONFIG_XFS_RT
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (!XFS_IS_REALTIME_INODE(ip))
		return false;

	if (__percpu_counter_compare(&mp->m_frextents,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */

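/*
 * Illustrative note: __percpu_counter_compare(fbc, rhs, batch) behaves like a
 * three-way compare of the counter against rhs, returning <0, 0 or >0, and
 * only sums the percpu deviations when the fast path cannot decide within
 * "batch". A hypothetical low-space predicate in the same style:
 */
#if 0	/* example only, not built */
static bool
example_low_on_blocks(
	struct xfs_mount	*mp,
	uint64_t		threshold)
{
	return __percpu_counter_compare(&mp->m_fdblocks, threshold,
			XFS_FDBLOCKS_BATCH) < 0;
}
#endif
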
/*
 * Schedule the inactivation worker when:
 *
 * - We've accumulated more than one inode cluster buffer's worth of inodes.
 * - There is less than 5% free space left.
 * - Any of the quotas for this inode are near an enforcement limit.
 */
static inline bool
xfs_inodegc_want_queue_work(
	struct xfs_inode	*ip,
	unsigned int		items)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (items > mp->m_ino_geo.inodes_per_cluster)
		return true;

	if (__percpu_counter_compare(&mp->m_fdblocks,
				mp->m_low_space[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	if (xfs_inodegc_want_queue_rt_file(ip))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
		return true;

	return false;
}

/*
 * Upper bound on the number of inodes in each AG that can be queued for
 * inactivation at any given time, to avoid monopolizing the workqueue.
 */
#define XFS_INODEGC_MAX_BACKLOG	(4 * XFS_INODES_PER_CHUNK)

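/*
 * Sizing note (an observation, not from the original comment): with
 * XFS_INODES_PER_CHUNK fixed at 64, the backlog cap works out to 256 inodes
 * per percpu queue before the frontend starts throttling.
 */
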
/*
 * Make the frontend wait for inactivations when:
 *
 * - Memory shrinkers queued the inactivation worker and it hasn't finished.
 * - The queue depth exceeds the maximum allowable percpu backlog.
 *
 * Note: If the current thread is running a transaction, we don't ever want to
 * wait for other transactions because that could introduce a deadlock.
 */
static inline bool
xfs_inodegc_want_flush_work(
	struct xfs_inode	*ip,
	unsigned int		items,
	unsigned int		shrinker_hits)
{
	if (current->journal_info)
		return false;

	if (shrinker_hits > 0)
		return true;

	if (items > XFS_INODEGC_MAX_BACKLOG)
		return true;

	return false;
}

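/*
 * Illustrative note on the journal_info check above: XFS sets
 * current->journal_info while a transaction is attached to the task, so a
 * non-NULL value here means flushing could block a transaction holder behind
 * inactivation work that itself needs log space, which risks deadlocking on
 * log reservation. Hence transaction holders never wait.
 */
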
/*
 * Queue a background inactivation worker if there are inodes that need to be
 * inactivated and higher level xfs code hasn't disabled the background
 * workers.
 */
static void
xfs_inodegc_queue(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_inodegc	*gc;
	int			items;
	unsigned int		shrinker_hits;
	unsigned int		cpu_nr;
	unsigned long		queue_delay = 1;

	trace_xfs_inode_set_need_inactive(ip);
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_NEED_INACTIVE;
	spin_unlock(&ip->i_flags_lock);

	cpu_nr = get_cpu();
	gc = this_cpu_ptr(mp->m_inodegc);
	llist_add(&ip->i_gclist, &gc->list);
	items = READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, items + 1);
	shrinker_hits = READ_ONCE(gc->shrinker_hits);

	/*
	 * Ensure the list add is always seen by anyone who finds the cpumask
	 * bit set. This effectively gives the cpumask bit set operation
	 * release ordering semantics.
	 */
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);

	/*
	 * We queue the work while holding the current CPU so that the work
	 * is scheduled to run on this CPU.
	 */
	if (!xfs_is_inodegc_enabled(mp)) {
		put_cpu();
		return;
	}

	if (xfs_inodegc_want_queue_work(ip, items))
		queue_delay = 0;

	trace_xfs_inodegc_queue(mp, __return_address);
	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
			queue_delay);
	put_cpu();

	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
		trace_xfs_inodegc_throttle(mp, __return_address);
		flush_delayed_work(&gc->work);
	}
}

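/*
 * Illustrative note on the ordering in xfs_inodegc_queue above, sketching how
 * the producer pairs with the worker (which clears the cpumask bit and issues
 * smp_mb__after_atomic() before draining the list):
 *
 *	queueing CPU				worker
 *	------------				------
 *	llist_add(&ip->i_gclist, ...)
 *	smp_mb__before_atomic()
 *	cpumask_test_and_set_cpu()  --->	cpumask_clear_cpu()
 *						smp_mb__after_atomic()
 *						llist_del_all() sees the add
 */
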
/*
 * We set the inode flag atomically with the radix tree tag. Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean,
 * it still may be under IO and hence we have to wait for IO completion to
 * occur before we can reclaim the inode. The background reclaim path handles
 * this more efficiently than we can here, so simply let background reclaim
 * tear down all inodes.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			need_inactive;

	XFS_STATS_INC(mp, vn_reclaim);

	/*
	 * We should never get here with any of the reclaim flags already set.
	 */
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

	need_inactive = xfs_inode_needs_inactive(ip);
	if (need_inactive) {
		xfs_inodegc_queue(ip);
		return;
	}

	/* Going straight to reclaim, so drop the dquots. */
	xfs_qm_dqdetach(ip);
	xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure. Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning. Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)

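/*
 * Worked example (based on current mm heuristics, which may change): with
 * DEF_PRIORITY = 12, COUNT is 4096 and BATCH is 2049. For a shrinker with
 * seeks == 0, do_shrink_slab() adds delta = freeable / 2 = 2048 per pass, so
 * the first pass accumulates 2048 < 2049 and defers the work, and the second
 * pass reaches 4096 >= 2049 and finally calls ->scan_objects(). That is the
 * "second round of scanning" referred to above.
 */
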
static unsigned long
xfs_inodegc_shrinker_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return 0;

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			return XFS_INODEGC_SHRINKER_COUNT;
	}

	return 0;
}

static unsigned long
xfs_inodegc_shrinker_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			no_items = true;

	if (!xfs_is_inodegc_enabled(mp))
		return SHRINK_STOP;

	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			unsigned int	h = READ_ONCE(gc->shrinker_hits);

			WRITE_ONCE(gc->shrinker_hits, h + 1);
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
			no_items = false;
		}
	}

	/*
	 * If there are no inodes to inactivate, we don't want the shrinker
	 * to think there's deferred work to call us back about.
	 */
	if (no_items)
		return LONG_MAX;

	return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
	struct xfs_mount	*mp)
{
	mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
						"xfs-inodegc:%s",
						mp->m_super->s_id);
	if (!mp->m_inodegc_shrinker)
		return -ENOMEM;

	mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
	mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
	mp->m_inodegc_shrinker->seeks = 0;
	mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
	mp->m_inodegc_shrinker->private_data = mp;

	shrinker_register(mp->m_inodegc_shrinker);

	return 0;
}
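
/*
 * Illustrative sketch, not part of this file: the matching teardown on the
 * unmount side would release the shrinker with shrinker_free(), which
 * unregisters it and frees the allocation. The example_* name is
 * hypothetical.
 */
#if 0	/* example only, not built */
static void
example_inodegc_unregister_shrinker(
	struct xfs_mount	*mp)
{
	shrinker_free(mp->m_inodegc_shrinker);
	mp->m_inodegc_shrinker = NULL;
}
#endif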