xfs_inode.c source code [linux/fs/xfs/xfs_inode.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
4	* All Rights Reserved.
5	*/
6	#include <linux/iversion.h>
7
8	#include "xfs.h"
9	#include "xfs_fs.h"
10	#include "xfs_shared.h"
11	#include "xfs_format.h"
12	#include "xfs_log_format.h"
13	#include "xfs_trans_resv.h"
14	#include "xfs_mount.h"
15	#include "xfs_defer.h"
16	#include "xfs_inode.h"
17	#include "xfs_dir2.h"
18	#include "xfs_attr.h"
19	#include "xfs_trans_space.h"
20	#include "xfs_trans.h"
21	#include "xfs_buf_item.h"
22	#include "xfs_inode_item.h"
23	#include "xfs_iunlink_item.h"
24	#include "xfs_ialloc.h"
25	#include "xfs_bmap.h"
26	#include "xfs_bmap_util.h"
27	#include "xfs_errortag.h"
28	#include "xfs_error.h"
29	#include "xfs_quota.h"
30	#include "xfs_filestream.h"
31	#include "xfs_trace.h"
32	#include "xfs_icache.h"
33	#include "xfs_symlink.h"
34	#include "xfs_trans_priv.h"
35	#include "xfs_log.h"
36	#include "xfs_bmap_btree.h"
37	#include "xfs_reflink.h"
38	#include "xfs_ag.h"
39	#include "xfs_log_priv.h"
40
41	struct kmem_cache *xfs_inode_cache;
42
43	/*
44	* Used in xfs_itruncate_extents(). This is the maximum number of extents
45	* freed from a file in a single transaction.
46	*/
47	#define XFS_ITRUNC_MAX_EXTENTS 2
48
49	STATIC int xfs_iunlink(struct xfs_trans , struct* xfs_inode *);
50	STATIC int xfs_iunlink_remove(struct xfs_trans tp, struct* xfs_perag *pag,
51	struct xfs_inode *);
52
53	/*
54	* helper function to extract extent size hint from inode
55	*/
56	xfs_extlen_t
57	xfs_get_extsz_hint(
58	struct xfs_inode *ip)
59	{
60	/*
61	* No point in aligning allocations if we need to COW to actually
62	* write to them.
63	*/
64	if (xfs_is_always_cow_inode(ip))
65	return `0`;
66	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
67	return ip->i_extsize;
68	if (XFS_IS_REALTIME_INODE(ip))
69	return ip->i_mount->m_sb.sb_rextsize;
70	return `0`;
71	}
72
73	/*
74	* Helper function to extract CoW extent size hint from inode.
75	* Between the extent size hint and the CoW extent size hint, we
76	* return the greater of the two. If the value is zero (automatic),
77	* use the default size.
78	*/
79	xfs_extlen_t
80	xfs_get_cowextsz_hint(
81	struct xfs_inode *ip)
82	{
83	xfs_extlen_t a, b;
84
85	a = `0`;
86	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
87	a = ip->i_cowextsize;
88	b = xfs_get_extsz_hint(ip);
89
90	a = max(a, b);
91	if (a == `0`)
92	return XFS_DEFAULT_COWEXTSZ_HINT;
93	return a;
94	}
95
96	/*
97	* These two are wrapper routines around the xfs_ilock() routine used to
98	* centralize some grungy code. They are used in places that wish to lock the
99	* inode solely for reading the extents. The reason these places can't just
100	* call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
101	* bringing in of the extents from disk for a file in b-tree format. If the
102	* inode is in b-tree format, then we need to lock the inode exclusively until
103	* the extents are read in. Locking it exclusively all the time would limit
104	* our parallelism unnecessarily, though. What we do instead is check to see
105	* if the extents have been read in yet, and only lock the inode exclusively
106	* if they have not.
107	*
108	* The functions return a value which should be given to the corresponding
109	* xfs_iunlock() call.
110	*/
111	uint
112	xfs_ilock_data_map_shared(
113	struct xfs_inode *ip)
114	{
115	uint lock_mode = XFS_ILOCK_SHARED;
116
117	if (xfs_need_iread_extents(&ip->i_df))
118	lock_mode = XFS_ILOCK_EXCL;
119	xfs_ilock(ip, lock_mode);
120	return lock_mode;
121	}
122
123	uint
124	xfs_ilock_attr_map_shared(
125	struct xfs_inode *ip)
126	{
127	uint lock_mode = XFS_ILOCK_SHARED;
128
129	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
130	lock_mode = XFS_ILOCK_EXCL;
131	xfs_ilock(ip, lock_mode);
132	return lock_mode;
133	}
134
135	/*
136	* You can't set both SHARED and EXCL for the same lock,
137	* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
138	* XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
139	* to set in lock_flags.
140	*/
141	static inline void
142	xfs_lock_flags_assert(
143	uint lock_flags)
144	{
145	ASSERT((lock_flags & (XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL)) !=
146	(XFS_IOLOCK_SHARED \| XFS_IOLOCK_EXCL));
147	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL)) !=
148	(XFS_MMAPLOCK_SHARED \| XFS_MMAPLOCK_EXCL));
149	ASSERT((lock_flags & (XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL)) !=
150	(XFS_ILOCK_SHARED \| XFS_ILOCK_EXCL));
151	ASSERT((lock_flags & ~(XFS_LOCK_MASK \| XFS_LOCK_SUBCLASS_MASK)) == `0`);
152	ASSERT(lock_flags != `0`);
153	}
154
155	/*
156	* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
157	* multi-reader locks: invalidate_lock and the i_lock. This routine allows
158	* various combinations of the locks to be obtained.
159	*
160	* The 3 locks should always be ordered so that the IO lock is obtained first,
161	* the mmap lock second and the ilock last in order to prevent deadlock.
162	*
163	* Basic locking order:
164	*
165	* i_rwsem -> invalidate_lock -> page_lock -> i_ilock
166	*
167	* mmap_lock locking order:
168	*
169	* i_rwsem -> page lock -> mmap_lock
170	* mmap_lock -> invalidate_lock -> page_lock
171	*
172	* The difference in mmap_lock locking order mean that we cannot hold the
173	* invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
174	* can fault in pages during copy in/out (for buffered IO) or require the
175	* mmap_lock in get_user_pages() to map the user pages into the kernel address
176	* space for direct IO. Similarly the i_rwsem cannot be taken inside a page
177	* fault because page faults already hold the mmap_lock.
178	*
179	* Hence to serialise fully against both syscall and mmap based IO, we need to
180	* take both the i_rwsem and the invalidate_lock. These locks should only be
181	* both taken in places where we need to invalidate the page cache in a race
182	* free manner (e.g. truncate, hole punch and other extent manipulation
183	* functions).
184	*/
185	void
186	xfs_ilock(
187	xfs_inode_t *ip,
188	uint lock_flags)
189	{
190	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
191
192	xfs_lock_flags_assert(lock_flags);
193
194	if (lock_flags & XFS_IOLOCK_EXCL) {
195	down_write_nested(sem: &VFS_I(ip)->i_rwsem,
196	XFS_IOLOCK_DEP(lock_flags));
197	} else if (lock_flags & XFS_IOLOCK_SHARED) {
198	down_read_nested(sem: &VFS_I(ip)->i_rwsem,
199	XFS_IOLOCK_DEP(lock_flags));
200	}
201
202	if (lock_flags & XFS_MMAPLOCK_EXCL) {
203	down_write_nested(sem: &VFS_I(ip)->i_mapping->invalidate_lock,
204	XFS_MMAPLOCK_DEP(lock_flags));
205	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
206	down_read_nested(sem: &VFS_I(ip)->i_mapping->invalidate_lock,
207	XFS_MMAPLOCK_DEP(lock_flags));
208	}
209
210	if (lock_flags & XFS_ILOCK_EXCL)
211	mrupdate_nested(mrp: &ip->i_lock, XFS_ILOCK_DEP(lock_flags));
212	else if (lock_flags & XFS_ILOCK_SHARED)
213	mraccess_nested(mrp: &ip->i_lock, XFS_ILOCK_DEP(lock_flags));
214	}
215
216	/*
217	* This is just like xfs_ilock(), except that the caller
218	* is guaranteed not to sleep. It returns 1 if it gets
219	* the requested locks and 0 otherwise. If the IO lock is
220	* obtained but the inode lock cannot be, then the IO lock
221	* is dropped before returning.
222	*
223	* ip -- the inode being locked
224	* lock_flags -- this parameter indicates the inode's locks to be
225	* to be locked. See the comment for xfs_ilock() for a list
226	* of valid values.
227	*/
228	int
229	xfs_ilock_nowait(
230	xfs_inode_t *ip,
231	uint lock_flags)
232	{
233	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
234
235	xfs_lock_flags_assert(lock_flags);
236
237	if (lock_flags & XFS_IOLOCK_EXCL) {
238	if (!down_write_trylock(sem: &VFS_I(ip)->i_rwsem))
239	goto out;
240	} else if (lock_flags & XFS_IOLOCK_SHARED) {
241	if (!down_read_trylock(sem: &VFS_I(ip)->i_rwsem))
242	goto out;
243	}
244
245	if (lock_flags & XFS_MMAPLOCK_EXCL) {
246	if (!down_write_trylock(sem: &VFS_I(ip)->i_mapping->invalidate_lock))
247	goto out_undo_iolock;
248	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
249	if (!down_read_trylock(sem: &VFS_I(ip)->i_mapping->invalidate_lock))
250	goto out_undo_iolock;
251	}
252
253	if (lock_flags & XFS_ILOCK_EXCL) {
254	if (!mrtryupdate(mrp: &ip->i_lock))
255	goto out_undo_mmaplock;
256	} else if (lock_flags & XFS_ILOCK_SHARED) {
257	if (!mrtryaccess(mrp: &ip->i_lock))
258	goto out_undo_mmaplock;
259	}
260	return `1`;
261
262	out_undo_mmaplock:
263	if (lock_flags & XFS_MMAPLOCK_EXCL)
264	up_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock);
265	else if (lock_flags & XFS_MMAPLOCK_SHARED)
266	up_read(sem: &VFS_I(ip)->i_mapping->invalidate_lock);
267	out_undo_iolock:
268	if (lock_flags & XFS_IOLOCK_EXCL)
269	up_write(sem: &VFS_I(ip)->i_rwsem);
270	else if (lock_flags & XFS_IOLOCK_SHARED)
271	up_read(sem: &VFS_I(ip)->i_rwsem);
272	out:
273	return `0`;
274	}
275
276	/*
277	* xfs_iunlock() is used to drop the inode locks acquired with
278	* xfs_ilock() and xfs_ilock_nowait(). The caller must pass
279	* in the flags given to xfs_ilock() or xfs_ilock_nowait() so
280	* that we know which locks to drop.
281	*
282	* ip -- the inode being unlocked
283	* lock_flags -- this parameter indicates the inode's locks to be
284	* to be unlocked. See the comment for xfs_ilock() for a list
285	* of valid values for this parameter.
286	*
287	*/
288	void
289	xfs_iunlock(
290	xfs_inode_t *ip,
291	uint lock_flags)
292	{
293	xfs_lock_flags_assert(lock_flags);
294
295	if (lock_flags & XFS_IOLOCK_EXCL)
296	up_write(sem: &VFS_I(ip)->i_rwsem);
297	else if (lock_flags & XFS_IOLOCK_SHARED)
298	up_read(sem: &VFS_I(ip)->i_rwsem);
299
300	if (lock_flags & XFS_MMAPLOCK_EXCL)
301	up_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock);
302	else if (lock_flags & XFS_MMAPLOCK_SHARED)
303	up_read(sem: &VFS_I(ip)->i_mapping->invalidate_lock);
304
305	if (lock_flags & XFS_ILOCK_EXCL)
306	mrunlock_excl(mrp: &ip->i_lock);
307	else if (lock_flags & XFS_ILOCK_SHARED)
308	mrunlock_shared(mrp: &ip->i_lock);
309
310	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
311	}
312
313	/*
314	* give up write locks. the i/o lock cannot be held nested
315	* if it is being demoted.
316	*/
317	void
318	xfs_ilock_demote(
319	xfs_inode_t *ip,
320	uint lock_flags)
321	{
322	ASSERT(lock_flags & (XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL));
323	ASSERT((lock_flags &
324	~(XFS_IOLOCK_EXCL\|XFS_MMAPLOCK_EXCL\|XFS_ILOCK_EXCL)) == `0`);
325
326	if (lock_flags & XFS_ILOCK_EXCL)
327	mrdemote(mrp: &ip->i_lock);
328	if (lock_flags & XFS_MMAPLOCK_EXCL)
329	downgrade_write(sem: &VFS_I(ip)->i_mapping->invalidate_lock);
330	if (lock_flags & XFS_IOLOCK_EXCL)
331	downgrade_write(sem: &VFS_I(ip)->i_rwsem);
332
333	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
334	}
335
#if defined(DEBUG) || defined(XFS_WARN)
/*
 * Report whether @rwsem is held.  When lock debugging is off (debug_locks
 * is clear) all we can ask is whether anyone holds it.  Otherwise lockdep
 * is queried: read type 0 for the exclusive check, -1 ("any") for the
 * shared check.
 */
static inline bool
__xfs_rwsem_islocked(
	struct rw_semaphore	*rwsem,
	bool			shared)
{
	if (!debug_locks)
		return rwsem_is_locked(rwsem);

	if (!shared)
		return lockdep_is_held_type(rwsem, 0);

	/*
	 * We are checking that the lock is held at least in shared
	 * mode but don't care that it might be held exclusively
	 * (i.e. shared | excl).  Hence we check if the lock is held
	 * in any mode rather than an explicit shared mode.
	 */
	return lockdep_is_held_type(rwsem, -1);
}

/*
 * Debug helper: is the lock named by lock_flags held on @ip?
 *
 * Only one lock class is examined per call, in the order ilock, mmaplock,
 * iolock; the first class present in lock_flags decides the result.  If the
 * SHARED bit of that class is set the check is "held in any mode",
 * otherwise it is "held exclusively".  Passing no recognised lock bit trips
 * an ASSERT and returns false.
 */
bool
xfs_isilocked(
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
		if (!(lock_flags & XFS_ILOCK_SHARED))
			return !!ip->i_lock.mr_writer;
		return rwsem_is_locked(&ip->i_lock.mr_lock);
	}

	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
		return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
				(lock_flags & XFS_MMAPLOCK_SHARED));
	}

	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
		return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
				(lock_flags & XFS_IOLOCK_SHARED));
	}

	ASSERT(0);
	return false;
}
#endif
382
/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set.  And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set.  Hence the complex define below to avoid build
 * errors and warnings.
 *
 * Returns true when @subclass is below MAX_LOCKDEP_SUBCLASSES; in builds
 * without the debug/lockdep options it is a macro that is always true.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif
399
400	/*
401	* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
402	* value. This can be called for any type of inode lock combination, including
403	* parent locking. Care must be taken to ensure we don't overrun the subclass
404	* storage fields in the class mask we build.
405	*/
406	static inline uint
407	xfs_lock_inumorder(
408	uint lock_mode,
409	uint subclass)
410	{
411	uint class = `0`;
412
413	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT \| XFS_ILOCK_RTBITMAP \|
414	XFS_ILOCK_RTSUM)));
415	ASSERT(xfs_lockdep_subclass_ok(subclass));
416
417	if (lock_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)) {
418	ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
419	class += subclass << XFS_IOLOCK_SHIFT;
420	}
421
422	if (lock_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)) {
423	ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
424	class += subclass << XFS_MMAPLOCK_SHIFT;
425	}
426
427	if (lock_mode & (XFS_ILOCK_SHARED\|XFS_ILOCK_EXCL)) {
428	ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
429	class += subclass << XFS_ILOCK_SHIFT;
430	}
431
432	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) \| class;
433	}
434
435	/*
436	* The following routine will lock n inodes in exclusive mode. We assume the
437	* caller calls us with the inodes in i_ino order.
438	*
439	* We need to detect deadlock where an inode that we lock is in the AIL and we
440	* start waiting for another inode that is locked by a thread in a long running
441	* transaction (such as truncate). This can result in deadlock since the long
442	* running trans might need to wait for the inode we just locked in order to
443	* push the tail and free space in the log.
444	*
445	* xfs_lock_inodes() can only be used to lock one type of lock at a time -
446	* the iolock, the mmaplock or the ilock, but not more than one at a time. If we
447	* lock more than one at a time, lockdep will report false positives saying we
448	* have violated locking orders.
449	*/
450	static void
451	xfs_lock_inodes(
452	struct xfs_inode **ips,
453	int inodes,
454	uint lock_mode)
455	{
456	int attempts = `0`;
457	uint i;
458	int j;
459	bool try_lock;
460	struct xfs_log_item *lp;
461
462	/*
463	* Currently supports between 2 and 5 inodes with exclusive locking. We
464	* support an arbitrary depth of locking here, but absolute limits on
465	* inodes depend on the type of locking and the limits placed by
466	* lockdep annotations in xfs_lock_inumorder. These are all checked by
467	* the asserts.
468	*/
469	ASSERT(ips && inodes >= `2` && inodes <= `5`);
470	ASSERT(lock_mode & (XFS_IOLOCK_EXCL \| XFS_MMAPLOCK_EXCL \|
471	XFS_ILOCK_EXCL));
472	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED \| XFS_MMAPLOCK_SHARED \|
473	XFS_ILOCK_SHARED)));
474	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) \|\|
475	inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + `1`);
476	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) \|\|
477	inodes <= XFS_ILOCK_MAX_SUBCLASS + `1`);
478
479	if (lock_mode & XFS_IOLOCK_EXCL) {
480	ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL \| XFS_ILOCK_EXCL)));
481	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
482	ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
483
484	again:
485	try_lock = false;
486	i = `0`;
487	for (; i < inodes; i++) {
488	ASSERT(ips[i]);
489
490	if (i && (ips[i] == ips[i - `1`])) / Already locked /
491	continue;
492
493	/*
494	* If try_lock is not set yet, make sure all locked inodes are
495	* not in the AIL. If any are, set try_lock to be used later.
496	*/
497	if (!try_lock) {
498	for (j = (i - `1`); j >= `0` && !try_lock; j--) {
499	lp = &ips[j]->i_itemp->ili_item;
500	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
501	try_lock = true;
502	}
503	}
504
505	/*
506	* If any of the previous locks we have locked is in the AIL,
507	* we must TRY to get the second and subsequent locks. If
508	* we can't get any, we must release all we have
509	* and try again.
510	*/
511	if (!try_lock) {
512	xfs_ilock(ip: ips[i], lock_flags: xfs_lock_inumorder(lock_mode, subclass: i));
513	continue;
514	}
515
516	/ try_lock means we have an inode locked that is in the AIL. /
517	ASSERT(i != `0`);
518	if (xfs_ilock_nowait(ip: ips[i], lock_flags: xfs_lock_inumorder(lock_mode, subclass: i)))
519	continue;
520
521	/*
522	* Unlock all previous guys and try again. xfs_iunlock will try
523	* to push the tail if the inode is in the AIL.
524	*/
525	attempts++;
526	for (j = i - `1`; j >= `0`; j--) {
527	/*
528	* Check to see if we've already unlocked this one. Not
529	* the first one going back, and the inode ptr is the
530	* same.
531	*/
532	if (j != (i - `1`) && ips[j] == ips[j + `1`])
533	continue;
534
535	xfs_iunlock(ip: ips[j], lock_flags: lock_mode);
536	}
537
538	if ((attempts % `5`) == `0`) {
539	delay(ticks: `1`); / Don't just spin the CPU /
540	}
541	goto again;
542	}
543	}
544
545	/*
546	* xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
547	* mmaplock must be double-locked separately since we use i_rwsem and
548	* invalidate_lock for that. We now support taking one lock EXCL and the
549	* other SHARED.
550	*/
551	void
552	xfs_lock_two_inodes(
553	struct xfs_inode *ip0,
554	uint ip0_mode,
555	struct xfs_inode *ip1,
556	uint ip1_mode)
557	{
558	int attempts = `0`;
559	struct xfs_log_item *lp;
560
561	ASSERT(hweight32(ip0_mode) == `1`);
562	ASSERT(hweight32(ip1_mode) == `1`);
563	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)));
564	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED\|XFS_IOLOCK_EXCL)));
565	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)));
566	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED\|XFS_MMAPLOCK_EXCL)));
567	ASSERT(ip0->i_ino != ip1->i_ino);
568
569	if (ip0->i_ino > ip1->i_ino) {
570	swap(ip0, ip1);
571	swap(ip0_mode, ip1_mode);
572	}
573
574	again:
575	xfs_ilock(ip: ip0, lock_flags: xfs_lock_inumorder(lock_mode: ip0_mode, subclass: `0`));
576
577	/*
578	* If the first lock we have locked is in the AIL, we must TRY to get
579	* the second lock. If we can't get it, we must release the first one
580	* and try again.
581	*/
582	lp = &ip0->i_itemp->ili_item;
583	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
584	if (!xfs_ilock_nowait(ip: ip1, lock_flags: xfs_lock_inumorder(lock_mode: ip1_mode, subclass: `1`))) {
585	xfs_iunlock(ip: ip0, lock_flags: ip0_mode);
586	if ((++attempts % `5`) == `0`)
587	delay(ticks: `1`); / Don't just spin the CPU /
588	goto again;
589	}
590	} else {
591	xfs_ilock(ip: ip1, lock_flags: xfs_lock_inumorder(lock_mode: ip1_mode, subclass: `1`));
592	}
593	}
594
595	uint
596	xfs_ip2xflags(
597	struct xfs_inode *ip)
598	{
599	uint flags = `0`;
600
601	if (ip->i_diflags & XFS_DIFLAG_ANY) {
602	if (ip->i_diflags & XFS_DIFLAG_REALTIME)
603	flags \|= FS_XFLAG_REALTIME;
604	if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
605	flags \|= FS_XFLAG_PREALLOC;
606	if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
607	flags \|= FS_XFLAG_IMMUTABLE;
608	if (ip->i_diflags & XFS_DIFLAG_APPEND)
609	flags \|= FS_XFLAG_APPEND;
610	if (ip->i_diflags & XFS_DIFLAG_SYNC)
611	flags \|= FS_XFLAG_SYNC;
612	if (ip->i_diflags & XFS_DIFLAG_NOATIME)
613	flags \|= FS_XFLAG_NOATIME;
614	if (ip->i_diflags & XFS_DIFLAG_NODUMP)
615	flags \|= FS_XFLAG_NODUMP;
616	if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
617	flags \|= FS_XFLAG_RTINHERIT;
618	if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
619	flags \|= FS_XFLAG_PROJINHERIT;
620	if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
621	flags \|= FS_XFLAG_NOSYMLINKS;
622	if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
623	flags \|= FS_XFLAG_EXTSIZE;
624	if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
625	flags \|= FS_XFLAG_EXTSZINHERIT;
626	if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
627	flags \|= FS_XFLAG_NODEFRAG;
628	if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
629	flags \|= FS_XFLAG_FILESTREAM;
630	}
631
632	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
633	if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
634	flags \|= FS_XFLAG_DAX;
635	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
636	flags \|= FS_XFLAG_COWEXTSIZE;
637	}
638
639	if (xfs_inode_has_attr_fork(ip))
640	flags \|= FS_XFLAG_HASATTR;
641	return flags;
642	}
643
644	/*
645	* Lookups up an inode from "name". If ci_name is not NULL, then a CI match
646	* is allowed, otherwise it has to be an exact match. If a CI match is found,
647	* ci_name->name will point to a the actual name (caller must free) or
648	* will be set to NULL if an exact match is found.
649	*/
650	int
651	xfs_lookup(
652	struct xfs_inode *dp,
653	const struct xfs_name *name,
654	struct xfs_inode **ipp,
655	struct xfs_name *ci_name)
656	{
657	xfs_ino_t inum;
658	int error;
659
660	trace_xfs_lookup(dp, xfs_lookup: name);
661
662	if (xfs_is_shutdown(mp: dp->i_mount))
663	return -EIO;
664
665	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
666	if (error)
667	goto out_unlock;
668
669	error = xfs_iget(mp: dp->i_mount, NULL, ino: inum, flags: `0`, lock_flags: `0`, ipp);
670	if (error)
671	goto out_free_name;
672
673	return `0`;
674
675	out_free_name:
676	if (ci_name)
677	kmem_free(ptr: ci_name->name);
678	out_unlock:
679	*ipp = NULL;
680	return error;
681	}
682
683	/ Propagate di_flags from a parent inode to a child inode. /
684	static void
685	xfs_inode_inherit_flags(
686	struct xfs_inode *ip,
687	const struct xfs_inode *pip)
688	{
689	unsigned int di_flags = `0`;
690	xfs_failaddr_t failaddr;
691	umode_t mode = VFS_I(ip)->i_mode;
692
693	if (S_ISDIR(mode)) {
694	if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
695	di_flags \|= XFS_DIFLAG_RTINHERIT;
696	if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
697	di_flags \|= XFS_DIFLAG_EXTSZINHERIT;
698	ip->i_extsize = pip->i_extsize;
699	}
700	if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
701	di_flags \|= XFS_DIFLAG_PROJINHERIT;
702	} else if (S_ISREG(mode)) {
703	if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
704	xfs_has_realtime(ip->i_mount))
705	di_flags \|= XFS_DIFLAG_REALTIME;
706	if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
707	di_flags \|= XFS_DIFLAG_EXTSIZE;
708	ip->i_extsize = pip->i_extsize;
709	}
710	}
711	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
712	xfs_inherit_noatime)
713	di_flags \|= XFS_DIFLAG_NOATIME;
714	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
715	xfs_inherit_nodump)
716	di_flags \|= XFS_DIFLAG_NODUMP;
717	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
718	xfs_inherit_sync)
719	di_flags \|= XFS_DIFLAG_SYNC;
720	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
721	xfs_inherit_nosymlinks)
722	di_flags \|= XFS_DIFLAG_NOSYMLINKS;
723	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
724	xfs_inherit_nodefrag)
725	di_flags \|= XFS_DIFLAG_NODEFRAG;
726	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
727	di_flags \|= XFS_DIFLAG_FILESTREAM;
728
729	ip->i_diflags \|= di_flags;
730
731	/*
732	* Inode verifiers on older kernels only check that the extent size
733	* hint is an integer multiple of the rt extent size on realtime files.
734	* They did not check the hint alignment on a directory with both
735	* rtinherit and extszinherit flags set. If the misaligned hint is
736	* propagated from a directory into a new realtime file, new file
737	* allocations will fail due to math errors in the rt allocator and/or
738	* trip the verifiers. Validate the hint settings in the new file so
739	* that we don't let broken hints propagate.
740	*/
741	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
742	VFS_I(ip)->i_mode, ip->i_diflags);
743	if (failaddr) {
744	ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE \|
745	XFS_DIFLAG_EXTSZINHERIT);
746	ip->i_extsize = `0`;
747	}
748	}
749
750	/ Propagate di_flags2 from a parent inode to a child inode. /
751	static void
752	xfs_inode_inherit_flags2(
753	struct xfs_inode *ip,
754	const struct xfs_inode *pip)
755	{
756	xfs_failaddr_t failaddr;
757
758	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
759	ip->i_diflags2 \|= XFS_DIFLAG2_COWEXTSIZE;
760	ip->i_cowextsize = pip->i_cowextsize;
761	}
762	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
763	ip->i_diflags2 \|= XFS_DIFLAG2_DAX;
764
765	/ Don't let invalid cowextsize hints propagate. /
766	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
767	VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
768	if (failaddr) {
769	ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
770	ip->i_cowextsize = `0`;
771	}
772	}
773
774	/*
775	* Initialise a newly allocated inode and return the in-core inode to the
776	* caller locked exclusively.
777	*/
778	int
779	xfs_init_new_inode(
780	struct mnt_idmap *idmap,
781	struct xfs_trans *tp,
782	struct xfs_inode *pip,
783	xfs_ino_t ino,
784	umode_t mode,
785	xfs_nlink_t nlink,
786	dev_t rdev,
787	prid_t prid,
788	bool init_xattrs,
789	struct xfs_inode **ipp)
790	{
791	struct inode *dir = pip ? VFS_I(ip: pip) : NULL;
792	struct xfs_mount *mp = tp->t_mountp;
793	struct xfs_inode *ip;
794	unsigned int flags;
795	int error;
796	struct timespec64 tv;
797	struct inode *inode;
798
799	/*
800	* Protect against obviously corrupt allocation btree records. Later
801	* xfs_iget checks will catch re-allocation of other active in-memory
802	* and on-disk inodes. If we don't catch reallocating the parent inode
803	* here we will deadlock in xfs_iget() so we have to do these checks
804	* first.
805	*/
806	if ((pip && ino == pip->i_ino) \|\| !xfs_verify_dir_ino(mp, ino)) {
807	xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
808	return -EFSCORRUPTED;
809	}
810
811	/*
812	* Get the in-core inode with the lock held exclusively to prevent
813	* others from looking at until we're done.
814	*/
815	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, ipp: &ip);
816	if (error)
817	return error;
818
819	ASSERT(ip != NULL);
820	inode = VFS_I(ip);
821	set_nlink(inode, nlink);
822	inode->i_rdev = rdev;
823	ip->i_projid = prid;
824
825	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
826	inode_fsuid_set(inode, idmap);
827	inode->i_gid = dir->i_gid;
828	inode->i_mode = mode;
829	} else {
830	inode_init_owner(idmap, inode, dir, mode);
831	}
832
833	/*
834	* If the group ID of the new file does not match the effective group
835	* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
836	* (and only if the irix_sgid_inherit compatibility variable is set).
837	*/
838	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
839	!vfsgid_in_group_p(vfsgid: i_gid_into_vfsgid(idmap, inode)))
840	inode->i_mode &= ~S_ISGID;
841
842	ip->i_disk_size = `0`;
843	ip->i_df.if_nextents = `0`;
844	ASSERT(ip->i_nblocks == `0`);
845
846	tv = inode_set_ctime_current(inode);
847	inode_set_mtime_to_ts(inode, ts: tv);
848	inode_set_atime_to_ts(inode, ts: tv);
849
850	ip->i_extsize = `0`;
851	ip->i_diflags = `0`;
852
853	if (xfs_has_v3inodes(mp)) {
854	inode_set_iversion(inode, val: `1`);
855	ip->i_cowextsize = `0`;
856	ip->i_crtime = tv;
857	}
858
859	flags = XFS_ILOG_CORE;
860	switch (mode & S_IFMT) {
861	case S_IFIFO:
862	case S_IFCHR:
863	case S_IFBLK:
864	case S_IFSOCK:
865	ip->i_df.if_format = XFS_DINODE_FMT_DEV;
866	flags \|= XFS_ILOG_DEV;
867	break;
868	case S_IFREG:
869	case S_IFDIR:
870	if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
871	xfs_inode_inherit_flags(ip, pip);
872	if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
873	xfs_inode_inherit_flags2(ip, pip);
874	fallthrough;
875	case S_IFLNK:
876	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
877	ip->i_df.if_bytes = `0`;
878	ip->i_df.if_u1.if_root = NULL;
879	break;
880	default:
881	ASSERT(`0`);
882	}
883
884	/*
885	* If we need to create attributes immediately after allocating the
886	* inode, initialise an empty attribute fork right now. We use the
887	* default fork offset for attributes here as we don't know exactly what
888	* size or how many attributes we might be adding. We can do this
889	* safely here because we know the data fork is completely empty and
890	* this saves us from needing to run a separate transaction to set the
891	* fork offset in the immediate future.
892	*/
893	if (init_xattrs && xfs_has_attr(mp)) {
894	ip->i_forkoff = xfs_default_attroffset(ip) >> `3`;
895	xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, `0`);
896	}
897
898	/*
899	* Log the new values stuffed into the inode.
900	*/
901	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
902	xfs_trans_log_inode(tp, ip, flags);
903
904	/ now that we have an i_mode we can setup the inode structure /
905	xfs_setup_inode(ip);
906
907	*ipp = ip;
908	return `0`;
909	}
910
911	/*
912	* Decrement the link count on an inode & log the change. If this causes the
913	* link count to go to zero, move the inode to AGI unlinked list so that it can
914	* be freed when the last active reference goes away via xfs_inactive().
915	*/
916	static int / error /
917	xfs_droplink(
918	xfs_trans_t *tp,
919	xfs_inode_t *ip)
920	{
921	if (VFS_I(ip)->i_nlink == `0`) {
922	xfs_alert(ip->i_mount,
923	"%s: Attempt to drop inode (%llu) with nlink zero.",
924	__func__, ip->i_ino);
925	return -EFSCORRUPTED;
926	}
927
928	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
929
930	drop_nlink(inode: VFS_I(ip));
931	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
932
933	if (VFS_I(ip)->i_nlink)
934	return `0`;
935
936	return xfs_iunlink(tp, ip);
937	}
938
939	/*
940	* Increment the link count on an inode & log the change.
941	*/
942	static void
943	xfs_bumplink(
944	xfs_trans_t *tp,
945	xfs_inode_t *ip)
946	{
947	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
948
949	inc_nlink(inode: VFS_I(ip));
950	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
951	}
952
953	int
954	xfs_create(
955	struct mnt_idmap *idmap,
956	xfs_inode_t *dp,
957	struct xfs_name *name,
958	umode_t mode,
959	dev_t rdev,
960	bool init_xattrs,
961	xfs_inode_t **ipp)
962	{
963	int is_dir = S_ISDIR(mode);
964	struct xfs_mount *mp = dp->i_mount;
965	struct xfs_inode *ip = NULL;
966	struct xfs_trans *tp = NULL;
967	int error;
968	bool unlock_dp_on_error = false;
969	prid_t prid;
970	struct xfs_dquot *udqp = NULL;
971	struct xfs_dquot *gdqp = NULL;
972	struct xfs_dquot *pdqp = NULL;
973	struct xfs_trans_res *tres;
974	uint resblks;
975	xfs_ino_t ino;
976
977	trace_xfs_create(dp, xfs_create: name);
978
979	if (xfs_is_shutdown(mp))
980	return -EIO;
981
982	prid = xfs_get_initial_prid(dp);
983
984	/*
985	* Make sure that we have allocated dquot(s) on disk.
986	*/
987	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
988	mapped_fsgid(idmap, &init_user_ns), prid,
989	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
990	&udqp, &gdqp, &pdqp);
991	if (error)
992	return error;
993
994	if (is_dir) {
995	resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
996	tres = &M_RES(mp)->tr_mkdir;
997	} else {
998	resblks = XFS_CREATE_SPACE_RES(mp, name->len);
999	tres = &M_RES(mp)->tr_create;
1000	}
1001
1002	/*
1003	* Initially assume that the file does not exist and
1004	* reserve the resources for that case. If that is not
1005	* the case we'll drop the one we have and get a more
1006	* appropriate transaction later.
1007	*/
1008	error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp, dblocks: resblks,
1009	tpp: &tp);
1010	if (error == -ENOSPC) {
1011	/ flush outstanding delalloc blocks and retry /
1012	xfs_flush_inodes(mp);
1013	error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp,
1014	dblocks: resblks, tpp: &tp);
1015	}
1016	if (error)
1017	goto out_release_dquots;
1018
1019	xfs_ilock(ip: dp, XFS_ILOCK_EXCL \| XFS_ILOCK_PARENT);
1020	unlock_dp_on_error = true;
1021
1022	/*
1023	* A newly created regular or special file just has one directory
1024	* entry pointing to them, but a directory also the "." entry
1025	* pointing to itself.
1026	*/
1027	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1028	if (!error)
1029	error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1030	is_dir ? `2` : `1`, rdev, prid, init_xattrs, &ip);
1031	if (error)
1032	goto out_trans_cancel;
1033
1034	/*
1035	* Now we join the directory inode to the transaction. We do not do it
1036	* earlier because xfs_dialloc might commit the previous transaction
1037	* (and release all the locks). An error from here on will result in
1038	* the transaction cancel unlocking dp so don't do it explicitly in the
1039	* error path.
1040	*/
1041	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1042	unlock_dp_on_error = false;
1043
1044	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1045	resblks - XFS_IALLOC_SPACE_RES(mp));
1046	if (error) {
1047	ASSERT(error != -ENOSPC);
1048	goto out_trans_cancel;
1049	}
1050	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
1051	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1052
1053	if (is_dir) {
1054	error = xfs_dir_init(tp, ip, dp);
1055	if (error)
1056	goto out_trans_cancel;
1057
1058	xfs_bumplink(tp, ip: dp);
1059	}
1060
1061	/*
1062	* If this is a synchronous mount, make sure that the
1063	* create transaction goes to disk before returning to
1064	* the user.
1065	*/
1066	if (xfs_has_wsync(mp) \|\| xfs_has_dirsync(mp))
1067	xfs_trans_set_sync(tp);
1068
1069	/*
1070	* Attach the dquot(s) to the inodes and modify them incore.
1071	* These ids of the inode couldn't have changed since the new
1072	* inode has been locked ever since it was created.
1073	*/
1074	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1075
1076	error = xfs_trans_commit(tp);
1077	if (error)
1078	goto out_release_inode;
1079
1080	xfs_qm_dqrele(udqp);
1081	xfs_qm_dqrele(gdqp);
1082	xfs_qm_dqrele(pdqp);
1083
1084	*ipp = ip;
1085	return `0`;
1086
1087	out_trans_cancel:
1088	xfs_trans_cancel(tp);
1089	out_release_inode:
1090	/*
1091	* Wait until after the current transaction is aborted to finish the
1092	* setup of the inode and release the inode. This prevents recursive
1093	* transactions and deadlocks from xfs_inactive.
1094	*/
1095	if (ip) {
1096	xfs_finish_inode_setup(ip);
1097	xfs_irele(ip);
1098	}
1099	out_release_dquots:
1100	xfs_qm_dqrele(udqp);
1101	xfs_qm_dqrele(gdqp);
1102	xfs_qm_dqrele(pdqp);
1103
1104	if (unlock_dp_on_error)
1105	xfs_iunlock(ip: dp, XFS_ILOCK_EXCL);
1106	return error;
1107	}
1108
1109	int
1110	xfs_create_tmpfile(
1111	struct mnt_idmap *idmap,
1112	struct xfs_inode *dp,
1113	umode_t mode,
1114	struct xfs_inode **ipp)
1115	{
1116	struct xfs_mount *mp = dp->i_mount;
1117	struct xfs_inode *ip = NULL;
1118	struct xfs_trans *tp = NULL;
1119	int error;
1120	prid_t prid;
1121	struct xfs_dquot *udqp = NULL;
1122	struct xfs_dquot *gdqp = NULL;
1123	struct xfs_dquot *pdqp = NULL;
1124	struct xfs_trans_res *tres;
1125	uint resblks;
1126	xfs_ino_t ino;
1127
1128	if (xfs_is_shutdown(mp))
1129	return -EIO;
1130
1131	prid = xfs_get_initial_prid(dp);
1132
1133	/*
1134	* Make sure that we have allocated dquot(s) on disk.
1135	*/
1136	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
1137	mapped_fsgid(idmap, &init_user_ns), prid,
1138	XFS_QMOPT_QUOTALL \| XFS_QMOPT_INHERIT,
1139	&udqp, &gdqp, &pdqp);
1140	if (error)
1141	return error;
1142
1143	resblks = XFS_IALLOC_SPACE_RES(mp);
1144	tres = &M_RES(mp)->tr_create_tmpfile;
1145
1146	error = xfs_trans_alloc_icreate(mp, resv: tres, udqp, gdqp, pdqp, dblocks: resblks,
1147	tpp: &tp);
1148	if (error)
1149	goto out_release_dquots;
1150
1151	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
1152	if (!error)
1153	error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
1154	`0`, `0`, prid, false, &ip);
1155	if (error)
1156	goto out_trans_cancel;
1157
1158	if (xfs_has_wsync(mp))
1159	xfs_trans_set_sync(tp);
1160
1161	/*
1162	* Attach the dquot(s) to the inodes and modify them incore.
1163	* These ids of the inode couldn't have changed since the new
1164	* inode has been locked ever since it was created.
1165	*/
1166	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1167
1168	error = xfs_iunlink(tp, ip);
1169	if (error)
1170	goto out_trans_cancel;
1171
1172	error = xfs_trans_commit(tp);
1173	if (error)
1174	goto out_release_inode;
1175
1176	xfs_qm_dqrele(udqp);
1177	xfs_qm_dqrele(gdqp);
1178	xfs_qm_dqrele(pdqp);
1179
1180	*ipp = ip;
1181	return `0`;
1182
1183	out_trans_cancel:
1184	xfs_trans_cancel(tp);
1185	out_release_inode:
1186	/*
1187	* Wait until after the current transaction is aborted to finish the
1188	* setup of the inode and release the inode. This prevents recursive
1189	* transactions and deadlocks from xfs_inactive.
1190	*/
1191	if (ip) {
1192	xfs_finish_inode_setup(ip);
1193	xfs_irele(ip);
1194	}
1195	out_release_dquots:
1196	xfs_qm_dqrele(udqp);
1197	xfs_qm_dqrele(gdqp);
1198	xfs_qm_dqrele(pdqp);
1199
1200	return error;
1201	}
1202
1203	int
1204	xfs_link(
1205	xfs_inode_t *tdp,
1206	xfs_inode_t *sip,
1207	struct xfs_name *target_name)
1208	{
1209	xfs_mount_t *mp = tdp->i_mount;
1210	xfs_trans_t *tp;
1211	int error, nospace_error = `0`;
1212	int resblks;
1213
1214	trace_xfs_link(dp: tdp, xfs_link: target_name);
1215
1216	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
1217
1218	if (xfs_is_shutdown(mp))
1219	return -EIO;
1220
1221	error = xfs_qm_dqattach(sip);
1222	if (error)
1223	goto std_return;
1224
1225	error = xfs_qm_dqattach(tdp);
1226	if (error)
1227	goto std_return;
1228
1229	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1230	error = xfs_trans_alloc_dir(dp: tdp, resv: &M_RES(mp)->tr_link, ip: sip, dblocks: &resblks,
1231	tpp: &tp, nospace_error: &nospace_error);
1232	if (error)
1233	goto std_return;
1234
1235	/*
1236	* If we are using project inheritance, we only allow hard link
1237	* creation in our tree when the project IDs are the same; else
1238	* the tree quota mechanism could be circumvented.
1239	*/
1240	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
1241	tdp->i_projid != sip->i_projid)) {
1242	error = -EXDEV;
1243	goto error_return;
1244	}
1245
1246	if (!resblks) {
1247	error = xfs_dir_canenter(tp, tdp, target_name);
1248	if (error)
1249	goto error_return;
1250	}
1251
1252	/*
1253	* Handle initial link state of O_TMPFILE inode
1254	*/
1255	if (VFS_I(ip: sip)->i_nlink == `0`) {
1256	struct xfs_perag *pag;
1257
1258	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
1259	error = xfs_iunlink_remove(tp, pag, sip);
1260	xfs_perag_put(pag);
1261	if (error)
1262	goto error_return;
1263	}
1264
1265	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1266	resblks);
1267	if (error)
1268	goto error_return;
1269	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
1270	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1271
1272	xfs_bumplink(tp, ip: sip);
1273
1274	/*
1275	* If this is a synchronous mount, make sure that the
1276	* link transaction goes to disk before returning to
1277	* the user.
1278	*/
1279	if (xfs_has_wsync(mp) \|\| xfs_has_dirsync(mp))
1280	xfs_trans_set_sync(tp);
1281
1282	return xfs_trans_commit(tp);
1283
1284	error_return:
1285	xfs_trans_cancel(tp);
1286	std_return:
1287	if (error == -ENOSPC && nospace_error)
1288	error = nospace_error;
1289	return error;
1290	}
1291
1292	/ Clear the reflink flag and the cowblocks tag if possible. /
1293	static void
1294	xfs_itruncate_clear_reflink_flags(
1295	struct xfs_inode *ip)
1296	{
1297	struct xfs_ifork *dfork;
1298	struct xfs_ifork *cfork;
1299
1300	if (!xfs_is_reflink_inode(ip))
1301	return;
1302	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1303	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
1304	if (dfork->if_bytes == `0` && cfork->if_bytes == `0`)
1305	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1306	if (cfork->if_bytes == `0`)
1307	xfs_inode_clear_cowblocks_tag(ip);
1308	}
1309
1310	/*
1311	* Free up the underlying blocks past new_size. The new size must be smaller
1312	* than the current size. This routine can be used both for the attribute and
1313	* data fork, and does not modify the inode size, which is left to the caller.
1314	*
1315	* The transaction passed to this routine must have made a permanent log
1316	* reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1317	* given transaction and start new ones, so make sure everything involved in
1318	* the transaction is tidy before calling here. Some transaction will be
1319	* returned to the caller to be committed. The incoming transaction must
1320	* already include the inode, and both inode locks must be held exclusively.
1321	* The inode must also be "held" within the transaction. On return the inode
1322	* will be "held" within the returned transaction. This routine does NOT
1323	* require any disk space to be reserved for it within the transaction.
1324	*
1325	* If we get an error, we must return with the inode locked and linked into the
1326	* current transaction. This keeps things simple for the higher level code,
1327	* because it always knows that the inode is locked and held in the transaction
1328	* that returns to it whether errors occur or not. We don't mark the inode
1329	* dirty on error so that transactions can be easily aborted if possible.
1330	*/
1331	int
1332	xfs_itruncate_extents_flags(
1333	struct xfs_trans **tpp,
1334	struct xfs_inode *ip,
1335	int whichfork,
1336	xfs_fsize_t new_size,
1337	int flags)
1338	{
1339	struct xfs_mount *mp = ip->i_mount;
1340	struct xfs_trans tp = tpp;
1341	xfs_fileoff_t first_unmap_block;
1342	xfs_filblks_t unmap_len;
1343	int error = `0`;
1344
1345	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1346	ASSERT(!atomic_read(&VFS_I(ip)->i_count) \|\|
1347	xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1348	ASSERT(new_size <= XFS_ISIZE(ip));
1349	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1350	ASSERT(ip->i_itemp != NULL);
1351	ASSERT(ip->i_itemp->ili_lock_flags == `0`);
1352	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1353
1354	trace_xfs_itruncate_extents_start(ip, new_size);
1355
1356	flags \|= xfs_bmapi_aflag(whichfork);
1357
1358	/*
1359	* Since it is possible for space to become allocated beyond
1360	* the end of the file (in a crash where the space is allocated
1361	* but the inode size is not yet updated), simply remove any
1362	* blocks which show up between the new EOF and the maximum
1363	* possible file size.
1364	*
1365	* We have to free all the blocks to the bmbt maximum offset, even if
1366	* the page cache can't scale that far.
1367	*/
1368	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1369	if (!xfs_verify_fileoff(mp, first_unmap_block)) {
1370	WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
1371	return `0`;
1372	}
1373
1374	unmap_len = XFS_MAX_FILEOFF - first_unmap_block + `1`;
1375	while (unmap_len > `0`) {
1376	ASSERT(tp->t_highest_agno == NULLAGNUMBER);
1377	error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
1378	flags, XFS_ITRUNC_MAX_EXTENTS);
1379	if (error)
1380	goto out;
1381
1382	/ free the just unmapped extents /
1383	error = xfs_defer_finish(&tp);
1384	if (error)
1385	goto out;
1386	}
1387
1388	if (whichfork == XFS_DATA_FORK) {
1389	/ Remove all pending CoW reservations. /
1390	error = xfs_reflink_cancel_cow_blocks(ip, &tp,
1391	first_unmap_block, XFS_MAX_FILEOFF, true);
1392	if (error)
1393	goto out;
1394
1395	xfs_itruncate_clear_reflink_flags(ip);
1396	}
1397
1398	/*
1399	* Always re-log the inode so that our permanent transaction can keep
1400	* on rolling it forward in the log.
1401	*/
1402	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1403
1404	trace_xfs_itruncate_extents_end(ip, new_size);
1405
1406	out:
1407	*tpp = tp;
1408	return error;
1409	}
1410
1411	int
1412	xfs_release(
1413	xfs_inode_t *ip)
1414	{
1415	xfs_mount_t *mp = ip->i_mount;
1416	int error = `0`;
1417
1418	if (!S_ISREG(VFS_I(ip)->i_mode) \|\| (VFS_I(ip)->i_mode == `0`))
1419	return `0`;
1420
1421	/ If this is a read-only mount, don't do this (would generate I/O) /
1422	if (xfs_is_readonly(mp))
1423	return `0`;
1424
1425	if (!xfs_is_shutdown(mp)) {
1426	int truncated;
1427
1428	/*
1429	* If we previously truncated this file and removed old data
1430	* in the process, we want to initiate "early" writeout on
1431	* the last close. This is an attempt to combat the notorious
1432	* NULL files problem which is particularly noticeable from a
1433	* truncate down, buffered (re-)write (delalloc), followed by
1434	* a crash. What we are effectively doing here is
1435	* significantly reducing the time window where we'd otherwise
1436	* be exposed to that problem.
1437	*/
1438	truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1439	if (truncated) {
1440	xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1441	if (ip->i_delayed_blks > `0`) {
1442	error = filemap_flush(VFS_I(ip)->i_mapping);
1443	if (error)
1444	return error;
1445	}
1446	}
1447	}
1448
1449	if (VFS_I(ip)->i_nlink == `0`)
1450	return `0`;
1451
1452	/*
1453	* If we can't get the iolock just skip truncating the blocks past EOF
1454	* because we could deadlock with the mmap_lock otherwise. We'll get
1455	* another chance to drop them once the last reference to the inode is
1456	* dropped, so we'll never leak blocks permanently.
1457	*/
1458	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
1459	return `0`;
1460
1461	if (xfs_can_free_eofblocks(ip, force: false)) {
1462	/*
1463	* Check if the inode is being opened, written and closed
1464	* frequently and we have delayed allocation blocks outstanding
1465	* (e.g. streaming writes from the NFS server), truncating the
1466	* blocks past EOF will cause fragmentation to occur.
1467	*
1468	* In this case don't do the truncation, but we have to be
1469	* careful how we detect this case. Blocks beyond EOF show up as
1470	* i_delayed_blks even when the inode is clean, so we need to
1471	* truncate them away first before checking for a dirty release.
1472	* Hence on the first dirty close we will still remove the
1473	* speculative allocation, but after that we will leave it in
1474	* place.
1475	*/
1476	if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1477	goto out_unlock;
1478
1479	error = xfs_free_eofblocks(ip);
1480	if (error)
1481	goto out_unlock;
1482
1483	/ delalloc blocks after truncation means it really is dirty /
1484	if (ip->i_delayed_blks)
1485	xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1486	}
1487
1488	out_unlock:
1489	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1490	return error;
1491	}
1492
1493	/*
1494	* xfs_inactive_truncate
1495	*
1496	* Called to perform a truncate when an inode becomes unlinked.
1497	*/
1498	STATIC int
1499	xfs_inactive_truncate(
1500	struct xfs_inode *ip)
1501	{
1502	struct xfs_mount *mp = ip->i_mount;
1503	struct xfs_trans *tp;
1504	int error;
1505
1506	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_itruncate, blocks: `0`, rtextents: `0`, flags: `0`, tpp: &tp);
1507	if (error) {
1508	ASSERT(xfs_is_shutdown(mp));
1509	return error;
1510	}
1511	xfs_ilock(ip, XFS_ILOCK_EXCL);
1512	xfs_trans_ijoin(tp, ip, `0`);
1513
1514	/*
1515	* Log the inode size first to prevent stale data exposure in the event
1516	* of a system crash before the truncate completes. See the related
1517	* comment in xfs_vn_setattr_size() for details.
1518	*/
1519	ip->i_disk_size = `0`;
1520	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1521
1522	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, `0`);
1523	if (error)
1524	goto error_trans_cancel;
1525
1526	ASSERT(ip->i_df.if_nextents == `0`);
1527
1528	error = xfs_trans_commit(tp);
1529	if (error)
1530	goto error_unlock;
1531
1532	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1533	return `0`;
1534
1535	error_trans_cancel:
1536	xfs_trans_cancel(tp);
1537	error_unlock:
1538	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1539	return error;
1540	}
1541
1542	/*
1543	* xfs_inactive_ifree()
1544	*
1545	* Perform the inode free when an inode is unlinked.
1546	*/
1547	STATIC int
1548	xfs_inactive_ifree(
1549	struct xfs_inode *ip)
1550	{
1551	struct xfs_mount *mp = ip->i_mount;
1552	struct xfs_trans *tp;
1553	int error;
1554
1555	/*
1556	* We try to use a per-AG reservation for any block needed by the finobt
1557	* tree, but as the finobt feature predates the per-AG reservation
1558	* support a degraded file system might not have enough space for the
1559	* reservation at mount time. In that case try to dip into the reserved
1560	* pool and pray.
1561	*
1562	* Send a warning if the reservation does happen to fail, as the inode
1563	* now remains allocated and sits on the unlinked list until the fs is
1564	* repaired.
1565	*/
1566	if (unlikely(mp->m_finobt_nores)) {
1567	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
1568	XFS_IFREE_SPACE_RES(mp), `0`, XFS_TRANS_RESERVE,
1569	&tp);
1570	} else {
1571	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_ifree, blocks: `0`, rtextents: `0`, flags: `0`, tpp: &tp);
1572	}
1573	if (error) {
1574	if (error == -ENOSPC) {
1575	xfs_warn_ratelimited(mp,
1576	"Failed to remove inode(s) from unlinked list. "
1577	"Please free space, unmount and run xfs_repair.");
1578	} else {
1579	ASSERT(xfs_is_shutdown(mp));
1580	}
1581	return error;
1582	}
1583
1584	/*
1585	* We do not hold the inode locked across the entire rolling transaction
1586	* here. We only need to hold it for the first transaction that
1587	* xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1588	* underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1589	* here breaks the relationship between cluster buffer invalidation and
1590	* stale inode invalidation on cluster buffer item journal commit
1591	* completion, and can result in leaving dirty stale inodes hanging
1592	* around in memory.
1593	*
1594	* We have no need for serialising this inode operation against other
1595	* operations - we freed the inode and hence reallocation is required
1596	* and that will serialise on reallocating the space the deferops need
1597	* to free. Hence we can unlock the inode on the first commit of
1598	* the transaction rather than roll it right through the deferops. This
1599	* avoids relogging the XFS_ISTALE inode.
1600	*
1601	* We check that xfs_ifree() hasn't grown an internal transaction roll
1602	* by asserting that the inode is still locked when it returns.
1603	*/
1604	xfs_ilock(ip, XFS_ILOCK_EXCL);
1605	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1606
1607	error = xfs_ifree(tp, ip);
1608	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1609	if (error) {
1610	/*
1611	* If we fail to free the inode, shut down. The cancel
1612	* might do that, we need to make sure. Otherwise the
1613	* inode might be lost for a long time or forever.
1614	*/
1615	if (!xfs_is_shutdown(mp)) {
1616	xfs_notice(mp, "%s: xfs_ifree returned error %d",
1617	__func__, error);
1618	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1619	}
1620	xfs_trans_cancel(tp);
1621	return error;
1622	}
1623
1624	/*
1625	* Credit the quota account(s). The inode is gone.
1626	*/
1627	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -`1`);
1628
1629	return xfs_trans_commit(tp);
1630	}
1631
1632	/*
1633	* Returns true if we need to update the on-disk metadata before we can free
1634	* the memory used by this inode. Updates include freeing post-eof
1635	* preallocations; freeing COW staging extents; and marking the inode free in
1636	* the inobt if it is on the unlinked list.
1637	*/
1638	bool
1639	xfs_inode_needs_inactive(
1640	struct xfs_inode *ip)
1641	{
1642	struct xfs_mount *mp = ip->i_mount;
1643	struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
1644
1645	/*
1646	* If the inode is already free, then there can be nothing
1647	* to clean up here.
1648	*/
1649	if (VFS_I(ip)->i_mode == `0`)
1650	return false;
1651
1652	/*
1653	* If this is a read-only mount, don't do this (would generate I/O)
1654	* unless we're in log recovery and cleaning the iunlinked list.
1655	*/
1656	if (xfs_is_readonly(mp) && !xlog_recovery_needed(log: mp->m_log))
1657	return false;
1658
1659	/ If the log isn't running, push inodes straight to reclaim. /
1660	if (xfs_is_shutdown(mp) \|\| xfs_has_norecovery(mp))
1661	return false;
1662
1663	/ Metadata inodes require explicit resource cleanup. /
1664	if (xfs_is_metadata_inode(ip))
1665	return false;
1666
1667	/ Want to clean out the cow blocks if there are any. /
1668	if (cow_ifp && cow_ifp->if_bytes > `0`)
1669	return true;
1670
1671	/ Unlinked files must be freed. /
1672	if (VFS_I(ip)->i_nlink == `0`)
1673	return true;
1674
1675	/*
1676	* This file isn't being freed, so check if there are post-eof blocks
1677	* to free. @force is true because we are evicting an inode from the
1678	* cache. Post-eof blocks must be freed, lest we end up with broken
1679	* free space accounting.
1680	*
1681	* Note: don't bother with iolock here since lockdep complains about
1682	* acquiring it in reclaim context. We have the only reference to the
1683	* inode at this point anyways.
1684	*/
1685	return xfs_can_free_eofblocks(ip, force: true);
1686	}
1687
1688	/*
1689	* xfs_inactive
1690	*
1691	* This is called when the vnode reference count for the vnode
1692	* goes to zero. If the file has been unlinked, then it must
1693	* now be truncated. Also, we clear all of the read-ahead state
1694	* kept for the inode here since the file is now closed.
1695	*/
1696	int
1697	xfs_inactive(
1698	xfs_inode_t *ip)
1699	{
1700	struct xfs_mount *mp;
1701	int error = `0`;
1702	int truncate = `0`;
1703
1704	/*
1705	* If the inode is already free, then there can be nothing
1706	* to clean up here.
1707	*/
1708	if (VFS_I(ip)->i_mode == `0`) {
1709	ASSERT(ip->i_df.if_broot_bytes == `0`);
1710	goto out;
1711	}
1712
1713	mp = ip->i_mount;
1714	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
1715
1716	/*
1717	* If this is a read-only mount, don't do this (would generate I/O)
1718	* unless we're in log recovery and cleaning the iunlinked list.
1719	*/
1720	if (xfs_is_readonly(mp) && !xlog_recovery_needed(log: mp->m_log))
1721	goto out;
1722
1723	/ Metadata inodes require explicit resource cleanup. /
1724	if (xfs_is_metadata_inode(ip))
1725	goto out;
1726
1727	/ Try to clean out the cow blocks if there are any. /
1728	if (xfs_inode_has_cow_data(ip))
1729	xfs_reflink_cancel_cow_range(ip, `0`, NULLFILEOFF, true);
1730
1731	if (VFS_I(ip)->i_nlink != `0`) {
1732	/*
1733	* force is true because we are evicting an inode from the
1734	* cache. Post-eof blocks must be freed, lest we end up with
1735	* broken free space accounting.
1736	*
1737	* Note: don't bother with iolock here since lockdep complains
1738	* about acquiring it in reclaim context. We have the only
1739	* reference to the inode at this point anyways.
1740	*/
1741	if (xfs_can_free_eofblocks(ip, force: true))
1742	error = xfs_free_eofblocks(ip);
1743
1744	goto out;
1745	}
1746
1747	if (S_ISREG(VFS_I(ip)->i_mode) &&
1748	(ip->i_disk_size != `0` \|\| XFS_ISIZE(ip) != `0` \|\|
1749	ip->i_df.if_nextents > `0` \|\| ip->i_delayed_blks > `0`))
1750	truncate = `1`;
1751
1752	if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
1753	/*
1754	* If this inode is being inactivated during a quotacheck and
1755	* has not yet been scanned by quotacheck, we /must/ remove
1756	* the dquots from the inode before inactivation changes the
1757	* block and inode counts. Most probably this is a result of
1758	* reloading the incore iunlinked list to purge unrecovered
1759	* unlinked inodes.
1760	*/
1761	xfs_qm_dqdetach(ip);
1762	} else {
1763	error = xfs_qm_dqattach(ip);
1764	if (error)
1765	goto out;
1766	}
1767
1768	if (S_ISLNK(VFS_I(ip)->i_mode))
1769	error = xfs_inactive_symlink(ip);
1770	else if (truncate)
1771	error = xfs_inactive_truncate(ip);
1772	if (error)
1773	goto out;
1774
1775	/*
1776	* If there are attributes associated with the file then blow them away
1777	* now. The code calls a routine that recursively deconstructs the
1778	* attribute fork. If also blows away the in-core attribute fork.
1779	*/
1780	if (xfs_inode_has_attr_fork(ip)) {
1781	error = xfs_attr_inactive(ip);
1782	if (error)
1783	goto out;
1784	}
1785
1786	ASSERT(ip->i_forkoff == `0`);
1787
1788	/*
1789	* Free the inode.
1790	*/
1791	error = xfs_inactive_ifree(ip);
1792
1793	out:
1794	/*
1795	* We're done making metadata updates for this inode, so we can release
1796	* the attached dquots.
1797	*/
1798	xfs_qm_dqdetach(ip);
1799	return error;
1800	}
1801
1802	/*
1803	* In-Core Unlinked List Lookups
1804	* =============================
1805	*
1806	* Every inode is supposed to be reachable from some other piece of metadata
1807	* with the exception of the root directory. Inodes with a connection to a
1808	* file descriptor but not linked from anywhere in the on-disk directory tree
1809	* are collectively known as unlinked inodes, though the filesystem itself
1810	* maintains links to these inodes so that on-disk metadata are consistent.
1811	*
1812	* XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
1813	* header contains a number of buckets that point to an inode, and each inode
1814	* record has a pointer to the next inode in the hash chain. This
1815	* singly-linked list causes scaling problems in the iunlink remove function
1816	* because we must walk that list to find the inode that points to the inode
1817	* being removed from the unlinked hash bucket list.
1818	*
1819	* Hence we keep an in-memory double linked list to link each inode on an
1820	* unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer
1821	* based lists would require having 64 list heads in the perag, one for each
1822	* list. This is expensive in terms of memory (think millions of AGs) and cache
1823	* misses on lookups. Instead, use the fact that inodes on the unlinked list
1824	* must be referenced at the VFS level to keep them on the list and hence we
1825	* have an existence guarantee for inodes on the unlinked list.
1826	*
1827	* Given we have an existence guarantee, we can use lockless inode cache lookups
1828	* to resolve aginos to xfs inodes. This means we only need 8 bytes per inode
1829	* for the double linked unlinked list, and we don't need any extra locking to
1830	* keep the list safe as all manipulations are done under the AGI buffer lock.
1831	* Keeping the list up to date does not require memory allocation, just finding
1832	* the XFS inode and updating the next/prev unlinked list aginos.
1833	*/
1834
1835	/*
1836	* Find an inode on the unlinked list. This does not take references to the
1837	* inode as we have existence guarantees by holding the AGI buffer lock and that
1838	* only unlinked, referenced inodes can be on the unlinked inode list. If we
1839	* don't find the inode in cache, then let the caller handle the situation.
1840	*/
1841	static struct xfs_inode *
1842	xfs_iunlink_lookup(
1843	struct xfs_perag *pag,
1844	xfs_agino_t agino)
1845	{
1846	struct xfs_inode *ip;
1847
1848	rcu_read_lock();
1849	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1850	if (!ip) {
1851	/ Caller can handle inode not being in memory. /
1852	rcu_read_unlock();
1853	return NULL;
1854	}
1855
1856	/*
1857	* Inode in RCU freeing limbo should not happen. Warn about this and
1858	* let the caller handle the failure.
1859	*/
1860	if (WARN_ON_ONCE(!ip->i_ino)) {
1861	rcu_read_unlock();
1862	return NULL;
1863	}
1864	ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE \| XFS_IRECLAIM));
1865	rcu_read_unlock();
1866	return ip;
1867	}
1868
1869	/*
1870	* Update the prev pointer of the next agino. Returns -ENOLINK if the inode
1871	* is not in cache.
1872	*/
1873	static int
1874	xfs_iunlink_update_backref(
1875	struct xfs_perag *pag,
1876	xfs_agino_t prev_agino,
1877	xfs_agino_t next_agino)
1878	{
1879	struct xfs_inode *ip;
1880
1881	/ No update necessary if we are at the end of the list. /
1882	if (next_agino == NULLAGINO)
1883	return `0`;
1884
1885	ip = xfs_iunlink_lookup(pag, next_agino);
1886	if (!ip)
1887	return -ENOLINK;
1888
1889	ip->i_prev_unlinked = prev_agino;
1890	return `0`;
1891	}
1892
1893	/*
1894	* Point the AGI unlinked bucket at an inode and log the results. The caller
1895	* is responsible for validating the old value.
1896	*/
1897	STATIC int
1898	xfs_iunlink_update_bucket(
1899	struct xfs_trans *tp,
1900	struct xfs_perag *pag,
1901	struct xfs_buf *agibp,
1902	unsigned int bucket_index,
1903	xfs_agino_t new_agino)
1904	{
1905	struct xfs_agi *agi = agibp->b_addr;
1906	xfs_agino_t old_value;
1907	int offset;
1908
1909	ASSERT(xfs_verify_agino_or_null(pag, new_agino));
1910
1911	old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1912	trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index,
1913	old_value, new_agino);
1914
1915	/*
1916	* We should never find the head of the list already set to the value
1917	* passed in because either we're adding or removing ourselves from the
1918	* head of the list.
1919	*/
1920	if (old_value == new_agino) {
1921	xfs_buf_mark_corrupt(agibp);
1922	return -EFSCORRUPTED;
1923	}
1924
1925	agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
1926	offset = offsetof(struct xfs_agi, agi_unlinked) +
1927	(sizeof(xfs_agino_t) * bucket_index);
1928	xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - `1`);
1929	return `0`;
1930	}
1931
1932	/*
1933	* Load the inode @next_agino into the cache and set its prev_unlinked pointer
1934	* to @prev_agino. Caller must hold the AGI to synchronize with other changes
1935	* to the unlinked list.
1936	*/
1937	STATIC int
1938	xfs_iunlink_reload_next(
1939	struct xfs_trans *tp,
1940	struct xfs_buf *agibp,
1941	xfs_agino_t prev_agino,
1942	xfs_agino_t next_agino)
1943	{
1944	struct xfs_perag *pag = agibp->b_pag;
1945	struct xfs_mount *mp = pag->pag_mount;
1946	struct xfs_inode *next_ip = NULL;
1947	xfs_ino_t ino;
1948	int error;
1949
1950	ASSERT(next_agino != NULLAGINO);
1951
1952	#ifdef DEBUG
1953	rcu_read_lock();
1954	next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
1955	ASSERT(next_ip == NULL);
1956	rcu_read_unlock();
1957	#endif
1958
1959	xfs_info_ratelimited(mp,
1960	"Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
1961	next_agino, pag->pag_agno);
1962
1963	/*
1964	* Use an untrusted lookup just to be cautious in case the AGI has been
1965	* corrupted and now points at a free inode. That shouldn't happen,
1966	* but we'd rather shut down now since we're already running in a weird
1967	* situation.
1968	*/
1969	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
1970	error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, lock_flags: `0`, ipp: &next_ip);
1971	if (error)
1972	return error;
1973
1974	/ If this is not an unlinked inode, something is very wrong. /
1975	if (VFS_I(ip: next_ip)->i_nlink != `0`) {
1976	error = -EFSCORRUPTED;
1977	goto rele;
1978	}
1979
1980	next_ip->i_prev_unlinked = prev_agino;
1981	trace_xfs_iunlink_reload_next(ip: next_ip);
1982	rele:
1983	ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
1984	if (xfs_is_quotacheck_running(mp) && next_ip)
1985	xfs_iflags_set(ip: next_ip, XFS_IQUOTAUNCHECKED);
1986	xfs_irele(ip: next_ip);
1987	return error;
1988	}
1989
1990	static int
1991	xfs_iunlink_insert_inode(
1992	struct xfs_trans *tp,
1993	struct xfs_perag *pag,
1994	struct xfs_buf *agibp,
1995	struct xfs_inode *ip)
1996	{
1997	struct xfs_mount *mp = tp->t_mountp;
1998	struct xfs_agi *agi = agibp->b_addr;
1999	xfs_agino_t next_agino;
2000	xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2001	short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2002	int error;
2003
2004	/*
2005	* Get the index into the agi hash table for the list this inode will
2006	* go on. Make sure the pointer isn't garbage and that this inode
2007	* isn't already on the list.
2008	*/
2009	next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2010	if (next_agino == agino \|\|
2011	!xfs_verify_agino_or_null(pag, next_agino)) {
2012	xfs_buf_mark_corrupt(agibp);
2013	return -EFSCORRUPTED;
2014	}
2015
2016	/*
2017	* Update the prev pointer in the next inode to point back to this
2018	* inode.
2019	*/
2020	error = xfs_iunlink_update_backref(pag, agino, next_agino);
2021	if (error == -ENOLINK)
2022	error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
2023	if (error)
2024	return error;
2025
2026	if (next_agino != NULLAGINO) {
2027	/*
2028	* There is already another inode in the bucket, so point this
2029	* inode to the current head of the list.
2030	*/
2031	error = xfs_iunlink_log_inode(tp, ip, pag, next_agino);
2032	if (error)
2033	return error;
2034	ip->i_next_unlinked = next_agino;
2035	}
2036
2037	/ Point the head of the list to point to this inode. /
2038	ip->i_prev_unlinked = NULLAGINO;
2039	return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
2040	}
2041
2042	/*
2043	* This is called when the inode's link count has gone to 0 or we are creating
2044	* a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
2045	*
2046	* We place the on-disk inode on a list in the AGI. It will be pulled from this
2047	* list when the inode is freed.
2048	*/
2049	STATIC int
2050	xfs_iunlink(
2051	struct xfs_trans *tp,
2052	struct xfs_inode *ip)
2053	{
2054	struct xfs_mount *mp = tp->t_mountp;
2055	struct xfs_perag *pag;
2056	struct xfs_buf *agibp;
2057	int error;
2058
2059	ASSERT(VFS_I(ip)->i_nlink == `0`);
2060	ASSERT(VFS_I(ip)->i_mode != `0`);
2061	trace_xfs_iunlink(ip);
2062
2063	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2064
2065	/ Get the agi buffer first. It ensures lock ordering on the list. /
2066	error = xfs_read_agi(pag, tp, &agibp);
2067	if (error)
2068	goto out;
2069
2070	error = xfs_iunlink_insert_inode(tp, pag, agibp, ip);
2071	out:
2072	xfs_perag_put(pag);
2073	return error;
2074	}
2075
2076	static int
2077	xfs_iunlink_remove_inode(
2078	struct xfs_trans *tp,
2079	struct xfs_perag *pag,
2080	struct xfs_buf *agibp,
2081	struct xfs_inode *ip)
2082	{
2083	struct xfs_mount *mp = tp->t_mountp;
2084	struct xfs_agi *agi = agibp->b_addr;
2085	xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2086	xfs_agino_t head_agino;
2087	short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2088	int error;
2089
2090	trace_xfs_iunlink_remove(ip);
2091
2092	/*
2093	* Get the index into the agi hash table for the list this inode will
2094	* go on. Make sure the head pointer isn't garbage.
2095	*/
2096	head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2097	if (!xfs_verify_agino(pag, head_agino)) {
2098	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
2099	agi, sizeof(*agi));
2100	return -EFSCORRUPTED;
2101	}
2102
2103	/*
2104	* Set our inode's next_unlinked pointer to NULL and then return
2105	* the old pointer value so that we can update whatever was previous
2106	* to us in the list to point to whatever was next in the list.
2107	*/
2108	error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO);
2109	if (error)
2110	return error;
2111
2112	/*
2113	* Update the prev pointer in the next inode to point back to previous
2114	* inode in the chain.
2115	*/
2116	error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
2117	ip->i_next_unlinked);
2118	if (error == -ENOLINK)
2119	error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
2120	ip->i_next_unlinked);
2121	if (error)
2122	return error;
2123
2124	if (head_agino != agino) {
2125	struct xfs_inode *prev_ip;
2126
2127	prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
2128	if (!prev_ip)
2129	return -EFSCORRUPTED;
2130
2131	error = xfs_iunlink_log_inode(tp, ip: prev_ip, pag,
2132	next_agino: ip->i_next_unlinked);
2133	prev_ip->i_next_unlinked = ip->i_next_unlinked;
2134	} else {
2135	/ Point the head of the list to the next unlinked inode. /
2136	error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index,
2137	ip->i_next_unlinked);
2138	}
2139
2140	ip->i_next_unlinked = NULLAGINO;
2141	ip->i_prev_unlinked = `0`;
2142	return error;
2143	}
2144
2145	/*
2146	* Pull the on-disk inode from the AGI unlinked list.
2147	*/
2148	STATIC int
2149	xfs_iunlink_remove(
2150	struct xfs_trans *tp,
2151	struct xfs_perag *pag,
2152	struct xfs_inode *ip)
2153	{
2154	struct xfs_buf *agibp;
2155	int error;
2156
2157	trace_xfs_iunlink_remove(ip);
2158
2159	/ Get the agi buffer first. It ensures lock ordering on the list. /
2160	error = xfs_read_agi(pag, tp, &agibp);
2161	if (error)
2162	return error;
2163
2164	return xfs_iunlink_remove_inode(tp, pag, agibp, ip);
2165	}
2166
2167	/*
2168	* Look up the inode number specified and if it is not already marked XFS_ISTALE
2169	* mark it stale. We should only find clean inodes in this lookup that aren't
2170	* already stale.
2171	*/
2172	static void
2173	xfs_ifree_mark_inode_stale(
2174	struct xfs_perag *pag,
2175	struct xfs_inode *free_ip,
2176	xfs_ino_t inum)
2177	{
2178	struct xfs_mount *mp = pag->pag_mount;
2179	struct xfs_inode_log_item *iip;
2180	struct xfs_inode *ip;
2181
2182	retry:
2183	rcu_read_lock();
2184	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2185
2186	/ Inode not in memory, nothing to do /
2187	if (!ip) {
2188	rcu_read_unlock();
2189	return;
2190	}
2191
2192	/*
2193	* because this is an RCU protected lookup, we could find a recently
2194	* freed or even reallocated inode during the lookup. We need to check
2195	* under the i_flags_lock for a valid inode here. Skip it if it is not
2196	* valid, the wrong inode or stale.
2197	*/
2198	spin_lock(lock: &ip->i_flags_lock);
2199	if (ip->i_ino != inum \|\| __xfs_iflags_test(ip, XFS_ISTALE))
2200	goto out_iflags_unlock;
2201
2202	/*
2203	* Don't try to lock/unlock the current inode, but we _cannot_ skip the
2204	* other inodes that we did not find in the list attached to the buffer
2205	* and are not already marked stale. If we can't lock it, back off and
2206	* retry.
2207	*/
2208	if (ip != free_ip) {
2209	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2210	spin_unlock(lock: &ip->i_flags_lock);
2211	rcu_read_unlock();
2212	delay(ticks: `1`);
2213	goto retry;
2214	}
2215	}
2216	ip->i_flags \|= XFS_ISTALE;
2217
2218	/*
2219	* If the inode is flushing, it is already attached to the buffer. All
2220	* we needed to do here is mark the inode stale so buffer IO completion
2221	* will remove it from the AIL.
2222	*/
2223	iip = ip->i_itemp;
2224	if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
2225	ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2226	ASSERT(iip->ili_last_fields);
2227	goto out_iunlock;
2228	}
2229
2230	/*
2231	* Inodes not attached to the buffer can be released immediately.
2232	* Everything else has to go through xfs_iflush_abort() on journal
2233	* commit as the flock synchronises removal of the inode from the
2234	* cluster buffer against inode reclaim.
2235	*/
2236	if (!iip \|\| list_empty(head: &iip->ili_item.li_bio_list))
2237	goto out_iunlock;
2238
2239	__xfs_iflags_set(ip, XFS_IFLUSHING);
2240	spin_unlock(lock: &ip->i_flags_lock);
2241	rcu_read_unlock();
2242
2243	/ we have a dirty inode in memory that has not yet been flushed. /
2244	spin_lock(lock: &iip->ili_lock);
2245	iip->ili_last_fields = iip->ili_fields;
2246	iip->ili_fields = `0`;
2247	iip->ili_fsync_fields = `0`;
2248	spin_unlock(lock: &iip->ili_lock);
2249	ASSERT(iip->ili_last_fields);
2250
2251	if (ip != free_ip)
2252	xfs_iunlock(ip, XFS_ILOCK_EXCL);
2253	return;
2254
2255	out_iunlock:
2256	if (ip != free_ip)
2257	xfs_iunlock(ip, XFS_ILOCK_EXCL);
2258	out_iflags_unlock:
2259	spin_unlock(lock: &ip->i_flags_lock);
2260	rcu_read_unlock();
2261	}
2262
2263	/*
2264	* A big issue when freeing the inode cluster is that we _cannot_ skip any
2265	* inodes that are in memory - they all must be marked stale and attached to
2266	* the cluster buffer.
2267	*/
2268	static int
2269	xfs_ifree_cluster(
2270	struct xfs_trans *tp,
2271	struct xfs_perag *pag,
2272	struct xfs_inode *free_ip,
2273	struct xfs_icluster *xic)
2274	{
2275	struct xfs_mount *mp = free_ip->i_mount;
2276	struct xfs_ino_geometry *igeo = M_IGEO(mp);
2277	struct xfs_buf *bp;
2278	xfs_daddr_t blkno;
2279	xfs_ino_t inum = xic->first_ino;
2280	int nbufs;
2281	int i, j;
2282	int ioffset;
2283	int error;
2284
2285	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2286
2287	for (j = `0`; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2288	/*
2289	* The allocation bitmap tells us which inodes of the chunk were
2290	* physically allocated. Skip the cluster if an inode falls into
2291	* a sparse region.
2292	*/
2293	ioffset = inum - xic->first_ino;
2294	if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == `0`) {
2295	ASSERT(ioffset % igeo->inodes_per_cluster == `0`);
2296	continue;
2297	}
2298
2299	blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2300	XFS_INO_TO_AGBNO(mp, inum));
2301
2302	/*
2303	* We obtain and lock the backing buffer first in the process
2304	* here to ensure dirty inodes attached to the buffer remain in
2305	* the flushing state while we mark them stale.
2306	*
2307	* If we scan the in-memory inodes first, then buffer IO can
2308	* complete before we get a lock on it, and hence we may fail
2309	* to mark all the active inodes on the buffer stale.
2310	*/
2311	error = xfs_trans_get_buf(tp, target: mp->m_ddev_targp, blkno,
2312	numblks: mp->m_bsize * igeo->blocks_per_cluster,
2313	XBF_UNMAPPED, bpp: &bp);
2314	if (error)
2315	return error;
2316
2317	/*
2318	* This buffer may not have been correctly initialised as we
2319	* didn't read it from disk. That's not important because we are
2320	* only using to mark the buffer as stale in the log, and to
2321	* attach stale cached inodes on it. That means it will never be
2322	* dispatched for IO. If it is, we want to know about it, and we
2323	* want it to fail. We can acheive this by adding a write
2324	* verifier to the buffer.
2325	*/
2326	bp->b_ops = &xfs_inode_buf_ops;
2327
2328	/*
2329	* Now we need to set all the cached clean inodes as XFS_ISTALE,
2330	* too. This requires lookups, and will skip inodes that we've
2331	* already marked XFS_ISTALE.
2332	*/
2333	for (i = `0`; i < igeo->inodes_per_cluster; i++)
2334	xfs_ifree_mark_inode_stale(pag, free_ip, inum: inum + i);
2335
2336	xfs_trans_stale_inode_buf(tp, bp);
2337	xfs_trans_binval(tp, bp);
2338	}
2339	return `0`;
2340	}
2341
2342	/*
2343	* This is called to return an inode to the inode free list. The inode should
2344	* already be truncated to 0 length and have no pages associated with it. This
2345	* routine also assumes that the inode is already a part of the transaction.
2346	*
2347	* The on-disk copy of the inode will have been added to the list of unlinked
2348	* inodes in the AGI. We need to remove the inode from that list atomically with
2349	* respect to freeing it here.
2350	*/
2351	int
2352	xfs_ifree(
2353	struct xfs_trans *tp,
2354	struct xfs_inode *ip)
2355	{
2356	struct xfs_mount *mp = ip->i_mount;
2357	struct xfs_perag *pag;
2358	struct xfs_icluster xic = { `0` };
2359	struct xfs_inode_log_item *iip = ip->i_itemp;
2360	int error;
2361
2362	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2363	ASSERT(VFS_I(ip)->i_nlink == `0`);
2364	ASSERT(ip->i_df.if_nextents == `0`);
2365	ASSERT(ip->i_disk_size == `0` \|\| !S_ISREG(VFS_I(ip)->i_mode));
2366	ASSERT(ip->i_nblocks == `0`);
2367
2368	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2369
2370	/*
2371	* Free the inode first so that we guarantee that the AGI lock is going
2372	* to be taken before we remove the inode from the unlinked list. This
2373	* makes the AGI lock -> unlinked list modification order the same as
2374	* used in O_TMPFILE creation.
2375	*/
2376	error = xfs_difree(tp, pag, ip->i_ino, &xic);
2377	if (error)
2378	goto out;
2379
2380	error = xfs_iunlink_remove(tp, pag, ip);
2381	if (error)
2382	goto out;
2383
2384	/*
2385	* Free any local-format data sitting around before we reset the
2386	* data fork to extents format. Note that the attr fork data has
2387	* already been freed by xfs_attr_inactive.
2388	*/
2389	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
2390	kmem_free(ptr: ip->i_df.if_u1.if_data);
2391	ip->i_df.if_u1.if_data = NULL;
2392	ip->i_df.if_bytes = `0`;
2393	}
2394
2395	VFS_I(ip)->i_mode = `0`; / mark incore inode as free /
2396	ip->i_diflags = `0`;
2397	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
2398	ip->i_forkoff = `0`; / mark the attr fork not in use /
2399	ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2400	if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS))
2401	xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS);
2402
2403	/ Don't attempt to replay owner changes for a deleted inode /
2404	spin_lock(lock: &iip->ili_lock);
2405	iip->ili_fields &= ~(XFS_ILOG_AOWNER \| XFS_ILOG_DOWNER);
2406	spin_unlock(lock: &iip->ili_lock);
2407
2408	/*
2409	* Bump the generation count so no one will be confused
2410	* by reincarnations of this inode.
2411	*/
2412	VFS_I(ip)->i_generation++;
2413	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2414
2415	if (xic.deleted)
2416	error = xfs_ifree_cluster(tp, pag, free_ip: ip, xic: &xic);
2417	out:
2418	xfs_perag_put(pag);
2419	return error;
2420	}
2421
2422	/*
2423	* This is called to unpin an inode. The caller must have the inode locked
2424	* in at least shared mode so that the buffer cannot be subsequently pinned
2425	* once someone is waiting for it to be unpinned.
2426	*/
2427	static void
2428	xfs_iunpin(
2429	struct xfs_inode *ip)
2430	{
2431	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
2432
2433	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2434
2435	/ Give the log a push to start the unpinning I/O /
2436	xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, `0`, NULL);
2437
2438	}
2439
2440	static void
2441	__xfs_iunpin_wait(
2442	struct xfs_inode *ip)
2443	{
2444	wait_queue_head_t *wq = bit_waitqueue(word: &ip->i_flags, __XFS_IPINNED_BIT);
2445	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2446
2447	xfs_iunpin(ip);
2448
2449	do {
2450	prepare_to_wait(wq_head: wq, wq_entry: &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2451	if (xfs_ipincount(ip))
2452	io_schedule();
2453	} while (xfs_ipincount(ip));
2454	finish_wait(wq_head: wq, wq_entry: &wait.wq_entry);
2455	}
2456
/*
 * Wait for @ip to become unpinned.  Returns immediately when xfs_ipincount()
 * already reports zero; otherwise defers to __xfs_iunpin_wait().
 */
void
xfs_iunpin_wait(
	struct xfs_inode	*ip)
{
	/* Fast path: nothing holds a pin on the inode. */
	if (!xfs_ipincount(ip))
		return;

	__xfs_iunpin_wait(ip);
}
2464
2465	/*
2466	* Removing an inode from the namespace involves removing the directory entry
2467	* and dropping the link count on the inode. Removing the directory entry can
2468	* result in locking an AGF (directory blocks were freed) and removing a link
2469	* count can result in placing the inode on an unlinked list which results in
2470	* locking an AGI.
2471	*
2472	* The big problem here is that we have an ordering constraint on AGF and AGI
2473	* locking - inode allocation locks the AGI, then can allocate a new extent for
2474	* new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2475	* removes the inode from the unlinked list, requiring that we lock the AGI
2476	* first, and then freeing the inode can result in an inode chunk being freed
2477	* and hence freeing disk space requiring that we lock an AGF.
2478	*
2479	* Hence the ordering that is imposed by other parts of the code is AGI before
2480	* AGF. This means we cannot remove the directory entry before we drop the inode
2481	* reference count and put it on the unlinked list as this results in a lock
2482	* order of AGF then AGI, and this can deadlock against inode allocation and
2483	* freeing. Therefore we must drop the link counts before we remove the
2484	* directory entry.
2485	*
2486	* This is still safe from a transactional point of view - it is not until we
2487	* get to xfs_defer_finish() that we have the possibility of multiple
2488	* transactions in this operation. Hence as long as we remove the directory
2489	* entry and drop the link count in the first transaction of the remove
2490	* operation, there are no transactional constraints on the ordering here.
2491	*/
2492	int
2493	xfs_remove(
2494	xfs_inode_t *dp,
2495	struct xfs_name *name,
2496	xfs_inode_t *ip)
2497	{
2498	xfs_mount_t *mp = dp->i_mount;
2499	xfs_trans_t *tp = NULL;
2500	int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
2501	int dontcare;
2502	int error = `0`;
2503	uint resblks;
2504
2505	trace_xfs_remove(dp, xfs_remove: name);
2506
2507	if (xfs_is_shutdown(mp))
2508	return -EIO;
2509
2510	error = xfs_qm_dqattach(dp);
2511	if (error)
2512	goto std_return;
2513
2514	error = xfs_qm_dqattach(ip);
2515	if (error)
2516	goto std_return;
2517
2518	/*
2519	* We try to get the real space reservation first, allowing for
2520	* directory btree deletion(s) implying possible bmap insert(s). If we
2521	* can't get the space reservation then we use 0 instead, and avoid the
2522	* bmap btree insert(s) in the directory code by, if the bmap insert
2523	* tries to happen, instead trimming the LAST block from the directory.
2524	*
2525	* Ignore EDQUOT and ENOSPC being returned via nospace_error because
2526	* the directory code can handle a reservationless update and we don't
2527	* want to prevent a user from trying to free space by deleting things.
2528	*/
2529	resblks = XFS_REMOVE_SPACE_RES(mp);
2530	error = xfs_trans_alloc_dir(dp, resv: &M_RES(mp)->tr_remove, ip, dblocks: &resblks,
2531	tpp: &tp, nospace_error: &dontcare);
2532	if (error) {
2533	ASSERT(error != -ENOSPC);
2534	goto std_return;
2535	}
2536
2537	/*
2538	* If we're removing a directory perform some additional validation.
2539	*/
2540	if (is_dir) {
2541	ASSERT(VFS_I(ip)->i_nlink >= `2`);
2542	if (VFS_I(ip)->i_nlink != `2`) {
2543	error = -ENOTEMPTY;
2544	goto out_trans_cancel;
2545	}
2546	if (!xfs_dir_isempty(ip)) {
2547	error = -ENOTEMPTY;
2548	goto out_trans_cancel;
2549	}
2550
2551	/ Drop the link from ip's "..". /
2552	error = xfs_droplink(tp, ip: dp);
2553	if (error)
2554	goto out_trans_cancel;
2555
2556	/ Drop the "." link from ip to self. /
2557	error = xfs_droplink(tp, ip);
2558	if (error)
2559	goto out_trans_cancel;
2560
2561	/*
2562	* Point the unlinked child directory's ".." entry to the root
2563	* directory to eliminate back-references to inodes that may
2564	* get freed before the child directory is closed. If the fs
2565	* gets shrunk, this can lead to dirent inode validation errors.
2566	*/
2567	if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) {
2568	error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
2569	tp->t_mountp->m_sb.sb_rootino, `0`);
2570	if (error)
2571	goto out_trans_cancel;
2572	}
2573	} else {
2574	/*
2575	* When removing a non-directory we need to log the parent
2576	* inode here. For a directory this is done implicitly
2577	* by the xfs_droplink call for the ".." entry.
2578	*/
2579	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2580	}
2581	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
2582
2583	/ Drop the link from dp to ip. /
2584	error = xfs_droplink(tp, ip);
2585	if (error)
2586	goto out_trans_cancel;
2587
2588	error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks);
2589	if (error) {
2590	ASSERT(error != -ENOENT);
2591	goto out_trans_cancel;
2592	}
2593
2594	/*
2595	* If this is a synchronous mount, make sure that the
2596	* remove transaction goes to disk before returning to
2597	* the user.
2598	*/
2599	if (xfs_has_wsync(mp) \|\| xfs_has_dirsync(mp))
2600	xfs_trans_set_sync(tp);
2601
2602	error = xfs_trans_commit(tp);
2603	if (error)
2604	goto std_return;
2605
2606	if (is_dir && xfs_inode_is_filestream(ip))
2607	xfs_filestream_deassociate(ip);
2608
2609	return `0`;
2610
2611	out_trans_cancel:
2612	xfs_trans_cancel(tp);
2613	std_return:
2614	return error;
2615	}
2616
2617	/*
2618	* Enter all inodes for a rename transaction into a sorted array.
2619	*/
2620	#define __XFS_SORT_INODES 5
2621	STATIC void
2622	xfs_sort_for_rename(
2623	struct xfs_inode dp1, /* in: old (source) directory inode /
2624	struct xfs_inode dp2, /* in: new (target) directory inode /
2625	struct xfs_inode ip1, /* in: inode of old entry /
2626	struct xfs_inode ip2, /* in: inode of new entry /
2627	struct xfs_inode wip, /* in: whiteout inode /
2628	struct xfs_inode *i_tab,/* out: sorted array of inodes /
2629	int num_inodes) /* in/out: inodes in array /
2630	{
2631	int i, j;
2632
2633	ASSERT(*num_inodes == __XFS_SORT_INODES);
2634	memset(i_tab, `0`, num_inodes sizeof(struct xfs_inode *));
2635
2636	/*
2637	* i_tab contains a list of pointers to inodes. We initialize
2638	* the table here & we'll sort it. We will then use it to
2639	* order the acquisition of the inode locks.
2640	*
2641	* Note that the table may contain duplicates. e.g., dp1 == dp2.
2642	*/
2643	i = `0`;
2644	i_tab[i++] = dp1;
2645	i_tab[i++] = dp2;
2646	i_tab[i++] = ip1;
2647	if (ip2)
2648	i_tab[i++] = ip2;
2649	if (wip)
2650	i_tab[i++] = wip;
2651	*num_inodes = i;
2652
2653	/*
2654	* Sort the elements via bubble sort. (Remember, there are at
2655	* most 5 elements to sort, so this is adequate.)
2656	*/
2657	for (i = `0`; i < *num_inodes; i++) {
2658	for (j = `1`; j < *num_inodes; j++) {
2659	if (i_tab[j]->i_ino < i_tab[j-`1`]->i_ino) {
2660	struct xfs_inode *temp = i_tab[j];
2661	i_tab[j] = i_tab[j-`1`];
2662	i_tab[j-`1`] = temp;
2663	}
2664	}
2665	}
2666	}
2667
2668	static int
2669	xfs_finish_rename(
2670	struct xfs_trans *tp)
2671	{
2672	/*
2673	* If this is a synchronous mount, make sure that the rename transaction
2674	* goes to disk before returning to the user.
2675	*/
2676	if (xfs_has_wsync(tp->t_mountp) \|\| xfs_has_dirsync(tp->t_mountp))
2677	xfs_trans_set_sync(tp);
2678
2679	return xfs_trans_commit(tp);
2680	}
2681
2682	/*
2683	* xfs_cross_rename()
2684	*
2685	* responsible for handling RENAME_EXCHANGE flag in renameat2() syscall
2686	*/
2687	STATIC int
2688	xfs_cross_rename(
2689	struct xfs_trans *tp,
2690	struct xfs_inode *dp1,
2691	struct xfs_name *name1,
2692	struct xfs_inode *ip1,
2693	struct xfs_inode *dp2,
2694	struct xfs_name *name2,
2695	struct xfs_inode *ip2,
2696	int spaceres)
2697	{
2698	int error = `0`;
2699	int ip1_flags = `0`;
2700	int ip2_flags = `0`;
2701	int dp2_flags = `0`;
2702
2703	/ Swap inode number for dirent in first parent /
2704	error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres);
2705	if (error)
2706	goto out_trans_abort;
2707
2708	/ Swap inode number for dirent in second parent /
2709	error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres);
2710	if (error)
2711	goto out_trans_abort;
2712
2713	/*
2714	* If we're renaming one or more directories across different parents,
2715	* update the respective ".." entries (and link counts) to match the new
2716	* parents.
2717	*/
2718	if (dp1 != dp2) {
2719	dp2_flags = XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
2720
2721	if (S_ISDIR(VFS_I(ip2)->i_mode)) {
2722	error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
2723	dp1->i_ino, spaceres);
2724	if (error)
2725	goto out_trans_abort;
2726
2727	/ transfer ip2 ".." reference to dp1 /
2728	if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
2729	error = xfs_droplink(tp, ip: dp2);
2730	if (error)
2731	goto out_trans_abort;
2732	xfs_bumplink(tp, ip: dp1);
2733	}
2734
2735	/*
2736	* Although ip1 isn't changed here, userspace needs
2737	* to be warned about the change, so that applications
2738	* relying on it (like backup ones), will properly
2739	* notify the change
2740	*/
2741	ip1_flags \|= XFS_ICHGTIME_CHG;
2742	ip2_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
2743	}
2744
2745	if (S_ISDIR(VFS_I(ip1)->i_mode)) {
2746	error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
2747	dp2->i_ino, spaceres);
2748	if (error)
2749	goto out_trans_abort;
2750
2751	/ transfer ip1 ".." reference to dp2 /
2752	if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
2753	error = xfs_droplink(tp, ip: dp1);
2754	if (error)
2755	goto out_trans_abort;
2756	xfs_bumplink(tp, ip: dp2);
2757	}
2758
2759	/*
2760	* Although ip2 isn't changed here, userspace needs
2761	* to be warned about the change, so that applications
2762	* relying on it (like backup ones), will properly
2763	* notify the change
2764	*/
2765	ip1_flags \|= XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG;
2766	ip2_flags \|= XFS_ICHGTIME_CHG;
2767	}
2768	}
2769
2770	if (ip1_flags) {
2771	xfs_trans_ichgtime(tp, ip1, ip1_flags);
2772	xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
2773	}
2774	if (ip2_flags) {
2775	xfs_trans_ichgtime(tp, ip2, ip2_flags);
2776	xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
2777	}
2778	if (dp2_flags) {
2779	xfs_trans_ichgtime(tp, dp2, dp2_flags);
2780	xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
2781	}
2782	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
2783	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2784	return xfs_finish_rename(tp);
2785
2786	out_trans_abort:
2787	xfs_trans_cancel(tp);
2788	return error;
2789	}
2790
2791	/*
2792	* xfs_rename_alloc_whiteout()
2793	*
2794	* Return a referenced, unlinked, unlocked inode that can be used as a
2795	* whiteout in a rename transaction. We use a tmpfile inode here so that if we
2796	* crash between allocating the inode and linking it into the rename transaction
2797	* recovery will free the inode and we won't leak it.
2798	*/
2799	static int
2800	xfs_rename_alloc_whiteout(
2801	struct mnt_idmap *idmap,
2802	struct xfs_name *src_name,
2803	struct xfs_inode *dp,
2804	struct xfs_inode **wip)
2805	{
2806	struct xfs_inode *tmpfile;
2807	struct qstr name;
2808	int error;
2809
2810	error = xfs_create_tmpfile(idmap, dp, S_IFCHR \| WHITEOUT_MODE,
2811	ipp: &tmpfile);
2812	if (error)
2813	return error;
2814
2815	name.name = src_name->name;
2816	name.len = src_name->len;
2817	error = xfs_inode_init_security(inode: VFS_I(ip: tmpfile), dir: VFS_I(ip: dp), qstr: &name);
2818	if (error) {
2819	xfs_finish_inode_setup(ip: tmpfile);
2820	xfs_irele(ip: tmpfile);
2821	return error;
2822	}
2823
2824	/*
2825	* Prepare the tmpfile inode as if it were created through the VFS.
2826	* Complete the inode setup and flag it as linkable. nlink is already
2827	* zero, so we can skip the drop_nlink.
2828	*/
2829	xfs_setup_iops(ip: tmpfile);
2830	xfs_finish_inode_setup(ip: tmpfile);
2831	VFS_I(ip: tmpfile)->i_state \|= I_LINKABLE;
2832
2833	*wip = tmpfile;
2834	return `0`;
2835	}
2836
2837	/*
2838	* xfs_rename
2839	*/
2840	int
2841	xfs_rename(
2842	struct mnt_idmap *idmap,
2843	struct xfs_inode *src_dp,
2844	struct xfs_name *src_name,
2845	struct xfs_inode *src_ip,
2846	struct xfs_inode *target_dp,
2847	struct xfs_name *target_name,
2848	struct xfs_inode *target_ip,
2849	unsigned int flags)
2850	{
2851	struct xfs_mount *mp = src_dp->i_mount;
2852	struct xfs_trans *tp;
2853	struct xfs_inode wip = NULL; /* whiteout inode /
2854	struct xfs_inode *inodes[__XFS_SORT_INODES];
2855	int i;
2856	int num_inodes = __XFS_SORT_INODES;
2857	bool new_parent = (src_dp != target_dp);
2858	bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
2859	int spaceres;
2860	bool retried = false;
2861	int error, nospace_error = `0`;
2862
2863	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2864
2865	if ((flags & RENAME_EXCHANGE) && !target_ip)
2866	return -EINVAL;
2867
2868	/*
2869	* If we are doing a whiteout operation, allocate the whiteout inode
2870	* we will be placing at the target and ensure the type is set
2871	* appropriately.
2872	*/
2873	if (flags & RENAME_WHITEOUT) {
2874	error = xfs_rename_alloc_whiteout(idmap, src_name,
2875	dp: target_dp, wip: &wip);
2876	if (error)
2877	return error;
2878
2879	/ setup target dirent info as whiteout /
2880	src_name->type = XFS_DIR3_FT_CHRDEV;
2881	}
2882
2883	xfs_sort_for_rename(dp1: src_dp, dp2: target_dp, ip1: src_ip, ip2: target_ip, wip,
2884	i_tab: inodes, num_inodes: &num_inodes);
2885
2886	retry:
2887	nospace_error = `0`;
2888	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2889	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_rename, blocks: spaceres, rtextents: `0`, flags: `0`, tpp: &tp);
2890	if (error == -ENOSPC) {
2891	nospace_error = error;
2892	spaceres = `0`;
2893	error = xfs_trans_alloc(mp, resp: &M_RES(mp)->tr_rename, blocks: `0`, rtextents: `0`, flags: `0`,
2894	tpp: &tp);
2895	}
2896	if (error)
2897	goto out_release_wip;
2898
2899	/*
2900	* Attach the dquots to the inodes
2901	*/
2902	error = xfs_qm_vop_rename_dqattach(inodes);
2903	if (error)
2904	goto out_trans_cancel;
2905
2906	/*
2907	* Lock all the participating inodes. Depending upon whether
2908	* the target_name exists in the target directory, and
2909	* whether the target directory is the same as the source
2910	* directory, we can lock from 2 to 5 inodes.
2911	*/
2912	xfs_lock_inodes(ips: inodes, inodes: num_inodes, XFS_ILOCK_EXCL);
2913
2914	/*
2915	* Join all the inodes to the transaction. From this point on,
2916	* we can rely on either trans_commit or trans_cancel to unlock
2917	* them.
2918	*/
2919	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2920	if (new_parent)
2921	xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2922	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2923	if (target_ip)
2924	xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2925	if (wip)
2926	xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2927
2928	/*
2929	* If we are using project inheritance, we only allow renames
2930	* into our tree when the project IDs are the same; else the
2931	* tree quota mechanism would be circumvented.
2932	*/
2933	if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
2934	target_dp->i_projid != src_ip->i_projid)) {
2935	error = -EXDEV;
2936	goto out_trans_cancel;
2937	}
2938
2939	/ RENAME_EXCHANGE is unique from here on. /
2940	if (flags & RENAME_EXCHANGE)
2941	return xfs_cross_rename(tp, dp1: src_dp, name1: src_name, ip1: src_ip,
2942	dp2: target_dp, name2: target_name, ip2: target_ip,
2943	spaceres);
2944
2945	/*
2946	* Try to reserve quota to handle an expansion of the target directory.
2947	* We'll allow the rename to continue in reservationless mode if we hit
2948	* a space usage constraint. If we trigger reservationless mode, save
2949	* the errno if there isn't any free space in the target directory.
2950	*/
2951	if (spaceres != `0`) {
2952	error = xfs_trans_reserve_quota_nblks(tp, ip: target_dp, dblocks: spaceres,
2953	rblocks: `0`, force: false);
2954	if (error == -EDQUOT \|\| error == -ENOSPC) {
2955	if (!retried) {
2956	xfs_trans_cancel(tp);
2957	xfs_blockgc_free_quota(ip: target_dp, iwalk_flags: `0`);
2958	retried = true;
2959	goto retry;
2960	}
2961
2962	nospace_error = error;
2963	spaceres = `0`;
2964	error = `0`;
2965	}
2966	if (error)
2967	goto out_trans_cancel;
2968	}
2969
2970	/*
2971	* Check for expected errors before we dirty the transaction
2972	* so we can return an error without a transaction abort.
2973	*/
2974	if (target_ip == NULL) {
2975	/*
2976	* If there's no space reservation, check the entry will
2977	* fit before actually inserting it.
2978	*/
2979	if (!spaceres) {
2980	error = xfs_dir_canenter(tp, target_dp, target_name);
2981	if (error)
2982	goto out_trans_cancel;
2983	}
2984	} else {
2985	/*
2986	* If target exists and it's a directory, check that whether
2987	* it can be destroyed.
2988	*/
2989	if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
2990	(!xfs_dir_isempty(target_ip) \|\|
2991	(VFS_I(ip: target_ip)->i_nlink > `2`))) {
2992	error = -EEXIST;
2993	goto out_trans_cancel;
2994	}
2995	}
2996
2997	/*
2998	* Lock the AGI buffers we need to handle bumping the nlink of the
2999	* whiteout inode off the unlinked list and to handle dropping the
3000	* nlink of the target inode. Per locking order rules, do this in
3001	* increasing AG order and before directory block allocation tries to
3002	* grab AGFs because we grab AGIs before AGFs.
3003	*
3004	* The (vfs) caller must ensure that if src is a directory then
3005	* target_ip is either null or an empty directory.
3006	*/
3007	for (i = `0`; i < num_inodes && inodes[i] != NULL; i++) {
3008	if (inodes[i] == wip \|\|
3009	(inodes[i] == target_ip &&
3010	(VFS_I(ip: target_ip)->i_nlink == `1` \|\| src_is_directory))) {
3011	struct xfs_perag *pag;
3012	struct xfs_buf *bp;
3013
3014	pag = xfs_perag_get(mp,
3015	XFS_INO_TO_AGNO(mp, inodes[i]->i_ino));
3016	error = xfs_read_agi(pag, tp, &bp);
3017	xfs_perag_put(pag);
3018	if (error)
3019	goto out_trans_cancel;
3020	}
3021	}
3022
3023	/*
3024	* Directory entry creation below may acquire the AGF. Remove
3025	* the whiteout from the unlinked list first to preserve correct
3026	* AGI/AGF locking order. This dirties the transaction so failures
3027	* after this point will abort and log recovery will clean up the
3028	* mess.
3029	*
3030	* For whiteouts, we need to bump the link count on the whiteout
3031	* inode. After this point, we have a real link, clear the tmpfile
3032	* state flag from the inode so it doesn't accidentally get misused
3033	* in future.
3034	*/
3035	if (wip) {
3036	struct xfs_perag *pag;
3037
3038	ASSERT(VFS_I(wip)->i_nlink == `0`);
3039
3040	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino));
3041	error = xfs_iunlink_remove(tp, pag, ip: wip);
3042	xfs_perag_put(pag);
3043	if (error)
3044	goto out_trans_cancel;
3045
3046	xfs_bumplink(tp, ip: wip);
3047	VFS_I(ip: wip)->i_state &= ~I_LINKABLE;
3048	}
3049
3050	/*
3051	* Set up the target.
3052	*/
3053	if (target_ip == NULL) {
3054	/*
3055	* If target does not exist and the rename crosses
3056	* directories, adjust the target directory link count
3057	* to account for the ".." reference from the new entry.
3058	*/
3059	error = xfs_dir_createname(tp, target_dp, target_name,
3060	src_ip->i_ino, spaceres);
3061	if (error)
3062	goto out_trans_cancel;
3063
3064	xfs_trans_ichgtime(tp, target_dp,
3065	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
3066
3067	if (new_parent && src_is_directory) {
3068	xfs_bumplink(tp, ip: target_dp);
3069	}
3070	} else { / target_ip != NULL /
3071	/*
3072	* Link the source inode under the target name.
3073	* If the source inode is a directory and we are moving
3074	* it across directories, its ".." entry will be
3075	* inconsistent until we replace that down below.
3076	*
3077	* In case there is already an entry with the same
3078	* name at the destination directory, remove it first.
3079	*/
3080	error = xfs_dir_replace(tp, target_dp, target_name,
3081	src_ip->i_ino, spaceres);
3082	if (error)
3083	goto out_trans_cancel;
3084
3085	xfs_trans_ichgtime(tp, target_dp,
3086	XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
3087
3088	/*
3089	* Decrement the link count on the target since the target
3090	* dir no longer points to it.
3091	*/
3092	error = xfs_droplink(tp, ip: target_ip);
3093	if (error)
3094	goto out_trans_cancel;
3095
3096	if (src_is_directory) {
3097	/*
3098	* Drop the link from the old "." entry.
3099	*/
3100	error = xfs_droplink(tp, ip: target_ip);
3101	if (error)
3102	goto out_trans_cancel;
3103	}
3104	} / target_ip != NULL /
3105
3106	/*
3107	* Remove the source.
3108	*/
3109	if (new_parent && src_is_directory) {
3110	/*
3111	* Rewrite the ".." entry to point to the new
3112	* directory.
3113	*/
3114	error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
3115	target_dp->i_ino, spaceres);
3116	ASSERT(error != -EEXIST);
3117	if (error)
3118	goto out_trans_cancel;
3119	}
3120
3121	/*
3122	* We always want to hit the ctime on the source inode.
3123	*
3124	* This isn't strictly required by the standards since the source
3125	* inode isn't really being changed, but old unix file systems did
3126	* it and some incremental backup programs won't work without it.
3127	*/
3128	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
3129	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
3130
3131	/*
3132	* Adjust the link count on src_dp. This is necessary when
3133	* renaming a directory, either within one parent when
3134	* the target existed, or across two parent directories.
3135	*/
3136	if (src_is_directory && (new_parent \|\| target_ip != NULL)) {
3137
3138	/*
3139	* Decrement link count on src_directory since the
3140	* entry that's moved no longer points to it.
3141	*/
3142	error = xfs_droplink(tp, ip: src_dp);
3143	if (error)
3144	goto out_trans_cancel;
3145	}
3146
3147	/*
3148	* For whiteouts, we only need to update the source dirent with the
3149	* inode number of the whiteout inode rather than removing it
3150	* altogether.
3151	*/
3152	if (wip)
3153	error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3154	spaceres);
3155	else
3156	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3157	spaceres);
3158
3159	if (error)
3160	goto out_trans_cancel;
3161
3162	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD \| XFS_ICHGTIME_CHG);
3163	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3164	if (new_parent)
3165	xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3166
3167	error = xfs_finish_rename(tp);
3168	if (wip)
3169	xfs_irele(ip: wip);
3170	return error;
3171
3172	out_trans_cancel:
3173	xfs_trans_cancel(tp);
3174	out_release_wip:
3175	if (wip)
3176	xfs_irele(ip: wip);
3177	if (error == -ENOSPC && nospace_error)
3178	error = nospace_error;
3179	return error;
3180	}
3181
3182	static int
3183	xfs_iflush(
3184	struct xfs_inode *ip,
3185	struct xfs_buf *bp)
3186	{
3187	struct xfs_inode_log_item *iip = ip->i_itemp;
3188	struct xfs_dinode *dip;
3189	struct xfs_mount *mp = ip->i_mount;
3190	int error;
3191
3192	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL\|XFS_ILOCK_SHARED));
3193	ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
3194	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE \|\|
3195	ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3196	ASSERT(iip->ili_item.li_buf == bp);
3197
3198	dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3199
3200	/*
3201	* We don't flush the inode if any of the following checks fail, but we
3202	* do still update the log item and attach to the backing buffer as if
3203	* the flush happened. This is a formality to facilitate predictable
3204	* error handling as the caller will shutdown and fail the buffer.
3205	*/
3206	error = -EFSCORRUPTED;
3207	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3208	mp, XFS_ERRTAG_IFLUSH_1)) {
3209	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3210	"%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
3211	__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3212	goto flush_out;
3213	}
3214	if (S_ISREG(VFS_I(ip)->i_mode)) {
3215	if (XFS_TEST_ERROR(
3216	ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3217	ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
3218	mp, XFS_ERRTAG_IFLUSH_3)) {
3219	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3220	"%s: Bad regular inode %llu, ptr "PTR_FMT,
3221	__func__, ip->i_ino, ip);
3222	goto flush_out;
3223	}
3224	} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
3225	if (XFS_TEST_ERROR(
3226	ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
3227	ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
3228	ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
3229	mp, XFS_ERRTAG_IFLUSH_4)) {
3230	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3231	"%s: Bad directory inode %llu, ptr "PTR_FMT,
3232	__func__, ip->i_ino, ip);
3233	goto flush_out;
3234	}
3235	}
3236	if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
3237	ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
3238	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3239	"%s: detected corrupt incore inode %llu, "
3240	"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
3241	__func__, ip->i_ino,
3242	ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af),
3243	ip->i_nblocks, ip);
3244	goto flush_out;
3245	}
3246	if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
3247	mp, XFS_ERRTAG_IFLUSH_6)) {
3248	xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3249	"%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
3250	__func__, ip->i_ino, ip->i_forkoff, ip);
3251	goto flush_out;
3252	}
3253
3254	/*
3255	* Inode item log recovery for v2 inodes are dependent on the flushiter
3256	* count for correct sequencing. We bump the flush iteration count so
3257	* we can detect flushes which postdate a log record during recovery.
3258	* This is redundant as we now log every change and hence this can't
3259	* happen but we need to still do it to ensure backwards compatibility
3260	* with old kernels that predate logging all inode changes.
3261	*/
3262	if (!xfs_has_v3inodes(mp))
3263	ip->i_flushiter++;
3264
3265	/*
3266	* If there are inline format data / attr forks attached to this inode,
3267	* make sure they are not corrupt.
3268	*/
3269	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
3270	xfs_ifork_verify_local_data(ip))
3271	goto flush_out;
3272	if (xfs_inode_has_attr_fork(ip) &&
3273	ip->i_af.if_format == XFS_DINODE_FMT_LOCAL &&
3274	xfs_ifork_verify_local_attr(ip))
3275	goto flush_out;
3276
3277	/*
3278	* Copy the dirty parts of the inode into the on-disk inode. We always
3279	* copy out the core of the inode, because if the inode is dirty at all
3280	* the core must be.
3281	*/
3282	xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
3283
3284	/ Wrap, we never let the log put out DI_MAX_FLUSH /
3285	if (!xfs_has_v3inodes(mp)) {
3286	if (ip->i_flushiter == DI_MAX_FLUSH)
3287	ip->i_flushiter = `0`;
3288	}
3289
3290	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3291	if (xfs_inode_has_attr_fork(ip))
3292	xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3293
3294	/*
3295	* We've recorded everything logged in the inode, so we'd like to clear
3296	* the ili_fields bits so we don't log and flush things unnecessarily.
3297	* However, we can't stop logging all this information until the data
3298	* we've copied into the disk buffer is written to disk. If we did we
3299	* might overwrite the copy of the inode in the log with all the data
3300	* after re-logging only part of it, and in the face of a crash we
3301	* wouldn't have all the data we need to recover.
3302	*
3303	* What we do is move the bits to the ili_last_fields field. When
3304	* logging the inode, these bits are moved back to the ili_fields field.
3305	* In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
3306	* we know that the information those bits represent is permanently on
3307	* disk. As long as the flush completes before the inode is logged
3308	* again, then both ili_fields and ili_last_fields will be cleared.
3309	*/
3310	error = `0`;
3311	flush_out:
3312	spin_lock(lock: &iip->ili_lock);
3313	iip->ili_last_fields = iip->ili_fields;
3314	iip->ili_fields = `0`;
3315	iip->ili_fsync_fields = `0`;
3316	spin_unlock(lock: &iip->ili_lock);
3317
3318	/*
3319	* Store the current LSN of the inode so that we can tell whether the
3320	* item has moved in the AIL from xfs_buf_inode_iodone().
3321	*/
3322	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3323	&iip->ili_item.li_lsn);
3324
3325	/ generate the checksum. /
3326	xfs_dinode_calc_crc(mp, dip);
3327	return error;
3328	}
3329
3330	/*
3331	* Non-blocking flush of dirty inode metadata into the backing buffer.
3332	*
3333	* The caller must have a reference to the inode and hold the cluster buffer
3334	* locked. The function will walk across all the inodes on the cluster buffer it
3335	* can find and lock without blocking, and flush them to the cluster buffer.
3336	*
3337	* On successful flushing of at least one inode, the caller must write out the
3338	* buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3339	* the caller needs to release the buffer. On failure, the filesystem will be
3340	* shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3341	* will be returned.
3342	*/
3343	int
3344	xfs_iflush_cluster(
3345	struct xfs_buf *bp)
3346	{
3347	struct xfs_mount *mp = bp->b_mount;
3348	struct xfs_log_item lip, n;
3349	struct xfs_inode *ip;
3350	struct xfs_inode_log_item *iip;
3351	int clcount = `0`;
3352	int error = `0`;
3353
3354	/*
3355	* We must use the safe variant here as on shutdown xfs_iflush_abort()
3356	* will remove itself from the list.
3357	*/
3358	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3359	iip = (struct xfs_inode_log_item *)lip;
3360	ip = iip->ili_inode;
3361
3362	/*
3363	* Quick and dirty check to avoid locks if possible.
3364	*/
3365	if (__xfs_iflags_test(ip, XFS_IRECLAIM \| XFS_IFLUSHING))
3366	continue;
3367	if (xfs_ipincount(ip))
3368	continue;
3369
3370	/*
3371	* The inode is still attached to the buffer, which means it is
3372	* dirty but reclaim might try to grab it. Check carefully for
3373	* that, and grab the ilock while still holding the i_flags_lock
3374	* to guarantee reclaim will not be able to reclaim this inode
3375	* once we drop the i_flags_lock.
3376	*/
3377	spin_lock(lock: &ip->i_flags_lock);
3378	ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3379	if (__xfs_iflags_test(ip, XFS_IRECLAIM \| XFS_IFLUSHING)) {
3380	spin_unlock(lock: &ip->i_flags_lock);
3381	continue;
3382	}
3383
3384	/*
3385	* ILOCK will pin the inode against reclaim and prevent
3386	* concurrent transactions modifying the inode while we are
3387	* flushing the inode. If we get the lock, set the flushing
3388	* state before we drop the i_flags_lock.
3389	*/
3390	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3391	spin_unlock(lock: &ip->i_flags_lock);
3392	continue;
3393	}
3394	__xfs_iflags_set(ip, XFS_IFLUSHING);
3395	spin_unlock(lock: &ip->i_flags_lock);
3396
3397	/*
3398	* Abort flushing this inode if we are shut down because the
3399	* inode may not currently be in the AIL. This can occur when
3400	* log I/O failure unpins the inode without inserting into the
3401	* AIL, leaving a dirty/unpinned inode attached to the buffer
3402	* that otherwise looks like it should be flushed.
3403	*/
3404	if (xlog_is_shutdown(log: mp->m_log)) {
3405	xfs_iunpin_wait(ip);
3406	xfs_iflush_abort(ip);
3407	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3408	error = -EIO;
3409	continue;
3410	}
3411
3412	/ don't block waiting on a log force to unpin dirty inodes /
3413	if (xfs_ipincount(ip)) {
3414	xfs_iflags_clear(ip, XFS_IFLUSHING);
3415	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3416	continue;
3417	}
3418
3419	if (!xfs_inode_clean(ip))
3420	error = xfs_iflush(ip, bp);
3421	else
3422	xfs_iflags_clear(ip, XFS_IFLUSHING);
3423	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3424	if (error)
3425	break;
3426	clcount++;
3427	}
3428
3429	if (error) {
3430	/*
3431	* Shutdown first so we kill the log before we release this
3432	* buffer. If it is an INODE_ALLOC buffer and pins the tail
3433	* of the log, failing it before the _log_ is shut down can
3434	* result in the log tail being moved forward in the journal
3435	* on disk because log writes can still be taking place. Hence
3436	* unpinning the tail will allow the ICREATE intent to be
3437	* removed from the log an recovery will fail with uninitialised
3438	* inode cluster buffers.
3439	*/
3440	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3441	bp->b_flags \|= XBF_ASYNC;
3442	xfs_buf_ioend_fail(bp);
3443	return error;
3444	}
3445
3446	if (!clcount)
3447	return -EAGAIN;
3448
3449	XFS_STATS_INC(mp, xs_icluster_flushcnt);
3450	XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3451	return `0`;
3452
3453	}
3454
3455	/ Release an inode. /
3456	void
3457	xfs_irele(
3458	struct xfs_inode *ip)
3459	{
3460	trace_xfs_irele(ip, _RET_IP_);
3461	iput(VFS_I(ip));
3462	}
3463
3464	/*
3465	* Ensure all commited transactions touching the inode are written to the log.
3466	*/
3467	int
3468	xfs_log_force_inode(
3469	struct xfs_inode *ip)
3470	{
3471	xfs_csn_t seq = `0`;
3472
3473	xfs_ilock(ip, XFS_ILOCK_SHARED);
3474	if (xfs_ipincount(ip))
3475	seq = ip->i_itemp->ili_commit_seq;
3476	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3477
3478	if (!seq)
3479	return `0`;
3480	return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
3481	}
3482
3483	/*
3484	* Grab the exclusive iolock for a data copy from src to dest, making sure to
3485	* abide vfs locking order (lowest pointer value goes first) and breaking the
3486	* layout leases before proceeding. The loop is needed because we cannot call
3487	* the blocking break_layout() with the iolocks held, and therefore have to
3488	* back out both locks.
3489	*/
3490	static int
3491	xfs_iolock_two_inodes_and_break_layout(
3492	struct inode *src,
3493	struct inode *dest)
3494	{
3495	int error;
3496
3497	if (src > dest)
3498	swap(src, dest);
3499
3500	retry:
3501	/ Wait to break both inodes' layouts before we start locking. /
3502	error = break_layout(inode: src, wait: true);
3503	if (error)
3504	return error;
3505	if (src != dest) {
3506	error = break_layout(inode: dest, wait: true);
3507	if (error)
3508	return error;
3509	}
3510
3511	/ Lock one inode and make sure nobody got in and leased it. /
3512	inode_lock(inode: src);
3513	error = break_layout(inode: src, wait: false);
3514	if (error) {
3515	inode_unlock(inode: src);
3516	if (error == -EWOULDBLOCK)
3517	goto retry;
3518	return error;
3519	}
3520
3521	if (src == dest)
3522	return `0`;
3523
3524	/ Lock the other inode and make sure nobody got in and leased it. /
3525	inode_lock_nested(inode: dest, subclass: I_MUTEX_NONDIR2);
3526	error = break_layout(inode: dest, wait: false);
3527	if (error) {
3528	inode_unlock(inode: src);
3529	inode_unlock(inode: dest);
3530	if (error == -EWOULDBLOCK)
3531	goto retry;
3532	return error;
3533	}
3534
3535	return `0`;
3536	}
3537
3538	static int
3539	xfs_mmaplock_two_inodes_and_break_dax_layout(
3540	struct xfs_inode *ip1,
3541	struct xfs_inode *ip2)
3542	{
3543	int error;
3544	bool retry;
3545	struct page *page;
3546
3547	if (ip1->i_ino > ip2->i_ino)
3548	swap(ip1, ip2);
3549
3550	again:
3551	retry = false;
3552	/ Lock the first inode /
3553	xfs_ilock(ip: ip1, XFS_MMAPLOCK_EXCL);
3554	error = xfs_break_dax_layouts(inode: VFS_I(ip: ip1), retry: &retry);
3555	if (error \|\| retry) {
3556	xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL);
3557	if (error == `0` && retry)
3558	goto again;
3559	return error;
3560	}
3561
3562	if (ip1 == ip2)
3563	return `0`;
3564
3565	/ Nested lock the second inode /
3566	xfs_ilock(ip: ip2, lock_flags: xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, subclass: `1`));
3567	/*
3568	* We cannot use xfs_break_dax_layouts() directly here because it may
3569	* need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
3570	* for this nested lock case.
3571	*/
3572	page = dax_layout_busy_page(mapping: VFS_I(ip: ip2)->i_mapping);
3573	if (page && page_ref_count(page) != `1`) {
3574	xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL);
3575	xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL);
3576	goto again;
3577	}
3578
3579	return `0`;
3580	}
3581
3582	/*
3583	* Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3584	* mmap activity.
3585	*/
3586	int
3587	xfs_ilock2_io_mmap(
3588	struct xfs_inode *ip1,
3589	struct xfs_inode *ip2)
3590	{
3591	int ret;
3592
3593	ret = xfs_iolock_two_inodes_and_break_layout(src: VFS_I(ip: ip1), dest: VFS_I(ip: ip2));
3594	if (ret)
3595	return ret;
3596
3597	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3598	ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
3599	if (ret) {
3600	inode_unlock(inode: VFS_I(ip: ip2));
3601	if (ip1 != ip2)
3602	inode_unlock(inode: VFS_I(ip: ip1));
3603	return ret;
3604	}
3605	} else
3606	filemap_invalidate_lock_two(mapping1: VFS_I(ip: ip1)->i_mapping,
3607	mapping2: VFS_I(ip: ip2)->i_mapping);
3608
3609	return `0`;
3610	}
3611
3612	/ Unlock both inodes to allow IO and mmap activity. /
3613	void
3614	xfs_iunlock2_io_mmap(
3615	struct xfs_inode *ip1,
3616	struct xfs_inode *ip2)
3617	{
3618	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
3619	xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL);
3620	if (ip1 != ip2)
3621	xfs_iunlock(ip: ip1, XFS_MMAPLOCK_EXCL);
3622	} else
3623	filemap_invalidate_unlock_two(mapping1: VFS_I(ip: ip1)->i_mapping,
3624	mapping2: VFS_I(ip: ip2)->i_mapping);
3625
3626	inode_unlock(inode: VFS_I(ip: ip2));
3627	if (ip1 != ip2)
3628	inode_unlock(inode: VFS_I(ip: ip1));
3629	}
3630
3631	/ Drop the MMAPLOCK and the IOLOCK after a remap completes. /
3632	void
3633	xfs_iunlock2_remapping(
3634	struct xfs_inode *ip1,
3635	struct xfs_inode *ip2)
3636	{
3637	xfs_iflags_clear(ip: ip1, XFS_IREMAPPING);
3638
3639	if (ip1 != ip2)
3640	xfs_iunlock(ip: ip1, XFS_MMAPLOCK_SHARED);
3641	xfs_iunlock(ip: ip2, XFS_MMAPLOCK_EXCL);
3642
3643	if (ip1 != ip2)
3644	inode_unlock_shared(inode: VFS_I(ip: ip1));
3645	inode_unlock(inode: VFS_I(ip: ip2));
3646	}
3647
3648	/*
3649	* Reload the incore inode list for this inode. Caller should ensure that
3650	* the link count cannot change, either by taking ILOCK_SHARED or otherwise
3651	* preventing other threads from executing.
3652	*/
3653	int
3654	xfs_inode_reload_unlinked_bucket(
3655	struct xfs_trans *tp,
3656	struct xfs_inode *ip)
3657	{
3658	struct xfs_mount *mp = tp->t_mountp;
3659	struct xfs_buf *agibp;
3660	struct xfs_agi *agi;
3661	struct xfs_perag *pag;
3662	xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
3663	xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
3664	xfs_agino_t prev_agino, next_agino;
3665	unsigned int bucket;
3666	bool foundit = false;
3667	int error;
3668
3669	/ Grab the first inode in the list /
3670	pag = xfs_perag_get(mp, agno);
3671	error = xfs_ialloc_read_agi(pag, tp, &agibp);
3672	xfs_perag_put(pag);
3673	if (error)
3674	return error;
3675
3676	/*
3677	* We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
3678	* incore unlinked list pointers for this inode. Check once more to
3679	* see if we raced with anyone else to reload the unlinked list.
3680	*/
3681	if (!xfs_inode_unlinked_incomplete(ip)) {
3682	foundit = true;
3683	goto out_agibp;
3684	}
3685
3686	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
3687	agi = agibp->b_addr;
3688
3689	trace_xfs_inode_reload_unlinked_bucket(ip);
3690
3691	xfs_info_ratelimited(mp,
3692	"Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
3693	agino, agno);
3694
3695	prev_agino = NULLAGINO;
3696	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3697	while (next_agino != NULLAGINO) {
3698	struct xfs_inode *next_ip = NULL;
3699
3700	/ Found this caller's inode, set its backlink. /
3701	if (next_agino == agino) {
3702	next_ip = ip;
3703	next_ip->i_prev_unlinked = prev_agino;
3704	foundit = true;
3705	goto next_inode;
3706	}
3707
3708	/ Try in-memory lookup first. /
3709	next_ip = xfs_iunlink_lookup(pag, next_agino);
3710	if (next_ip)
3711	goto next_inode;
3712
3713	/ Inode not in memory, try reloading it. /
3714	error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
3715	next_agino);
3716	if (error)
3717	break;
3718
3719	/ Grab the reloaded inode. /
3720	next_ip = xfs_iunlink_lookup(pag, next_agino);
3721	if (!next_ip) {
3722	/ No incore inode at all? We reloaded it... /
3723	ASSERT(next_ip != NULL);
3724	error = -EFSCORRUPTED;
3725	break;
3726	}
3727
3728	next_inode:
3729	prev_agino = next_agino;
3730	next_agino = next_ip->i_next_unlinked;
3731	}
3732
3733	out_agibp:
3734	xfs_trans_brelse(tp, agibp);
3735	/ Should have found this inode somewhere in the iunlinked bucket. /
3736	if (!error && !foundit)
3737	error = -EFSCORRUPTED;
3738	return error;
3739	}
3740
3741	/ Decide if this inode is missing its unlinked list and reload it. /
3742	int
3743	xfs_inode_reload_unlinked(
3744	struct xfs_inode *ip)
3745	{
3746	struct xfs_trans *tp;
3747	int error;
3748
3749	error = xfs_trans_alloc_empty(mp: ip->i_mount, tpp: &tp);
3750	if (error)
3751	return error;
3752
3753	xfs_ilock(ip, XFS_ILOCK_SHARED);
3754	if (xfs_inode_unlinked_incomplete(ip))
3755	error = xfs_inode_reload_unlinked_bucket(tp, ip);
3756	xfs_iunlock(ip, XFS_ILOCK_SHARED);
3757	xfs_trans_cancel(tp);
3758
3759	return error;
3760	}
3761

source code of linux/fs/xfs/xfs_inode.c