xfs_discard.c source code [linux/fs/xfs/xfs_discard.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 2010, 2023 Red Hat, Inc.
4	* All Rights Reserved.
5	*/
6	#include "xfs.h"
7	#include "xfs_shared.h"
8	#include "xfs_format.h"
9	#include "xfs_log_format.h"
10	#include "xfs_trans_resv.h"
11	#include "xfs_mount.h"
12	#include "xfs_btree.h"
13	#include "xfs_alloc_btree.h"
14	#include "xfs_alloc.h"
15	#include "xfs_discard.h"
16	#include "xfs_error.h"
17	#include "xfs_extent_busy.h"
18	#include "xfs_trace.h"
19	#include "xfs_log.h"
20	#include "xfs_ag.h"
21
22	/*
23	* Notes on an efficient, low latency fstrim algorithm
24	*
25	* We need to walk the filesystem free space and issue discards on the free
26	* space that meet the search criteria (size and location). We cannot issue
27	* discards on extents that might be in use, or are so recently in use they are
28	* still marked as busy. To serialise against extent state changes whilst we are
29	* gathering extents to trim, we must hold the AGF lock to lock out other
30	* allocations and extent free operations that might change extent state.
31	*
32	* However, we cannot just hold the AGF for the entire AG free space walk whilst
33	* we issue discards on each free space that is found. Storage devices can have
34	* extremely slow discard implementations (e.g. ceph RBD) and so walking a
35	* couple of million free extents and issuing synchronous discards on each
36	* extent can take a long time. Whilst we are doing this walk, nothing else
37	* can access the AGF, and we can stall transactions and hence the log whilst
38	* modifications wait for the AGF lock to be released. This can lead to hung tasks
39	* kicking the hung task timer and rebooting the system. This is bad.
40	*
41	* Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
42	* lock, gathers a range of inode cluster buffers that are allocated, drops the
43	* AGI lock and then reads all the inode cluster buffers and processes them. It
44	* loops doing this, using a cursor to keep track of where it is up to in the AG
45	* for each iteration to restart the INOBT lookup from.
46	*
47	* We can't do this exactly with free space - once we drop the AGF lock, the
48	* state of the free extent is out of our control and we cannot run a discard
49	* safely on it in this situation. Unless, of course, we've marked the free
50	* extent as busy and undergoing a discard operation whilst we held the AGF
51	* locked.
52	*
53	* This is exactly how online discard works - free extents are marked busy when
54	* they are freed, and once the extent free has been committed to the journal,
55	* the busy extent record is marked as "undergoing discard" and the discard is
56	* then issued on the free extent. Once the discard completes, the busy extent
57	* record is removed and the extent is able to be allocated again.
58	*
59	* In the context of fstrim, if we find a free extent we need to discard, we
60	* don't have to discard it immediately. All we need to do is record that free
61	* extent as being busy and under discard, and all the allocation routines will
62	* now avoid trying to allocate it. Hence if we mark the extent as busy under
63	* the AGF lock, we can safely discard it without holding the AGF lock because
64	* nothing will attempt to allocate that free space until the discard completes.
65	*
66	* This also allows us to issue discards asynchronously like we do with online
67	* discard, and so for fast devices fstrim will run much faster as we can have
68	* multiple discard operations in flight at once, as well as pipeline the free
69	* extent search so that it overlaps in flight discard IO.
70	*/
71
72	struct workqueue_struct *xfs_discard_wq;
73
74	static void
75	xfs_discard_endio_work(
76	struct work_struct *work)
77	{
78	struct xfs_busy_extents *extents =
79	container_of(work, struct xfs_busy_extents, endio_work);
80
81	xfs_extent_busy_clear(mp: extents->mount, list: &extents->extent_list, do_discard: false);
82	kmem_free(ptr: extents->owner);
83	}
84
85	/*
86	* Queue up the actual completion to a thread to avoid IRQ-safe locking for
87	* pagb_lock.
88	*/
89	static void
90	xfs_discard_endio(
91	struct bio *bio)
92	{
93	struct xfs_busy_extents *extents = bio->bi_private;
94
95	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
96	queue_work(wq: xfs_discard_wq, work: &extents->endio_work);
97	bio_put(bio);
98	}
99
100	/*
101	* Walk the discard list and issue discards on all the busy extents in the
102	* list. We plug and chain the bios so that we only need a single completion
103	* call to clear all the busy extents once the discards are complete.
104	*/
105	int
106	xfs_discard_extents(
107	struct xfs_mount *mp,
108	struct xfs_busy_extents *extents)
109	{
110	struct xfs_extent_busy *busyp;
111	struct bio *bio = NULL;
112	struct blk_plug plug;
113	int error = `0`;
114
115	blk_start_plug(&plug);
116	list_for_each_entry(busyp, &extents->extent_list, list) {
117	trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
118	busyp->length);
119
120	error = __blkdev_issue_discard(bdev: mp->m_ddev_targp->bt_bdev,
121	sector: XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
122	nr_sects: XFS_FSB_TO_BB(mp, busyp->length),
123	GFP_NOFS, biop: &bio);
124	if (error && error != -EOPNOTSUPP) {
125	xfs_info(mp,
126	"discard failed for extent [0x%llx,%u], error %d",
127	(unsigned long long)busyp->bno,
128	busyp->length,
129	error);
130	break;
131	}
132	}
133
134	if (bio) {
135	bio->bi_private = extents;
136	bio->bi_end_io = xfs_discard_endio;
137	submit_bio(bio);
138	} else {
139	xfs_discard_endio_work(work: &extents->endio_work);
140	}
141	blk_finish_plug(&plug);
142
143	return error;
144	}
145
146
147	static int
148	xfs_trim_gather_extents(
149	struct xfs_perag *pag,
150	xfs_daddr_t start,
151	xfs_daddr_t end,
152	xfs_daddr_t minlen,
153	struct xfs_alloc_rec_incore *tcur,
154	struct xfs_busy_extents *extents,
155	uint64_t *blocks_trimmed)
156	{
157	struct xfs_mount *mp = pag->pag_mount;
158	struct xfs_btree_cur *cur;
159	struct xfs_buf *agbp;
160	int error;
161	int i;
162	int batch = `100`;
163
164	/*
165	* Force out the log. This means any transactions that might have freed
166	* space before we take the AGF buffer lock are now on disk, and the
167	* volatile disk cache is flushed.
168	*/
169	xfs_log_force(mp, XFS_LOG_SYNC);
170
171	error = xfs_alloc_read_agf(pag, NULL, `0`, &agbp);
172	if (error)
173	return error;
174
175	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
176
177	/*
178	* Look up the extent length requested in the AGF and start with it.
179	*/
180	if (tcur->ar_startblock == NULLAGBLOCK)
181	error = xfs_alloc_lookup_ge(cur, `0`, tcur->ar_blockcount, &i);
182	else
183	error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
184	tcur->ar_blockcount, &i);
185	if (error)
186	goto out_del_cursor;
187	if (i == `0`) {
188	/ nothing of that length left in the AG, we are done /
189	tcur->ar_blockcount = `0`;
190	goto out_del_cursor;
191	}
192
193	/*
194	* Loop until we are done with all extents that are large
195	* enough to be worth discarding or we hit batch limits.
196	*/
197	while (i) {
198	xfs_agblock_t fbno;
199	xfs_extlen_t flen;
200	xfs_daddr_t dbno;
201	xfs_extlen_t dlen;
202
203	error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
204	if (error)
205	break;
206	if (XFS_IS_CORRUPT(mp, i != `1`)) {
207	error = -EFSCORRUPTED;
208	break;
209	}
210
211	if (--batch <= `0`) {
212	/*
213	* Update the cursor to point at this extent so we
214	* restart the next batch from this extent.
215	*/
216	tcur->ar_startblock = fbno;
217	tcur->ar_blockcount = flen;
218	break;
219	}
220
221	/*
222	* use daddr format for all range/len calculations as that is
223	* the format the range/len variables are supplied in by
224	* userspace.
225	*/
226	dbno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, fbno);
227	dlen = XFS_FSB_TO_BB(mp, flen);
228
229	/*
230	* Too small? Give up.
231	*/
232	if (dlen < minlen) {
233	trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
234	tcur->ar_blockcount = `0`;
235	break;
236	}
237
238	/*
239	* If the extent is entirely outside of the range we are
240	* supposed to discard skip it. Do not bother to trim
241	* down partially overlapping ranges for now.
242	*/
243	if (dbno + dlen < start \|\| dbno > end) {
244	trace_xfs_discard_exclude(mp, pag->pag_agno, fbno, flen);
245	goto next_extent;
246	}
247
248	/*
249	* If any blocks in the range are still busy, skip the
250	* discard and try again the next time.
251	*/
252	if (xfs_extent_busy_search(mp, pag, fbno, flen)) {
253	trace_xfs_discard_busy(mp, pag->pag_agno, fbno, flen);
254	goto next_extent;
255	}
256
257	xfs_extent_busy_insert_discard(pag, fbno, flen,
258	&extents->extent_list);
259	*blocks_trimmed += flen;
260	next_extent:
261	error = xfs_btree_decrement(cur, `0`, &i);
262	if (error)
263	break;
264
265	/*
266	* If there's no more records in the tree, we are done. Set the
267	* cursor block count to 0 to indicate to the caller that there
268	* is no more extents to search.
269	*/
270	if (i == `0`)
271	tcur->ar_blockcount = `0`;
272	}
273
274	/*
275	* If there was an error, release all the gathered busy extents because
276	* we aren't going to issue a discard on them any more.
277	*/
278	if (error)
279	xfs_extent_busy_clear(mp, list: &extents->extent_list, do_discard: false);
280	out_del_cursor:
281	xfs_btree_del_cursor(cur, error);
282	xfs_buf_relse(bp: agbp);
283	return error;
284	}
285
286	static bool
287	xfs_trim_should_stop(void)
288	{
289	return fatal_signal_pending(current) \|\| freezing(current);
290	}
291
292	/*
293	* Iterate the free list gathering extents and discarding them. We need a cursor
294	* for the repeated iteration of gather/discard loop, so use the longest extent
295	* we found in the last batch as the key to start the next.
296	*/
297	static int
298	xfs_trim_extents(
299	struct xfs_perag *pag,
300	xfs_daddr_t start,
301	xfs_daddr_t end,
302	xfs_daddr_t minlen,
303	uint64_t *blocks_trimmed)
304	{
305	struct xfs_alloc_rec_incore tcur = {
306	.ar_blockcount = pag->pagf_longest,
307	.ar_startblock = NULLAGBLOCK,
308	};
309	int error = `0`;
310
311	do {
312	struct xfs_busy_extents *extents;
313
314	extents = kzalloc(size: sizeof(*extents), GFP_KERNEL);
315	if (!extents) {
316	error = -ENOMEM;
317	break;
318	}
319
320	extents->mount = pag->pag_mount;
321	extents->owner = extents;
322	INIT_LIST_HEAD(list: &extents->extent_list);
323
324	error = xfs_trim_gather_extents(pag, start, end, minlen,
325	tcur: &tcur, extents, blocks_trimmed);
326	if (error) {
327	kfree(objp: extents);
328	break;
329	}
330
331	/*
332	* We hand the extent list to the discard function here so the
333	* discarded extents can be removed from the busy extent list.
334	* This allows the discards to run asynchronously with gathering
335	* the next round of extents to discard.
336	*
337	* However, we must ensure that we do not reference the extent
338	* list after this function call, as it may have been freed by
339	* the time control returns to us.
340	*/
341	error = xfs_discard_extents(mp: pag->pag_mount, extents);
342	if (error)
343	break;
344
345	if (xfs_trim_should_stop())
346	break;
347
348	} while (tcur.ar_blockcount != `0`);
349
350	return error;
351
352	}
353
354	/*
355	* trim a range of the filesystem.
356	*
357	* Note: the parameters passed from userspace are byte ranges into the
358	* filesystem which does not match to the format we use for filesystem block
359	* addressing. FSB addressing is sparse (AGNO\|AGBNO), while the incoming format
360	* is a linear address range. Hence we need to use DADDR based conversions and
361	* comparisons for determining the correct offset and regions to trim.
362	*/
363	int
364	xfs_ioc_trim(
365	struct xfs_mount *mp,
366	struct fstrim_range __user *urange)
367	{
368	struct xfs_perag *pag;
369	unsigned int granularity =
370	bdev_discard_granularity(bdev: mp->m_ddev_targp->bt_bdev);
371	struct fstrim_range range;
372	xfs_daddr_t start, end, minlen;
373	xfs_agnumber_t agno;
374	uint64_t blocks_trimmed = `0`;
375	int error, last_error = `0`;
376
377	if (!capable(CAP_SYS_ADMIN))
378	return -EPERM;
379	if (!bdev_max_discard_sectors(bdev: mp->m_ddev_targp->bt_bdev))
380	return -EOPNOTSUPP;
381
382	/*
383	* We haven't recovered the log, so we cannot use our bnobt-guided
384	* storage zapping commands.
385	*/
386	if (xfs_has_norecovery(mp))
387	return -EROFS;
388
389	if (copy_from_user(to: &range, from: urange, n: sizeof(range)))
390	return -EFAULT;
391
392	range.minlen = max_t(u64, granularity, range.minlen);
393	minlen = BTOBB(range.minlen);
394	/*
395	* Truncating down the len isn't actually quite correct, but using
396	* BBTOB would mean we trivially get overflows for values
397	* of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default
398	* used by the fstrim application. In the end it really doesn't
399	* matter as trimming blocks is an advisory interface.
400	*/
401	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) \|\|
402	range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) \|\|
403	range.len < mp->m_sb.sb_blocksize)
404	return -EINVAL;
405
406	start = BTOBB(range.start);
407	end = start + BTOBBT(range.len) - `1`;
408
409	if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - `1`)
410	end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - `1`;
411
412	agno = xfs_daddr_to_agno(mp, start);
413	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
414	error = xfs_trim_extents(pag, start, end, minlen,
415	&blocks_trimmed);
416	if (error)
417	last_error = error;
418
419	if (xfs_trim_should_stop()) {
420	xfs_perag_rele(pag);
421	break;
422	}
423	}
424
425	if (last_error)
426	return last_error;
427
428	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
429	if (copy_to_user(to: urange, from: &range, n: sizeof(range)))
430	return -EFAULT;
431	return `0`;
432	}
433

source code of linux/fs/xfs/xfs_discard.c