gc.c source code [linux/fs/ubifs/gc.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* This file is part of UBIFS.
4	*
5	* Copyright (C) 2006-2008 Nokia Corporation.
6	*
7	* Authors: Adrian Hunter
8	* Artem Bityutskiy (Битюцкий Артём)
9	*/
10
11	/*
12	* This file implements garbage collection. The procedure for garbage collection
13	* is different depending on whether a LEB as an index LEB (contains index
14	* nodes) or not. For non-index LEBs, garbage collection finds a LEB which
15	* contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
16	* nodes to the journal, at which point the garbage-collected LEB is free to be
17	* reused. For index LEBs, garbage collection marks the non-obsolete index nodes
18	* dirty in the TNC, and after the next commit, the garbage-collected LEB is
19	* to be reused. Garbage collection will cause the number of dirty index nodes
20	* to grow, however sufficient space is reserved for the index to ensure the
21	* commit will never run out of space.
22	*
23	* Notes about dead watermark. At current UBIFS implementation we assume that
24	* LEBs which have less than @c->dead_wm bytes of free + dirty space are full
25	* and not worth garbage-collecting. The dead watermark is one min. I/O unit
26	* size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
27	* Garbage Collector has to synchronize the GC head's write buffer before
28	* returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
29	* actually reclaim even very small pieces of dirty space by garbage collecting
30	* enough dirty LEBs, but we do not bother doing this at this implementation.
31	*
32	* Notes about dark watermark. The results of GC work depends on how big are
33	* the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
34	* if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
35	* have to waste large pieces of free space at the end of LEB B, because nodes
36	* from LEB A would not fit. And the worst situation is when all nodes are of
37	* maximum size. So dark watermark is the amount of free + dirty space in LEB
38	* which are guaranteed to be reclaimable. If LEB has less space, the GC might
39	* be unable to reclaim it. So, LEBs with free + dirty greater than dark
40	* watermark are "good" LEBs from GC's point of view. The other LEBs are not so
41	* good, and GC takes extra care when moving them.
42	*/
43
44	#include <linux/slab.h>
45	#include <linux/pagemap.h>
46	#include <linux/list_sort.h>
47	#include "ubifs.h"
48
49	/*
50	* GC may need to move more than one LEB to make progress. The below constants
51	* define "soft" and "hard" limits on the number of LEBs the garbage collector
52	* may move.
53	*/
54	#define SOFT_LEBS_LIMIT 4
55	#define HARD_LEBS_LIMIT 32
56
57	/**
58	* switch_gc_head - switch the garbage collection journal head.
59	* @c: UBIFS file-system description object
60	*
61	* This function switch the GC head to the next LEB which is reserved in
62	* @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
63	* and other negative error code in case of failures.
64	*/
65	static int switch_gc_head(struct ubifs_info *c)
66	{
67	int err, gc_lnum = c->gc_lnum;
68	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
69
70	ubifs_assert(c, gc_lnum != -`1`);
71	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
72	wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
73	c->leb_size - wbuf->offs - wbuf->used);
74
75	err = ubifs_wbuf_sync_nolock(wbuf);
76	if (err)
77	return err;
78
79	/*
80	* The GC write-buffer was synchronized, we may safely unmap
81	* 'c->gc_lnum'.
82	*/
83	err = ubifs_leb_unmap(c, lnum: gc_lnum);
84	if (err)
85	return err;
86
87	err = ubifs_add_bud_to_log(c, GCHD, lnum: gc_lnum, offs: `0`);
88	if (err)
89	return err;
90
91	c->gc_lnum = -`1`;
92	err = ubifs_wbuf_seek_nolock(wbuf, lnum: gc_lnum, offs: `0`);
93	return err;
94	}
95
96	/**
97	* data_nodes_cmp - compare 2 data nodes.
98	* @priv: UBIFS file-system description object
99	* @a: first data node
100	* @b: second data node
101	*
102	* This function compares data nodes @a and @b. Returns %1 if @a has greater
103	* inode or block number, and %-1 otherwise.
104	*/
105	static int data_nodes_cmp(void priv, const* struct list_head *a,
106	const struct list_head *b)
107	{
108	ino_t inuma, inumb;
109	struct ubifs_info *c = priv;
110	struct ubifs_scan_node sa, sb;
111
112	cond_resched();
113	if (a == b)
114	return `0`;
115
116	sa = list_entry(a, struct ubifs_scan_node, list);
117	sb = list_entry(b, struct ubifs_scan_node, list);
118
119	ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DATA_KEY);
120	ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DATA_KEY);
121	ubifs_assert(c, sa->type == UBIFS_DATA_NODE);
122	ubifs_assert(c, sb->type == UBIFS_DATA_NODE);
123
124	inuma = key_inum(c, k: &sa->key);
125	inumb = key_inum(c, k: &sb->key);
126
127	if (inuma == inumb) {
128	unsigned int blka = key_block(c, key: &sa->key);
129	unsigned int blkb = key_block(c, key: &sb->key);
130
131	if (blka <= blkb)
132	return -`1`;
133	} else if (inuma <= inumb)
134	return -`1`;
135
136	return `1`;
137	}
138
139	/*
140	* nondata_nodes_cmp - compare 2 non-data nodes.
141	* @priv: UBIFS file-system description object
142	* @a: first node
143	* @a: second node
144	*
145	* This function compares nodes @a and @b. It makes sure that inode nodes go
146	* first and sorted by length in descending order. Directory entry nodes go
147	* after inode nodes and are sorted in ascending hash valuer order.
148	*/
149	static int nondata_nodes_cmp(void priv, const* struct list_head *a,
150	const struct list_head *b)
151	{
152	ino_t inuma, inumb;
153	struct ubifs_info *c = priv;
154	struct ubifs_scan_node sa, sb;
155
156	cond_resched();
157	if (a == b)
158	return `0`;
159
160	sa = list_entry(a, struct ubifs_scan_node, list);
161	sb = list_entry(b, struct ubifs_scan_node, list);
162
163	ubifs_assert(c, key_type(c, &sa->key) != UBIFS_DATA_KEY &&
164	key_type(c, &sb->key) != UBIFS_DATA_KEY);
165	ubifs_assert(c, sa->type != UBIFS_DATA_NODE &&
166	sb->type != UBIFS_DATA_NODE);
167
168	/ Inodes go before directory entries /
169	if (sa->type == UBIFS_INO_NODE) {
170	if (sb->type == UBIFS_INO_NODE)
171	return sb->len - sa->len;
172	return -`1`;
173	}
174	if (sb->type == UBIFS_INO_NODE)
175	return `1`;
176
177	ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DENT_KEY \|\|
178	key_type(c, &sa->key) == UBIFS_XENT_KEY);
179	ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DENT_KEY \|\|
180	key_type(c, &sb->key) == UBIFS_XENT_KEY);
181	ubifs_assert(c, sa->type == UBIFS_DENT_NODE \|\|
182	sa->type == UBIFS_XENT_NODE);
183	ubifs_assert(c, sb->type == UBIFS_DENT_NODE \|\|
184	sb->type == UBIFS_XENT_NODE);
185
186	inuma = key_inum(c, k: &sa->key);
187	inumb = key_inum(c, k: &sb->key);
188
189	if (inuma == inumb) {
190	uint32_t hasha = key_hash(c, key: &sa->key);
191	uint32_t hashb = key_hash(c, key: &sb->key);
192
193	if (hasha <= hashb)
194	return -`1`;
195	} else if (inuma <= inumb)
196	return -`1`;
197
198	return `1`;
199	}
200
201	/**
202	* sort_nodes - sort nodes for GC.
203	* @c: UBIFS file-system description object
204	* @sleb: describes nodes to sort and contains the result on exit
205	* @nondata: contains non-data nodes on exit
206	* @min: minimum node size is returned here
207	*
208	* This function sorts the list of inodes to garbage collect. First of all, it
209	* kills obsolete nodes and separates data and non-data nodes to the
210	* @sleb->nodes and @nondata lists correspondingly.
211	*
212	* Data nodes are then sorted in block number order - this is important for
213	* bulk-read; data nodes with lower inode number go before data nodes with
214	* higher inode number, and data nodes with lower block number go before data
215	* nodes with higher block number;
216	*
217	* Non-data nodes are sorted as follows.
218	* o First go inode nodes - they are sorted in descending length order.
219	* o Then go directory entry nodes - they are sorted in hash order, which
220	* should supposedly optimize 'readdir()'. Direntry nodes with lower parent
221	* inode number go before direntry nodes with higher parent inode number,
222	* and direntry nodes with lower name hash values go before direntry nodes
223	* with higher name hash values.
224	*
225	* This function returns zero in case of success and a negative error code in
226	* case of failure.
227	*/
228	static int sort_nodes(struct ubifs_info c, struct* ubifs_scan_leb *sleb,
229	struct list_head nondata, int* *min)
230	{
231	int err;
232	struct ubifs_scan_node snod, tmp;
233
234	*min = INT_MAX;
235
236	/ Separate data nodes and non-data nodes /
237	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
238	ubifs_assert(c, snod->type == UBIFS_INO_NODE \|\|
239	snod->type == UBIFS_DATA_NODE \|\|
240	snod->type == UBIFS_DENT_NODE \|\|
241	snod->type == UBIFS_XENT_NODE \|\|
242	snod->type == UBIFS_TRUN_NODE \|\|
243	snod->type == UBIFS_AUTH_NODE);
244
245	if (snod->type != UBIFS_INO_NODE &&
246	snod->type != UBIFS_DATA_NODE &&
247	snod->type != UBIFS_DENT_NODE &&
248	snod->type != UBIFS_XENT_NODE) {
249	/ Probably truncation node, zap it /
250	list_del(entry: &snod->list);
251	kfree(objp: snod);
252	continue;
253	}
254
255	ubifs_assert(c, key_type(c, &snod->key) == UBIFS_DATA_KEY \|\|
256	key_type(c, &snod->key) == UBIFS_INO_KEY \|\|
257	key_type(c, &snod->key) == UBIFS_DENT_KEY \|\|
258	key_type(c, &snod->key) == UBIFS_XENT_KEY);
259
260	err = ubifs_tnc_has_node(c, key: &snod->key, level: `0`, lnum: sleb->lnum,
261	offs: snod->offs, is_idx: `0`);
262	if (err < `0`)
263	return err;
264
265	if (!err) {
266	/ The node is obsolete, remove it from the list /
267	list_del(entry: &snod->list);
268	kfree(objp: snod);
269	continue;
270	}
271
272	if (snod->len < *min)
273	*min = snod->len;
274
275	if (key_type(c, key: &snod->key) != UBIFS_DATA_KEY)
276	list_move_tail(list: &snod->list, head: nondata);
277	}
278
279	/ Sort data and non-data nodes /
280	list_sort(priv: c, head: &sleb->nodes, cmp: &data_nodes_cmp);
281	list_sort(priv: c, head: nondata, cmp: &nondata_nodes_cmp);
282
283	err = dbg_check_data_nodes_order(c, head: &sleb->nodes);
284	if (err)
285	return err;
286	err = dbg_check_nondata_nodes_order(c, head: nondata);
287	if (err)
288	return err;
289	return `0`;
290	}
291
292	/**
293	* move_node - move a node.
294	* @c: UBIFS file-system description object
295	* @sleb: describes the LEB to move nodes from
296	* @snod: the mode to move
297	* @wbuf: write-buffer to move node to
298	*
299	* This function moves node @snod to @wbuf, changes TNC correspondingly, and
300	* destroys @snod. Returns zero in case of success and a negative error code in
301	* case of failure.
302	*/
303	static int move_node(struct ubifs_info c, struct* ubifs_scan_leb *sleb,
304	struct ubifs_scan_node snod, struct* ubifs_wbuf *wbuf)
305	{
306	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
307
308	cond_resched();
309	err = ubifs_wbuf_write_nolock(wbuf, buf: snod->node, len: snod->len);
310	if (err)
311	return err;
312
313	err = ubifs_tnc_replace(c, key: &snod->key, old_lnum: sleb->lnum,
314	old_offs: snod->offs, lnum: new_lnum, offs: new_offs,
315	len: snod->len);
316	list_del(entry: &snod->list);
317	kfree(objp: snod);
318	return err;
319	}
320
321	/**
322	* move_nodes - move nodes.
323	* @c: UBIFS file-system description object
324	* @sleb: describes the LEB to move nodes from
325	*
326	* This function moves valid nodes from data LEB described by @sleb to the GC
327	* journal head. This function returns zero in case of success, %-EAGAIN if
328	* commit is required, and other negative error codes in case of other
329	* failures.
330	*/
331	static int move_nodes(struct ubifs_info c, struct* ubifs_scan_leb *sleb)
332	{
333	int err, min;
334	LIST_HEAD(nondata);
335	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
336
337	if (wbuf->lnum == -`1`) {
338	/*
339	* The GC journal head is not set, because it is the first GC
340	* invocation since mount.
341	*/
342	err = switch_gc_head(c);
343	if (err)
344	return err;
345	}
346
347	err = sort_nodes(c, sleb, nondata: &nondata, min: &min);
348	if (err)
349	goto out;
350
351	/ Write nodes to their new location. Use the first-fit strategy /
352	while (`1`) {
353	int avail, moved = `0`;
354	struct ubifs_scan_node snod, tmp;
355
356	/ Move data nodes /
357	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
358	avail = c->leb_size - wbuf->offs - wbuf->used -
359	ubifs_auth_node_sz(c);
360	if (snod->len > avail)
361	/*
362	* Do not skip data nodes in order to optimize
363	* bulk-read.
364	*/
365	break;
366
367	err = ubifs_shash_update(c, desc: c->jheads[GCHD].log_hash,
368	buf: snod->node, len: snod->len);
369	if (err)
370	goto out;
371
372	err = move_node(c, sleb, snod, wbuf);
373	if (err)
374	goto out;
375	moved = `1`;
376	}
377
378	/ Move non-data nodes /
379	list_for_each_entry_safe(snod, tmp, &nondata, list) {
380	avail = c->leb_size - wbuf->offs - wbuf->used -
381	ubifs_auth_node_sz(c);
382	if (avail < min)
383	break;
384
385	if (snod->len > avail) {
386	/*
387	* Keep going only if this is an inode with
388	* some data. Otherwise stop and switch the GC
389	* head. IOW, we assume that data-less inode
390	* nodes and direntry nodes are roughly of the
391	* same size.
392	*/
393	if (key_type(c, key: &snod->key) == UBIFS_DENT_KEY \|\|
394	snod->len == UBIFS_INO_NODE_SZ)
395	break;
396	continue;
397	}
398
399	err = ubifs_shash_update(c, desc: c->jheads[GCHD].log_hash,
400	buf: snod->node, len: snod->len);
401	if (err)
402	goto out;
403
404	err = move_node(c, sleb, snod, wbuf);
405	if (err)
406	goto out;
407	moved = `1`;
408	}
409
410	if (ubifs_authenticated(c) && moved) {
411	struct ubifs_auth_node *auth;
412
413	auth = kmalloc(size: ubifs_auth_node_sz(c), GFP_NOFS);
414	if (!auth) {
415	err = -ENOMEM;
416	goto out;
417	}
418
419	err = ubifs_prepare_auth_node(c, node: auth,
420	inhash: c->jheads[GCHD].log_hash);
421	if (err) {
422	kfree(objp: auth);
423	goto out;
424	}
425
426	err = ubifs_wbuf_write_nolock(wbuf, buf: auth,
427	len: ubifs_auth_node_sz(c));
428	if (err) {
429	kfree(objp: auth);
430	goto out;
431	}
432
433	ubifs_add_dirt(c, lnum: wbuf->lnum, dirty: ubifs_auth_node_sz(c));
434	}
435
436	if (list_empty(head: &sleb->nodes) && list_empty(head: &nondata))
437	break;
438
439	/*
440	* Waste the rest of the space in the LEB and switch to the
441	* next LEB.
442	*/
443	err = switch_gc_head(c);
444	if (err)
445	goto out;
446	}
447
448	return `0`;
449
450	out:
451	list_splice_tail(list: &nondata, head: &sleb->nodes);
452	return err;
453	}
454
455	/**
456	* gc_sync_wbufs - sync write-buffers for GC.
457	* @c: UBIFS file-system description object
458	*
459	* We must guarantee that obsoleting nodes are on flash. Unfortunately they may
460	* be in a write-buffer instead. That is, a node could be written to a
461	* write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
462	* erased before the write-buffer is sync'd and then there is an unclean
463	* unmount, then an existing node is lost. To avoid this, we sync all
464	* write-buffers.
465	*
466	* This function returns %0 on success or a negative error code on failure.
467	*/
468	static int gc_sync_wbufs(struct ubifs_info *c)
469	{
470	int err, i;
471
472	for (i = `0`; i < c->jhead_cnt; i++) {
473	if (i == GCHD)
474	continue;
475	err = ubifs_wbuf_sync(wbuf: &c->jheads[i].wbuf);
476	if (err)
477	return err;
478	}
479	return `0`;
480	}
481
482	/**
483	* ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
484	* @c: UBIFS file-system description object
485	* @lp: describes the LEB to garbage collect
486	*
487	* This function garbage-collects an LEB and returns one of the @LEB_FREED,
488	* @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
489	* required, and other negative error codes in case of failures.
490	*/
491	int ubifs_garbage_collect_leb(struct ubifs_info c, struct* ubifs_lprops *lp)
492	{
493	struct ubifs_scan_leb *sleb;
494	struct ubifs_scan_node *snod;
495	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
496	int err = `0`, lnum = lp->lnum;
497
498	ubifs_assert(c, c->gc_lnum != -`1` \|\| wbuf->offs + wbuf->used == `0` \|\|
499	c->need_recovery);
500	ubifs_assert(c, c->gc_lnum != lnum);
501	ubifs_assert(c, wbuf->lnum != lnum);
502
503	if (lp->free + lp->dirty == c->leb_size) {
504	/ Special case - a free LEB /
505	dbg_gc("LEB %d is free, return it", lp->lnum);
506	ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
507
508	if (lp->free != c->leb_size) {
509	/*
510	* Write buffers must be sync'd before unmapping
511	* freeable LEBs, because one of them may contain data
512	* which obsoletes something in 'lp->lnum'.
513	*/
514	err = gc_sync_wbufs(c);
515	if (err)
516	return err;
517	err = ubifs_change_one_lp(c, lnum: lp->lnum, free: c->leb_size,
518	dirty: `0`, flags_set: `0`, flags_clean: `0`, idx_gc_cnt: `0`);
519	if (err)
520	return err;
521	}
522	err = ubifs_leb_unmap(c, lnum: lp->lnum);
523	if (err)
524	return err;
525
526	if (c->gc_lnum == -`1`) {
527	c->gc_lnum = lnum;
528	return LEB_RETAINED;
529	}
530
531	return LEB_FREED;
532	}
533
534	/*
535	* We scan the entire LEB even though we only really need to scan up to
536	* (c->leb_size - lp->free).
537	*/
538	sleb = ubifs_scan(c, lnum, offs: `0`, sbuf: c->sbuf, quiet: `0`);
539	if (IS_ERR(ptr: sleb))
540	return PTR_ERR(ptr: sleb);
541
542	ubifs_assert(c, !list_empty(&sleb->nodes));
543	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
544
545	if (snod->type == UBIFS_IDX_NODE) {
546	struct ubifs_gced_idx_leb *idx_gc;
547
548	dbg_gc("indexing LEB %d (free %d, dirty %d)",
549	lnum, lp->free, lp->dirty);
550	list_for_each_entry(snod, &sleb->nodes, list) {
551	struct ubifs_idx_node *idx = snod->node;
552	int level = le16_to_cpu(idx->level);
553
554	ubifs_assert(c, snod->type == UBIFS_IDX_NODE);
555	key_read(c, from: ubifs_idx_key(c, idx), to: &snod->key);
556	err = ubifs_dirty_idx_node(c, key: &snod->key, level, lnum,
557	offs: snod->offs);
558	if (err)
559	goto out;
560	}
561
562	idx_gc = kmalloc(size: sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
563	if (!idx_gc) {
564	err = -ENOMEM;
565	goto out;
566	}
567
568	idx_gc->lnum = lnum;
569	idx_gc->unmap = `0`;
570	list_add(new: &idx_gc->list, head: &c->idx_gc);
571
572	/*
573	* Don't release the LEB until after the next commit, because
574	* it may contain data which is needed for recovery. So
575	* although we freed this LEB, it will become usable only after
576	* the commit.
577	*/
578	err = ubifs_change_one_lp(c, lnum, free: c->leb_size, dirty: `0`, flags_set: `0`,
579	flags_clean: LPROPS_INDEX, idx_gc_cnt: `1`);
580	if (err)
581	goto out;
582	err = LEB_FREED_IDX;
583	} else {
584	dbg_gc("data LEB %d (free %d, dirty %d)",
585	lnum, lp->free, lp->dirty);
586
587	err = move_nodes(c, sleb);
588	if (err)
589	goto out_inc_seq;
590
591	err = gc_sync_wbufs(c);
592	if (err)
593	goto out_inc_seq;
594
595	err = ubifs_change_one_lp(c, lnum, free: c->leb_size, dirty: `0`, flags_set: `0`, flags_clean: `0`, idx_gc_cnt: `0`);
596	if (err)
597	goto out_inc_seq;
598
599	/ Allow for races with TNC /
600	c->gced_lnum = lnum;
601	smp_wmb();
602	c->gc_seq += `1`;
603	smp_wmb();
604
605	if (c->gc_lnum == -`1`) {
606	c->gc_lnum = lnum;
607	err = LEB_RETAINED;
608	} else {
609	err = ubifs_wbuf_sync_nolock(wbuf);
610	if (err)
611	goto out;
612
613	err = ubifs_leb_unmap(c, lnum);
614	if (err)
615	goto out;
616
617	err = LEB_FREED;
618	}
619	}
620
621	out:
622	ubifs_scan_destroy(sleb);
623	return err;
624
625	out_inc_seq:
626	/ We may have moved at least some nodes so allow for races with TNC /
627	c->gced_lnum = lnum;
628	smp_wmb();
629	c->gc_seq += `1`;
630	smp_wmb();
631	goto out;
632	}
633
634	/**
635	* ubifs_garbage_collect - UBIFS garbage collector.
636	* @c: UBIFS file-system description object
637	* @anyway: do GC even if there are free LEBs
638	*
639	* This function does out-of-place garbage collection. The return codes are:
640	* o positive LEB number if the LEB has been freed and may be used;
641	* o %-EAGAIN if the caller has to run commit;
642	* o %-ENOSPC if GC failed to make any progress;
643	* o other negative error codes in case of other errors.
644	*
645	* Garbage collector writes data to the journal when GC'ing data LEBs, and just
646	* marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
647	* commit may be required. But commit cannot be run from inside GC, because the
648	* caller might be holding the commit lock, so %-EAGAIN is returned instead;
649	* And this error code means that the caller has to run commit, and re-run GC
650	* if there is still no free space.
651	*
652	* There are many reasons why this function may return %-EAGAIN:
653	* o the log is full and there is no space to write an LEB reference for
654	* @c->gc_lnum;
655	* o the journal is too large and exceeds size limitations;
656	* o GC moved indexing LEBs, but they can be used only after the commit;
657	* o the shrinker fails to find clean znodes to free and requests the commit;
658	* o etc.
659	*
660	* Note, if the file-system is close to be full, this function may return
661	* %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
662	* the function. E.g., this happens if the limits on the journal size are too
663	* tough and GC writes too much to the journal before an LEB is freed. This
664	* might also mean that the journal is too large, and the TNC becomes to big,
665	* so that the shrinker is constantly called, finds not clean znodes to free,
666	* and requests commit. Well, this may also happen if the journal is all right,
667	* but another kernel process consumes too much memory. Anyway, infinite
668	* %-EAGAIN may happen, but in some extreme/misconfiguration cases.
669	*/
670	int ubifs_garbage_collect(struct ubifs_info c, int* anyway)
671	{
672	int i, err, ret, min_space = c->dead_wm;
673	struct ubifs_lprops lp;
674	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
675
676	ubifs_assert_cmt_locked(c);
677	ubifs_assert(c, !c->ro_media && !c->ro_mount);
678
679	if (ubifs_gc_should_commit(c))
680	return -EAGAIN;
681
682	mutex_lock_nested(lock: &wbuf->io_mutex, subclass: wbuf->jhead);
683
684	if (c->ro_error) {
685	ret = -EROFS;
686	goto out_unlock;
687	}
688
689	/ We expect the write-buffer to be empty on entry /
690	ubifs_assert(c, !wbuf->used);
691
692	for (i = `0`; ; i++) {
693	int space_before, space_after;
694
695	/ Maybe continue after find and break before find /
696	lp.lnum = -`1`;
697
698	cond_resched();
699
700	/ Give the commit an opportunity to run /
701	if (ubifs_gc_should_commit(c)) {
702	ret = -EAGAIN;
703	break;
704	}
705
706	if (i > SOFT_LEBS_LIMIT && !list_empty(head: &c->idx_gc)) {
707	/*
708	* We've done enough iterations. Indexing LEBs were
709	* moved and will be available after the commit.
710	*/
711	dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
712	ubifs_commit_required(c);
713	ret = -EAGAIN;
714	break;
715	}
716
717	if (i > HARD_LEBS_LIMIT) {
718	/*
719	* We've moved too many LEBs and have not made
720	* progress, give up.
721	*/
722	dbg_gc("hard limit, -ENOSPC");
723	ret = -ENOSPC;
724	break;
725	}
726
727	/*
728	* Empty and freeable LEBs can turn up while we waited for
729	* the wbuf lock, or while we have been running GC. In that
730	* case, we should just return one of those instead of
731	* continuing to GC dirty LEBs. Hence we request
732	* 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
733	*/
734	ret = ubifs_find_dirty_leb(c, ret_lp: &lp, min_space, pick_free: anyway ? `0` : `1`);
735	if (ret) {
736	if (ret == -ENOSPC)
737	dbg_gc("no more dirty LEBs");
738	break;
739	}
740
741	dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
742	lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
743	min_space);
744
745	space_before = c->leb_size - wbuf->offs - wbuf->used;
746	if (wbuf->lnum == -`1`)
747	space_before = `0`;
748
749	ret = ubifs_garbage_collect_leb(c, lp: &lp);
750	if (ret < `0`) {
751	if (ret == -EAGAIN) {
752	/*
753	* This is not error, so we have to return the
754	* LEB to lprops. But if 'ubifs_return_leb()'
755	* fails, its failure code is propagated to the
756	* caller instead of the original '-EAGAIN'.
757	*/
758	err = ubifs_return_leb(c, lnum: lp.lnum);
759	if (err) {
760	ret = err;
761	/*
762	* An LEB may always be "taken",
763	* so setting ubifs to read-only,
764	* and then executing sync wbuf will
765	* return -EROFS and enter the "out"
766	* error branch.
767	*/
768	ubifs_ro_mode(c, err: ret);
769	}
770	/ Maybe double return LEB if goto out /
771	lp.lnum = -`1`;
772	break;
773	}
774	goto out;
775	}
776
777	if (ret == LEB_FREED) {
778	/ An LEB has been freed and is ready for use /
779	dbg_gc("LEB %d freed, return", lp.lnum);
780	ret = lp.lnum;
781	break;
782	}
783
784	if (ret == LEB_FREED_IDX) {
785	/*
786	* This was an indexing LEB and it cannot be
787	* immediately used. And instead of requesting the
788	* commit straight away, we try to garbage collect some
789	* more.
790	*/
791	dbg_gc("indexing LEB %d freed, continue", lp.lnum);
792	continue;
793	}
794
795	ubifs_assert(c, ret == LEB_RETAINED);
796	space_after = c->leb_size - wbuf->offs - wbuf->used;
797	dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
798	space_after - space_before);
799
800	if (space_after > space_before) {
801	/ GC makes progress, keep working /
802	min_space >>= `1`;
803	if (min_space < c->dead_wm)
804	min_space = c->dead_wm;
805	continue;
806	}
807
808	dbg_gc("did not make progress");
809
810	/*
811	* GC moved an LEB bud have not done any progress. This means
812	* that the previous GC head LEB contained too few free space
813	* and the LEB which was GC'ed contained only large nodes which
814	* did not fit that space.
815	*
816	* We can do 2 things:
817	* 1. pick another LEB in a hope it'll contain a small node
818	* which will fit the space we have at the end of current GC
819	* head LEB, but there is no guarantee, so we try this out
820	* unless we have already been working for too long;
821	* 2. request an LEB with more dirty space, which will force
822	* 'ubifs_find_dirty_leb()' to start scanning the lprops
823	* table, instead of just picking one from the heap
824	* (previously it already picked the dirtiest LEB).
825	*/
826	if (i < SOFT_LEBS_LIMIT) {
827	dbg_gc("try again");
828	continue;
829	}
830
831	min_space <<= `1`;
832	if (min_space > c->dark_wm)
833	min_space = c->dark_wm;
834	dbg_gc("set min. space to %d", min_space);
835	}
836
837	if (ret == -ENOSPC && !list_empty(head: &c->idx_gc)) {
838	dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
839	ubifs_commit_required(c);
840	ret = -EAGAIN;
841	}
842
843	err = ubifs_wbuf_sync_nolock(wbuf);
844	if (!err)
845	err = ubifs_leb_unmap(c, lnum: c->gc_lnum);
846	if (err) {
847	ret = err;
848	goto out;
849	}
850	out_unlock:
851	mutex_unlock(lock: &wbuf->io_mutex);
852	return ret;
853
854	out:
855	ubifs_assert(c, ret < `0`);
856	ubifs_assert(c, ret != -ENOSPC && ret != -EAGAIN);
857	ubifs_wbuf_sync_nolock(wbuf);
858	ubifs_ro_mode(c, err: ret);
859	mutex_unlock(lock: &wbuf->io_mutex);
860	if (lp.lnum != -`1`)
861	ubifs_return_leb(c, lnum: lp.lnum);
862	return ret;
863	}
864
865	/**
866	* ubifs_gc_start_commit - garbage collection at start of commit.
867	* @c: UBIFS file-system description object
868	*
869	* If a LEB has only dirty and free space, then we may safely unmap it and make
870	* it free. Note, we cannot do this with indexing LEBs because dirty space may
871	* correspond index nodes that are required for recovery. In that case, the
872	* LEB cannot be unmapped until after the next commit.
873	*
874	* This function returns %0 upon success and a negative error code upon failure.
875	*/
876	int ubifs_gc_start_commit(struct ubifs_info *c)
877	{
878	struct ubifs_gced_idx_leb *idx_gc;
879	const struct ubifs_lprops *lp;
880	int err = `0`, flags;
881
882	ubifs_get_lprops(c);
883
884	/*
885	* Unmap (non-index) freeable LEBs. Note that recovery requires that all
886	* wbufs are sync'd before this, which is done in 'do_commit()'.
887	*/
888	while (`1`) {
889	lp = ubifs_fast_find_freeable(c);
890	if (!lp)
891	break;
892	ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
893	ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
894	err = ubifs_leb_unmap(c, lnum: lp->lnum);
895	if (err)
896	goto out;
897	lp = ubifs_change_lp(c, lp, free: c->leb_size, dirty: `0`, flags: lp->flags, idx_gc_cnt: `0`);
898	if (IS_ERR(ptr: lp)) {
899	err = PTR_ERR(ptr: lp);
900	goto out;
901	}
902	ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
903	ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
904	}
905
906	/ Mark GC'd index LEBs OK to unmap after this commit finishes /
907	list_for_each_entry(idx_gc, &c->idx_gc, list)
908	idx_gc->unmap = `1`;
909
910	/ Record index freeable LEBs for unmapping after commit /
911	while (`1`) {
912	lp = ubifs_fast_find_frdi_idx(c);
913	if (IS_ERR(ptr: lp)) {
914	err = PTR_ERR(ptr: lp);
915	goto out;
916	}
917	if (!lp)
918	break;
919	idx_gc = kmalloc(size: sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
920	if (!idx_gc) {
921	err = -ENOMEM;
922	goto out;
923	}
924	ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
925	ubifs_assert(c, lp->flags & LPROPS_INDEX);
926	/ Don't release the LEB until after the next commit /
927	flags = (lp->flags \| LPROPS_TAKEN) ^ LPROPS_INDEX;
928	lp = ubifs_change_lp(c, lp, free: c->leb_size, dirty: `0`, flags, idx_gc_cnt: `1`);
929	if (IS_ERR(ptr: lp)) {
930	err = PTR_ERR(ptr: lp);
931	kfree(objp: idx_gc);
932	goto out;
933	}
934	ubifs_assert(c, lp->flags & LPROPS_TAKEN);
935	ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
936	idx_gc->lnum = lp->lnum;
937	idx_gc->unmap = `1`;
938	list_add(new: &idx_gc->list, head: &c->idx_gc);
939	}
940	out:
941	ubifs_release_lprops(c);
942	return err;
943	}
944
945	/**
946	* ubifs_gc_end_commit - garbage collection at end of commit.
947	* @c: UBIFS file-system description object
948	*
949	* This function completes out-of-place garbage collection of index LEBs.
950	*/
951	int ubifs_gc_end_commit(struct ubifs_info *c)
952	{
953	struct ubifs_gced_idx_leb idx_gc, tmp;
954	struct ubifs_wbuf *wbuf;
955	int err = `0`;
956
957	wbuf = &c->jheads[GCHD].wbuf;
958	mutex_lock_nested(lock: &wbuf->io_mutex, subclass: wbuf->jhead);
959	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
960	if (idx_gc->unmap) {
961	dbg_gc("LEB %d", idx_gc->lnum);
962	err = ubifs_leb_unmap(c, lnum: idx_gc->lnum);
963	if (err)
964	goto out;
965	err = ubifs_change_one_lp(c, lnum: idx_gc->lnum, LPROPS_NC,
966	LPROPS_NC, flags_set: `0`, flags_clean: LPROPS_TAKEN, idx_gc_cnt: -`1`);
967	if (err)
968	goto out;
969	list_del(entry: &idx_gc->list);
970	kfree(objp: idx_gc);
971	}
972	out:
973	mutex_unlock(lock: &wbuf->io_mutex);
974	return err;
975	}
976
977	/**
978	* ubifs_destroy_idx_gc - destroy idx_gc list.
979	* @c: UBIFS file-system description object
980	*
981	* This function destroys the @c->idx_gc list. It is called when unmounting
982	* so locks are not needed. Returns zero in case of success and a negative
983	* error code in case of failure.
984	*/
985	void ubifs_destroy_idx_gc(struct ubifs_info *c)
986	{
987	while (!list_empty(head: &c->idx_gc)) {
988	struct ubifs_gced_idx_leb *idx_gc;
989
990	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
991	list);
992	c->idx_gc_cnt -= `1`;
993	list_del(entry: &idx_gc->list);
994	kfree(objp: idx_gc);
995	}
996	}
997
998	/**
999	* ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
1000	* @c: UBIFS file-system description object
1001	*
1002	* Called during start commit so locks are not needed.
1003	*/
1004	int ubifs_get_idx_gc_leb(struct ubifs_info *c)
1005	{
1006	struct ubifs_gced_idx_leb *idx_gc;
1007	int lnum;
1008
1009	if (list_empty(head: &c->idx_gc))
1010	return -ENOSPC;
1011	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
1012	lnum = idx_gc->lnum;
1013	/ c->idx_gc_cnt is updated by the caller when lprops are updated /
1014	list_del(entry: &idx_gc->list);
1015	kfree(objp: idx_gc);
1016	return lnum;
1017	}
1018

source code of linux/fs/ubifs/gc.c