1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * This file is part of UBIFS. |
4 | * |
5 | * Copyright (C) 2006-2008 Nokia Corporation. |
6 | * |
7 | * Authors: Artem Bityutskiy (Битюцкий Артём) |
8 | * Adrian Hunter |
9 | */ |
10 | |
11 | /* |
12 | * This file implements UBIFS shrinker which evicts clean znodes from the TNC |
13 | * tree when Linux VM needs more RAM. |
14 | * |
15 | * We do not implement any LRU lists to find oldest znodes to free because it |
16 | * would add additional overhead to the file system fast paths. So the shrinker |
17 | * just walks the TNC tree when searching for znodes to free. |
18 | * |
19 | * If the root of a TNC sub-tree is clean and old enough, then the children are |
20 | * also clean and old enough. So the shrinker walks the TNC in level order and |
21 | * dumps entire sub-trees. |
22 | * |
23 | * The age of znodes is just the time-stamp when they were last looked at. |
24 | * The current shrinker first tries to evict old znodes, then young ones. |
25 | * |
26 | * Since the shrinker is global, it has to protect against races with FS |
27 | * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. |
28 | */ |
29 | |
30 | #include "ubifs.h" |
31 | |
32 | /* List of all UBIFS file-system instances */ |
33 | LIST_HEAD(ubifs_infos); |
34 | |
35 | /* |
36 | * We number each shrinker run and record the number on the ubifs_info structure |
37 | * so that we can easily work out which ubifs_info structures have already been |
38 | * done by the current run. |
39 | */ |
40 | static unsigned int shrinker_run_no; |
41 | |
42 | /* Protects 'ubifs_infos' list */ |
43 | DEFINE_SPINLOCK(ubifs_infos_lock); |
44 | |
45 | /* Global clean znode counter (for all mounted UBIFS instances) */ |
46 | atomic_long_t ubifs_clean_zn_cnt; |
47 | |
48 | /** |
49 | * shrink_tnc - shrink TNC tree. |
50 | * @c: UBIFS file-system description object |
51 | * @nr: number of znodes to free |
52 | * @age: the age of znodes to free |
53 | * @contention: if any contention, this is set to %1 |
54 | * |
55 | * This function traverses TNC tree and frees clean znodes. It does not free |
56 | * clean znodes which younger then @age. Returns number of freed znodes. |
57 | */ |
58 | static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) |
59 | { |
60 | int total_freed = 0; |
61 | struct ubifs_znode *znode, *zprev; |
62 | time64_t time = ktime_get_seconds(); |
63 | |
64 | ubifs_assert(c, mutex_is_locked(&c->umount_mutex)); |
65 | ubifs_assert(c, mutex_is_locked(&c->tnc_mutex)); |
66 | |
67 | if (!c->zroot.znode || atomic_long_read(v: &c->clean_zn_cnt) == 0) |
68 | return 0; |
69 | |
70 | /* |
71 | * Traverse the TNC tree in levelorder manner, so that it is possible |
72 | * to destroy large sub-trees. Indeed, if a znode is old, then all its |
73 | * children are older or of the same age. |
74 | * |
75 | * Note, we are holding 'c->tnc_mutex', so we do not have to lock the |
76 | * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is |
77 | * changed only when the 'c->tnc_mutex' is held. |
78 | */ |
79 | zprev = NULL; |
80 | znode = ubifs_tnc_levelorder_next(c, zr: c->zroot.znode, NULL); |
81 | while (znode && total_freed < nr && |
82 | atomic_long_read(v: &c->clean_zn_cnt) > 0) { |
83 | int freed; |
84 | |
85 | /* |
86 | * If the znode is clean, but it is in the 'c->cnext' list, this |
87 | * means that this znode has just been written to flash as a |
88 | * part of commit and was marked clean. They will be removed |
89 | * from the list at end commit. We cannot change the list, |
90 | * because it is not protected by any mutex (design decision to |
91 | * make commit really independent and parallel to main I/O). So |
92 | * we just skip these znodes. |
93 | * |
94 | * Note, the 'clean_zn_cnt' counters are not updated until |
95 | * after the commit, so the UBIFS shrinker does not report |
96 | * the znodes which are in the 'c->cnext' list as freeable. |
97 | * |
98 | * Also note, if the root of a sub-tree is not in 'c->cnext', |
99 | * then the whole sub-tree is not in 'c->cnext' as well, so it |
100 | * is safe to dump whole sub-tree. |
101 | */ |
102 | |
103 | if (znode->cnext) { |
104 | /* |
105 | * Very soon these znodes will be removed from the list |
106 | * and become freeable. |
107 | */ |
108 | *contention = 1; |
109 | } else if (!ubifs_zn_dirty(znode) && |
110 | abs(time - znode->time) >= age) { |
111 | if (znode->parent) |
112 | znode->parent->zbranch[znode->iip].znode = NULL; |
113 | else |
114 | c->zroot.znode = NULL; |
115 | |
116 | freed = ubifs_destroy_tnc_subtree(c, zr: znode); |
117 | atomic_long_sub(i: freed, v: &ubifs_clean_zn_cnt); |
118 | atomic_long_sub(i: freed, v: &c->clean_zn_cnt); |
119 | total_freed += freed; |
120 | znode = zprev; |
121 | } |
122 | |
123 | if (unlikely(!c->zroot.znode)) |
124 | break; |
125 | |
126 | zprev = znode; |
127 | znode = ubifs_tnc_levelorder_next(c, zr: c->zroot.znode, znode); |
128 | cond_resched(); |
129 | } |
130 | |
131 | return total_freed; |
132 | } |
133 | |
134 | /** |
135 | * shrink_tnc_trees - shrink UBIFS TNC trees. |
136 | * @nr: number of znodes to free |
137 | * @age: the age of znodes to free |
138 | * @contention: if any contention, this is set to %1 |
139 | * |
140 | * This function walks the list of mounted UBIFS file-systems and frees clean |
141 | * znodes which are older than @age, until at least @nr znodes are freed. |
142 | * Returns the number of freed znodes. |
143 | */ |
144 | static int shrink_tnc_trees(int nr, int age, int *contention) |
145 | { |
146 | struct ubifs_info *c; |
147 | struct list_head *p; |
148 | unsigned int run_no; |
149 | int freed = 0; |
150 | |
151 | spin_lock(lock: &ubifs_infos_lock); |
152 | do { |
153 | run_no = ++shrinker_run_no; |
154 | } while (run_no == 0); |
155 | /* Iterate over all mounted UBIFS file-systems and try to shrink them */ |
156 | p = ubifs_infos.next; |
157 | while (p != &ubifs_infos) { |
158 | c = list_entry(p, struct ubifs_info, infos_list); |
159 | /* |
160 | * We move the ones we do to the end of the list, so we stop |
161 | * when we see one we have already done. |
162 | */ |
163 | if (c->shrinker_run_no == run_no) |
164 | break; |
165 | if (!mutex_trylock(lock: &c->umount_mutex)) { |
166 | /* Some un-mount is in progress, try next FS */ |
167 | *contention = 1; |
168 | p = p->next; |
169 | continue; |
170 | } |
171 | /* |
172 | * We're holding 'c->umount_mutex', so the file-system won't go |
173 | * away. |
174 | */ |
175 | if (!mutex_trylock(lock: &c->tnc_mutex)) { |
176 | mutex_unlock(lock: &c->umount_mutex); |
177 | *contention = 1; |
178 | p = p->next; |
179 | continue; |
180 | } |
181 | spin_unlock(lock: &ubifs_infos_lock); |
182 | /* |
183 | * OK, now we have TNC locked, the file-system cannot go away - |
184 | * it is safe to reap the cache. |
185 | */ |
186 | c->shrinker_run_no = run_no; |
187 | freed += shrink_tnc(c, nr, age, contention); |
188 | mutex_unlock(lock: &c->tnc_mutex); |
189 | spin_lock(lock: &ubifs_infos_lock); |
190 | /* Get the next list element before we move this one */ |
191 | p = p->next; |
192 | /* |
193 | * Move this one to the end of the list to provide some |
194 | * fairness. |
195 | */ |
196 | list_move_tail(list: &c->infos_list, head: &ubifs_infos); |
197 | mutex_unlock(lock: &c->umount_mutex); |
198 | if (freed >= nr) |
199 | break; |
200 | } |
201 | spin_unlock(lock: &ubifs_infos_lock); |
202 | return freed; |
203 | } |
204 | |
205 | /** |
206 | * kick_a_thread - kick a background thread to start commit. |
207 | * |
208 | * This function kicks a background thread to start background commit. Returns |
209 | * %-1 if a thread was kicked or there is another reason to assume the memory |
210 | * will soon be freed or become freeable. If there are no dirty znodes, returns |
211 | * %0. |
212 | */ |
213 | static int kick_a_thread(void) |
214 | { |
215 | int i; |
216 | struct ubifs_info *c; |
217 | |
218 | /* |
219 | * Iterate over all mounted UBIFS file-systems and find out if there is |
220 | * already an ongoing commit operation there. If no, then iterate for |
221 | * the second time and initiate background commit. |
222 | */ |
223 | spin_lock(lock: &ubifs_infos_lock); |
224 | for (i = 0; i < 2; i++) { |
225 | list_for_each_entry(c, &ubifs_infos, infos_list) { |
226 | long dirty_zn_cnt; |
227 | |
228 | if (!mutex_trylock(lock: &c->umount_mutex)) { |
229 | /* |
230 | * Some un-mount is in progress, it will |
231 | * certainly free memory, so just return. |
232 | */ |
233 | spin_unlock(lock: &ubifs_infos_lock); |
234 | return -1; |
235 | } |
236 | |
237 | dirty_zn_cnt = atomic_long_read(v: &c->dirty_zn_cnt); |
238 | |
239 | if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || |
240 | c->ro_mount || c->ro_error) { |
241 | mutex_unlock(lock: &c->umount_mutex); |
242 | continue; |
243 | } |
244 | |
245 | if (c->cmt_state != COMMIT_RESTING) { |
246 | spin_unlock(lock: &ubifs_infos_lock); |
247 | mutex_unlock(lock: &c->umount_mutex); |
248 | return -1; |
249 | } |
250 | |
251 | if (i == 1) { |
252 | list_move_tail(list: &c->infos_list, head: &ubifs_infos); |
253 | spin_unlock(lock: &ubifs_infos_lock); |
254 | |
255 | ubifs_request_bg_commit(c); |
256 | mutex_unlock(lock: &c->umount_mutex); |
257 | return -1; |
258 | } |
259 | mutex_unlock(lock: &c->umount_mutex); |
260 | } |
261 | } |
262 | spin_unlock(lock: &ubifs_infos_lock); |
263 | |
264 | return 0; |
265 | } |
266 | |
267 | unsigned long ubifs_shrink_count(struct shrinker *shrink, |
268 | struct shrink_control *sc) |
269 | { |
270 | long clean_zn_cnt = atomic_long_read(v: &ubifs_clean_zn_cnt); |
271 | |
272 | /* |
273 | * Due to the way UBIFS updates the clean znode counter it may |
274 | * temporarily be negative. |
275 | */ |
276 | return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; |
277 | } |
278 | |
279 | unsigned long ubifs_shrink_scan(struct shrinker *shrink, |
280 | struct shrink_control *sc) |
281 | { |
282 | unsigned long nr = sc->nr_to_scan; |
283 | int contention = 0; |
284 | unsigned long freed; |
285 | long clean_zn_cnt = atomic_long_read(v: &ubifs_clean_zn_cnt); |
286 | |
287 | if (!clean_zn_cnt) { |
288 | /* |
289 | * No clean znodes, nothing to reap. All we can do in this case |
290 | * is to kick background threads to start commit, which will |
291 | * probably make clean znodes which, in turn, will be freeable. |
292 | * And we return -1 which means will make VM call us again |
293 | * later. |
294 | */ |
295 | dbg_tnc("no clean znodes, kick a thread" ); |
296 | return kick_a_thread(); |
297 | } |
298 | |
299 | freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, contention: &contention); |
300 | if (freed >= nr) |
301 | goto out; |
302 | |
303 | dbg_tnc("not enough old znodes, try to free young ones" ); |
304 | freed += shrink_tnc_trees(nr: nr - freed, YOUNG_ZNODE_AGE, contention: &contention); |
305 | if (freed >= nr) |
306 | goto out; |
307 | |
308 | dbg_tnc("not enough young znodes, free all" ); |
309 | freed += shrink_tnc_trees(nr: nr - freed, age: 0, contention: &contention); |
310 | |
311 | if (!freed && contention) { |
312 | dbg_tnc("freed nothing, but contention" ); |
313 | return SHRINK_STOP; |
314 | } |
315 | |
316 | out: |
317 | dbg_tnc("%lu znodes were freed, requested %lu" , freed, nr); |
318 | return freed; |
319 | } |
320 | |