// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}
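
/*
 * Illustrative example (editorial note, not from the original source):
 * assuming SHRINKER_UNIT_BITS == 64 and 8-byte pointers,
 * shrinker_unit_size(70) == DIV_ROUND_UP(70, 64) * 8 == 16 bytes,
 * i.e. room for two struct shrinker_info_unit pointers.
 */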

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid))
			goto err;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}
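
/*
 * Illustrative example (editorial note, not from the original source):
 * assuming SHRINKER_UNIT_BITS == 64, shrinker id 70 maps to index 1 and
 * offset 6, and calc_shrinker_id(1, 6) recovers 70. Each shrinker_info_unit
 * therefore covers one such word of the per-memcg shrinker bitmap.
 */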

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent concurrent shrinker_info expansion */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
#else
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));
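
	/*
	 * Worked example with made-up numbers (editorial illustration, not
	 * from the original source): freeable = 10000, priority = 4,
	 * seeks = DEFAULT_SEEKS (2) and nr = 0 deferred objects give
	 * delta = (10000 >> 4) * 4 / 2 = 1250, so total_scan = 1250,
	 * well under the 2 * freeable = 20000 cap.
	 */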

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, and decreased by the old deferred work that was done now.
	 *
	 * It is capped at two times the number of freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));
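
	/*
	 * Illustrative example (editorial note, not from the original source):
	 * with nr = 100 deferred, delta = 50 of new work and scanned = 120
	 * objects actually scanned, next_deferred = 100 + 50 - 120 = 30,
	 * which is then clamped to at most 2 * freeable.
	 */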

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * Lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in
	 * expand_one_shrinker_info(), so rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold a refcount on the memcg, so the
	 * memcg will not be destroyed, and of course the shrinker_info_unit
	 * will not be freed either.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *         shrinker_info.
	 * step 2: after getting the shrinker_info_unit we can safely release
	 *         the RCU lock.
	 * step 3: traverse the bitmap and calculate the shrinker_id.
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use the shrinker_id to find the shrinker, then use
	 *         shrinker_try_get() to guarantee existence of the shrinker,
	 *         after which we can release the RCU lock and call
	 *         do_shrink_slab(), which may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount;
	 *         if the refcount reaches 0, wake up the waiter in
	 *         shrinker_free() by calling complete().
	 *         Note: unlike the global shrink, we don't need to acquire
	 *         the RCU lock here to guarantee existence of the shrinker,
	 *         because we don't need to use this shrinker to traverse to
	 *         the next shrinker in the bitmap.
	 * step 7: we have already exited the RCU read-side critical section
	 *         before calling do_shrink_slab(), so the shrinker_info may
	 *         have been released by expand_one_shrinker_info(); go back
	 *         to step 1 to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB))
				continue;

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()        shrink_slab_memcg()
				 *   list_add_tail()       clear_bit()
				 *   <MB>                  <MB>
				 *   set_bit()             do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority; we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false; we would then run only the
	 * memcg slab shrink and skip the global shrink, which may result
	 * in a premature OOM.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * Lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *         and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try to get the refcount; if
	 *         successful, the existence of the shrinker is also guaranteed,
	 *         so we can release the RCU lock and call do_shrink_slab(),
	 *         which may sleep.
	 * step 3: we *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *         which ensures that neither this shrinker nor the next shrinker
	 *         will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount;
	 *         if the refcount reaches 0, wake up the waiter in
	 *         shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}

struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fall back to a non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available on a per-memcg level for memcg-aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 *  - non-memcg-aware shrinkers
	 *  - !CONFIG_MEMCG
	 *  - memcg is disabled by the kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete. After that,
		 * no shrinker is running or will run again, so we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is located, such as super_block etc.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
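
/*
 * Editorial usage sketch (not part of the original file): a typical caller
 * of this API allocates a shrinker, fills in its callbacks and then
 * registers it. The names my_count_objects, my_scan_objects and my_cache
 * below are hypothetical and used only for illustration.
 *
 *	struct shrinker *s;
 *
 *	s = shrinker_alloc(SHRINKER_NUMA_AWARE, "my-cache");
 *	if (!s)
 *		return -ENOMEM;
 *	s->count_objects = my_count_objects;
 *	s->scan_objects = my_scan_objects;
 *	s->private_data = my_cache;
 *	shrinker_register(s);
 *
 *	...
 *
 *	shrinker_free(s);	(* unregister and free via RCU *)
 */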
810 | |