workingset.c source code [linux/mm/workingset.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Workingset detection
4	*
5	* Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
6	*/
7
8	#include <linux/memcontrol.h>
9	#include <linux/mm_inline.h>
10	#include <linux/writeback.h>
11	#include <linux/shmem_fs.h>
12	#include <linux/pagemap.h>
13	#include <linux/atomic.h>
14	#include <linux/module.h>
15	#include <linux/swap.h>
16	#include <linux/dax.h>
17	#include <linux/fs.h>
18	#include <linux/mm.h>
19
20	/*
21	* Double CLOCK lists
22	*
23	* Per node, two clock lists are maintained for file pages: the
24	* inactive and the active list. Freshly faulted pages start out at
25	* the head of the inactive list and page reclaim scans pages from the
26	* tail. Pages that are accessed multiple times on the inactive list
27	* are promoted to the active list, to protect them from reclaim,
28	* whereas active pages are demoted to the inactive list when the
29	* active list grows too big.
30	*
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
38	*
39	*
40	* Access frequency and refault distance
41	*
42	* A workload is thrashing when its pages are frequently used but they
43	* are evicted from the inactive list every time before another access
44	* would have promoted them to the active list.
45	*
46	* In cases where the average access distance between thrashing pages
47	* is bigger than the size of memory there is nothing that can be
48	* done - the thrashing set could never fit into memory under any
49	* circumstance.
50	*
51	* However, the average access distance could be bigger than the
52	* inactive list, yet smaller than the size of memory. In this case,
53	* the set could fit into memory if it weren't for the currently
54	* active pages - which may be used more, hopefully less frequently:
55	*
 *       +-memory available to cache-+
 *       |                           |
 *       +-inactive------+-active----+
 *   a b | c d e f g h i | J K L M N |
 *       +---------------+-----------+
61	*
62	* It is prohibitively expensive to accurately track access frequency
63	* of pages. But a reasonable approximation can be made to measure
64	* thrashing on the inactive list, after which refaulting pages can be
65	* activated optimistically to compete with the existing active pages.
66	*
67	* Approximating inactive page access frequency - Observations:
68	*
69	* 1. When a page is accessed for the first time, it is added to the
70	* head of the inactive list, slides every existing inactive page
71	* towards the tail by one slot, and pushes the current tail page
72	* out of memory.
73	*
74	* 2. When a page is accessed for the second time, it is promoted to
75	* the active list, shrinking the inactive list by one slot. This
76	* also slides all inactive pages that were faulted into the cache
77	* more recently than the activated page towards the tail of the
78	* inactive list.
79	*
80	* Thus:
81	*
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
85	*
86	* 2. Moving one inactive page N page slots towards the tail of the
87	* list requires at least N inactive page accesses.
88	*
89	* Combining these:
90	*
91	* 1. When a page is finally evicted from memory, the number of
92	* inactive pages accessed while the page was in cache is at least
93	* the number of page slots on the inactive list.
94	*
95	* 2. In addition, measuring the sum of evictions and activations (E)
96	* at the time of a page's eviction, and comparing it to another
97	* reading (R) at the time the page faults back into memory tells
98	* the minimum number of accesses while the page was not cached.
99	* This is called the refault distance.
100	*
101	* Because the first access of the page was the fault and the second
102	* access the refault, we combine the in-cache distance with the
103	* out-of-cache distance to get the complete minimum access distance
104	* of this page:
105	*
106	* NR_inactive + (R - E)
107	*
108	* And knowing the minimum access distance of a page, we can easily
109	* tell if the page would be able to stay in cache assuming all page
110	* slots in the cache were available:
111	*
112	* NR_inactive + (R - E) <= NR_inactive + NR_active
113	*
 * If we have swap we should also take NR_inactive_anon and
 * NR_active_anon into account, so for page cache and anonymous respectively:
116	*
117	* NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
118	* + NR_inactive_anon + NR_active_anon
119	*
120	* NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
121	* + NR_inactive_file + NR_active_file
122	*
123	* Which can be further simplified to:
124	*
125	* (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
126	*
127	* (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
128	*
129	* Put into words, the refault distance (out-of-cache) can be seen as
130	* a deficit in inactive list space (in-cache). If the inactive list
131	* had (R - E) more page slots, the page would not have been evicted
132	* in between accesses, but activated instead. And on a full system,
133	* the only thing eating into inactive list space is active pages.
134	*
135	*
136	* Refaulting inactive pages
137	*
138	* All that is known about the active list is that the pages have been
139	* accessed more than once in the past. This means that at any given
140	* time there is actually a good chance that pages on the active list
141	* are no longer in active use.
142	*
143	* So when a refault distance of (R - E) is observed and there are at
144	* least (R - E) pages in the userspace workingset, the refaulting page
145	* is activated optimistically in the hope that (R - E) pages are actually
146	* used less frequently than the refaulting page - or even not used at
147	* all anymore.
148	*
149	* That means if inactive cache is refaulting with a suitable refault
150	* distance, we assume the cache workingset is transitioning and put
151	* pressure on the current workingset.
152	*
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
156	*
157	* But if this is right, the stale pages will be pushed out of memory
158	* and the used pages get to stay in cache.
159	*
160	* Refaulting active pages
161	*
162	* If on the other hand the refaulting pages have recently been
163	* deactivated, it means that the active list is no longer protecting
164	* actively used cache from reclaim. The cache is NOT transitioning to
165	* a different workingset; the existing workingset is thrashing in the
166	* space allocated to the page cache.
167	*
168	*
169	* Implementation
170	*
171	* For each node's LRU lists, a counter for inactive evictions and
172	* activations is maintained (node->nonresident_age).
173	*
174	* On eviction, a snapshot of this counter (along with some bits to
175	* identify the node) is stored in the now empty page cache
176	* slot of the evicted page. This is called a shadow entry.
177	*
178	* On cache misses for which there are shadow entries, an eligible
179	* refault distance will immediately activate the refaulting page.
180	*/
181
/*
 * Bit budget of a shadow entry.
 *
 * pack_shadow() stores, from the least significant bit upwards: the
 * workingset flag (WORKINGSET_SHIFT bits), the node id (NODES_SHIFT bits),
 * the memcg id (MEM_CGROUP_ID_SHIFT bits) and finally the eviction
 * timestamp.
 *
 * EVICTION_SHIFT is the number of bits of an unsigned long that are NOT
 * available to the timestamp: the difference between BITS_PER_LONG and
 * BITS_PER_XA_VALUE plus the three identifier fields. EVICTION_MASK
 * selects the bits that remain for the timestamp.
 */
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 WORKINGSET_SHIFT + NODES_SHIFT +	\
			 MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
187
188	/*
189	* Eviction timestamps need to be able to cover the full range of
190	* actionable refaults. However, bits are tight in the xarray
191	* entry, and after storing the identifier for the lruvec there might
192	* not be enough left to represent every single actionable refault. In
193	* that case, we have to sacrifice granularity for distance, and group
194	* evictions into coarser buckets by shaving off lower timestamp bits.
195	*/
196	static unsigned int bucket_order __read_mostly;
197
198	static void pack_shadow(int* memcgid, pg_data_t pgdat, unsigned* long eviction,
199	bool workingset)
200	{
201	eviction &= EVICTION_MASK;
202	eviction = (eviction << MEM_CGROUP_ID_SHIFT) \| memcgid;
203	eviction = (eviction << NODES_SHIFT) \| pgdat->node_id;
204	eviction = (eviction << WORKINGSET_SHIFT) \| workingset;
205
206	return xa_mk_value(v: eviction);
207	}
208
209	static void unpack_shadow(void shadow, int* memcgidp, pg_data_t *pgdat,
210	unsigned long evictionp, bool workingsetp)
211	{
212	unsigned long entry = xa_to_value(entry: shadow);
213	int memcgid, nid;
214	bool workingset;
215
216	workingset = entry & ((`1UL` << WORKINGSET_SHIFT) - `1`);
217	entry >>= WORKINGSET_SHIFT;
218	nid = entry & ((`1UL` << NODES_SHIFT) - `1`);
219	entry >>= NODES_SHIFT;
220	memcgid = entry & ((`1UL` << MEM_CGROUP_ID_SHIFT) - `1`);
221	entry >>= MEM_CGROUP_ID_SHIFT;
222
223	*memcgidp = memcgid;
224	*pgdat = NODE_DATA(nid);
225	*evictionp = entry;
226	*workingsetp = workingset;
227	}
228
229	#ifdef CONFIG_LRU_GEN
230
231	static void lru_gen_eviction(struct* folio *folio)
232	{
233	int hist;
234	unsigned long token;
235	unsigned long min_seq;
236	struct lruvec *lruvec;
237	struct lru_gen_folio *lrugen;
238	int type = folio_is_file_lru(folio);
239	int delta = folio_nr_pages(folio);
240	int refs = folio_lru_refs(folio);
241	int tier = lru_tier_from_refs(refs);
242	struct mem_cgroup *memcg = folio_memcg(folio);
243	struct pglist_data *pgdat = folio_pgdat(folio);
244
245	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
246
247	lruvec = mem_cgroup_lruvec(memcg, pgdat);
248	lrugen = &lruvec->lrugen;
249	min_seq = READ_ONCE(lrugen->min_seq[type]);
250	token = (min_seq << LRU_REFS_WIDTH) \| max(refs - `1`, `0`);
251
252	hist = lru_hist_from_seq(seq: min_seq);
253	atomic_long_add(i: delta, v: &lrugen->evicted[hist][type][tier]);
254
255	return pack_shadow(memcgid: mem_cgroup_id(memcg), pgdat, eviction: token, workingset: refs);
256	}
257
258	/*
259	* Tests if the shadow entry is for a folio that was recently evicted.
260	* Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
261	*/
262	static bool lru_gen_test_recent(void shadow, bool file, struct* lruvec **lruvec,
263	unsigned long token, bool workingset)
264	{
265	int memcg_id;
266	unsigned long min_seq;
267	struct mem_cgroup *memcg;
268	struct pglist_data *pgdat;
269
270	unpack_shadow(shadow, memcgidp: &memcg_id, pgdat: &pgdat, evictionp: token, workingsetp: workingset);
271
272	memcg = mem_cgroup_from_id(id: memcg_id);
273	*lruvec = mem_cgroup_lruvec(memcg, pgdat);
274
275	min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]);
276	return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH));
277	}
278
279	static void lru_gen_refault(struct folio folio, void* *shadow)
280	{
281	bool recent;
282	int hist, tier, refs;
283	bool workingset;
284	unsigned long token;
285	struct lruvec *lruvec;
286	struct lru_gen_folio *lrugen;
287	int type = folio_is_file_lru(folio);
288	int delta = folio_nr_pages(folio);
289
290	rcu_read_lock();
291
292	recent = lru_gen_test_recent(shadow, file: type, lruvec: &lruvec, token: &token, workingset: &workingset);
293	if (lruvec != folio_lruvec(folio))
294	goto unlock;
295
296	mod_lruvec_state(lruvec, idx: WORKINGSET_REFAULT_BASE + type, val: delta);
297
298	if (!recent)
299	goto unlock;
300
301	lrugen = &lruvec->lrugen;
302
303	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
304	/ see the comment in folio_lru_refs() /
305	refs = (token & (BIT(LRU_REFS_WIDTH) - `1`)) + workingset;
306	tier = lru_tier_from_refs(refs);
307
308	atomic_long_add(i: delta, v: &lrugen->refaulted[hist][type][tier]);
309	mod_lruvec_state(lruvec, idx: WORKINGSET_ACTIVATE_BASE + type, val: delta);
310
311	/*
312	* Count the following two cases as stalls:
313	* 1. For pages accessed through page tables, hotter pages pushed out
314	* hot pages which refaulted immediately.
315	* 2. For pages accessed multiple times through file descriptors,
316	* numbers of accesses might have been out of the range.
317	*/
318	if (lru_gen_in_fault() \|\| refs == BIT(LRU_REFS_WIDTH)) {
319	folio_set_workingset(folio);
320	mod_lruvec_state(lruvec, idx: WORKINGSET_RESTORE_BASE + type, val: delta);
321	}
322	unlock:
323	rcu_read_unlock();
324	}
325
326	#else /* !CONFIG_LRU_GEN */
327
328	static void lru_gen_eviction(struct* folio *folio)
329	{
330	return NULL;
331	}
332
333	static bool lru_gen_test_recent(void shadow, bool file, struct* lruvec **lruvec,
334	unsigned long token, bool workingset)
335	{
336	return false;
337	}
338
/* !CONFIG_LRU_GEN stub: nothing to account. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}
342
343	#endif /* CONFIG_LRU_GEN */
344
345	/**
346	* workingset_age_nonresident - age non-resident entries as LRU ages
347	* @lruvec: the lruvec that was aged
348	* @nr_pages: the number of pages to count
349	*
350	* As in-memory pages are aged, non-resident pages need to be aged as
351	* well, in order for the refault distances later on to be comparable
352	* to the in-memory dimensions. This function allows reclaim and LRU
353	* operations to drive the non-resident aging along in parallel.
354	*/
355	void workingset_age_nonresident(struct lruvec lruvec, unsigned* long nr_pages)
356	{
357	/*
358	* Reclaiming a cgroup means reclaiming all its children in a
359	* round-robin fashion. That means that each cgroup has an LRU
360	* order that is composed of the LRU orders of its child
361	* cgroups; and every page has an LRU position not just in the
362	* cgroup that owns it, but in all of that group's ancestors.
363	*
364	* So when the physical inactive list of a leaf cgroup ages,
365	* the virtual inactive lists of all its parents, including
366	* the root cgroup's, age as well.
367	*/
368	do {
369	atomic_long_add(i: nr_pages, v: &lruvec->nonresident_age);
370	} while ((lruvec = parent_lruvec(lruvec)));
371	}
372
373	/**
374	* workingset_eviction - note the eviction of a folio from memory
375	* @target_memcg: the cgroup that is causing the reclaim
376	* @folio: the folio being evicted
377	*
378	* Return: a shadow entry to be stored in @folio->mapping->i_pages in place
379	* of the evicted @folio so that a later refault can be detected.
380	*/
381	void workingset_eviction(struct* folio folio, struct* mem_cgroup *target_memcg)
382	{
383	struct pglist_data *pgdat = folio_pgdat(folio);
384	unsigned long eviction;
385	struct lruvec *lruvec;
386	int memcgid;
387
388	/ Folio is fully exclusive and pins folio's memory cgroup pointer /
389	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
390	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
391	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
392
393	if (lru_gen_enabled())
394	return lru_gen_eviction(folio);
395
396	lruvec = mem_cgroup_lruvec(memcg: target_memcg, pgdat);
397	/ XXX: target_memcg can be NULL, go through lruvec /
398	memcgid = mem_cgroup_id(memcg: lruvec_memcg(lruvec));
399	eviction = atomic_long_read(v: &lruvec->nonresident_age);
400	eviction >>= bucket_order;
401	workingset_age_nonresident(lruvec, nr_pages: folio_nr_pages(folio));
402	return pack_shadow(memcgid, pgdat, eviction,
403	workingset: folio_test_workingset(folio));
404	}
405
406	/**
407	* workingset_test_recent - tests if the shadow entry is for a folio that was
408	* recently evicted. Also fills in @workingset with the value unpacked from
409	* shadow.
410	* @shadow: the shadow entry to be tested.
411	* @file: whether the corresponding folio is from the file lru.
412	* @workingset: where the workingset value unpacked from shadow should
413	* be stored.
414	*
415	* Return: true if the shadow is for a recently evicted folio; false otherwise.
416	*/
417	bool workingset_test_recent(void shadow, bool file, bool workingset)
418	{
419	struct mem_cgroup *eviction_memcg;
420	struct lruvec *eviction_lruvec;
421	unsigned long refault_distance;
422	unsigned long workingset_size;
423	unsigned long refault;
424	int memcgid;
425	struct pglist_data *pgdat;
426	unsigned long eviction;
427
428	if (lru_gen_enabled())
429	return lru_gen_test_recent(shadow, file, lruvec: &eviction_lruvec, token: &eviction, workingset);
430
431	unpack_shadow(shadow, memcgidp: &memcgid, pgdat: &pgdat, evictionp: &eviction, workingsetp: workingset);
432	eviction <<= bucket_order;
433
434	/*
435	* Look up the memcg associated with the stored ID. It might
436	* have been deleted since the folio's eviction.
437	*
438	* Note that in rare events the ID could have been recycled
439	* for a new cgroup that refaults a shared folio. This is
440	* impossible to tell from the available data. However, this
441	* should be a rare and limited disturbance, and activations
442	* are always speculative anyway. Ultimately, it's the aging
443	* algorithm's job to shake out the minimum access frequency
444	* for the active cache.
445	*
446	* XXX: On !CONFIG_MEMCG, this will always return NULL; it
447	* would be better if the root_mem_cgroup existed in all
448	* configurations instead.
449	*/
450	eviction_memcg = mem_cgroup_from_id(id: memcgid);
451	if (!mem_cgroup_disabled() && !eviction_memcg)
452	return false;
453
454	eviction_lruvec = mem_cgroup_lruvec(memcg: eviction_memcg, pgdat);
455	refault = atomic_long_read(v: &eviction_lruvec->nonresident_age);
456
457	/*
458	* Calculate the refault distance
459	*
460	* The unsigned subtraction here gives an accurate distance
461	* across nonresident_age overflows in most cases. There is a
462	* special case: usually, shadow entries have a short lifetime
463	* and are either refaulted or reclaimed along with the inode
464	* before they get too old. But it is not impossible for the
465	* nonresident_age to lap a shadow entry in the field, which
466	* can then result in a false small refault distance, leading
467	* to a false activation should this old entry actually
468	* refault again. However, earlier kernels used to deactivate
469	* unconditionally with every reclaim invocation for the
470	* longest time, so the occasional inappropriate activation
471	* leading to pressure on the active list is not a problem.
472	*/
473	refault_distance = (refault - eviction) & EVICTION_MASK;
474
475	/*
476	* Compare the distance to the existing workingset size. We
477	* don't activate pages that couldn't stay resident even if
478	* all the memory was available to the workingset. Whether
479	* workingset competition needs to consider anon or not depends
480	* on having free swap space.
481	*/
482	workingset_size = lruvec_page_state(lruvec: eviction_lruvec, idx: NR_ACTIVE_FILE);
483	if (!file) {
484	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
485	idx: NR_INACTIVE_FILE);
486	}
487	if (mem_cgroup_get_nr_swap_pages(memcg: eviction_memcg) > `0`) {
488	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
489	idx: NR_ACTIVE_ANON);
490	if (file) {
491	workingset_size += lruvec_page_state(lruvec: eviction_lruvec,
492	idx: NR_INACTIVE_ANON);
493	}
494	}
495
496	return refault_distance <= workingset_size;
497	}
498
499	/**
500	* workingset_refault - Evaluate the refault of a previously evicted folio.
501	* @folio: The freshly allocated replacement folio.
502	* @shadow: Shadow entry of the evicted folio.
503	*
504	* Calculates and evaluates the refault distance of the previously
505	* evicted folio in the context of the node and the memcg whose memory
506	* pressure caused the eviction.
507	*/
508	void workingset_refault(struct folio folio, void* *shadow)
509	{
510	bool file = folio_is_file_lru(folio);
511	struct pglist_data *pgdat;
512	struct mem_cgroup *memcg;
513	struct lruvec *lruvec;
514	bool workingset;
515	long nr;
516
517	if (lru_gen_enabled()) {
518	lru_gen_refault(folio, shadow);
519	return;
520	}
521
522	/ Flush stats (and potentially sleep) before holding RCU read lock /
523	mem_cgroup_flush_stats_ratelimited();
524
525	rcu_read_lock();
526
527	/*
528	* The activation decision for this folio is made at the level
529	* where the eviction occurred, as that is where the LRU order
530	* during folio reclaim is being determined.
531	*
532	* However, the cgroup that will own the folio is the one that
533	* is actually experiencing the refault event.
534	*/
535	nr = folio_nr_pages(folio);
536	memcg = folio_memcg(folio);
537	pgdat = folio_pgdat(folio);
538	lruvec = mem_cgroup_lruvec(memcg, pgdat);
539
540	mod_lruvec_state(lruvec, idx: WORKINGSET_REFAULT_BASE + file, val: nr);
541
542	if (!workingset_test_recent(shadow, file, workingset: &workingset))
543	goto out;
544
545	folio_set_active(folio);
546	workingset_age_nonresident(lruvec, nr_pages: nr);
547	mod_lruvec_state(lruvec, idx: WORKINGSET_ACTIVATE_BASE + file, val: nr);
548
549	/ Folio was active prior to eviction /
550	if (workingset) {
551	folio_set_workingset(folio);
552	/*
553	* XXX: Move to folio_add_lru() when it supports new vs
554	* putback
555	*/
556	lru_note_cost_refault(folio);
557	mod_lruvec_state(lruvec, idx: WORKINGSET_RESTORE_BASE + file, val: nr);
558	}
559	out:
560	rcu_read_unlock();
561	}
562
/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the non-resident entries of the folio's lruvec by the number of
 * pages in @folio, unless memcg is enabled and the folio has no memcg.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}
586
587	/*
588	* Shadow entries reflect the share of the working set that does not
589	* fit into memory, so their number depends on the access pattern of
590	* the workload. In most cases, they will refault or get reclaimed
591	* along with the inode, but a (malicious) workload that streams
592	* through files with a total size several times that of available
593	* memory, while preventing the inodes from being reclaimed, can
594	* create excessive amounts of shadow nodes. To keep a lid on this,
595	* track shadow nodes and reclaim them when they grow way past the
596	* point where they would still be useful.
597	*/
598
599	struct list_lru shadow_nodes;
600
601	void workingset_update_node(struct xa_node *node)
602	{
603	struct address_space *mapping;
604
605	/*
606	* Track non-empty nodes that contain only shadow entries;
607	* unlink those that contain pages or are being freed.
608	*
609	* Avoid acquiring the list_lru lock when the nodes are
610	* already where they should be. The list_empty() test is safe
611	* as node->private_list is protected by the i_pages lock.
612	*/
613	mapping = container_of(node->array, struct address_space, i_pages);
614	lockdep_assert_held(&mapping->i_pages.xa_lock);
615
616	if (node->count && node->count == node->nr_values) {
617	if (list_empty(head: &node->private_list)) {
618	list_lru_add(lru: &shadow_nodes, item: &node->private_list);
619	__inc_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
620	}
621	} else {
622	if (!list_empty(head: &node->private_list)) {
623	list_lru_del(lru: &shadow_nodes, item: &node->private_list);
624	__dec_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
625	}
626	}
627	}
628
629	static unsigned long count_shadow_nodes(struct shrinker *shrinker,
630	struct shrink_control *sc)
631	{
632	unsigned long max_nodes;
633	unsigned long nodes;
634	unsigned long pages;
635
636	nodes = list_lru_shrink_count(lru: &shadow_nodes, sc);
637	if (!nodes)
638	return SHRINK_EMPTY;
639
640	/*
641	* Approximate a reasonable limit for the nodes
642	* containing shadow entries. We don't need to keep more
643	* shadow entries than possible pages on the active list,
644	* since refault distances bigger than that are dismissed.
645	*
646	* The size of the active list converges toward 100% of
647	* overall page cache as memory grows, with only a tiny
648	* inactive list. Assume the total cache size for that.
649	*
650	* Nodes might be sparsely populated, with only one shadow
651	* entry in the extreme case. Obviously, we cannot keep one
652	* node for every eligible shadow entry, so compromise on a
653	* worst-case density of 1/8th. Below that, not all eligible
654	* refaults can be detected anymore.
655	*
656	* On 64-bit with 7 xa_nodes per page and 64 slots
657	* each, this will reclaim shadow entries when they consume
658	* ~1.8% of available memory:
659	*
660	* PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
661	*/
662	#ifdef CONFIG_MEMCG
663	if (sc->memcg) {
664	struct lruvec *lruvec;
665	int i;
666
667	mem_cgroup_flush_stats();
668	lruvec = mem_cgroup_lruvec(memcg: sc->memcg, NODE_DATA(sc->nid));
669	for (pages = `0`, i = `0`; i < NR_LRU_LISTS; i++)
670	pages += lruvec_page_state_local(lruvec,
671	idx: NR_LRU_BASE + i);
672	pages += lruvec_page_state_local(
673	lruvec, idx: NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
674	pages += lruvec_page_state_local(
675	lruvec, idx: NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
676	} else
677	#endif
678	pages = node_present_pages(sc->nid);
679
680	max_nodes = pages >> (XA_CHUNK_SHIFT - `3`);
681
682	if (nodes <= max_nodes)
683	return `0`;
684	return nodes - max_nodes;
685	}
686
687	static enum lru_status shadow_lru_isolate(struct list_head *item,
688	struct list_lru_one *lru,
689	spinlock_t *lru_lock,
690	void *arg) __must_hold(lru_lock)
691	{
692	struct xa_node node = container_of(item, struct* xa_node, private_list);
693	struct address_space *mapping;
694	int ret;
695
696	/*
697	* Page cache insertions and deletions synchronously maintain
698	* the shadow node LRU under the i_pages lock and the
699	* lru_lock. Because the page cache tree is emptied before
700	* the inode can be destroyed, holding the lru_lock pins any
701	* address_space that has nodes on the LRU.
702	*
703	* We can then safely transition to the i_pages lock to
704	* pin only the address_space of the particular node we want
705	* to reclaim, take the node off-LRU, and drop the lru_lock.
706	*/
707
708	mapping = container_of(node->array, struct address_space, i_pages);
709
710	/ Coming from the list, invert the lock order /
711	if (!xa_trylock(&mapping->i_pages)) {
712	spin_unlock_irq(lock: lru_lock);
713	ret = LRU_RETRY;
714	goto out;
715	}
716
717	/ For page cache we need to hold i_lock /
718	if (mapping->host != NULL) {
719	if (!spin_trylock(lock: &mapping->host->i_lock)) {
720	xa_unlock(&mapping->i_pages);
721	spin_unlock_irq(lock: lru_lock);
722	ret = LRU_RETRY;
723	goto out;
724	}
725	}
726
727	list_lru_isolate(list: lru, item);
728	__dec_lruvec_kmem_state(p: node, idx: WORKINGSET_NODES);
729
730	spin_unlock(lock: lru_lock);
731
732	/*
733	* The nodes should only contain one or more shadow entries,
734	* no pages, so we expect to be able to remove them all and
735	* delete and free the empty node afterwards.
736	*/
737	if (WARN_ON_ONCE(!node->nr_values))
738	goto out_invalid;
739	if (WARN_ON_ONCE(node->count != node->nr_values))
740	goto out_invalid;
741	xa_delete_node(node, workingset_update_node);
742	__inc_lruvec_kmem_state(p: node, idx: WORKINGSET_NODERECLAIM);
743
744	out_invalid:
745	xa_unlock_irq(&mapping->i_pages);
746	if (mapping->host != NULL) {
747	if (mapping_shrinkable(mapping))
748	inode_add_lru(inode: mapping->host);
749	spin_unlock(lock: &mapping->host->i_lock);
750	}
751	ret = LRU_REMOVED_RETRY;
752	out:
753	cond_resched();
754	spin_lock_irq(lock: lru_lock);
755	return ret;
756	}
757
758	static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
759	struct shrink_control *sc)
760	{
761	/ list_lru lock nests inside the IRQ-safe i_pages lock /
762	return list_lru_shrink_walk_irq(lru: &shadow_nodes, sc, isolate: shadow_lru_isolate,
763	NULL);
764	}
765
766	/*
767	* Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
768	* i_pages lock.
769	*/
770	static struct lock_class_key shadow_nodes_key;
771
772	static int __init workingset_init(void)
773	{
774	struct shrinker *workingset_shadow_shrinker;
775	unsigned int timestamp_bits;
776	unsigned int max_order;
777	int ret = -ENOMEM;
778
779	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
780	/*
781	* Calculate the eviction bucket size to cover the longest
782	* actionable refault distance, which is currently half of
783	* memory (totalram_pages/2). However, memory hotplug may add
784	* some more pages at runtime, so keep working with up to
785	* double the initial memory by using totalram_pages as-is.
786	*/
787	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
788	max_order = fls_long(l: totalram_pages() - `1`);
789	if (max_order > timestamp_bits)
790	bucket_order = max_order - timestamp_bits;
791	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
792	timestamp_bits, max_order, bucket_order);
793
794	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE \|
795	SHRINKER_MEMCG_AWARE,
796	fmt: "mm-shadow");
797	if (!workingset_shadow_shrinker)
798	goto err;
799
800	ret = __list_lru_init(lru: &shadow_nodes, memcg_aware: true, key: &shadow_nodes_key,
801	shrinker: workingset_shadow_shrinker);
802	if (ret)
803	goto err_list_lru;
804
805	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
806	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
807	/ ->count reports only fully expendable nodes /
808	workingset_shadow_shrinker->seeks = `0`;
809
810	shrinker_register(shrinker: workingset_shadow_shrinker);
811	return `0`;
812	err_list_lru:
813	shrinker_free(shrinker: workingset_shadow_shrinker);
814	err:
815	return ret;
816	}
817	module_init(workingset_init);
818

source code of linux/mm/workingset.c