swapfile.c source code [linux/mm/swapfile.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/swapfile.c
4	*
5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6	* Swap reorganised 29.12.95, Stephen Tweedie
7	*/
8
9	#include <linux/blkdev.h>
10	#include <linux/mm.h>
11	#include <linux/sched/mm.h>
12	#include <linux/sched/task.h>
13	#include <linux/hugetlb.h>
14	#include <linux/mman.h>
15	#include <linux/slab.h>
16	#include <linux/kernel_stat.h>
17	#include <linux/swap.h>
18	#include <linux/vmalloc.h>
19	#include <linux/pagemap.h>
20	#include <linux/namei.h>
21	#include <linux/shmem_fs.h>
22	#include <linux/blk-cgroup.h>
23	#include <linux/random.h>
24	#include <linux/writeback.h>
25	#include <linux/proc_fs.h>
26	#include <linux/seq_file.h>
27	#include <linux/init.h>
28	#include <linux/ksm.h>
29	#include <linux/rmap.h>
30	#include <linux/security.h>
31	#include <linux/backing-dev.h>
32	#include <linux/mutex.h>
33	#include <linux/capability.h>
34	#include <linux/syscalls.h>
35	#include <linux/memcontrol.h>
36	#include <linux/poll.h>
37	#include <linux/oom.h>
38	#include <linux/swapfile.h>
39	#include <linux/export.h>
40	#include <linux/swap_slots.h>
41	#include <linux/sort.h>
42	#include <linux/completion.h>
43	#include <linux/suspend.h>
44	#include <linux/zswap.h>
45
46	#include <asm/tlbflush.h>
47	#include <linux/swapops.h>
48	#include <linux/swap_cgroup.h>
49	#include "internal.h"
50	#include "swap.h"
51
52	static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
53	unsigned char);
54	static void free_swap_count_continuations(struct swap_info_struct *);
55
56	static DEFINE_SPINLOCK(swap_lock);
57	static unsigned int nr_swapfiles;
58	atomic_long_t nr_swap_pages;
59	/*
60	* Some modules use swappable objects and may try to swap them out under
61	* memory pressure (via the shrinker). Before doing so, they may wish to
62	* check to see if any swap space is available.
63	*/
64	EXPORT_SYMBOL_GPL(nr_swap_pages);
65	/ protected with swap_lock. reading in vm_swap_full() doesn't need lock /
66	long total_swap_pages;
67	static int least_priority = -`1`;
68	unsigned long swapfile_maximum_size;
69	#ifdef CONFIG_MIGRATION
70	bool swap_migration_ad_supported;
71	#endif /* CONFIG_MIGRATION */
72
73	static const char Bad_file[] = "Bad swap file entry ";
74	static const char Unused_file[] = "Unused swap file entry ";
75	static const char Bad_offset[] = "Bad swap offset entry ";
76	static const char Unused_offset[] = "Unused swap offset entry ";
77
78	/*
79	* all active swap_info_structs
80	* protected with swap_lock, and ordered by priority.
81	*/
82	static PLIST_HEAD(swap_active_head);
83
84	/*
85	* all available (active, not full) swap_info_structs
86	* protected with swap_avail_lock, ordered by priority.
87	* This is used by folio_alloc_swap() instead of swap_active_head
88	* because swap_active_head includes all swap_info_structs,
89	* but folio_alloc_swap() doesn't need to look at full ones.
90	* This uses its own lock instead of swap_lock because when a
91	* swap_info_struct changes between not-full/full, it needs to
92	* add/remove itself to/from this list, but the swap_info_struct->lock
93	* is held and the locking order requires swap_lock to be taken
94	* before any swap_info_struct->lock.
95	*/
96	static struct plist_head *swap_avail_heads;
97	static DEFINE_SPINLOCK(swap_avail_lock);
98
99	static struct swap_info_struct *swap_info[MAX_SWAPFILES];
100
101	static DEFINE_MUTEX(swapon_mutex);
102
103	static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
104	/ Activity counter to indicate that a swapon or swapoff has occurred /
105	static atomic_t proc_poll_event = ATOMIC_INIT(`0`);
106
107	atomic_t nr_rotate_swap = ATOMIC_INIT(`0`);
108
109	static struct swap_info_struct swap_type_to_swap_info(int* type)
110	{
111	if (type >= MAX_SWAPFILES)
112	return NULL;
113
114	return READ_ONCE(swap_info[type]); / rcu_dereference() /
115	}
116
117	static inline unsigned char swap_count(unsigned char ent)
118	{
119	return ent & ~SWAP_HAS_CACHE; / may include COUNT_CONTINUED flag /
120	}
121
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4
131
132	/ returns 1 if swap entry is freed /
133	static int __try_to_reclaim_swap(struct swap_info_struct *si,
134	unsigned long offset, unsigned long flags)
135	{
136	swp_entry_t entry = swp_entry(type: si->type, offset);
137	struct folio *folio;
138	int ret = `0`;
139
140	folio = filemap_get_folio(swap_address_space(entry), index: offset);
141	if (IS_ERR(ptr: folio))
142	return `0`;
143	/*
144	* When this function is called from scan_swap_map_slots() and it's
145	* called by vmscan.c at reclaiming folios. So we hold a folio lock
146	* here. We have to use trylock for avoiding deadlock. This is a special
147	* case and you should use folio_free_swap() with explicit folio_lock()
148	* in usual operations.
149	*/
150	if (folio_trylock(folio)) {
151	if ((flags & TTRS_ANYWAY) \|\|
152	((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) \|\|
153	((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
154	ret = folio_free_swap(folio);
155	folio_unlock(folio);
156	}
157	folio_put(folio);
158	return ret;
159	}
160
161	static inline struct swap_extent first_se(struct* swap_info_struct *sis)
162	{
163	struct rb_node *rb = rb_first(&sis->swap_extent_root);
164	return rb_entry(rb, struct swap_extent, rb_node);
165	}
166
167	static inline struct swap_extent next_se(struct* swap_extent *se)
168	{
169	struct rb_node *rb = rb_next(&se->rb_node);
170	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
171	}
172
173	/*
174	* swapon tell device that all the old swap contents can be discarded,
175	* to allow the swap device to optimize its wear-levelling.
176	*/
177	static int discard_swap(struct swap_info_struct *si)
178	{
179	struct swap_extent *se;
180	sector_t start_block;
181	sector_t nr_blocks;
182	int err = `0`;
183
184	/ Do not discard the swap header page! /
185	se = first_se(sis: si);
186	start_block = (se->start_block + `1`) << (PAGE_SHIFT - `9`);
187	nr_blocks = ((sector_t)se->nr_pages - `1`) << (PAGE_SHIFT - `9`);
188	if (nr_blocks) {
189	err = blkdev_issue_discard(bdev: si->bdev, sector: start_block,
190	nr_sects: nr_blocks, GFP_KERNEL);
191	if (err)
192	return err;
193	cond_resched();
194	}
195
196	for (se = next_se(se); se; se = next_se(se)) {
197	start_block = se->start_block << (PAGE_SHIFT - `9`);
198	nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - `9`);
199
200	err = blkdev_issue_discard(bdev: si->bdev, sector: start_block,
201	nr_sects: nr_blocks, GFP_KERNEL);
202	if (err)
203	break;
204
205	cond_resched();
206	}
207	return err; / That will often be -EOPNOTSUPP /
208	}
209
210	static struct swap_extent *
211	offset_to_swap_extent(struct swap_info_struct sis, unsigned* long offset)
212	{
213	struct swap_extent *se;
214	struct rb_node *rb;
215
216	rb = sis->swap_extent_root.rb_node;
217	while (rb) {
218	se = rb_entry(rb, struct swap_extent, rb_node);
219	if (offset < se->start_page)
220	rb = rb->rb_left;
221	else if (offset >= se->start_page + se->nr_pages)
222	rb = rb->rb_right;
223	else
224	return se;
225	}
226	/ It must be present /
227	BUG();
228	}
229
230	sector_t swap_page_sector(struct page *page)
231	{
232	struct swap_info_struct *sis = page_swap_info(page);
233	struct swap_extent *se;
234	sector_t sector;
235	pgoff_t offset;
236
237	offset = __page_file_index(page);
238	se = offset_to_swap_extent(sis, offset);
239	sector = se->start_block + (offset - se->start_page);
240	return sector << (PAGE_SHIFT - `9`);
241	}
242
243	/*
244	* swap allocation tell device that a cluster of swap can now be discarded,
245	* to allow the swap device to optimize its wear-levelling.
246	*/
247	static void discard_swap_cluster(struct swap_info_struct *si,
248	pgoff_t start_page, pgoff_t nr_pages)
249	{
250	struct swap_extent *se = offset_to_swap_extent(sis: si, offset: start_page);
251
252	while (nr_pages) {
253	pgoff_t offset = start_page - se->start_page;
254	sector_t start_block = se->start_block + offset;
255	sector_t nr_blocks = se->nr_pages - offset;
256
257	if (nr_blocks > nr_pages)
258	nr_blocks = nr_pages;
259	start_page += nr_blocks;
260	nr_pages -= nr_blocks;
261
262	start_block <<= PAGE_SHIFT - `9`;
263	nr_blocks <<= PAGE_SHIFT - `9`;
264	if (blkdev_issue_discard(bdev: si->bdev, sector: start_block,
265	nr_sects: nr_blocks, GFP_NOIO))
266	break;
267
268	se = next_se(se);
269	}
270	}
271
/*
 * SWAPFILE_CLUSTER is the number of swap_map slots in one cluster;
 * swap_entry_size() maps a requested entry size to the size actually used.
 */
#if defined(CONFIG_THP_SWAP)
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else /* !CONFIG_THP_SWAP */
#define SWAPFILE_CLUSTER	256

/*
 * With !CONFIG_THP_SWAP, swap_entry_size() is a constant so that the
 * compiler can optimize out the code that depends on it.
 */
#define swap_entry_size(size)	1
#endif /* CONFIG_THP_SWAP */

/* Initial value of the latency_ration counters used while scanning. */
#define LATENCY_LIMIT		256
286
287	static inline void cluster_set_flag(struct swap_cluster_info *info,
288	unsigned int flag)
289	{
290	info->flags = flag;
291	}
292
293	static inline unsigned int cluster_count(struct swap_cluster_info *info)
294	{
295	return info->data;
296	}
297
298	static inline void cluster_set_count(struct swap_cluster_info *info,
299	unsigned int c)
300	{
301	info->data = c;
302	}
303
304	static inline void cluster_set_count_flag(struct swap_cluster_info *info,
305	unsigned int c, unsigned int f)
306	{
307	info->flags = f;
308	info->data = c;
309	}
310
311	static inline unsigned int cluster_next(struct swap_cluster_info *info)
312	{
313	return info->data;
314	}
315
316	static inline void cluster_set_next(struct swap_cluster_info *info,
317	unsigned int n)
318	{
319	info->data = n;
320	}
321
322	static inline void cluster_set_next_flag(struct swap_cluster_info *info,
323	unsigned int n, unsigned int f)
324	{
325	info->flags = f;
326	info->data = n;
327	}
328
329	static inline bool cluster_is_free(struct swap_cluster_info *info)
330	{
331	return info->flags & CLUSTER_FLAG_FREE;
332	}
333
334	static inline bool cluster_is_null(struct swap_cluster_info *info)
335	{
336	return info->flags & CLUSTER_FLAG_NEXT_NULL;
337	}
338
339	static inline void cluster_set_null(struct swap_cluster_info *info)
340	{
341	info->flags = CLUSTER_FLAG_NEXT_NULL;
342	info->data = `0`;
343	}
344
345	static inline bool cluster_is_huge(struct swap_cluster_info *info)
346	{
347	if (IS_ENABLED(CONFIG_THP_SWAP))
348	return info->flags & CLUSTER_FLAG_HUGE;
349	return false;
350	}
351
352	static inline void cluster_clear_huge(struct swap_cluster_info *info)
353	{
354	info->flags &= ~CLUSTER_FLAG_HUGE;
355	}
356
357	static inline struct swap_cluster_info lock_cluster(struct* swap_info_struct *si,
358	unsigned long offset)
359	{
360	struct swap_cluster_info *ci;
361
362	ci = si->cluster_info;
363	if (ci) {
364	ci += offset / SWAPFILE_CLUSTER;
365	spin_lock(lock: &ci->lock);
366	}
367	return ci;
368	}
369
370	static inline void unlock_cluster(struct swap_cluster_info *ci)
371	{
372	if (ci)
373	spin_unlock(lock: &ci->lock);
374	}
375
376	/*
377	* Determine the locking method in use for this device. Return
378	* swap_cluster_info if SSD-style cluster-based locking is in place.
379	*/
380	static inline struct swap_cluster_info *lock_cluster_or_swap_info(
381	struct swap_info_struct si, unsigned* long offset)
382	{
383	struct swap_cluster_info *ci;
384
385	/ Try to use fine-grained SSD-style locking if available: /
386	ci = lock_cluster(si, offset);
387	/ Otherwise, fall back to traditional, coarse locking: /
388	if (!ci)
389	spin_lock(lock: &si->lock);
390
391	return ci;
392	}
393
394	static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
395	struct swap_cluster_info *ci)
396	{
397	if (ci)
398	unlock_cluster(ci);
399	else
400	spin_unlock(lock: &si->lock);
401	}
402
403	static inline bool cluster_list_empty(struct swap_cluster_list *list)
404	{
405	return cluster_is_null(info: &list->head);
406	}
407
408	static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
409	{
410	return cluster_next(info: &list->head);
411	}
412
413	static void cluster_list_init(struct swap_cluster_list *list)
414	{
415	cluster_set_null(info: &list->head);
416	cluster_set_null(info: &list->tail);
417	}
418
419	static void cluster_list_add_tail(struct swap_cluster_list *list,
420	struct swap_cluster_info *ci,
421	unsigned int idx)
422	{
423	if (cluster_list_empty(list)) {
424	cluster_set_next_flag(info: &list->head, n: idx, f: `0`);
425	cluster_set_next_flag(info: &list->tail, n: idx, f: `0`);
426	} else {
427	struct swap_cluster_info *ci_tail;
428	unsigned int tail = cluster_next(info: &list->tail);
429
430	/*
431	* Nested cluster lock, but both cluster locks are
432	* only acquired when we held swap_info_struct->lock
433	*/
434	ci_tail = ci + tail;
435	spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
436	cluster_set_next(info: ci_tail, n: idx);
437	spin_unlock(lock: &ci_tail->lock);
438	cluster_set_next_flag(info: &list->tail, n: idx, f: `0`);
439	}
440	}
441
442	static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
443	struct swap_cluster_info *ci)
444	{
445	unsigned int idx;
446
447	idx = cluster_next(info: &list->head);
448	if (cluster_next(info: &list->tail) == idx) {
449	cluster_set_null(info: &list->head);
450	cluster_set_null(info: &list->tail);
451	} else
452	cluster_set_next_flag(info: &list->head,
453	n: cluster_next(info: &ci[idx]), f: `0`);
454
455	return idx;
456	}
457
458	/ Add a cluster to discard list and schedule it to do discard /
459	static void swap_cluster_schedule_discard(struct swap_info_struct *si,
460	unsigned int idx)
461	{
462	/*
463	* If scan_swap_map_slots() can't find a free cluster, it will check
464	* si->swap_map directly. To make sure the discarding cluster isn't
465	* taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
466	* It will be cleared after discard
467	*/
468	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
469	SWAP_MAP_BAD, SWAPFILE_CLUSTER);
470
471	cluster_list_add_tail(list: &si->discard_clusters, ci: si->cluster_info, idx);
472
473	schedule_work(work: &si->discard_work);
474	}
475
476	static void __free_cluster(struct swap_info_struct si, unsigned* long idx)
477	{
478	struct swap_cluster_info *ci = si->cluster_info;
479
480	cluster_set_flag(info: ci + idx, CLUSTER_FLAG_FREE);
481	cluster_list_add_tail(list: &si->free_clusters, ci, idx);
482	}
483
484	/*
485	* Doing discard actually. After a cluster discard is finished, the cluster
486	* will be added to free cluster list. caller should hold si->lock.
487	*/
488	static void swap_do_scheduled_discard(struct swap_info_struct *si)
489	{
490	struct swap_cluster_info info, ci;
491	unsigned int idx;
492
493	info = si->cluster_info;
494
495	while (!cluster_list_empty(list: &si->discard_clusters)) {
496	idx = cluster_list_del_first(list: &si->discard_clusters, ci: info);
497	spin_unlock(lock: &si->lock);
498
499	discard_swap_cluster(si, start_page: idx * SWAPFILE_CLUSTER,
500	SWAPFILE_CLUSTER);
501
502	spin_lock(lock: &si->lock);
503	ci = lock_cluster(si, offset: idx * SWAPFILE_CLUSTER);
504	__free_cluster(si, idx);
505	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
506	`0`, SWAPFILE_CLUSTER);
507	unlock_cluster(ci);
508	}
509	}
510
511	static void swap_discard_work(struct work_struct *work)
512	{
513	struct swap_info_struct *si;
514
515	si = container_of(work, struct swap_info_struct, discard_work);
516
517	spin_lock(lock: &si->lock);
518	swap_do_scheduled_discard(si);
519	spin_unlock(lock: &si->lock);
520	}
521
522	static void swap_users_ref_free(struct percpu_ref *ref)
523	{
524	struct swap_info_struct *si;
525
526	si = container_of(ref, struct swap_info_struct, users);
527	complete(&si->comp);
528	}
529
530	static void alloc_cluster(struct swap_info_struct si, unsigned* long idx)
531	{
532	struct swap_cluster_info *ci = si->cluster_info;
533
534	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
535	cluster_list_del_first(list: &si->free_clusters, ci);
536	cluster_set_count_flag(info: ci + idx, c: `0`, f: `0`);
537	}
538
539	static void free_cluster(struct swap_info_struct si, unsigned* long idx)
540	{
541	struct swap_cluster_info *ci = si->cluster_info + idx;
542
543	VM_BUG_ON(cluster_count(ci) != `0`);
544	/*
545	* If the swap is discardable, prepare discard the cluster
546	* instead of free it immediately. The cluster will be freed
547	* after discard.
548	*/
549	if ((si->flags & (SWP_WRITEOK \| SWP_PAGE_DISCARD)) ==
550	(SWP_WRITEOK \| SWP_PAGE_DISCARD)) {
551	swap_cluster_schedule_discard(si, idx);
552	return;
553	}
554
555	__free_cluster(si, idx);
556	}
557
558	/*
559	* The cluster corresponding to page_nr will be used. The cluster will be
560	* removed from free cluster list and its usage counter will be increased.
561	*/
562	static void inc_cluster_info_page(struct swap_info_struct *p,
563	struct swap_cluster_info cluster_info, unsigned* long page_nr)
564	{
565	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
566
567	if (!cluster_info)
568	return;
569	if (cluster_is_free(info: &cluster_info[idx]))
570	alloc_cluster(si: p, idx);
571
572	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
573	cluster_set_count(info: &cluster_info[idx],
574	c: cluster_count(info: &cluster_info[idx]) + `1`);
575	}
576
577	/*
578	* The cluster corresponding to page_nr decreases one usage. If the usage
579	* counter becomes 0, which means no page in the cluster is in using, we can
580	* optionally discard the cluster and add it to free cluster list.
581	*/
582	static void dec_cluster_info_page(struct swap_info_struct *p,
583	struct swap_cluster_info cluster_info, unsigned* long page_nr)
584	{
585	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
586
587	if (!cluster_info)
588	return;
589
590	VM_BUG_ON(cluster_count(&cluster_info[idx]) == `0`);
591	cluster_set_count(info: &cluster_info[idx],
592	c: cluster_count(info: &cluster_info[idx]) - `1`);
593
594	if (cluster_count(info: &cluster_info[idx]) == `0`)
595	free_cluster(si: p, idx);
596	}
597
598	/*
599	* It's possible scan_swap_map_slots() uses a free cluster in the middle of free
600	* cluster list. Avoiding such abuse to avoid list corruption.
601	*/
602	static bool
603	scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
604	unsigned long offset)
605	{
606	struct percpu_cluster *percpu_cluster;
607	bool conflict;
608
609	offset /= SWAPFILE_CLUSTER;
610	conflict = !cluster_list_empty(list: &si->free_clusters) &&
611	offset != cluster_list_first(list: &si->free_clusters) &&
612	cluster_is_free(info: &si->cluster_info[offset]);
613
614	if (!conflict)
615	return false;
616
617	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
618	cluster_set_null(info: &percpu_cluster->index);
619	return true;
620	}
621
622	/*
623	* Try to get a swap entry from current cpu's swap entry pool (a cluster). This
624	* might involve allocating a new cluster for current CPU too.
625	*/
626	static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
627	unsigned long offset, unsigned* long *scan_base)
628	{
629	struct percpu_cluster *cluster;
630	struct swap_cluster_info *ci;
631	unsigned long tmp, max;
632
633	new_cluster:
634	cluster = this_cpu_ptr(si->percpu_cluster);
635	if (cluster_is_null(info: &cluster->index)) {
636	if (!cluster_list_empty(list: &si->free_clusters)) {
637	cluster->index = si->free_clusters.head;
638	cluster->next = cluster_next(info: &cluster->index) *
639	SWAPFILE_CLUSTER;
640	} else if (!cluster_list_empty(list: &si->discard_clusters)) {
641	/*
642	* we don't have free cluster but have some clusters in
643	* discarding, do discard now and reclaim them, then
644	* reread cluster_next_cpu since we dropped si->lock
645	*/
646	swap_do_scheduled_discard(si);
647	scan_base = this_cpu_read(si->cluster_next_cpu);
648	offset = scan_base;
649	goto new_cluster;
650	} else
651	return false;
652	}
653
654	/*
655	* Other CPUs can use our cluster if they can't find a free cluster,
656	* check if there is still free entry in the cluster
657	*/
658	tmp = cluster->next;
659	max = min_t(unsigned long, si->max,
660	(cluster_next(&cluster->index) + `1`) * SWAPFILE_CLUSTER);
661	if (tmp < max) {
662	ci = lock_cluster(si, offset: tmp);
663	while (tmp < max) {
664	if (!si->swap_map[tmp])
665	break;
666	tmp++;
667	}
668	unlock_cluster(ci);
669	}
670	if (tmp >= max) {
671	cluster_set_null(info: &cluster->index);
672	goto new_cluster;
673	}
674	cluster->next = tmp + `1`;
675	*offset = tmp;
676	*scan_base = tmp;
677	return true;
678	}
679
680	static void __del_from_avail_list(struct swap_info_struct *p)
681	{
682	int nid;
683
684	assert_spin_locked(&p->lock);
685	for_each_node(nid)
686	plist_del(node: &p->avail_lists[nid], head: &swap_avail_heads[nid]);
687	}
688
689	static void del_from_avail_list(struct swap_info_struct *p)
690	{
691	spin_lock(lock: &swap_avail_lock);
692	__del_from_avail_list(p);
693	spin_unlock(lock: &swap_avail_lock);
694	}
695
696	static void swap_range_alloc(struct swap_info_struct si, unsigned* long offset,
697	unsigned int nr_entries)
698	{
699	unsigned int end = offset + nr_entries - `1`;
700
701	if (offset == si->lowest_bit)
702	si->lowest_bit += nr_entries;
703	if (end == si->highest_bit)
704	WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
705	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
706	if (si->inuse_pages == si->pages) {
707	si->lowest_bit = si->max;
708	si->highest_bit = `0`;
709	del_from_avail_list(p: si);
710	}
711	}
712
713	static void add_to_avail_list(struct swap_info_struct *p)
714	{
715	int nid;
716
717	spin_lock(lock: &swap_avail_lock);
718	for_each_node(nid)
719	plist_add(node: &p->avail_lists[nid], head: &swap_avail_heads[nid]);
720	spin_unlock(lock: &swap_avail_lock);
721	}
722
723	static void swap_range_free(struct swap_info_struct si, unsigned* long offset,
724	unsigned int nr_entries)
725	{
726	unsigned long begin = offset;
727	unsigned long end = offset + nr_entries - `1`;
728	void (swap_slot_free_notify)(struct* block_device , unsigned* long);
729
730	if (offset < si->lowest_bit)
731	si->lowest_bit = offset;
732	if (end > si->highest_bit) {
733	bool was_full = !si->highest_bit;
734
735	WRITE_ONCE(si->highest_bit, end);
736	if (was_full && (si->flags & SWP_WRITEOK))
737	add_to_avail_list(p: si);
738	}
739	atomic_long_add(i: nr_entries, v: &nr_swap_pages);
740	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
741	if (si->flags & SWP_BLKDEV)
742	swap_slot_free_notify =
743	si->bdev->bd_disk->fops->swap_slot_free_notify;
744	else
745	swap_slot_free_notify = NULL;
746	while (offset <= end) {
747	arch_swap_invalidate_page(type: si->type, offset);
748	zswap_invalidate(type: si->type, offset);
749	if (swap_slot_free_notify)
750	swap_slot_free_notify(si->bdev, offset);
751	offset++;
752	}
753	clear_shadow_from_swap_cache(type: si->type, begin, end);
754	}
755
756	static void set_cluster_next(struct swap_info_struct si, unsigned* long next)
757	{
758	unsigned long prev;
759
760	if (!(si->flags & SWP_SOLIDSTATE)) {
761	si->cluster_next = next;
762	return;
763	}
764
765	prev = this_cpu_read(*si->cluster_next_cpu);
766	/*
767	* Cross the swap address space size aligned trunk, choose
768	* another trunk randomly to avoid lock contention on swap
769	* address space if possible.
770	*/
771	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
772	(next >> SWAP_ADDRESS_SPACE_SHIFT)) {
773	/ No free swap slots available /
774	if (si->highest_bit <= si->lowest_bit)
775	return;
776	next = get_random_u32_inclusive(floor: si->lowest_bit, ceil: si->highest_bit);
777	next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
778	next = max_t(unsigned int, next, si->lowest_bit);
779	}
780	this_cpu_write(*si->cluster_next_cpu, next);
781	}
782
783	static bool swap_offset_available_and_locked(struct swap_info_struct *si,
784	unsigned long offset)
785	{
786	if (data_race(!si->swap_map[offset])) {
787	spin_lock(lock: &si->lock);
788	return true;
789	}
790
791	if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
792	spin_lock(lock: &si->lock);
793	return true;
794	}
795
796	return false;
797	}
798
799	static int scan_swap_map_slots(struct swap_info_struct *si,
800	unsigned char usage, int nr,
801	swp_entry_t slots[])
802	{
803	struct swap_cluster_info *ci;
804	unsigned long offset;
805	unsigned long scan_base;
806	unsigned long last_in_cluster = `0`;
807	int latency_ration = LATENCY_LIMIT;
808	int n_ret = `0`;
809	bool scanned_many = false;
810
811	/*
812	* We try to cluster swap pages by allocating them sequentially
813	* in swap. Once we've allocated SWAPFILE_CLUSTER pages this
814	* way, however, we resort to first-free allocation, starting
815	* a new cluster. This prevents us from scattering swap pages
816	* all over the entire swap partition, so that we reduce
817	* overall disk seek times between swap pages. -- sct
818	* But we do now try to find an empty cluster. -Andrea
819	* And we let swap pages go all over an SSD partition. Hugh
820	*/
821
822	si->flags += SWP_SCANNING;
823	/*
824	* Use percpu scan base for SSD to reduce lock contention on
825	* cluster and swap cache. For HDD, sequential access is more
826	* important.
827	*/
828	if (si->flags & SWP_SOLIDSTATE)
829	scan_base = this_cpu_read(*si->cluster_next_cpu);
830	else
831	scan_base = si->cluster_next;
832	offset = scan_base;
833
834	/ SSD algorithm /
835	if (si->cluster_info) {
836	if (!scan_swap_map_try_ssd_cluster(si, offset: &offset, scan_base: &scan_base))
837	goto scan;
838	} else if (unlikely(!si->cluster_nr--)) {
839	if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
840	si->cluster_nr = SWAPFILE_CLUSTER - `1`;
841	goto checks;
842	}
843
844	spin_unlock(lock: &si->lock);
845
846	/*
847	* If seek is expensive, start searching for new cluster from
848	* start of partition, to minimize the span of allocated swap.
849	* If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
850	* case, just handled by scan_swap_map_try_ssd_cluster() above.
851	*/
852	scan_base = offset = si->lowest_bit;
853	last_in_cluster = offset + SWAPFILE_CLUSTER - `1`;
854
855	/ Locate the first empty (unaligned) cluster /
856	for (; last_in_cluster <= si->highest_bit; offset++) {
857	if (si->swap_map[offset])
858	last_in_cluster = offset + SWAPFILE_CLUSTER;
859	else if (offset == last_in_cluster) {
860	spin_lock(lock: &si->lock);
861	offset -= SWAPFILE_CLUSTER - `1`;
862	si->cluster_next = offset;
863	si->cluster_nr = SWAPFILE_CLUSTER - `1`;
864	goto checks;
865	}
866	if (unlikely(--latency_ration < `0`)) {
867	cond_resched();
868	latency_ration = LATENCY_LIMIT;
869	}
870	}
871
872	offset = scan_base;
873	spin_lock(lock: &si->lock);
874	si->cluster_nr = SWAPFILE_CLUSTER - `1`;
875	}
876
877	checks:
878	if (si->cluster_info) {
879	while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
880	/ take a break if we already got some slots /
881	if (n_ret)
882	goto done;
883	if (!scan_swap_map_try_ssd_cluster(si, offset: &offset,
884	scan_base: &scan_base))
885	goto scan;
886	}
887	}
888	if (!(si->flags & SWP_WRITEOK))
889	goto no_page;
890	if (!si->highest_bit)
891	goto no_page;
892	if (offset > si->highest_bit)
893	scan_base = offset = si->lowest_bit;
894
895	ci = lock_cluster(si, offset);
896	/ reuse swap entry of cache-only swap if not busy. /
897	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
898	int swap_was_freed;
899	unlock_cluster(ci);
900	spin_unlock(lock: &si->lock);
901	swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
902	spin_lock(lock: &si->lock);
903	/ entry was freed successfully, try to use this again /
904	if (swap_was_freed)
905	goto checks;
906	goto scan; / check next one /
907	}
908
909	if (si->swap_map[offset]) {
910	unlock_cluster(ci);
911	if (!n_ret)
912	goto scan;
913	else
914	goto done;
915	}
916	WRITE_ONCE(si->swap_map[offset], usage);
917	inc_cluster_info_page(p: si, cluster_info: si->cluster_info, page_nr: offset);
918	unlock_cluster(ci);
919
920	swap_range_alloc(si, offset, nr_entries: `1`);
921	slots[n_ret++] = swp_entry(type: si->type, offset);
922
923	/ got enough slots or reach max slots? /
924	if ((n_ret == nr) \|\| (offset >= si->highest_bit))
925	goto done;
926
927	/ search for next available slot /
928
929	/ time to take a break? /
930	if (unlikely(--latency_ration < `0`)) {
931	if (n_ret)
932	goto done;
933	spin_unlock(lock: &si->lock);
934	cond_resched();
935	spin_lock(lock: &si->lock);
936	latency_ration = LATENCY_LIMIT;
937	}
938
939	/ try to get more slots in cluster /
940	if (si->cluster_info) {
941	if (scan_swap_map_try_ssd_cluster(si, offset: &offset, scan_base: &scan_base))
942	goto checks;
943	} else if (si->cluster_nr && !si->swap_map[++offset]) {
944	/ non-ssd case, still more slots in cluster? /
945	--si->cluster_nr;
946	goto checks;
947	}
948
949	/*
950	* Even if there's no free clusters available (fragmented),
951	* try to scan a little more quickly with lock held unless we
952	* have scanned too many slots already.
953	*/
954	if (!scanned_many) {
955	unsigned long scan_limit;
956
957	if (offset < scan_base)
958	scan_limit = scan_base;
959	else
960	scan_limit = si->highest_bit;
961	for (; offset <= scan_limit && --latency_ration > `0`;
962	offset++) {
963	if (!si->swap_map[offset])
964	goto checks;
965	}
966	}
967
968	done:
969	set_cluster_next(si, next: offset + `1`);
970	si->flags -= SWP_SCANNING;
971	return n_ret;
972
973	scan:
974	spin_unlock(lock: &si->lock);
975	while (++offset <= READ_ONCE(si->highest_bit)) {
976	if (unlikely(--latency_ration < `0`)) {
977	cond_resched();
978	latency_ration = LATENCY_LIMIT;
979	scanned_many = true;
980	}
981	if (swap_offset_available_and_locked(si, offset))
982	goto checks;
983	}
984	offset = si->lowest_bit;
985	while (offset < scan_base) {
986	if (unlikely(--latency_ration < `0`)) {
987	cond_resched();
988	latency_ration = LATENCY_LIMIT;
989	scanned_many = true;
990	}
991	if (swap_offset_available_and_locked(si, offset))
992	goto checks;
993	offset++;
994	}
995	spin_lock(lock: &si->lock);
996
997	no_page:
998	si->flags -= SWP_SCANNING;
999	return n_ret;
1000	}
1001
1002	static int swap_alloc_cluster(struct swap_info_struct si, swp_entry_t slot)
1003	{
1004	unsigned long idx;
1005	struct swap_cluster_info *ci;
1006	unsigned long offset;
1007
1008	/*
1009	* Should not even be attempting cluster allocations when huge
1010	* page swap is disabled. Warn and fail the allocation.
1011	*/
1012	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
1013	VM_WARN_ON_ONCE(`1`);
1014	return `0`;
1015	}
1016
1017	if (cluster_list_empty(list: &si->free_clusters))
1018	return `0`;
1019
1020	idx = cluster_list_first(list: &si->free_clusters);
1021	offset = idx * SWAPFILE_CLUSTER;
1022	ci = lock_cluster(si, offset);
1023	alloc_cluster(si, idx);
1024	cluster_set_count_flag(info: ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
1025
1026	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
1027	unlock_cluster(ci);
1028	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
1029	*slot = swp_entry(type: si->type, offset);
1030
1031	return `1`;
1032	}
1033
1034	static void swap_free_cluster(struct swap_info_struct si, unsigned* long idx)
1035	{
1036	unsigned long offset = idx * SWAPFILE_CLUSTER;
1037	struct swap_cluster_info *ci;
1038
1039	ci = lock_cluster(si, offset);
1040	memset(si->swap_map + offset, `0`, SWAPFILE_CLUSTER);
1041	cluster_set_count_flag(info: ci, c: `0`, f: `0`);
1042	free_cluster(si, idx);
1043	unlock_cluster(ci);
1044	swap_range_free(si, offset, SWAPFILE_CLUSTER);
1045	}
1046
1047	int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
1048	{
1049	unsigned long size = swap_entry_size(entry_size);
1050	struct swap_info_struct si, next;
1051	long avail_pgs;
1052	int n_ret = `0`;
1053	int node;
1054
1055	/ Only single cluster request supported /
1056	WARN_ON_ONCE(n_goal > `1` && size == SWAPFILE_CLUSTER);
1057
1058	spin_lock(lock: &swap_avail_lock);
1059
1060	avail_pgs = atomic_long_read(v: &nr_swap_pages) / size;
1061	if (avail_pgs <= `0`) {
1062	spin_unlock(lock: &swap_avail_lock);
1063	goto noswap;
1064	}
1065
1066	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
1067
1068	atomic_long_sub(i: n_goal * size, v: &nr_swap_pages);
1069
1070	start_over:
1071	node = numa_node_id();
1072	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
1073	/ requeue si to after same-priority siblings /
1074	plist_requeue(node: &si->avail_lists[node], head: &swap_avail_heads[node]);
1075	spin_unlock(lock: &swap_avail_lock);
1076	spin_lock(lock: &si->lock);
1077	if (!si->highest_bit \|\| !(si->flags & SWP_WRITEOK)) {
1078	spin_lock(lock: &swap_avail_lock);
1079	if (plist_node_empty(node: &si->avail_lists[node])) {
1080	spin_unlock(lock: &si->lock);
1081	goto nextsi;
1082	}
1083	WARN(!si->highest_bit,
1084	"swap_info %d in list but !highest_bit\n",
1085	si->type);
1086	WARN(!(si->flags & SWP_WRITEOK),
1087	"swap_info %d in list but !SWP_WRITEOK\n",
1088	si->type);
1089	__del_from_avail_list(p: si);
1090	spin_unlock(lock: &si->lock);
1091	goto nextsi;
1092	}
1093	if (size == SWAPFILE_CLUSTER) {
1094	if (si->flags & SWP_BLKDEV)
1095	n_ret = swap_alloc_cluster(si, slot: swp_entries);
1096	} else
1097	n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
1098	nr: n_goal, slots: swp_entries);
1099	spin_unlock(lock: &si->lock);
1100	if (n_ret \|\| size == SWAPFILE_CLUSTER)
1101	goto check_out;
1102	cond_resched();
1103
1104	spin_lock(lock: &swap_avail_lock);
1105	nextsi:
1106	/*
1107	* if we got here, it's likely that si was almost full before,
1108	* and since scan_swap_map_slots() can drop the si->lock,
1109	* multiple callers probably all tried to get a page from the
1110	* same si and it filled up before we could get one; or, the si
1111	* filled up between us dropping swap_avail_lock and taking
1112	* si->lock. Since we dropped the swap_avail_lock, the
1113	* swap_avail_head list may have been modified; so if next is
1114	* still in the swap_avail_head list then try it, otherwise
1115	* start over if we have not gotten any slots.
1116	*/
1117	if (plist_node_empty(node: &next->avail_lists[node]))
1118	goto start_over;
1119	}
1120
1121	spin_unlock(lock: &swap_avail_lock);
1122
1123	check_out:
1124	if (n_ret < n_goal)
1125	atomic_long_add(i: (long)(n_goal - n_ret) * size,
1126	v: &nr_swap_pages);
1127	noswap:
1128	return n_ret;
1129	}
1130
1131	static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
1132	{
1133	struct swap_info_struct *p;
1134	unsigned long offset;
1135
1136	if (!entry.val)
1137	goto out;
1138	p = swp_swap_info(entry);
1139	if (!p)
1140	goto bad_nofile;
1141	if (data_race(!(p->flags & SWP_USED)))
1142	goto bad_device;
1143	offset = swp_offset(entry);
1144	if (offset >= p->max)
1145	goto bad_offset;
1146	if (data_race(!p->swap_map[swp_offset(entry)]))
1147	goto bad_free;
1148	return p;
1149
1150	bad_free:
1151	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
1152	goto out;
1153	bad_offset:
1154	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1155	goto out;
1156	bad_device:
1157	pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
1158	goto out;
1159	bad_nofile:
1160	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1161	out:
1162	return NULL;
1163	}
1164
1165	static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
1166	struct swap_info_struct *q)
1167	{
1168	struct swap_info_struct *p;
1169
1170	p = _swap_info_get(entry);
1171
1172	if (p != q) {
1173	if (q != NULL)
1174	spin_unlock(lock: &q->lock);
1175	if (p != NULL)
1176	spin_lock(lock: &p->lock);
1177	}
1178	return p;
1179	}
1180
1181	static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
1182	unsigned long offset,
1183	unsigned char usage)
1184	{
1185	unsigned char count;
1186	unsigned char has_cache;
1187
1188	count = p->swap_map[offset];
1189
1190	has_cache = count & SWAP_HAS_CACHE;
1191	count &= ~SWAP_HAS_CACHE;
1192
1193	if (usage == SWAP_HAS_CACHE) {
1194	VM_BUG_ON(!has_cache);
1195	has_cache = `0`;
1196	} else if (count == SWAP_MAP_SHMEM) {
1197	/*
1198	* Or we could insist on shmem.c using a special
1199	* swap_shmem_free() and free_shmem_swap_and_cache()...
1200	*/
1201	count = `0`;
1202	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
1203	if (count == COUNT_CONTINUED) {
1204	if (swap_count_continued(p, offset, count))
1205	count = SWAP_MAP_MAX \| COUNT_CONTINUED;
1206	else
1207	count = SWAP_MAP_MAX;
1208	} else
1209	count--;
1210	}
1211
1212	usage = count \| has_cache;
1213	if (usage)
1214	WRITE_ONCE(p->swap_map[offset], usage);
1215	else
1216	WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
1217
1218	return usage;
1219	}
1220
1221	/*
1222	* When we get a swap entry, if there aren't some other ways to
1223	* prevent swapoff, such as the folio in swap cache is locked, page
1224	* table lock is held, etc., the swap entry may become invalid because
1225	* of swapoff. Then, we need to enclose all swap related functions
1226	* with get_swap_device() and put_swap_device(), unless the swap
1227	* functions call get/put_swap_device() by themselves.
1228	*
1229	* Check whether swap entry is valid in the swap device. If so,
1230	* return pointer to swap_info_struct, and keep the swap entry valid
1231	* via preventing the swap device from being swapoff, until
1232	* put_swap_device() is called. Otherwise return NULL.
1233	*
1234	* Notice that swapoff or swapoff+swapon can still happen before the
1235	* percpu_ref_tryget_live() in get_swap_device() or after the
1236	* percpu_ref_put() in put_swap_device() if there isn't any other way
1237	* to prevent swapoff. The caller must be prepared for that. For
1238	* example, the following situation is possible.
1239	*
1240	* CPU1 CPU2
1241	* do_swap_page()
1242	* ... swapoff+swapon
1243	* __read_swap_cache_async()
1244	* swapcache_prepare()
1245	* __swap_duplicate()
1246	* // check swap_map
1247	* // verify PTE not changed
1248	*
1249	* In __swap_duplicate(), the swap_map need to be checked before
1250	* changing partly because the specified swap entry may be for another
1251	* swap device which has been swapoff. And in do_swap_page(), after
1252	* the page is read from the swap device, the PTE is verified not
1253	* changed with the page table locked to check whether the swap device
1254	* has been swapoff or swapoff+swapon.
1255	*/
1256	struct swap_info_struct *get_swap_device(swp_entry_t entry)
1257	{
1258	struct swap_info_struct *si;
1259	unsigned long offset;
1260
1261	if (!entry.val)
1262	goto out;
1263	si = swp_swap_info(entry);
1264	if (!si)
1265	goto bad_nofile;
1266	if (!percpu_ref_tryget_live(ref: &si->users))
1267	goto out;
1268	/*
1269	* Guarantee the si->users are checked before accessing other
1270	* fields of swap_info_struct.
1271	*
1272	* Paired with the spin_unlock() after setup_swap_info() in
1273	* enable_swap_info().
1274	*/
1275	smp_rmb();
1276	offset = swp_offset(entry);
1277	if (offset >= si->max)
1278	goto put_out;
1279
1280	return si;
1281	bad_nofile:
1282	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
1283	out:
1284	return NULL;
1285	put_out:
1286	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
1287	percpu_ref_put(ref: &si->users);
1288	return NULL;
1289	}
1290
1291	static unsigned char __swap_entry_free(struct swap_info_struct *p,
1292	swp_entry_t entry)
1293	{
1294	struct swap_cluster_info *ci;
1295	unsigned long offset = swp_offset(entry);
1296	unsigned char usage;
1297
1298	ci = lock_cluster_or_swap_info(si: p, offset);
1299	usage = __swap_entry_free_locked(p, offset, usage: `1`);
1300	unlock_cluster_or_swap_info(si: p, ci);
1301	if (!usage)
1302	free_swap_slot(entry);
1303
1304	return usage;
1305	}
1306
1307	static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
1308	{
1309	struct swap_cluster_info *ci;
1310	unsigned long offset = swp_offset(entry);
1311	unsigned char count;
1312
1313	ci = lock_cluster(si: p, offset);
1314	count = p->swap_map[offset];
1315	VM_BUG_ON(count != SWAP_HAS_CACHE);
1316	p->swap_map[offset] = `0`;
1317	dec_cluster_info_page(p, cluster_info: p->cluster_info, page_nr: offset);
1318	unlock_cluster(ci);
1319
1320	mem_cgroup_uncharge_swap(entry, nr_pages: `1`);
1321	swap_range_free(si: p, offset, nr_entries: `1`);
1322	}
1323
1324	/*
1325	* Caller has made sure that the swap device corresponding to entry
1326	* is still around or has not been recycled.
1327	*/
1328	void swap_free(swp_entry_t entry)
1329	{
1330	struct swap_info_struct *p;
1331
1332	p = _swap_info_get(entry);
1333	if (p)
1334	__swap_entry_free(p, entry);
1335	}
1336
1337	/*
1338	* Called after dropping swapcache to decrease refcnt to swap entries.
1339	*/
1340	void put_swap_folio(struct folio *folio, swp_entry_t entry)
1341	{
1342	unsigned long offset = swp_offset(entry);
1343	unsigned long idx = offset / SWAPFILE_CLUSTER;
1344	struct swap_cluster_info *ci;
1345	struct swap_info_struct *si;
1346	unsigned char *map;
1347	unsigned int i, free_entries = `0`;
1348	unsigned char val;
1349	int size = swap_entry_size(folio_nr_pages(folio));
1350
1351	si = _swap_info_get(entry);
1352	if (!si)
1353	return;
1354
1355	ci = lock_cluster_or_swap_info(si, offset);
1356	if (size == SWAPFILE_CLUSTER) {
1357	VM_BUG_ON(!cluster_is_huge(ci));
1358	map = si->swap_map + offset;
1359	for (i = `0`; i < SWAPFILE_CLUSTER; i++) {
1360	val = map[i];
1361	VM_BUG_ON(!(val & SWAP_HAS_CACHE));
1362	if (val == SWAP_HAS_CACHE)
1363	free_entries++;
1364	}
1365	cluster_clear_huge(info: ci);
1366	if (free_entries == SWAPFILE_CLUSTER) {
1367	unlock_cluster_or_swap_info(si, ci);
1368	spin_lock(lock: &si->lock);
1369	mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
1370	swap_free_cluster(si, idx);
1371	spin_unlock(lock: &si->lock);
1372	return;
1373	}
1374	}
1375	for (i = `0`; i < size; i++, entry.val++) {
1376	if (!__swap_entry_free_locked(p: si, offset: offset + i, SWAP_HAS_CACHE)) {
1377	unlock_cluster_or_swap_info(si, ci);
1378	free_swap_slot(entry);
1379	if (i == size - `1`)
1380	return;
1381	lock_cluster_or_swap_info(si, offset);
1382	}
1383	}
1384	unlock_cluster_or_swap_info(si, ci);
1385	}
1386
#ifdef CONFIG_THP_SWAP
/*
 * Clear the huge flag on the cluster that contains @entry.
 *
 * Return: 0 on success, -EBUSY if _swap_info_get() rejects @entry.
 */
int split_swap_cluster(swp_entry_t entry)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);

	si = _swap_info_get(entry);
	if (!si)
		return -EBUSY;
	ci = lock_cluster(si, offset);
	cluster_clear_huge(ci);
	unlock_cluster(ci);
	return 0;
}
#endif
1403
1404	static int swp_entry_cmp(const void ent1, const* void *ent2)
1405	{
1406	const swp_entry_t e1 = ent1, e2 = ent2;
1407
1408	return (int)swp_type(entry: e1) - (int)swp_type(entry: e2);
1409	}
1410
1411	void swapcache_free_entries(swp_entry_t entries, int* n)
1412	{
1413	struct swap_info_struct p, prev;
1414	int i;
1415
1416	if (n <= `0`)
1417	return;
1418
1419	prev = NULL;
1420	p = NULL;
1421
1422	/*
1423	* Sort swap entries by swap device, so each lock is only taken once.
1424	* nr_swapfiles isn't absolutely correct, but the overhead of sort() is
1425	* so low that it isn't necessary to optimize further.
1426	*/
1427	if (nr_swapfiles > `1`)
1428	sort(base: entries, num: n, size: sizeof(entries[`0`]), cmp_func: swp_entry_cmp, NULL);
1429	for (i = `0`; i < n; ++i) {
1430	p = swap_info_get_cont(entry: entries[i], q: prev);
1431	if (p)
1432	swap_entry_free(p, entry: entries[i]);
1433	prev = p;
1434	}
1435	if (p)
1436	spin_unlock(lock: &p->lock);
1437	}
1438
1439	int __swap_count(swp_entry_t entry)
1440	{
1441	struct swap_info_struct *si = swp_swap_info(entry);
1442	pgoff_t offset = swp_offset(entry);
1443
1444	return swap_count(ent: si->swap_map[offset]);
1445	}
1446
1447	/*
1448	* How many references to @entry are currently swapped out?
1449	* This does not give an exact answer when swap count is continued,
1450	* but does include the high COUNT_CONTINUED flag to allow for that.
1451	*/
1452	int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
1453	{
1454	pgoff_t offset = swp_offset(entry);
1455	struct swap_cluster_info *ci;
1456	int count;
1457
1458	ci = lock_cluster_or_swap_info(si, offset);
1459	count = swap_count(ent: si->swap_map[offset]);
1460	unlock_cluster_or_swap_info(si, ci);
1461	return count;
1462	}
1463
1464	/*
1465	* How many references to @entry are currently swapped out?
1466	* This considers COUNT_CONTINUED so it returns exact answer.
1467	*/
1468	int swp_swapcount(swp_entry_t entry)
1469	{
1470	int count, tmp_count, n;
1471	struct swap_info_struct *p;
1472	struct swap_cluster_info *ci;
1473	struct page *page;
1474	pgoff_t offset;
1475	unsigned char *map;
1476
1477	p = _swap_info_get(entry);
1478	if (!p)
1479	return `0`;
1480
1481	offset = swp_offset(entry);
1482
1483	ci = lock_cluster_or_swap_info(si: p, offset);
1484
1485	count = swap_count(ent: p->swap_map[offset]);
1486	if (!(count & COUNT_CONTINUED))
1487	goto out;
1488
1489	count &= ~COUNT_CONTINUED;
1490	n = SWAP_MAP_MAX + `1`;
1491
1492	page = vmalloc_to_page(addr: p->swap_map + offset);
1493	offset &= ~PAGE_MASK;
1494	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
1495
1496	do {
1497	page = list_next_entry(page, lru);
1498	map = kmap_atomic(page);
1499	tmp_count = map[offset];
1500	kunmap_atomic(map);
1501
1502	count += (tmp_count & ~COUNT_CONTINUED) * n;
1503	n *= (SWAP_CONT_MAX + `1`);
1504	} while (tmp_count & COUNT_CONTINUED);
1505	out:
1506	unlock_cluster_or_swap_info(si: p, ci);
1507	return count;
1508	}
1509
1510	static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
1511	swp_entry_t entry)
1512	{
1513	struct swap_cluster_info *ci;
1514	unsigned char *map = si->swap_map;
1515	unsigned long roffset = swp_offset(entry);
1516	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
1517	int i;
1518	bool ret = false;
1519
1520	ci = lock_cluster_or_swap_info(si, offset);
1521	if (!ci \|\| !cluster_is_huge(info: ci)) {
1522	if (swap_count(ent: map[roffset]))
1523	ret = true;
1524	goto unlock_out;
1525	}
1526	for (i = `0`; i < SWAPFILE_CLUSTER; i++) {
1527	if (swap_count(ent: map[offset + i])) {
1528	ret = true;
1529	break;
1530	}
1531	}
1532	unlock_out:
1533	unlock_cluster_or_swap_info(si, ci);
1534	return ret;
1535	}
1536
1537	static bool folio_swapped(struct folio *folio)
1538	{
1539	swp_entry_t entry = folio->swap;
1540	struct swap_info_struct *si = _swap_info_get(entry);
1541
1542	if (!si)
1543	return false;
1544
1545	if (!IS_ENABLED(CONFIG_THP_SWAP) \|\| likely(!folio_test_large(folio)))
1546	return swap_swapcount(si, entry) != `0`;
1547
1548	return swap_page_trans_huge_swapped(si, entry);
1549	}
1550
1551	/**
1552	* folio_free_swap() - Free the swap space used for this folio.
1553	* @folio: The folio to remove.
1554	*
1555	* If swap is getting full, or if there are no more mappings of this folio,
1556	* then call folio_free_swap to free its swap space.
1557	*
1558	* Return: true if we were able to release the swap space.
1559	*/
1560	bool folio_free_swap(struct folio *folio)
1561	{
1562	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1563
1564	if (!folio_test_swapcache(folio))
1565	return false;
1566	if (folio_test_writeback(folio))
1567	return false;
1568	if (folio_swapped(folio))
1569	return false;
1570
1571	/*
1572	* Once hibernation has begun to create its image of memory,
1573	* there's a danger that one of the calls to folio_free_swap()
1574	* - most probably a call from __try_to_reclaim_swap() while
1575	* hibernation is allocating its own swap pages for the image,
1576	* but conceivably even a call from memory reclaim - will free
1577	* the swap from a folio which has already been recorded in the
1578	* image as a clean swapcache folio, and then reuse its swap for
1579	* another page of the image. On waking from hibernation, the
1580	* original folio might be freed under memory pressure, then
1581	* later read back in from swap, now with the wrong data.
1582	*
1583	* Hibernation suspends storage while it is writing the image
1584	* to disk so check that here.
1585	*/
1586	if (pm_suspended_storage())
1587	return false;
1588
1589	delete_from_swap_cache(folio);
1590	folio_set_dirty(folio);
1591	return true;
1592	}
1593
1594	/*
1595	* Free the swap entry like above, but also try to
1596	* free the page cache entry if it is the last user.
1597	*/
1598	int free_swap_and_cache(swp_entry_t entry)
1599	{
1600	struct swap_info_struct *p;
1601	unsigned char count;
1602
1603	if (non_swap_entry(entry))
1604	return `1`;
1605
1606	p = _swap_info_get(entry);
1607	if (p) {
1608	count = __swap_entry_free(p, entry);
1609	if (count == SWAP_HAS_CACHE &&
1610	!swap_page_trans_huge_swapped(si: p, entry))
1611	__try_to_reclaim_swap(si: p, offset: swp_offset(entry),
1612	TTRS_UNMAPPED \| TTRS_FULL);
1613	}
1614	return p != NULL;
1615	}
1616
1617	#ifdef CONFIG_HIBERNATION
1618
1619	swp_entry_t get_swap_page_of_type(int type)
1620	{
1621	struct swap_info_struct *si = swap_type_to_swap_info(type);
1622	swp_entry_t entry = {`0`};
1623
1624	if (!si)
1625	goto fail;
1626
1627	/ This is called for allocating swap entry, not cache /
1628	spin_lock(lock: &si->lock);
1629	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, usage: `1`, nr: `1`, slots: &entry))
1630	atomic_long_dec(v: &nr_swap_pages);
1631	spin_unlock(lock: &si->lock);
1632	fail:
1633	return entry;
1634	}
1635
1636	/*
1637	* Find the swap type that corresponds to given device (if any).
1638	*
1639	* @offset - number of the PAGE_SIZE-sized block of the device, starting
1640	* from 0, in which the swap header is expected to be located.
1641	*
1642	* This is needed for the suspend to disk (aka swsusp).
1643	*/
1644	int swap_type_of(dev_t device, sector_t offset)
1645	{
1646	int type;
1647
1648	if (!device)
1649	return -`1`;
1650
1651	spin_lock(lock: &swap_lock);
1652	for (type = `0`; type < nr_swapfiles; type++) {
1653	struct swap_info_struct *sis = swap_info[type];
1654
1655	if (!(sis->flags & SWP_WRITEOK))
1656	continue;
1657
1658	if (device == sis->bdev->bd_dev) {
1659	struct swap_extent *se = first_se(sis);
1660
1661	if (se->start_block == offset) {
1662	spin_unlock(lock: &swap_lock);
1663	return type;
1664	}
1665	}
1666	}
1667	spin_unlock(lock: &swap_lock);
1668	return -ENODEV;
1669	}
1670
1671	int find_first_swap(dev_t *device)
1672	{
1673	int type;
1674
1675	spin_lock(lock: &swap_lock);
1676	for (type = `0`; type < nr_swapfiles; type++) {
1677	struct swap_info_struct *sis = swap_info[type];
1678
1679	if (!(sis->flags & SWP_WRITEOK))
1680	continue;
1681	*device = sis->bdev->bd_dev;
1682	spin_unlock(lock: &swap_lock);
1683	return type;
1684	}
1685	spin_unlock(lock: &swap_lock);
1686	return -ENODEV;
1687	}
1688
1689	/*
1690	* Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
1691	* corresponding to given index in swap_info (swap type).
1692	*/
1693	sector_t swapdev_block(int type, pgoff_t offset)
1694	{
1695	struct swap_info_struct *si = swap_type_to_swap_info(type);
1696	struct swap_extent *se;
1697
1698	if (!si \|\| !(si->flags & SWP_WRITEOK))
1699	return `0`;
1700	se = offset_to_swap_extent(sis: si, offset);
1701	return se->start_block + (offset - se->start_page);
1702	}
1703
1704	/*
1705	* Return either the total number of swap pages of given type, or the number
1706	* of free pages of that type (depending on @free)
1707	*
1708	* This is needed for software suspend
1709	*/
1710	unsigned int count_swap_pages(int type, int free)
1711	{
1712	unsigned int n = `0`;
1713
1714	spin_lock(lock: &swap_lock);
1715	if ((unsigned int)type < nr_swapfiles) {
1716	struct swap_info_struct *sis = swap_info[type];
1717
1718	spin_lock(lock: &sis->lock);
1719	if (sis->flags & SWP_WRITEOK) {
1720	n = sis->pages;
1721	if (free)
1722	n -= sis->inuse_pages;
1723	}
1724	spin_unlock(lock: &sis->lock);
1725	}
1726	spin_unlock(lock: &swap_lock);
1727	return n;
1728	}
1729	#endif /* CONFIG_HIBERNATION */
1730
1731	static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
1732	{
1733	return pte_same(a: pte_swp_clear_flags(pte), b: swp_pte);
1734	}
1735
1736	/*
1737	* No need to decide whether this PTE shares the swap entry with others,
1738	* just let do_wp_page work it out if a write is requested later - to
1739	* force COW, vm_page_prot omits write permission from any private vma.
1740	*/
1741	static int unuse_pte(struct vm_area_struct vma, pmd_t pmd,
1742	unsigned long addr, swp_entry_t entry, struct folio *folio)
1743	{
1744	struct page *page = folio_file_page(folio, index: swp_offset(entry));
1745	struct page *swapcache;
1746	spinlock_t *ptl;
1747	pte_t *pte, new_pte, old_pte;
1748	bool hwpoisoned = PageHWPoison(page);
1749	int ret = `1`;
1750
1751	swapcache = page;
1752	page = ksm_might_need_to_copy(page, vma, address: addr);
1753	if (unlikely(!page))
1754	return -ENOMEM;
1755	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
1756	hwpoisoned = true;
1757
1758	pte = pte_offset_map_lock(mm: vma->vm_mm, pmd, addr, ptlp: &ptl);
1759	if (unlikely(!pte \|\| !pte_same_as_swp(ptep_get(pte),
1760	swp_entry_to_pte(entry)))) {
1761	ret = `0`;
1762	goto out;
1763	}
1764
1765	old_pte = ptep_get(ptep: pte);
1766
1767	if (unlikely(hwpoisoned \|\| !PageUptodate(page))) {
1768	swp_entry_t swp_entry;
1769
1770	dec_mm_counter(mm: vma->vm_mm, member: MM_SWAPENTS);
1771	if (hwpoisoned) {
1772	swp_entry = make_hwpoison_entry(page: swapcache);
1773	page = swapcache;
1774	} else {
1775	swp_entry = make_poisoned_swp_entry();
1776	}
1777	new_pte = swp_entry_to_pte(entry: swp_entry);
1778	ret = `0`;
1779	goto setpte;
1780	}
1781
1782	/*
1783	* Some architectures may have to restore extra metadata to the page
1784	* when reading from swap. This metadata may be indexed by swap entry
1785	* so this must be called before swap_free().
1786	*/
1787	arch_swap_restore(entry, page_folio(page));
1788
1789	/ See do_swap_page() /
1790	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
1791	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
1792
1793	dec_mm_counter(mm: vma->vm_mm, member: MM_SWAPENTS);
1794	inc_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES);
1795	get_page(page);
1796	if (page == swapcache) {
1797	rmap_t rmap_flags = RMAP_NONE;
1798
1799	/*
1800	* See do_swap_page(): PageWriteback() would be problematic.
1801	* However, we do a wait_on_page_writeback() just before this
1802	* call and have the page locked.
1803	*/
1804	VM_BUG_ON_PAGE(PageWriteback(page), page);
1805	if (pte_swp_exclusive(pte: old_pte))
1806	rmap_flags \|= RMAP_EXCLUSIVE;
1807
1808	page_add_anon_rmap(page, vma, address: addr, flags: rmap_flags);
1809	} else { / ksm created a completely new copy /
1810	page_add_new_anon_rmap(page, vma, address: addr);
1811	lru_cache_add_inactive_or_unevictable(page, vma);
1812	}
1813	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
1814	if (pte_swp_soft_dirty(pte: old_pte))
1815	new_pte = pte_mksoft_dirty(pte: new_pte);
1816	if (pte_swp_uffd_wp(pte: old_pte))
1817	new_pte = pte_mkuffd_wp(pte: new_pte);
1818	setpte:
1819	set_pte_at(vma->vm_mm, addr, pte, new_pte);
1820	swap_free(entry);
1821	out:
1822	if (pte)
1823	pte_unmap_unlock(pte, ptl);
1824	if (page != swapcache) {
1825	unlock_page(page);
1826	put_page(page);
1827	}
1828	return ret;
1829	}
1830
1831	static int unuse_pte_range(struct vm_area_struct vma, pmd_t pmd,
1832	unsigned long addr, unsigned long end,
1833	unsigned int type)
1834	{
1835	pte_t *pte = NULL;
1836	struct swap_info_struct *si;
1837
1838	si = swap_info[type];
1839	do {
1840	struct folio *folio;
1841	unsigned long offset;
1842	unsigned char swp_count;
1843	swp_entry_t entry;
1844	int ret;
1845	pte_t ptent;
1846
1847	if (!pte++) {
1848	pte = pte_offset_map(pmd, addr);
1849	if (!pte)
1850	break;
1851	}
1852
1853	ptent = ptep_get_lockless(ptep: pte);
1854
1855	if (!is_swap_pte(pte: ptent))
1856	continue;
1857
1858	entry = pte_to_swp_entry(pte: ptent);
1859	if (swp_type(entry) != type)
1860	continue;
1861
1862	offset = swp_offset(entry);
1863	pte_unmap(pte);
1864	pte = NULL;
1865
1866	folio = swap_cache_get_folio(entry, vma, addr);
1867	if (!folio) {
1868	struct page *page;
1869	struct vm_fault vmf = {
1870	.vma = vma,
1871	.address = addr,
1872	.real_address = addr,
1873	.pmd = pmd,
1874	};
1875
1876	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
1877	vmf: &vmf);
1878	if (page)
1879	folio = page_folio(page);
1880	}
1881	if (!folio) {
1882	swp_count = READ_ONCE(si->swap_map[offset]);
1883	if (swp_count == `0` \|\| swp_count == SWAP_MAP_BAD)
1884	continue;
1885	return -ENOMEM;
1886	}
1887
1888	folio_lock(folio);
1889	folio_wait_writeback(folio);
1890	ret = unuse_pte(vma, pmd, addr, entry, folio);
1891	if (ret < `0`) {
1892	folio_unlock(folio);
1893	folio_put(folio);
1894	return ret;
1895	}
1896
1897	folio_free_swap(folio);
1898	folio_unlock(folio);
1899	folio_put(folio);
1900	} while (addr += PAGE_SIZE, addr != end);
1901
1902	if (pte)
1903	pte_unmap(pte);
1904	return `0`;
1905	}
1906
1907	static inline int unuse_pmd_range(struct vm_area_struct vma, pud_t pud,
1908	unsigned long addr, unsigned long end,
1909	unsigned int type)
1910	{
1911	pmd_t *pmd;
1912	unsigned long next;
1913	int ret;
1914
1915	pmd = pmd_offset(pud, address: addr);
1916	do {
1917	cond_resched();
1918	next = pmd_addr_end(addr, end);
1919	ret = unuse_pte_range(vma, pmd, addr, end: next, type);
1920	if (ret)
1921	return ret;
1922	} while (pmd++, addr = next, addr != end);
1923	return `0`;
1924	}
1925
1926	static inline int unuse_pud_range(struct vm_area_struct vma, p4d_t p4d,
1927	unsigned long addr, unsigned long end,
1928	unsigned int type)
1929	{
1930	pud_t *pud;
1931	unsigned long next;
1932	int ret;
1933
1934	pud = pud_offset(p4d, address: addr);
1935	do {
1936	next = pud_addr_end(addr, end);
1937	if (pud_none_or_clear_bad(pud))
1938	continue;
1939	ret = unuse_pmd_range(vma, pud, addr, end: next, type);
1940	if (ret)
1941	return ret;
1942	} while (pud++, addr = next, addr != end);
1943	return `0`;
1944	}
1945
1946	static inline int unuse_p4d_range(struct vm_area_struct vma, pgd_t pgd,
1947	unsigned long addr, unsigned long end,
1948	unsigned int type)
1949	{
1950	p4d_t *p4d;
1951	unsigned long next;
1952	int ret;
1953
1954	p4d = p4d_offset(pgd, address: addr);
1955	do {
1956	next = p4d_addr_end(addr, end);
1957	if (p4d_none_or_clear_bad(p4d))
1958	continue;
1959	ret = unuse_pud_range(vma, p4d, addr, end: next, type);
1960	if (ret)
1961	return ret;
1962	} while (p4d++, addr = next, addr != end);
1963	return `0`;
1964	}
1965
1966	static int unuse_vma(struct vm_area_struct vma, unsigned* int type)
1967	{
1968	pgd_t *pgd;
1969	unsigned long addr, end, next;
1970	int ret;
1971
1972	addr = vma->vm_start;
1973	end = vma->vm_end;
1974
1975	pgd = pgd_offset(vma->vm_mm, addr);
1976	do {
1977	next = pgd_addr_end(addr, end);
1978	if (pgd_none_or_clear_bad(pgd))
1979	continue;
1980	ret = unuse_p4d_range(vma, pgd, addr, end: next, type);
1981	if (ret)
1982	return ret;
1983	} while (pgd++, addr = next, addr != end);
1984	return `0`;
1985	}
1986
1987	static int unuse_mm(struct mm_struct mm, unsigned* int type)
1988	{
1989	struct vm_area_struct *vma;
1990	int ret = `0`;
1991	VMA_ITERATOR(vmi, mm, `0`);
1992
1993	mmap_read_lock(mm);
1994	for_each_vma(vmi, vma) {
1995	if (vma->anon_vma) {
1996	ret = unuse_vma(vma, type);
1997	if (ret)
1998	break;
1999	}
2000
2001	cond_resched();
2002	}
2003	mmap_read_unlock(mm);
2004	return ret;
2005	}
2006
2007	/*
2008	* Scan swap_map from current position to next entry still in use.
2009	* Return 0 if there are no inuse entries after prev till end of
2010	* the map.
2011	*/
2012	static unsigned int find_next_to_unuse(struct swap_info_struct *si,
2013	unsigned int prev)
2014	{
2015	unsigned int i;
2016	unsigned char count;
2017
2018	/*
2019	* No need for swap_lock here: we're just looking
2020	* for whether an entry is in use, not modifying it; false
2021	* hits are okay, and sys_swapoff() has already prevented new
2022	* allocations from this area (while holding swap_lock).
2023	*/
2024	for (i = prev + `1`; i < si->max; i++) {
2025	count = READ_ONCE(si->swap_map[i]);
2026	if (count && swap_count(ent: count) != SWAP_MAP_BAD)
2027	break;
2028	if ((i % LATENCY_LIMIT) == `0`)
2029	cond_resched();
2030	}
2031
2032	if (i == si->max)
2033	i = `0`;
2034
2035	return i;
2036	}
2037
2038	static int try_to_unuse(unsigned int type)
2039	{
2040	struct mm_struct *prev_mm;
2041	struct mm_struct *mm;
2042	struct list_head *p;
2043	int retval = `0`;
2044	struct swap_info_struct *si = swap_info[type];
2045	struct folio *folio;
2046	swp_entry_t entry;
2047	unsigned int i;
2048
2049	if (!READ_ONCE(si->inuse_pages))
2050	return `0`;
2051
2052	retry:
2053	retval = shmem_unuse(type);
2054	if (retval)
2055	return retval;
2056
2057	prev_mm = &init_mm;
2058	mmget(mm: prev_mm);
2059
2060	spin_lock(lock: &mmlist_lock);
2061	p = &init_mm.mmlist;
2062	while (READ_ONCE(si->inuse_pages) &&
2063	!signal_pending(current) &&
2064	(p = p->next) != &init_mm.mmlist) {
2065
2066	mm = list_entry(p, struct mm_struct, mmlist);
2067	if (!mmget_not_zero(mm))
2068	continue;
2069	spin_unlock(lock: &mmlist_lock);
2070	mmput(prev_mm);
2071	prev_mm = mm;
2072	retval = unuse_mm(mm, type);
2073	if (retval) {
2074	mmput(prev_mm);
2075	return retval;
2076	}
2077
2078	/*
2079	* Make sure that we aren't completely killing
2080	* interactive performance.
2081	*/
2082	cond_resched();
2083	spin_lock(lock: &mmlist_lock);
2084	}
2085	spin_unlock(lock: &mmlist_lock);
2086
2087	mmput(prev_mm);
2088
2089	i = `0`;
2090	while (READ_ONCE(si->inuse_pages) &&
2091	!signal_pending(current) &&
2092	(i = find_next_to_unuse(si, prev: i)) != `0`) {
2093
2094	entry = swp_entry(type, offset: i);
2095	folio = filemap_get_folio(swap_address_space(entry), index: i);
2096	if (IS_ERR(ptr: folio))
2097	continue;
2098
2099	/*
2100	* It is conceivable that a racing task removed this folio from
2101	* swap cache just before we acquired the page lock. The folio
2102	* might even be back in swap cache on another swap area. But
2103	* that is okay, folio_free_swap() only removes stale folios.
2104	*/
2105	folio_lock(folio);
2106	folio_wait_writeback(folio);
2107	folio_free_swap(folio);
2108	folio_unlock(folio);
2109	folio_put(folio);
2110	}
2111
2112	/*
2113	* Lets check again to see if there are still swap entries in the map.
2114	* If yes, we would need to do retry the unuse logic again.
2115	* Under global memory pressure, swap entries can be reinserted back
2116	* into process space after the mmlist loop above passes over them.
2117	*
2118	* Limit the number of retries? No: when mmget_not_zero()
2119	* above fails, that mm is likely to be freeing swap from
2120	* exit_mmap(), which proceeds at its own independent pace;
2121	* and even shmem_writepage() could have been preempted after
2122	* folio_alloc_swap(), temporarily hiding that swap. It's easy
2123	* and robust (though cpu-intensive) just to keep retrying.
2124	*/
2125	if (READ_ONCE(si->inuse_pages)) {
2126	if (!signal_pending(current))
2127	goto retry;
2128	return -EINTR;
2129	}
2130
2131	return `0`;
2132	}
2133
2134	/*
2135	* After a successful try_to_unuse, if no swap is now in use, we know
2136	* we can empty the mmlist. swap_lock must be held on entry and exit.
2137	* Note that mmlist_lock nests inside swap_lock, and an mm must be
2138	* added to the mmlist just after page_duplicate - before would be racy.
2139	*/
2140	static void drain_mmlist(void)
2141	{
2142	struct list_head p, next;
2143	unsigned int type;
2144
2145	for (type = `0`; type < nr_swapfiles; type++)
2146	if (swap_info[type]->inuse_pages)
2147	return;
2148	spin_lock(lock: &mmlist_lock);
2149	list_for_each_safe(p, next, &init_mm.mmlist)
2150	list_del_init(entry: p);
2151	spin_unlock(lock: &mmlist_lock);
2152	}
2153
2154	/*
2155	* Free all of a swapdev's extent information
2156	*/
2157	static void destroy_swap_extents(struct swap_info_struct *sis)
2158	{
2159	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
2160	struct rb_node *rb = sis->swap_extent_root.rb_node;
2161	struct swap_extent se = rb_entry(rb, struct* swap_extent, rb_node);
2162
2163	rb_erase(rb, &sis->swap_extent_root);
2164	kfree(objp: se);
2165	}
2166
2167	if (sis->flags & SWP_ACTIVATED) {
2168	struct file *swap_file = sis->swap_file;
2169	struct address_space *mapping = swap_file->f_mapping;
2170
2171	sis->flags &= ~SWP_ACTIVATED;
2172	if (mapping->a_ops->swap_deactivate)
2173	mapping->a_ops->swap_deactivate(swap_file);
2174	}
2175	}
2176
2177	/*
2178	* Add a block range (and the corresponding page range) into this swapdev's
2179	* extent tree.
2180	*
2181	* This function rather assumes that it is called in ascending page order.
2182	*/
2183	int
2184	add_swap_extent(struct swap_info_struct sis, unsigned* long start_page,
2185	unsigned long nr_pages, sector_t start_block)
2186	{
2187	struct rb_node *link = &sis->swap_extent_root.rb_node, parent = NULL;
2188	struct swap_extent *se;
2189	struct swap_extent *new_se;
2190
2191	/*
2192	* place the new node at the right most since the
2193	* function is called in ascending page order.
2194	*/
2195	while (*link) {
2196	parent = *link;
2197	link = &parent->rb_right;
2198	}
2199
2200	if (parent) {
2201	se = rb_entry(parent, struct swap_extent, rb_node);
2202	BUG_ON(se->start_page + se->nr_pages != start_page);
2203	if (se->start_block + se->nr_pages == start_block) {
2204	/ Merge it /
2205	se->nr_pages += nr_pages;
2206	return `0`;
2207	}
2208	}
2209
2210	/ No merge, insert a new extent. /
2211	new_se = kmalloc(size: sizeof(*se), GFP_KERNEL);
2212	if (new_se == NULL)
2213	return -ENOMEM;
2214	new_se->start_page = start_page;
2215	new_se->nr_pages = nr_pages;
2216	new_se->start_block = start_block;
2217
2218	rb_link_node(node: &new_se->rb_node, parent, rb_link: link);
2219	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
2220	return `1`;
2221	}
2222	EXPORT_SYMBOL_GPL(add_swap_extent);
2223
2224	/*
2225	* A `swap extent' is a simple thing which maps a contiguous range of pages
2226	* onto a contiguous range of disk blocks. A rbtree of swap extents is
2227	* built at swapon time and is then used at swap_writepage/swap_readpage
2228	* time for locating where on disk a page belongs.
2229	*
2230	* If the swapfile is an S_ISBLK block device, a single extent is installed.
2231	* This is done so that the main operating code can treat S_ISBLK and S_ISREG
2232	* swap files identically.
2233	*
2234	* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
2235	* extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
2236	* swapfiles are handled identically after swapon time.
2237	*
2238	* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
2239	* and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray
2240	* blocks are found which do not fall within the PAGE_SIZE alignment
2241	* requirements, they are simply tossed out - we will never use those blocks
2242	* for swapping.
2243	*
2244	* For all swap devices we set S_SWAPFILE across the life of the swapon. This
2245	* prevents users from writing to the swap device, which will corrupt memory.
2246	*
2247	* The amount of disk space which a single swap extent represents varies.
2248	* Typically it is in the 1-4 megabyte range. So we can have hundreds of
2249	* extents in the rbtree. - akpm.
2250	*/
2251	static int setup_swap_extents(struct swap_info_struct sis, sector_t span)
2252	{
2253	struct file *swap_file = sis->swap_file;
2254	struct address_space *mapping = swap_file->f_mapping;
2255	struct inode *inode = mapping->host;
2256	int ret;
2257
2258	if (S_ISBLK(inode->i_mode)) {
2259	ret = add_swap_extent(sis, `0`, sis->max, `0`);
2260	*span = sis->pages;
2261	return ret;
2262	}
2263
2264	if (mapping->a_ops->swap_activate) {
2265	ret = mapping->a_ops->swap_activate(sis, swap_file, span);
2266	if (ret < `0`)
2267	return ret;
2268	sis->flags \|= SWP_ACTIVATED;
2269	if ((sis->flags & SWP_FS_OPS) &&
2270	sio_pool_init() != `0`) {
2271	destroy_swap_extents(sis);
2272	return -ENOMEM;
2273	}
2274	return ret;
2275	}
2276
2277	return generic_swapfile_activate(sis, swap_file, span);
2278	}
2279
2280	static int swap_node(struct swap_info_struct *p)
2281	{
2282	struct block_device *bdev;
2283
2284	if (p->bdev)
2285	bdev = p->bdev;
2286	else
2287	bdev = p->swap_file->f_inode->i_sb->s_bdev;
2288
2289	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
2290	}
2291
2292	static void setup_swap_info(struct swap_info_struct p, int* prio,
2293	unsigned char *swap_map,
2294	struct swap_cluster_info *cluster_info)
2295	{
2296	int i;
2297
2298	if (prio >= `0`)
2299	p->prio = prio;
2300	else
2301	p->prio = --least_priority;
2302	/*
2303	* the plist prio is negated because plist ordering is
2304	* low-to-high, while swap ordering is high-to-low
2305	*/
2306	p->list.prio = -p->prio;
2307	for_each_node(i) {
2308	if (p->prio >= `0`)
2309	p->avail_lists[i].prio = -p->prio;
2310	else {
2311	if (swap_node(p) == i)
2312	p->avail_lists[i].prio = `1`;
2313	else
2314	p->avail_lists[i].prio = -p->prio;
2315	}
2316	}
2317	p->swap_map = swap_map;
2318	p->cluster_info = cluster_info;
2319	}
2320
2321	static void _enable_swap_info(struct swap_info_struct *p)
2322	{
2323	p->flags \|= SWP_WRITEOK;
2324	atomic_long_add(i: p->pages, v: &nr_swap_pages);
2325	total_swap_pages += p->pages;
2326
2327	assert_spin_locked(&swap_lock);
2328	/*
2329	* both lists are plists, and thus priority ordered.
2330	* swap_active_head needs to be priority ordered for swapoff(),
2331	* which on removal of any swap_info_struct with an auto-assigned
2332	* (i.e. negative) priority increments the auto-assigned priority
2333	* of any lower-priority swap_info_structs.
2334	* swap_avail_head needs to be priority ordered for folio_alloc_swap(),
2335	* which allocates swap pages from the highest available priority
2336	* swap_info_struct.
2337	*/
2338	plist_add(node: &p->list, head: &swap_active_head);
2339
2340	/ add to available list iff swap device is not full /
2341	if (p->highest_bit)
2342	add_to_avail_list(p);
2343	}
2344
2345	static void enable_swap_info(struct swap_info_struct p, int* prio,
2346	unsigned char *swap_map,
2347	struct swap_cluster_info *cluster_info)
2348	{
2349	zswap_swapon(type: p->type);
2350
2351	spin_lock(lock: &swap_lock);
2352	spin_lock(lock: &p->lock);
2353	setup_swap_info(p, prio, swap_map, cluster_info);
2354	spin_unlock(lock: &p->lock);
2355	spin_unlock(lock: &swap_lock);
2356	/*
2357	* Finished initializing swap device, now it's safe to reference it.
2358	*/
2359	percpu_ref_resurrect(ref: &p->users);
2360	spin_lock(lock: &swap_lock);
2361	spin_lock(lock: &p->lock);
2362	_enable_swap_info(p);
2363	spin_unlock(lock: &p->lock);
2364	spin_unlock(lock: &swap_lock);
2365	}
2366
2367	static void reinsert_swap_info(struct swap_info_struct *p)
2368	{
2369	spin_lock(lock: &swap_lock);
2370	spin_lock(lock: &p->lock);
2371	setup_swap_info(p, prio: p->prio, swap_map: p->swap_map, cluster_info: p->cluster_info);
2372	_enable_swap_info(p);
2373	spin_unlock(lock: &p->lock);
2374	spin_unlock(lock: &swap_lock);
2375	}
2376
2377	bool has_usable_swap(void)
2378	{
2379	bool ret = true;
2380
2381	spin_lock(lock: &swap_lock);
2382	if (plist_head_empty(head: &swap_active_head))
2383	ret = false;
2384	spin_unlock(lock: &swap_lock);
2385	return ret;
2386	}
2387
2388	SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
2389	{
2390	struct swap_info_struct *p = NULL;
2391	unsigned char *swap_map;
2392	struct swap_cluster_info *cluster_info;
2393	struct file swap_file, victim;
2394	struct address_space *mapping;
2395	struct inode *inode;
2396	struct filename *pathname;
2397	int err, found = `0`;
2398	unsigned int old_block_size;
2399
2400	if (!capable(CAP_SYS_ADMIN))
2401	return -EPERM;
2402
2403	BUG_ON(!current->mm);
2404
2405	pathname = getname(specialfile);
2406	if (IS_ERR(ptr: pathname))
2407	return PTR_ERR(ptr: pathname);
2408
2409	victim = file_open_name(pathname, O_RDWR\|O_LARGEFILE, `0`);
2410	err = PTR_ERR(ptr: victim);
2411	if (IS_ERR(ptr: victim))
2412	goto out;
2413
2414	mapping = victim->f_mapping;
2415	spin_lock(lock: &swap_lock);
2416	plist_for_each_entry(p, &swap_active_head, list) {
2417	if (p->flags & SWP_WRITEOK) {
2418	if (p->swap_file->f_mapping == mapping) {
2419	found = `1`;
2420	break;
2421	}
2422	}
2423	}
2424	if (!found) {
2425	err = -EINVAL;
2426	spin_unlock(lock: &swap_lock);
2427	goto out_dput;
2428	}
2429	if (!security_vm_enough_memory_mm(current->mm, pages: p->pages))
2430	vm_unacct_memory(pages: p->pages);
2431	else {
2432	err = -ENOMEM;
2433	spin_unlock(lock: &swap_lock);
2434	goto out_dput;
2435	}
2436	spin_lock(lock: &p->lock);
2437	del_from_avail_list(p);
2438	if (p->prio < `0`) {
2439	struct swap_info_struct *si = p;
2440	int nid;
2441
2442	plist_for_each_entry_continue(si, &swap_active_head, list) {
2443	si->prio++;
2444	si->list.prio--;
2445	for_each_node(nid) {
2446	if (si->avail_lists[nid].prio != `1`)
2447	si->avail_lists[nid].prio--;
2448	}
2449	}
2450	least_priority++;
2451	}
2452	plist_del(node: &p->list, head: &swap_active_head);
2453	atomic_long_sub(i: p->pages, v: &nr_swap_pages);
2454	total_swap_pages -= p->pages;
2455	p->flags &= ~SWP_WRITEOK;
2456	spin_unlock(lock: &p->lock);
2457	spin_unlock(lock: &swap_lock);
2458
2459	disable_swap_slots_cache_lock();
2460
2461	set_current_oom_origin();
2462	err = try_to_unuse(type: p->type);
2463	clear_current_oom_origin();
2464
2465	if (err) {
2466	/ re-insert swap space back into swap_list /
2467	reinsert_swap_info(p);
2468	reenable_swap_slots_cache_unlock();
2469	goto out_dput;
2470	}
2471
2472	reenable_swap_slots_cache_unlock();
2473
2474	/*
2475	* Wait for swap operations protected by get/put_swap_device()
2476	* to complete.
2477	*
2478	* We need synchronize_rcu() here to protect the accessing to
2479	* the swap cache data structure.
2480	*/
2481	percpu_ref_kill(ref: &p->users);
2482	synchronize_rcu();
2483	wait_for_completion(&p->comp);
2484
2485	flush_work(work: &p->discard_work);
2486
2487	destroy_swap_extents(sis: p);
2488	if (p->flags & SWP_CONTINUED)
2489	free_swap_count_continuations(p);
2490
2491	if (!p->bdev \|\| !bdev_nonrot(bdev: p->bdev))
2492	atomic_dec(v: &nr_rotate_swap);
2493
2494	mutex_lock(&swapon_mutex);
2495	spin_lock(lock: &swap_lock);
2496	spin_lock(lock: &p->lock);
2497	drain_mmlist();
2498
2499	/ wait for anyone still in scan_swap_map_slots /
2500	p->highest_bit = `0`; / cuts scans short /
2501	while (p->flags >= SWP_SCANNING) {
2502	spin_unlock(lock: &p->lock);
2503	spin_unlock(lock: &swap_lock);
2504	schedule_timeout_uninterruptible(timeout: `1`);
2505	spin_lock(lock: &swap_lock);
2506	spin_lock(lock: &p->lock);
2507	}
2508
2509	swap_file = p->swap_file;
2510	old_block_size = p->old_block_size;
2511	p->swap_file = NULL;
2512	p->max = `0`;
2513	swap_map = p->swap_map;
2514	p->swap_map = NULL;
2515	cluster_info = p->cluster_info;
2516	p->cluster_info = NULL;
2517	spin_unlock(lock: &p->lock);
2518	spin_unlock(lock: &swap_lock);
2519	arch_swap_invalidate_area(type: p->type);
2520	zswap_swapoff(type: p->type);
2521	mutex_unlock(lock: &swapon_mutex);
2522	free_percpu(pdata: p->percpu_cluster);
2523	p->percpu_cluster = NULL;
2524	free_percpu(pdata: p->cluster_next_cpu);
2525	p->cluster_next_cpu = NULL;
2526	vfree(addr: swap_map);
2527	kvfree(addr: cluster_info);
2528	/ Destroy swap account information /
2529	swap_cgroup_swapoff(type: p->type);
2530	exit_swap_address_space(type: p->type);
2531
2532	inode = mapping->host;
2533	if (p->bdev_handle) {
2534	set_blocksize(bdev: p->bdev, size: old_block_size);
2535	bdev_release(handle: p->bdev_handle);
2536	p->bdev_handle = NULL;
2537	}
2538
2539	inode_lock(inode);
2540	inode->i_flags &= ~S_SWAPFILE;
2541	inode_unlock(inode);
2542	filp_close(swap_file, NULL);
2543
2544	/*
2545	* Clear the SWP_USED flag after all resources are freed so that swapon
2546	* can reuse this swap_info in alloc_swap_info() safely. It is ok to
2547	* not hold p->lock after we cleared its SWP_WRITEOK.
2548	*/
2549	spin_lock(lock: &swap_lock);
2550	p->flags = `0`;
2551	spin_unlock(lock: &swap_lock);
2552
2553	err = `0`;
2554	atomic_inc(v: &proc_poll_event);
2555	wake_up_interruptible(&proc_poll_wait);
2556
2557	out_dput:
2558	filp_close(victim, NULL);
2559	out:
2560	putname(name: pathname);
2561	return err;
2562	}
2563
2564	#ifdef CONFIG_PROC_FS
2565	static __poll_t swaps_poll(struct file file, poll_table wait)
2566	{
2567	struct seq_file *seq = file->private_data;
2568
2569	poll_wait(filp: file, wait_address: &proc_poll_wait, p: wait);
2570
2571	if (seq->poll_event != atomic_read(v: &proc_poll_event)) {
2572	seq->poll_event = atomic_read(v: &proc_poll_event);
2573	return EPOLLIN \| EPOLLRDNORM \| EPOLLERR \| EPOLLPRI;
2574	}
2575
2576	return EPOLLIN \| EPOLLRDNORM;
2577	}
2578
2579	/ iterator /
2580	static void swap_start(struct* seq_file swap, loff_t pos)
2581	{
2582	struct swap_info_struct *si;
2583	int type;
2584	loff_t l = *pos;
2585
2586	mutex_lock(&swapon_mutex);
2587
2588	if (!l)
2589	return SEQ_START_TOKEN;
2590
2591	for (type = `0`; (si = swap_type_to_swap_info(type)); type++) {
2592	if (!(si->flags & SWP_USED) \|\| !si->swap_map)
2593	continue;
2594	if (!--l)
2595	return si;
2596	}
2597
2598	return NULL;
2599	}
2600
2601	static void swap_next(struct* seq_file swap, void* v, loff_t pos)
2602	{
2603	struct swap_info_struct *si = v;
2604	int type;
2605
2606	if (v == SEQ_START_TOKEN)
2607	type = `0`;
2608	else
2609	type = si->type + `1`;
2610
2611	++(*pos);
2612	for (; (si = swap_type_to_swap_info(type)); type++) {
2613	if (!(si->flags & SWP_USED) \|\| !si->swap_map)
2614	continue;
2615	return si;
2616	}
2617
2618	return NULL;
2619	}
2620
2621	static void swap_stop(struct seq_file swap, void* *v)
2622	{
2623	mutex_unlock(lock: &swapon_mutex);
2624	}
2625
2626	static int swap_show(struct seq_file swap, void* *v)
2627	{
2628	struct swap_info_struct *si = v;
2629	struct file *file;
2630	int len;
2631	unsigned long bytes, inuse;
2632
2633	if (si == SEQ_START_TOKEN) {
2634	seq_puts(m: swap, s: "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
2635	return `0`;
2636	}
2637
2638	bytes = K(si->pages);
2639	inuse = K(READ_ONCE(si->inuse_pages));
2640
2641	file = si->swap_file;
2642	len = seq_file_path(swap, file, " \t\n\\");
2643	seq_printf(m: swap, fmt: "%*s%s\t%lu\t%s%lu\t%s%d\n",
2644	len < `40` ? `40` - len : `1`, " ",
2645	S_ISBLK(file_inode(file)->i_mode) ?
2646	"partition" : "file\t",
2647	bytes, bytes < `10000000` ? "\t" : "",
2648	inuse, inuse < `10000000` ? "\t" : "",
2649	si->prio);
2650	return `0`;
2651	}
2652
2653	static const struct seq_operations swaps_op = {
2654	.start = swap_start,
2655	.next = swap_next,
2656	.stop = swap_stop,
2657	.show = swap_show
2658	};
2659
2660	static int swaps_open(struct inode inode, struct* file *file)
2661	{
2662	struct seq_file *seq;
2663	int ret;
2664
2665	ret = seq_open(file, &swaps_op);
2666	if (ret)
2667	return ret;
2668
2669	seq = file->private_data;
2670	seq->poll_event = atomic_read(v: &proc_poll_event);
2671	return `0`;
2672	}
2673
2674	static const struct proc_ops swaps_proc_ops = {
2675	.proc_flags = PROC_ENTRY_PERMANENT,
2676	.proc_open = swaps_open,
2677	.proc_read = seq_read,
2678	.proc_lseek = seq_lseek,
2679	.proc_release = seq_release,
2680	.proc_poll = swaps_poll,
2681	};
2682
2683	static int __init procswaps_init(void)
2684	{
2685	proc_create(name: "swaps", mode: `0`, NULL, proc_ops: &swaps_proc_ops);
2686	return `0`;
2687	}
2688	__initcall(procswaps_init);
2689	#endif /* CONFIG_PROC_FS */
2690
#ifdef MAX_SWAPFILES_CHECK
/* Run MAX_SWAPFILES_CHECK() once as a late initcall; always returns 0. */
static int __init max_swapfiles_check(void)
{
	MAX_SWAPFILES_CHECK();
	return 0;
}
late_initcall(max_swapfiles_check);
#endif
2699
2700	static struct swap_info_struct alloc_swap_info(void*)
2701	{
2702	struct swap_info_struct *p;
2703	struct swap_info_struct *defer = NULL;
2704	unsigned int type;
2705	int i;
2706
2707	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
2708	if (!p)
2709	return ERR_PTR(error: -ENOMEM);
2710
2711	if (percpu_ref_init(ref: &p->users, release: swap_users_ref_free,
2712	flags: PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
2713	kvfree(addr: p);
2714	return ERR_PTR(error: -ENOMEM);
2715	}
2716
2717	spin_lock(lock: &swap_lock);
2718	for (type = `0`; type < nr_swapfiles; type++) {
2719	if (!(swap_info[type]->flags & SWP_USED))
2720	break;
2721	}
2722	if (type >= MAX_SWAPFILES) {
2723	spin_unlock(lock: &swap_lock);
2724	percpu_ref_exit(ref: &p->users);
2725	kvfree(addr: p);
2726	return ERR_PTR(error: -EPERM);
2727	}
2728	if (type >= nr_swapfiles) {
2729	p->type = type;
2730	/*
2731	* Publish the swap_info_struct after initializing it.
2732	* Note that kvzalloc() above zeroes all its fields.
2733	*/
2734	smp_store_release(&swap_info[type], p); / rcu_assign_pointer() /
2735	nr_swapfiles++;
2736	} else {
2737	defer = p;
2738	p = swap_info[type];
2739	/*
2740	* Do not memset this entry: a racing procfs swap_next()
2741	* would be relying on p->type to remain valid.
2742	*/
2743	}
2744	p->swap_extent_root = RB_ROOT;
2745	plist_node_init(node: &p->list, prio: `0`);
2746	for_each_node(i)
2747	plist_node_init(node: &p->avail_lists[i], prio: `0`);
2748	p->flags = SWP_USED;
2749	spin_unlock(lock: &swap_lock);
2750	if (defer) {
2751	percpu_ref_exit(ref: &defer->users);
2752	kvfree(addr: defer);
2753	}
2754	spin_lock_init(&p->lock);
2755	spin_lock_init(&p->cont_lock);
2756	init_completion(x: &p->comp);
2757
2758	return p;
2759	}
2760
2761	static int claim_swapfile(struct swap_info_struct p, struct* inode *inode)
2762	{
2763	int error;
2764
2765	if (S_ISBLK(inode->i_mode)) {
2766	p->bdev_handle = bdev_open_by_dev(dev: inode->i_rdev,
2767	BLK_OPEN_READ \| BLK_OPEN_WRITE, holder: p, NULL);
2768	if (IS_ERR(ptr: p->bdev_handle)) {
2769	error = PTR_ERR(ptr: p->bdev_handle);
2770	p->bdev_handle = NULL;
2771	return error;
2772	}
2773	p->bdev = p->bdev_handle->bdev;
2774	p->old_block_size = block_size(bdev: p->bdev);
2775	error = set_blocksize(bdev: p->bdev, PAGE_SIZE);
2776	if (error < `0`)
2777	return error;
2778	/*
2779	* Zoned block devices contain zones that have a sequential
2780	* write only restriction. Hence zoned block devices are not
2781	* suitable for swapping. Disallow them here.
2782	*/
2783	if (bdev_is_zoned(bdev: p->bdev))
2784	return -EINVAL;
2785	p->flags \|= SWP_BLKDEV;
2786	} else if (S_ISREG(inode->i_mode)) {
2787	p->bdev = inode->i_sb->s_bdev;
2788	}
2789
2790	return `0`;
2791	}
2792
2793
/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
	return swp_offset(pte_to_swp_entry(
			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}
2815
2816	/ Can be overridden by an architecture for additional checks. /
2817	__weak unsigned long arch_max_swapfile_size(void)
2818	{
2819	return generic_max_swapfile_size();
2820	}
2821
2822	static unsigned long read_swap_header(struct swap_info_struct *p,
2823	union swap_header *swap_header,
2824	struct inode *inode)
2825	{
2826	int i;
2827	unsigned long maxpages;
2828	unsigned long swapfilepages;
2829	unsigned long last_page;
2830
2831	if (memcmp(p: "SWAPSPACE2", q: swap_header->magic.magic, size: `10`)) {
2832	pr_err("Unable to find swap-space signature\n");
2833	return `0`;
2834	}
2835
2836	/ swap partition endianness hack... /
2837	if (swab32(swap_header->info.version) == `1`) {
2838	swab32s(p: &swap_header->info.version);
2839	swab32s(p: &swap_header->info.last_page);
2840	swab32s(p: &swap_header->info.nr_badpages);
2841	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2842	return `0`;
2843	for (i = `0`; i < swap_header->info.nr_badpages; i++)
2844	swab32s(p: &swap_header->info.badpages[i]);
2845	}
2846	/ Check the swap header's sub-version /
2847	if (swap_header->info.version != `1`) {
2848	pr_warn("Unable to handle swap header version %d\n",
2849	swap_header->info.version);
2850	return `0`;
2851	}
2852
2853	p->lowest_bit = `1`;
2854	p->cluster_next = `1`;
2855	p->cluster_nr = `0`;
2856
2857	maxpages = swapfile_maximum_size;
2858	last_page = swap_header->info.last_page;
2859	if (!last_page) {
2860	pr_warn("Empty swap-file\n");
2861	return `0`;
2862	}
2863	if (last_page > maxpages) {
2864	pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2865	K(maxpages), K(last_page));
2866	}
2867	if (maxpages > last_page) {
2868	maxpages = last_page + `1`;
2869	/ p->max is an unsigned int: don't overflow it /
2870	if ((unsigned int)maxpages == `0`)
2871	maxpages = UINT_MAX;
2872	}
2873	p->highest_bit = maxpages - `1`;
2874
2875	if (!maxpages)
2876	return `0`;
2877	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2878	if (swapfilepages && maxpages > swapfilepages) {
2879	pr_warn("Swap area shorter than signature indicates\n");
2880	return `0`;
2881	}
2882	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2883	return `0`;
2884	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2885	return `0`;
2886
2887	return maxpages;
2888	}
2889
/*
 * Column count used by setup_swap_map_and_extents() when it interleaves
 * free clusters: the larger of the number of swap_cluster_info entries per
 * L1 cache line and the number of clusters per swap address space.
 */
#define SWAP_CLUSTER_INFO_COLS						\
	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
#define SWAP_CLUSTER_SPACE_COLS						\
	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
#define SWAP_CLUSTER_COLS						\
	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
2896
2897	static int setup_swap_map_and_extents(struct swap_info_struct *p,
2898	union swap_header *swap_header,
2899	unsigned char *swap_map,
2900	struct swap_cluster_info *cluster_info,
2901	unsigned long maxpages,
2902	sector_t *span)
2903	{
2904	unsigned int j, k;
2905	unsigned int nr_good_pages;
2906	int nr_extents;
2907	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2908	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
2909	unsigned long i, idx;
2910
2911	nr_good_pages = maxpages - `1`; / omit header page /
2912
2913	cluster_list_init(list: &p->free_clusters);
2914	cluster_list_init(list: &p->discard_clusters);
2915
2916	for (i = `0`; i < swap_header->info.nr_badpages; i++) {
2917	unsigned int page_nr = swap_header->info.badpages[i];
2918	if (page_nr == `0` \|\| page_nr > swap_header->info.last_page)
2919	return -EINVAL;
2920	if (page_nr < maxpages) {
2921	swap_map[page_nr] = SWAP_MAP_BAD;
2922	nr_good_pages--;
2923	/*
2924	* Haven't marked the cluster free yet, no list
2925	* operation involved
2926	*/
2927	inc_cluster_info_page(p, cluster_info, page_nr);
2928	}
2929	}
2930
2931	/ Haven't marked the cluster free yet, no list operation involved /
2932	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2933	inc_cluster_info_page(p, cluster_info, page_nr: i);
2934
2935	if (nr_good_pages) {
2936	swap_map[`0`] = SWAP_MAP_BAD;
2937	/*
2938	* Not mark the cluster free yet, no list
2939	* operation involved
2940	*/
2941	inc_cluster_info_page(p, cluster_info, page_nr: `0`);
2942	p->max = maxpages;
2943	p->pages = nr_good_pages;
2944	nr_extents = setup_swap_extents(sis: p, span);
2945	if (nr_extents < `0`)
2946	return nr_extents;
2947	nr_good_pages = p->pages;
2948	}
2949	if (!nr_good_pages) {
2950	pr_warn("Empty swap-file\n");
2951	return -EINVAL;
2952	}
2953
2954	if (!cluster_info)
2955	return nr_extents;
2956
2957
2958	/*
2959	* Reduce false cache line sharing between cluster_info and
2960	* sharing same address space.
2961	*/
2962	for (k = `0`; k < SWAP_CLUSTER_COLS; k++) {
2963	j = (k + col) % SWAP_CLUSTER_COLS;
2964	for (i = `0`; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
2965	idx = i * SWAP_CLUSTER_COLS + j;
2966	if (idx >= nr_clusters)
2967	continue;
2968	if (cluster_count(info: &cluster_info[idx]))
2969	continue;
2970	cluster_set_flag(info: &cluster_info[idx], CLUSTER_FLAG_FREE);
2971	cluster_list_add_tail(list: &p->free_clusters, ci: cluster_info,
2972	idx);
2973	}
2974	}
2975	return nr_extents;
2976	}
2977
2978	SYSCALL_DEFINE2(swapon, const char __user , specialfile, int*, swap_flags)
2979	{
2980	struct swap_info_struct *p;
2981	struct filename *name;
2982	struct file *swap_file = NULL;
2983	struct address_space *mapping;
2984	struct dentry *dentry;
2985	int prio;
2986	int error;
2987	union swap_header *swap_header;
2988	int nr_extents;
2989	sector_t span;
2990	unsigned long maxpages;
2991	unsigned char *swap_map = NULL;
2992	struct swap_cluster_info *cluster_info = NULL;
2993	struct page *page = NULL;
2994	struct inode *inode = NULL;
2995	bool inced_nr_rotate_swap = false;
2996
2997	if (swap_flags & ~SWAP_FLAGS_VALID)
2998	return -EINVAL;
2999
3000	if (!capable(CAP_SYS_ADMIN))
3001	return -EPERM;
3002
3003	if (!swap_avail_heads)
3004	return -ENOMEM;
3005
3006	p = alloc_swap_info();
3007	if (IS_ERR(ptr: p))
3008	return PTR_ERR(ptr: p);
3009
3010	INIT_WORK(&p->discard_work, swap_discard_work);
3011
3012	name = getname(specialfile);
3013	if (IS_ERR(ptr: name)) {
3014	error = PTR_ERR(ptr: name);
3015	name = NULL;
3016	goto bad_swap;
3017	}
3018	swap_file = file_open_name(name, O_RDWR\|O_LARGEFILE, `0`);
3019	if (IS_ERR(ptr: swap_file)) {
3020	error = PTR_ERR(ptr: swap_file);
3021	swap_file = NULL;
3022	goto bad_swap;
3023	}
3024
3025	p->swap_file = swap_file;
3026	mapping = swap_file->f_mapping;
3027	dentry = swap_file->f_path.dentry;
3028	inode = mapping->host;
3029
3030	error = claim_swapfile(p, inode);
3031	if (unlikely(error))
3032	goto bad_swap;
3033
3034	inode_lock(inode);
3035	if (d_unlinked(dentry) \|\| cant_mount(dentry)) {
3036	error = -ENOENT;
3037	goto bad_swap_unlock_inode;
3038	}
3039	if (IS_SWAPFILE(inode)) {
3040	error = -EBUSY;
3041	goto bad_swap_unlock_inode;
3042	}
3043
3044	/*
3045	* Read the swap header.
3046	*/
3047	if (!mapping->a_ops->read_folio) {
3048	error = -EINVAL;
3049	goto bad_swap_unlock_inode;
3050	}
3051	page = read_mapping_page(mapping, index: `0`, file: swap_file);
3052	if (IS_ERR(ptr: page)) {
3053	error = PTR_ERR(ptr: page);
3054	goto bad_swap_unlock_inode;
3055	}
3056	swap_header = kmap(page);
3057
3058	maxpages = read_swap_header(p, swap_header, inode);
3059	if (unlikely(!maxpages)) {
3060	error = -EINVAL;
3061	goto bad_swap_unlock_inode;
3062	}
3063
3064	/ OK, set up the swap map and apply the bad block list /
3065	swap_map = vzalloc(size: maxpages);
3066	if (!swap_map) {
3067	error = -ENOMEM;
3068	goto bad_swap_unlock_inode;
3069	}
3070
3071	if (p->bdev && bdev_stable_writes(bdev: p->bdev))
3072	p->flags \|= SWP_STABLE_WRITES;
3073
3074	if (p->bdev && bdev_synchronous(bdev: p->bdev))
3075	p->flags \|= SWP_SYNCHRONOUS_IO;
3076
3077	if (p->bdev && bdev_nonrot(bdev: p->bdev)) {
3078	int cpu;
3079	unsigned long ci, nr_cluster;
3080
3081	p->flags \|= SWP_SOLIDSTATE;
3082	p->cluster_next_cpu = alloc_percpu(unsigned int);
3083	if (!p->cluster_next_cpu) {
3084	error = -ENOMEM;
3085	goto bad_swap_unlock_inode;
3086	}
3087	/*
3088	* select a random position to start with to help wear leveling
3089	* SSD
3090	*/
3091	for_each_possible_cpu(cpu) {
3092	per_cpu(*p->cluster_next_cpu, cpu) =
3093	get_random_u32_inclusive(floor: `1`, ceil: p->highest_bit);
3094	}
3095	nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
3096
3097	cluster_info = kvcalloc(n: nr_cluster, size: sizeof(*cluster_info),
3098	GFP_KERNEL);
3099	if (!cluster_info) {
3100	error = -ENOMEM;
3101	goto bad_swap_unlock_inode;
3102	}
3103
3104	for (ci = `0`; ci < nr_cluster; ci++)
3105	spin_lock_init(&((cluster_info + ci)->lock));
3106
3107	p->percpu_cluster = alloc_percpu(struct percpu_cluster);
3108	if (!p->percpu_cluster) {
3109	error = -ENOMEM;
3110	goto bad_swap_unlock_inode;
3111	}
3112	for_each_possible_cpu(cpu) {
3113	struct percpu_cluster *cluster;
3114	cluster = per_cpu_ptr(p->percpu_cluster, cpu);
3115	cluster_set_null(info: &cluster->index);
3116	}
3117	} else {
3118	atomic_inc(v: &nr_rotate_swap);
3119	inced_nr_rotate_swap = true;
3120	}
3121
3122	error = swap_cgroup_swapon(type: p->type, max_pages: maxpages);
3123	if (error)
3124	goto bad_swap_unlock_inode;
3125
3126	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
3127	cluster_info, maxpages, span: &span);
3128	if (unlikely(nr_extents < `0`)) {
3129	error = nr_extents;
3130	goto bad_swap_unlock_inode;
3131	}
3132
3133	if ((swap_flags & SWAP_FLAG_DISCARD) &&
3134	p->bdev && bdev_max_discard_sectors(bdev: p->bdev)) {
3135	/*
3136	* When discard is enabled for swap with no particular
3137	* policy flagged, we set all swap discard flags here in
3138	* order to sustain backward compatibility with older
3139	* swapon(8) releases.
3140	*/
3141	p->flags \|= (SWP_DISCARDABLE \| SWP_AREA_DISCARD \|
3142	SWP_PAGE_DISCARD);
3143
3144	/*
3145	* By flagging sys_swapon, a sysadmin can tell us to
3146	* either do single-time area discards only, or to just
3147	* perform discards for released swap page-clusters.
3148	* Now it's time to adjust the p->flags accordingly.
3149	*/
3150	if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
3151	p->flags &= ~SWP_PAGE_DISCARD;
3152	else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
3153	p->flags &= ~SWP_AREA_DISCARD;
3154
3155	/ issue a swapon-time discard if it's still required /
3156	if (p->flags & SWP_AREA_DISCARD) {
3157	int err = discard_swap(si: p);
3158	if (unlikely(err))
3159	pr_err("swapon: discard_swap(%p): %d\n",
3160	p, err);
3161	}
3162	}
3163
3164	error = init_swap_address_space(type: p->type, nr_pages: maxpages);
3165	if (error)
3166	goto bad_swap_unlock_inode;
3167
3168	/*
3169	* Flush any pending IO and dirty mappings before we start using this
3170	* swap device.
3171	*/
3172	inode->i_flags \|= S_SWAPFILE;
3173	error = inode_drain_writes(inode);
3174	if (error) {
3175	inode->i_flags &= ~S_SWAPFILE;
3176	goto free_swap_address_space;
3177	}
3178
3179	mutex_lock(&swapon_mutex);
3180	prio = -`1`;
3181	if (swap_flags & SWAP_FLAG_PREFER)
3182	prio =
3183	(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
3184	enable_swap_info(p, prio, swap_map, cluster_info);
3185
3186	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
3187	K(p->pages), name->name, p->prio, nr_extents,
3188	K((unsigned long long)span),
3189	(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
3190	(p->flags & SWP_DISCARDABLE) ? "D" : "",
3191	(p->flags & SWP_AREA_DISCARD) ? "s" : "",
3192	(p->flags & SWP_PAGE_DISCARD) ? "c" : "");
3193
3194	mutex_unlock(lock: &swapon_mutex);
3195	atomic_inc(v: &proc_poll_event);
3196	wake_up_interruptible(&proc_poll_wait);
3197
3198	error = `0`;
3199	goto out;
3200	free_swap_address_space:
3201	exit_swap_address_space(type: p->type);
3202	bad_swap_unlock_inode:
3203	inode_unlock(inode);
3204	bad_swap:
3205	free_percpu(pdata: p->percpu_cluster);
3206	p->percpu_cluster = NULL;
3207	free_percpu(pdata: p->cluster_next_cpu);
3208	p->cluster_next_cpu = NULL;
3209	if (p->bdev_handle) {
3210	set_blocksize(bdev: p->bdev, size: p->old_block_size);
3211	bdev_release(handle: p->bdev_handle);
3212	p->bdev_handle = NULL;
3213	}
3214	inode = NULL;
3215	destroy_swap_extents(sis: p);
3216	swap_cgroup_swapoff(type: p->type);
3217	spin_lock(lock: &swap_lock);
3218	p->swap_file = NULL;
3219	p->flags = `0`;
3220	spin_unlock(lock: &swap_lock);
3221	vfree(addr: swap_map);
3222	kvfree(addr: cluster_info);
3223	if (inced_nr_rotate_swap)
3224	atomic_dec(v: &nr_rotate_swap);
3225	if (swap_file)
3226	filp_close(swap_file, NULL);
3227	out:
3228	if (page && !IS_ERR(ptr: page)) {
3229	kunmap(page);
3230	put_page(page);
3231	}
3232	if (name)
3233	putname(name);
3234	if (inode)
3235	inode_unlock(inode);
3236	if (!error)
3237	enable_swap_slots_cache();
3238	return error;
3239	}
3240
3241	void si_swapinfo(struct sysinfo *val)
3242	{
3243	unsigned int type;
3244	unsigned long nr_to_be_unused = `0`;
3245
3246	spin_lock(lock: &swap_lock);
3247	for (type = `0`; type < nr_swapfiles; type++) {
3248	struct swap_info_struct *si = swap_info[type];
3249
3250	if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
3251	nr_to_be_unused += READ_ONCE(si->inuse_pages);
3252	}
3253	val->freeswap = atomic_long_read(v: &nr_swap_pages) + nr_to_be_unused;
3254	val->totalswap = total_swap_pages + nr_to_be_unused;
3255	spin_unlock(lock: &swap_lock);
3256	}
3257
3258	/*
3259	* Verify that a swap entry is valid and increment its swap map count.
3260	*
3261	* Returns error code in following case.
3262	* - success -> 0
3263	* - swp_entry is invalid -> EINVAL
3264	* - swp_entry is migration entry -> EINVAL
3265	* - swap-cache reference is requested but there is already one. -> EEXIST
3266	* - swap-cache reference is requested but the entry is not used. -> ENOENT
3267	* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
3268	*/
3269	static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
3270	{
3271	struct swap_info_struct *p;
3272	struct swap_cluster_info *ci;
3273	unsigned long offset;
3274	unsigned char count;
3275	unsigned char has_cache;
3276	int err;
3277
3278	p = swp_swap_info(entry);
3279
3280	offset = swp_offset(entry);
3281	ci = lock_cluster_or_swap_info(si: p, offset);
3282
3283	count = p->swap_map[offset];
3284
3285	/*
3286	* swapin_readahead() doesn't check if a swap entry is valid, so the
3287	* swap entry could be SWAP_MAP_BAD. Check here with lock held.
3288	*/
3289	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
3290	err = -ENOENT;
3291	goto unlock_out;
3292	}
3293
3294	has_cache = count & SWAP_HAS_CACHE;
3295	count &= ~SWAP_HAS_CACHE;
3296	err = `0`;
3297
3298	if (usage == SWAP_HAS_CACHE) {
3299
3300	/ set SWAP_HAS_CACHE if there is no cache and entry is used /
3301	if (!has_cache && count)
3302	has_cache = SWAP_HAS_CACHE;
3303	else if (has_cache) / someone else added cache /
3304	err = -EEXIST;
3305	else / no users remaining /
3306	err = -ENOENT;
3307
3308	} else if (count \|\| has_cache) {
3309
3310	if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
3311	count += usage;
3312	else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
3313	err = -EINVAL;
3314	else if (swap_count_continued(p, offset, count))
3315	count = COUNT_CONTINUED;
3316	else
3317	err = -ENOMEM;
3318	} else
3319	err = -ENOENT; / unused swap entry /
3320
3321	WRITE_ONCE(p->swap_map[offset], count \| has_cache);
3322
3323	unlock_out:
3324	unlock_cluster_or_swap_info(si: p, ci);
3325	return err;
3326	}
3327
3328	/*
3329	* Help swapoff by noting that swap entry belongs to shmem/tmpfs
3330	* (in which case its reference count is never incremented).
3331	*/
3332	void swap_shmem_alloc(swp_entry_t entry)
3333	{
3334	__swap_duplicate(entry, SWAP_MAP_SHMEM);
3335	}
3336
3337	/*
3338	* Increase reference count of swap entry by 1.
3339	* Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
3340	* but could not be atomically allocated. Returns 0, just as if it succeeded,
3341	* if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
3342	* might occur if a page table entry has got corrupted.
3343	*/
3344	int swap_duplicate(swp_entry_t entry)
3345	{
3346	int err = `0`;
3347
3348	while (!err && __swap_duplicate(entry, usage: `1`) == -ENOMEM)
3349	err = add_swap_count_continuation(entry, GFP_ATOMIC);
3350	return err;
3351	}
3352
3353	/*
3354	* @entry: swap entry for which we allocate swap cache.
3355	*
3356	* Called when allocating swap cache for existing swap entry,
3357	* This can return error codes. Returns 0 at success.
3358	* -EEXIST means there is a swap cache.
3359	* Note: return code is different from swap_duplicate().
3360	*/
3361	int swapcache_prepare(swp_entry_t entry)
3362	{
3363	return __swap_duplicate(entry, SWAP_HAS_CACHE);
3364	}
3365
3366	struct swap_info_struct *swp_swap_info(swp_entry_t entry)
3367	{
3368	return swap_type_to_swap_info(type: swp_type(entry));
3369	}
3370
3371	struct swap_info_struct page_swap_info(struct* page *page)
3372	{
3373	swp_entry_t entry = page_swap_entry(page);
3374	return swp_swap_info(entry);
3375	}
3376
3377	/*
3378	* out-of-line methods to avoid include hell.
3379	*/
3380	struct address_space swapcache_mapping(struct* folio *folio)
3381	{
3382	return page_swap_info(page: &folio->page)->swap_file->f_mapping;
3383	}
3384	EXPORT_SYMBOL_GPL(swapcache_mapping);
3385
3386	pgoff_t __page_file_index(struct page *page)
3387	{
3388	swp_entry_t swap = page_swap_entry(page);
3389	return swp_offset(entry: swap);
3390	}
3391	EXPORT_SYMBOL_GPL(__page_file_index);
3392
3393	/*
3394	* add_swap_count_continuation - called when a swap count is duplicated
3395	* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
3396	* page of the original vmalloc'ed swap_map, to hold the continuation count
3397	* (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
3398	* again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
3399	*
3400	* These continuation pages are seldom referenced: the common paths all work
3401	* on the original swap_map, only referring to a continuation page when the
3402	* low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
3403	*
3404	* add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
3405	* page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
3406	* can be called after dropping locks.
3407	*/
3408	int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
3409	{
3410	struct swap_info_struct *si;
3411	struct swap_cluster_info *ci;
3412	struct page *head;
3413	struct page *page;
3414	struct page *list_page;
3415	pgoff_t offset;
3416	unsigned char count;
3417	int ret = `0`;
3418
3419	/*
3420	* When debugging, it's easier to use __GFP_ZERO here; but it's better
3421	* for latency not to zero a page while GFP_ATOMIC and holding locks.
3422	*/
3423	page = alloc_page(gfp_mask \| __GFP_HIGHMEM);
3424
3425	si = get_swap_device(entry);
3426	if (!si) {
3427	/*
3428	* An acceptable race has occurred since the failing
3429	* __swap_duplicate(): the swap device may be swapoff
3430	*/
3431	goto outer;
3432	}
3433	spin_lock(lock: &si->lock);
3434
3435	offset = swp_offset(entry);
3436
3437	ci = lock_cluster(si, offset);
3438
3439	count = swap_count(ent: si->swap_map[offset]);
3440
3441	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
3442	/*
3443	* The higher the swap count, the more likely it is that tasks
3444	* will race to add swap count continuation: we need to avoid
3445	* over-provisioning.
3446	*/
3447	goto out;
3448	}
3449
3450	if (!page) {
3451	ret = -ENOMEM;
3452	goto out;
3453	}
3454
3455	head = vmalloc_to_page(addr: si->swap_map + offset);
3456	offset &= ~PAGE_MASK;
3457
3458	spin_lock(lock: &si->cont_lock);
3459	/*
3460	* Page allocation does not initialize the page's lru field,
3461	* but it does always reset its private field.
3462	*/
3463	if (!page_private(head)) {
3464	BUG_ON(count & COUNT_CONTINUED);
3465	INIT_LIST_HEAD(list: &head->lru);
3466	set_page_private(page: head, private: SWP_CONTINUED);
3467	si->flags \|= SWP_CONTINUED;
3468	}
3469
3470	list_for_each_entry(list_page, &head->lru, lru) {
3471	unsigned char *map;
3472
3473	/*
3474	* If the previous map said no continuation, but we've found
3475	* a continuation page, free our allocation and use this one.
3476	*/
3477	if (!(count & COUNT_CONTINUED))
3478	goto out_unlock_cont;
3479
3480	map = kmap_atomic(page: list_page) + offset;
3481	count = *map;
3482	kunmap_atomic(map);
3483
3484	/*
3485	* If this continuation count now has some space in it,
3486	* free our allocation and use this one.
3487	*/
3488	if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
3489	goto out_unlock_cont;
3490	}
3491
3492	list_add_tail(new: &page->lru, head: &head->lru);
3493	page = NULL; / now it's attached, don't free it /
3494	out_unlock_cont:
3495	spin_unlock(lock: &si->cont_lock);
3496	out:
3497	unlock_cluster(ci);
3498	spin_unlock(lock: &si->lock);
3499	put_swap_device(si);
3500	outer:
3501	if (page)
3502	__free_page(page);
3503	return ret;
3504	}
3505
3506	/*
3507	* swap_count_continued - when the original swap_map count is incremented
3508	* from SWAP_MAP_MAX, check if there is already a continuation page to carry
3509	* into, carry if so, or else fail until a new continuation page is allocated;
3510	* when the original swap_map count is decremented from 0 with continuation,
3511	* borrow from the continuation and report whether it still holds more.
3512	* Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
3513	* lock.
3514	*/
3515	static bool swap_count_continued(struct swap_info_struct *si,
3516	pgoff_t offset, unsigned char count)
3517	{
3518	struct page *head;
3519	struct page *page;
3520	unsigned char *map;
3521	bool ret;
3522
3523	head = vmalloc_to_page(addr: si->swap_map + offset);
3524	if (page_private(head) != SWP_CONTINUED) {
3525	BUG_ON(count & COUNT_CONTINUED);
3526	return false; / need to add count continuation /
3527	}
3528
3529	spin_lock(lock: &si->cont_lock);
3530	offset &= ~PAGE_MASK;
3531	page = list_next_entry(head, lru);
3532	map = kmap_atomic(page) + offset;
3533
3534	if (count == SWAP_MAP_MAX) / initial increment from swap_map /
3535	goto init_map; / jump over SWAP_CONT_MAX checks /
3536
3537	if (count == (SWAP_MAP_MAX \| COUNT_CONTINUED)) { / incrementing /
3538	/*
3539	* Think of how you add 1 to 999
3540	*/
3541	while (*map == (SWAP_CONT_MAX \| COUNT_CONTINUED)) {
3542	kunmap_atomic(map);
3543	page = list_next_entry(page, lru);
3544	BUG_ON(page == head);
3545	map = kmap_atomic(page) + offset;
3546	}
3547	if (*map == SWAP_CONT_MAX) {
3548	kunmap_atomic(map);
3549	page = list_next_entry(page, lru);
3550	if (page == head) {
3551	ret = false; / add count continuation /
3552	goto out;
3553	}
3554	map = kmap_atomic(page) + offset;
3555	init_map: map = `0`; /* we didn't zero the page /
3556	}
3557	*map += `1`;
3558	kunmap_atomic(map);
3559	while ((page = list_prev_entry(page, lru)) != head) {
3560	map = kmap_atomic(page) + offset;
3561	*map = COUNT_CONTINUED;
3562	kunmap_atomic(map);
3563	}
3564	ret = true; / incremented /
3565
3566	} else { / decrementing /
3567	/*
3568	* Think of how you subtract 1 from 1000
3569	*/
3570	BUG_ON(count != COUNT_CONTINUED);
3571	while (*map == COUNT_CONTINUED) {
3572	kunmap_atomic(map);
3573	page = list_next_entry(page, lru);
3574	BUG_ON(page == head);
3575	map = kmap_atomic(page) + offset;
3576	}
3577	BUG_ON(*map == `0`);
3578	*map -= `1`;
3579	if (*map == `0`)
3580	count = `0`;
3581	kunmap_atomic(map);
3582	while ((page = list_prev_entry(page, lru)) != head) {
3583	map = kmap_atomic(page) + offset;
3584	*map = SWAP_CONT_MAX \| count;
3585	count = COUNT_CONTINUED;
3586	kunmap_atomic(map);
3587	}
3588	ret = count == COUNT_CONTINUED;
3589	}
3590	out:
3591	spin_unlock(lock: &si->cont_lock);
3592	return ret;
3593	}
3594
3595	/*
3596	* free_swap_count_continuations - swapoff free all the continuation pages
3597	* appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
3598	*/
3599	static void free_swap_count_continuations(struct swap_info_struct *si)
3600	{
3601	pgoff_t offset;
3602
3603	for (offset = `0`; offset < si->max; offset += PAGE_SIZE) {
3604	struct page *head;
3605	head = vmalloc_to_page(addr: si->swap_map + offset);
3606	if (page_private(head)) {
3607	struct page page, next;
3608
3609	list_for_each_entry_safe(page, next, &head->lru, lru) {
3610	list_del(entry: &page->lru);
3611	__free_page(page);
3612	}
3613	}
3614	}
3615	}
3616
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * __folio_throttle_swaprate - schedule a blkcg throttle against a swap device.
 * @folio: folio whose node id selects the list of available swap areas.
 * @gfp:   allocation flags; nothing is done unless __GFP_IO is set.
 *
 * Does nothing unless blk_cgroup_congested() reports congestion and the
 * current task has no throttle scheduled yet.  Otherwise the first swap
 * area on the folio's node list that has a block device gets a throttle
 * scheduled on its disk.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
	struct swap_info_struct *si, *next;
	int nid = folio_nid(folio);

	if (!(gfp & __GFP_IO))
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_disk)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		if (si->bdev) {
			blkcg_schedule_throttle(si->bdev->bd_disk, true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif
3647
3648	static int __init swapfile_init(void)
3649	{
3650	int nid;
3651
3652	swap_avail_heads = kmalloc_array(n: nr_node_ids, size: sizeof(struct plist_head),
3653	GFP_KERNEL);
3654	if (!swap_avail_heads) {
3655	pr_emerg("Not enough memory for swap heads, swap is disabled\n");
3656	return -ENOMEM;
3657	}
3658
3659	for_each_node(nid)
3660	plist_head_init(head: &swap_avail_heads[nid]);
3661
3662	swapfile_maximum_size = arch_max_swapfile_size();
3663
3664	#ifdef CONFIG_MIGRATION
3665	if (swapfile_maximum_size >= (`1UL` << SWP_MIG_TOTAL_BITS))
3666	swap_migration_ad_supported = true;
3667	#endif /* CONFIG_MIGRATION */
3668
3669	return `0`;
3670	}
3671	subsys_initcall(swapfile_init);
3672

source code of linux/mm/swapfile.c