vmalloc.c source code [linux/mm/vmalloc.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 1993 Linus Torvalds
4	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
5	* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
6	* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
7	* Numa awareness, Christoph Lameter, SGI, June 2005
8	* Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
9	*/
10
11	#include <linux/vmalloc.h>
12	#include <linux/mm.h>
13	#include <linux/module.h>
14	#include <linux/highmem.h>
15	#include <linux/sched/signal.h>
16	#include <linux/slab.h>
17	#include <linux/spinlock.h>
18	#include <linux/interrupt.h>
19	#include <linux/proc_fs.h>
20	#include <linux/seq_file.h>
21	#include <linux/set_memory.h>
22	#include <linux/debugobjects.h>
23	#include <linux/kallsyms.h>
24	#include <linux/list.h>
25	#include <linux/notifier.h>
26	#include <linux/rbtree.h>
27	#include <linux/xarray.h>
28	#include <linux/io.h>
29	#include <linux/rcupdate.h>
30	#include <linux/pfn.h>
31	#include <linux/kmemleak.h>
32	#include <linux/atomic.h>
33	#include <linux/compiler.h>
34	#include <linux/memcontrol.h>
35	#include <linux/llist.h>
36	#include <linux/uio.h>
37	#include <linux/bitops.h>
38	#include <linux/rbtree_augmented.h>
39	#include <linux/overflow.h>
40	#include <linux/pgtable.h>
41	#include <linux/hugetlb.h>
42	#include <linux/sched/mm.h>
43	#include <asm/tlbflush.h>
44	#include <asm/shmparam.h>
45
46	#define CREATE_TRACE_POINTS
47	#include <trace/events/vmalloc.h>
48
49	#include "internal.h"
50	#include "pgalloc-track.h"
51
52	#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
53	static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - `1`;
54
55	static int __init set_nohugeiomap(char *str)
56	{
57	ioremap_max_page_shift = PAGE_SHIFT;
58	return `0`;
59	}
60	early_param("nohugeiomap", set_nohugeiomap);
61	#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
62	static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
63	#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
64
65	#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
66	static bool __ro_after_init vmap_allow_huge = true;
67
68	static int __init set_nohugevmalloc(char *str)
69	{
70	vmap_allow_huge = false;
71	return `0`;
72	}
73	early_param("nohugevmalloc", set_nohugevmalloc);
74	#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
75	static const bool vmap_allow_huge = false;
76	#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
77
78	bool is_vmalloc_addr(const void *x)
79	{
80	unsigned long addr = (unsigned long)kasan_reset_tag(addr: x);
81
82	return addr >= VMALLOC_START && addr < VMALLOC_END;
83	}
84	EXPORT_SYMBOL(is_vmalloc_addr);
85
/*
 * Per-CPU pairing of a lock-less list head with a work item.
 * NOTE(review): presumably used to hand deferred vfree() requests to a
 * worker; the users are not visible in this part of the file - confirm.
 */
86	struct vfree_deferred {
87	struct llist_head list;
88	struct work_struct wq;
89	};
/* One instance of the above per CPU. */
90	static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
91
92	/*** Page table manipulation functions ***/
93	static int vmap_pte_range(pmd_t pmd, unsigned* long addr, unsigned long end,
94	phys_addr_t phys_addr, pgprot_t prot,
95	unsigned int max_page_shift, pgtbl_mod_mask *mask)
96	{
97	pte_t *pte;
98	u64 pfn;
99	unsigned long size = PAGE_SIZE;
100
101	pfn = phys_addr >> PAGE_SHIFT;
102	pte = pte_alloc_kernel_track(pmd, addr, mask);
103	if (!pte)
104	return -ENOMEM;
105	do {
106	BUG_ON(!pte_none(ptep_get(pte)));
107
108	#ifdef CONFIG_HUGETLB_PAGE
109	size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
110	if (size != PAGE_SIZE) {
111	pte_t entry = pfn_pte(page_nr: pfn, pgprot: prot);
112
113	entry = arch_make_huge_pte(entry, ilog2(size), flags: `0`);
114	set_huge_pte_at(mm: &init_mm, addr, ptep: pte, pte: entry, sz: size);
115	pfn += PFN_DOWN(size);
116	continue;
117	}
118	#endif
119	set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
120	pfn++;
121	} while (pte += PFN_DOWN(size), addr += size, addr != end);
122	*mask \|= PGTBL_PTE_MODIFIED;
123	return `0`;
124	}
125
126	static int vmap_try_huge_pmd(pmd_t pmd, unsigned* long addr, unsigned long end,
127	phys_addr_t phys_addr, pgprot_t prot,
128	unsigned int max_page_shift)
129	{
130	if (max_page_shift < PMD_SHIFT)
131	return `0`;
132
133	if (!arch_vmap_pmd_supported(prot))
134	return `0`;
135
136	if ((end - addr) != PMD_SIZE)
137	return `0`;
138
139	if (!IS_ALIGNED(addr, PMD_SIZE))
140	return `0`;
141
142	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
143	return `0`;
144
145	if (pmd_present(pmd: *pmd) && !pmd_free_pte_page(pmd, addr))
146	return `0`;
147
148	return pmd_set_huge(pmd, addr: phys_addr, prot);
149	}
150
151	static int vmap_pmd_range(pud_t pud, unsigned* long addr, unsigned long end,
152	phys_addr_t phys_addr, pgprot_t prot,
153	unsigned int max_page_shift, pgtbl_mod_mask *mask)
154	{
155	pmd_t *pmd;
156	unsigned long next;
157
158	pmd = pmd_alloc_track(mm: &init_mm, pud, address: addr, mod_mask: mask);
159	if (!pmd)
160	return -ENOMEM;
161	do {
162	next = pmd_addr_end(addr, end);
163
164	if (vmap_try_huge_pmd(pmd, addr, end: next, phys_addr, prot,
165	max_page_shift)) {
166	*mask \|= PGTBL_PMD_MODIFIED;
167	continue;
168	}
169
170	if (vmap_pte_range(pmd, addr, end: next, phys_addr, prot, max_page_shift, mask))
171	return -ENOMEM;
172	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
173	return `0`;
174	}
175
176	static int vmap_try_huge_pud(pud_t pud, unsigned* long addr, unsigned long end,
177	phys_addr_t phys_addr, pgprot_t prot,
178	unsigned int max_page_shift)
179	{
180	if (max_page_shift < PUD_SHIFT)
181	return `0`;
182
183	if (!arch_vmap_pud_supported(prot))
184	return `0`;
185
186	if ((end - addr) != PUD_SIZE)
187	return `0`;
188
189	if (!IS_ALIGNED(addr, PUD_SIZE))
190	return `0`;
191
192	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
193	return `0`;
194
195	if (pud_present(pud: *pud) && !pud_free_pmd_page(pud, addr))
196	return `0`;
197
198	return pud_set_huge(pud, addr: phys_addr, prot);
199	}
200
201	static int vmap_pud_range(p4d_t p4d, unsigned* long addr, unsigned long end,
202	phys_addr_t phys_addr, pgprot_t prot,
203	unsigned int max_page_shift, pgtbl_mod_mask *mask)
204	{
205	pud_t *pud;
206	unsigned long next;
207
208	pud = pud_alloc_track(mm: &init_mm, p4d, address: addr, mod_mask: mask);
209	if (!pud)
210	return -ENOMEM;
211	do {
212	next = pud_addr_end(addr, end);
213
214	if (vmap_try_huge_pud(pud, addr, end: next, phys_addr, prot,
215	max_page_shift)) {
216	*mask \|= PGTBL_PUD_MODIFIED;
217	continue;
218	}
219
220	if (vmap_pmd_range(pud, addr, end: next, phys_addr, prot,
221	max_page_shift, mask))
222	return -ENOMEM;
223	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
224	return `0`;
225	}
226
227	static int vmap_try_huge_p4d(p4d_t p4d, unsigned* long addr, unsigned long end,
228	phys_addr_t phys_addr, pgprot_t prot,
229	unsigned int max_page_shift)
230	{
231	if (max_page_shift < P4D_SHIFT)
232	return `0`;
233
234	if (!arch_vmap_p4d_supported(prot))
235	return `0`;
236
237	if ((end - addr) != P4D_SIZE)
238	return `0`;
239
240	if (!IS_ALIGNED(addr, P4D_SIZE))
241	return `0`;
242
243	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
244	return `0`;
245
246	if (p4d_present(p4d: *p4d) && !p4d_free_pud_page(p4d, addr))
247	return `0`;
248
249	return p4d_set_huge(p4d, addr: phys_addr, prot);
250	}
251
252	static int vmap_p4d_range(pgd_t pgd, unsigned* long addr, unsigned long end,
253	phys_addr_t phys_addr, pgprot_t prot,
254	unsigned int max_page_shift, pgtbl_mod_mask *mask)
255	{
256	p4d_t *p4d;
257	unsigned long next;
258
259	p4d = p4d_alloc_track(mm: &init_mm, pgd, address: addr, mod_mask: mask);
260	if (!p4d)
261	return -ENOMEM;
262	do {
263	next = p4d_addr_end(addr, end);
264
265	if (vmap_try_huge_p4d(p4d, addr, end: next, phys_addr, prot,
266	max_page_shift)) {
267	*mask \|= PGTBL_P4D_MODIFIED;
268	continue;
269	}
270
271	if (vmap_pud_range(p4d, addr, end: next, phys_addr, prot,
272	max_page_shift, mask))
273	return -ENOMEM;
274	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
275	return `0`;
276	}
277
278	static int vmap_range_noflush(unsigned long addr, unsigned long end,
279	phys_addr_t phys_addr, pgprot_t prot,
280	unsigned int max_page_shift)
281	{
282	pgd_t *pgd;
283	unsigned long start;
284	unsigned long next;
285	int err;
286	pgtbl_mod_mask mask = `0`;
287
288	might_sleep();
289	BUG_ON(addr >= end);
290
291	start = addr;
292	pgd = pgd_offset_k(addr);
293	do {
294	next = pgd_addr_end(addr, end);
295	err = vmap_p4d_range(pgd, addr, end: next, phys_addr, prot,
296	max_page_shift, mask: &mask);
297	if (err)
298	break;
299	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
300
301	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
302	arch_sync_kernel_mappings(start, end);
303
304	return err;
305	}
306
307	int ioremap_page_range(unsigned long addr, unsigned long end,
308	phys_addr_t phys_addr, pgprot_t prot)
309	{
310	int err;
311
312	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
313	max_page_shift: ioremap_max_page_shift);
314	flush_cache_vmap(start: addr, end);
315	if (!err)
316	err = kmsan_ioremap_page_range(start: addr, end, phys_addr, prot,
317	page_shift: ioremap_max_page_shift);
318	return err;
319	}
320
321	static void vunmap_pte_range(pmd_t pmd, unsigned* long addr, unsigned long end,
322	pgtbl_mod_mask *mask)
323	{
324	pte_t *pte;
325
326	pte = pte_offset_kernel(pmd, address: addr);
327	do {
328	pte_t ptent = ptep_get_and_clear(mm: &init_mm, addr, ptep: pte);
329	WARN_ON(!pte_none(ptent) && !pte_present(ptent));
330	} while (pte++, addr += PAGE_SIZE, addr != end);
331	*mask \|= PGTBL_PTE_MODIFIED;
332	}
333
334	static void vunmap_pmd_range(pud_t pud, unsigned* long addr, unsigned long end,
335	pgtbl_mod_mask *mask)
336	{
337	pmd_t *pmd;
338	unsigned long next;
339	int cleared;
340
341	pmd = pmd_offset(pud, address: addr);
342	do {
343	next = pmd_addr_end(addr, end);
344
345	cleared = pmd_clear_huge(pmd);
346	if (cleared \|\| pmd_bad(pmd: *pmd))
347	*mask \|= PGTBL_PMD_MODIFIED;
348
349	if (cleared)
350	continue;
351	if (pmd_none_or_clear_bad(pmd))
352	continue;
353	vunmap_pte_range(pmd, addr, end: next, mask);
354
355	cond_resched();
356	} while (pmd++, addr = next, addr != end);
357	}
358
359	static void vunmap_pud_range(p4d_t p4d, unsigned* long addr, unsigned long end,
360	pgtbl_mod_mask *mask)
361	{
362	pud_t *pud;
363	unsigned long next;
364	int cleared;
365
366	pud = pud_offset(p4d, address: addr);
367	do {
368	next = pud_addr_end(addr, end);
369
370	cleared = pud_clear_huge(pud);
371	if (cleared \|\| pud_bad(pud: *pud))
372	*mask \|= PGTBL_PUD_MODIFIED;
373
374	if (cleared)
375	continue;
376	if (pud_none_or_clear_bad(pud))
377	continue;
378	vunmap_pmd_range(pud, addr, end: next, mask);
379	} while (pud++, addr = next, addr != end);
380	}
381
382	static void vunmap_p4d_range(pgd_t pgd, unsigned* long addr, unsigned long end,
383	pgtbl_mod_mask *mask)
384	{
385	p4d_t *p4d;
386	unsigned long next;
387
388	p4d = p4d_offset(pgd, address: addr);
389	do {
390	next = p4d_addr_end(addr, end);
391
392	p4d_clear_huge(p4d);
393	if (p4d_bad(p4d: *p4d))
394	*mask \|= PGTBL_P4D_MODIFIED;
395
396	if (p4d_none_or_clear_bad(p4d))
397	continue;
398	vunmap_pud_range(p4d, addr, end: next, mask);
399	} while (p4d++, addr = next, addr != end);
400	}
401
402	/*
403	* vunmap_range_noflush is similar to vunmap_range, but does not
404	* flush caches or TLBs.
405	*
406	* The caller is responsible for calling flush_cache_vmap() before calling
407	* this function, and flush_tlb_kernel_range after it has returned
408	* successfully (and before the addresses are expected to cause a page fault
409	* or be re-mapped for something else, if TLB flushes are being delayed or
410	* coalesced).
411	*
412	* This is an internal function only. Do not use outside mm/.
413	*/
414	void __vunmap_range_noflush(unsigned long start, unsigned long end)
415	{
416	unsigned long next;
417	pgd_t *pgd;
418	unsigned long addr = start;
419	pgtbl_mod_mask mask = `0`;
420
421	BUG_ON(addr >= end);
422	pgd = pgd_offset_k(addr);
423	do {
424	next = pgd_addr_end(addr, end);
425	if (pgd_bad(pgd: *pgd))
426	mask \|= PGTBL_PGD_MODIFIED;
427	if (pgd_none_or_clear_bad(pgd))
428	continue;
429	vunmap_p4d_range(pgd, addr, end: next, mask: &mask);
430	} while (pgd++, addr = next, addr != end);
431
432	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
433	arch_sync_kernel_mappings(start, end);
434	}
435
/*
 * Unmap [start, end) without flushing caches or TLBs. KMSAN is told
 * about the unmap first, then the page tables are torn down by
 * __vunmap_range_noflush().
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	kmsan_vunmap_range_noflush(start, end);

	__vunmap_range_noflush(start, end);
}
441
/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 *
 * The order is: cache flush, page-table teardown, TLB flush.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}
457
458	static int vmap_pages_pte_range(pmd_t pmd, unsigned* long addr,
459	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
460	pgtbl_mod_mask *mask)
461	{
462	pte_t *pte;
463
464	/*
465	* nr is a running index into the array which helps higher level
466	* callers keep track of where we're up to.
467	*/
468
469	pte = pte_alloc_kernel_track(pmd, addr, mask);
470	if (!pte)
471	return -ENOMEM;
472	do {
473	struct page page = pages[nr];
474
475	if (WARN_ON(!pte_none(ptep_get(pte))))
476	return -EBUSY;
477	if (WARN_ON(!page))
478	return -ENOMEM;
479	if (WARN_ON(!pfn_valid(page_to_pfn(page))))
480	return -EINVAL;
481
482	set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
483	(*nr)++;
484	} while (pte++, addr += PAGE_SIZE, addr != end);
485	*mask \|= PGTBL_PTE_MODIFIED;
486	return `0`;
487	}
488
489	static int vmap_pages_pmd_range(pud_t pud, unsigned* long addr,
490	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
491	pgtbl_mod_mask *mask)
492	{
493	pmd_t *pmd;
494	unsigned long next;
495
496	pmd = pmd_alloc_track(mm: &init_mm, pud, address: addr, mod_mask: mask);
497	if (!pmd)
498	return -ENOMEM;
499	do {
500	next = pmd_addr_end(addr, end);
501	if (vmap_pages_pte_range(pmd, addr, end: next, prot, pages, nr, mask))
502	return -ENOMEM;
503	} while (pmd++, addr = next, addr != end);
504	return `0`;
505	}
506
507	static int vmap_pages_pud_range(p4d_t p4d, unsigned* long addr,
508	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
509	pgtbl_mod_mask *mask)
510	{
511	pud_t *pud;
512	unsigned long next;
513
514	pud = pud_alloc_track(mm: &init_mm, p4d, address: addr, mod_mask: mask);
515	if (!pud)
516	return -ENOMEM;
517	do {
518	next = pud_addr_end(addr, end);
519	if (vmap_pages_pmd_range(pud, addr, end: next, prot, pages, nr, mask))
520	return -ENOMEM;
521	} while (pud++, addr = next, addr != end);
522	return `0`;
523	}
524
525	static int vmap_pages_p4d_range(pgd_t pgd, unsigned* long addr,
526	unsigned long end, pgprot_t prot, struct page *pages, int* *nr,
527	pgtbl_mod_mask *mask)
528	{
529	p4d_t *p4d;
530	unsigned long next;
531
532	p4d = p4d_alloc_track(mm: &init_mm, pgd, address: addr, mod_mask: mask);
533	if (!p4d)
534	return -ENOMEM;
535	do {
536	next = p4d_addr_end(addr, end);
537	if (vmap_pages_pud_range(p4d, addr, end: next, prot, pages, nr, mask))
538	return -ENOMEM;
539	} while (p4d++, addr = next, addr != end);
540	return `0`;
541	}
542
543	static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
544	pgprot_t prot, struct page **pages)
545	{
546	unsigned long start = addr;
547	pgd_t *pgd;
548	unsigned long next;
549	int err = `0`;
550	int nr = `0`;
551	pgtbl_mod_mask mask = `0`;
552
553	BUG_ON(addr >= end);
554	pgd = pgd_offset_k(addr);
555	do {
556	next = pgd_addr_end(addr, end);
557	if (pgd_bad(pgd: *pgd))
558	mask \|= PGTBL_PGD_MODIFIED;
559	err = vmap_pages_p4d_range(pgd, addr, end: next, prot, pages, nr: &nr, mask: &mask);
560	if (err)
561	return err;
562	} while (pgd++, addr = next, addr != end);
563
564	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
565	arch_sync_kernel_mappings(start, end);
566
567	return `0`;
568	}
569
570	/*
571	* vmap_pages_range_noflush is similar to vmap_pages_range, but does not
572	* flush caches.
573	*
574	* The caller is responsible for calling flush_cache_vmap() after this
575	* function returns successfully and before the addresses are accessed.
576	*
577	* This is an internal function only. Do not use outside mm/.
578	*/
579	int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
580	pgprot_t prot, struct page *pages, unsigned* int page_shift)
581	{
582	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
583
584	WARN_ON(page_shift < PAGE_SHIFT);
585
586	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) \|\|
587	page_shift == PAGE_SHIFT)
588	return vmap_small_pages_range_noflush(addr, end, prot, pages);
589
590	for (i = `0`; i < nr; i += `1U` << (page_shift - PAGE_SHIFT)) {
591	int err;
592
593	err = vmap_range_noflush(addr, end: addr + (`1UL` << page_shift),
594	page_to_phys(pages[i]), prot,
595	max_page_shift: page_shift);
596	if (err)
597	return err;
598
599	addr += `1UL` << page_shift;
600	}
601
602	return `0`;
603	}
604
605	int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
606	pgprot_t prot, struct page *pages, unsigned* int page_shift)
607	{
608	int ret = kmsan_vmap_pages_range_noflush(start: addr, end, prot, pages,
609	page_shift);
610
611	if (ret)
612	return ret;
613	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
614	}
615
616	/**
617	* vmap_pages_range - map pages to a kernel virtual address
618	* @addr: start of the VM area to map
619	* @end: end of the VM area to map (non-inclusive)
620	* @prot: page protection flags to use
621	* @pages: pages to map (always PAGE_SIZE pages)
622	* @page_shift: maximum shift that the pages may be mapped with, @pages must
623	* be aligned and contiguous up to at least this shift.
624	*
625	* RETURNS:
626	* 0 on success, -errno on failure.
627	*/
628	static int vmap_pages_range(unsigned long addr, unsigned long end,
629	pgprot_t prot, struct page *pages, unsigned* int page_shift)
630	{
631	int err;
632
633	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
634	flush_cache_vmap(start: addr, end);
635	return err;
636	}
637
/*
 * is_vmalloc_or_module_addr - test for a vmalloc or module-space address
 * @x: pointer to test
 *
 * Return: 1 if the (KASAN-untagged) address lies in
 * [MODULES_VADDR, MODULES_END) when that range is configured, otherwise
 * the result of is_vmalloc_addr().
 */
int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
653
654	/*
655	* Walk a vmap address to the struct page it maps. Huge vmap mappings will
656	* return the tail page that corresponds to the base page address, which
657	* matches small vmap mappings.
658	*/
659	struct page vmalloc_to_page(const* void *vmalloc_addr)
660	{
661	unsigned long addr = (unsigned long) vmalloc_addr;
662	struct page *page = NULL;
663	pgd_t *pgd = pgd_offset_k(addr);
664	p4d_t *p4d;
665	pud_t *pud;
666	pmd_t *pmd;
667	pte_t *ptep, pte;
668
669	/*
670	* XXX we might need to change this if we add VIRTUAL_BUG_ON for
671	* architectures that do not vmalloc module space
672	*/
673	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
674
675	if (pgd_none(pgd: *pgd))
676	return NULL;
677	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
678	return NULL; / XXX: no allowance for huge pgd /
679	if (WARN_ON_ONCE(pgd_bad(*pgd)))
680	return NULL;
681
682	p4d = p4d_offset(pgd, address: addr);
683	if (p4d_none(p4d: *p4d))
684	return NULL;
685	if (p4d_leaf(p4d: *p4d))
686	return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
687	if (WARN_ON_ONCE(p4d_bad(*p4d)))
688	return NULL;
689
690	pud = pud_offset(p4d, address: addr);
691	if (pud_none(pud: *pud))
692	return NULL;
693	if (pud_leaf(pud: *pud))
694	return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
695	if (WARN_ON_ONCE(pud_bad(*pud)))
696	return NULL;
697
698	pmd = pmd_offset(pud, address: addr);
699	if (pmd_none(pmd: *pmd))
700	return NULL;
701	if (pmd_leaf(pte: *pmd))
702	return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
703	if (WARN_ON_ONCE(pmd_bad(*pmd)))
704	return NULL;
705
706	ptep = pte_offset_kernel(pmd, address: addr);
707	pte = ptep_get(ptep);
708	if (pte_present(a: pte))
709	page = pte_page(pte);
710
711	return page;
712	}
713	EXPORT_SYMBOL(vmalloc_to_page);
714
/*
 * Translate a vmalloc()-space virtual address into the page frame
 * number of the page that backs it. The lookup is done by
 * vmalloc_to_page(); its result is converted unchecked.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	struct page *backing = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(backing);
}
EXPORT_SYMBOL(vmalloc_to_pfn);
723
724
725	/*** Global kva allocator ***/
726
727	#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
728	#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
729
730
/*
 * Spinlocks for the allocator state declared below.
 * NOTE(review): by name these pair with the busy (vmap_area_*) and free
 * (free_vmap_area_*) structures; the locking sites are outside this
 * part of the file - confirm.
 */
731	static DEFINE_SPINLOCK(vmap_area_lock);
732	static DEFINE_SPINLOCK(free_vmap_area_lock);
733	/* Export for kexec only */
734	LIST_HEAD(vmap_area_list);
/* rb-tree searched by find_vmap_area_exceed_addr(). */
735	static struct rb_root vmap_area_root = RB_ROOT;
736	static bool vmap_initialized __read_mostly;
737
/*
 * Tree, list and lock with the purge_ prefix.
 * NOTE(review): presumably hold areas awaiting purge - confirm.
 */
738	static struct rb_root purge_vmap_area_root = RB_ROOT;
739	static LIST_HEAD(purge_vmap_area_list);
740	static DEFINE_SPINLOCK(purge_vmap_area_lock);
741
742	/*
743	* This kmem_cache is used for vmap_area objects. Instead of
744	* allocating from slab we reuse an object from this cache to
745	* make things faster. Especially in "no edge" splitting of
746	* free block.
747	*/
748	static struct kmem_cache *vmap_area_cachep;
749
750	/*
751	* This linked list is used in pair with free_vmap_area_root.
752	* It gives O(1) access to prev/next to perform fast coalescing.
753	*/
754	static LIST_HEAD(free_vmap_area_list);
755
756	/*
757	* This augment red-black tree represents the free vmap space.
758	* All vmap_area objects in this tree are sorted by va->va_start
759	* address. It is used for allocation and merging when a vmap
760	* object is released.
761	*
762	* Each vmap_area node contains a maximum available free block
763	* of its sub-tree, right or left. Therefore it is possible to
764	* find a lowest match of free area.
765	*/
766	static struct rb_root free_vmap_area_root = RB_ROOT;
767
768	/*
769	* Preload a CPU with one object for "no edge" split case. The
770	* aim is to get rid of allocations from the atomic context, thus
771	* to use more permissive allocation masks.
772	*/
773	static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
774
775	static __always_inline unsigned long
776	va_size(struct vmap_area *va)
777	{
778	return (va->va_end - va->va_start);
779	}
780
781	static __always_inline unsigned long
782	get_subtree_max_size(struct rb_node *node)
783	{
784	struct vmap_area *va;
785
786	va = rb_entry_safe(node, struct vmap_area, rb_node);
787	return va ? va->subtree_max_size : `0`;
788	}
789
/*
 * Generate the augmented rb-tree callbacks free_vmap_area_rb_augment_cb
 * for struct vmap_area: the augmented field is subtree_max_size and the
 * per-node value comes from va_size(). The callbacks are passed to
 * rb_insert_augmented()/rb_erase_augmented() in __link_va()/__unlink_va().
 */
790	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
791	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
792
/* Forward declarations; the definitions are further down the file. */
793	static void reclaim_and_purge_vmap_areas(void);
794	static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
795	static void drain_vmap_area_work(struct work_struct *work);
/* Work item whose handler is drain_vmap_area_work(). */
796	static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
797
/* Counter reported by vmalloc_nr_pages(). */
798	static atomic_long_t nr_vmalloc_pages;
799
800	unsigned long vmalloc_nr_pages(void)
801	{
802	return atomic_long_read(v: &nr_vmalloc_pages);
803	}
804
805	/ Look up the first VA which satisfies addr < va_end, NULL if none. /
806	static struct vmap_area find_vmap_area_exceed_addr(unsigned* long addr)
807	{
808	struct vmap_area *va = NULL;
809	struct rb_node *n = vmap_area_root.rb_node;
810
811	addr = (unsigned long)kasan_reset_tag(addr: (void *)addr);
812
813	while (n) {
814	struct vmap_area *tmp;
815
816	tmp = rb_entry(n, struct vmap_area, rb_node);
817	if (tmp->va_end > addr) {
818	va = tmp;
819	if (tmp->va_start <= addr)
820	break;
821
822	n = n->rb_left;
823	} else
824	n = n->rb_right;
825	}
826
827	return va;
828	}
829
830	static struct vmap_area __find_vmap_area(unsigned* long addr, struct rb_root *root)
831	{
832	struct rb_node *n = root->rb_node;
833
834	addr = (unsigned long)kasan_reset_tag(addr: (void *)addr);
835
836	while (n) {
837	struct vmap_area *va;
838
839	va = rb_entry(n, struct vmap_area, rb_node);
840	if (addr < va->va_start)
841	n = n->rb_left;
842	else if (addr >= va->va_end)
843	n = n->rb_right;
844	else
845	return va;
846	}
847
848	return NULL;
849	}
850
851	/*
852	* This function returns back addresses of parent node
853	* and its left or right link for further processing.
854	*
855	* Otherwise NULL is returned. In that case all further
856	* steps regarding inserting of conflicting overlap range
857	* have to be declined and actually considered as a bug.
858	*/
859	static __always_inline struct rb_node **
860	find_va_links(struct vmap_area *va,
861	struct rb_root root, struct* rb_node *from,
862	struct rb_node **parent)
863	{
864	struct vmap_area *tmp_va;
865	struct rb_node **link;
866
867	if (root) {
868	link = &root->rb_node;
869	if (unlikely(!*link)) {
870	*parent = NULL;
871	return link;
872	}
873	} else {
874	link = &from;
875	}
876
877	/*
878	* Go to the bottom of the tree. When we hit the last point
879	* we end up with parent rb_node and correct direction, i name
880	* it link, where the new va->rb_node will be attached to.
881	*/
882	do {
883	tmp_va = rb_entry(link, struct* vmap_area, rb_node);
884
885	/*
886	* During the traversal we also do some sanity check.
887	* Trigger the BUG() if there are sides(left/right)
888	* or full overlaps.
889	*/
890	if (va->va_end <= tmp_va->va_start)
891	link = &(*link)->rb_left;
892	else if (va->va_start >= tmp_va->va_end)
893	link = &(*link)->rb_right;
894	else {
895	WARN(`1`, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
896	va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
897
898	return NULL;
899	}
900	} while (*link);
901
902	*parent = &tmp_va->rb_node;
903	return link;
904	}
905
906	static __always_inline struct list_head *
907	get_va_next_sibling(struct rb_node parent, struct* rb_node **link)
908	{
909	struct list_head *list;
910
911	if (unlikely(!parent))
912	/*
913	* The red-black tree where we try to find VA neighbors
914	* before merging or inserting is empty, i.e. it means
915	* there is no free vmap space. Normally it does not
916	* happen but we handle this case anyway.
917	*/
918	return NULL;
919
920	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
921	return (&parent->rb_right == link ? list->next : list);
922	}
923
924	static __always_inline void
925	__link_va(struct vmap_area va, struct* rb_root *root,
926	struct rb_node parent, struct* rb_node **link,
927	struct list_head *head, bool augment)
928	{
929	/*
930	* VA is still not in the list, but we can
931	* identify its future previous list_head node.
932	*/
933	if (likely(parent)) {
934	head = &rb_entry(parent, struct vmap_area, rb_node)->list;
935	if (&parent->rb_right != link)
936	head = head->prev;
937	}
938
939	/ Insert to the rb-tree /
940	rb_link_node(node: &va->rb_node, parent, rb_link: link);
941	if (augment) {
942	/*
943	* Some explanation here. Just perform simple insertion
944	* to the tree. We do not set va->subtree_max_size to
945	* its current size before calling rb_insert_augmented().
946	* It is because we populate the tree from the bottom
947	* to parent levels when the node _is_ in the tree.
948	*
949	* Therefore we set subtree_max_size to zero after insertion,
950	* to let __augment_tree_propagate_from() puts everything to
951	* the correct order later on.
952	*/
953	rb_insert_augmented(node: &va->rb_node,
954	root, augment: &free_vmap_area_rb_augment_cb);
955	va->subtree_max_size = `0`;
956	} else {
957	rb_insert_color(&va->rb_node, root);
958	}
959
960	/ Address-sort this list /
961	list_add(new: &va->list, head);
962	}
963
964	static __always_inline void
965	link_va(struct vmap_area va, struct* rb_root *root,
966	struct rb_node parent, struct* rb_node **link,
967	struct list_head *head)
968	{
969	__link_va(va, root, parent, link, head, augment: false);
970	}
971
972	static __always_inline void
973	link_va_augment(struct vmap_area va, struct* rb_root *root,
974	struct rb_node parent, struct* rb_node **link,
975	struct list_head *head)
976	{
977	__link_va(va, root, parent, link, head, augment: true);
978	}
979
980	static __always_inline void
981	__unlink_va(struct vmap_area va, struct* rb_root *root, bool augment)
982	{
983	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
984	return;
985
986	if (augment)
987	rb_erase_augmented(node: &va->rb_node,
988	root, augment: &free_vmap_area_rb_augment_cb);
989	else
990	rb_erase(&va->rb_node, root);
991
992	list_del_init(entry: &va->list);
993	RB_CLEAR_NODE(&va->rb_node);
994	}
995
996	static __always_inline void
997	unlink_va(struct vmap_area va, struct* rb_root *root)
998	{
999	__unlink_va(va, root, augment: false);
1000	}
1001
1002	static __always_inline void
1003	unlink_va_augment(struct vmap_area va, struct* rb_root *root)
1004	{
1005	__unlink_va(va, root, augment: true);
1006	}
1007
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute the value subtree_max_size should have for @va: the largest
 * of its own size and the cached maxima of its two children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

/*
 * Debug-only consistency check: walk free_vmap_area_list and report
 * every node whose cached subtree_max_size differs from the recomputed
 * value.
 */
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long expected;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		expected = compute_subtree_max_size(va);
		if (expected == va->subtree_max_size)
			continue;

		pr_emerg("tree is corrupted: %lu, %lu\n",
			va_size(va), va->subtree_max_size);
	}
}
#endif
1034
1035	/*
1036	* This function populates subtree_max_size from bottom to upper
1037	* levels starting from VA point. The propagation must be done
1038	* when VA size is modified by changing its va_start/va_end. Or
1039	* in case of newly inserting of VA to the tree.
1040	*
1041	* It means that __augment_tree_propagate_from() must be called:
1042	* - After VA has been inserted to the tree(free path);
1043	* - After VA has been shrunk(allocation path);
1044	* - After VA has been increased(merging path).
1045	*
1046	* Please note that, it does not mean that upper parent nodes
1047	* and their subtree_max_size are recalculated all the time up
1048	* to the root node.
1049	*
1050	* 4--8
1051	* /\
1052	* / \
1053	* / \
1054	* 2--2 8--8
1055	*
1056	* For example if we modify the node 4, shrinking it to 2, then
1057	* no any modification is required. If we shrink the node 2 to 1
1058	* its subtree_max_size is updated only, and set to 1. If we shrink
1059	* the node 8 to 6, then its subtree_max_size is set to 6 and parent
1060	* node becomes 4--6.
1061	*/
1062	static __always_inline void
1063	augment_tree_propagate_from(struct vmap_area *va)
1064	{
1065	/*
1066	* Populate the tree from bottom towards the root until
1067	* the calculated maximum available size of checked node
1068	* is equal to its current one.
1069	*/
1070	free_vmap_area_rb_augment_cb_propagate(rb: &va->rb_node, NULL);
1071
1072	#if DEBUG_AUGMENT_PROPAGATE_CHECK
1073	augment_tree_propagate_check();
1074	#endif
1075	}
1076
1077	static void
1078	insert_vmap_area(struct vmap_area *va,
1079	struct rb_root root, struct* list_head *head)
1080	{
1081	struct rb_node **link;
1082	struct rb_node *parent;
1083
1084	link = find_va_links(va, root, NULL, parent: &parent);
1085	if (link)
1086	link_va(va, root, parent, link, head);
1087	}
1088
1089	static void
1090	insert_vmap_area_augment(struct vmap_area *va,
1091	struct rb_node from, struct* rb_root *root,
1092	struct list_head *head)
1093	{
1094	struct rb_node **link;
1095	struct rb_node *parent;
1096
1097	if (from)
1098	link = find_va_links(va, NULL, from, parent: &parent);
1099	else
1100	link = find_va_links(va, root, NULL, parent: &parent);
1101
1102	if (link) {
1103	link_va_augment(va, root, parent, link, head);
1104	augment_tree_propagate_from(va);
1105	}
1106	}
1107
1108	/*
1109	* Merge de-allocated chunk of VA memory with previous
1110	* and next free blocks. If coalesce is not done a new
1111	* free area is inserted. If VA has been merged, it is
1112	* freed.
1113	*
1114	* Please note, it can return NULL in case of overlap
1115	* ranges, followed by WARN() report. Despite it is a
1116	* buggy behaviour, a system can be alive and keep
1117	* ongoing.
1118	*/
1119	static __always_inline struct vmap_area *
1120	__merge_or_add_vmap_area(struct vmap_area *va,
1121	struct rb_root root, struct* list_head *head, bool augment)
1122	{
1123	struct vmap_area *sibling;
1124	struct list_head *next;
1125	struct rb_node **link;
1126	struct rb_node *parent;
1127	bool merged = false;
1128
1129	/*
1130	* Find a place in the tree where VA potentially will be
1131	* inserted, unless it is merged with its sibling/siblings.
1132	*/
1133	link = find_va_links(va, root, NULL, parent: &parent);
1134	if (!link)
1135	return NULL;
1136
1137	/*
1138	* Get next node of VA to check if merging can be done.
1139	*/
1140	next = get_va_next_sibling(parent, link);
1141	if (unlikely(next == NULL))
1142	goto insert;
1143
1144	/*
1145	* start end
1146	* \| \|
1147	* \|<------VA------>\|<-----Next----->\|
1148	* \| \|
1149	* start end
1150	*/
1151	if (next != head) {
1152	sibling = list_entry(next, struct vmap_area, list);
1153	if (sibling->va_start == va->va_end) {
1154	sibling->va_start = va->va_start;
1155
1156	/ Free vmap_area object. /
1157	kmem_cache_free(s: vmap_area_cachep, objp: va);
1158
1159	/ Point to the new merged area. /
1160	va = sibling;
1161	merged = true;
1162	}
1163	}
1164
1165	/*
1166	* start end
1167	* \| \|
1168	* \|<-----Prev----->\|<------VA------>\|
1169	* \| \|
1170	* start end
1171	*/
1172	if (next->prev != head) {
1173	sibling = list_entry(next->prev, struct vmap_area, list);
1174	if (sibling->va_end == va->va_start) {
1175	/*
1176	* If both neighbors are coalesced, it is important
1177	* to unlink the "next" node first, followed by merging
1178	* with "previous" one. Otherwise the tree might not be
1179	* fully populated if a sibling's augmented value is
1180	* "normalized" because of rotation operations.
1181	*/
1182	if (merged)
1183	__unlink_va(va, root, augment);
1184
1185	sibling->va_end = va->va_end;
1186
1187	/ Free vmap_area object. /
1188	kmem_cache_free(s: vmap_area_cachep, objp: va);
1189
1190	/ Point to the new merged area. /
1191	va = sibling;
1192	merged = true;
1193	}
1194	}
1195
1196	insert:
1197	if (!merged)
1198	__link_va(va, root, parent, link, head, augment);
1199
1200	return va;
1201	}
1202
1203	static __always_inline struct vmap_area *
1204	merge_or_add_vmap_area(struct vmap_area *va,
1205	struct rb_root root, struct* list_head *head)
1206	{
1207	return __merge_or_add_vmap_area(va, root, head, augment: false);
1208	}
1209
1210	static __always_inline struct vmap_area *
1211	merge_or_add_vmap_area_augment(struct vmap_area *va,
1212	struct rb_root root, struct* list_head *head)
1213	{
1214	va = __merge_or_add_vmap_area(va, root, head, augment: true);
1215	if (va)
1216	augment_tree_propagate_from(va);
1217
1218	return va;
1219	}
1220
1221	static __always_inline bool
1222	is_within_this_va(struct vmap_area va, unsigned* long size,
1223	unsigned long align, unsigned long vstart)
1224	{
1225	unsigned long nva_start_addr;
1226
1227	if (va->va_start > vstart)
1228	nva_start_addr = ALIGN(va->va_start, align);
1229	else
1230	nva_start_addr = ALIGN(vstart, align);
1231
1232	/ Can be overflowed due to big size or alignment. /
1233	if (nva_start_addr + size < nva_start_addr \|\|
1234	nva_start_addr < vstart)
1235	return false;
1236
1237	return (nva_start_addr + size <= va->va_end);
1238	}
1239
1240	/*
1241	* Find the first free block(lowest start address) in the tree,
1242	* that will accomplish the request corresponding to passing
1243	* parameters. Please note, with an alignment bigger than PAGE_SIZE,
1244	* a search length is adjusted to account for worst case alignment
1245	* overhead.
1246	*/
1247	static __always_inline struct vmap_area *
1248	find_vmap_lowest_match(struct rb_root root, unsigned* long size,
1249	unsigned long align, unsigned long vstart, bool adjust_search_size)
1250	{
1251	struct vmap_area *va;
1252	struct rb_node *node;
1253	unsigned long length;
1254
1255	/ Start from the root. /
1256	node = root->rb_node;
1257
1258	/ Adjust the search size for alignment overhead. /
1259	length = adjust_search_size ? size + align - `1` : size;
1260
1261	while (node) {
1262	va = rb_entry(node, struct vmap_area, rb_node);
1263
1264	if (get_subtree_max_size(node: node->rb_left) >= length &&
1265	vstart < va->va_start) {
1266	node = node->rb_left;
1267	} else {
1268	if (is_within_this_va(va, size, align, vstart))
1269	return va;
1270
1271	/*
1272	* Does not make sense to go deeper towards the right
1273	* sub-tree if it does not have a free block that is
1274	* equal or bigger to the requested search length.
1275	*/
1276	if (get_subtree_max_size(node: node->rb_right) >= length) {
1277	node = node->rb_right;
1278	continue;
1279	}
1280
1281	/*
1282	* OK. We roll back and find the first right sub-tree,
1283	* that will satisfy the search criteria. It can happen
1284	* due to "vstart" restriction or an alignment overhead
1285	* that is bigger then PAGE_SIZE.
1286	*/
1287	while ((node = rb_parent(node))) {
1288	va = rb_entry(node, struct vmap_area, rb_node);
1289	if (is_within_this_va(va, size, align, vstart))
1290	return va;
1291
1292	if (get_subtree_max_size(node: node->rb_right) >= length &&
1293	vstart <= va->va_start) {
1294	/*
1295	* Shift the vstart forward. Please note, we update it with
1296	* parent's start address adding "1" because we do not want
1297	* to enter same sub-tree after it has already been checked
1298	* and no suitable free block found there.
1299	*/
1300	vstart = va->va_start + `1`;
1301	node = node->rb_right;
1302	break;
1303	}
1304	}
1305	}
1306	}
1307
1308	return NULL;
1309	}
1310
1311	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1312	#include <linux/random.h>
1313
1314	static struct vmap_area *
1315	find_vmap_lowest_linear_match(struct list_head head, unsigned* long size,
1316	unsigned long align, unsigned long vstart)
1317	{
1318	struct vmap_area *va;
1319
1320	list_for_each_entry(va, head, list) {
1321	if (!is_within_this_va(va, size, align, vstart))
1322	continue;
1323
1324	return va;
1325	}
1326
1327	return NULL;
1328	}
1329
1330	static void
1331	find_vmap_lowest_match_check(struct rb_root root, struct* list_head *head,
1332	unsigned long size, unsigned long align)
1333	{
1334	struct vmap_area va_1, va_2;
1335	unsigned long vstart;
1336	unsigned int rnd;
1337
1338	get_random_bytes(&rnd, sizeof(rnd));
1339	vstart = VMALLOC_START + rnd;
1340
1341	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
1342	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
1343
1344	if (va_1 != va_2)
1345	pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
1346	va_1, va_2, vstart);
1347	}
1348	#endif
1349
/* How a requested range sits inside the free area chosen for it. */
enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};
1357
1358	static __always_inline enum fit_type
1359	classify_va_fit_type(struct vmap_area *va,
1360	unsigned long nva_start_addr, unsigned long size)
1361	{
1362	enum fit_type type;
1363
1364	/ Check if it is within VA. /
1365	if (nva_start_addr < va->va_start \|\|
1366	nva_start_addr + size > va->va_end)
1367	return NOTHING_FIT;
1368
1369	/ Now classify. /
1370	if (va->va_start == nva_start_addr) {
1371	if (va->va_end == nva_start_addr + size)
1372	type = FL_FIT_TYPE;
1373	else
1374	type = LE_FIT_TYPE;
1375	} else if (va->va_end == nva_start_addr + size) {
1376	type = RE_FIT_TYPE;
1377	} else {
1378	type = NE_FIT_TYPE;
1379	}
1380
1381	return type;
1382	}
1383
1384	static __always_inline int
1385	adjust_va_to_fit_type(struct rb_root root, struct* list_head *head,
1386	struct vmap_area va, unsigned* long nva_start_addr,
1387	unsigned long size)
1388	{
1389	struct vmap_area *lva = NULL;
1390	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
1391
1392	if (type == FL_FIT_TYPE) {
1393	/*
1394	* No need to split VA, it fully fits.
1395	*
1396	* \| \|
1397	* V NVA V
1398	* \|---------------\|
1399	*/
1400	unlink_va_augment(va, root);
1401	kmem_cache_free(s: vmap_area_cachep, objp: va);
1402	} else if (type == LE_FIT_TYPE) {
1403	/*
1404	* Split left edge of fit VA.
1405	*
1406	* \| \|
1407	* V NVA V R
1408	* \|-------\|-------\|
1409	*/
1410	va->va_start += size;
1411	} else if (type == RE_FIT_TYPE) {
1412	/*
1413	* Split right edge of fit VA.
1414	*
1415	* \| \|
1416	* L V NVA V
1417	* \|-------\|-------\|
1418	*/
1419	va->va_end = nva_start_addr;
1420	} else if (type == NE_FIT_TYPE) {
1421	/*
1422	* Split no edge of fit VA.
1423	*
1424	* \| \|
1425	* L V NVA V R
1426	* \|---\|-------\|---\|
1427	*/
1428	lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1429	if (unlikely(!lva)) {
1430	/*
1431	* For percpu allocator we do not do any pre-allocation
1432	* and leave it as it is. The reason is it most likely
1433	* never ends up with NE_FIT_TYPE splitting. In case of
1434	* percpu allocations offsets and sizes are aligned to
1435	* fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1436	* are its main fitting cases.
1437	*
1438	* There are a few exceptions though, as an example it is
1439	* a first allocation (early boot up) when we have "one"
1440	* big free space that has to be split.
1441	*
1442	* Also we can hit this path in case of regular "vmap"
1443	* allocations, if "this" current CPU was not preloaded.
1444	* See the comment in alloc_vmap_area() why. If so, then
1445	* GFP_NOWAIT is used instead to get an extra object for
1446	* split purpose. That is rare and most time does not
1447	* occur.
1448	*
1449	* What happens if an allocation gets failed. Basically,
1450	* an "overflow" path is triggered to purge lazily freed
1451	* areas to free some memory, then, the "retry" path is
1452	* triggered to repeat one more time. See more details
1453	* in alloc_vmap_area() function.
1454	*/
1455	lva = kmem_cache_alloc(cachep: vmap_area_cachep, GFP_NOWAIT);
1456	if (!lva)
1457	return -`1`;
1458	}
1459
1460	/*
1461	* Build the remainder.
1462	*/
1463	lva->va_start = va->va_start;
1464	lva->va_end = nva_start_addr;
1465
1466	/*
1467	* Shrink this VA to remaining size.
1468	*/
1469	va->va_start = nva_start_addr + size;
1470	} else {
1471	return -`1`;
1472	}
1473
1474	if (type != FL_FIT_TYPE) {
1475	augment_tree_propagate_from(va);
1476
1477	if (lva) / type == NE_FIT_TYPE /
1478	insert_vmap_area_augment(va: lva, from: &va->rb_node, root, head);
1479	}
1480
1481	return `0`;
1482	}
1483
1484	/*
1485	* Returns a start address of the newly allocated area, if success.
1486	* Otherwise a vend is returned that indicates failure.
1487	*/
1488	static __always_inline unsigned long
1489	__alloc_vmap_area(struct rb_root root, struct* list_head *head,
1490	unsigned long size, unsigned long align,
1491	unsigned long vstart, unsigned long vend)
1492	{
1493	bool adjust_search_size = true;
1494	unsigned long nva_start_addr;
1495	struct vmap_area *va;
1496	int ret;
1497
1498	/*
1499	* Do not adjust when:
1500	* a) align <= PAGE_SIZE, because it does not make any sense.
1501	* All blocks(their start addresses) are at least PAGE_SIZE
1502	* aligned anyway;
1503	* b) a short range where a requested size corresponds to exactly
1504	* specified [vstart:vend] interval and an alignment > PAGE_SIZE.
1505	* With adjusted search length an allocation would not succeed.
1506	*/
1507	if (align <= PAGE_SIZE \|\| (align > PAGE_SIZE && (vend - vstart) == size))
1508	adjust_search_size = false;
1509
1510	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
1511	if (unlikely(!va))
1512	return vend;
1513
1514	if (va->va_start > vstart)
1515	nva_start_addr = ALIGN(va->va_start, align);
1516	else
1517	nva_start_addr = ALIGN(vstart, align);
1518
1519	/ Check the "vend" restriction. /
1520	if (nva_start_addr + size > vend)
1521	return vend;
1522
1523	/ Update the free vmap_area. /
1524	ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
1525	if (WARN_ON_ONCE(ret))
1526	return vend;
1527
1528	#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1529	find_vmap_lowest_match_check(root, head, size, align);
1530	#endif
1531
1532	return nva_start_addr;
1533	}
1534
1535	/*
1536	* Free a region of KVA allocated by alloc_vmap_area
1537	*/
1538	static void free_vmap_area(struct vmap_area *va)
1539	{
1540	/*
1541	* Remove from the busy tree/list.
1542	*/
1543	spin_lock(lock: &vmap_area_lock);
1544	unlink_va(va, root: &vmap_area_root);
1545	spin_unlock(lock: &vmap_area_lock);
1546
1547	/*
1548	* Insert/Merge it back to the free tree/list.
1549	*/
1550	spin_lock(lock: &free_vmap_area_lock);
1551	merge_or_add_vmap_area_augment(va, root: &free_vmap_area_root, head: &free_vmap_area_list);
1552	spin_unlock(lock: &free_vmap_area_lock);
1553	}
1554
1555	static inline void
1556	preload_this_cpu_lock(spinlock_t lock, gfp_t gfp_mask, int* node)
1557	{
1558	struct vmap_area *va = NULL;
1559
1560	/*
1561	* Preload this CPU with one extra vmap_area object. It is used
1562	* when fit type of free area is NE_FIT_TYPE. It guarantees that
1563	* a CPU that does an allocation is preloaded.
1564	*
1565	* We do it in non-atomic context, thus it allows us to use more
1566	* permissive allocation masks to be more stable under low memory
1567	* condition and high memory pressure.
1568	*/
1569	if (!this_cpu_read(ne_fit_preload_node))
1570	va = kmem_cache_alloc_node(s: vmap_area_cachep, flags: gfp_mask, node);
1571
1572	spin_lock(lock);
1573
1574	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
1575	kmem_cache_free(s: vmap_area_cachep, objp: va);
1576	}
1577
1578	/*
1579	* Allocate a region of KVA of the specified size and alignment, within the
1580	* vstart and vend.
1581	*/
1582	static struct vmap_area alloc_vmap_area(unsigned* long size,
1583	unsigned long align,
1584	unsigned long vstart, unsigned long vend,
1585	int node, gfp_t gfp_mask,
1586	unsigned long va_flags)
1587	{
1588	struct vmap_area *va;
1589	unsigned long freed;
1590	unsigned long addr;
1591	int purged = `0`;
1592	int ret;
1593
1594	if (unlikely(!size \|\| offset_in_page(size) \|\| !is_power_of_2(align)))
1595	return ERR_PTR(error: -EINVAL);
1596
1597	if (unlikely(!vmap_initialized))
1598	return ERR_PTR(error: -EBUSY);
1599
1600	might_sleep();
1601	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
1602
1603	va = kmem_cache_alloc_node(s: vmap_area_cachep, flags: gfp_mask, node);
1604	if (unlikely(!va))
1605	return ERR_PTR(error: -ENOMEM);
1606
1607	/*
1608	* Only scan the relevant parts containing pointers to other objects
1609	* to avoid false negatives.
1610	*/
1611	kmemleak_scan_area(ptr: &va->rb_node, SIZE_MAX, gfp: gfp_mask);
1612
1613	retry:
1614	preload_this_cpu_lock(lock: &free_vmap_area_lock, gfp_mask, node);
1615	addr = __alloc_vmap_area(root: &free_vmap_area_root, head: &free_vmap_area_list,
1616	size, align, vstart, vend);
1617	spin_unlock(lock: &free_vmap_area_lock);
1618
1619	trace_alloc_vmap_area(addr, size, align, vstart, vend, failed: addr == vend);
1620
1621	/*
1622	* If an allocation fails, the "vend" address is
1623	* returned. Therefore trigger the overflow path.
1624	*/
1625	if (unlikely(addr == vend))
1626	goto overflow;
1627
1628	va->va_start = addr;
1629	va->va_end = addr + size;
1630	va->vm = NULL;
1631	va->flags = va_flags;
1632
1633	spin_lock(lock: &vmap_area_lock);
1634	insert_vmap_area(va, root: &vmap_area_root, head: &vmap_area_list);
1635	spin_unlock(lock: &vmap_area_lock);
1636
1637	BUG_ON(!IS_ALIGNED(va->va_start, align));
1638	BUG_ON(va->va_start < vstart);
1639	BUG_ON(va->va_end > vend);
1640
1641	ret = kasan_populate_vmalloc(start: addr, size);
1642	if (ret) {
1643	free_vmap_area(va);
1644	return ERR_PTR(error: ret);
1645	}
1646
1647	return va;
1648
1649	overflow:
1650	if (!purged) {
1651	reclaim_and_purge_vmap_areas();
1652	purged = `1`;
1653	goto retry;
1654	}
1655
1656	freed = `0`;
1657	blocking_notifier_call_chain(nh: &vmap_notify_list, val: `0`, v: &freed);
1658
1659	if (freed > `0`) {
1660	purged = `0`;
1661	goto retry;
1662	}
1663
1664	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
1665	pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
1666	size);
1667
1668	kmem_cache_free(s: vmap_area_cachep, objp: va);
1669	return ERR_PTR(error: -EBUSY);
1670	}
1671
1672	int register_vmap_purge_notifier(struct notifier_block *nb)
1673	{
1674	return blocking_notifier_chain_register(nh: &vmap_notify_list, nb);
1675	}
1676	EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1677
1678	int unregister_vmap_purge_notifier(struct notifier_block *nb)
1679	{
1680	return blocking_notifier_chain_unregister(nh: &vmap_notify_list, nb);
1681	}
1682	EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
1683
1684	/*
1685	* lazy_max_pages is the maximum amount of virtual address space we gather up
1686	* before attempting to purge with a TLB flush.
1687	*
1688	* There is a tradeoff here: a larger number will cover more kernel page tables
1689	* and take slightly longer to purge, but it will linearly reduce the number of
1690	* global TLB flushes that must be performed. It would seem natural to scale
1691	* this number up linearly with the number of CPUs (because vmapping activity
1692	* could also scale linearly with the number of CPUs), however it is likely
1693	* that in practice, workloads might be constrained in other ways that mean
1694	* vmap activity will not scale linearly with CPUs. Also, I want to be
1695	* conservative and not introduce a big latency on huge systems, so go with
1696	* a less aggressive log scale. It will still be an improvement over the old
1697	* code, and it will be simple to change the scale factor if we find that it
1698	* becomes a problem on bigger systems.
1699	*/
1700	static unsigned long lazy_max_pages(void)
1701	{
1702	unsigned int log;
1703
1704	log = fls(x: num_online_cpus());
1705
1706	return log * (`32UL` * `1024` * `1024` / PAGE_SIZE);
1707	}
1708
1709	static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(`0`);
1710
1711	/*
1712	* Serialize vmap purging. There is no actual critical section protected
1713	* by this lock, but we want to avoid concurrent calls for performance
1714	* reasons and to make the pcpu_get_vm_areas more deterministic.
1715	*/
1716	static DEFINE_MUTEX(vmap_purge_lock);
1717
1718	/ for per-CPU blocks /
1719	static void purge_fragmented_blocks_allcpus(void);
1720
1721	/*
1722	* Purges all lazily-freed vmap areas.
1723	*/
1724	static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1725	{
1726	unsigned long resched_threshold;
1727	unsigned int num_purged_areas = `0`;
1728	struct list_head local_purge_list;
1729	struct vmap_area va, n_va;
1730
1731	lockdep_assert_held(&vmap_purge_lock);
1732
1733	spin_lock(lock: &purge_vmap_area_lock);
1734	purge_vmap_area_root = RB_ROOT;
1735	list_replace_init(old: &purge_vmap_area_list, new: &local_purge_list);
1736	spin_unlock(lock: &purge_vmap_area_lock);
1737
1738	if (unlikely(list_empty(&local_purge_list)))
1739	goto out;
1740
1741	start = min(start,
1742	list_first_entry(&local_purge_list,
1743	struct vmap_area, list)->va_start);
1744
1745	end = max(end,
1746	list_last_entry(&local_purge_list,
1747	struct vmap_area, list)->va_end);
1748
1749	flush_tlb_kernel_range(start, end);
1750	resched_threshold = lazy_max_pages() << `1`;
1751
1752	spin_lock(lock: &free_vmap_area_lock);
1753	list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
1754	unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1755	unsigned long orig_start = va->va_start;
1756	unsigned long orig_end = va->va_end;
1757
1758	/*
1759	* Finally insert or merge lazily-freed area. It is
1760	* detached and there is no need to "unlink" it from
1761	* anything.
1762	*/
1763	va = merge_or_add_vmap_area_augment(va, root: &free_vmap_area_root,
1764	head: &free_vmap_area_list);
1765
1766	if (!va)
1767	continue;
1768
1769	if (is_vmalloc_or_module_addr((void *)orig_start))
1770	kasan_release_vmalloc(start: orig_start, end: orig_end,
1771	free_region_start: va->va_start, free_region_end: va->va_end);
1772
1773	atomic_long_sub(i: nr, v: &vmap_lazy_nr);
1774	num_purged_areas++;
1775
1776	if (atomic_long_read(v: &vmap_lazy_nr) < resched_threshold)
1777	cond_resched_lock(&free_vmap_area_lock);
1778	}
1779	spin_unlock(lock: &free_vmap_area_lock);
1780
1781	out:
1782	trace_purge_vmap_area_lazy(start, end, npurged: num_purged_areas);
1783	return num_purged_areas > `0`;
1784	}
1785
1786	/*
1787	* Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list.
1788	*/
1789	static void reclaim_and_purge_vmap_areas(void)
1790
1791	{
1792	mutex_lock(&vmap_purge_lock);
1793	purge_fragmented_blocks_allcpus();
1794	__purge_vmap_area_lazy(ULONG_MAX, end: `0`);
1795	mutex_unlock(lock: &vmap_purge_lock);
1796	}
1797
1798	static void drain_vmap_area_work(struct work_struct *work)
1799	{
1800	unsigned long nr_lazy;
1801
1802	do {
1803	mutex_lock(&vmap_purge_lock);
1804	__purge_vmap_area_lazy(ULONG_MAX, end: `0`);
1805	mutex_unlock(lock: &vmap_purge_lock);
1806
1807	/ Recheck if further work is required. /
1808	nr_lazy = atomic_long_read(v: &vmap_lazy_nr);
1809	} while (nr_lazy > lazy_max_pages());
1810	}
1811
1812	/*
1813	* Free a vmap area, caller ensuring that the area has been unmapped,
1814	* unlinked and flush_cache_vunmap had been called for the correct
1815	* range previously.
1816	*/
1817	static void free_vmap_area_noflush(struct vmap_area *va)
1818	{
1819	unsigned long nr_lazy_max = lazy_max_pages();
1820	unsigned long va_start = va->va_start;
1821	unsigned long nr_lazy;
1822
1823	if (WARN_ON_ONCE(!list_empty(&va->list)))
1824	return;
1825
1826	nr_lazy = atomic_long_add_return(i: (va->va_end - va->va_start) >>
1827	PAGE_SHIFT, v: &vmap_lazy_nr);
1828
1829	/*
1830	* Merge or place it to the purge tree/list.
1831	*/
1832	spin_lock(lock: &purge_vmap_area_lock);
1833	merge_or_add_vmap_area(va,
1834	root: &purge_vmap_area_root, head: &purge_vmap_area_list);
1835	spin_unlock(lock: &purge_vmap_area_lock);
1836
1837	trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);
1838
1839	/ After this point, we may free va at any time /
1840	if (unlikely(nr_lazy > nr_lazy_max))
1841	schedule_work(work: &drain_vmap_work);
1842	}
1843
1844	/*
1845	* Free and unmap a vmap area
1846	*/
1847	static void free_unmap_vmap_area(struct vmap_area *va)
1848	{
1849	flush_cache_vunmap(start: va->va_start, end: va->va_end);
1850	vunmap_range_noflush(start: va->va_start, end: va->va_end);
1851	if (debug_pagealloc_enabled_static())
1852	flush_tlb_kernel_range(start: va->va_start, end: va->va_end);
1853
1854	free_vmap_area_noflush(va);
1855	}
1856
1857	struct vmap_area find_vmap_area(unsigned* long addr)
1858	{
1859	struct vmap_area *va;
1860
1861	spin_lock(lock: &vmap_area_lock);
1862	va = __find_vmap_area(addr, root: &vmap_area_root);
1863	spin_unlock(lock: &vmap_area_lock);
1864
1865	return va;
1866	}
1867
1868	static struct vmap_area find_unlink_vmap_area(unsigned* long addr)
1869	{
1870	struct vmap_area *va;
1871
1872	spin_lock(lock: &vmap_area_lock);
1873	va = __find_vmap_area(addr, root: &vmap_area_root);
1874	if (va)
1875	unlink_va(va, root: &vmap_area_root);
1876	spin_unlock(lock: &vmap_area_lock);
1877
1878	return va;
1879	}
1880
/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD	(VMAP_BBMAP_BITS / 4)

#define VMAP_RAM		0x1 /* indicates vm_map_ram area*/
#define VMAP_BLOCK		0x2 /* mark out the vmap_block sub-type*/
#define VMAP_FLAGS_MASK		0x3
1920
1921	struct vmap_block_queue {
1922	spinlock_t lock;
1923	struct list_head free;
1924
1925	/*
1926	* An xarray requires an extra memory dynamically to
1927	* be allocated. If it is an issue, we can use rb-tree
1928	* instead.
1929	*/
1930	struct xarray vmap_blocks;
1931	};
1932
1933	struct vmap_block {
1934	spinlock_t lock;
1935	struct vmap_area *va;
1936	unsigned long free, dirty;
1937	DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
1938	unsigned long dirty_min, dirty_max; /< dirty range /
1939	struct list_head free_list;
1940	struct rcu_head rcu_head;
1941	struct list_head purge;
1942	};
1943
1944	/ Queue of free and dirty vmap blocks, for allocation and flushing purposes /
1945	static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1946
1947	/*
1948	* In order to fast access to any "vmap_block" associated with a
1949	* specific address, we use a hash.
1950	*
1951	* A per-cpu vmap_block_queue is used in both ways, to serialize
1952	* an access to free block chains among CPUs(alloc path) and it
1953	* also acts as a vmap_block hash(alloc/free paths). It means we
1954	* overload it, since we already have the per-cpu array which is
1955	* used as a hash table. When used as a hash a 'cpu' passed to
1956	* per_cpu() is not actually a CPU but rather a hash index.
1957	*
1958	* A hash function is addr_to_vb_xa() which hashes any address
1959	* to a specific index(in a hash) it belongs to. This then uses a
1960	* per_cpu() macro to access an array with generated index.
1961	*
1962	* An example:
1963	*
1964	* CPU_1 CPU_2 CPU_0
1965	* \| \| \|
1966	* V V V
1967	* 0 10 20 30 40 50 60
1968	* \|------\|------\|------\|------\|------\|------\|...<vmap address space>
1969	* CPU0 CPU1 CPU2 CPU0 CPU1 CPU2
1970	*
1971	* - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus
1972	* it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
1973	*
1974	* - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus
1975	* it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
1976	*
1977	* - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus
1978	* it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
1979	*
1980	* This technique almost always avoids lock contention on insert/remove,
1981	* however xarray spinlocks protect against any contention that remains.
1982	*/
1983	static struct xarray *
1984	addr_to_vb_xa(unsigned long addr)
1985	{
1986	int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
1987
1988	return &per_cpu(vmap_block_queue, index).vmap_blocks;
1989	}
1990
1991	/*
1992	* We should probably have a fallback mechanism to allocate virtual memory
1993	* out of partially filled vmap blocks. However vmap block sizing should be
1994	* fairly reasonable according to the vmalloc size, so it shouldn't be a
1995	* big problem.
1996	*/
1997
1998	static unsigned long addr_to_vb_idx(unsigned long addr)
1999	{
2000	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-`1`);
2001	addr /= VMAP_BLOCK_SIZE;
2002	return addr;
2003	}
2004
2005	static void vmap_block_vaddr(unsigned* long va_start, unsigned long pages_off)
2006	{
2007	unsigned long addr;
2008
2009	addr = va_start + (pages_off << PAGE_SHIFT);
2010	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
2011	return (void *)addr;
2012	}
2013
2014	/**
2015	* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
2016	* block. Of course pages number can't exceed VMAP_BBMAP_BITS
2017	* @order: how many 2^order pages should be occupied in newly allocated block
2018	* @gfp_mask: flags for the page level allocator
2019	*
2020	* Return: virtual address in a newly allocated block or ERR_PTR(-errno)
2021	*/
2022	static void new_vmap_block(unsigned* int order, gfp_t gfp_mask)
2023	{
2024	struct vmap_block_queue *vbq;
2025	struct vmap_block *vb;
2026	struct vmap_area *va;
2027	struct xarray *xa;
2028	unsigned long vb_idx;
2029	int node, err;
2030	void *vaddr;
2031
2032	node = numa_node_id();
2033
2034	vb = kmalloc_node(size: sizeof(struct vmap_block),
2035	flags: gfp_mask & GFP_RECLAIM_MASK, node);
2036	if (unlikely(!vb))
2037	return ERR_PTR(error: -ENOMEM);
2038
2039	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
2040	VMALLOC_START, VMALLOC_END,
2041	node, gfp_mask,
2042	VMAP_RAM\|VMAP_BLOCK);
2043	if (IS_ERR(ptr: va)) {
2044	kfree(objp: vb);
2045	return ERR_CAST(ptr: va);
2046	}
2047
2048	vaddr = vmap_block_vaddr(va_start: va->va_start, pages_off: `0`);
2049	spin_lock_init(&vb->lock);
2050	vb->va = va;
2051	/ At least something should be left free /
2052	BUG_ON(VMAP_BBMAP_BITS <= (`1UL` << order));
2053	bitmap_zero(dst: vb->used_map, VMAP_BBMAP_BITS);
2054	vb->free = VMAP_BBMAP_BITS - (`1UL` << order);
2055	vb->dirty = `0`;
2056	vb->dirty_min = VMAP_BBMAP_BITS;
2057	vb->dirty_max = `0`;
2058	bitmap_set(map: vb->used_map, start: `0`, nbits: (`1UL` << order));
2059	INIT_LIST_HEAD(list: &vb->free_list);
2060
2061	xa = addr_to_vb_xa(addr: va->va_start);
2062	vb_idx = addr_to_vb_idx(addr: va->va_start);
2063	err = xa_insert(xa, index: vb_idx, entry: vb, gfp: gfp_mask);
2064	if (err) {
2065	kfree(objp: vb);
2066	free_vmap_area(va);
2067	return ERR_PTR(error: err);
2068	}
2069
2070	vbq = raw_cpu_ptr(&vmap_block_queue);
2071	spin_lock(lock: &vbq->lock);
2072	list_add_tail_rcu(new: &vb->free_list, head: &vbq->free);
2073	spin_unlock(lock: &vbq->lock);
2074
2075	return vaddr;
2076	}
2077
2078	static void free_vmap_block(struct vmap_block *vb)
2079	{
2080	struct vmap_block *tmp;
2081	struct xarray *xa;
2082
2083	xa = addr_to_vb_xa(addr: vb->va->va_start);
2084	tmp = xa_erase(xa, index: addr_to_vb_idx(addr: vb->va->va_start));
2085	BUG_ON(tmp != vb);
2086
2087	spin_lock(lock: &vmap_area_lock);
2088	unlink_va(va: vb->va, root: &vmap_area_root);
2089	spin_unlock(lock: &vmap_area_lock);
2090
2091	free_vmap_area_noflush(va: vb->va);
2092	kfree_rcu(vb, rcu_head);
2093	}
2094
2095	static bool purge_fragmented_block(struct vmap_block *vb,
2096	struct vmap_block_queue vbq, struct* list_head *purge_list,
2097	bool force_purge)
2098	{
2099	if (vb->free + vb->dirty != VMAP_BBMAP_BITS \|\|
2100	vb->dirty == VMAP_BBMAP_BITS)
2101	return false;
2102
2103	/ Don't overeagerly purge usable blocks unless requested /
2104	if (!(force_purge \|\| vb->free < VMAP_PURGE_THRESHOLD))
2105	return false;
2106
2107	/ prevent further allocs after releasing lock /
2108	WRITE_ONCE(vb->free, `0`);
2109	/ prevent purging it again /
2110	WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
2111	vb->dirty_min = `0`;
2112	vb->dirty_max = VMAP_BBMAP_BITS;
2113	spin_lock(lock: &vbq->lock);
2114	list_del_rcu(entry: &vb->free_list);
2115	spin_unlock(lock: &vbq->lock);
2116	list_add_tail(new: &vb->purge, head: purge_list);
2117	return true;
2118	}
2119
2120	static void free_purged_blocks(struct list_head *purge_list)
2121	{
2122	struct vmap_block vb, n_vb;
2123
2124	list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
2125	list_del(entry: &vb->purge);
2126	free_vmap_block(vb);
2127	}
2128	}
2129
2130	static void purge_fragmented_blocks(int cpu)
2131	{
2132	LIST_HEAD(purge);
2133	struct vmap_block *vb;
2134	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2135
2136	rcu_read_lock();
2137	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2138	unsigned long free = READ_ONCE(vb->free);
2139	unsigned long dirty = READ_ONCE(vb->dirty);
2140
2141	if (free + dirty != VMAP_BBMAP_BITS \|\|
2142	dirty == VMAP_BBMAP_BITS)
2143	continue;
2144
2145	spin_lock(lock: &vb->lock);
2146	purge_fragmented_block(vb, vbq, purge_list: &purge, force_purge: true);
2147	spin_unlock(lock: &vb->lock);
2148	}
2149	rcu_read_unlock();
2150	free_purged_blocks(purge_list: &purge);
2151	}
2152
2153	static void purge_fragmented_blocks_allcpus(void)
2154	{
2155	int cpu;
2156
2157	for_each_possible_cpu(cpu)
2158	purge_fragmented_blocks(cpu);
2159	}
2160
2161	static void vb_alloc(unsigned* long size, gfp_t gfp_mask)
2162	{
2163	struct vmap_block_queue *vbq;
2164	struct vmap_block *vb;
2165	void *vaddr = NULL;
2166	unsigned int order;
2167
2168	BUG_ON(offset_in_page(size));
2169	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2170	if (WARN_ON(size == `0`)) {
2171	/*
2172	* Allocating 0 bytes isn't what caller wants since
2173	* get_order(0) returns funny result. Just warn and terminate
2174	* early.
2175	*/
2176	return NULL;
2177	}
2178	order = get_order(size);
2179
2180	rcu_read_lock();
2181	vbq = raw_cpu_ptr(&vmap_block_queue);
2182	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
2183	unsigned long pages_off;
2184
2185	if (READ_ONCE(vb->free) < (`1UL` << order))
2186	continue;
2187
2188	spin_lock(lock: &vb->lock);
2189	if (vb->free < (`1UL` << order)) {
2190	spin_unlock(lock: &vb->lock);
2191	continue;
2192	}
2193
2194	pages_off = VMAP_BBMAP_BITS - vb->free;
2195	vaddr = vmap_block_vaddr(va_start: vb->va->va_start, pages_off);
2196	WRITE_ONCE(vb->free, vb->free - (`1UL` << order));
2197	bitmap_set(map: vb->used_map, start: pages_off, nbits: (`1UL` << order));
2198	if (vb->free == `0`) {
2199	spin_lock(lock: &vbq->lock);
2200	list_del_rcu(entry: &vb->free_list);
2201	spin_unlock(lock: &vbq->lock);
2202	}
2203
2204	spin_unlock(lock: &vb->lock);
2205	break;
2206	}
2207
2208	rcu_read_unlock();
2209
2210	/ Allocate new block if nothing was found /
2211	if (!vaddr)
2212	vaddr = new_vmap_block(order, gfp_mask);
2213
2214	return vaddr;
2215	}
2216
2217	static void vb_free(unsigned long addr, unsigned long size)
2218	{
2219	unsigned long offset;
2220	unsigned int order;
2221	struct vmap_block *vb;
2222	struct xarray *xa;
2223
2224	BUG_ON(offset_in_page(size));
2225	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
2226
2227	flush_cache_vunmap(start: addr, end: addr + size);
2228
2229	order = get_order(size);
2230	offset = (addr & (VMAP_BLOCK_SIZE - `1`)) >> PAGE_SHIFT;
2231
2232	xa = addr_to_vb_xa(addr);
2233	vb = xa_load(xa, index: addr_to_vb_idx(addr));
2234
2235	spin_lock(lock: &vb->lock);
2236	bitmap_clear(map: vb->used_map, start: offset, nbits: (`1UL` << order));
2237	spin_unlock(lock: &vb->lock);
2238
2239	vunmap_range_noflush(start: addr, end: addr + size);
2240
2241	if (debug_pagealloc_enabled_static())
2242	flush_tlb_kernel_range(start: addr, end: addr + size);
2243
2244	spin_lock(lock: &vb->lock);
2245
2246	/ Expand the not yet TLB flushed dirty range /
2247	vb->dirty_min = min(vb->dirty_min, offset);
2248	vb->dirty_max = max(vb->dirty_max, offset + (`1UL` << order));
2249
2250	WRITE_ONCE(vb->dirty, vb->dirty + (`1UL` << order));
2251	if (vb->dirty == VMAP_BBMAP_BITS) {
2252	BUG_ON(vb->free);
2253	spin_unlock(lock: &vb->lock);
2254	free_vmap_block(vb);
2255	} else
2256	spin_unlock(lock: &vb->lock);
2257	}
2258
2259	static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
2260	{
2261	LIST_HEAD(purge_list);
2262	int cpu;
2263
2264	if (unlikely(!vmap_initialized))
2265	return;
2266
2267	mutex_lock(&vmap_purge_lock);
2268
2269	for_each_possible_cpu(cpu) {
2270	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2271	struct vmap_block *vb;
2272	unsigned long idx;
2273
2274	rcu_read_lock();
2275	xa_for_each(&vbq->vmap_blocks, idx, vb) {
2276	spin_lock(lock: &vb->lock);
2277
2278	/*
2279	* Try to purge a fragmented block first. If it's
2280	* not purgeable, check whether there is dirty
2281	* space to be flushed.
2282	*/
2283	if (!purge_fragmented_block(vb, vbq, purge_list: &purge_list, force_purge: false) &&
2284	vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
2285	unsigned long va_start = vb->va->va_start;
2286	unsigned long s, e;
2287
2288	s = va_start + (vb->dirty_min << PAGE_SHIFT);
2289	e = va_start + (vb->dirty_max << PAGE_SHIFT);
2290
2291	start = min(s, start);
2292	end = max(e, end);
2293
2294	/ Prevent that this is flushed again /
2295	vb->dirty_min = VMAP_BBMAP_BITS;
2296	vb->dirty_max = `0`;
2297
2298	flush = `1`;
2299	}
2300	spin_unlock(lock: &vb->lock);
2301	}
2302	rcu_read_unlock();
2303	}
2304	free_purged_blocks(purge_list: &purge_list);
2305
2306	if (!__purge_vmap_area_lazy(start, end) && flush)
2307	flush_tlb_kernel_range(start, end);
2308	mutex_unlock(lock: &vmap_purge_lock);
2309	}
2310
2311	/**
2312	* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2313	*
2314	* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
2315	* to amortize TLB flushing overheads. What this means is that any page you
2316	* have now, may, in a former life, have been mapped into kernel virtual
2317	* address by the vmap layer and so there might be some CPUs with TLB entries
2318	* still referencing that page (additional to the regular 1:1 kernel mapping).
2319	*
2320	* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2321	* be sure that none of the pages we have control over will have any aliases
2322	* from the vmap layer.
2323	*/
2324	void vm_unmap_aliases(void)
2325	{
2326	unsigned long start = ULONG_MAX, end = `0`;
2327	int flush = `0`;
2328
2329	_vm_unmap_aliases(start, end, flush);
2330	}
2331	EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2332
2333	/**
2334	* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2335	* @mem: the pointer returned by vm_map_ram
2336	* @count: the count passed to that vm_map_ram call (cannot unmap partial)
2337	*/
2338	void vm_unmap_ram(const void mem, unsigned* int count)
2339	{
2340	unsigned long size = (unsigned long)count << PAGE_SHIFT;
2341	unsigned long addr = (unsigned long)kasan_reset_tag(addr: mem);
2342	struct vmap_area *va;
2343
2344	might_sleep();
2345	BUG_ON(!addr);
2346	BUG_ON(addr < VMALLOC_START);
2347	BUG_ON(addr > VMALLOC_END);
2348	BUG_ON(!PAGE_ALIGNED(addr));
2349
2350	kasan_poison_vmalloc(start: mem, size);
2351
2352	if (likely(count <= VMAP_MAX_ALLOC)) {
2353	debug_check_no_locks_freed(from: mem, len: size);
2354	vb_free(addr, size);
2355	return;
2356	}
2357
2358	va = find_unlink_vmap_area(addr);
2359	if (WARN_ON_ONCE(!va))
2360	return;
2361
2362	debug_check_no_locks_freed(from: (void *)va->va_start,
2363	len: (va->va_end - va->va_start));
2364	free_unmap_vmap_area(va);
2365	}
2366	EXPORT_SYMBOL(vm_unmap_ram);
2367
2368	/**
2369	* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
2370	* @pages: an array of pointers to the pages to be mapped
2371	* @count: number of pages
2372	* @node: prefer to allocate data structures on this node
2373	*
2374	* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
2375	* faster than vmap so it's good. But if you mix long-life and short-life
2376	* objects with vm_map_ram(), it could consume lots of address space through
2377	* fragmentation (especially on a 32bit machine). You could see failures in
2378	* the end. Please use this function for short-lived objects.
2379	*
2380	* Returns: a pointer to the address that has been mapped, or %NULL on failure
2381	*/
2382	void vm_map_ram(struct* page *pages, unsigned* int count, int node)
2383	{
2384	unsigned long size = (unsigned long)count << PAGE_SHIFT;
2385	unsigned long addr;
2386	void *mem;
2387
2388	if (likely(count <= VMAP_MAX_ALLOC)) {
2389	mem = vb_alloc(size, GFP_KERNEL);
2390	if (IS_ERR(ptr: mem))
2391	return NULL;
2392	addr = (unsigned long)mem;
2393	} else {
2394	struct vmap_area *va;
2395	va = alloc_vmap_area(size, PAGE_SIZE,
2396	VMALLOC_START, VMALLOC_END,
2397	node, GFP_KERNEL, VMAP_RAM);
2398	if (IS_ERR(ptr: va))
2399	return NULL;
2400
2401	addr = va->va_start;
2402	mem = (void *)addr;
2403	}
2404
2405	if (vmap_pages_range(addr, end: addr + size, PAGE_KERNEL,
2406	pages, PAGE_SHIFT) < `0`) {
2407	vm_unmap_ram(mem, count);
2408	return NULL;
2409	}
2410
2411	/*
2412	* Mark the pages as accessible, now that they are mapped.
2413	* With hardware tag-based KASAN, marking is skipped for
2414	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2415	*/
2416	mem = kasan_unpoison_vmalloc(start: mem, size, KASAN_VMALLOC_PROT_NORMAL);
2417
2418	return mem;
2419	}
2420	EXPORT_SYMBOL(vm_map_ram);
2421
2422	static struct vm_struct *vmlist __initdata;
2423
/*
 * Page order used to back @vm.  Always 0 unless the kernel is built with
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC, in which case the order stored in the
 * area is returned.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	return vm->page_order;
#else
	return 0;
#endif
}
2432
/*
 * Record the backing page order of @vm.  Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC there is nowhere to store it, so any
 * non-zero @order is a bug.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
	vm->page_order = order;
#else
	BUG_ON(order != 0);
#endif
}
2441
2442	/**
2443	* vm_area_add_early - add vmap area early during boot
2444	* @vm: vm_struct to add
2445	*
2446	* This function is used to add fixed kernel vm area to vmlist before
2447	* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
2448	* should contain proper values and the other fields should be zero.
2449	*
2450	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2451	*/
2452	void __init vm_area_add_early(struct vm_struct *vm)
2453	{
2454	struct vm_struct tmp, *p;
2455
2456	BUG_ON(vmap_initialized);
2457	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
2458	if (tmp->addr >= vm->addr) {
2459	BUG_ON(tmp->addr < vm->addr + vm->size);
2460	break;
2461	} else
2462	BUG_ON(tmp->addr + tmp->size > vm->addr);
2463	}
2464	vm->next = *p;
2465	*p = vm;
2466	}
2467
2468	/**
2469	* vm_area_register_early - register vmap area early during boot
2470	* @vm: vm_struct to register
2471	* @align: requested alignment
2472	*
2473	* This function is used to register kernel vm area before
2474	* vmalloc_init() is called. @vm->size and @vm->flags should contain
2475	* proper values on entry and other fields should be zero. On return,
2476	* vm->addr contains the allocated address.
2477	*
2478	* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2479	*/
2480	void __init vm_area_register_early(struct vm_struct *vm, size_t align)
2481	{
2482	unsigned long addr = ALIGN(VMALLOC_START, align);
2483	struct vm_struct cur, *p;
2484
2485	BUG_ON(vmap_initialized);
2486
2487	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
2488	if ((unsigned long)cur->addr - addr >= vm->size)
2489	break;
2490	addr = ALIGN((unsigned long)cur->addr + cur->size, align);
2491	}
2492
2493	BUG_ON(addr > VMALLOC_END - vm->size);
2494	vm->addr = (void *)addr;
2495	vm->next = *p;
2496	*p = vm;
2497	kasan_populate_early_vm_area_shadow(start: vm->addr, size: vm->size);
2498	}
2499
2500	static void vmap_init_free_space(void)
2501	{
2502	unsigned long vmap_start = `1`;
2503	const unsigned long vmap_end = ULONG_MAX;
2504	struct vmap_area busy, free;
2505
2506	/*
2507	* B F B B B F
2508	* -\|-----\|.....\|-----\|-----\|-----\|.....\|-
2509	* \| The KVA space \|
2510	* \|<--------------------------------->\|
2511	*/
2512	list_for_each_entry(busy, &vmap_area_list, list) {
2513	if (busy->va_start - vmap_start > `0`) {
2514	free = kmem_cache_zalloc(k: vmap_area_cachep, GFP_NOWAIT);
2515	if (!WARN_ON_ONCE(!free)) {
2516	free->va_start = vmap_start;
2517	free->va_end = busy->va_start;
2518
2519	insert_vmap_area_augment(va: free, NULL,
2520	root: &free_vmap_area_root,
2521	head: &free_vmap_area_list);
2522	}
2523	}
2524
2525	vmap_start = busy->va_end;
2526	}
2527
2528	if (vmap_end - vmap_start > `0`) {
2529	free = kmem_cache_zalloc(k: vmap_area_cachep, GFP_NOWAIT);
2530	if (!WARN_ON_ONCE(!free)) {
2531	free->va_start = vmap_start;
2532	free->va_end = vmap_end;
2533
2534	insert_vmap_area_augment(va: free, NULL,
2535	root: &free_vmap_area_root,
2536	head: &free_vmap_area_list);
2537	}
2538	}
2539	}
2540
2541	static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2542	struct vmap_area va, unsigned* long flags, const void *caller)
2543	{
2544	vm->flags = flags;
2545	vm->addr = (void *)va->va_start;
2546	vm->size = va->va_end - va->va_start;
2547	vm->caller = caller;
2548	va->vm = vm;
2549	}
2550
2551	static void setup_vmalloc_vm(struct vm_struct vm, struct* vmap_area *va,
2552	unsigned long flags, const void *caller)
2553	{
2554	spin_lock(lock: &vmap_area_lock);
2555	setup_vmalloc_vm_locked(vm, va, flags, caller);
2556	spin_unlock(lock: &vmap_area_lock);
2557	}
2558
2559	static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2560	{
2561	/*
2562	* Before removing VM_UNINITIALIZED,
2563	* we should make sure that vm has proper values.
2564	* Pair with smp_rmb() in show_numa_info().
2565	*/
2566	smp_wmb();
2567	vm->flags &= ~VM_UNINITIALIZED;
2568	}
2569
2570	static struct vm_struct __get_vm_area_node(unsigned* long size,
2571	unsigned long align, unsigned long shift, unsigned long flags,
2572	unsigned long start, unsigned long end, int node,
2573	gfp_t gfp_mask, const void *caller)
2574	{
2575	struct vmap_area *va;
2576	struct vm_struct *area;
2577	unsigned long requested_size = size;
2578
2579	BUG_ON(in_interrupt());
2580	size = ALIGN(size, `1ul` << shift);
2581	if (unlikely(!size))
2582	return NULL;
2583
2584	if (flags & VM_IOREMAP)
2585	align = `1ul` << clamp_t(int, get_count_order_long(size),
2586	PAGE_SHIFT, IOREMAP_MAX_ORDER);
2587
2588	area = kzalloc_node(size: sizeof(*area), flags: gfp_mask & GFP_RECLAIM_MASK, node);
2589	if (unlikely(!area))
2590	return NULL;
2591
2592	if (!(flags & VM_NO_GUARD))
2593	size += PAGE_SIZE;
2594
2595	va = alloc_vmap_area(size, align, vstart: start, vend: end, node, gfp_mask, va_flags: `0`);
2596	if (IS_ERR(ptr: va)) {
2597	kfree(objp: area);
2598	return NULL;
2599	}
2600
2601	setup_vmalloc_vm(vm: area, va, flags, caller);
2602
2603	/*
2604	* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
2605	* best-effort approach, as they can be mapped outside of vmalloc code.
2606	* For VM_ALLOC mappings, the pages are marked as accessible after
2607	* getting mapped in __vmalloc_node_range().
2608	* With hardware tag-based KASAN, marking is skipped for
2609	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
2610	*/
2611	if (!(flags & VM_ALLOC))
2612	area->addr = kasan_unpoison_vmalloc(start: area->addr, size: requested_size,
2613	KASAN_VMALLOC_PROT_NORMAL);
2614
2615	return area;
2616	}
2617
2618	struct vm_struct __get_vm_area_caller(unsigned* long size, unsigned long flags,
2619	unsigned long start, unsigned long end,
2620	const void *caller)
2621	{
2622	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags, start, end,
2623	NUMA_NO_NODE, GFP_KERNEL, caller);
2624	}
2625
2626	/**
2627	* get_vm_area - reserve a contiguous kernel virtual area
2628	* @size: size of the area
2629	* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
2630	*
2631	* Search an area of @size in the kernel virtual mapping area,
2632	* and reserved it for out purposes. Returns the area descriptor
2633	* on success or %NULL on failure.
2634	*
2635	* Return: the area descriptor on success or %NULL on failure.
2636	*/
2637	struct vm_struct get_vm_area(unsigned* long size, unsigned long flags)
2638	{
2639	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags,
2640	VMALLOC_START, VMALLOC_END,
2641	NUMA_NO_NODE, GFP_KERNEL,
2642	caller: __builtin_return_address(`0`));
2643	}
2644
2645	struct vm_struct get_vm_area_caller(unsigned* long size, unsigned long flags,
2646	const void *caller)
2647	{
2648	return __get_vm_area_node(size, align: `1`, PAGE_SHIFT, flags,
2649	VMALLOC_START, VMALLOC_END,
2650	NUMA_NO_NODE, GFP_KERNEL, caller);
2651	}
2652
2653	/**
2654	* find_vm_area - find a continuous kernel virtual area
2655	* @addr: base address
2656	*
2657	* Search for the kernel VM area starting at @addr, and return it.
2658	* It is up to the caller to do all required locking to keep the returned
2659	* pointer valid.
2660	*
2661	* Return: the area descriptor on success or %NULL on failure.
2662	*/
2663	struct vm_struct find_vm_area(const* void *addr)
2664	{
2665	struct vmap_area *va;
2666
2667	va = find_vmap_area(addr: (unsigned long)addr);
2668	if (!va)
2669	return NULL;
2670
2671	return va->vm;
2672	}
2673
2674	/**
2675	* remove_vm_area - find and remove a continuous kernel virtual area
2676	* @addr: base address
2677	*
2678	* Search for the kernel VM area starting at @addr, and remove it.
2679	* This function returns the found VM area, but using it is NOT safe
2680	* on SMP machines, except for its size or flags.
2681	*
2682	* Return: the area descriptor on success or %NULL on failure.
2683	*/
2684	struct vm_struct remove_vm_area(const* void *addr)
2685	{
2686	struct vmap_area *va;
2687	struct vm_struct *vm;
2688
2689	might_sleep();
2690
2691	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2692	addr))
2693	return NULL;
2694
2695	va = find_unlink_vmap_area(addr: (unsigned long)addr);
2696	if (!va \|\| !va->vm)
2697	return NULL;
2698	vm = va->vm;
2699
2700	debug_check_no_locks_freed(from: vm->addr, len: get_vm_area_size(area: vm));
2701	debug_check_no_obj_freed(address: vm->addr, size: get_vm_area_size(area: vm));
2702	kasan_free_module_shadow(vm);
2703	kasan_poison_vmalloc(start: vm->addr, size: get_vm_area_size(area: vm));
2704
2705	free_unmap_vmap_area(va);
2706	return vm;
2707	}
2708
2709	static inline void set_area_direct_map(const struct vm_struct *area,
2710	int (set_direct_map)(struct* page *page))
2711	{
2712	int i;
2713
2714	/ HUGE_VMALLOC passes small pages to set_direct_map /
2715	for (i = `0`; i < area->nr_pages; i++)
2716	if (page_address(area->pages[i]))
2717	set_direct_map(area->pages[i]);
2718	}
2719
2720	/*
2721	* Flush the vm mapping and reset the direct map.
2722	*/
2723	static void vm_reset_perms(struct vm_struct *area)
2724	{
2725	unsigned long start = ULONG_MAX, end = `0`;
2726	unsigned int page_order = vm_area_page_order(vm: area);
2727	int flush_dmap = `0`;
2728	int i;
2729
2730	/*
2731	* Find the start and end range of the direct mappings to make sure that
2732	* the vm_unmap_aliases() flush includes the direct map.
2733	*/
2734	for (i = `0`; i < area->nr_pages; i += `1U` << page_order) {
2735	unsigned long addr = (unsigned long)page_address(area->pages[i]);
2736
2737	if (addr) {
2738	unsigned long page_size;
2739
2740	page_size = PAGE_SIZE << page_order;
2741	start = min(addr, start);
2742	end = max(addr + page_size, end);
2743	flush_dmap = `1`;
2744	}
2745	}
2746
2747	/*
2748	* Set direct map to something invalid so that it won't be cached if
2749	* there are any accesses after the TLB flush, then flush the TLB and
2750	* reset the direct map permissions to the default.
2751	*/
2752	set_area_direct_map(area, set_direct_map: set_direct_map_invalid_noflush);
2753	_vm_unmap_aliases(start, end, flush: flush_dmap);
2754	set_area_direct_map(area, set_direct_map: set_direct_map_default_noflush);
2755	}
2756
2757	static void delayed_vfree_work(struct work_struct *w)
2758	{
2759	struct vfree_deferred p = container_of(w, struct* vfree_deferred, wq);
2760	struct llist_node t, llnode;
2761
2762	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
2763	vfree(addr: llnode);
2764	}
2765
2766	/**
2767	* vfree_atomic - release memory allocated by vmalloc()
2768	* @addr: memory base address
2769	*
2770	* This one is just like vfree() but can be called in any atomic context
2771	* except NMIs.
2772	*/
2773	void vfree_atomic(const void *addr)
2774	{
2775	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2776
2777	BUG_ON(in_nmi());
2778	kmemleak_free(ptr: addr);
2779
2780	/*
2781	* Use raw_cpu_ptr() because this can be called from preemptible
2782	* context. Preemption is absolutely fine here, because the llist_add()
2783	* implementation is lockless, so it works even if we are adding to
2784	* another cpu's list. schedule_work() should be fine with this too.
2785	*/
2786	if (addr && llist_add(new: (struct llist_node *)addr, head: &p->list))
2787	schedule_work(work: &p->wq);
2788	}
2789
2790	/**
2791	* vfree - Release memory allocated by vmalloc()
2792	* @addr: Memory base address
2793	*
2794	* Free the virtually continuous memory area starting at @addr, as obtained
2795	* from one of the vmalloc() family of APIs. This will usually also free the
2796	* physical memory underlying the virtual allocation, but that memory is
2797	* reference counted, so it will not be freed until the last user goes away.
2798	*
2799	* If @addr is NULL, no operation is performed.
2800	*
2801	* Context:
2802	* May sleep if called not from interrupt context.
2803	* Must not be called in NMI context (strictly speaking, it could be
2804	* if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
2805	* conventions for vfree() arch-dependent would be a really bad idea).
2806	*/
2807	void vfree(const void *addr)
2808	{
2809	struct vm_struct *vm;
2810	int i;
2811
2812	if (unlikely(in_interrupt())) {
2813	vfree_atomic(addr);
2814	return;
2815	}
2816
2817	BUG_ON(in_nmi());
2818	kmemleak_free(ptr: addr);
2819	might_sleep();
2820
2821	if (!addr)
2822	return;
2823
2824	vm = remove_vm_area(addr);
2825	if (unlikely(!vm)) {
2826	WARN(`1`, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2827	addr);
2828	return;
2829	}
2830
2831	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
2832	vm_reset_perms(area: vm);
2833	for (i = `0`; i < vm->nr_pages; i++) {
2834	struct page *page = vm->pages[i];
2835
2836	BUG_ON(!page);
2837	mod_memcg_page_state(page, idx: MEMCG_VMALLOC, val: -`1`);
2838	/*
2839	* High-order allocs for huge vmallocs are split, so
2840	* can be freed as an array of order-0 allocations
2841	*/
2842	__free_page(page);
2843	cond_resched();
2844	}
2845	atomic_long_sub(i: vm->nr_pages, v: &nr_vmalloc_pages);
2846	kvfree(addr: vm->pages);
2847	kfree(objp: vm);
2848	}
2849	EXPORT_SYMBOL(vfree);
2850
2851	/**
2852	* vunmap - release virtual mapping obtained by vmap()
2853	* @addr: memory base address
2854	*
2855	* Free the virtually contiguous memory area starting at @addr,
2856	* which was created from the page array passed to vmap().
2857	*
2858	* Must not be called in interrupt context.
2859	*/
2860	void vunmap(const void *addr)
2861	{
2862	struct vm_struct *vm;
2863
2864	BUG_ON(in_interrupt());
2865	might_sleep();
2866
2867	if (!addr)
2868	return;
2869	vm = remove_vm_area(addr);
2870	if (unlikely(!vm)) {
2871	WARN(`1`, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
2872	addr);
2873	return;
2874	}
2875	kfree(objp: vm);
2876	}
2877	EXPORT_SYMBOL(vunmap);
2878
2879	/**
2880	* vmap - map an array of pages into virtually contiguous space
2881	* @pages: array of page pointers
2882	* @count: number of pages to map
2883	* @flags: vm_area->flags
2884	* @prot: page protection for the mapping
2885	*
2886	* Maps @count pages from @pages into contiguous kernel virtual space.
2887	* If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2888	* (which must be kmalloc or vmalloc memory) and one reference per pages in it
2889	* are transferred from the caller to vmap(), and will be freed / dropped when
2890	* vfree() is called on the return value.
2891	*
2892	* Return: the address of the area or %NULL on failure
2893	*/
2894	void vmap(struct* page *pages, unsigned* int count,
2895	unsigned long flags, pgprot_t prot)
2896	{
2897	struct vm_struct *area;
2898	unsigned long addr;
2899	unsigned long size; / In bytes /
2900
2901	might_sleep();
2902
2903	if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
2904	return NULL;
2905
2906	/*
2907	* Your top guard is someone else's bottom guard. Not having a top
2908	* guard compromises someone else's mappings too.
2909	*/
2910	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
2911	flags &= ~VM_NO_GUARD;
2912
2913	if (count > totalram_pages())
2914	return NULL;
2915
2916	size = (unsigned long)count << PAGE_SHIFT;
2917	area = get_vm_area_caller(size, flags, caller: __builtin_return_address(`0`));
2918	if (!area)
2919	return NULL;
2920
2921	addr = (unsigned long)area->addr;
2922	if (vmap_pages_range(addr, end: addr + size, pgprot_nx(prot),
2923	pages, PAGE_SHIFT) < `0`) {
2924	vunmap(area->addr);
2925	return NULL;
2926	}
2927
2928	if (flags & VM_MAP_PUT_PAGES) {
2929	area->pages = pages;
2930	area->nr_pages = count;
2931	}
2932	return area->addr;
2933	}
2934	EXPORT_SYMBOL(vmap);
2935
#ifdef CONFIG_VMAP_PFN
/* Cursor handed to vmap_pfn_apply() through apply_to_page_range(). */
struct vmap_pfn_data {
	unsigned long	*pfns;	/* PFNs to map, one per page */
	pgprot_t	prot;	/* protection for every PTE */
	unsigned int	idx;	/* next entry of @pfns to consume */
};

/*
 * Per-PTE callback: install a special PTE for the next PFN in the cursor.
 * A PFN for which pfn_valid() is true is rejected with -EINVAL and a
 * one-time warning.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
	struct vmap_pfn_data *data = private;
	unsigned long pfn = data->pfns[data->idx];
	pte_t ptent;

	if (WARN_ON_ONCE(pfn_valid(pfn)))
		return -EINVAL;

	ptent = pte_mkspecial(pfn_pte(pfn, data->prot));
	set_pte_at(&init_mm, addr, pte, ptent);

	data->idx++;
	return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
 * the start address of the mapping, or %NULL if the area cannot be reserved
 * or populating the page tables fails.  The mapping is made non-executable.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
	struct vm_struct *area;

	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
			__builtin_return_address(0));
	if (!area)
		return NULL;
	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
			count * PAGE_SIZE, vmap_pfn_apply, &data)) {
		free_vm_area(area);
		return NULL;
	}

	flush_cache_vmap((unsigned long)area->addr,
			 (unsigned long)area->addr + count * PAGE_SIZE);

	return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
2990
2991	static inline unsigned int
2992	vm_area_alloc_pages(gfp_t gfp, int nid,
2993	unsigned int order, unsigned int nr_pages, struct page **pages)
2994	{
2995	unsigned int nr_allocated = `0`;
2996	gfp_t alloc_gfp = gfp;
2997	bool nofail = false;
2998	struct page *page;
2999	int i;
3000
3001	/*
3002	* For order-0 pages we make use of bulk allocator, if
3003	* the page array is partly or not at all populated due
3004	* to fails, fallback to a single page allocator that is
3005	* more permissive.
3006	*/
3007	if (!order) {
3008	/ bulk allocator doesn't support nofail req. officially /
3009	gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
3010
3011	while (nr_allocated < nr_pages) {
3012	unsigned int nr, nr_pages_request;
3013
3014	/*
3015	* A maximum allowed request is hard-coded and is 100
3016	* pages per call. That is done in order to prevent a
3017	* long preemption off scenario in the bulk-allocator
3018	* so the range is [1:100].
3019	*/
3020	nr_pages_request = min(`100U`, nr_pages - nr_allocated);
3021
3022	/ memory allocation should consider mempolicy, we can't*
3023	* wrongly use nearest node when nid == NUMA_NO_NODE,
3024	* otherwise memory may be allocated in only one node,
3025	* but mempolicy wants to alloc memory by interleaving.
3026	*/
3027	if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
3028	nr = alloc_pages_bulk_array_mempolicy(gfp: bulk_gfp,
3029	nr_pages: nr_pages_request,
3030	page_array: pages + nr_allocated);
3031
3032	else
3033	nr = alloc_pages_bulk_array_node(gfp: bulk_gfp, nid,
3034	nr_pages: nr_pages_request,
3035	page_array: pages + nr_allocated);
3036
3037	nr_allocated += nr;
3038	cond_resched();
3039
3040	/*
3041	* If zero or pages were obtained partly,
3042	* fallback to a single page allocator.
3043	*/
3044	if (nr != nr_pages_request)
3045	break;
3046	}
3047	} else if (gfp & __GFP_NOFAIL) {
3048	/*
3049	* Higher order nofail allocations are really expensive and
3050	* potentially dangerous (pre-mature OOM, disruptive reclaim
3051	* and compaction etc.
3052	*/
3053	alloc_gfp &= ~__GFP_NOFAIL;
3054	nofail = true;
3055	}
3056
3057	/ High-order pages or fallback path if "bulk" fails. /
3058	while (nr_allocated < nr_pages) {
3059	if (fatal_signal_pending(current))
3060	break;
3061
3062	if (nid == NUMA_NO_NODE)
3063	page = alloc_pages(gfp: alloc_gfp, order);
3064	else
3065	page = alloc_pages_node(nid, gfp_mask: alloc_gfp, order);
3066	if (unlikely(!page)) {
3067	if (!nofail)
3068	break;
3069
3070	/ fall back to the zero order allocations /
3071	alloc_gfp \|= __GFP_NOFAIL;
3072	order = `0`;
3073	continue;
3074	}
3075
3076	/*
3077	* Higher order allocations must be able to be treated as
3078	* indepdenent small pages by callers (as they can with
3079	* small-page vmallocs). Some drivers do their own refcounting
3080	* on vmalloc_to_page() pages, some use page->mapping,
3081	* page->lru, etc.
3082	*/
3083	if (order)
3084	split_page(page, order);
3085
3086	/*
3087	* Careful, we allocate and map page-order pages, but
3088	* tracking is done per PAGE_SIZE page so as to keep the
3089	* vm_struct APIs independent of the physical/mapped size.
3090	*/
3091	for (i = `0`; i < (`1U` << order); i++)
3092	pages[nr_allocated + i] = page + i;
3093
3094	cond_resched();
3095	nr_allocated += `1U` << order;
3096	}
3097
3098	return nr_allocated;
3099	}
3100
3101	static void __vmalloc_area_node(struct* vm_struct *area, gfp_t gfp_mask,
3102	pgprot_t prot, unsigned int page_shift,
3103	int node)
3104	{
3105	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) \| __GFP_ZERO;
3106	bool nofail = gfp_mask & __GFP_NOFAIL;
3107	unsigned long addr = (unsigned long)area->addr;
3108	unsigned long size = get_vm_area_size(area);
3109	unsigned long array_size;
3110	unsigned int nr_small_pages = size >> PAGE_SHIFT;
3111	unsigned int page_order;
3112	unsigned int flags;
3113	int ret;
3114
3115	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
3116
3117	if (!(gfp_mask & (GFP_DMA \| GFP_DMA32)))
3118	gfp_mask \|= __GFP_HIGHMEM;
3119
3120	/ Please note that the recursion is strictly bounded. /
3121	if (array_size > PAGE_SIZE) {
3122	area->pages = __vmalloc_node(size: array_size, align: `1`, gfp_mask: nested_gfp, node,
3123	caller: area->caller);
3124	} else {
3125	area->pages = kmalloc_node(size: array_size, flags: nested_gfp, node);
3126	}
3127
3128	if (!area->pages) {
3129	warn_alloc(gfp_mask, NULL,
3130	fmt: "vmalloc error: size %lu, failed to allocated page array size %lu",
3131	nr_small_pages * PAGE_SIZE, array_size);
3132	free_vm_area(area);
3133	return NULL;
3134	}
3135
3136	set_vm_area_page_order(vm: area, order: page_shift - PAGE_SHIFT);
3137	page_order = vm_area_page_order(vm: area);
3138
3139	area->nr_pages = vm_area_alloc_pages(gfp: gfp_mask \| __GFP_NOWARN,
3140	nid: node, order: page_order, nr_pages: nr_small_pages, pages: area->pages);
3141
3142	atomic_long_add(i: area->nr_pages, v: &nr_vmalloc_pages);
3143	if (gfp_mask & __GFP_ACCOUNT) {
3144	int i;
3145
3146	for (i = `0`; i < area->nr_pages; i++)
3147	mod_memcg_page_state(page: area->pages[i], idx: MEMCG_VMALLOC, val: `1`);
3148	}
3149
3150	/*
3151	* If not enough pages were obtained to accomplish an
3152	* allocation request, free them via vfree() if any.
3153	*/
3154	if (area->nr_pages != nr_small_pages) {
3155	/*
3156	* vm_area_alloc_pages() can fail due to insufficient memory but
3157	* also:-
3158	*
3159	* - a pending fatal signal
3160	* - insufficient huge page-order pages
3161	*
3162	* Since we always retry allocations at order-0 in the huge page
3163	* case a warning for either is spurious.
3164	*/
3165	if (!fatal_signal_pending(current) && page_order == `0`)
3166	warn_alloc(gfp_mask, NULL,
3167	fmt: "vmalloc error: size %lu, failed to allocate pages",
3168	area->nr_pages * PAGE_SIZE);
3169	goto fail;
3170	}
3171
3172	/*
3173	* page tables allocations ignore external gfp mask, enforce it
3174	* by the scope API
3175	*/
3176	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
3177	flags = memalloc_nofs_save();
3178	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == `0`)
3179	flags = memalloc_noio_save();
3180
3181	do {
3182	ret = vmap_pages_range(addr, end: addr + size, prot, pages: area->pages,
3183	page_shift);
3184	if (nofail && (ret < `0`))
3185	schedule_timeout_uninterruptible(timeout: `1`);
3186	} while (nofail && (ret < `0`));
3187
3188	if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == __GFP_IO)
3189	memalloc_nofs_restore(flags);
3190	else if ((gfp_mask & (__GFP_FS \| __GFP_IO)) == `0`)
3191	memalloc_noio_restore(flags);
3192
3193	if (ret < `0`) {
3194	warn_alloc(gfp_mask, NULL,
3195	fmt: "vmalloc error: size %lu, failed to map pages",
3196	area->nr_pages * PAGE_SIZE);
3197	goto fail;
3198	}
3199
3200	return area->addr;
3201
3202	fail:
3203	vfree(area->addr);
3204	return NULL;
3205	}
3206
3207	/**
3208	* __vmalloc_node_range - allocate virtually contiguous memory
3209	* @size: allocation size
3210	* @align: desired alignment
3211	* @start: vm area range start
3212	* @end: vm area range end
3213	* @gfp_mask: flags for the page level allocator
3214	* @prot: protection mask for the allocated pages
3215	* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
3216	* @node: node to use for allocation or NUMA_NO_NODE
3217	* @caller: caller's return address
3218	*
3219	* Allocate enough pages to cover @size from the page level
3220	* allocator with @gfp_mask flags. Please note that the full set of gfp
3221	* flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
3222	* supported.
3223	* Zone modifiers are not supported. From the reclaim modifiers
3224	* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
3225	* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
3226	* __GFP_RETRY_MAYFAIL are not supported).
3227	*
3228	* __GFP_NOWARN can be used to suppress failures messages.
3229	*
3230	* Map them into contiguous kernel virtual space, using a pagetable
3231	* protection of @prot.
3232	*
3233	* Return: the address of the area or %NULL on failure
3234	*/
3235	void __vmalloc_node_range(unsigned* long size, unsigned long align,
3236	unsigned long start, unsigned long end, gfp_t gfp_mask,
3237	pgprot_t prot, unsigned long vm_flags, int node,
3238	const void *caller)
3239	{
3240	struct vm_struct *area;
3241	void *ret;
3242	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
3243	unsigned long real_size = size;
3244	unsigned long real_align = align;
3245	unsigned int shift = PAGE_SHIFT;
3246
3247	if (WARN_ON_ONCE(!size))
3248	return NULL;
3249
3250	if ((size >> PAGE_SHIFT) > totalram_pages()) {
3251	warn_alloc(gfp_mask, NULL,
3252	fmt: "vmalloc error: size %lu, exceeds total pages",
3253	real_size);
3254	return NULL;
3255	}
3256
3257	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
3258	unsigned long size_per_node;
3259
3260	/*
3261	* Try huge pages. Only try for PAGE_KERNEL allocations,
3262	* others like modules don't yet expect huge pages in
3263	* their allocations due to apply_to_page_range not
3264	* supporting them.
3265	*/
3266
3267	size_per_node = size;
3268	if (node == NUMA_NO_NODE)
3269	size_per_node /= num_online_nodes();
3270	if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
3271	shift = PMD_SHIFT;
3272	else
3273	shift = arch_vmap_pte_supported_shift(size: size_per_node);
3274
3275	align = max(real_align, `1UL` << shift);
3276	size = ALIGN(real_size, `1UL` << shift);
3277	}
3278
3279	again:
3280	area = __get_vm_area_node(size: real_size, align, shift, VM_ALLOC \|
3281	VM_UNINITIALIZED \| vm_flags, start, end, node,
3282	gfp_mask, caller);
3283	if (!area) {
3284	bool nofail = gfp_mask & __GFP_NOFAIL;
3285	warn_alloc(gfp_mask, NULL,
3286	fmt: "vmalloc error: size %lu, vm_struct allocation failed%s",
3287	real_size, (nofail) ? ". Retrying." : "");
3288	if (nofail) {
3289	schedule_timeout_uninterruptible(timeout: `1`);
3290	goto again;
3291	}
3292	goto fail;
3293	}
3294
3295	/*
3296	* Prepare arguments for __vmalloc_area_node() and
3297	* kasan_unpoison_vmalloc().
3298	*/
3299	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
3300	if (kasan_hw_tags_enabled()) {
3301	/*
3302	* Modify protection bits to allow tagging.
3303	* This must be done before mapping.
3304	*/
3305	prot = arch_vmap_pgprot_tagged(prot);
3306
3307	/*
3308	* Skip page_alloc poisoning and zeroing for physical
3309	* pages backing VM_ALLOC mapping. Memory is instead
3310	* poisoned and zeroed by kasan_unpoison_vmalloc().
3311	*/
3312	gfp_mask \|= __GFP_SKIP_KASAN \| __GFP_SKIP_ZERO;
3313	}
3314
3315	/ Take note that the mapping is PAGE_KERNEL. /
3316	kasan_flags \|= KASAN_VMALLOC_PROT_NORMAL;
3317	}
3318
3319	/ Allocate physical pages and map them into vmalloc space. /
3320	ret = __vmalloc_area_node(area, gfp_mask, prot, page_shift: shift, node);
3321	if (!ret)
3322	goto fail;
3323
3324	/*
3325	* Mark the pages as accessible, now that they are mapped.
3326	* The condition for setting KASAN_VMALLOC_INIT should complement the
3327	* one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
3328	* to make sure that memory is initialized under the same conditions.
3329	* Tag-based KASAN modes only assign tags to normal non-executable
3330	* allocations, see __kasan_unpoison_vmalloc().
3331	*/
3332	kasan_flags \|= KASAN_VMALLOC_VM_ALLOC;
3333	if (!want_init_on_free() && want_init_on_alloc(flags: gfp_mask) &&
3334	(gfp_mask & __GFP_SKIP_ZERO))
3335	kasan_flags \|= KASAN_VMALLOC_INIT;
3336	/ KASAN_VMALLOC_PROT_NORMAL already set if required. /
3337	area->addr = kasan_unpoison_vmalloc(start: area->addr, size: real_size, flags: kasan_flags);
3338
3339	/*
3340	* In this function, newly allocated vm_struct has VM_UNINITIALIZED
3341	* flag. It means that vm_struct is not fully initialized.
3342	* Now, it is fully initialized, so remove this flag here.
3343	*/
3344	clear_vm_uninitialized_flag(vm: area);
3345
3346	size = PAGE_ALIGN(size);
3347	if (!(vm_flags & VM_DEFER_KMEMLEAK))
3348	kmemleak_vmalloc(area, size, gfp: gfp_mask);
3349
3350	return area->addr;
3351
3352	fail:
3353	if (shift > PAGE_SHIFT) {
3354	shift = PAGE_SHIFT;
3355	align = real_align;
3356	size = real_size;
3357	goto again;
3358	}
3359
3360	return NULL;
3361	}
3362
3363	/**
3364	* __vmalloc_node - allocate virtually contiguous memory
3365	* @size: allocation size
3366	* @align: desired alignment
3367	* @gfp_mask: flags for the page level allocator
3368	* @node: node to use for allocation or NUMA_NO_NODE
3369	* @caller: caller's return address
3370	*
3371	* Allocate enough pages to cover @size from the page level allocator with
3372	* @gfp_mask flags. Map them into contiguous kernel virtual space.
3373	*
3374	* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3375	* and __GFP_NOFAIL are not supported
3376	*
3377	* Any use of gfp flags outside of GFP_KERNEL should be consulted
3378	* with mm people.
3379	*
3380	* Return: pointer to the allocated memory or %NULL on error
3381	*/
3382	void __vmalloc_node(unsigned* long size, unsigned long align,
3383	gfp_t gfp_mask, int node, const void *caller)
3384	{
3385	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
3386	gfp_mask, PAGE_KERNEL, vm_flags: `0`, node, caller);
3387	}
3388	/*
3389	* This is only for performance analysis of vmalloc and stress purpose.
3390	* It is required by vmalloc test module, therefore do not use it other
3391	* than that.
3392	*/
3393	#ifdef CONFIG_TEST_VMALLOC_MODULE
3394	EXPORT_SYMBOL_GPL(__vmalloc_node);
3395	#endif
3396
3397	void __vmalloc(unsigned* long size, gfp_t gfp_mask)
3398	{
3399	return __vmalloc_node(size, `1`, gfp_mask, NUMA_NO_NODE,
3400	__builtin_return_address(`0`));
3401	}
3402	EXPORT_SYMBOL(__vmalloc);
3403
3404	/**
3405	* vmalloc - allocate virtually contiguous memory
3406	* @size: allocation size
3407	*
3408	* Allocate enough pages to cover @size from the page level
3409	* allocator and map them into contiguous kernel virtual space.
3410	*
3411	* For tight control over page level allocator and protection flags
3412	* use __vmalloc() instead.
3413	*
3414	* Return: pointer to the allocated memory or %NULL on error
3415	*/
3416	void vmalloc(unsigned* long size)
3417	{
3418	return __vmalloc_node(size, `1`, GFP_KERNEL, NUMA_NO_NODE,
3419	__builtin_return_address(`0`));
3420	}
3421	EXPORT_SYMBOL(vmalloc);
3422
3423	/**
3424	* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
3425	* @size: allocation size
3426	* @gfp_mask: flags for the page level allocator
3427	*
3428	* Allocate enough pages to cover @size from the page level
3429	* allocator and map them into contiguous kernel virtual space.
3430	* If @size is greater than or equal to PMD_SIZE, allow using
3431	* huge pages for the memory
3432	*
3433	* Return: pointer to the allocated memory or %NULL on error
3434	*/
3435	void vmalloc_huge(unsigned* long size, gfp_t gfp_mask)
3436	{
3437	return __vmalloc_node_range(size, align: `1`, VMALLOC_START, VMALLOC_END,
3438	gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
3439	NUMA_NO_NODE, caller: __builtin_return_address(`0`));
3440	}
3441	EXPORT_SYMBOL_GPL(vmalloc_huge);
3442
3443	/**
3444	* vzalloc - allocate virtually contiguous memory with zero fill
3445	* @size: allocation size
3446	*
3447	* Allocate enough pages to cover @size from the page level
3448	* allocator and map them into contiguous kernel virtual space.
3449	* The memory allocated is set to zero.
3450	*
3451	* For tight control over page level allocator and protection flags
3452	* use __vmalloc() instead.
3453	*
3454	* Return: pointer to the allocated memory or %NULL on error
3455	*/
3456	void vzalloc(unsigned* long size)
3457	{
3458	return __vmalloc_node(size, `1`, GFP_KERNEL \| __GFP_ZERO, NUMA_NO_NODE,
3459	__builtin_return_address(`0`));
3460	}
3461	EXPORT_SYMBOL(vzalloc);
3462
3463	/**
3464	* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
3465	* @size: allocation size
3466	*
3467	* The resulting memory area is zeroed so it can be mapped to userspace
3468	* without leaking data.
3469	*
3470	* Return: pointer to the allocated memory or %NULL on error
3471	*/
3472	void vmalloc_user(unsigned* long size)
3473	{
3474	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3475	GFP_KERNEL \| __GFP_ZERO, PAGE_KERNEL,
3476	VM_USERMAP, NUMA_NO_NODE,
3477	caller: __builtin_return_address(`0`));
3478	}
3479	EXPORT_SYMBOL(vmalloc_user);
3480
3481	/**
3482	* vmalloc_node - allocate memory on a specific node
3483	* @size: allocation size
3484	* @node: numa node
3485	*
3486	* Allocate enough pages to cover @size from the page level
3487	* allocator and map them into contiguous kernel virtual space.
3488	*
3489	* For tight control over page level allocator and protection flags
3490	* use __vmalloc() instead.
3491	*
3492	* Return: pointer to the allocated memory or %NULL on error
3493	*/
3494	void vmalloc_node(unsigned* long size, int node)
3495	{
3496	return __vmalloc_node(size, `1`, GFP_KERNEL, node,
3497	__builtin_return_address(`0`));
3498	}
3499	EXPORT_SYMBOL(vmalloc_node);
3500
3501	/**
3502	* vzalloc_node - allocate memory on a specific node with zero fill
3503	* @size: allocation size
3504	* @node: numa node
3505	*
3506	* Allocate enough pages to cover @size from the page level
3507	* allocator and map them into contiguous kernel virtual space.
3508	* The memory allocated is set to zero.
3509	*
3510	* Return: pointer to the allocated memory or %NULL on error
3511	*/
3512	void vzalloc_node(unsigned* long size, int node)
3513	{
3514	return __vmalloc_node(size, `1`, GFP_KERNEL \| __GFP_ZERO, node,
3515	__builtin_return_address(`0`));
3516	}
3517	EXPORT_SYMBOL(vzalloc_node);
3518
/*
 * gfp flags used for allocations that must be 32bit addressable.
 *
 * 64b systems should always have either DMA or DMA32 zones; GFP_DMA is only
 * picked on a 64b system that has a DMA zone but no DMA32 zone. For
 * everything else GFP_DMA32 should do the right thing and use the normal
 * zone where no DMA32 zone exists.
 */
#if defined(CONFIG_64BIT) && !defined(CONFIG_ZONE_DMA32) && \
	defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
3530
3531	/**
3532	* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
3533	* @size: allocation size
3534	*
3535	* Allocate enough 32bit PA addressable pages to cover @size from the
3536	* page level allocator and map them into contiguous kernel virtual space.
3537	*
3538	* Return: pointer to the allocated memory or %NULL on error
3539	*/
3540	void vmalloc_32(unsigned* long size)
3541	{
3542	return __vmalloc_node(size, `1`, GFP_VMALLOC32, NUMA_NO_NODE,
3543	__builtin_return_address(`0`));
3544	}
3545	EXPORT_SYMBOL(vmalloc_32);
3546
3547	/**
3548	* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
3549	* @size: allocation size
3550	*
3551	* The resulting memory area is 32bit addressable and zeroed so it can be
3552	* mapped to userspace without leaking data.
3553	*
3554	* Return: pointer to the allocated memory or %NULL on error
3555	*/
3556	void vmalloc_32_user(unsigned* long size)
3557	{
3558	return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3559	GFP_VMALLOC32 \| __GFP_ZERO, PAGE_KERNEL,
3560	VM_USERMAP, NUMA_NO_NODE,
3561	caller: __builtin_return_address(`0`));
3562	}
3563	EXPORT_SYMBOL(vmalloc_32_user);
3564
3565	/*
3566	* Atomically zero bytes in the iterator.
3567	*
3568	* Returns the number of zeroed bytes.
3569	*/
3570	static size_t zero_iter(struct iov_iter *iter, size_t count)
3571	{
3572	size_t remains = count;
3573
3574	while (remains > `0`) {
3575	size_t num, copied;
3576
3577	num = min_t(size_t, remains, PAGE_SIZE);
3578	copied = copy_page_to_iter_nofault(ZERO_PAGE(`0`), offset: `0`, bytes: num, i: iter);
3579	remains -= copied;
3580
3581	if (copied < num)
3582	break;
3583	}
3584
3585	return count - remains;
3586	}
3587
3588	/*
3589	* small helper routine, copy contents to iter from addr.
3590	* If the page is not present, fill zero.
3591	*
3592	* Returns the number of copied bytes.
3593	*/
3594	static size_t aligned_vread_iter(struct iov_iter *iter,
3595	const char *addr, size_t count)
3596	{
3597	size_t remains = count;
3598	struct page *page;
3599
3600	while (remains > `0`) {
3601	unsigned long offset, length;
3602	size_t copied = `0`;
3603
3604	offset = offset_in_page(addr);
3605	length = PAGE_SIZE - offset;
3606	if (length > remains)
3607	length = remains;
3608	page = vmalloc_to_page(addr);
3609	/*
3610	* To do safe access to this _mapped_ area, we need lock. But
3611	* adding lock here means that we need to add overhead of
3612	* vmalloc()/vfree() calls for this _debug_ interface, rarely
3613	* used. Instead of that, we'll use an local mapping via
3614	* copy_page_to_iter_nofault() and accept a small overhead in
3615	* this access function.
3616	*/
3617	if (page)
3618	copied = copy_page_to_iter_nofault(page, offset,
3619	bytes: length, i: iter);
3620	else
3621	copied = zero_iter(iter, count: length);
3622
3623	addr += copied;
3624	remains -= copied;
3625
3626	if (copied != length)
3627	break;
3628	}
3629
3630	return count - remains;
3631	}
3632
3633	/*
3634	* Read from a vm_map_ram region of memory.
3635	*
3636	* Returns the number of copied bytes.
3637	*/
3638	static size_t vmap_ram_vread_iter(struct iov_iter iter, const* char *addr,
3639	size_t count, unsigned long flags)
3640	{
3641	char *start;
3642	struct vmap_block *vb;
3643	struct xarray *xa;
3644	unsigned long offset;
3645	unsigned int rs, re;
3646	size_t remains, n;
3647
3648	/*
3649	* If it's area created by vm_map_ram() interface directly, but
3650	* not further subdividing and delegating management to vmap_block,
3651	* handle it here.
3652	*/
3653	if (!(flags & VMAP_BLOCK))
3654	return aligned_vread_iter(iter, addr, count);
3655
3656	remains = count;
3657
3658	/*
3659	* Area is split into regions and tracked with vmap_block, read out
3660	* each region and zero fill the hole between regions.
3661	*/
3662	xa = addr_to_vb_xa(addr: (unsigned long) addr);
3663	vb = xa_load(xa, index: addr_to_vb_idx(addr: (unsigned long)addr));
3664	if (!vb)
3665	goto finished_zero;
3666
3667	spin_lock(lock: &vb->lock);
3668	if (bitmap_empty(src: vb->used_map, VMAP_BBMAP_BITS)) {
3669	spin_unlock(lock: &vb->lock);
3670	goto finished_zero;
3671	}
3672
3673	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
3674	size_t copied;
3675
3676	if (remains == `0`)
3677	goto finished;
3678
3679	start = vmap_block_vaddr(va_start: vb->va->va_start, pages_off: rs);
3680
3681	if (addr < start) {
3682	size_t to_zero = min_t(size_t, start - addr, remains);
3683	size_t zeroed = zero_iter(iter, count: to_zero);
3684
3685	addr += zeroed;
3686	remains -= zeroed;
3687
3688	if (remains == `0` \|\| zeroed != to_zero)
3689	goto finished;
3690	}
3691
3692	/it could start reading from the middle of used region/
3693	offset = offset_in_page(addr);
3694	n = ((re - rs + `1`) << PAGE_SHIFT) - offset;
3695	if (n > remains)
3696	n = remains;
3697
3698	copied = aligned_vread_iter(iter, addr: start + offset, count: n);
3699
3700	addr += copied;
3701	remains -= copied;
3702
3703	if (copied != n)
3704	goto finished;
3705	}
3706
3707	spin_unlock(lock: &vb->lock);
3708
3709	finished_zero:
3710	/ zero-fill the left dirty or free regions /
3711	return count - remains + zero_iter(iter, count: remains);
3712	finished:
3713	/ We couldn't copy/zero everything /
3714	spin_unlock(lock: &vb->lock);
3715	return count - remains;
3716	}
3717
3718	/**
3719	* vread_iter() - read vmalloc area in a safe way to an iterator.
3720	* @iter: the iterator to which data should be written.
3721	* @addr: vm address.
3722	* @count: number of bytes to be read.
3723	*
3724	* This function checks that addr is a valid vmalloc'ed area, and
3725	* copy data from that area to a given buffer. If the given memory range
3726	* of [addr...addr+count) includes some valid address, data is copied to
3727	* proper area of @buf. If there are memory holes, they'll be zero-filled.
3728	* IOREMAP area is treated as memory hole and no copy is done.
3729	*
3730	* If [addr...addr+count) doesn't includes any intersects with alive
3731	* vm_struct area, returns 0. @buf should be kernel's buffer.
3732	*
3733	* Note: In usual ops, vread() is never necessary because the caller
3734	* should know vmalloc() area is valid and can use memcpy().
3735	* This is for routines which have to access vmalloc area without
3736	* any information, as /proc/kcore.
3737	*
3738	* Return: number of bytes for which addr and buf should be increased
3739	* (same number as @count) or %0 if [addr...addr+count) doesn't
3740	* include any intersection with valid vmalloc area
3741	*/
3742	long vread_iter(struct iov_iter iter, const* char *addr, size_t count)
3743	{
3744	struct vmap_area *va;
3745	struct vm_struct *vm;
3746	char *vaddr;
3747	size_t n, size, flags, remains;
3748
3749	addr = kasan_reset_tag(addr);
3750
3751	/ Don't allow overflow /
3752	if ((unsigned long) addr + count < count)
3753	count = -(unsigned long) addr;
3754
3755	remains = count;
3756
3757	spin_lock(lock: &vmap_area_lock);
3758	va = find_vmap_area_exceed_addr(addr: (unsigned long)addr);
3759	if (!va)
3760	goto finished_zero;
3761
3762	/ no intersects with alive vmap_area /
3763	if ((unsigned long)addr + remains <= va->va_start)
3764	goto finished_zero;
3765
3766	list_for_each_entry_from(va, &vmap_area_list, list) {
3767	size_t copied;
3768
3769	if (remains == `0`)
3770	goto finished;
3771
3772	vm = va->vm;
3773	flags = va->flags & VMAP_FLAGS_MASK;
3774	/*
3775	* VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
3776	* be set together with VMAP_RAM.
3777	*/
3778	WARN_ON(flags == VMAP_BLOCK);
3779
3780	if (!vm && !flags)
3781	continue;
3782
3783	if (vm && (vm->flags & VM_UNINITIALIZED))
3784	continue;
3785
3786	/ Pair with smp_wmb() in clear_vm_uninitialized_flag() /
3787	smp_rmb();
3788
3789	vaddr = (char *) va->va_start;
3790	size = vm ? get_vm_area_size(area: vm) : va_size(va);
3791
3792	if (addr >= vaddr + size)
3793	continue;
3794
3795	if (addr < vaddr) {
3796	size_t to_zero = min_t(size_t, vaddr - addr, remains);
3797	size_t zeroed = zero_iter(iter, count: to_zero);
3798
3799	addr += zeroed;
3800	remains -= zeroed;
3801
3802	if (remains == `0` \|\| zeroed != to_zero)
3803	goto finished;
3804	}
3805
3806	n = vaddr + size - addr;
3807	if (n > remains)
3808	n = remains;
3809
3810	if (flags & VMAP_RAM)
3811	copied = vmap_ram_vread_iter(iter, addr, count: n, flags);
3812	else if (!(vm && (vm->flags & VM_IOREMAP)))
3813	copied = aligned_vread_iter(iter, addr, count: n);
3814	else / IOREMAP area is treated as memory hole /
3815	copied = zero_iter(iter, count: n);
3816
3817	addr += copied;
3818	remains -= copied;
3819
3820	if (copied != n)
3821	goto finished;
3822	}
3823
3824	finished_zero:
3825	spin_unlock(lock: &vmap_area_lock);
3826	/ zero-fill memory holes /
3827	return count - remains + zero_iter(iter, count: remains);
3828	finished:
3829	/ Nothing remains, or We couldn't copy/zero everything. /
3830	spin_unlock(lock: &vmap_area_lock);
3831
3832	return count - remains;
3833	}
3834
3835	/**
3836	* remap_vmalloc_range_partial - map vmalloc pages to userspace
3837	* @vma: vma to cover
3838	* @uaddr: target user address to start at
3839	* @kaddr: virtual address of vmalloc kernel memory
3840	* @pgoff: offset from @kaddr to start at
3841	* @size: size of map area
3842	*
3843	* Returns: 0 for success, -Exxx on failure
3844	*
3845	* This function checks that @kaddr is a valid vmalloc'ed area,
3846	* and that it is big enough to cover the range starting at
3847	* @uaddr in @vma. Will return failure if that criteria isn't
3848	* met.
3849	*
3850	* Similar to remap_pfn_range() (see mm/memory.c)
3851	*/
3852	int remap_vmalloc_range_partial(struct vm_area_struct vma, unsigned* long uaddr,
3853	void kaddr, unsigned* long pgoff,
3854	unsigned long size)
3855	{
3856	struct vm_struct *area;
3857	unsigned long off;
3858	unsigned long end_index;
3859
3860	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3861	return -EINVAL;
3862
3863	size = PAGE_ALIGN(size);
3864
3865	if (!PAGE_ALIGNED(uaddr) \|\| !PAGE_ALIGNED(kaddr))
3866	return -EINVAL;
3867
3868	area = find_vm_area(addr: kaddr);
3869	if (!area)
3870	return -EINVAL;
3871
3872	if (!(area->flags & (VM_USERMAP \| VM_DMA_COHERENT)))
3873	return -EINVAL;
3874
3875	if (check_add_overflow(size, off, &end_index) \|\|
3876	end_index > get_vm_area_size(area))
3877	return -EINVAL;
3878	kaddr += off;
3879
3880	do {
3881	struct page *page = vmalloc_to_page(kaddr);
3882	int ret;
3883
3884	ret = vm_insert_page(vma, addr: uaddr, page);
3885	if (ret)
3886	return ret;
3887
3888	uaddr += PAGE_SIZE;
3889	kaddr += PAGE_SIZE;
3890	size -= PAGE_SIZE;
3891	} while (size > `0`);
3892
3893	vm_flags_set(vma, VM_DONTEXPAND \| VM_DONTDUMP);
3894
3895	return `0`;
3896	}
3897
3898	/**
3899	* remap_vmalloc_range - map vmalloc pages to userspace
3900	* @vma: vma to cover (map full range of vma)
3901	* @addr: vmalloc memory
3902	* @pgoff: number of pages into addr before first page to map
3903	*
3904	* Returns: 0 for success, -Exxx on failure
3905	*
3906	* This function checks that addr is a valid vmalloc'ed area, and
3907	* that it is big enough to cover the vma. Will return failure if
3908	* that criteria isn't met.
3909	*
3910	* Similar to remap_pfn_range() (see mm/memory.c)
3911	*/
3912	int remap_vmalloc_range(struct vm_area_struct vma, void* *addr,
3913	unsigned long pgoff)
3914	{
3915	return remap_vmalloc_range_partial(vma, uaddr: vma->vm_start,
3916	kaddr: addr, pgoff,
3917	size: vma->vm_end - vma->vm_start);
3918	}
3919	EXPORT_SYMBOL(remap_vmalloc_range);
3920
3921	void free_vm_area(struct vm_struct *area)
3922	{
3923	struct vm_struct *ret;
3924	ret = remove_vm_area(addr: area->addr);
3925	BUG_ON(ret != area);
3926	kfree(objp: area);
3927	}
3928	EXPORT_SYMBOL_GPL(free_vm_area);
3929
3930	#ifdef CONFIG_SMP
3931	static struct vmap_area node_to_va(struct* rb_node *n)
3932	{
3933	return rb_entry_safe(n, struct vmap_area, rb_node);
3934	}
3935
3936	/**
3937	* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3938	* @addr: target address
3939	*
3940	* Returns: vmap_area if it is found. If there is no such area
3941	* the first highest(reverse order) vmap_area is returned
3942	* i.e. va->va_start < addr && va->va_end < addr or NULL
3943	* if there are no any areas before @addr.
3944	*/
3945	static struct vmap_area *
3946	pvm_find_va_enclose_addr(unsigned long addr)
3947	{
3948	struct vmap_area va, tmp;
3949	struct rb_node *n;
3950
3951	n = free_vmap_area_root.rb_node;
3952	va = NULL;
3953
3954	while (n) {
3955	tmp = rb_entry(n, struct vmap_area, rb_node);
3956	if (tmp->va_start <= addr) {
3957	va = tmp;
3958	if (tmp->va_end >= addr)
3959	break;
3960
3961	n = n->rb_right;
3962	} else {
3963	n = n->rb_left;
3964	}
3965	}
3966
3967	return va;
3968	}
3969
3970	/**
3971	* pvm_determine_end_from_reverse - find the highest aligned address
3972	* of free block below VMALLOC_END
3973	* @va:
3974	* in - the VA we start the search(reverse order);
3975	* out - the VA with the highest aligned end address.
3976	* @align: alignment for required highest address
3977	*
3978	* Returns: determined end address within vmap_area
3979	*/
3980	static unsigned long
3981	pvm_determine_end_from_reverse(struct vmap_area *va, unsigned* long align)
3982	{
3983	unsigned long vmalloc_end = VMALLOC_END & ~(align - `1`);
3984	unsigned long addr;
3985
3986	if (likely(*va)) {
3987	list_for_each_entry_from_reverse((*va),
3988	&free_vmap_area_list, list) {
3989	addr = min((*va)->va_end & ~(align - `1`), vmalloc_end);
3990	if ((*va)->va_start < addr)
3991	return addr;
3992	}
3993	}
3994
3995	return `0`;
3996	}
3997
3998	/**
3999	* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
4000	* @offsets: array containing offset of each area
4001	* @sizes: array containing size of each area
4002	* @nr_vms: the number of areas to allocate
4003	* @align: alignment, all entries in @offsets and @sizes must be aligned to this
4004	*
4005	* Returns: kmalloc'd vm_struct pointer array pointing to allocated
4006	* vm_structs on success, %NULL on failure
4007	*
4008	* Percpu allocator wants to use congruent vm areas so that it can
4009	* maintain the offsets among percpu areas. This function allocates
4010	* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
4011	* be scattered pretty far, distance between two areas easily going up
4012	* to gigabytes. To avoid interacting with regular vmallocs, these
4013	* areas are allocated from top.
4014	*
4015	* Despite its complicated look, this allocator is rather simple. It
4016	* does everything top-down and scans free blocks from the end looking
4017	* for matching base. While scanning, if any of the areas do not fit the
4018	* base address is pulled down to fit the area. Scanning is repeated till
4019	* all the areas fit and then all necessary data structures are inserted
4020	* and the result is returned.
4021	*/
4022	struct vm_struct *pcpu_get_vm_areas(const* unsigned long *offsets,
4023	const size_t sizes, int* nr_vms,
4024	size_t align)
4025	{
4026	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
4027	const unsigned long vmalloc_end = VMALLOC_END & ~(align - `1`);
4028	struct vmap_area *vas, va;
4029	struct vm_struct **vms;
4030	int area, area2, last_area, term_area;
4031	unsigned long base, start, size, end, last_end, orig_start, orig_end;
4032	bool purged = false;
4033
4034	/ verify parameters and allocate data structures /
4035	BUG_ON(offset_in_page(align) \|\| !is_power_of_2(align));
4036	for (last_area = `0`, area = `0`; area < nr_vms; area++) {
4037	start = offsets[area];
4038	end = start + sizes[area];
4039
4040	/ is everything aligned properly? /
4041	BUG_ON(!IS_ALIGNED(offsets[area], align));
4042	BUG_ON(!IS_ALIGNED(sizes[area], align));
4043
4044	/ detect the area with the highest address /
4045	if (start > offsets[last_area])
4046	last_area = area;
4047
4048	for (area2 = area + `1`; area2 < nr_vms; area2++) {
4049	unsigned long start2 = offsets[area2];
4050	unsigned long end2 = start2 + sizes[area2];
4051
4052	BUG_ON(start2 < end && start < end2);
4053	}
4054	}
4055	last_end = offsets[last_area] + sizes[last_area];
4056
4057	if (vmalloc_end - vmalloc_start < last_end) {
4058	WARN_ON(true);
4059	return NULL;
4060	}
4061
4062	vms = kcalloc(n: nr_vms, size: sizeof(vms[`0`]), GFP_KERNEL);
4063	vas = kcalloc(n: nr_vms, size: sizeof(vas[`0`]), GFP_KERNEL);
4064	if (!vas \|\| !vms)
4065	goto err_free2;
4066
4067	for (area = `0`; area < nr_vms; area++) {
4068	vas[area] = kmem_cache_zalloc(k: vmap_area_cachep, GFP_KERNEL);
4069	vms[area] = kzalloc(size: sizeof(struct vm_struct), GFP_KERNEL);
4070	if (!vas[area] \|\| !vms[area])
4071	goto err_free;
4072	}
4073	retry:
4074	spin_lock(lock: &free_vmap_area_lock);
4075
4076	/ start scanning - we scan from the top, begin with the last area /
4077	area = term_area = last_area;
4078	start = offsets[area];
4079	end = start + sizes[area];
4080
4081	va = pvm_find_va_enclose_addr(addr: vmalloc_end);
4082	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4083
4084	while (true) {
4085	/*
4086	* base might have underflowed, add last_end before
4087	* comparing.
4088	*/
4089	if (base + last_end < vmalloc_start + last_end)
4090	goto overflow;
4091
4092	/*
4093	* Fitting base has not been found.
4094	*/
4095	if (va == NULL)
4096	goto overflow;
4097
4098	/*
4099	* If required width exceeds current VA block, move
4100	* base downwards and then recheck.
4101	*/
4102	if (base + end > va->va_end) {
4103	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4104	term_area = area;
4105	continue;
4106	}
4107
4108	/*
4109	* If this VA does not fit, move base downwards and recheck.
4110	*/
4111	if (base + start < va->va_start) {
4112	va = node_to_va(n: rb_prev(&va->rb_node));
4113	base = pvm_determine_end_from_reverse(va: &va, align) - end;
4114	term_area = area;
4115	continue;
4116	}
4117
4118	/*
4119	* This area fits, move on to the previous one. If
4120	* the previous one is the terminal one, we're done.
4121	*/
4122	area = (area + nr_vms - `1`) % nr_vms;
4123	if (area == term_area)
4124	break;
4125
4126	start = offsets[area];
4127	end = start + sizes[area];
4128	va = pvm_find_va_enclose_addr(addr: base + end);
4129	}
4130
4131	/ we've found a fitting base, insert all va's /
4132	for (area = `0`; area < nr_vms; area++) {
4133	int ret;
4134
4135	start = base + offsets[area];
4136	size = sizes[area];
4137
4138	va = pvm_find_va_enclose_addr(addr: start);
4139	if (WARN_ON_ONCE(va == NULL))
4140	/ It is a BUG(), but trigger recovery instead. /
4141	goto recovery;
4142
4143	ret = adjust_va_to_fit_type(root: &free_vmap_area_root,
4144	head: &free_vmap_area_list,
4145	va, nva_start_addr: start, size);
4146	if (WARN_ON_ONCE(unlikely(ret)))
4147	/ It is a BUG(), but trigger recovery instead. /
4148	goto recovery;
4149
4150	/ Allocated area. /
4151	va = vas[area];
4152	va->va_start = start;
4153	va->va_end = start + size;
4154	}
4155
4156	spin_unlock(lock: &free_vmap_area_lock);
4157
4158	/ populate the kasan shadow space /
4159	for (area = `0`; area < nr_vms; area++) {
4160	if (kasan_populate_vmalloc(start: vas[area]->va_start, size: sizes[area]))
4161	goto err_free_shadow;
4162	}
4163
4164	/ insert all vm's /
4165	spin_lock(lock: &vmap_area_lock);
4166	for (area = `0`; area < nr_vms; area++) {
4167	insert_vmap_area(va: vas[area], root: &vmap_area_root, head: &vmap_area_list);
4168
4169	setup_vmalloc_vm_locked(vm: vms[area], va: vas[area], VM_ALLOC,
4170	caller: pcpu_get_vm_areas);
4171	}
4172	spin_unlock(lock: &vmap_area_lock);
4173
4174	/*
4175	* Mark allocated areas as accessible. Do it now as a best-effort
4176	* approach, as they can be mapped outside of vmalloc code.
4177	* With hardware tag-based KASAN, marking is skipped for
4178	* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
4179	*/
4180	for (area = `0`; area < nr_vms; area++)
4181	vms[area]->addr = kasan_unpoison_vmalloc(start: vms[area]->addr,
4182	size: vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
4183
4184	kfree(objp: vas);
4185	return vms;
4186
4187	recovery:
4188	/*
4189	* Remove previously allocated areas. There is no
4190	* need in removing these areas from the busy tree,
4191	* because they are inserted only on the final step
4192	* and when pcpu_get_vm_areas() is success.
4193	*/
4194	while (area--) {
4195	orig_start = vas[area]->va_start;
4196	orig_end = vas[area]->va_end;
4197	va = merge_or_add_vmap_area_augment(va: vas[area], root: &free_vmap_area_root,
4198	head: &free_vmap_area_list);
4199	if (va)
4200	kasan_release_vmalloc(start: orig_start, end: orig_end,
4201	free_region_start: va->va_start, free_region_end: va->va_end);
4202	vas[area] = NULL;
4203	}
4204
4205	overflow:
4206	spin_unlock(lock: &free_vmap_area_lock);
4207	if (!purged) {
4208	reclaim_and_purge_vmap_areas();
4209	purged = true;
4210
4211	/ Before "retry", check if we recover. /
4212	for (area = `0`; area < nr_vms; area++) {
4213	if (vas[area])
4214	continue;
4215
4216	vas[area] = kmem_cache_zalloc(
4217	k: vmap_area_cachep, GFP_KERNEL);
4218	if (!vas[area])
4219	goto err_free;
4220	}
4221
4222	goto retry;
4223	}
4224
4225	err_free:
4226	for (area = `0`; area < nr_vms; area++) {
4227	if (vas[area])
4228	kmem_cache_free(s: vmap_area_cachep, objp: vas[area]);
4229
4230	kfree(objp: vms[area]);
4231	}
4232	err_free2:
4233	kfree(objp: vas);
4234	kfree(objp: vms);
4235	return NULL;
4236
4237	err_free_shadow:
4238	spin_lock(lock: &free_vmap_area_lock);
4239	/*
4240	* We release all the vmalloc shadows, even the ones for regions that
4241	* hadn't been successfully added. This relies on kasan_release_vmalloc
4242	* being able to tolerate this case.
4243	*/
4244	for (area = `0`; area < nr_vms; area++) {
4245	orig_start = vas[area]->va_start;
4246	orig_end = vas[area]->va_end;
4247	va = merge_or_add_vmap_area_augment(va: vas[area], root: &free_vmap_area_root,
4248	head: &free_vmap_area_list);
4249	if (va)
4250	kasan_release_vmalloc(start: orig_start, end: orig_end,
4251	free_region_start: va->va_start, free_region_end: va->va_end);
4252	vas[area] = NULL;
4253	kfree(objp: vms[area]);
4254	}
4255	spin_unlock(lock: &free_vmap_area_lock);
4256	kfree(objp: vas);
4257	kfree(objp: vms);
4258	return NULL;
4259	}
4260
/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int i;

	/* Release each area first; the pointer array itself goes last. */
	for (i = 0; i < nr_vms; i++)
		free_vm_area(vms[i]);
	kfree(vms);
}
4276	#endif /* CONFIG_SMP */
4277
4278	#ifdef CONFIG_PRINTK
4279	bool vmalloc_dump_obj(void *object)
4280	{
4281	void objp = (void* )PAGE_ALIGN((unsigned* long)object);
4282	const void *caller;
4283	struct vm_struct *vm;
4284	struct vmap_area *va;
4285	unsigned long addr;
4286	unsigned int nr_pages;
4287
4288	if (!spin_trylock(lock: &vmap_area_lock))
4289	return false;
4290	va = __find_vmap_area(addr: (unsigned long)objp, root: &vmap_area_root);
4291	if (!va) {
4292	spin_unlock(lock: &vmap_area_lock);
4293	return false;
4294	}
4295
4296	vm = va->vm;
4297	if (!vm) {
4298	spin_unlock(lock: &vmap_area_lock);
4299	return false;
4300	}
4301	addr = (unsigned long)vm->addr;
4302	caller = vm->caller;
4303	nr_pages = vm->nr_pages;
4304	spin_unlock(lock: &vmap_area_lock);
4305	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
4306	nr_pages, addr, caller);
4307	return true;
4308	}
4309	#endif
4310
4311	#ifdef CONFIG_PROC_FS
4312	static void s_start(struct* seq_file m, loff_t pos)
4313	__acquires(&vmap_purge_lock)
4314	__acquires(&vmap_area_lock)
4315	{
4316	mutex_lock(&vmap_purge_lock);
4317	spin_lock(lock: &vmap_area_lock);
4318
4319	return seq_list_start(head: &vmap_area_list, pos: *pos);
4320	}
4321
4322	static void s_next(struct* seq_file m, void* p, loff_t pos)
4323	{
4324	return seq_list_next(v: p, head: &vmap_area_list, ppos: pos);
4325	}
4326
4327	static void s_stop(struct seq_file m, void* *p)
4328	__releases(&vmap_area_lock)
4329	__releases(&vmap_purge_lock)
4330	{
4331	spin_unlock(lock: &vmap_area_lock);
4332	mutex_unlock(lock: &vmap_purge_lock);
4333	}
4334
4335	static void show_numa_info(struct seq_file m, struct* vm_struct *v)
4336	{
4337	if (IS_ENABLED(CONFIG_NUMA)) {
4338	unsigned int nr, *counters = m->private;
4339	unsigned int step = `1U` << vm_area_page_order(vm: v);
4340
4341	if (!counters)
4342	return;
4343
4344	if (v->flags & VM_UNINITIALIZED)
4345	return;
4346	/ Pair with smp_wmb() in clear_vm_uninitialized_flag() /
4347	smp_rmb();
4348
4349	memset(counters, `0`, nr_node_ids * sizeof(unsigned int));
4350
4351	for (nr = `0`; nr < v->nr_pages; nr += step)
4352	counters[page_to_nid(page: v->pages[nr])] += step;
4353	for_each_node_state(nr, N_HIGH_MEMORY)
4354	if (counters[nr])
4355	seq_printf(m, fmt: " N%u=%u", nr, counters[nr]);
4356	}
4357	}
4358
4359	static void show_purge_info(struct seq_file *m)
4360	{
4361	struct vmap_area *va;
4362
4363	spin_lock(lock: &purge_vmap_area_lock);
4364	list_for_each_entry(va, &purge_vmap_area_list, list) {
4365	seq_printf(m, fmt: "0x%pK-0x%pK %7ld unpurged vm_area\n",
4366	(void )va->va_start, (void* *)va->va_end,
4367	va->va_end - va->va_start);
4368	}
4369	spin_unlock(lock: &purge_vmap_area_lock);
4370	}
4371
4372	static int s_show(struct seq_file m, void* *p)
4373	{
4374	struct vmap_area *va;
4375	struct vm_struct *v;
4376
4377	va = list_entry(p, struct vmap_area, list);
4378
4379	if (!va->vm) {
4380	if (va->flags & VMAP_RAM)
4381	seq_printf(m, fmt: "0x%pK-0x%pK %7ld vm_map_ram\n",
4382	(void )va->va_start, (void* *)va->va_end,
4383	va->va_end - va->va_start);
4384
4385	goto final;
4386	}
4387
4388	v = va->vm;
4389
4390	seq_printf(m, fmt: "0x%pK-0x%pK %7ld",
4391	v->addr, v->addr + v->size, v->size);
4392
4393	if (v->caller)
4394	seq_printf(m, fmt: " %pS", v->caller);
4395
4396	if (v->nr_pages)
4397	seq_printf(m, fmt: " pages=%d", v->nr_pages);
4398
4399	if (v->phys_addr)
4400	seq_printf(m, fmt: " phys=%pa", &v->phys_addr);
4401
4402	if (v->flags & VM_IOREMAP)
4403	seq_puts(m, s: " ioremap");
4404
4405	if (v->flags & VM_ALLOC)
4406	seq_puts(m, s: " vmalloc");
4407
4408	if (v->flags & VM_MAP)
4409	seq_puts(m, s: " vmap");
4410
4411	if (v->flags & VM_USERMAP)
4412	seq_puts(m, s: " user");
4413
4414	if (v->flags & VM_DMA_COHERENT)
4415	seq_puts(m, s: " dma-coherent");
4416
4417	if (is_vmalloc_addr(v->pages))
4418	seq_puts(m, s: " vpages");
4419
4420	show_numa_info(m, v);
4421	seq_putc(m, c: `'\n'`);
4422
4423	/*
4424	* As a final step, dump "unpurged" areas.
4425	*/
4426	final:
4427	if (list_is_last(list: &va->list, head: &vmap_area_list))
4428	show_purge_info(m);
4429
4430	return `0`;
4431	}
4432
4433	static const struct seq_operations vmalloc_op = {
4434	.start = s_start,
4435	.next = s_next,
4436	.stop = s_stop,
4437	.show = s_show,
4438	};
4439
4440	static int __init proc_vmalloc_init(void)
4441	{
4442	if (IS_ENABLED(CONFIG_NUMA))
4443	proc_create_seq_private(name: "vmallocinfo", mode: `0400`, NULL,
4444	ops: &vmalloc_op,
4445	state_size: nr_node_ids * sizeof(unsigned int), NULL);
4446	else
4447	proc_create_seq("vmallocinfo", `0400`, NULL, &vmalloc_op);
4448	return `0`;
4449	}
4450	module_init(proc_vmalloc_init);
4451
4452	#endif
4453
4454	void __init vmalloc_init(void)
4455	{
4456	struct vmap_area *va;
4457	struct vm_struct *tmp;
4458	int i;
4459
4460	/*
4461	* Create the cache for vmap_area objects.
4462	*/
4463	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
4464
4465	for_each_possible_cpu(i) {
4466	struct vmap_block_queue *vbq;
4467	struct vfree_deferred *p;
4468
4469	vbq = &per_cpu(vmap_block_queue, i);
4470	spin_lock_init(&vbq->lock);
4471	INIT_LIST_HEAD(list: &vbq->free);
4472	p = &per_cpu(vfree_deferred, i);
4473	init_llist_head(list: &p->list);
4474	INIT_WORK(&p->wq, delayed_vfree_work);
4475	xa_init(xa: &vbq->vmap_blocks);
4476	}
4477
4478	/ Import existing vmlist entries. /
4479	for (tmp = vmlist; tmp; tmp = tmp->next) {
4480	va = kmem_cache_zalloc(k: vmap_area_cachep, GFP_NOWAIT);
4481	if (WARN_ON_ONCE(!va))
4482	continue;
4483
4484	va->va_start = (unsigned long)tmp->addr;
4485	va->va_end = va->va_start + tmp->size;
4486	va->vm = tmp;
4487	insert_vmap_area(va, root: &vmap_area_root, head: &vmap_area_list);
4488	}
4489
4490	/*
4491	* Now we can initialize a free vmap space.
4492	*/
4493	vmap_init_free_space();
4494	vmap_initialized = true;
4495	}
4496

source code of linux/mm/vmalloc.c