hugetlb.c source code [linux/mm/hugetlb.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Generic hugetlb support.
4	* (C) Nadia Yvette Chambers, April 2004
5	*/
6	#include <linux/list.h>
7	#include <linux/init.h>
8	#include <linux/mm.h>
9	#include <linux/seq_file.h>
10	#include <linux/sysctl.h>
11	#include <linux/highmem.h>
12	#include <linux/mmu_notifier.h>
13	#include <linux/nodemask.h>
14	#include <linux/pagemap.h>
15	#include <linux/mempolicy.h>
16	#include <linux/compiler.h>
17	#include <linux/cpuset.h>
18	#include <linux/mutex.h>
19	#include <linux/memblock.h>
20	#include <linux/sysfs.h>
21	#include <linux/slab.h>
22	#include <linux/sched/mm.h>
23	#include <linux/mmdebug.h>
24	#include <linux/sched/signal.h>
25	#include <linux/rmap.h>
26	#include <linux/string_helpers.h>
27	#include <linux/swap.h>
28	#include <linux/swapops.h>
29	#include <linux/jhash.h>
30	#include <linux/numa.h>
31	#include <linux/llist.h>
32	#include <linux/cma.h>
33	#include <linux/migrate.h>
34	#include <linux/nospec.h>
35	#include <linux/delayacct.h>
36	#include <linux/memory.h>
37	#include <linux/mm_inline.h>
38
39	#include <asm/page.h>
40	#include <asm/pgalloc.h>
41	#include <asm/tlb.h>
42
43	#include <linux/io.h>
44	#include <linux/hugetlb.h>
45	#include <linux/hugetlb_cgroup.h>
46	#include <linux/node.h>
47	#include <linux/page_owner.h>
48	#include "internal.h"
49	#include "hugetlb_vmemmap.h"
50
51	int hugetlb_max_hstate __read_mostly;
52	unsigned int default_hstate_idx;
53	struct hstate hstates[HUGE_MAX_HSTATE];
54
#ifdef CONFIG_CMA
/* Per-node CMA areas for hugetlb, indexed by node id. */
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;

/*
 * Ask cma_pages_valid() whether the (1 << order) pages starting at
 * &folio->page belong to the hugetlb CMA area selected by the folio's
 * node id.
 */
static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
	return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
				1 << order);
}
#else
/* Without CONFIG_CMA there is no hugetlb CMA area, so always false. */
static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
{
	return false;
}
#endif
69	static unsigned long hugetlb_cma_size __initdata;
70
71	__initdata LIST_HEAD(huge_boot_pages);
72
73	/ for command line parsing /
74	static struct hstate * __initdata parsed_hstate;
75	static unsigned long __initdata default_hstate_max_huge_pages;
76	static bool __initdata parsed_valid_hugepagesz = true;
77	static bool __initdata parsed_default_hugepagesz;
78	static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
79
80	/*
81	* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
82	* free_huge_pages, and surplus_huge_pages.
83	*/
84	DEFINE_SPINLOCK(hugetlb_lock);
85
86	/*
87	* Serializes faults on the same logical page. This is used to
88	* prevent spurious OOMs when the hugepage pool is fully utilized.
89	*/
90	static int num_fault_mutexes;
91	struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
92
93	/ Forward declaration /
94	static int hugetlb_acct_memory(struct hstate h, long* delta);
95	static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
96	static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
97	static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
98	static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
99	unsigned long start, unsigned long end);
100	static struct resv_map vma_resv_map(struct* vm_area_struct *vma);
101
102	static inline bool subpool_is_free(struct hugepage_subpool *spool)
103	{
104	if (spool->count)
105	return false;
106	if (spool->max_hpages != -`1`)
107	return spool->used_hpages == `0`;
108	if (spool->min_hpages != -`1`)
109	return spool->rsv_hpages == spool->min_hpages;
110
111	return true;
112	}
113
114	static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
115	unsigned long irq_flags)
116	{
117	spin_unlock_irqrestore(lock: &spool->lock, flags: irq_flags);
118
119	/ If no pages are used, and no other handles to the subpool*
120	* remain, give up any reservations based on minimum size and
121	* free the subpool */
122	if (subpool_is_free(spool)) {
123	if (spool->min_hpages != -`1`)
124	hugetlb_acct_memory(h: spool->hstate,
125	delta: -spool->min_hpages);
126	kfree(objp: spool);
127	}
128	}
129
130	struct hugepage_subpool hugepage_new_subpool(struct* hstate h, long* max_hpages,
131	long min_hpages)
132	{
133	struct hugepage_subpool *spool;
134
135	spool = kzalloc(size: sizeof(*spool), GFP_KERNEL);
136	if (!spool)
137	return NULL;
138
139	spin_lock_init(&spool->lock);
140	spool->count = `1`;
141	spool->max_hpages = max_hpages;
142	spool->hstate = h;
143	spool->min_hpages = min_hpages;
144
145	if (min_hpages != -`1` && hugetlb_acct_memory(h, delta: min_hpages)) {
146	kfree(objp: spool);
147	return NULL;
148	}
149	spool->rsv_hpages = min_hpages;
150
151	return spool;
152	}
153
154	void hugepage_put_subpool(struct hugepage_subpool *spool)
155	{
156	unsigned long flags;
157
158	spin_lock_irqsave(&spool->lock, flags);
159	BUG_ON(!spool->count);
160	spool->count--;
161	unlock_or_release_subpool(spool, irq_flags: flags);
162	}
163
164	/*
165	* Subpool accounting for allocating and reserving pages.
166	* Return -ENOMEM if there are not enough resources to satisfy the
167	* request. Otherwise, return the number of pages by which the
168	* global pools must be adjusted (upward). The returned value may
169	* only be different than the passed value (delta) in the case where
170	* a subpool minimum size must be maintained.
171	*/
172	static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
173	long delta)
174	{
175	long ret = delta;
176
177	if (!spool)
178	return ret;
179
180	spin_lock_irq(lock: &spool->lock);
181
182	if (spool->max_hpages != -`1`) { / maximum size accounting /
183	if ((spool->used_hpages + delta) <= spool->max_hpages)
184	spool->used_hpages += delta;
185	else {
186	ret = -ENOMEM;
187	goto unlock_ret;
188	}
189	}
190
191	/ minimum size accounting /
192	if (spool->min_hpages != -`1` && spool->rsv_hpages) {
193	if (delta > spool->rsv_hpages) {
194	/*
195	* Asking for more reserves than those already taken on
196	* behalf of subpool. Return difference.
197	*/
198	ret = delta - spool->rsv_hpages;
199	spool->rsv_hpages = `0`;
200	} else {
201	ret = `0`; / reserves already accounted for /
202	spool->rsv_hpages -= delta;
203	}
204	}
205
206	unlock_ret:
207	spin_unlock_irq(lock: &spool->lock);
208	return ret;
209	}
210
211	/*
212	* Subpool accounting for freeing and unreserving pages.
213	* Return the number of global page reservations that must be dropped.
214	* The return value may only be different than the passed value (delta)
215	* in the case where a subpool minimum size must be maintained.
216	*/
217	static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
218	long delta)
219	{
220	long ret = delta;
221	unsigned long flags;
222
223	if (!spool)
224	return delta;
225
226	spin_lock_irqsave(&spool->lock, flags);
227
228	if (spool->max_hpages != -`1`) / maximum size accounting /
229	spool->used_hpages -= delta;
230
231	/ minimum size accounting /
232	if (spool->min_hpages != -`1` && spool->used_hpages < spool->min_hpages) {
233	if (spool->rsv_hpages + delta <= spool->min_hpages)
234	ret = `0`;
235	else
236	ret = spool->rsv_hpages + delta - spool->min_hpages;
237
238	spool->rsv_hpages += delta;
239	if (spool->rsv_hpages > spool->min_hpages)
240	spool->rsv_hpages = spool->min_hpages;
241	}
242
243	/*
244	* If hugetlbfs_put_super couldn't free spool due to an outstanding
245	* quota reference, free it now.
246	*/
247	unlock_or_release_subpool(spool, irq_flags: flags);
248
249	return ret;
250	}
251
252	static inline struct hugepage_subpool subpool_inode(struct* inode *inode)
253	{
254	return HUGETLBFS_SB(sb: inode->i_sb)->spool;
255	}
256
257	static inline struct hugepage_subpool subpool_vma(struct* vm_area_struct *vma)
258	{
259	return subpool_inode(inode: file_inode(f: vma->vm_file));
260	}
261
262	/*
263	* hugetlb vma_lock helper routines
264	*/
265	void hugetlb_vma_lock_read(struct vm_area_struct *vma)
266	{
267	if (__vma_shareable_lock(vma)) {
268	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
269
270	down_read(sem: &vma_lock->rw_sema);
271	} else if (__vma_private_lock(vma)) {
272	struct resv_map *resv_map = vma_resv_map(vma);
273
274	down_read(sem: &resv_map->rw_sema);
275	}
276	}
277
278	void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
279	{
280	if (__vma_shareable_lock(vma)) {
281	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
282
283	up_read(sem: &vma_lock->rw_sema);
284	} else if (__vma_private_lock(vma)) {
285	struct resv_map *resv_map = vma_resv_map(vma);
286
287	up_read(sem: &resv_map->rw_sema);
288	}
289	}
290
291	void hugetlb_vma_lock_write(struct vm_area_struct *vma)
292	{
293	if (__vma_shareable_lock(vma)) {
294	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
295
296	down_write(sem: &vma_lock->rw_sema);
297	} else if (__vma_private_lock(vma)) {
298	struct resv_map *resv_map = vma_resv_map(vma);
299
300	down_write(sem: &resv_map->rw_sema);
301	}
302	}
303
304	void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
305	{
306	if (__vma_shareable_lock(vma)) {
307	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
308
309	up_write(sem: &vma_lock->rw_sema);
310	} else if (__vma_private_lock(vma)) {
311	struct resv_map *resv_map = vma_resv_map(vma);
312
313	up_write(sem: &resv_map->rw_sema);
314	}
315	}
316
317	int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
318	{
319
320	if (__vma_shareable_lock(vma)) {
321	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
322
323	return down_write_trylock(sem: &vma_lock->rw_sema);
324	} else if (__vma_private_lock(vma)) {
325	struct resv_map *resv_map = vma_resv_map(vma);
326
327	return down_write_trylock(sem: &resv_map->rw_sema);
328	}
329
330	return `1`;
331	}
332
333	void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
334	{
335	if (__vma_shareable_lock(vma)) {
336	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
337
338	lockdep_assert_held(&vma_lock->rw_sema);
339	} else if (__vma_private_lock(vma)) {
340	struct resv_map *resv_map = vma_resv_map(vma);
341
342	lockdep_assert_held(&resv_map->rw_sema);
343	}
344	}
345
346	void hugetlb_vma_lock_release(struct kref *kref)
347	{
348	struct hugetlb_vma_lock *vma_lock = container_of(kref,
349	struct hugetlb_vma_lock, refs);
350
351	kfree(objp: vma_lock);
352	}
353
354	static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
355	{
356	struct vm_area_struct *vma = vma_lock->vma;
357
358	/*
359	* vma_lock structure may or not be released as a result of put,
360	* it certainly will no longer be attached to vma so clear pointer.
361	* Semaphore synchronizes access to vma_lock->vma field.
362	*/
363	vma_lock->vma = NULL;
364	vma->vm_private_data = NULL;
365	up_write(sem: &vma_lock->rw_sema);
366	kref_put(kref: &vma_lock->refs, release: hugetlb_vma_lock_release);
367	}
368
369	static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
370	{
371	if (__vma_shareable_lock(vma)) {
372	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
373
374	__hugetlb_vma_unlock_write_put(vma_lock);
375	} else if (__vma_private_lock(vma)) {
376	struct resv_map *resv_map = vma_resv_map(vma);
377
378	/ no free for anon vmas, but still need to unlock /
379	up_write(sem: &resv_map->rw_sema);
380	}
381	}
382
383	static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
384	{
385	/*
386	* Only present in sharable vmas.
387	*/
388	if (!vma \|\| !__vma_shareable_lock(vma))
389	return;
390
391	if (vma->vm_private_data) {
392	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
393
394	down_write(sem: &vma_lock->rw_sema);
395	__hugetlb_vma_unlock_write_put(vma_lock);
396	}
397	}
398
399	static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
400	{
401	struct hugetlb_vma_lock *vma_lock;
402
403	/ Only establish in (flags) sharable vmas /
404	if (!vma \|\| !(vma->vm_flags & VM_MAYSHARE))
405	return;
406
407	/ Should never get here with non-NULL vm_private_data /
408	if (vma->vm_private_data)
409	return;
410
411	vma_lock = kmalloc(size: sizeof(*vma_lock), GFP_KERNEL);
412	if (!vma_lock) {
413	/*
414	* If we can not allocate structure, then vma can not
415	* participate in pmd sharing. This is only a possible
416	* performance enhancement and memory saving issue.
417	* However, the lock is also used to synchronize page
418	* faults with truncation. If the lock is not present,
419	* unlikely races could leave pages in a file past i_size
420	* until the file is removed. Warn in the unlikely case of
421	* allocation failure.
422	*/
423	pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
424	return;
425	}
426
427	kref_init(kref: &vma_lock->refs);
428	init_rwsem(&vma_lock->rw_sema);
429	vma_lock->vma = vma;
430	vma->vm_private_data = vma_lock;
431	}
432
433	/ Helper that removes a struct file_region from the resv_map cache and returns*
434	* it for use.
435	*/
436	static struct file_region *
437	get_file_region_entry_from_cache(struct resv_map resv, long* from, long to)
438	{
439	struct file_region *nrg;
440
441	VM_BUG_ON(resv->region_cache_count <= `0`);
442
443	resv->region_cache_count--;
444	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
445	list_del(entry: &nrg->link);
446
447	nrg->from = from;
448	nrg->to = to;
449
450	return nrg;
451	}
452
/*
 * Copy the cgroup uncharge info (reservation_counter and css) from @rg to
 * @nrg, taking an extra css reference for the copy when a css is set.
 * Compiles to nothing without CONFIG_CGROUP_HUGETLB.
 */
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}
463
/* Helper that records hugetlb_cgroup uncharge info.
 *
 * With a non-NULL @h_cg, @nrg is pointed at the cgroup's reservation counter
 * for hstate @h and at its css (taking one css reference), and
 * resv->pages_per_hpage is initialised on first use.  With a NULL @h_cg both
 * fields of @nrg are cleared.  Compiles to nothing without
 * CONFIG_CGROUP_HUGETLB.
 */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller will hold exactly one h_cg->css reference for the
		 * whole contiguous reservation region. But this area might be
		 * scattered when there are already some file_regions reside in
		 * it. As a result, many file_regions may share only one css
		 * reference. In order to ensure that one file_region must hold
		 * exactly one h_cg->css reference, we should do css_get for
		 * each file_region and leave the reference held by caller
		 * untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/* pages_per_hpage should be the same for all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}
498
/*
 * Drop the css reference held by @rg, if it has one.  Compiles to nothing
 * without CONFIG_CGROUP_HUGETLB.
 */
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (rg->css)
		css_put(rg->css);
#endif
}
506
/*
 * Two file_regions carry the same uncharge info when both their
 * reservation_counter and their css pointers match.  Without
 * CONFIG_CGROUP_HUGETLB there is no such info, so any two regions match.
 */
static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (rg->reservation_counter != org->reservation_counter)
		return false;

	return rg->css == org->css;
#else
	return true;
#endif
}
518
519	static void coalesce_file_region(struct resv_map resv, struct* file_region *rg)
520	{
521	struct file_region nrg, prg;
522
523	prg = list_prev_entry(rg, link);
524	if (&prg->link != &resv->regions && prg->to == rg->from &&
525	has_same_uncharge_info(rg: prg, org: rg)) {
526	prg->to = rg->to;
527
528	list_del(entry: &rg->link);
529	put_uncharge_info(rg);
530	kfree(objp: rg);
531
532	rg = prg;
533	}
534
535	nrg = list_next_entry(rg, link);
536	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
537	has_same_uncharge_info(rg: nrg, org: rg)) {
538	nrg->from = rg->from;
539
540	list_del(entry: &rg->link);
541	put_uncharge_info(rg);
542	kfree(objp: rg);
543	}
544	}
545
546	static inline long
547	hugetlb_resv_map_add(struct resv_map map, struct* list_head rg, long* from,
548	long to, struct hstate h, struct* hugetlb_cgroup *cg,
549	long *regions_needed)
550	{
551	struct file_region *nrg;
552
553	if (!regions_needed) {
554	nrg = get_file_region_entry_from_cache(resv: map, from, to);
555	record_hugetlb_cgroup_uncharge_info(h_cg: cg, h, resv: map, nrg);
556	list_add(new: &nrg->link, head: rg);
557	coalesce_file_region(resv: map, rg: nrg);
558	} else
559	*regions_needed += `1`;
560
561	return to - from;
562	}
563
564	/*
565	* Must be called with resv->lock held.
566	*
567	* Calling this with regions_needed != NULL will count the number of pages
568	* to be added but will not modify the linked list. And regions_needed will
569	* indicate the number of file_regions needed in the cache to carry out to add
570	* the regions for this range.
571	*/
572	static long add_reservation_in_range(struct resv_map resv, long* f, long t,
573	struct hugetlb_cgroup *h_cg,
574	struct hstate h, long* *regions_needed)
575	{
576	long add = `0`;
577	struct list_head *head = &resv->regions;
578	long last_accounted_offset = f;
579	struct file_region iter, trg = NULL;
580	struct list_head *rg = NULL;
581
582	if (regions_needed)
583	*regions_needed = `0`;
584
585	/ In this loop, we essentially handle an entry for the range*
586	* [last_accounted_offset, iter->from), at every iteration, with some
587	* bounds checking.
588	*/
589	list_for_each_entry_safe(iter, trg, head, link) {
590	/ Skip irrelevant regions that start before our range. /
591	if (iter->from < f) {
592	/ If this region ends after the last accounted offset,*
593	* then we need to update last_accounted_offset.
594	*/
595	if (iter->to > last_accounted_offset)
596	last_accounted_offset = iter->to;
597	continue;
598	}
599
600	/ When we find a region that starts beyond our range, we've*
601	* finished.
602	*/
603	if (iter->from >= t) {
604	rg = iter->link.prev;
605	break;
606	}
607
608	/ Add an entry for last_accounted_offset -> iter->from, and*
609	* update last_accounted_offset.
610	*/
611	if (iter->from > last_accounted_offset)
612	add += hugetlb_resv_map_add(map: resv, rg: iter->link.prev,
613	from: last_accounted_offset,
614	to: iter->from, h, cg: h_cg,
615	regions_needed);
616
617	last_accounted_offset = iter->to;
618	}
619
620	/ Handle the case where our range extends beyond*
621	* last_accounted_offset.
622	*/
623	if (!rg)
624	rg = head->prev;
625	if (last_accounted_offset < t)
626	add += hugetlb_resv_map_add(map: resv, rg, from: last_accounted_offset,
627	to: t, h, cg: h_cg, regions_needed);
628
629	return add;
630	}
631
632	/ Must be called with resv->lock acquired. Will drop lock to allocate entries.*
633	*/
634	static int allocate_file_region_entries(struct resv_map *resv,
635	int regions_needed)
636	__must_hold(&resv->lock)
637	{
638	LIST_HEAD(allocated_regions);
639	int to_allocate = `0`, i = `0`;
640	struct file_region trg = NULL, rg = NULL;
641
642	VM_BUG_ON(regions_needed < `0`);
643
644	/*
645	* Check for sufficient descriptors in the cache to accommodate
646	* the number of in progress add operations plus regions_needed.
647	*
648	* This is a while loop because when we drop the lock, some other call
649	* to region_add or region_del may have consumed some region_entries,
650	* so we keep looping here until we finally have enough entries for
651	* (adds_in_progress + regions_needed).
652	*/
653	while (resv->region_cache_count <
654	(resv->adds_in_progress + regions_needed)) {
655	to_allocate = resv->adds_in_progress + regions_needed -
656	resv->region_cache_count;
657
658	/ At this point, we should have enough entries in the cache*
659	* for all the existing adds_in_progress. We should only be
660	* needing to allocate for regions_needed.
661	*/
662	VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
663
664	spin_unlock(lock: &resv->lock);
665	for (i = `0`; i < to_allocate; i++) {
666	trg = kmalloc(size: sizeof(*trg), GFP_KERNEL);
667	if (!trg)
668	goto out_of_memory;
669	list_add(new: &trg->link, head: &allocated_regions);
670	}
671
672	spin_lock(lock: &resv->lock);
673
674	list_splice(list: &allocated_regions, head: &resv->region_cache);
675	resv->region_cache_count += to_allocate;
676	}
677
678	return `0`;
679
680	out_of_memory:
681	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
682	list_del(entry: &rg->link);
683	kfree(objp: rg);
684	}
685	return -ENOMEM;
686	}
687
688	/*
689	* Add the huge page range represented by [f, t) to the reserve
690	* map. Regions will be taken from the cache to fill in this range.
691	* Sufficient regions should exist in the cache due to the previous
692	* call to region_chg with the same range, but in some cases the cache will not
693	* have sufficient entries due to races with other code doing region_add or
694	* region_del. The extra needed entries will be allocated.
695	*
696	* regions_needed is the out value provided by a previous call to region_chg.
697	*
698	* Return the number of new huge pages added to the map. This number is greater
699	* than or equal to zero. If file_region entries needed to be allocated for
700	* this operation and we were not able to allocate, it returns -ENOMEM.
701	* region_add of regions of length 1 never allocate file_regions and cannot
702	* fail; region_chg will always allocate at least 1 entry and a region_add for
703	* 1 page will only require at most 1 entry.
704	*/
705	static long region_add(struct resv_map resv, long* f, long t,
706	long in_regions_needed, struct hstate *h,
707	struct hugetlb_cgroup *h_cg)
708	{
709	long add = `0`, actual_regions_needed = `0`;
710
711	spin_lock(lock: &resv->lock);
712	retry:
713
714	/ Count how many regions are actually needed to execute this add. /
715	add_reservation_in_range(resv, f, t, NULL, NULL,
716	regions_needed: &actual_regions_needed);
717
718	/*
719	* Check for sufficient descriptors in the cache to accommodate
720	* this add operation. Note that actual_regions_needed may be greater
721	* than in_regions_needed, as the resv_map may have been modified since
722	* the region_chg call. In this case, we need to make sure that we
723	* allocate extra entries, such that we have enough for all the
724	* existing adds_in_progress, plus the excess needed for this
725	* operation.
726	*/
727	if (actual_regions_needed > in_regions_needed &&
728	resv->region_cache_count <
729	resv->adds_in_progress +
730	(actual_regions_needed - in_regions_needed)) {
731	/ region_add operation of range 1 should never need to*
732	* allocate file_region entries.
733	*/
734	VM_BUG_ON(t - f <= `1`);
735
736	if (allocate_file_region_entries(
737	resv, regions_needed: actual_regions_needed - in_regions_needed)) {
738	return -ENOMEM;
739	}
740
741	goto retry;
742	}
743
744	add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
745
746	resv->adds_in_progress -= in_regions_needed;
747
748	spin_unlock(lock: &resv->lock);
749	return add;
750	}
751
752	/*
753	* Examine the existing reserve map and determine how many
754	* huge pages in the specified range [f, t) are NOT currently
755	* represented. This routine is called before a subsequent
756	* call to region_add that will actually modify the reserve
757	* map to add the specified range [f, t). region_chg does
758	* not change the number of huge pages represented by the
759	* map. A number of new file_region structures is added to the cache as a
760	* placeholder, for the subsequent region_add call to use. At least 1
761	* file_region structure is added.
762	*
763	* out_regions_needed is the number of regions added to the
764	* resv->adds_in_progress. This value needs to be provided to a follow up call
765	* to region_add or region_abort for proper accounting.
766	*
767	* Returns the number of huge pages that need to be added to the existing
768	* reservation map for the range [f, t). This number is greater or equal to
769	* zero. -ENOMEM is returned if a new file_region structure or cache entry
770	* is needed and can not be allocated.
771	*/
772	static long region_chg(struct resv_map resv, long* f, long t,
773	long *out_regions_needed)
774	{
775	long chg = `0`;
776
777	spin_lock(lock: &resv->lock);
778
779	/ Count how many hugepages in this range are NOT represented. /
780	chg = add_reservation_in_range(resv, f, t, NULL, NULL,
781	regions_needed: out_regions_needed);
782
783	if (*out_regions_needed == `0`)
784	*out_regions_needed = `1`;
785
786	if (allocate_file_region_entries(resv, regions_needed: *out_regions_needed))
787	return -ENOMEM;
788
789	resv->adds_in_progress += *out_regions_needed;
790
791	spin_unlock(lock: &resv->lock);
792	return chg;
793	}
794
795	/*
796	* Abort the in progress add operation. The adds_in_progress field
797	* of the resv_map keeps track of the operations in progress between
798	* calls to region_chg and region_add. Operations are sometimes
799	* aborted after the call to region_chg. In such cases, region_abort
800	* is called to decrement the adds_in_progress counter. regions_needed
801	* is the value returned by the region_chg call, it is used to decrement
802	* the adds_in_progress counter.
803	*
804	* NOTE: The range arguments [f, t) are not needed or used in this
805	* routine. They are kept to make reading the calling code easier as
806	* arguments will match the associated region_chg call.
807	*/
808	static void region_abort(struct resv_map resv, long* f, long t,
809	long regions_needed)
810	{
811	spin_lock(lock: &resv->lock);
812	VM_BUG_ON(!resv->region_cache_count);
813	resv->adds_in_progress -= regions_needed;
814	spin_unlock(lock: &resv->lock);
815	}
816
817	/*
818	* Delete the specified range [f, t) from the reserve map. If the
819	* t parameter is LONG_MAX, this indicates that ALL regions after f
820	* should be deleted. Locate the regions which intersect [f, t)
821	* and either trim, delete or split the existing regions.
822	*
823	* Returns the number of huge pages deleted from the reserve map.
824	* In the normal case, the return value is zero or more. In the
825	* case where a region must be split, a new region descriptor must
826	* be allocated. If the allocation fails, -ENOMEM will be returned.
827	* NOTE: If the parameter t == LONG_MAX, then we will never split
828	* a region and possibly return -ENOMEM. Callers specifying
829	* t == LONG_MAX do not need to check for -ENOMEM error.
830	*/
831	static long region_del(struct resv_map resv, long* f, long t)
832	{
833	struct list_head *head = &resv->regions;
834	struct file_region rg, trg;
835	struct file_region *nrg = NULL;
836	long del = `0`;
837
838	retry:
839	spin_lock(lock: &resv->lock);
840	list_for_each_entry_safe(rg, trg, head, link) {
841	/*
842	* Skip regions before the range to be deleted. file_region
843	* ranges are normally of the form [from, to). However, there
844	* may be a "placeholder" entry in the map which is of the form
845	* (from, to) with from == to. Check for placeholder entries
846	* at the beginning of the range to be deleted.
847	*/
848	if (rg->to <= f && (rg->to != rg->from \|\| rg->to != f))
849	continue;
850
851	if (rg->from >= t)
852	break;
853
854	if (f > rg->from && t < rg->to) { / Must split region /
855	/*
856	* Check for an entry in the cache before dropping
857	* lock and attempting allocation.
858	*/
859	if (!nrg &&
860	resv->region_cache_count > resv->adds_in_progress) {
861	nrg = list_first_entry(&resv->region_cache,
862	struct file_region,
863	link);
864	list_del(entry: &nrg->link);
865	resv->region_cache_count--;
866	}
867
868	if (!nrg) {
869	spin_unlock(lock: &resv->lock);
870	nrg = kmalloc(size: sizeof(*nrg), GFP_KERNEL);
871	if (!nrg)
872	return -ENOMEM;
873	goto retry;
874	}
875
876	del += t - f;
877	hugetlb_cgroup_uncharge_file_region(
878	resv, rg, nr_pages: t - f, region_del: false);
879
880	/ New entry for end of split region /
881	nrg->from = t;
882	nrg->to = rg->to;
883
884	copy_hugetlb_cgroup_uncharge_info(nrg, rg);
885
886	INIT_LIST_HEAD(list: &nrg->link);
887
888	/ Original entry is trimmed /
889	rg->to = f;
890
891	list_add(new: &nrg->link, head: &rg->link);
892	nrg = NULL;
893	break;
894	}
895
896	if (f <= rg->from && t >= rg->to) { / Remove entire region /
897	del += rg->to - rg->from;
898	hugetlb_cgroup_uncharge_file_region(resv, rg,
899	nr_pages: rg->to - rg->from, region_del: true);
900	list_del(entry: &rg->link);
901	kfree(objp: rg);
902	continue;
903	}
904
905	if (f <= rg->from) { / Trim beginning of region /
906	hugetlb_cgroup_uncharge_file_region(resv, rg,
907	nr_pages: t - rg->from, region_del: false);
908
909	del += t - rg->from;
910	rg->from = t;
911	} else { / Trim end of region /
912	hugetlb_cgroup_uncharge_file_region(resv, rg,
913	nr_pages: rg->to - f, region_del: false);
914
915	del += rg->to - f;
916	rg->to = f;
917	}
918	}
919
920	spin_unlock(lock: &resv->lock);
921	kfree(objp: nrg);
922	return del;
923	}
924
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 *
 * If neither the subpool nor the global accounting accepts the page, a
 * warning is logged and nothing else is done.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;
	bool reserved = false;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (rsv_adjust > 0) {
		/* The subpool asks for a global adjustment: charge one page. */
		struct hstate *h = hstate_inode(inode);

		if (!hugetlb_acct_memory(h, 1))
			reserved = true;
	} else if (!rsv_adjust) {
		/* Covered by the subpool's own reserve. */
		reserved = true;
	}

	if (!reserved)
		pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
953
954	/*
955	* Count and return the number of huge pages in the reserve map
956	* that intersect with the range [f, t).
957	*/
958	static long region_count(struct resv_map resv, long* f, long t)
959	{
960	struct list_head *head = &resv->regions;
961	struct file_region *rg;
962	long chg = `0`;
963
964	spin_lock(lock: &resv->lock);
965	/ Locate each segment we overlap with, and count that overlap. /
966	list_for_each_entry(rg, head, link) {
967	long seg_from;
968	long seg_to;
969
970	if (rg->to <= f)
971	continue;
972	if (rg->from >= t)
973	break;
974
975	seg_from = max(rg->from, f);
976	seg_to = min(rg->to, t);
977
978	chg += seg_to - seg_from;
979	}
980	spin_unlock(lock: &resv->lock);
981
982	return chg;
983	}
984
985	/*
986	* Convert the address within this vma to the page offset within
987	* the mapping, huge page units here.
988	*/
989	static pgoff_t vma_hugecache_offset(struct hstate *h,
990	struct vm_area_struct vma, unsigned* long address)
991	{
992	return ((address - vma->vm_start) >> huge_page_shift(h)) +
993	(vma->vm_pgoff >> huge_page_order(h));
994	}
995
996	/**
997	* vma_kernel_pagesize - Page size granularity for this VMA.
998	* @vma: The user mapping.
999	*
1000	* Folios in this VMA will be aligned to, and at least the size of the
1001	* number of bytes returned by this function.
1002	*
1003	* Return: The default size of the folios allocated when backing a VMA.
1004	*/
1005	unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1006	{
1007	if (vma->vm_ops && vma->vm_ops->pagesize)
1008	return vma->vm_ops->pagesize(vma);
1009	return PAGE_SIZE;
1010	}
1011	EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
1012
1013	/*
1014	* Return the page size being used by the MMU to back a VMA. In the majority
1015	* of cases, the page size used by the kernel matches the MMU size. On
1016	* architectures where it differs, an architecture-specific 'strong'
1017	* version of this symbol is required.
1018	*/
1019	__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
1020	{
1021	return vma_kernel_pagesize(vma);
1022	}
1023
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
1032
1033	/*
1034	* These helpers are used to track how many pages are reserved for
1035	* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
1036	* is guaranteed to have their future faults succeed.
1037	*
1038	* With the exception of hugetlb_dup_vma_private() which is called at fork(),
1039	* the reserve counters are updated with the hugetlb_lock held. It is safe
1040	* to reset the VMA at fork() time as it is not in use yet and there is no
1041	* chance of the global counters getting corrupted as a result of the values.
1042	*
1043	* The private mapping reservation is represented in a subtly different
1044	* manner to a shared mapping. A shared mapping has a region map associated
1045	* with the underlying file, this region map represents the backing file
1046	* pages which have ever had a reservation assigned which this persists even
1047	* after the page is instantiated. A private mapping has a region map
1048	* associated with the original mmap which is attached to all VMAs which
1049	* reference it, this region map represents those offsets which have consumed
1050	* reservation ie. where pages have been instantiated.
1051	*/
1052	static unsigned long get_vma_private_data(struct vm_area_struct *vma)
1053	{
1054	return (unsigned long)vma->vm_private_data;
1055	}
1056
1057	static void set_vma_private_data(struct vm_area_struct *vma,
1058	unsigned long value)
1059	{
1060	vma->vm_private_data = (void *)value;
1061	}
1062
/*
 * Record in @resv_map where hugetlb cgroup reservations should later be
 * uncharged from.  If either @h_cg or @h is NULL the three bookkeeping
 * fields are reset (NULL/0); otherwise they are pointed at @h_cg's reserved
 * counter for @h, the base-page count of one huge page, and @h_cg's css.
 *
 * Compiles to an empty function without CONFIG_CGROUP_HUGETLB.
 */
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
					  struct hugetlb_cgroup *h_cg,
					  struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (!h_cg || !h) {
		resv_map->reservation_counter = NULL;
		resv_map->pages_per_hpage = 0;
		resv_map->css = NULL;
	} else {
		resv_map->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		resv_map->pages_per_hpage = pages_per_huge_page(h);
		resv_map->css = &h_cg->css;
	}
#endif
}
1081
1082	struct resv_map resv_map_alloc(void*)
1083	{
1084	struct resv_map resv_map = kmalloc(size: sizeof(resv_map), GFP_KERNEL);
1085	struct file_region rg = kmalloc(size: sizeof(rg), GFP_KERNEL);
1086
1087	if (!resv_map \|\| !rg) {
1088	kfree(objp: resv_map);
1089	kfree(objp: rg);
1090	return NULL;
1091	}
1092
1093	kref_init(kref: &resv_map->refs);
1094	spin_lock_init(&resv_map->lock);
1095	INIT_LIST_HEAD(list: &resv_map->regions);
1096	init_rwsem(&resv_map->rw_sema);
1097
1098	resv_map->adds_in_progress = `0`;
1099	/*
1100	* Initialize these to 0. On shared mappings, 0's here indicate these
1101	* fields don't do cgroup accounting. On private mappings, these will be
1102	* re-initialized to the proper values, to indicate that hugetlb cgroup
1103	* reservations are to be un-charged from here.
1104	*/
1105	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
1106
1107	INIT_LIST_HEAD(list: &resv_map->region_cache);
1108	list_add(new: &rg->link, head: &resv_map->region_cache);
1109	resv_map->region_cache_count = `1`;
1110
1111	return resv_map;
1112	}
1113
1114	void resv_map_release(struct kref *ref)
1115	{
1116	struct resv_map resv_map = container_of(ref, struct* resv_map, refs);
1117	struct list_head *head = &resv_map->region_cache;
1118	struct file_region rg, trg;
1119
1120	/ Clear out any active regions before we release the map. /
1121	region_del(resv: resv_map, f: `0`, LONG_MAX);
1122
1123	/ ... and any entries left in the cache /
1124	list_for_each_entry_safe(rg, trg, head, link) {
1125	list_del(entry: &rg->link);
1126	kfree(objp: rg);
1127	}
1128
1129	VM_BUG_ON(resv_map->adds_in_progress);
1130
1131	kfree(objp: resv_map);
1132	}
1133
1134	static inline struct resv_map inode_resv_map(struct* inode *inode)
1135	{
1136	/*
1137	* At inode evict time, i_mapping may not point to the original
1138	* address space within the inode. This original address space
1139	* contains the pointer to the resv_map. So, always use the
1140	* address space embedded within the inode.
1141	* The VERY common case is inode->mapping == &inode->i_data but,
1142	* this may not be true for device special inodes.
1143	*/
1144	return (struct resv_map *)(&inode->i_data)->private_data;
1145	}
1146
1147	static struct resv_map vma_resv_map(struct* vm_area_struct *vma)
1148	{
1149	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1150	if (vma->vm_flags & VM_MAYSHARE) {
1151	struct address_space *mapping = vma->vm_file->f_mapping;
1152	struct inode *inode = mapping->host;
1153
1154	return inode_resv_map(inode);
1155
1156	} else {
1157	return (struct resv_map *)(get_vma_private_data(vma) &
1158	~HPAGE_RESV_MASK);
1159	}
1160	}
1161
1162	static void set_vma_resv_map(struct vm_area_struct vma, struct* resv_map *map)
1163	{
1164	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1165	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1166
1167	set_vma_private_data(vma, value: (unsigned long)map);
1168	}
1169
1170	static void set_vma_resv_flags(struct vm_area_struct vma, unsigned* long flags)
1171	{
1172	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1173	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1174
1175	set_vma_private_data(vma, value: get_vma_private_data(vma) \| flags);
1176	}
1177
/* Return non-zero if any bit of @flag is set in the VMA's private data. */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}
1184
1185	void hugetlb_dup_vma_private(struct vm_area_struct *vma)
1186	{
1187	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1188	/*
1189	* Clear vm_private_data
1190	* - For shared mappings this is a per-vma semaphore that may be
1191	* allocated in a subsequent call to hugetlb_vm_op_open.
1192	* Before clearing, make sure pointer is not associated with vma
1193	* as this will leak the structure. This is the case when called
1194	* via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
1195	* been called to allocate a new structure.
1196	* - For MAP_PRIVATE mappings, this is the reserve map which does
1197	* not apply to children. Faults generated by the children are
1198	* not guaranteed to succeed, even if read-only.
1199	*/
1200	if (vma->vm_flags & VM_MAYSHARE) {
1201	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1202
1203	if (vma_lock && vma_lock->vma != vma)
1204	vma->vm_private_data = NULL;
1205	} else
1206	vma->vm_private_data = NULL;
1207	}
1208
1209	/*
1210	* Reset and decrement one ref on hugepage private reservation.
1211	* Called with mm->mmap_lock writer semaphore held.
1212	* This function should be only used by move_vma() and operate on
1213	* same sized vma. It should never come here with last ref on the
1214	* reservation.
1215	*/
1216	void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1217	{
1218	/*
1219	* Clear the old hugetlb private page reservation.
1220	* It has already been transferred to new_vma.
1221	*
1222	* During a mremap() operation of a hugetlb vma we call move_vma()
1223	* which copies vma into new_vma and unmaps vma. After the copy
1224	* operation both new_vma and vma share a reference to the resv_map
1225	* struct, and at that point vma is about to be unmapped. We don't
1226	* want to return the reservation to the pool at unmap of vma because
1227	* the reservation still lives on in new_vma, so simply decrement the
1228	* ref here and remove the resv_map reference from this vma.
1229	*/
1230	struct resv_map *reservations = vma_resv_map(vma);
1231
1232	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1233	resv_map_put_hugetlb_cgroup_uncharge_info(resv_map: reservations);
1234	kref_put(kref: &reservations->refs, release: resv_map_release);
1235	}
1236
1237	hugetlb_dup_vma_private(vma);
1238	}
1239
1240	/ Returns true if the VMA has associated reserve pages /
1241	static bool vma_has_reserves(struct vm_area_struct vma, long* chg)
1242	{
1243	if (vma->vm_flags & VM_NORESERVE) {
1244	/*
1245	* This address is already reserved by other process(chg == 0),
1246	* so, we should decrement reserved count. Without decrementing,
1247	* reserve count remains after releasing inode, because this
1248	* allocated page will go into page cache and is regarded as
1249	* coming from reserved pool in releasing step. Currently, we
1250	* don't have any other solution to deal with this situation
1251	* properly, so add work-around here.
1252	*/
1253	if (vma->vm_flags & VM_MAYSHARE && chg == `0`)
1254	return true;
1255	else
1256	return false;
1257	}
1258
1259	/ Shared mappings always use reserves /
1260	if (vma->vm_flags & VM_MAYSHARE) {
1261	/*
1262	* We know VM_NORESERVE is not set. Therefore, there SHOULD
1263	* be a region map for all pages. The only situation where
1264	* there is no region map is if a hole was punched via
1265	* fallocate. In this case, there really are no reserves to
1266	* use. This situation is indicated if chg != 0.
1267	*/
1268	if (chg)
1269	return false;
1270	else
1271	return true;
1272	}
1273
1274	/*
1275	* Only the process that called mmap() has reserves for
1276	* private mappings.
1277	*/
1278	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1279	/*
1280	* Like the shared case above, a hole punch or truncate
1281	* could have been performed on the private mapping.
1282	* Examine the value of chg to determine if reserves
1283	* actually exist or were previously consumed.
1284	* Very Subtle - The value of chg comes from a previous
1285	* call to vma_needs_reserves(). The reserve map for
1286	* private mappings has different (opposite) semantics
1287	* than that of shared mappings. vma_needs_reserves()
1288	* has already taken this difference in semantics into
1289	* account. Therefore, the meaning of chg is the same
1290	* as in the shared case above. Code could easily be
1291	* combined, but keeping it separate draws attention to
1292	* subtle differences.
1293	*/
1294	if (chg)
1295	return false;
1296	else
1297	return true;
1298	}
1299
1300	return false;
1301	}
1302
1303	static void enqueue_hugetlb_folio(struct hstate h, struct* folio *folio)
1304	{
1305	int nid = folio_nid(folio);
1306
1307	lockdep_assert_held(&hugetlb_lock);
1308	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1309
1310	list_move(list: &folio->lru, head: &h->hugepage_freelists[nid]);
1311	h->free_huge_pages++;
1312	h->free_huge_pages_node[nid]++;
1313	folio_set_hugetlb_freed(folio);
1314	}
1315
1316	static struct folio dequeue_hugetlb_folio_node_exact(struct* hstate *h,
1317	int nid)
1318	{
1319	struct folio *folio;
1320	bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1321
1322	lockdep_assert_held(&hugetlb_lock);
1323	list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1324	if (pin && !folio_is_longterm_pinnable(folio))
1325	continue;
1326
1327	if (folio_test_hwpoison(folio))
1328	continue;
1329
1330	list_move(list: &folio->lru, head: &h->hugepage_activelist);
1331	folio_ref_unfreeze(folio, count: `1`);
1332	folio_clear_hugetlb_freed(folio);
1333	h->free_huge_pages--;
1334	h->free_huge_pages_node[nid]--;
1335	return folio;
1336	}
1337
1338	return NULL;
1339	}
1340
1341	static struct folio dequeue_hugetlb_folio_nodemask(struct* hstate *h, gfp_t gfp_mask,
1342	int nid, nodemask_t *nmask)
1343	{
1344	unsigned int cpuset_mems_cookie;
1345	struct zonelist *zonelist;
1346	struct zone *zone;
1347	struct zoneref *z;
1348	int node = NUMA_NO_NODE;
1349
1350	zonelist = node_zonelist(nid, flags: gfp_mask);
1351
1352	retry_cpuset:
1353	cpuset_mems_cookie = read_mems_allowed_begin();
1354	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1355	struct folio *folio;
1356
1357	if (!cpuset_zone_allowed(z: zone, gfp_mask))
1358	continue;
1359	/*
1360	* no need to ask again on the same node. Pool is node rather than
1361	* zone aware
1362	*/
1363	if (zone_to_nid(zone) == node)
1364	continue;
1365	node = zone_to_nid(zone);
1366
1367	folio = dequeue_hugetlb_folio_node_exact(h, nid: node);
1368	if (folio)
1369	return folio;
1370	}
1371	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1372	goto retry_cpuset;
1373
1374	return NULL;
1375	}
1376
1377	static unsigned long available_huge_pages(struct hstate *h)
1378	{
1379	return h->free_huge_pages - h->resv_huge_pages;
1380	}
1381
1382	static struct folio dequeue_hugetlb_folio_vma(struct* hstate *h,
1383	struct vm_area_struct *vma,
1384	unsigned long address, int avoid_reserve,
1385	long chg)
1386	{
1387	struct folio *folio = NULL;
1388	struct mempolicy *mpol;
1389	gfp_t gfp_mask;
1390	nodemask_t *nodemask;
1391	int nid;
1392
1393	/*
1394	* A child process with MAP_PRIVATE mappings created by their parent
1395	* have no page reserves. This check ensures that reservations are
1396	* not "stolen". The child may still get SIGKILLed
1397	*/
1398	if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
1399	goto err;
1400
1401	/ If reserves cannot be used, ensure enough pages are in the pool /
1402	if (avoid_reserve && !available_huge_pages(h))
1403	goto err;
1404
1405	gfp_mask = htlb_alloc_mask(h);
1406	nid = huge_node(vma, addr: address, gfp_flags: gfp_mask, mpol: &mpol, nodemask: &nodemask);
1407
1408	if (mpol_is_preferred_many(pol: mpol)) {
1409	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1410	nid, nmask: nodemask);
1411
1412	/ Fallback to all nodes if page==NULL /
1413	nodemask = NULL;
1414	}
1415
1416	if (!folio)
1417	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1418	nid, nmask: nodemask);
1419
1420	if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
1421	folio_set_hugetlb_restore_reserve(folio);
1422	h->resv_huge_pages--;
1423	}
1424
1425	mpol_cond_put(pol: mpol);
1426	return folio;
1427
1428	err:
1429	return NULL;
1430	}
1431
1432	/*
1433	* common helper functions for hstate_next_node_to_{alloc\|free}.
1434	* We may have allocated or freed a huge page based on a different
1435	* nodes_allowed previously, so h->next_node_to_{alloc\|free} might
1436	* be outside of *nodes_allowed. Ensure that we use an allowed
1437	* node for alloc or free.
1438	*/
1439	static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1440	{
1441	nid = next_node_in(nid, *nodes_allowed);
1442	VM_BUG_ON(nid >= MAX_NUMNODES);
1443
1444	return nid;
1445	}
1446
1447	static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1448	{
1449	if (!node_isset(nid, *nodes_allowed))
1450	nid = next_node_allowed(nid, nodes_allowed);
1451	return nid;
1452	}
1453
1454	/*
1455	* returns the previously saved node ["this node"] from which to
1456	* allocate a persistent huge page for the pool and advance the
1457	* next node from which to allocate, handling wrap at end of node
1458	* mask.
1459	*/
1460	static int hstate_next_node_to_alloc(struct hstate *h,
1461	nodemask_t *nodes_allowed)
1462	{
1463	int nid;
1464
1465	VM_BUG_ON(!nodes_allowed);
1466
1467	nid = get_valid_node_allowed(nid: h->next_nid_to_alloc, nodes_allowed);
1468	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1469
1470	return nid;
1471	}
1472
1473	/*
1474	* helper for remove_pool_hugetlb_folio() - return the previously saved
1475	* node ["this node"] from which to free a huge page. Advance the
1476	* next node id whether or not we find a free huge page to free so
1477	* that the next attempt to free addresses the next node.
1478	*/
1479	static int hstate_next_node_to_free(struct hstate h, nodemask_t nodes_allowed)
1480	{
1481	int nid;
1482
1483	VM_BUG_ON(!nodes_allowed);
1484
1485	nid = get_valid_node_allowed(nid: h->next_nid_to_free, nodes_allowed);
1486	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1487
1488	return nid;
1489	}
1490
/*
 * Iterate once per node set in *mask, each time taking the next node to
 * allocate from.  The "|| 1" keeps the loop condition true when the
 * selected node id happens to be 0.
 */
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)
1496
/*
 * Iterate once per node set in *mask, each time taking the next node to
 * free from.  The "|| 1" keeps the loop condition true when the selected
 * node id happens to be 0.
 */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)
1502
1503	/ used to demote non-gigantic_huge pages as well /
1504	static void __destroy_compound_gigantic_folio(struct folio *folio,
1505	unsigned int order, bool demote)
1506	{
1507	int i;
1508	int nr_pages = `1` << order;
1509	struct page *p;
1510
1511	atomic_set(v: &folio->_entire_mapcount, i: `0`);
1512	atomic_set(v: &folio->_nr_pages_mapped, i: `0`);
1513	atomic_set(v: &folio->_pincount, i: `0`);
1514
1515	for (i = `1`; i < nr_pages; i++) {
1516	p = folio_page(folio, i);
1517	p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
1518	p->mapping = NULL;
1519	clear_compound_head(page: p);
1520	if (!demote)
1521	set_page_refcounted(p);
1522	}
1523
1524	__folio_clear_head(folio);
1525	}
1526
1527	static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
1528	unsigned int order)
1529	{
1530	__destroy_compound_gigantic_folio(folio, order, demote: true);
1531	}
1532
1533	#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1534	static void destroy_compound_gigantic_folio(struct folio *folio,
1535	unsigned int order)
1536	{
1537	__destroy_compound_gigantic_folio(folio, order, demote: false);
1538	}
1539
1540	static void free_gigantic_folio(struct folio folio, unsigned* int order)
1541	{
1542	/*
1543	* If the page isn't allocated using the cma allocator,
1544	* cma_release() returns false.
1545	*/
1546	#ifdef CONFIG_CMA
1547	int nid = folio_nid(folio);
1548
1549	if (cma_release(cma: hugetlb_cma[nid], pages: &folio->page, count: `1` << order))
1550	return;
1551	#endif
1552
1553	free_contig_range(pfn: folio_pfn(folio), nr_pages: `1` << order);
1554	}
1555
1556	#ifdef CONFIG_CONTIG_ALLOC
1557	static struct folio alloc_gigantic_folio(struct* hstate *h, gfp_t gfp_mask,
1558	int nid, nodemask_t *nodemask)
1559	{
1560	struct page *page;
1561	unsigned long nr_pages = pages_per_huge_page(h);
1562	if (nid == NUMA_NO_NODE)
1563	nid = numa_mem_id();
1564
1565	#ifdef CONFIG_CMA
1566	{
1567	int node;
1568
1569	if (hugetlb_cma[nid]) {
1570	page = cma_alloc(cma: hugetlb_cma[nid], count: nr_pages,
1571	align: huge_page_order(h), no_warn: true);
1572	if (page)
1573	return page_folio(page);
1574	}
1575
1576	if (!(gfp_mask & __GFP_THISNODE)) {
1577	for_each_node_mask(node, *nodemask) {
1578	if (node == nid \|\| !hugetlb_cma[node])
1579	continue;
1580
1581	page = cma_alloc(cma: hugetlb_cma[node], count: nr_pages,
1582	align: huge_page_order(h), no_warn: true);
1583	if (page)
1584	return page_folio(page);
1585	}
1586	}
1587	}
1588	#endif
1589
1590	page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1591	return page ? page_folio(page) : NULL;
1592	}
1593
1594	#else /* !CONFIG_CONTIG_ALLOC */
1595	static struct folio alloc_gigantic_folio(struct* hstate *h, gfp_t gfp_mask,
1596	int nid, nodemask_t *nodemask)
1597	{
1598	return NULL;
1599	}
1600	#endif /* CONFIG_CONTIG_ALLOC */
1601
1602	#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
1603	static struct folio alloc_gigantic_folio(struct* hstate *h, gfp_t gfp_mask,
1604	int nid, nodemask_t *nodemask)
1605	{
1606	return NULL;
1607	}
1608	static inline void free_gigantic_folio(struct folio *folio,
1609	unsigned int order) { }
1610	static inline void destroy_compound_gigantic_folio(struct folio *folio,
1611	unsigned int order) { }
1612	#endif
1613
1614	static inline void __clear_hugetlb_destructor(struct hstate *h,
1615	struct folio *folio)
1616	{
1617	lockdep_assert_held(&hugetlb_lock);
1618
1619	folio_clear_hugetlb(folio);
1620	}
1621
1622	/*
1623	* Remove hugetlb folio from lists.
1624	* If vmemmap exists for the folio, update dtor so that the folio appears
1625	* as just a compound page. Otherwise, wait until after allocating vmemmap
1626	* to update dtor.
1627	*
1628	* A reference is held on the folio, except in the case of demote.
1629	*
1630	* Must be called with hugetlb lock held.
1631	*/
1632	static void __remove_hugetlb_folio(struct hstate h, struct* folio *folio,
1633	bool adjust_surplus,
1634	bool demote)
1635	{
1636	int nid = folio_nid(folio);
1637
1638	VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
1639	VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
1640
1641	lockdep_assert_held(&hugetlb_lock);
1642	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1643	return;
1644
1645	list_del(entry: &folio->lru);
1646
1647	if (folio_test_hugetlb_freed(folio)) {
1648	h->free_huge_pages--;
1649	h->free_huge_pages_node[nid]--;
1650	}
1651	if (adjust_surplus) {
1652	h->surplus_huge_pages--;
1653	h->surplus_huge_pages_node[nid]--;
1654	}
1655
1656	/*
1657	* We can only clear the hugetlb destructor after allocating vmemmap
1658	* pages. Otherwise, someone (memory error handling) may try to write
1659	* to tail struct pages.
1660	*/
1661	if (!folio_test_hugetlb_vmemmap_optimized(folio))
1662	__clear_hugetlb_destructor(h, folio);
1663
1664	/*
1665	* In the case of demote we do not ref count the page as it will soon
1666	* be turned into a page of smaller size.
1667	*/
1668	if (!demote)
1669	folio_ref_unfreeze(folio, count: `1`);
1670
1671	h->nr_huge_pages--;
1672	h->nr_huge_pages_node[nid]--;
1673	}
1674
1675	static void remove_hugetlb_folio(struct hstate h, struct* folio *folio,
1676	bool adjust_surplus)
1677	{
1678	__remove_hugetlb_folio(h, folio, adjust_surplus, demote: false);
1679	}
1680
1681	static void remove_hugetlb_folio_for_demote(struct hstate h, struct* folio *folio,
1682	bool adjust_surplus)
1683	{
1684	__remove_hugetlb_folio(h, folio, adjust_surplus, demote: true);
1685	}
1686
1687	static void add_hugetlb_folio(struct hstate h, struct* folio *folio,
1688	bool adjust_surplus)
1689	{
1690	int zeroed;
1691	int nid = folio_nid(folio);
1692
1693	VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
1694
1695	lockdep_assert_held(&hugetlb_lock);
1696
1697	INIT_LIST_HEAD(list: &folio->lru);
1698	h->nr_huge_pages++;
1699	h->nr_huge_pages_node[nid]++;
1700
1701	if (adjust_surplus) {
1702	h->surplus_huge_pages++;
1703	h->surplus_huge_pages_node[nid]++;
1704	}
1705
1706	folio_set_hugetlb(folio);
1707	folio_change_private(folio, NULL);
1708	/*
1709	* We have to set hugetlb_vmemmap_optimized again as above
1710	* folio_change_private(folio, NULL) cleared it.
1711	*/
1712	folio_set_hugetlb_vmemmap_optimized(folio);
1713
1714	/*
1715	* This folio is about to be managed by the hugetlb allocator and
1716	* should have no users. Drop our reference, and check for others
1717	* just in case.
1718	*/
1719	zeroed = folio_put_testzero(folio);
1720	if (unlikely(!zeroed))
1721	/*
1722	* It is VERY unlikely soneone else has taken a ref
1723	* on the folio. In this case, we simply return as
1724	* free_huge_folio() will be called when this other ref
1725	* is dropped.
1726	*/
1727	return;
1728
1729	arch_clear_hugepage_flags(page: &folio->page);
1730	enqueue_hugetlb_folio(h, folio);
1731	}
1732
1733	static void __update_and_free_hugetlb_folio(struct hstate *h,
1734	struct folio *folio)
1735	{
1736	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
1737
1738	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1739	return;
1740
1741	/*
1742	* If we don't know which subpages are hwpoisoned, we can't free
1743	* the hugepage, so it's leaked intentionally.
1744	*/
1745	if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1746	return;
1747
1748	/*
1749	* If folio is not vmemmap optimized (!clear_dtor), then the folio
1750	* is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
1751	* can only be passed hugetlb pages and will BUG otherwise.
1752	*/
1753	if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
1754	spin_lock_irq(lock: &hugetlb_lock);
1755	/*
1756	* If we cannot allocate vmemmap pages, just refuse to free the
1757	* page and put the page back on the hugetlb free list and treat
1758	* as a surplus page.
1759	*/
1760	add_hugetlb_folio(h, folio, adjust_surplus: true);
1761	spin_unlock_irq(lock: &hugetlb_lock);
1762	return;
1763	}
1764
1765	/*
1766	* Move PageHWPoison flag from head page to the raw error pages,
1767	* which makes any healthy subpages reusable.
1768	*/
1769	if (unlikely(folio_test_hwpoison(folio)))
1770	folio_clear_hugetlb_hwpoison(folio);
1771
1772	/*
1773	* If vmemmap pages were allocated above, then we need to clear the
1774	* hugetlb destructor under the hugetlb lock.
1775	*/
1776	if (clear_dtor) {
1777	spin_lock_irq(lock: &hugetlb_lock);
1778	__clear_hugetlb_destructor(h, folio);
1779	spin_unlock_irq(lock: &hugetlb_lock);
1780	}
1781
1782	/*
1783	* Non-gigantic pages demoted from CMA allocated gigantic pages
1784	* need to be given back to CMA in free_gigantic_folio.
1785	*/
1786	if (hstate_is_gigantic(h) \|\|
1787	hugetlb_cma_folio(folio, order: huge_page_order(h))) {
1788	destroy_compound_gigantic_folio(folio, order: huge_page_order(h));
1789	free_gigantic_folio(folio, order: huge_page_order(h));
1790	} else {
1791	__free_pages(page: &folio->page, order: huge_page_order(h));
1792	}
1793	}
1794
/*
 * As update_and_free_hugetlb_folio() can be called under any context, so we cannot
 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
 * the vmemmap pages.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one. As the page->mapping pointer is going
 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
 * structure of a lockless linked list of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);
1807
1808	static void free_hpage_workfn(struct work_struct *work)
1809	{
1810	struct llist_node *node;
1811
1812	node = llist_del_all(head: &hpage_freelist);
1813
1814	while (node) {
1815	struct folio *folio;
1816	struct hstate *h;
1817
1818	folio = container_of((struct address_space **)node,
1819	struct folio, mapping);
1820	node = node->next;
1821	folio->mapping = NULL;
1822	/*
1823	* The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
1824	* folio_hstate() is going to trigger because a previous call to
1825	* remove_hugetlb_folio() will clear the hugetlb bit, so do
1826	* not use folio_hstate() directly.
1827	*/
1828	h = size_to_hstate(size: folio_size(folio));
1829
1830	__update_and_free_hugetlb_folio(h, folio);
1831
1832	cond_resched();
1833	}
1834	}
1835	static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1836
1837	static inline void flush_free_hpage_work(struct hstate *h)
1838	{
1839	if (hugetlb_vmemmap_optimizable(h))
1840	flush_work(work: &free_hpage_work);
1841	}
1842
1843	static void update_and_free_hugetlb_folio(struct hstate h, struct* folio *folio,
1844	bool atomic)
1845	{
1846	if (!folio_test_hugetlb_vmemmap_optimized(folio) \|\| !atomic) {
1847	__update_and_free_hugetlb_folio(h, folio);
1848	return;
1849	}
1850
1851	/*
1852	* Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1853	*
1854	* Only call schedule_work() if hpage_freelist is previously
1855	* empty. Otherwise, schedule_work() had been called but the workfn
1856	* hasn't retrieved the list yet.
1857	*/
1858	if (llist_add(new: (struct llist_node *)&folio->mapping, head: &hpage_freelist))
1859	schedule_work(work: &free_hpage_work);
1860	}
1861
1862	static void bulk_vmemmap_restore_error(struct hstate *h,
1863	struct list_head *folio_list,
1864	struct list_head *non_hvo_folios)
1865	{
1866	struct folio folio, t_folio;
1867
1868	if (!list_empty(head: non_hvo_folios)) {
1869	/*
1870	* Free any restored hugetlb pages so that restore of the
1871	* entire list can be retried.
1872	* The idea is that in the common case of ENOMEM errors freeing
1873	* hugetlb pages with vmemmap we will free up memory so that we
1874	* can allocate vmemmap for more hugetlb pages.
1875	*/
1876	list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
1877	list_del(entry: &folio->lru);
1878	spin_lock_irq(lock: &hugetlb_lock);
1879	__clear_hugetlb_destructor(h, folio);
1880	spin_unlock_irq(lock: &hugetlb_lock);
1881	update_and_free_hugetlb_folio(h, folio, atomic: false);
1882	cond_resched();
1883	}
1884	} else {
1885	/*
1886	* In the case where there are no folios which can be
1887	* immediately freed, we loop through the list trying to restore
1888	* vmemmap individually in the hope that someone elsewhere may
1889	* have done something to cause success (such as freeing some
1890	* memory). If unable to restore a hugetlb page, the hugetlb
1891	* page is made a surplus page and removed from the list.
1892	* If are able to restore vmemmap and free one hugetlb page, we
1893	* quit processing the list to retry the bulk operation.
1894	*/
1895	list_for_each_entry_safe(folio, t_folio, folio_list, lru)
1896	if (hugetlb_vmemmap_restore_folio(h, folio)) {
1897	list_del(entry: &folio->lru);
1898	spin_lock_irq(lock: &hugetlb_lock);
1899	add_hugetlb_folio(h, folio, adjust_surplus: true);
1900	spin_unlock_irq(lock: &hugetlb_lock);
1901	} else {
1902	list_del(entry: &folio->lru);
1903	spin_lock_irq(lock: &hugetlb_lock);
1904	__clear_hugetlb_destructor(h, folio);
1905	spin_unlock_irq(lock: &hugetlb_lock);
1906	update_and_free_hugetlb_folio(h, folio, atomic: false);
1907	cond_resched();
1908	break;
1909	}
1910	}
1911	}
1912
1913	static void update_and_free_pages_bulk(struct hstate *h,
1914	struct list_head *folio_list)
1915	{
1916	long ret;
1917	struct folio folio, t_folio;
1918	LIST_HEAD(non_hvo_folios);
1919
1920	/*
1921	* First allocate required vmemmmap (if necessary) for all folios.
1922	* Carefully handle errors and free up any available hugetlb pages
1923	* in an effort to make forward progress.
1924	*/
1925	retry:
1926	ret = hugetlb_vmemmap_restore_folios(h, folio_list, non_hvo_folios: &non_hvo_folios);
1927	if (ret < `0`) {
1928	bulk_vmemmap_restore_error(h, folio_list, non_hvo_folios: &non_hvo_folios);
1929	goto retry;
1930	}
1931
1932	/*
1933	* At this point, list should be empty, ret should be >= 0 and there
1934	* should only be pages on the non_hvo_folios list.
1935	* Do note that the non_hvo_folios list could be empty.
1936	* Without HVO enabled, ret will be 0 and there is no need to call
1937	* __clear_hugetlb_destructor as this was done previously.
1938	*/
1939	VM_WARN_ON(!list_empty(folio_list));
1940	VM_WARN_ON(ret < `0`);
1941	if (!list_empty(head: &non_hvo_folios) && ret) {
1942	spin_lock_irq(lock: &hugetlb_lock);
1943	list_for_each_entry(folio, &non_hvo_folios, lru)
1944	__clear_hugetlb_destructor(h, folio);
1945	spin_unlock_irq(lock: &hugetlb_lock);
1946	}
1947
1948	list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
1949	update_and_free_hugetlb_folio(h, folio, atomic: false);
1950	cond_resched();
1951	}
1952	}
1953
1954	struct hstate size_to_hstate(unsigned* long size)
1955	{
1956	struct hstate *h;
1957
1958	for_each_hstate(h) {
1959	if (huge_page_size(h) == size)
1960	return h;
1961	}
1962	return NULL;
1963	}
1964
1965	void free_huge_folio(struct folio *folio)
1966	{
1967	/*
1968	* Can't pass hstate in here because it is called from the
1969	* compound page destructor.
1970	*/
1971	struct hstate *h = folio_hstate(folio);
1972	int nid = folio_nid(folio);
1973	struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
1974	bool restore_reserve;
1975	unsigned long flags;
1976
1977	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1978	VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
1979
1980	hugetlb_set_folio_subpool(folio, NULL);
1981	if (folio_test_anon(folio))
1982	__ClearPageAnonExclusive(page: &folio->page);
1983	folio->mapping = NULL;
1984	restore_reserve = folio_test_hugetlb_restore_reserve(folio);
1985	folio_clear_hugetlb_restore_reserve(folio);
1986
1987	/*
1988	* If HPageRestoreReserve was set on page, page allocation consumed a
1989	* reservation. If the page was associated with a subpool, there
1990	* would have been a page reserved in the subpool before allocation
1991	* via hugepage_subpool_get_pages(). Since we are 'restoring' the
1992	* reservation, do not call hugepage_subpool_put_pages() as this will
1993	* remove the reserved page from the subpool.
1994	*/
1995	if (!restore_reserve) {
1996	/*
1997	* A return code of zero implies that the subpool will be
1998	* under its minimum size if the reservation is not restored
1999	* after page is free. Therefore, force restore_reserve
2000	* operation.
2001	*/
2002	if (hugepage_subpool_put_pages(spool, delta: `1`) == `0`)
2003	restore_reserve = true;
2004	}
2005
2006	spin_lock_irqsave(&hugetlb_lock, flags);
2007	folio_clear_hugetlb_migratable(folio);
2008	hugetlb_cgroup_uncharge_folio(idx: hstate_index(h),
2009	nr_pages: pages_per_huge_page(h), folio);
2010	hugetlb_cgroup_uncharge_folio_rsvd(idx: hstate_index(h),
2011	nr_pages: pages_per_huge_page(h), folio);
2012	mem_cgroup_uncharge(folio);
2013	if (restore_reserve)
2014	h->resv_huge_pages++;
2015
2016	if (folio_test_hugetlb_temporary(folio)) {
2017	remove_hugetlb_folio(h, folio, adjust_surplus: false);
2018	spin_unlock_irqrestore(lock: &hugetlb_lock, flags);
2019	update_and_free_hugetlb_folio(h, folio, atomic: true);
2020	} else if (h->surplus_huge_pages_node[nid]) {
2021	/ remove the page from active list /
2022	remove_hugetlb_folio(h, folio, adjust_surplus: true);
2023	spin_unlock_irqrestore(lock: &hugetlb_lock, flags);
2024	update_and_free_hugetlb_folio(h, folio, atomic: true);
2025	} else {
2026	arch_clear_hugepage_flags(page: &folio->page);
2027	enqueue_hugetlb_folio(h, folio);
2028	spin_unlock_irqrestore(lock: &hugetlb_lock, flags);
2029	}
2030	}
2031
2032	/*
2033	* Must be called with the hugetlb lock held
2034	*/
2035	static void __prep_account_new_huge_page(struct hstate h, int* nid)
2036	{
2037	lockdep_assert_held(&hugetlb_lock);
2038	h->nr_huge_pages++;
2039	h->nr_huge_pages_node[nid]++;
2040	}
2041
2042	static void init_new_hugetlb_folio(struct hstate h, struct* folio *folio)
2043	{
2044	folio_set_hugetlb(folio);
2045	INIT_LIST_HEAD(list: &folio->lru);
2046	hugetlb_set_folio_subpool(folio, NULL);
2047	set_hugetlb_cgroup(folio, NULL);
2048	set_hugetlb_cgroup_rsvd(folio, NULL);
2049	}
2050
/*
 * Initialise a new hugetlb folio and then hand it to
 * hugetlb_vmemmap_optimize_folio(). Does no pool accounting.
 */
static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
	init_new_hugetlb_folio(h, folio);
	hugetlb_vmemmap_optimize_folio(h, folio);
}
2056
2057	static void prep_new_hugetlb_folio(struct hstate h, struct* folio folio, int* nid)
2058	{
2059	__prep_new_hugetlb_folio(h, folio);
2060	spin_lock_irq(lock: &hugetlb_lock);
2061	__prep_account_new_huge_page(h, nid);
2062	spin_unlock_irq(lock: &hugetlb_lock);
2063	}
2064
2065	static bool __prep_compound_gigantic_folio(struct folio *folio,
2066	unsigned int order, bool demote)
2067	{
2068	int i, j;
2069	int nr_pages = `1` << order;
2070	struct page *p;
2071
2072	__folio_clear_reserved(folio);
2073	for (i = `0`; i < nr_pages; i++) {
2074	p = folio_page(folio, i);
2075
2076	/*
2077	* For gigantic hugepages allocated through bootmem at
2078	* boot, it's safer to be consistent with the not-gigantic
2079	* hugepages and clear the PG_reserved bit from all tail pages
2080	* too. Otherwise drivers using get_user_pages() to access tail
2081	* pages may get the reference counting wrong if they see
2082	* PG_reserved set on a tail page (despite the head page not
2083	* having PG_reserved set). Enforcing this consistency between
2084	* head and tail pages allows drivers to optimize away a check
2085	* on the head page when they need know if put_page() is needed
2086	* after get_user_pages().
2087	*/
2088	if (i != `0`) / head page cleared above /
2089	__ClearPageReserved(page: p);
2090	/*
2091	* Subtle and very unlikely
2092	*
2093	* Gigantic 'page allocators' such as memblock or cma will
2094	* return a set of pages with each page ref counted. We need
2095	* to turn this set of pages into a compound page with tail
2096	* page ref counts set to zero. Code such as speculative page
2097	* cache adding could take a ref on a 'to be' tail page.
2098	* We need to respect any increased ref count, and only set
2099	* the ref count to zero if count is currently 1. If count
2100	* is not 1, we return an error. An error return indicates
2101	* the set of pages can not be converted to a gigantic page.
2102	* The caller who allocated the pages should then discard the
2103	* pages using the appropriate free interface.
2104	*
2105	* In the case of demote, the ref count will be zero.
2106	*/
2107	if (!demote) {
2108	if (!page_ref_freeze(page: p, count: `1`)) {
2109	pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
2110	goto out_error;
2111	}
2112	} else {
2113	VM_BUG_ON_PAGE(page_count(p), p);
2114	}
2115	if (i != `0`)
2116	set_compound_head(page: p, head: &folio->page);
2117	}
2118	__folio_set_head(folio);
2119	/ we rely on prep_new_hugetlb_folio to set the destructor /
2120	folio_set_order(folio, order);
2121	atomic_set(v: &folio->_entire_mapcount, i: -`1`);
2122	atomic_set(v: &folio->_nr_pages_mapped, i: `0`);
2123	atomic_set(v: &folio->_pincount, i: `0`);
2124	return true;
2125
2126	out_error:
2127	/ undo page modifications made above /
2128	for (j = `0`; j < i; j++) {
2129	p = folio_page(folio, j);
2130	if (j != `0`)
2131	clear_compound_head(page: p);
2132	set_page_refcounted(p);
2133	}
2134	/ need to clear PG_reserved on remaining tail pages /
2135	for (; j < nr_pages; j++) {
2136	p = folio_page(folio, j);
2137	__ClearPageReserved(page: p);
2138	}
2139	return false;
2140	}
2141
2142	static bool prep_compound_gigantic_folio(struct folio *folio,
2143	unsigned int order)
2144	{
2145	return __prep_compound_gigantic_folio(folio, order, demote: false);
2146	}
2147
2148	static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
2149	unsigned int order)
2150	{
2151	return __prep_compound_gigantic_folio(folio, order, demote: true);
2152	}
2153
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 *
 * Returns 0 for any page that is not part of a compound page; otherwise
 * returns the result of folio_test_hugetlb() on the page's folio.
 */
int PageHuge(struct page *page)
{
	struct folio *folio;

	if (!PageCompound(page))
		return 0;
	folio = page_folio(page);
	return folio_test_hugetlb(folio);
}
EXPORT_SYMBOL_GPL(PageHuge);
2169
2170	/*
2171	* Find and lock address space (mapping) in write mode.
2172	*
2173	* Upon entry, the page is locked which means that page_mapping() is
2174	* stable. Due to locking order, we can only trylock_write. If we can
2175	* not get the lock, simply return NULL to caller.
2176	*/
2177	struct address_space hugetlb_page_mapping_lock_write(struct* page *hpage)
2178	{
2179	struct address_space *mapping = page_mapping(hpage);
2180
2181	if (!mapping)
2182	return mapping;
2183
2184	if (i_mmap_trylock_write(mapping))
2185	return mapping;
2186
2187	return NULL;
2188	}
2189
2190	static struct folio alloc_buddy_hugetlb_folio(struct* hstate *h,
2191	gfp_t gfp_mask, int nid, nodemask_t *nmask,
2192	nodemask_t *node_alloc_noretry)
2193	{
2194	int order = huge_page_order(h);
2195	struct page *page;
2196	bool alloc_try_hard = true;
2197	bool retry = true;
2198
2199	/*
2200	* By default we always try hard to allocate the page with
2201	* __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
2202	* a loop (to adjust global huge page counts) and previous allocation
2203	* failed, do not continue to try hard on the same node. Use the
2204	* node_alloc_noretry bitmap to manage this state information.
2205	*/
2206	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
2207	alloc_try_hard = false;
2208	gfp_mask \|= __GFP_COMP\|__GFP_NOWARN;
2209	if (alloc_try_hard)
2210	gfp_mask \|= __GFP_RETRY_MAYFAIL;
2211	if (nid == NUMA_NO_NODE)
2212	nid = numa_mem_id();
2213	retry:
2214	page = __alloc_pages(gfp: gfp_mask, order, preferred_nid: nid, nodemask: nmask);
2215
2216	/ Freeze head page /
2217	if (page && !page_ref_freeze(page, count: `1`)) {
2218	__free_pages(page, order);
2219	if (retry) { / retry once /
2220	retry = false;
2221	goto retry;
2222	}
2223	/ WOW! twice in a row. /
2224	pr_warn("HugeTLB head page unexpected inflated ref count\n");
2225	page = NULL;
2226	}
2227
2228	/*
2229	* If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
2230	* indicates an overall state change. Clear bit so that we resume
2231	* normal 'try hard' allocations.
2232	*/
2233	if (node_alloc_noretry && page && !alloc_try_hard)
2234	node_clear(nid, *node_alloc_noretry);
2235
2236	/*
2237	* If we tried hard to get a page but failed, set bit so that
2238	* subsequent attempts will not try as hard until there is an
2239	* overall state change.
2240	*/
2241	if (node_alloc_noretry && !page && alloc_try_hard)
2242	node_set(nid, *node_alloc_noretry);
2243
2244	if (!page) {
2245	__count_vm_event(item: HTLB_BUDDY_PGALLOC_FAIL);
2246	return NULL;
2247	}
2248
2249	__count_vm_event(item: HTLB_BUDDY_PGALLOC);
2250	return page_folio(page);
2251	}
2252
2253	static struct folio __alloc_fresh_hugetlb_folio(struct* hstate *h,
2254	gfp_t gfp_mask, int nid, nodemask_t *nmask,
2255	nodemask_t *node_alloc_noretry)
2256	{
2257	struct folio *folio;
2258	bool retry = false;
2259
2260	retry:
2261	if (hstate_is_gigantic(h))
2262	folio = alloc_gigantic_folio(h, gfp_mask, nid, nodemask: nmask);
2263	else
2264	folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
2265	nid, nmask, node_alloc_noretry);
2266	if (!folio)
2267	return NULL;
2268
2269	if (hstate_is_gigantic(h)) {
2270	if (!prep_compound_gigantic_folio(folio, order: huge_page_order(h))) {
2271	/*
2272	* Rare failure to convert pages to compound page.
2273	* Free pages and try again - ONCE!
2274	*/
2275	free_gigantic_folio(folio, order: huge_page_order(h));
2276	if (!retry) {
2277	retry = true;
2278	goto retry;
2279	}
2280	return NULL;
2281	}
2282	}
2283
2284	return folio;
2285	}
2286
2287	static struct folio only_alloc_fresh_hugetlb_folio(struct* hstate *h,
2288	gfp_t gfp_mask, int nid, nodemask_t *nmask,
2289	nodemask_t *node_alloc_noretry)
2290	{
2291	struct folio *folio;
2292
2293	folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
2294	node_alloc_noretry);
2295	if (folio)
2296	init_new_hugetlb_folio(h, folio);
2297	return folio;
2298	}
2299
2300	/*
2301	* Common helper to allocate a fresh hugetlb page. All specific allocators
2302	* should use this function to get new hugetlb pages
2303	*
2304	* Note that returned page is 'frozen': ref count of head page and all tail
2305	* pages is zero.
2306	*/
2307	static struct folio alloc_fresh_hugetlb_folio(struct* hstate *h,
2308	gfp_t gfp_mask, int nid, nodemask_t *nmask,
2309	nodemask_t *node_alloc_noretry)
2310	{
2311	struct folio *folio;
2312
2313	folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
2314	node_alloc_noretry);
2315	if (!folio)
2316	return NULL;
2317
2318	prep_new_hugetlb_folio(h, folio, nid: folio_nid(folio));
2319	return folio;
2320	}
2321
2322	static void prep_and_add_allocated_folios(struct hstate *h,
2323	struct list_head *folio_list)
2324	{
2325	unsigned long flags;
2326	struct folio folio, tmp_f;
2327
2328	/ Send list for bulk vmemmap optimization processing /
2329	hugetlb_vmemmap_optimize_folios(h, folio_list);
2330
2331	/ Add all new pool pages to free lists in one lock cycle /
2332	spin_lock_irqsave(&hugetlb_lock, flags);
2333	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
2334	__prep_account_new_huge_page(h, nid: folio_nid(folio));
2335	enqueue_hugetlb_folio(h, folio);
2336	}
2337	spin_unlock_irqrestore(lock: &hugetlb_lock, flags);
2338	}
2339
2340	/*
2341	* Allocates a fresh hugetlb page in a node interleaved manner. The page
2342	* will later be added to the appropriate hugetlb pool.
2343	*/
2344	static struct folio alloc_pool_huge_folio(struct* hstate *h,
2345	nodemask_t *nodes_allowed,
2346	nodemask_t *node_alloc_noretry)
2347	{
2348	gfp_t gfp_mask = htlb_alloc_mask(h) \| __GFP_THISNODE;
2349	int nr_nodes, node;
2350
2351	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2352	struct folio *folio;
2353
2354	folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid: node,
2355	nmask: nodes_allowed, node_alloc_noretry);
2356	if (folio)
2357	return folio;
2358	}
2359
2360	return NULL;
2361	}
2362
2363	/*
2364	* Remove huge page from pool from next node to free. Attempt to keep
2365	* persistent huge pages more or less balanced over allowed nodes.
2366	* This routine only 'removes' the hugetlb page. The caller must make
2367	* an additional call to free the page to low level allocators.
2368	* Called with hugetlb_lock locked.
2369	*/
2370	static struct folio remove_pool_hugetlb_folio(struct* hstate *h,
2371	nodemask_t *nodes_allowed, bool acct_surplus)
2372	{
2373	int nr_nodes, node;
2374	struct folio *folio = NULL;
2375
2376	lockdep_assert_held(&hugetlb_lock);
2377	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2378	/*
2379	* If we're returning unused surplus pages, only examine
2380	* nodes with surplus pages.
2381	*/
2382	if ((!acct_surplus \|\| h->surplus_huge_pages_node[node]) &&
2383	!list_empty(head: &h->hugepage_freelists[node])) {
2384	folio = list_entry(h->hugepage_freelists[node].next,
2385	struct folio, lru);
2386	remove_hugetlb_folio(h, folio, adjust_surplus: acct_surplus);
2387	break;
2388	}
2389	}
2390
2391	return folio;
2392	}
2393
2394	/*
2395	* Dissolve a given free hugepage into free buddy pages. This function does
2396	* nothing for in-use hugepages and non-hugepages.
2397	* This function returns values like below:
2398	*
2399	* -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2400	* when the system is under memory pressure and the feature of
2401	* freeing unused vmemmap pages associated with each hugetlb page
2402	* is enabled.
2403	* -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
2404	* (allocated or reserved.)
2405	* 0: successfully dissolved free hugepages or the page is not a
2406	* hugepage (considered as already dissolved)
2407	*/
2408	int dissolve_free_huge_page(struct page *page)
2409	{
2410	int rc = -EBUSY;
2411	struct folio *folio = page_folio(page);
2412
2413	retry:
2414	/ Not to disrupt normal path by vainly holding hugetlb_lock /
2415	if (!folio_test_hugetlb(folio))
2416	return `0`;
2417
2418	spin_lock_irq(lock: &hugetlb_lock);
2419	if (!folio_test_hugetlb(folio)) {
2420	rc = `0`;
2421	goto out;
2422	}
2423
2424	if (!folio_ref_count(folio)) {
2425	struct hstate *h = folio_hstate(folio);
2426	if (!available_huge_pages(h))
2427	goto out;
2428
2429	/*
2430	* We should make sure that the page is already on the free list
2431	* when it is dissolved.
2432	*/
2433	if (unlikely(!folio_test_hugetlb_freed(folio))) {
2434	spin_unlock_irq(lock: &hugetlb_lock);
2435	cond_resched();
2436
2437	/*
2438	* Theoretically, we should return -EBUSY when we
2439	* encounter this race. In fact, we have a chance
2440	* to successfully dissolve the page if we do a
2441	* retry. Because the race window is quite small.
2442	* If we seize this opportunity, it is an optimization
2443	* for increasing the success rate of dissolving page.
2444	*/
2445	goto retry;
2446	}
2447
2448	remove_hugetlb_folio(h, folio, adjust_surplus: false);
2449	h->max_huge_pages--;
2450	spin_unlock_irq(lock: &hugetlb_lock);
2451
2452	/*
2453	* Normally update_and_free_hugtlb_folio will allocate required vmemmmap
2454	* before freeing the page. update_and_free_hugtlb_folio will fail to
2455	* free the page if it can not allocate required vmemmap. We
2456	* need to adjust max_huge_pages if the page is not freed.
2457	* Attempt to allocate vmemmmap here so that we can take
2458	* appropriate action on failure.
2459	*
2460	* The folio_test_hugetlb check here is because
2461	* remove_hugetlb_folio will clear hugetlb folio flag for
2462	* non-vmemmap optimized hugetlb folios.
2463	*/
2464	if (folio_test_hugetlb(folio)) {
2465	rc = hugetlb_vmemmap_restore_folio(h, folio);
2466	if (rc) {
2467	spin_lock_irq(lock: &hugetlb_lock);
2468	add_hugetlb_folio(h, folio, adjust_surplus: false);
2469	h->max_huge_pages++;
2470	goto out;
2471	}
2472	} else
2473	rc = `0`;
2474
2475	update_and_free_hugetlb_folio(h, folio, atomic: false);
2476	return rc;
2477	}
2478	out:
2479	spin_unlock_irq(lock: &hugetlb_lock);
2480	return rc;
2481	}
2482
2483	/*
2484	* Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2485	* make specified memory blocks removable from the system.
2486	* Note that this will dissolve a free gigantic hugepage completely, if any
2487	* part of it lies within the given range.
2488	* Also note that if dissolve_free_huge_page() returns with an error, all
2489	* free hugepages that were dissolved before that error are lost.
2490	*/
2491	int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2492	{
2493	unsigned long pfn;
2494	struct page *page;
2495	int rc = `0`;
2496	unsigned int order;
2497	struct hstate *h;
2498
2499	if (!hugepages_supported())
2500	return rc;
2501
2502	order = huge_page_order(h: &default_hstate);
2503	for_each_hstate(h)
2504	order = min(order, huge_page_order(h));
2505
2506	for (pfn = start_pfn; pfn < end_pfn; pfn += `1` << order) {
2507	page = pfn_to_page(pfn);
2508	rc = dissolve_free_huge_page(page);
2509	if (rc)
2510	break;
2511	}
2512
2513	return rc;
2514	}
2515
2516	/*
2517	* Allocates a fresh surplus page from the page allocator.
2518	*/
2519	static struct folio alloc_surplus_hugetlb_folio(struct* hstate *h,
2520	gfp_t gfp_mask, int nid, nodemask_t *nmask)
2521	{
2522	struct folio *folio = NULL;
2523
2524	if (hstate_is_gigantic(h))
2525	return NULL;
2526
2527	spin_lock_irq(lock: &hugetlb_lock);
2528	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2529	goto out_unlock;
2530	spin_unlock_irq(lock: &hugetlb_lock);
2531
2532	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2533	if (!folio)
2534	return NULL;
2535
2536	spin_lock_irq(lock: &hugetlb_lock);
2537	/*
2538	* We could have raced with the pool size change.
2539	* Double check that and simply deallocate the new page
2540	* if we would end up overcommiting the surpluses. Abuse
2541	* temporary page to workaround the nasty free_huge_folio
2542	* codeflow
2543	*/
2544	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2545	folio_set_hugetlb_temporary(folio);
2546	spin_unlock_irq(lock: &hugetlb_lock);
2547	free_huge_folio(folio);
2548	return NULL;
2549	}
2550
2551	h->surplus_huge_pages++;
2552	h->surplus_huge_pages_node[folio_nid(folio)]++;
2553
2554	out_unlock:
2555	spin_unlock_irq(lock: &hugetlb_lock);
2556
2557	return folio;
2558	}
2559
2560	static struct folio alloc_migrate_hugetlb_folio(struct* hstate *h, gfp_t gfp_mask,
2561	int nid, nodemask_t *nmask)
2562	{
2563	struct folio *folio;
2564
2565	if (hstate_is_gigantic(h))
2566	return NULL;
2567
2568	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2569	if (!folio)
2570	return NULL;
2571
2572	/ fresh huge pages are frozen /
2573	folio_ref_unfreeze(folio, count: `1`);
2574	/*
2575	* We do not account these pages as surplus because they are only
2576	* temporary and will be released properly on the last reference
2577	*/
2578	folio_set_hugetlb_temporary(folio);
2579
2580	return folio;
2581	}
2582
2583	/*
2584	* Use the VMA's mpolicy to allocate a huge page from the buddy.
2585	*/
2586	static
2587	struct folio alloc_buddy_hugetlb_folio_with_mpol(struct* hstate *h,
2588	struct vm_area_struct vma, unsigned* long addr)
2589	{
2590	struct folio *folio = NULL;
2591	struct mempolicy *mpol;
2592	gfp_t gfp_mask = htlb_alloc_mask(h);
2593	int nid;
2594	nodemask_t *nodemask;
2595
2596	nid = huge_node(vma, addr, gfp_flags: gfp_mask, mpol: &mpol, nodemask: &nodemask);
2597	if (mpol_is_preferred_many(pol: mpol)) {
2598	gfp_t gfp = gfp_mask \| __GFP_NOWARN;
2599
2600	gfp &= ~(__GFP_DIRECT_RECLAIM \| __GFP_NOFAIL);
2601	folio = alloc_surplus_hugetlb_folio(h, gfp_mask: gfp, nid, nmask: nodemask);
2602
2603	/ Fallback to all nodes if page==NULL /
2604	nodemask = NULL;
2605	}
2606
2607	if (!folio)
2608	folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nmask: nodemask);
2609	mpol_cond_put(pol: mpol);
2610	return folio;
2611	}
2612
2613	/ folio migration callback function /
2614	struct folio alloc_hugetlb_folio_nodemask(struct* hstate h, int* preferred_nid,
2615	nodemask_t *nmask, gfp_t gfp_mask)
2616	{
2617	spin_lock_irq(lock: &hugetlb_lock);
2618	if (available_huge_pages(h)) {
2619	struct folio *folio;
2620
2621	folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
2622	nid: preferred_nid, nmask);
2623	if (folio) {
2624	spin_unlock_irq(lock: &hugetlb_lock);
2625	return folio;
2626	}
2627	}
2628	spin_unlock_irq(lock: &hugetlb_lock);
2629
2630	return alloc_migrate_hugetlb_folio(h, gfp_mask, nid: preferred_nid, nmask);
2631	}
2632
2633	/*
2634	* Increase the hugetlb pool such that it can accommodate a reservation
2635	* of size 'delta'.
2636	*/
2637	static int gather_surplus_pages(struct hstate h, long* delta)
2638	__must_hold(&hugetlb_lock)
2639	{
2640	LIST_HEAD(surplus_list);
2641	struct folio folio, tmp;
2642	int ret;
2643	long i;
2644	long needed, allocated;
2645	bool alloc_ok = true;
2646
2647	lockdep_assert_held(&hugetlb_lock);
2648	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2649	if (needed <= `0`) {
2650	h->resv_huge_pages += delta;
2651	return `0`;
2652	}
2653
2654	allocated = `0`;
2655
2656	ret = -ENOMEM;
2657	retry:
2658	spin_unlock_irq(lock: &hugetlb_lock);
2659	for (i = `0`; i < needed; i++) {
2660	folio = alloc_surplus_hugetlb_folio(h, gfp_mask: htlb_alloc_mask(h),
2661	NUMA_NO_NODE, NULL);
2662	if (!folio) {
2663	alloc_ok = false;
2664	break;
2665	}
2666	list_add(new: &folio->lru, head: &surplus_list);
2667	cond_resched();
2668	}
2669	allocated += i;
2670
2671	/*
2672	* After retaking hugetlb_lock, we need to recalculate 'needed'
2673	* because either resv_huge_pages or free_huge_pages may have changed.
2674	*/
2675	spin_lock_irq(lock: &hugetlb_lock);
2676	needed = (h->resv_huge_pages + delta) -
2677	(h->free_huge_pages + allocated);
2678	if (needed > `0`) {
2679	if (alloc_ok)
2680	goto retry;
2681	/*
2682	* We were not able to allocate enough pages to
2683	* satisfy the entire reservation so we free what
2684	* we've allocated so far.
2685	*/
2686	goto free;
2687	}
2688	/*
2689	* The surplus_list now contains _at_least_ the number of extra pages
2690	* needed to accommodate the reservation. Add the appropriate number
2691	* of pages to the hugetlb pool and free the extras back to the buddy
2692	* allocator. Commit the entire reservation here to prevent another
2693	* process from stealing the pages as they are added to the pool but
2694	* before they are reserved.
2695	*/
2696	needed += allocated;
2697	h->resv_huge_pages += delta;
2698	ret = `0`;
2699
2700	/ Free the needed pages to the hugetlb pool /
2701	list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
2702	if ((--needed) < `0`)
2703	break;
2704	/ Add the page to the hugetlb allocator /
2705	enqueue_hugetlb_folio(h, folio);
2706	}
2707	free:
2708	spin_unlock_irq(lock: &hugetlb_lock);
2709
2710	/*
2711	* Free unnecessary surplus pages to the buddy allocator.
2712	* Pages have no ref count, call free_huge_folio directly.
2713	*/
2714	list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
2715	free_huge_folio(folio);
2716	spin_lock_irq(lock: &hugetlb_lock);
2717
2718	return ret;
2719	}
2720
2721	/*
2722	* This routine has two main purposes:
2723	* 1) Decrement the reservation count (resv_huge_pages) by the value passed
2724	* in unused_resv_pages. This corresponds to the prior adjustments made
2725	* to the associated reservation map.
2726	* 2) Free any unused surplus pages that may have been allocated to satisfy
2727	* the reservation. As many as unused_resv_pages may be freed.
2728	*/
2729	static void return_unused_surplus_pages(struct hstate *h,
2730	unsigned long unused_resv_pages)
2731	{
2732	unsigned long nr_pages;
2733	LIST_HEAD(page_list);
2734
2735	lockdep_assert_held(&hugetlb_lock);
2736	/ Uncommit the reservation /
2737	h->resv_huge_pages -= unused_resv_pages;
2738
2739	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
2740	goto out;
2741
2742	/*
2743	* Part (or even all) of the reservation could have been backed
2744	* by pre-allocated pages. Only free surplus pages.
2745	*/
2746	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2747
2748	/*
2749	* We want to release as many surplus pages as possible, spread
2750	* evenly across all nodes with memory. Iterate across these nodes
2751	* until we can no longer free unreserved surplus pages. This occurs
2752	* when the nodes with surplus pages have no free pages.
2753	* remove_pool_hugetlb_folio() will balance the freed pages across the
2754	* on-line nodes with memory and will handle the hstate accounting.
2755	*/
2756	while (nr_pages--) {
2757	struct folio *folio;
2758
2759	folio = remove_pool_hugetlb_folio(h, nodes_allowed: &node_states[N_MEMORY], acct_surplus: `1`);
2760	if (!folio)
2761	goto out;
2762
2763	list_add(new: &folio->lru, head: &page_list);
2764	}
2765
2766	out:
2767	spin_unlock_irq(lock: &hugetlb_lock);
2768	update_and_free_pages_bulk(h, folio_list: &page_list);
2769	spin_lock_irq(lock: &hugetlb_lock);
2770	}
2771
2772
/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed.  vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 *
 * vma_add_reservation is used in error paths where a reservation must
 * be restored when a newly allocated huge page must be freed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 *
 * vma_del_reservation is used in error paths where an entry in the reserve
 * map was created during huge page allocation and must be removed.  It is to
 * be called after calling vma_needs_reservation to determine if a reservation
 * exists.
 */
enum vma_resv_mode {
	VMA_NEEDS_RESV,
	VMA_COMMIT_RESV,
	VMA_END_RESV,
	VMA_ADD_RESV,
	VMA_DEL_RESV,
};
2809	static long __vma_reservation_common(struct hstate *h,
2810	struct vm_area_struct vma, unsigned* long addr,
2811	enum vma_resv_mode mode)
2812	{
2813	struct resv_map *resv;
2814	pgoff_t idx;
2815	long ret;
2816	long dummy_out_regions_needed;
2817
2818	resv = vma_resv_map(vma);
2819	if (!resv)
2820	return `1`;
2821
2822	idx = vma_hugecache_offset(h, vma, address: addr);
2823	switch (mode) {
2824	case VMA_NEEDS_RESV:
2825	ret = region_chg(resv, f: idx, t: idx + `1`, out_regions_needed: &dummy_out_regions_needed);
2826	/ We assume that vma_reservation_* routines always operate on*
2827	* 1 page, and that adding to resv map a 1 page entry can only
2828	* ever require 1 region.
2829	*/
2830	VM_BUG_ON(dummy_out_regions_needed != `1`);
2831	break;
2832	case VMA_COMMIT_RESV:
2833	ret = region_add(resv, f: idx, t: idx + `1`, in_regions_needed: `1`, NULL, NULL);
2834	/ region_add calls of range 1 should never fail. /
2835	VM_BUG_ON(ret < `0`);
2836	break;
2837	case VMA_END_RESV:
2838	region_abort(resv, f: idx, t: idx + `1`, regions_needed: `1`);
2839	ret = `0`;
2840	break;
2841	case VMA_ADD_RESV:
2842	if (vma->vm_flags & VM_MAYSHARE) {
2843	ret = region_add(resv, f: idx, t: idx + `1`, in_regions_needed: `1`, NULL, NULL);
2844	/ region_add calls of range 1 should never fail. /
2845	VM_BUG_ON(ret < `0`);
2846	} else {
2847	region_abort(resv, f: idx, t: idx + `1`, regions_needed: `1`);
2848	ret = region_del(resv, f: idx, t: idx + `1`);
2849	}
2850	break;
2851	case VMA_DEL_RESV:
2852	if (vma->vm_flags & VM_MAYSHARE) {
2853	region_abort(resv, f: idx, t: idx + `1`, regions_needed: `1`);
2854	ret = region_del(resv, f: idx, t: idx + `1`);
2855	} else {
2856	ret = region_add(resv, f: idx, t: idx + `1`, in_regions_needed: `1`, NULL, NULL);
2857	/ region_add calls of range 1 should never fail. /
2858	VM_BUG_ON(ret < `0`);
2859	}
2860	break;
2861	default:
2862	BUG();
2863	}
2864
2865	if (vma->vm_flags & VM_MAYSHARE \|\| mode == VMA_DEL_RESV)
2866	return ret;
2867	/*
2868	* We know private mapping must have HPAGE_RESV_OWNER set.
2869	*
2870	* In most cases, reserves always exist for private mappings.
2871	* However, a file associated with mapping could have been
2872	* hole punched or truncated after reserves were consumed.
2873	* As subsequent fault on such a range will not use reserves.
2874	* Subtle - The reserve map for private mappings has the
2875	* opposite meaning than that of shared mappings. If NO
2876	* entry is in the reserve map, it means a reservation exists.
2877	* If an entry exists in the reserve map, it means the
2878	* reservation has already been consumed. As a result, the
2879	* return value of this routine is the opposite of the
2880	* value returned from reserve map manipulation routines above.
2881	*/
2882	if (ret > `0`)
2883	return `0`;
2884	if (ret == `0`)
2885	return `1`;
2886	return ret;
2887	}
2888
2889	static long vma_needs_reservation(struct hstate *h,
2890	struct vm_area_struct vma, unsigned* long addr)
2891	{
2892	return __vma_reservation_common(h, vma, addr, mode: VMA_NEEDS_RESV);
2893	}
2894
2895	static long vma_commit_reservation(struct hstate *h,
2896	struct vm_area_struct vma, unsigned* long addr)
2897	{
2898	return __vma_reservation_common(h, vma, addr, mode: VMA_COMMIT_RESV);
2899	}
2900
2901	static void vma_end_reservation(struct hstate *h,
2902	struct vm_area_struct vma, unsigned* long addr)
2903	{
2904	(void)__vma_reservation_common(h, vma, addr, mode: VMA_END_RESV);
2905	}
2906
2907	static long vma_add_reservation(struct hstate *h,
2908	struct vm_area_struct vma, unsigned* long addr)
2909	{
2910	return __vma_reservation_common(h, vma, addr, mode: VMA_ADD_RESV);
2911	}
2912
2913	static long vma_del_reservation(struct hstate *h,
2914	struct vm_area_struct vma, unsigned* long addr)
2915	{
2916	return __vma_reservation_common(h, vma, addr, mode: VMA_DEL_RESV);
2917	}
2918
2919	/*
2920	* This routine is called to restore reservation information on error paths.
2921	* It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
2922	* and the hugetlb mutex should remain held when calling this routine.
2923	*
2924	* It handles two specific cases:
2925	* 1) A reservation was in place and the folio consumed the reservation.
2926	* hugetlb_restore_reserve is set in the folio.
2927	* 2) No reservation was in place for the page, so hugetlb_restore_reserve is
2928	* not set. However, alloc_hugetlb_folio always updates the reserve map.
2929	*
2930	* In case 1, free_huge_folio later in the error path will increment the
2931	* global reserve count. But, free_huge_folio does not have enough context
2932	* to adjust the reservation map. This case deals primarily with private
2933	* mappings. Adjust the reserve map here to be consistent with global
2934	* reserve count adjustments to be made by free_huge_folio. Make sure the
2935	* reserve map indicates there is a reservation present.
2936	*
2937	* In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
2938	*/
2939	void restore_reserve_on_error(struct hstate h, struct* vm_area_struct *vma,
2940	unsigned long address, struct folio *folio)
2941	{
2942	long rc = vma_needs_reservation(h, vma, addr: address);
2943
2944	if (folio_test_hugetlb_restore_reserve(folio)) {
2945	if (unlikely(rc < `0`))
2946	/*
2947	* Rare out of memory condition in reserve map
2948	* manipulation. Clear hugetlb_restore_reserve so
2949	* that global reserve count will not be incremented
2950	* by free_huge_folio. This will make it appear
2951	* as though the reservation for this folio was
2952	* consumed. This may prevent the task from
2953	* faulting in the folio at a later time. This
2954	* is better than inconsistent global huge page
2955	* accounting of reserve counts.
2956	*/
2957	folio_clear_hugetlb_restore_reserve(folio);
2958	else if (rc)
2959	(void)vma_add_reservation(h, vma, addr: address);
2960	else
2961	vma_end_reservation(h, vma, addr: address);
2962	} else {
2963	if (!rc) {
2964	/*
2965	* This indicates there is an entry in the reserve map
2966	* not added by alloc_hugetlb_folio. We know it was added
2967	* before the alloc_hugetlb_folio call, otherwise
2968	* hugetlb_restore_reserve would be set on the folio.
2969	* Remove the entry so that a subsequent allocation
2970	* does not consume a reservation.
2971	*/
2972	rc = vma_del_reservation(h, vma, addr: address);
2973	if (rc < `0`)
2974	/*
2975	* VERY rare out of memory condition. Since
2976	* we can not delete the entry, set
2977	* hugetlb_restore_reserve so that the reserve
2978	* count will be incremented when the folio
2979	* is freed. This reserve will be consumed
2980	* on a subsequent allocation.
2981	*/
2982	folio_set_hugetlb_restore_reserve(folio);
2983	} else if (rc < `0`) {
2984	/*
2985	* Rare out of memory condition from
2986	* vma_needs_reservation call. Memory allocation is
2987	* only attempted if a new entry is needed. Therefore,
2988	* this implies there is not an entry in the
2989	* reserve map.
2990	*
2991	* For shared mappings, no entry in the map indicates
2992	* no reservation. We are done.
2993	*/
2994	if (!(vma->vm_flags & VM_MAYSHARE))
2995	/*
2996	* For private mappings, no entry indicates
2997	* a reservation is present. Since we can
2998	* not add an entry, set hugetlb_restore_reserve
2999	* on the folio so reserve count will be
3000	* incremented when freed. This reserve will
3001	* be consumed on a subsequent allocation.
3002	*/
3003	folio_set_hugetlb_restore_reserve(folio);
3004	} else
3005	/*
3006	* No reservation present, do nothing
3007	*/
3008	vma_end_reservation(h, vma, addr: address);
3009	}
3010	}
3011
3012	/*
3013	* alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
3014	* the old one
3015	* @h: struct hstate old page belongs to
3016	* @old_folio: Old folio to dissolve
3017	* @list: List to isolate the page in case we need to
3018	* Returns 0 on success, otherwise negated error.
3019	*/
3020	static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
3021	struct folio old_folio, struct* list_head *list)
3022	{
3023	gfp_t gfp_mask = htlb_alloc_mask(h) \| __GFP_THISNODE;
3024	int nid = folio_nid(folio: old_folio);
3025	struct folio *new_folio;
3026	int ret = `0`;
3027
3028	/*
3029	* Before dissolving the folio, we need to allocate a new one for the
3030	* pool to remain stable. Here, we allocate the folio and 'prep' it
3031	* by doing everything but actually updating counters and adding to
3032	* the pool. This simplifies and let us do most of the processing
3033	* under the lock.
3034	*/
3035	new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
3036	if (!new_folio)
3037	return -ENOMEM;
3038	__prep_new_hugetlb_folio(h, folio: new_folio);
3039
3040	retry:
3041	spin_lock_irq(lock: &hugetlb_lock);
3042	if (!folio_test_hugetlb(folio: old_folio)) {
3043	/*
3044	* Freed from under us. Drop new_folio too.
3045	*/
3046	goto free_new;
3047	} else if (folio_ref_count(folio: old_folio)) {
3048	bool isolated;
3049
3050	/*
3051	* Someone has grabbed the folio, try to isolate it here.
3052	* Fail with -EBUSY if not possible.
3053	*/
3054	spin_unlock_irq(lock: &hugetlb_lock);
3055	isolated = isolate_hugetlb(folio: old_folio, list);
3056	ret = isolated ? `0` : -EBUSY;
3057	spin_lock_irq(lock: &hugetlb_lock);
3058	goto free_new;
3059	} else if (!folio_test_hugetlb_freed(folio: old_folio)) {
3060	/*
3061	* Folio's refcount is 0 but it has not been enqueued in the
3062	* freelist yet. Race window is small, so we can succeed here if
3063	* we retry.
3064	*/
3065	spin_unlock_irq(lock: &hugetlb_lock);
3066	cond_resched();
3067	goto retry;
3068	} else {
3069	/*
3070	* Ok, old_folio is still a genuine free hugepage. Remove it from
3071	* the freelist and decrease the counters. These will be
3072	* incremented again when calling __prep_account_new_huge_page()
3073	* and enqueue_hugetlb_folio() for new_folio. The counters will
3074	* remain stable since this happens under the lock.
3075	*/
3076	remove_hugetlb_folio(h, folio: old_folio, adjust_surplus: false);
3077
3078	/*
3079	* Ref count on new_folio is already zero as it was dropped
3080	* earlier. It can be directly added to the pool free list.
3081	*/
3082	__prep_account_new_huge_page(h, nid);
3083	enqueue_hugetlb_folio(h, folio: new_folio);
3084
3085	/*
3086	* Folio has been replaced, we can safely free the old one.
3087	*/
3088	spin_unlock_irq(lock: &hugetlb_lock);
3089	update_and_free_hugetlb_folio(h, folio: old_folio, atomic: false);
3090	}
3091
3092	return ret;
3093
3094	free_new:
3095	spin_unlock_irq(lock: &hugetlb_lock);
3096	/ Folio has a zero ref count, but needs a ref to be freed /
3097	folio_ref_unfreeze(folio: new_folio, count: `1`);
3098	update_and_free_hugetlb_folio(h, folio: new_folio, atomic: false);
3099
3100	return ret;
3101	}
3102
3103	int isolate_or_dissolve_huge_page(struct page page, struct* list_head *list)
3104	{
3105	struct hstate *h;
3106	struct folio *folio = page_folio(page);
3107	int ret = -EBUSY;
3108
3109	/*
3110	* The page might have been dissolved from under our feet, so make sure
3111	* to carefully check the state under the lock.
3112	* Return success when racing as if we dissolved the page ourselves.
3113	*/
3114	spin_lock_irq(lock: &hugetlb_lock);
3115	if (folio_test_hugetlb(folio)) {
3116	h = folio_hstate(folio);
3117	} else {
3118	spin_unlock_irq(lock: &hugetlb_lock);
3119	return `0`;
3120	}
3121	spin_unlock_irq(lock: &hugetlb_lock);
3122
3123	/*
3124	* Fence off gigantic pages as there is a cyclic dependency between
3125	* alloc_contig_range and them. Return -ENOMEM as this has the effect
3126	* of bailing out right away without further retrying.
3127	*/
3128	if (hstate_is_gigantic(h))
3129	return -ENOMEM;
3130
3131	if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
3132	ret = `0`;
3133	else if (!folio_ref_count(folio))
3134	ret = alloc_and_dissolve_hugetlb_folio(h, old_folio: folio, list);
3135
3136	return ret;
3137	}
3138
3139	struct folio alloc_hugetlb_folio(struct* vm_area_struct *vma,
3140	unsigned long addr, int avoid_reserve)
3141	{
3142	struct hugepage_subpool *spool = subpool_vma(vma);
3143	struct hstate *h = hstate_vma(vma);
3144	struct folio *folio;
3145	long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
3146	long gbl_chg;
3147	int memcg_charge_ret, ret, idx;
3148	struct hugetlb_cgroup *h_cg = NULL;
3149	struct mem_cgroup *memcg;
3150	bool deferred_reserve;
3151	gfp_t gfp = htlb_alloc_mask(h) \| __GFP_RETRY_MAYFAIL;
3152
3153	memcg = get_mem_cgroup_from_current();
3154	memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
3155	if (memcg_charge_ret == -ENOMEM) {
3156	mem_cgroup_put(memcg);
3157	return ERR_PTR(error: -ENOMEM);
3158	}
3159
3160	idx = hstate_index(h);
3161	/*
3162	* Examine the region/reserve map to determine if the process
3163	* has a reservation for the page to be allocated. A return
3164	* code of zero indicates a reservation exists (no change).
3165	*/
3166	map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
3167	if (map_chg < `0`) {
3168	if (!memcg_charge_ret)
3169	mem_cgroup_cancel_charge(memcg, nr_pages);
3170	mem_cgroup_put(memcg);
3171	return ERR_PTR(error: -ENOMEM);
3172	}
3173
3174	/*
3175	* Processes that did not create the mapping will have no
3176	* reserves as indicated by the region/reserve map. Check
3177	* that the allocation will not exceed the subpool limit.
3178	* Allocations for MAP_NORESERVE mappings also need to be
3179	* checked against any subpool limit.
3180	*/
3181	if (map_chg \|\| avoid_reserve) {
3182	gbl_chg = hugepage_subpool_get_pages(spool, delta: `1`);
3183	if (gbl_chg < `0`)
3184	goto out_end_reservation;
3185
3186	/*
3187	* Even though there was no reservation in the region/reserve
3188	* map, there could be reservations associated with the
3189	* subpool that can be used. This would be indicated if the
3190	* return value of hugepage_subpool_get_pages() is zero.
3191	* However, if avoid_reserve is specified we still avoid even
3192	* the subpool reservations.
3193	*/
3194	if (avoid_reserve)
3195	gbl_chg = `1`;
3196	}
3197
3198	/ If this allocation is not consuming a reservation, charge it now.*
3199	*/
3200	deferred_reserve = map_chg \|\| avoid_reserve;
3201	if (deferred_reserve) {
3202	ret = hugetlb_cgroup_charge_cgroup_rsvd(
3203	idx, nr_pages: pages_per_huge_page(h), ptr: &h_cg);
3204	if (ret)
3205	goto out_subpool_put;
3206	}
3207
3208	ret = hugetlb_cgroup_charge_cgroup(idx, nr_pages: pages_per_huge_page(h), ptr: &h_cg);
3209	if (ret)
3210	goto out_uncharge_cgroup_reservation;
3211
3212	spin_lock_irq(lock: &hugetlb_lock);
3213	/*
3214	* glb_chg is passed to indicate whether or not a page must be taken
3215	* from the global free pool (global change). gbl_chg == 0 indicates
3216	* a reservation exists for the allocation.
3217	*/
3218	folio = dequeue_hugetlb_folio_vma(h, vma, address: addr, avoid_reserve, chg: gbl_chg);
3219	if (!folio) {
3220	spin_unlock_irq(lock: &hugetlb_lock);
3221	folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
3222	if (!folio)
3223	goto out_uncharge_cgroup;
3224	spin_lock_irq(lock: &hugetlb_lock);
3225	if (!avoid_reserve && vma_has_reserves(vma, chg: gbl_chg)) {
3226	folio_set_hugetlb_restore_reserve(folio);
3227	h->resv_huge_pages--;
3228	}
3229	list_add(new: &folio->lru, head: &h->hugepage_activelist);
3230	folio_ref_unfreeze(folio, count: `1`);
3231	/ Fall through /
3232	}
3233
3234	hugetlb_cgroup_commit_charge(idx, nr_pages: pages_per_huge_page(h), h_cg, folio);
3235	/ If allocation is not consuming a reservation, also store the*
3236	* hugetlb_cgroup pointer on the page.
3237	*/
3238	if (deferred_reserve) {
3239	hugetlb_cgroup_commit_charge_rsvd(idx, nr_pages: pages_per_huge_page(h),
3240	h_cg, folio);
3241	}
3242
3243	spin_unlock_irq(lock: &hugetlb_lock);
3244
3245	hugetlb_set_folio_subpool(folio, subpool: spool);
3246
3247	map_commit = vma_commit_reservation(h, vma, addr);
3248	if (unlikely(map_chg > map_commit)) {
3249	/*
3250	* The page was added to the reservation map between
3251	* vma_needs_reservation and vma_commit_reservation.
3252	* This indicates a race with hugetlb_reserve_pages.
3253	* Adjust for the subpool count incremented above AND
3254	* in hugetlb_reserve_pages for the same page. Also,
3255	* the reservation count added in hugetlb_reserve_pages
3256	* no longer applies.
3257	*/
3258	long rsv_adjust;
3259
3260	rsv_adjust = hugepage_subpool_put_pages(spool, delta: `1`);
3261	hugetlb_acct_memory(h, delta: -rsv_adjust);
3262	if (deferred_reserve)
3263	hugetlb_cgroup_uncharge_folio_rsvd(idx: hstate_index(h),
3264	nr_pages: pages_per_huge_page(h), folio);
3265	}
3266
3267	if (!memcg_charge_ret)
3268	mem_cgroup_commit_charge(folio, memcg);
3269	mem_cgroup_put(memcg);
3270
3271	return folio;
3272
3273	out_uncharge_cgroup:
3274	hugetlb_cgroup_uncharge_cgroup(idx, nr_pages: pages_per_huge_page(h), h_cg);
3275	out_uncharge_cgroup_reservation:
3276	if (deferred_reserve)
3277	hugetlb_cgroup_uncharge_cgroup_rsvd(idx, nr_pages: pages_per_huge_page(h),
3278	h_cg);
3279	out_subpool_put:
3280	if (map_chg \|\| avoid_reserve)
3281	hugepage_subpool_put_pages(spool, delta: `1`);
3282	out_end_reservation:
3283	vma_end_reservation(h, vma, addr);
3284	if (!memcg_charge_ret)
3285	mem_cgroup_cancel_charge(memcg, nr_pages);
3286	mem_cgroup_put(memcg);
3287	return ERR_PTR(error: -ENOSPC);
3288	}
3289
3290	int alloc_bootmem_huge_page(struct hstate h, int* nid)
3291	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
3292	int __alloc_bootmem_huge_page(struct hstate h, int* nid)
3293	{
3294	struct huge_bootmem_page m = NULL; /* initialize for clang /
3295	int nr_nodes, node;
3296
3297	/ do node specific alloc /
3298	if (nid != NUMA_NO_NODE) {
3299	m = memblock_alloc_try_nid_raw(size: huge_page_size(h), align: huge_page_size(h),
3300	min_addr: `0`, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3301	if (!m)
3302	return `0`;
3303	goto found;
3304	}
3305	/ allocate from next node when distributing huge pages /
3306	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3307	m = memblock_alloc_try_nid_raw(
3308	size: huge_page_size(h), align: huge_page_size(h),
3309	min_addr: `0`, MEMBLOCK_ALLOC_ACCESSIBLE, nid: node);
3310	/*
3311	* Use the beginning of the huge page to store the
3312	* huge_bootmem_page struct (until gather_bootmem
3313	* puts them into the mem_map).
3314	*/
3315	if (!m)
3316	return `0`;
3317	goto found;
3318	}
3319
3320	found:
3321
3322	/*
3323	* Only initialize the head struct page in memmap_init_reserved_pages,
3324	* rest of the struct pages will be initialized by the HugeTLB
3325	* subsystem itself.
3326	* The head struct page is used to get folio information by the HugeTLB
3327	* subsystem like zone id and node id.
3328	*/
3329	memblock_reserved_mark_noinit(virt_to_phys(address: (void *)m + PAGE_SIZE),
3330	size: huge_page_size(h) - PAGE_SIZE);
3331	/ Put them into a private list first because mem_map is not up yet /
3332	INIT_LIST_HEAD(list: &m->list);
3333	list_add(new: &m->list, head: &huge_boot_pages);
3334	m->hstate = h;
3335	return `1`;
3336	}
3337
3338	/ Initialize [start_page:end_page_number] tail struct pages of a hugepage /
3339	static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
3340	unsigned long start_page_number,
3341	unsigned long end_page_number)
3342	{
3343	enum zone_type zone = zone_idx(folio_zone(folio));
3344	int nid = folio_nid(folio);
3345	unsigned long head_pfn = folio_pfn(folio);
3346	unsigned long pfn, end_pfn = head_pfn + end_page_number;
3347	int ret;
3348
3349	for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
3350	struct page *page = pfn_to_page(pfn);
3351
3352	__init_single_page(page, pfn, zone, nid);
3353	prep_compound_tail(head: (struct page *)folio, tail_idx: pfn - head_pfn);
3354	ret = page_ref_freeze(page, count: `1`);
3355	VM_BUG_ON(!ret);
3356	}
3357	}
3358
3359	static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
3360	struct hstate *h,
3361	unsigned long nr_pages)
3362	{
3363	int ret;
3364
3365	/ Prepare folio head /
3366	__folio_clear_reserved(folio);
3367	__folio_set_head(folio);
3368	ret = folio_ref_freeze(folio, count: `1`);
3369	VM_BUG_ON(!ret);
3370	/ Initialize the necessary tail struct pages /
3371	hugetlb_folio_init_tail_vmemmap(folio, start_page_number: `1`, end_page_number: nr_pages);
3372	prep_compound_head(page: (struct page *)folio, order: huge_page_order(h));
3373	}
3374
3375	static void __init prep_and_add_bootmem_folios(struct hstate *h,
3376	struct list_head *folio_list)
3377	{
3378	unsigned long flags;
3379	struct folio folio, tmp_f;
3380
3381	/ Send list for bulk vmemmap optimization processing /
3382	hugetlb_vmemmap_optimize_folios(h, folio_list);
3383
3384	/ Add all new pool pages to free lists in one lock cycle /
3385	spin_lock_irqsave(&hugetlb_lock, flags);
3386	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
3387	if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
3388	/*
3389	* If HVO fails, initialize all tail struct pages
3390	* We do not worry about potential long lock hold
3391	* time as this is early in boot and there should
3392	* be no contention.
3393	*/
3394	hugetlb_folio_init_tail_vmemmap(folio,
3395	HUGETLB_VMEMMAP_RESERVE_PAGES,
3396	end_page_number: pages_per_huge_page(h));
3397	}
3398	__prep_account_new_huge_page(h, nid: folio_nid(folio));
3399	enqueue_hugetlb_folio(h, folio);
3400	}
3401	spin_unlock_irqrestore(lock: &hugetlb_lock, flags);
3402	}
3403
3404	/*
3405	* Put bootmem huge pages into the standard lists after mem_map is up.
3406	* Note: This only applies to gigantic (order > MAX_ORDER) pages.
3407	*/
3408	static void __init gather_bootmem_prealloc(void)
3409	{
3410	LIST_HEAD(folio_list);
3411	struct huge_bootmem_page *m;
3412	struct hstate h = NULL, prev_h = NULL;
3413
3414	list_for_each_entry(m, &huge_boot_pages, list) {
3415	struct page *page = virt_to_page(m);
3416	struct folio folio = (void* *)page;
3417
3418	h = m->hstate;
3419	/*
3420	* It is possible to have multiple huge page sizes (hstates)
3421	* in this list. If so, process each size separately.
3422	*/
3423	if (h != prev_h && prev_h != NULL)
3424	prep_and_add_bootmem_folios(h: prev_h, folio_list: &folio_list);
3425	prev_h = h;
3426
3427	VM_BUG_ON(!hstate_is_gigantic(h));
3428	WARN_ON(folio_ref_count(folio) != `1`);
3429
3430	hugetlb_folio_init_vmemmap(folio, h,
3431	HUGETLB_VMEMMAP_RESERVE_PAGES);
3432	init_new_hugetlb_folio(h, folio);
3433	list_add(new: &folio->lru, head: &folio_list);
3434
3435	/*
3436	* We need to restore the 'stolen' pages to totalram_pages
3437	* in order to fix confusing memory reports from free(1) and
3438	* other side-effects, like CommitLimit going negative.
3439	*/
3440	adjust_managed_page_count(page, count: pages_per_huge_page(h));
3441	cond_resched();
3442	}
3443
3444	prep_and_add_bootmem_folios(h, folio_list: &folio_list);
3445	}
3446
3447	static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate h, int* nid)
3448	{
3449	unsigned long i;
3450	char buf[`32`];
3451
3452	for (i = `0`; i < h->max_huge_pages_node[nid]; ++i) {
3453	if (hstate_is_gigantic(h)) {
3454	if (!alloc_bootmem_huge_page(h, nid))
3455	break;
3456	} else {
3457	struct folio *folio;
3458	gfp_t gfp_mask = htlb_alloc_mask(h) \| __GFP_THISNODE;
3459
3460	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
3461	nmask: &node_states[N_MEMORY], NULL);
3462	if (!folio)
3463	break;
3464	free_huge_folio(folio); / free it into the hugepage allocator /
3465	}
3466	cond_resched();
3467	}
3468	if (i == h->max_huge_pages_node[nid])
3469	return;
3470
3471	string_get_size(size: huge_page_size(h), blk_size: `1`, units: STRING_UNITS_2, buf, len: `32`);
3472	pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3473	h->max_huge_pages_node[nid], buf, nid, i);
3474	h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3475	h->max_huge_pages_node[nid] = i;
3476	}
3477
3478	/*
3479	* NOTE: this routine is called in different contexts for gigantic and
3480	* non-gigantic pages.
3481	* - For gigantic pages, this is called early in the boot process and
3482	* pages are allocated from memblock allocated or something similar.
3483	* Gigantic pages are actually added to pools later with the routine
3484	* gather_bootmem_prealloc.
3485	* - For non-gigantic pages, this is called later in the boot process after
3486	* all of mm is up and functional. Pages are allocated from buddy and
3487	* then added to hugetlb pools.
3488	*/
3489	static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3490	{
3491	unsigned long i;
3492	struct folio *folio;
3493	LIST_HEAD(folio_list);
3494	nodemask_t *node_alloc_noretry;
3495	bool node_specific_alloc = false;
3496
3497	/ skip gigantic hugepages allocation if hugetlb_cma enabled /
3498	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3499	pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3500	return;
3501	}
3502
3503	/ do node specific alloc /
3504	for_each_online_node(i) {
3505	if (h->max_huge_pages_node[i] > `0`) {
3506	hugetlb_hstate_alloc_pages_onenode(h, nid: i);
3507	node_specific_alloc = true;
3508	}
3509	}
3510
3511	if (node_specific_alloc)
3512	return;
3513
3514	/ below will do all node balanced alloc /
3515	if (!hstate_is_gigantic(h)) {
3516	/*
3517	* Bit mask controlling how hard we retry per-node allocations.
3518	* Ignore errors as lower level routines can deal with
3519	* node_alloc_noretry == NULL. If this kmalloc fails at boot
3520	* time, we are likely in bigger trouble.
3521	*/
3522	node_alloc_noretry = kmalloc(size: sizeof(*node_alloc_noretry),
3523	GFP_KERNEL);
3524	} else {
3525	/ allocations done at boot time /
3526	node_alloc_noretry = NULL;
3527	}
3528
3529	/ bit mask controlling how hard we retry per-node allocations /
3530	if (node_alloc_noretry)
3531	nodes_clear(*node_alloc_noretry);
3532
3533	for (i = `0`; i < h->max_huge_pages; ++i) {
3534	if (hstate_is_gigantic(h)) {
3535	/*
3536	* gigantic pages not added to list as they are not
3537	* added to pools now.
3538	*/
3539	if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3540	break;
3541	} else {
3542	folio = alloc_pool_huge_folio(h, nodes_allowed: &node_states[N_MEMORY],
3543	node_alloc_noretry);
3544	if (!folio)
3545	break;
3546	list_add(new: &folio->lru, head: &folio_list);
3547	}
3548	cond_resched();
3549	}
3550
3551	/ list will be empty if hstate_is_gigantic /
3552	prep_and_add_allocated_folios(h, folio_list: &folio_list);
3553
3554	if (i < h->max_huge_pages) {
3555	char buf[`32`];
3556
3557	string_get_size(size: huge_page_size(h), blk_size: `1`, units: STRING_UNITS_2, buf, len: `32`);
3558	pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3559	h->max_huge_pages, buf, i);
3560	h->max_huge_pages = i;
3561	}
3562	kfree(objp: node_alloc_noretry);
3563	}
3564
3565	static void __init hugetlb_init_hstates(void)
3566	{
3567	struct hstate h, h2;
3568
3569	for_each_hstate(h) {
3570	/ oversize hugepages were init'ed in early boot /
3571	if (!hstate_is_gigantic(h))
3572	hugetlb_hstate_alloc_pages(h);
3573
3574	/*
3575	* Set demote order for each hstate. Note that
3576	* h->demote_order is initially 0.
3577	* - We can not demote gigantic pages if runtime freeing
3578	* is not supported, so skip this.
3579	* - If CMA allocation is possible, we can not demote
3580	* HUGETLB_PAGE_ORDER or smaller size pages.
3581	*/
3582	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3583	continue;
3584	if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3585	continue;
3586	for_each_hstate(h2) {
3587	if (h2 == h)
3588	continue;
3589	if (h2->order < h->order &&
3590	h2->order > h->demote_order)
3591	h->demote_order = h2->order;
3592	}
3593	}
3594	}
3595
3596	static void __init report_hugepages(void)
3597	{
3598	struct hstate *h;
3599
3600	for_each_hstate(h) {
3601	char buf[`32`];
3602
3603	string_get_size(size: huge_page_size(h), blk_size: `1`, units: STRING_UNITS_2, buf, len: `32`);
3604	pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3605	buf, h->free_huge_pages);
3606	pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3607	hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
3608	}
3609	}
3610
3611	#ifdef CONFIG_HIGHMEM
3612	static void try_to_free_low(struct hstate h, unsigned* long count,
3613	nodemask_t *nodes_allowed)
3614	{
3615	int i;
3616	LIST_HEAD(page_list);
3617
3618	lockdep_assert_held(&hugetlb_lock);
3619	if (hstate_is_gigantic(h))
3620	return;
3621
3622	/*
3623	* Collect pages to be freed on a list, and free after dropping lock
3624	*/
3625	for_each_node_mask(i, *nodes_allowed) {
3626	struct folio folio, next;
3627	struct list_head *freel = &h->hugepage_freelists[i];
3628	list_for_each_entry_safe(folio, next, freel, lru) {
3629	if (count >= h->nr_huge_pages)
3630	goto out;
3631	if (folio_test_highmem(folio))
3632	continue;
3633	remove_hugetlb_folio(h, folio, false);
3634	list_add(&folio->lru, &page_list);
3635	}
3636	}
3637
3638	out:
3639	spin_unlock_irq(&hugetlb_lock);
3640	update_and_free_pages_bulk(h, &page_list);
3641	spin_lock_irq(&hugetlb_lock);
3642	}
3643	#else
3644	static inline void try_to_free_low(struct hstate h, unsigned* long count,
3645	nodemask_t *nodes_allowed)
3646	{
3647	}
3648	#endif
3649
3650	/*
3651	* Increment or decrement surplus_huge_pages. Keep node-specific counters
3652	* balanced by operating on them in a round-robin fashion.
3653	* Returns 1 if an adjustment was made.
3654	*/
3655	static int adjust_pool_surplus(struct hstate h, nodemask_t nodes_allowed,
3656	int delta)
3657	{
3658	int nr_nodes, node;
3659
3660	lockdep_assert_held(&hugetlb_lock);
3661	VM_BUG_ON(delta != -`1` && delta != `1`);
3662
3663	if (delta < `0`) {
3664	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3665	if (h->surplus_huge_pages_node[node])
3666	goto found;
3667	}
3668	} else {
3669	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3670	if (h->surplus_huge_pages_node[node] <
3671	h->nr_huge_pages_node[node])
3672	goto found;
3673	}
3674	}
3675	return `0`;
3676
3677	found:
3678	h->surplus_huge_pages += delta;
3679	h->surplus_huge_pages_node[node] += delta;
3680	return `1`;
3681	}
3682
3683	#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3684	static int set_max_huge_pages(struct hstate h, unsigned* long count, int nid,
3685	nodemask_t *nodes_allowed)
3686	{
3687	unsigned long min_count;
3688	unsigned long allocated;
3689	struct folio *folio;
3690	LIST_HEAD(page_list);
3691	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3692
3693	/*
3694	* Bit mask controlling how hard we retry per-node allocations.
3695	* If we can not allocate the bit mask, do not attempt to allocate
3696	* the requested huge pages.
3697	*/
3698	if (node_alloc_noretry)
3699	nodes_clear(*node_alloc_noretry);
3700	else
3701	return -ENOMEM;
3702
3703	/*
3704	* resize_lock mutex prevents concurrent adjustments to number of
3705	* pages in hstate via the proc/sysfs interfaces.
3706	*/
3707	mutex_lock(&h->resize_lock);
3708	flush_free_hpage_work(h);
3709	spin_lock_irq(lock: &hugetlb_lock);
3710
3711	/*
3712	* Check for a node specific request.
3713	* Changing node specific huge page count may require a corresponding
3714	* change to the global count. In any case, the passed node mask
3715	* (nodes_allowed) will restrict alloc/free to the specified node.
3716	*/
3717	if (nid != NUMA_NO_NODE) {
3718	unsigned long old_count = count;
3719
3720	count += persistent_huge_pages(h) -
3721	(h->nr_huge_pages_node[nid] -
3722	h->surplus_huge_pages_node[nid]);
3723	/*
3724	* User may have specified a large count value which caused the
3725	* above calculation to overflow. In this case, they wanted
3726	* to allocate as many huge pages as possible. Set count to
3727	* largest possible value to align with their intention.
3728	*/
3729	if (count < old_count)
3730	count = ULONG_MAX;
3731	}
3732
3733	/*
3734	* Gigantic pages runtime allocation depend on the capability for large
3735	* page range allocation.
3736	* If the system does not provide this feature, return an error when
3737	* the user tries to allocate gigantic pages but let the user free the
3738	* boottime allocated gigantic pages.
3739	*/
3740	if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3741	if (count > persistent_huge_pages(h)) {
3742	spin_unlock_irq(lock: &hugetlb_lock);
3743	mutex_unlock(lock: &h->resize_lock);
3744	NODEMASK_FREE(node_alloc_noretry);
3745	return -EINVAL;
3746	}
3747	/ Fall through to decrease pool /
3748	}
3749
3750	/*
3751	* Increase the pool size
3752	* First take pages out of surplus state. Then make up the
3753	* remaining difference by allocating fresh huge pages.
3754	*
3755	* We might race with alloc_surplus_hugetlb_folio() here and be unable
3756	* to convert a surplus huge page to a normal huge page. That is
3757	* not critical, though, it just means the overall size of the
3758	* pool might be one hugepage larger than it needs to be, but
3759	* within all the constraints specified by the sysctls.
3760	*/
3761	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3762	if (!adjust_pool_surplus(h, nodes_allowed, delta: -`1`))
3763	break;
3764	}
3765
3766	allocated = `0`;
3767	while (count > (persistent_huge_pages(h) + allocated)) {
3768	/*
3769	* If this allocation races such that we no longer need the
3770	* page, free_huge_folio will handle it by freeing the page
3771	* and reducing the surplus.
3772	*/
3773	spin_unlock_irq(lock: &hugetlb_lock);
3774
3775	/ yield cpu to avoid soft lockup /
3776	cond_resched();
3777
3778	folio = alloc_pool_huge_folio(h, nodes_allowed,
3779	node_alloc_noretry);
3780	if (!folio) {
3781	prep_and_add_allocated_folios(h, folio_list: &page_list);
3782	spin_lock_irq(lock: &hugetlb_lock);
3783	goto out;
3784	}
3785
3786	list_add(new: &folio->lru, head: &page_list);
3787	allocated++;
3788
3789	/ Bail for signals. Probably ctrl-c from user /
3790	if (signal_pending(current)) {
3791	prep_and_add_allocated_folios(h, folio_list: &page_list);
3792	spin_lock_irq(lock: &hugetlb_lock);
3793	goto out;
3794	}
3795
3796	spin_lock_irq(lock: &hugetlb_lock);
3797	}
3798
3799	/ Add allocated pages to the pool /
3800	if (!list_empty(head: &page_list)) {
3801	spin_unlock_irq(lock: &hugetlb_lock);
3802	prep_and_add_allocated_folios(h, folio_list: &page_list);
3803	spin_lock_irq(lock: &hugetlb_lock);
3804	}
3805
3806	/*
3807	* Decrease the pool size
3808	* First return free pages to the buddy allocator (being careful
3809	* to keep enough around to satisfy reservations). Then place
3810	* pages into surplus state as needed so the pool will shrink
3811	* to the desired size as pages become free.
3812	*
3813	* By placing pages into the surplus state independent of the
3814	* overcommit value, we are allowing the surplus pool size to
3815	* exceed overcommit. There are few sane options here. Since
3816	* alloc_surplus_hugetlb_folio() is checking the global counter,
3817	* though, we'll note that we're not allowed to exceed surplus
3818	* and won't grow the pool anywhere else. Not until one of the
3819	* sysctls are changed, or the surplus pages go out of use.
3820	*/
3821	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3822	min_count = max(count, min_count);
3823	try_to_free_low(h, count: min_count, nodes_allowed);
3824
3825	/*
3826	* Collect pages to be removed on list without dropping lock
3827	*/
3828	while (min_count < persistent_huge_pages(h)) {
3829	folio = remove_pool_hugetlb_folio(h, nodes_allowed, acct_surplus: `0`);
3830	if (!folio)
3831	break;
3832
3833	list_add(new: &folio->lru, head: &page_list);
3834	}
3835	/ free the pages after dropping lock /
3836	spin_unlock_irq(lock: &hugetlb_lock);
3837	update_and_free_pages_bulk(h, folio_list: &page_list);
3838	flush_free_hpage_work(h);
3839	spin_lock_irq(lock: &hugetlb_lock);
3840
3841	while (count < persistent_huge_pages(h)) {
3842	if (!adjust_pool_surplus(h, nodes_allowed, delta: `1`))
3843	break;
3844	}
3845	out:
3846	h->max_huge_pages = persistent_huge_pages(h);
3847	spin_unlock_irq(lock: &hugetlb_lock);
3848	mutex_unlock(lock: &h->resize_lock);
3849
3850	NODEMASK_FREE(node_alloc_noretry);
3851
3852	return `0`;
3853	}
3854
3855	static int demote_free_hugetlb_folio(struct hstate h, struct* folio *folio)
3856	{
3857	int i, nid = folio_nid(folio);
3858	struct hstate *target_hstate;
3859	struct page *subpage;
3860	struct folio *inner_folio;
3861	int rc = `0`;
3862
3863	target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3864
3865	remove_hugetlb_folio_for_demote(h, folio, adjust_surplus: false);
3866	spin_unlock_irq(lock: &hugetlb_lock);
3867
3868	/*
3869	* If vmemmap already existed for folio, the remove routine above would
3870	* have cleared the hugetlb folio flag. Hence the folio is technically
3871	* no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be
3872	* passed hugetlb folios and will BUG otherwise.
3873	*/
3874	if (folio_test_hugetlb(folio)) {
3875	rc = hugetlb_vmemmap_restore_folio(h, folio);
3876	if (rc) {
3877	/ Allocation of vmemmmap failed, we can not demote folio /
3878	spin_lock_irq(lock: &hugetlb_lock);
3879	folio_ref_unfreeze(folio, count: `1`);
3880	add_hugetlb_folio(h, folio, adjust_surplus: false);
3881	return rc;
3882	}
3883	}
3884
3885	/*
3886	* Use destroy_compound_hugetlb_folio_for_demote for all huge page
3887	* sizes as it will not ref count folios.
3888	*/
3889	destroy_compound_hugetlb_folio_for_demote(folio, order: huge_page_order(h));
3890
3891	/*
3892	* Taking target hstate mutex synchronizes with set_max_huge_pages.
3893	* Without the mutex, pages added to target hstate could be marked
3894	* as surplus.
3895	*
3896	* Note that we already hold h->resize_lock. To prevent deadlock,
3897	* use the convention of always taking larger size hstate mutex first.
3898	*/
3899	mutex_lock(&target_hstate->resize_lock);
3900	for (i = `0`; i < pages_per_huge_page(h);
3901	i += pages_per_huge_page(h: target_hstate)) {
3902	subpage = folio_page(folio, i);
3903	inner_folio = page_folio(subpage);
3904	if (hstate_is_gigantic(h: target_hstate))
3905	prep_compound_gigantic_folio_for_demote(folio: inner_folio,
3906	order: target_hstate->order);
3907	else
3908	prep_compound_page(page: subpage, order: target_hstate->order);
3909	folio_change_private(folio: inner_folio, NULL);
3910	prep_new_hugetlb_folio(h: target_hstate, folio: inner_folio, nid);
3911	free_huge_folio(folio: inner_folio);
3912	}
3913	mutex_unlock(lock: &target_hstate->resize_lock);
3914
3915	spin_lock_irq(lock: &hugetlb_lock);
3916
3917	/*
3918	* Not absolutely necessary, but for consistency update max_huge_pages
3919	* based on pool changes for the demoted page.
3920	*/
3921	h->max_huge_pages--;
3922	target_hstate->max_huge_pages +=
3923	pages_per_huge_page(h) / pages_per_huge_page(h: target_hstate);
3924
3925	return rc;
3926	}
3927
3928	static int demote_pool_huge_page(struct hstate h, nodemask_t nodes_allowed)
3929	__must_hold(&hugetlb_lock)
3930	{
3931	int nr_nodes, node;
3932	struct folio *folio;
3933
3934	lockdep_assert_held(&hugetlb_lock);
3935
3936	/ We should never get here if no demote order /
3937	if (!h->demote_order) {
3938	pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3939	return -EINVAL; / internal error /
3940	}
3941
3942	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3943	list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
3944	if (folio_test_hwpoison(folio))
3945	continue;
3946	return demote_free_hugetlb_folio(h, folio);
3947	}
3948	}
3949
3950	/*
3951	* Only way to get here is if all pages on free lists are poisoned.
3952	* Return -EBUSY so that caller will not retry.
3953	*/
3954	return -EBUSY;
3955	}
3956
/*
 * Helpers that define a "struct kobj_attribute" named <_name>_attr.
 * The _RO/_WO/plain variants expand to __ATTR_RO/__ATTR_WO/__ATTR_RW.
 */
#define HSTATE_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

#define HSTATE_ATTR_WO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_WO(_name)

#define HSTATE_ATTR(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
3965
3966	static struct kobject *hugepages_kobj;
3967	static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3968
3969	static struct hstate kobj_to_node_hstate(struct* kobject kobj, int* *nidp);
3970
3971	static struct hstate kobj_to_hstate(struct* kobject kobj, int* *nidp)
3972	{
3973	int i;
3974
3975	for (i = `0`; i < HUGE_MAX_HSTATE; i++)
3976	if (hstate_kobjs[i] == kobj) {
3977	if (nidp)
3978	*nidp = NUMA_NO_NODE;
3979	return &hstates[i];
3980	}
3981
3982	return kobj_to_node_hstate(kobj, nidp);
3983	}
3984
3985	static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3986	struct kobj_attribute attr, char* *buf)
3987	{
3988	struct hstate *h;
3989	unsigned long nr_huge_pages;
3990	int nid;
3991
3992	h = kobj_to_hstate(kobj, nidp: &nid);
3993	if (nid == NUMA_NO_NODE)
3994	nr_huge_pages = h->nr_huge_pages;
3995	else
3996	nr_huge_pages = h->nr_huge_pages_node[nid];
3997
3998	return sysfs_emit(buf, fmt: "%lu\n", nr_huge_pages);
3999	}
4000
4001	static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
4002	struct hstate h, int* nid,
4003	unsigned long count, size_t len)
4004	{
4005	int err;
4006	nodemask_t nodes_allowed, *n_mask;
4007
4008	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
4009	return -EINVAL;
4010
4011	if (nid == NUMA_NO_NODE) {
4012	/*
4013	* global hstate attribute
4014	*/
4015	if (!(obey_mempolicy &&
4016	init_nodemask_of_mempolicy(mask: &nodes_allowed)))
4017	n_mask = &node_states[N_MEMORY];
4018	else
4019	n_mask = &nodes_allowed;
4020	} else {
4021	/*
4022	* Node specific request. count adjustment happens in
4023	* set_max_huge_pages() after acquiring hugetlb_lock.
4024	*/
4025	init_nodemask_of_node(mask: &nodes_allowed, node: nid);
4026	n_mask = &nodes_allowed;
4027	}
4028
4029	err = set_max_huge_pages(h, count, nid, nodes_allowed: n_mask);
4030
4031	return err ? err : len;
4032	}
4033
4034	static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
4035	struct kobject kobj, const* char *buf,
4036	size_t len)
4037	{
4038	struct hstate *h;
4039	unsigned long count;
4040	int nid;
4041	int err;
4042
4043	err = kstrtoul(s: buf, base: `10`, res: &count);
4044	if (err)
4045	return err;
4046
4047	h = kobj_to_hstate(kobj, nidp: &nid);
4048	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
4049	}
4050
4051	static ssize_t nr_hugepages_show(struct kobject *kobj,
4052	struct kobj_attribute attr, char* *buf)
4053	{
4054	return nr_hugepages_show_common(kobj, attr, buf);
4055	}
4056
4057	static ssize_t nr_hugepages_store(struct kobject *kobj,
4058	struct kobj_attribute attr, const* char *buf, size_t len)
4059	{
4060	return nr_hugepages_store_common(obey_mempolicy: false, kobj, buf, len);
4061	}
4062	HSTATE_ATTR(nr_hugepages);
4063
4064	#ifdef CONFIG_NUMA
4065
4066	/*
4067	* hstate attribute for optionally mempolicy-based constraint on persistent
4068	* huge page alloc/free.
4069	*/
4070	static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
4071	struct kobj_attribute *attr,
4072	char *buf)
4073	{
4074	return nr_hugepages_show_common(kobj, attr, buf);
4075	}
4076
4077	static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
4078	struct kobj_attribute attr, const* char *buf, size_t len)
4079	{
4080	return nr_hugepages_store_common(obey_mempolicy: true, kobj, buf, len);
4081	}
4082	HSTATE_ATTR(nr_hugepages_mempolicy);
4083	#endif
4084
4085
4086	static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
4087	struct kobj_attribute attr, char* *buf)
4088	{
4089	struct hstate *h = kobj_to_hstate(kobj, NULL);
4090	return sysfs_emit(buf, fmt: "%lu\n", h->nr_overcommit_huge_pages);
4091	}
4092
4093	static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
4094	struct kobj_attribute attr, const* char *buf, size_t count)
4095	{
4096	int err;
4097	unsigned long input;
4098	struct hstate *h = kobj_to_hstate(kobj, NULL);
4099
4100	if (hstate_is_gigantic(h))
4101	return -EINVAL;
4102
4103	err = kstrtoul(s: buf, base: `10`, res: &input);
4104	if (err)
4105	return err;
4106
4107	spin_lock_irq(lock: &hugetlb_lock);
4108	h->nr_overcommit_huge_pages = input;
4109	spin_unlock_irq(lock: &hugetlb_lock);
4110
4111	return count;
4112	}
4113	HSTATE_ATTR(nr_overcommit_hugepages);
4114
4115	static ssize_t free_hugepages_show(struct kobject *kobj,
4116	struct kobj_attribute attr, char* *buf)
4117	{
4118	struct hstate *h;
4119	unsigned long free_huge_pages;
4120	int nid;
4121
4122	h = kobj_to_hstate(kobj, nidp: &nid);
4123	if (nid == NUMA_NO_NODE)
4124	free_huge_pages = h->free_huge_pages;
4125	else
4126	free_huge_pages = h->free_huge_pages_node[nid];
4127
4128	return sysfs_emit(buf, fmt: "%lu\n", free_huge_pages);
4129	}
4130	HSTATE_ATTR_RO(free_hugepages);
4131
4132	static ssize_t resv_hugepages_show(struct kobject *kobj,
4133	struct kobj_attribute attr, char* *buf)
4134	{
4135	struct hstate *h = kobj_to_hstate(kobj, NULL);
4136	return sysfs_emit(buf, fmt: "%lu\n", h->resv_huge_pages);
4137	}
4138	HSTATE_ATTR_RO(resv_hugepages);
4139
4140	static ssize_t surplus_hugepages_show(struct kobject *kobj,
4141	struct kobj_attribute attr, char* *buf)
4142	{
4143	struct hstate *h;
4144	unsigned long surplus_huge_pages;
4145	int nid;
4146
4147	h = kobj_to_hstate(kobj, nidp: &nid);
4148	if (nid == NUMA_NO_NODE)
4149	surplus_huge_pages = h->surplus_huge_pages;
4150	else
4151	surplus_huge_pages = h->surplus_huge_pages_node[nid];
4152
4153	return sysfs_emit(buf, fmt: "%lu\n", surplus_huge_pages);
4154	}
4155	HSTATE_ATTR_RO(surplus_hugepages);
4156
4157	static ssize_t demote_store(struct kobject *kobj,
4158	struct kobj_attribute attr, const* char *buf, size_t len)
4159	{
4160	unsigned long nr_demote;
4161	unsigned long nr_available;
4162	nodemask_t nodes_allowed, *n_mask;
4163	struct hstate *h;
4164	int err;
4165	int nid;
4166
4167	err = kstrtoul(s: buf, base: `10`, res: &nr_demote);
4168	if (err)
4169	return err;
4170	h = kobj_to_hstate(kobj, nidp: &nid);
4171
4172	if (nid != NUMA_NO_NODE) {
4173	init_nodemask_of_node(mask: &nodes_allowed, node: nid);
4174	n_mask = &nodes_allowed;
4175	} else {
4176	n_mask = &node_states[N_MEMORY];
4177	}
4178
4179	/ Synchronize with other sysfs operations modifying huge pages /
4180	mutex_lock(&h->resize_lock);
4181	spin_lock_irq(lock: &hugetlb_lock);
4182
4183	while (nr_demote) {
4184	/*
4185	* Check for available pages to demote each time thorough the
4186	* loop as demote_pool_huge_page will drop hugetlb_lock.
4187	*/
4188	if (nid != NUMA_NO_NODE)
4189	nr_available = h->free_huge_pages_node[nid];
4190	else
4191	nr_available = h->free_huge_pages;
4192	nr_available -= h->resv_huge_pages;
4193	if (!nr_available)
4194	break;
4195
4196	err = demote_pool_huge_page(h, nodes_allowed: n_mask);
4197	if (err)
4198	break;
4199
4200	nr_demote--;
4201	}
4202
4203	spin_unlock_irq(lock: &hugetlb_lock);
4204	mutex_unlock(lock: &h->resize_lock);
4205
4206	if (err)
4207	return err;
4208	return len;
4209	}
4210	HSTATE_ATTR_WO(demote);
4211
4212	static ssize_t demote_size_show(struct kobject *kobj,
4213	struct kobj_attribute attr, char* *buf)
4214	{
4215	struct hstate *h = kobj_to_hstate(kobj, NULL);
4216	unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
4217
4218	return sysfs_emit(buf, fmt: "%lukB\n", demote_size);
4219	}
4220
4221	static ssize_t demote_size_store(struct kobject *kobj,
4222	struct kobj_attribute *attr,
4223	const char *buf, size_t count)
4224	{
4225	struct hstate h, demote_hstate;
4226	unsigned long demote_size;
4227	unsigned int demote_order;
4228
4229	demote_size = (unsigned long)memparse(ptr: buf, NULL);
4230
4231	demote_hstate = size_to_hstate(size: demote_size);
4232	if (!demote_hstate)
4233	return -EINVAL;
4234	demote_order = demote_hstate->order;
4235	if (demote_order < HUGETLB_PAGE_ORDER)
4236	return -EINVAL;
4237
4238	/ demote order must be smaller than hstate order /
4239	h = kobj_to_hstate(kobj, NULL);
4240	if (demote_order >= h->order)
4241	return -EINVAL;
4242
4243	/ resize_lock synchronizes access to demote size and writes /
4244	mutex_lock(&h->resize_lock);
4245	h->demote_order = demote_order;
4246	mutex_unlock(lock: &h->resize_lock);
4247
4248	return count;
4249	}
4250	HSTATE_ATTR(demote_size);
4251
4252	static struct attribute *hstate_attrs[] = {
4253	&nr_hugepages_attr.attr,
4254	&nr_overcommit_hugepages_attr.attr,
4255	&free_hugepages_attr.attr,
4256	&resv_hugepages_attr.attr,
4257	&surplus_hugepages_attr.attr,
4258	#ifdef CONFIG_NUMA
4259	&nr_hugepages_mempolicy_attr.attr,
4260	#endif
4261	NULL,
4262	};
4263
4264	static const struct attribute_group hstate_attr_group = {
4265	.attrs = hstate_attrs,
4266	};
4267
4268	static struct attribute *hstate_demote_attrs[] = {
4269	&demote_size_attr.attr,
4270	&demote_attr.attr,
4271	NULL,
4272	};
4273
4274	static const struct attribute_group hstate_demote_attr_group = {
4275	.attrs = hstate_demote_attrs,
4276	};
4277
4278	static int hugetlb_sysfs_add_hstate(struct hstate h, struct* kobject *parent,
4279	struct kobject **hstate_kobjs,
4280	const struct attribute_group *hstate_attr_group)
4281	{
4282	int retval;
4283	int hi = hstate_index(h);
4284
4285	hstate_kobjs[hi] = kobject_create_and_add(name: h->name, parent);
4286	if (!hstate_kobjs[hi])
4287	return -ENOMEM;
4288
4289	retval = sysfs_create_group(kobj: hstate_kobjs[hi], grp: hstate_attr_group);
4290	if (retval) {
4291	kobject_put(kobj: hstate_kobjs[hi]);
4292	hstate_kobjs[hi] = NULL;
4293	return retval;
4294	}
4295
4296	if (h->demote_order) {
4297	retval = sysfs_create_group(kobj: hstate_kobjs[hi],
4298	grp: &hstate_demote_attr_group);
4299	if (retval) {
4300	pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
4301	sysfs_remove_group(kobj: hstate_kobjs[hi], grp: hstate_attr_group);
4302	kobject_put(kobj: hstate_kobjs[hi]);
4303	hstate_kobjs[hi] = NULL;
4304	return retval;
4305	}
4306	}
4307
4308	return `0`;
4309	}
4310
4311	#ifdef CONFIG_NUMA
4312	static bool hugetlb_sysfs_initialized __ro_after_init;
4313
4314	/*
4315	* node_hstate/s - associate per node hstate attributes, via their kobjects,
4316	* with node devices in node_devices[] using a parallel array. The array
4317	* index of a node device or _hstate == node id.
4318	* This is here to avoid any static dependency of the node device driver, in
4319	* the base kernel, on the hugetlb module.
4320	*/
4321	struct node_hstate {
4322	struct kobject *hugepages_kobj;
4323	struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
4324	};
4325	static struct node_hstate node_hstates[MAX_NUMNODES];
4326
4327	/*
4328	* A subset of global hstate attributes for node devices
4329	*/
4330	static struct attribute *per_node_hstate_attrs[] = {
4331	&nr_hugepages_attr.attr,
4332	&free_hugepages_attr.attr,
4333	&surplus_hugepages_attr.attr,
4334	NULL,
4335	};
4336
4337	static const struct attribute_group per_node_hstate_attr_group = {
4338	.attrs = per_node_hstate_attrs,
4339	};
4340
4341	/*
4342	* kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
4343	* Returns node id via non-NULL nidp.
4344	*/
4345	static struct hstate kobj_to_node_hstate(struct* kobject kobj, int* *nidp)
4346	{
4347	int nid;
4348
4349	for (nid = `0`; nid < nr_node_ids; nid++) {
4350	struct node_hstate *nhs = &node_hstates[nid];
4351	int i;
4352	for (i = `0`; i < HUGE_MAX_HSTATE; i++)
4353	if (nhs->hstate_kobjs[i] == kobj) {
4354	if (nidp)
4355	*nidp = nid;
4356	return &hstates[i];
4357	}
4358	}
4359
4360	BUG();
4361	return NULL;
4362	}
4363
4364	/*
4365	* Unregister hstate attributes from a single node device.
4366	* No-op if no hstate attributes attached.
4367	*/
4368	void hugetlb_unregister_node(struct node *node)
4369	{
4370	struct hstate *h;
4371	struct node_hstate *nhs = &node_hstates[node->dev.id];
4372
4373	if (!nhs->hugepages_kobj)
4374	return; / no hstate attributes /
4375
4376	for_each_hstate(h) {
4377	int idx = hstate_index(h);
4378	struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
4379
4380	if (!hstate_kobj)
4381	continue;
4382	if (h->demote_order)
4383	sysfs_remove_group(kobj: hstate_kobj, grp: &hstate_demote_attr_group);
4384	sysfs_remove_group(kobj: hstate_kobj, grp: &per_node_hstate_attr_group);
4385	kobject_put(kobj: hstate_kobj);
4386	nhs->hstate_kobjs[idx] = NULL;
4387	}
4388
4389	kobject_put(kobj: nhs->hugepages_kobj);
4390	nhs->hugepages_kobj = NULL;
4391	}
4392
4393
4394	/*
4395	* Register hstate attributes for a single node device.
4396	* No-op if attributes already registered.
4397	*/
4398	void hugetlb_register_node(struct node *node)
4399	{
4400	struct hstate *h;
4401	struct node_hstate *nhs = &node_hstates[node->dev.id];
4402	int err;
4403
4404	if (!hugetlb_sysfs_initialized)
4405	return;
4406
4407	if (nhs->hugepages_kobj)
4408	return; / already allocated /
4409
4410	nhs->hugepages_kobj = kobject_create_and_add(name: "hugepages",
4411	parent: &node->dev.kobj);
4412	if (!nhs->hugepages_kobj)
4413	return;
4414
4415	for_each_hstate(h) {
4416	err = hugetlb_sysfs_add_hstate(h, parent: nhs->hugepages_kobj,
4417	hstate_kobjs: nhs->hstate_kobjs,
4418	hstate_attr_group: &per_node_hstate_attr_group);
4419	if (err) {
4420	pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
4421	h->name, node->dev.id);
4422	hugetlb_unregister_node(node);
4423	break;
4424	}
4425	}
4426	}
4427
4428	/*
4429	* hugetlb init time: register hstate attributes for all registered node
4430	* devices of nodes that have memory. All on-line nodes should have
4431	* registered their associated device by this time.
4432	*/
4433	static void __init hugetlb_register_all_nodes(void)
4434	{
4435	int nid;
4436
4437	for_each_online_node(nid)
4438	hugetlb_register_node(node: node_devices[nid]);
4439	}
4440	#else /* !CONFIG_NUMA */
4441
4442	static struct hstate kobj_to_node_hstate(struct* kobject kobj, int* *nidp)
4443	{
4444	BUG();
4445	if (nidp)
4446	*nidp = -`1`;
4447	return NULL;
4448	}
4449
4450	static void hugetlb_register_all_nodes(void) { }
4451
4452	#endif
4453
4454	#ifdef CONFIG_CMA
4455	static void __init hugetlb_cma_check(void);
4456	#else
4457	static inline __init void hugetlb_cma_check(void)
4458	{
4459	}
4460	#endif
4461
4462	static void __init hugetlb_sysfs_init(void)
4463	{
4464	struct hstate *h;
4465	int err;
4466
4467	hugepages_kobj = kobject_create_and_add(name: "hugepages", parent: mm_kobj);
4468	if (!hugepages_kobj)
4469	return;
4470
4471	for_each_hstate(h) {
4472	err = hugetlb_sysfs_add_hstate(h, parent: hugepages_kobj,
4473	hstate_kobjs, hstate_attr_group: &hstate_attr_group);
4474	if (err)
4475	pr_err("HugeTLB: Unable to add hstate %s", h->name);
4476	}
4477
4478	#ifdef CONFIG_NUMA
4479	hugetlb_sysfs_initialized = true;
4480	#endif
4481	hugetlb_register_all_nodes();
4482	}
4483
/* hugetlb_sysctl_init() is only declared here for CONFIG_SYSCTL; otherwise it is an empty stub. */
#ifdef CONFIG_SYSCTL
static void hugetlb_sysctl_init(void);
#else
static inline void hugetlb_sysctl_init(void)
{
}
#endif
4489
4490	static int __init hugetlb_init(void)
4491	{
4492	int i;
4493
4494	BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4495	__NR_HPAGEFLAGS);
4496
4497	if (!hugepages_supported()) {
4498	if (hugetlb_max_hstate \|\| default_hstate_max_huge_pages)
4499	pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4500	return `0`;
4501	}
4502
4503	/*
4504	* Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
4505	* architectures depend on setup being done here.
4506	*/
4507	hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4508	if (!parsed_default_hugepagesz) {
4509	/*
4510	* If we did not parse a default huge page size, set
4511	* default_hstate_idx to HPAGE_SIZE hstate. And, if the
4512	* number of huge pages for this default size was implicitly
4513	* specified, set that here as well.
4514	* Note that the implicit setting will overwrite an explicit
4515	* setting. A warning will be printed in this case.
4516	*/
4517	default_hstate_idx = hstate_index(h: size_to_hstate(HPAGE_SIZE));
4518	if (default_hstate_max_huge_pages) {
4519	if (default_hstate.max_huge_pages) {
4520	char buf[`32`];
4521
4522	string_get_size(size: huge_page_size(h: &default_hstate),
4523	blk_size: `1`, units: STRING_UNITS_2, buf, len: `32`);
4524	pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4525	default_hstate.max_huge_pages, buf);
4526	pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4527	default_hstate_max_huge_pages);
4528	}
4529	default_hstate.max_huge_pages =
4530	default_hstate_max_huge_pages;
4531
4532	for_each_online_node(i)
4533	default_hstate.max_huge_pages_node[i] =
4534	default_hugepages_in_node[i];
4535	}
4536	}
4537
4538	hugetlb_cma_check();
4539	hugetlb_init_hstates();
4540	gather_bootmem_prealloc();
4541	report_hugepages();
4542
4543	hugetlb_sysfs_init();
4544	hugetlb_cgroup_file_init();
4545	hugetlb_sysctl_init();
4546
4547	#ifdef CONFIG_SMP
4548	num_fault_mutexes = roundup_pow_of_two(`8` * num_possible_cpus());
4549	#else
4550	num_fault_mutexes = `1`;
4551	#endif
4552	hugetlb_fault_mutex_table =
4553	kmalloc_array(n: num_fault_mutexes, size: sizeof(struct mutex),
4554	GFP_KERNEL);
4555	BUG_ON(!hugetlb_fault_mutex_table);
4556
4557	for (i = `0`; i < num_fault_mutexes; i++)
4558	mutex_init(&hugetlb_fault_mutex_table[i]);
4559	return `0`;
4560	}
4561	subsys_initcall(hugetlb_init);
4562
4563	/ Overwritten by architectures with more huge page sizes /
4564	bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4565	{
4566	return size == HPAGE_SIZE;
4567	}
4568
4569	void __init hugetlb_add_hstate(unsigned int order)
4570	{
4571	struct hstate *h;
4572	unsigned long i;
4573
4574	if (size_to_hstate(PAGE_SIZE << order)) {
4575	return;
4576	}
4577	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4578	BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
4579	h = &hstates[hugetlb_max_hstate++];
4580	mutex_init(&h->resize_lock);
4581	h->order = order;
4582	h->mask = ~(huge_page_size(h) - `1`);
4583	for (i = `0`; i < MAX_NUMNODES; ++i)
4584	INIT_LIST_HEAD(list: &h->hugepage_freelists[i]);
4585	INIT_LIST_HEAD(list: &h->hugepage_activelist);
4586	h->next_nid_to_alloc = first_memory_node;
4587	h->next_nid_to_free = first_memory_node;
4588	snprintf(buf: h->name, HSTATE_NAME_LEN, fmt: "hugepages-%lukB",
4589	huge_page_size(h)/SZ_1K);
4590
4591	parsed_hstate = h;
4592	}
4593
4594	bool __init __weak hugetlb_node_alloc_supported(void)
4595	{
4596	return true;
4597	}
4598
4599	static void __init hugepages_clear_pages_in_node(void)
4600	{
4601	if (!hugetlb_max_hstate) {
4602	default_hstate_max_huge_pages = `0`;
4603	memset(default_hugepages_in_node, `0`,
4604	sizeof(default_hugepages_in_node));
4605	} else {
4606	parsed_hstate->max_huge_pages = `0`;
4607	memset(parsed_hstate->max_huge_pages_node, `0`,
4608	sizeof(parsed_hstate->max_huge_pages_node));
4609	}
4610	}
4611
4612	/*
4613	* hugepages command line processing
4614	* hugepages normally follows a valid hugepagsz or default_hugepagsz
4615	* specification. If not, ignore the hugepages value. hugepages can also
4616	* be the first huge page command line option in which case it implicitly
4617	* specifies the number of huge pages for the default size.
4618	*/
4619	static int __init hugepages_setup(char *s)
4620	{
4621	unsigned long *mhp;
4622	static unsigned long *last_mhp;
4623	int node = NUMA_NO_NODE;
4624	int count;
4625	unsigned long tmp;
4626	char *p = s;
4627
4628	if (!parsed_valid_hugepagesz) {
4629	pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4630	parsed_valid_hugepagesz = true;
4631	return `1`;
4632	}
4633
4634	/*
4635	* !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4636	* yet, so this hugepages= parameter goes to the "default hstate".
4637	* Otherwise, it goes with the previously parsed hugepagesz or
4638	* default_hugepagesz.
4639	*/
4640	else if (!hugetlb_max_hstate)
4641	mhp = &default_hstate_max_huge_pages;
4642	else
4643	mhp = &parsed_hstate->max_huge_pages;
4644
4645	if (mhp == last_mhp) {
4646	pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4647	return `1`;
4648	}
4649
4650	while (*p) {
4651	count = `0`;
4652	if (sscanf(p, "%lu%n", &tmp, &count) != `1`)
4653	goto invalid;
4654	/ Parameter is node format /
4655	if (p[count] == `':'`) {
4656	if (!hugetlb_node_alloc_supported()) {
4657	pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4658	return `1`;
4659	}
4660	if (tmp >= MAX_NUMNODES \|\| !node_online(tmp))
4661	goto invalid;
4662	node = array_index_nospec(tmp, MAX_NUMNODES);
4663	p += count + `1`;
4664	/ Parse hugepages /
4665	if (sscanf(p, "%lu%n", &tmp, &count) != `1`)
4666	goto invalid;
4667	if (!hugetlb_max_hstate)
4668	default_hugepages_in_node[node] = tmp;
4669	else
4670	parsed_hstate->max_huge_pages_node[node] = tmp;
4671	*mhp += tmp;
4672	/ Go to parse next node/
4673	if (p[count] == `','`)
4674	p += count + `1`;
4675	else
4676	break;
4677	} else {
4678	if (p != s)
4679	goto invalid;
4680	*mhp = tmp;
4681	break;
4682	}
4683	}
4684
4685	/*
4686	* Global state is always initialized later in hugetlb_init.
4687	* But we need to allocate gigantic hstates here early to still
4688	* use the bootmem allocator.
4689	*/
4690	if (hugetlb_max_hstate && hstate_is_gigantic(h: parsed_hstate))
4691	hugetlb_hstate_alloc_pages(h: parsed_hstate);
4692
4693	last_mhp = mhp;
4694
4695	return `1`;
4696
4697	invalid:
4698	pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4699	hugepages_clear_pages_in_node();
4700	return `1`;
4701	}
4702	__setup("hugepages=", hugepages_setup);
4703
4704	/*
4705	* hugepagesz command line processing
4706	* A specific huge page size can only be specified once with hugepagesz.
4707	* hugepagesz is followed by hugepages on the command line. The global
4708	* variable 'parsed_valid_hugepagesz' is used to determine if prior
4709	* hugepagesz argument was valid.
4710	*/
4711	static int __init hugepagesz_setup(char *s)
4712	{
4713	unsigned long size;
4714	struct hstate *h;
4715
4716	parsed_valid_hugepagesz = false;
4717	size = (unsigned long)memparse(ptr: s, NULL);
4718
4719	if (!arch_hugetlb_valid_size(size)) {
4720	pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4721	return `1`;
4722	}
4723
4724	h = size_to_hstate(size);
4725	if (h) {
4726	/*
4727	* hstate for this size already exists. This is normally
4728	* an error, but is allowed if the existing hstate is the
4729	* default hstate. More specifically, it is only allowed if
4730	* the number of huge pages for the default hstate was not
4731	* previously specified.
4732	*/
4733	if (!parsed_default_hugepagesz \|\| h != &default_hstate \|\|
4734	default_hstate.max_huge_pages) {
4735	pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4736	return `1`;
4737	}
4738
4739	/*
4740	* No need to call hugetlb_add_hstate() as hstate already
4741	* exists. But, do set parsed_hstate so that a following
4742	* hugepages= parameter will be applied to this hstate.
4743	*/
4744	parsed_hstate = h;
4745	parsed_valid_hugepagesz = true;
4746	return `1`;
4747	}
4748
4749	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4750	parsed_valid_hugepagesz = true;
4751	return `1`;
4752	}
4753	__setup("hugepagesz=", hugepagesz_setup);
4754
4755	/*
4756	* default_hugepagesz command line input
4757	* Only one instance of default_hugepagesz allowed on command line.
4758	*/
4759	static int __init default_hugepagesz_setup(char *s)
4760	{
4761	unsigned long size;
4762	int i;
4763
4764	parsed_valid_hugepagesz = false;
4765	if (parsed_default_hugepagesz) {
4766	pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4767	return `1`;
4768	}
4769
4770	size = (unsigned long)memparse(ptr: s, NULL);
4771
4772	if (!arch_hugetlb_valid_size(size)) {
4773	pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4774	return `1`;
4775	}
4776
4777	hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4778	parsed_valid_hugepagesz = true;
4779	parsed_default_hugepagesz = true;
4780	default_hstate_idx = hstate_index(h: size_to_hstate(size));
4781
4782	/*
4783	* The number of default huge pages (for this size) could have been
4784	* specified as the first hugetlb parameter: hugepages=X. If so,
4785	* then default_hstate_max_huge_pages is set. If the default huge
4786	* page size is gigantic (> MAX_ORDER), then the pages must be
4787	* allocated here from bootmem allocator.
4788	*/
4789	if (default_hstate_max_huge_pages) {
4790	default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4791	for_each_online_node(i)
4792	default_hstate.max_huge_pages_node[i] =
4793	default_hugepages_in_node[i];
4794	if (hstate_is_gigantic(h: &default_hstate))
4795	hugetlb_hstate_alloc_pages(h: &default_hstate);
4796	default_hstate_max_huge_pages = `0`;
4797	}
4798
4799	return `1`;
4800	}
4801	__setup("default_hugepagesz=", default_hugepagesz_setup);
4802
4803	static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
4804	{
4805	#ifdef CONFIG_NUMA
4806	struct mempolicy *mpol = get_task_policy(current);
4807
4808	/*
4809	* Only enforce MPOL_BIND policy which overlaps with cpuset policy
4810	* (from policy_nodemask) specifically for hugetlb case
4811	*/
4812	if (mpol->mode == MPOL_BIND &&
4813	(apply_policy_zone(policy: mpol, zone: gfp_zone(flags: gfp)) &&
4814	cpuset_nodemask_valid_mems_allowed(nodemask: &mpol->nodes)))
4815	return &mpol->nodes;
4816	#endif
4817	return NULL;
4818	}
4819
4820	static unsigned int allowed_mems_nr(struct hstate *h)
4821	{
4822	int node;
4823	unsigned int nr = `0`;
4824	nodemask_t *mbind_nodemask;
4825	unsigned int *array = h->free_huge_pages_node;
4826	gfp_t gfp_mask = htlb_alloc_mask(h);
4827
4828	mbind_nodemask = policy_mbind_nodemask(gfp: gfp_mask);
4829	for_each_node_mask(node, cpuset_current_mems_allowed) {
4830	if (!mbind_nodemask \|\| node_isset(node, *mbind_nodemask))
4831	nr += array[node];
4832	}
4833
4834	return nr;
4835	}
4836
4837	#ifdef CONFIG_SYSCTL
4838	static int proc_hugetlb_doulongvec_minmax(struct ctl_table table, int* write,
4839	void buffer, size_t length,
4840	loff_t ppos, unsigned* long *out)
4841	{
4842	struct ctl_table dup_table;
4843
4844	/*
4845	* In order to avoid races with __do_proc_doulongvec_minmax(), we
4846	* can duplicate the @table and alter the duplicate of it.
4847	*/
4848	dup_table = *table;
4849	dup_table.data = out;
4850
4851	return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4852	}
4853
4854	static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4855	struct ctl_table table, int* write,
4856	void buffer, size_t length, loff_t *ppos)
4857	{
4858	struct hstate *h = &default_hstate;
4859	unsigned long tmp = h->max_huge_pages;
4860	int ret;
4861
4862	if (!hugepages_supported())
4863	return -EOPNOTSUPP;
4864
4865	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4866	out: &tmp);
4867	if (ret)
4868	goto out;
4869
4870	if (write)
4871	ret = __nr_hugepages_store_common(obey_mempolicy, h,
4872	NUMA_NO_NODE, count: tmp, len: *length);
4873	out:
4874	return ret;
4875	}
4876
4877	static int hugetlb_sysctl_handler(struct ctl_table table, int* write,
4878	void buffer, size_t length, loff_t *ppos)
4879	{
4880
4881	return hugetlb_sysctl_handler_common(obey_mempolicy: false, table, write,
4882	buffer, length, ppos);
4883	}
4884
4885	#ifdef CONFIG_NUMA
4886	static int hugetlb_mempolicy_sysctl_handler(struct ctl_table table, int* write,
4887	void buffer, size_t length, loff_t *ppos)
4888	{
4889	return hugetlb_sysctl_handler_common(obey_mempolicy: true, table, write,
4890	buffer, length, ppos);
4891	}
4892	#endif /* CONFIG_NUMA */
4893
4894	static int hugetlb_overcommit_handler(struct ctl_table table, int* write,
4895	void buffer, size_t length, loff_t *ppos)
4896	{
4897	struct hstate *h = &default_hstate;
4898	unsigned long tmp;
4899	int ret;
4900
4901	if (!hugepages_supported())
4902	return -EOPNOTSUPP;
4903
4904	tmp = h->nr_overcommit_huge_pages;
4905
4906	if (write && hstate_is_gigantic(h))
4907	return -EINVAL;
4908
4909	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4910	out: &tmp);
4911	if (ret)
4912	goto out;
4913
4914	if (write) {
4915	spin_lock_irq(lock: &hugetlb_lock);
4916	h->nr_overcommit_huge_pages = tmp;
4917	spin_unlock_irq(lock: &hugetlb_lock);
4918	}
4919	out:
4920	return ret;
4921	}
4922
4923	static struct ctl_table hugetlb_table[] = {
4924	{
4925	.procname = "nr_hugepages",
4926	.data = NULL,
4927	.maxlen = sizeof(unsigned long),
4928	.mode = `0644`,
4929	.proc_handler = hugetlb_sysctl_handler,
4930	},
4931	#ifdef CONFIG_NUMA
4932	{
4933	.procname = "nr_hugepages_mempolicy",
4934	.data = NULL,
4935	.maxlen = sizeof(unsigned long),
4936	.mode = `0644`,
4937	.proc_handler = &hugetlb_mempolicy_sysctl_handler,
4938	},
4939	#endif
4940	{
4941	.procname = "hugetlb_shm_group",
4942	.data = &sysctl_hugetlb_shm_group,
4943	.maxlen = sizeof(gid_t),
4944	.mode = `0644`,
4945	.proc_handler = proc_dointvec,
4946	},
4947	{
4948	.procname = "nr_overcommit_hugepages",
4949	.data = NULL,
4950	.maxlen = sizeof(unsigned long),
4951	.mode = `0644`,
4952	.proc_handler = hugetlb_overcommit_handler,
4953	},
4954	{ }
4955	};
4956
4957	static void hugetlb_sysctl_init(void)
4958	{
4959	register_sysctl_init("vm", hugetlb_table);
4960	}
4961	#endif /* CONFIG_SYSCTL */
4962
4963	void hugetlb_report_meminfo(struct seq_file *m)
4964	{
4965	struct hstate *h;
4966	unsigned long total = `0`;
4967
4968	if (!hugepages_supported())
4969	return;
4970
4971	for_each_hstate(h) {
4972	unsigned long count = h->nr_huge_pages;
4973
4974	total += huge_page_size(h) * count;
4975
4976	if (h == &default_hstate)
4977	seq_printf(m,
4978	fmt: "HugePages_Total: %5lu\n"
4979	"HugePages_Free: %5lu\n"
4980	"HugePages_Rsvd: %5lu\n"
4981	"HugePages_Surp: %5lu\n"
4982	"Hugepagesize: %8lu kB\n",
4983	count,
4984	h->free_huge_pages,
4985	h->resv_huge_pages,
4986	h->surplus_huge_pages,
4987	huge_page_size(h) / SZ_1K);
4988	}
4989
4990	seq_printf(m, fmt: "Hugetlb: %8lu kB\n", total / SZ_1K);
4991	}
4992
4993	int hugetlb_report_node_meminfo(char buf, int* len, int nid)
4994	{
4995	struct hstate *h = &default_hstate;
4996
4997	if (!hugepages_supported())
4998	return `0`;
4999
5000	return sysfs_emit_at(buf, at: len,
5001	fmt: "Node %d HugePages_Total: %5u\n"
5002	"Node %d HugePages_Free: %5u\n"
5003	"Node %d HugePages_Surp: %5u\n",
5004	nid, h->nr_huge_pages_node[nid],
5005	nid, h->free_huge_pages_node[nid],
5006	nid, h->surplus_huge_pages_node[nid]);
5007	}
5008
5009	void hugetlb_show_meminfo_node(int nid)
5010	{
5011	struct hstate *h;
5012
5013	if (!hugepages_supported())
5014	return;
5015
5016	for_each_hstate(h)
5017	printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
5018	nid,
5019	h->nr_huge_pages_node[nid],
5020	h->free_huge_pages_node[nid],
5021	h->surplus_huge_pages_node[nid],
5022	huge_page_size(h) / SZ_1K);
5023	}
5024
5025	void hugetlb_report_usage(struct seq_file m, struct* mm_struct *mm)
5026	{
5027	seq_printf(m, fmt: "HugetlbPages:\t%8lu kB\n",
5028	K(atomic_long_read(&mm->hugetlb_usage)));
5029	}
5030
5031	/ Return the number pages of memory we physically have, in PAGE_SIZE units. /
5032	unsigned long hugetlb_total_pages(void)
5033	{
5034	struct hstate *h;
5035	unsigned long nr_total_pages = `0`;
5036
5037	for_each_hstate(h)
5038	nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
5039	return nr_total_pages;
5040	}
5041
5042	static int hugetlb_acct_memory(struct hstate h, long* delta)
5043	{
5044	int ret = -ENOMEM;
5045
5046	if (!delta)
5047	return `0`;
5048
5049	spin_lock_irq(lock: &hugetlb_lock);
5050	/*
5051	* When cpuset is configured, it breaks the strict hugetlb page
5052	* reservation as the accounting is done on a global variable. Such
5053	* reservation is completely rubbish in the presence of cpuset because
5054	* the reservation is not checked against page availability for the
5055	* current cpuset. Application can still potentially OOM'ed by kernel
5056	* with lack of free htlb page in cpuset that the task is in.
5057	* Attempt to enforce strict accounting with cpuset is almost
5058	* impossible (or too ugly) because cpuset is too fluid that
5059	* task or memory node can be dynamically moved between cpusets.
5060	*
5061	* The change of semantics for shared hugetlb mapping with cpuset is
5062	* undesirable. However, in order to preserve some of the semantics,
5063	* we fall back to check against current free page availability as
5064	* a best attempt and hopefully to minimize the impact of changing
5065	* semantics that cpuset has.
5066	*
5067	* Apart from cpuset, we also have memory policy mechanism that
5068	* also determines from which node the kernel will allocate memory
5069	* in a NUMA system. So similar to cpuset, we also should consider
5070	* the memory policy of the current task. Similar to the description
5071	* above.
5072	*/
5073	if (delta > `0`) {
5074	if (gather_surplus_pages(h, delta) < `0`)
5075	goto out;
5076
5077	if (delta > allowed_mems_nr(h)) {
5078	return_unused_surplus_pages(h, unused_resv_pages: delta);
5079	goto out;
5080	}
5081	}
5082
5083	ret = `0`;
5084	if (delta < `0`)
5085	return_unused_surplus_pages(h, unused_resv_pages: (unsigned long) -delta);
5086
5087	out:
5088	spin_unlock_irq(lock: &hugetlb_lock);
5089	return ret;
5090	}
5091
5092	static void hugetlb_vm_op_open(struct vm_area_struct *vma)
5093	{
5094	struct resv_map *resv = vma_resv_map(vma);
5095
5096	/*
5097	* HPAGE_RESV_OWNER indicates a private mapping.
5098	* This new VMA should share its siblings reservation map if present.
5099	* The VMA will only ever have a valid reservation map pointer where
5100	* it is being copied for another still existing VMA. As that VMA
5101	* has a reference to the reservation map it cannot disappear until
5102	* after this open call completes. It is therefore safe to take a
5103	* new reference here without additional locking.
5104	*/
5105	if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
5106	resv_map_dup_hugetlb_cgroup_uncharge_info(resv_map: resv);
5107	kref_get(kref: &resv->refs);
5108	}
5109
5110	/*
5111	* vma_lock structure for sharable mappings is vma specific.
5112	* Clear old pointer (if copied via vm_area_dup) and allocate
5113	* new structure. Before clearing, make sure vma_lock is not
5114	* for this vma.
5115	*/
5116	if (vma->vm_flags & VM_MAYSHARE) {
5117	struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
5118
5119	if (vma_lock) {
5120	if (vma_lock->vma != vma) {
5121	vma->vm_private_data = NULL;
5122	hugetlb_vma_lock_alloc(vma);
5123	} else
5124	pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
5125	} else
5126	hugetlb_vma_lock_alloc(vma);
5127	}
5128	}
5129
5130	static void hugetlb_vm_op_close(struct vm_area_struct *vma)
5131	{
5132	struct hstate *h = hstate_vma(vma);
5133	struct resv_map *resv;
5134	struct hugepage_subpool *spool = subpool_vma(vma);
5135	unsigned long reserve, start, end;
5136	long gbl_reserve;
5137
5138	hugetlb_vma_lock_free(vma);
5139
5140	resv = vma_resv_map(vma);
5141	if (!resv \|\| !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
5142	return;
5143
5144	start = vma_hugecache_offset(h, vma, address: vma->vm_start);
5145	end = vma_hugecache_offset(h, vma, address: vma->vm_end);
5146
5147	reserve = (end - start) - region_count(resv, f: start, t: end);
5148	hugetlb_cgroup_uncharge_counter(resv, start, end);
5149	if (reserve) {
5150	/*
5151	* Decrement reserve counts. The global reserve count may be
5152	* adjusted if the subpool has a minimum size.
5153	*/
5154	gbl_reserve = hugepage_subpool_put_pages(spool, delta: reserve);
5155	hugetlb_acct_memory(h, delta: -gbl_reserve);
5156	}
5157
5158	kref_put(kref: &resv->refs, release: resv_map_release);
5159	}
5160
5161	static int hugetlb_vm_op_split(struct vm_area_struct vma, unsigned* long addr)
5162	{
5163	if (addr & ~(huge_page_mask(h: hstate_vma(vma))))
5164	return -EINVAL;
5165
5166	/*
5167	* PMD sharing is only possible for PUD_SIZE-aligned address ranges
5168	* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
5169	* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
5170	*/
5171	if (addr & ~PUD_MASK) {
5172	/*
5173	* hugetlb_vm_op_split is called right before we attempt to
5174	* split the VMA. We will need to unshare PMDs in the old and
5175	* new VMAs, so let's unshare before we split.
5176	*/
5177	unsigned long floor = addr & PUD_MASK;
5178	unsigned long ceil = floor + PUD_SIZE;
5179
5180	if (floor >= vma->vm_start && ceil <= vma->vm_end)
5181	hugetlb_unshare_pmds(vma, start: floor, end: ceil);
5182	}
5183
5184	return `0`;
5185	}
5186
/* vm_ops->pagesize: report the huge page size of the VMA's hstate. */
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
	return huge_page_size(hstate_vma(vma));
}
5191
5192	/*
5193	* We cannot handle pagefaults against hugetlb pages at all. They cause
5194	* handle_mm_fault() to try to instantiate regular-sized pages in the
5195	* hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get
5196	* this far.
5197	*/
5198	static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
5199	{
5200	BUG();
5201	return `0`;
5202	}
5203
5204	/*
5205	* When a new function is introduced to vm_operations_struct and added
5206	* to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
5207	* This is because under System V memory model, mappings created via
5208	* shmget/shmat with "huge page" specified are backed by hugetlbfs files,
5209	* their original vm_ops are overwritten with shm_vm_ops.
5210	*/
5211	const struct vm_operations_struct hugetlb_vm_ops = {
5212	.fault = hugetlb_vm_op_fault,
5213	.open = hugetlb_vm_op_open,
5214	.close = hugetlb_vm_op_close,
5215	.may_split = hugetlb_vm_op_split,
5216	.pagesize = hugetlb_vm_op_pagesize,
5217	};
5218
5219	static pte_t make_huge_pte(struct vm_area_struct vma, struct* page *page,
5220	int writable)
5221	{
5222	pte_t entry;
5223	unsigned int shift = huge_page_shift(h: hstate_vma(vma));
5224
5225	if (writable) {
5226	entry = huge_pte_mkwrite(pte: huge_pte_mkdirty(pte: mk_huge_pte(page,
5227	pgprot: vma->vm_page_prot)));
5228	} else {
5229	entry = huge_pte_wrprotect(pte: mk_huge_pte(page,
5230	pgprot: vma->vm_page_prot));
5231	}
5232	entry = pte_mkyoung(pte: entry);
5233	entry = arch_make_huge_pte(entry, shift, flags: vma->vm_flags);
5234
5235	return entry;
5236	}
5237
5238	static void set_huge_ptep_writable(struct vm_area_struct *vma,
5239	unsigned long address, pte_t *ptep)
5240	{
5241	pte_t entry;
5242
5243	entry = huge_pte_mkwrite(pte: huge_pte_mkdirty(pte: huge_ptep_get(ptep)));
5244	if (huge_ptep_set_access_flags(vma, addr: address, ptep, pte: entry, dirty: `1`))
5245	update_mmu_cache(vma, addr: address, ptep);
5246	}
5247
5248	bool is_hugetlb_entry_migration(pte_t pte)
5249	{
5250	swp_entry_t swp;
5251
5252	if (huge_pte_none(pte) \|\| pte_present(a: pte))
5253	return false;
5254	swp = pte_to_swp_entry(pte);
5255	if (is_migration_entry(entry: swp))
5256	return true;
5257	else
5258	return false;
5259	}
5260
5261	bool is_hugetlb_entry_hwpoisoned(pte_t pte)
5262	{
5263	swp_entry_t swp;
5264
5265	if (huge_pte_none(pte) \|\| pte_present(a: pte))
5266	return false;
5267	swp = pte_to_swp_entry(pte);
5268	if (is_hwpoison_entry(entry: swp))
5269	return true;
5270	else
5271	return false;
5272	}
5273
5274	static void
5275	hugetlb_install_folio(struct vm_area_struct vma, pte_t ptep, unsigned long addr,
5276	struct folio new_folio, pte_t old, unsigned* long sz)
5277	{
5278	pte_t newpte = make_huge_pte(vma, page: &new_folio->page, writable: `1`);
5279
5280	__folio_mark_uptodate(folio: new_folio);
5281	hugepage_add_new_anon_rmap(new_folio, vma, address: addr);
5282	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(pte: old))
5283	newpte = huge_pte_mkuffd_wp(pte: newpte);
5284	set_huge_pte_at(mm: vma->vm_mm, addr, ptep, pte: newpte, sz);
5285	hugetlb_count_add(l: pages_per_huge_page(h: hstate_vma(vma)), mm: vma->vm_mm);
5286	folio_set_hugetlb_migratable(folio: new_folio);
5287	}
5288
5289	int copy_hugetlb_page_range(struct mm_struct dst, struct* mm_struct *src,
5290	struct vm_area_struct *dst_vma,
5291	struct vm_area_struct *src_vma)
5292	{
5293	pte_t src_pte, dst_pte, entry;
5294	struct folio *pte_folio;
5295	unsigned long addr;
5296	bool cow = is_cow_mapping(flags: src_vma->vm_flags);
5297	struct hstate *h = hstate_vma(vma: src_vma);
5298	unsigned long sz = huge_page_size(h);
5299	unsigned long npages = pages_per_huge_page(h);
5300	struct mmu_notifier_range range;
5301	unsigned long last_addr_mask;
5302	int ret = `0`;
5303
5304	if (cow) {
5305	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: src,
5306	start: src_vma->vm_start,
5307	end: src_vma->vm_end);
5308	mmu_notifier_invalidate_range_start(range: &range);
5309	vma_assert_write_locked(vma: src_vma);
5310	raw_write_seqcount_begin(&src->write_protect_seq);
5311	} else {
5312	/*
5313	* For shared mappings the vma lock must be held before
5314	* calling hugetlb_walk() in the src vma. Otherwise, the
5315	* returned ptep could go away if part of a shared pmd and
5316	* another thread calls huge_pmd_unshare.
5317	*/
5318	hugetlb_vma_lock_read(vma: src_vma);
5319	}
5320
5321	last_addr_mask = hugetlb_mask_last_page(h);
5322	for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
5323	spinlock_t src_ptl, dst_ptl;
5324	src_pte = hugetlb_walk(vma: src_vma, addr, sz);
5325	if (!src_pte) {
5326	addr \|= last_addr_mask;
5327	continue;
5328	}
5329	dst_pte = huge_pte_alloc(mm: dst, vma: dst_vma, addr, sz);
5330	if (!dst_pte) {
5331	ret = -ENOMEM;
5332	break;
5333	}
5334
5335	/*
5336	* If the pagetables are shared don't copy or take references.
5337	*
5338	* dst_pte == src_pte is the common case of src/dest sharing.
5339	* However, src could have 'unshared' and dst shares with
5340	* another vma. So page_count of ptep page is checked instead
5341	* to reliably determine whether pte is shared.
5342	*/
5343	if (page_count(virt_to_page(dst_pte)) > `1`) {
5344	addr \|= last_addr_mask;
5345	continue;
5346	}
5347
5348	dst_ptl = huge_pte_lock(h, mm: dst, pte: dst_pte);
5349	src_ptl = huge_pte_lockptr(h, mm: src, pte: src_pte);
5350	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5351	entry = huge_ptep_get(ptep: src_pte);
5352	again:
5353	if (huge_pte_none(pte: entry)) {
5354	/*
5355	* Skip if src entry none.
5356	*/
5357	;
5358	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
5359	if (!userfaultfd_wp(vma: dst_vma))
5360	entry = huge_pte_clear_uffd_wp(pte: entry);
5361	set_huge_pte_at(mm: dst, addr, ptep: dst_pte, pte: entry, sz);
5362	} else if (unlikely(is_hugetlb_entry_migration(entry))) {
5363	swp_entry_t swp_entry = pte_to_swp_entry(pte: entry);
5364	bool uffd_wp = pte_swp_uffd_wp(pte: entry);
5365
5366	if (!is_readable_migration_entry(entry: swp_entry) && cow) {
5367	/*
5368	* COW mappings require pages in both
5369	* parent and child to be set to read.
5370	*/
5371	swp_entry = make_readable_migration_entry(
5372	offset: swp_offset(entry: swp_entry));
5373	entry = swp_entry_to_pte(entry: swp_entry);
5374	if (userfaultfd_wp(vma: src_vma) && uffd_wp)
5375	entry = pte_swp_mkuffd_wp(pte: entry);
5376	set_huge_pte_at(mm: src, addr, ptep: src_pte, pte: entry, sz);
5377	}
5378	if (!userfaultfd_wp(vma: dst_vma))
5379	entry = huge_pte_clear_uffd_wp(pte: entry);
5380	set_huge_pte_at(mm: dst, addr, ptep: dst_pte, pte: entry, sz);
5381	} else if (unlikely(is_pte_marker(entry))) {
5382	pte_marker marker = copy_pte_marker(
5383	entry: pte_to_swp_entry(pte: entry), dst_vma);
5384
5385	if (marker)
5386	set_huge_pte_at(mm: dst, addr, ptep: dst_pte,
5387	pte: make_pte_marker(marker), sz);
5388	} else {
5389	entry = huge_ptep_get(ptep: src_pte);
5390	pte_folio = page_folio(pte_page(entry));
5391	folio_get(folio: pte_folio);
5392
5393	/*
5394	* Failing to duplicate the anon rmap is a rare case
5395	* where we see pinned hugetlb pages while they're
5396	* prone to COW. We need to do the COW earlier during
5397	* fork.
5398	*
5399	* When pre-allocating the page or copying data, we
5400	* need to be without the pgtable locks since we could
5401	* sleep during the process.
5402	*/
5403	if (!folio_test_anon(folio: pte_folio)) {
5404	page_dup_file_rmap(page: &pte_folio->page, compound: true);
5405	} else if (page_try_dup_anon_rmap(page: &pte_folio->page,
5406	compound: true, vma: src_vma)) {
5407	pte_t src_pte_old = entry;
5408	struct folio *new_folio;
5409
5410	spin_unlock(lock: src_ptl);
5411	spin_unlock(lock: dst_ptl);
5412	/ Do not use reserve as it's private owned /
5413	new_folio = alloc_hugetlb_folio(vma: dst_vma, addr, avoid_reserve: `1`);
5414	if (IS_ERR(ptr: new_folio)) {
5415	folio_put(folio: pte_folio);
5416	ret = PTR_ERR(ptr: new_folio);
5417	break;
5418	}
5419	ret = copy_user_large_folio(dst: new_folio,
5420	src: pte_folio,
5421	addr_hint: addr, vma: dst_vma);
5422	folio_put(folio: pte_folio);
5423	if (ret) {
5424	folio_put(folio: new_folio);
5425	break;
5426	}
5427
5428	/ Install the new hugetlb folio if src pte stable /
5429	dst_ptl = huge_pte_lock(h, mm: dst, pte: dst_pte);
5430	src_ptl = huge_pte_lockptr(h, mm: src, pte: src_pte);
5431	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5432	entry = huge_ptep_get(ptep: src_pte);
5433	if (!pte_same(a: src_pte_old, b: entry)) {
5434	restore_reserve_on_error(h, vma: dst_vma, address: addr,
5435	folio: new_folio);
5436	folio_put(folio: new_folio);
5437	/ huge_ptep of dst_pte won't change as in child /
5438	goto again;
5439	}
5440	hugetlb_install_folio(vma: dst_vma, ptep: dst_pte, addr,
5441	new_folio, old: src_pte_old, sz);
5442	spin_unlock(lock: src_ptl);
5443	spin_unlock(lock: dst_ptl);
5444	continue;
5445	}
5446
5447	if (cow) {
5448	/*
5449	* No need to notify as we are downgrading page
5450	* table protection not changing it to point
5451	* to a new page.
5452	*
5453	* See Documentation/mm/mmu_notifier.rst
5454	*/
5455	huge_ptep_set_wrprotect(mm: src, addr, ptep: src_pte);
5456	entry = huge_pte_wrprotect(pte: entry);
5457	}
5458
5459	if (!userfaultfd_wp(vma: dst_vma))
5460	entry = huge_pte_clear_uffd_wp(pte: entry);
5461
5462	set_huge_pte_at(mm: dst, addr, ptep: dst_pte, pte: entry, sz);
5463	hugetlb_count_add(l: npages, mm: dst);
5464	}
5465	spin_unlock(lock: src_ptl);
5466	spin_unlock(lock: dst_ptl);
5467	}
5468
5469	if (cow) {
5470	raw_write_seqcount_end(&src->write_protect_seq);
5471	mmu_notifier_invalidate_range_end(range: &range);
5472	} else {
5473	hugetlb_vma_unlock_read(vma: src_vma);
5474	}
5475
5476	return ret;
5477	}
5478
5479	static void move_huge_pte(struct vm_area_struct vma, unsigned* long old_addr,
5480	unsigned long new_addr, pte_t src_pte, pte_t dst_pte,
5481	unsigned long sz)
5482	{
5483	struct hstate *h = hstate_vma(vma);
5484	struct mm_struct *mm = vma->vm_mm;
5485	spinlock_t src_ptl, dst_ptl;
5486	pte_t pte;
5487
5488	dst_ptl = huge_pte_lock(h, mm, pte: dst_pte);
5489	src_ptl = huge_pte_lockptr(h, mm, pte: src_pte);
5490
5491	/*
5492	* We don't have to worry about the ordering of src and dst ptlocks
5493	* because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
5494	*/
5495	if (src_ptl != dst_ptl)
5496	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5497
5498	pte = huge_ptep_get_and_clear(mm, addr: old_addr, ptep: src_pte);
5499	set_huge_pte_at(mm, addr: new_addr, ptep: dst_pte, pte, sz);
5500
5501	if (src_ptl != dst_ptl)
5502	spin_unlock(lock: src_ptl);
5503	spin_unlock(lock: dst_ptl);
5504	}
5505
5506	int move_hugetlb_page_tables(struct vm_area_struct *vma,
5507	struct vm_area_struct *new_vma,
5508	unsigned long old_addr, unsigned long new_addr,
5509	unsigned long len)
5510	{
5511	struct hstate *h = hstate_vma(vma);
5512	struct address_space *mapping = vma->vm_file->f_mapping;
5513	unsigned long sz = huge_page_size(h);
5514	struct mm_struct *mm = vma->vm_mm;
5515	unsigned long old_end = old_addr + len;
5516	unsigned long last_addr_mask;
5517	pte_t src_pte, dst_pte;
5518	struct mmu_notifier_range range;
5519	bool shared_pmd = false;
5520
5521	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm, start: old_addr,
5522	end: old_end);
5523	adjust_range_if_pmd_sharing_possible(vma, start: &range.start, end: &range.end);
5524	/*
5525	* In case of shared PMDs, we should cover the maximum possible
5526	* range.
5527	*/
5528	flush_cache_range(vma, start: range.start, end: range.end);
5529
5530	mmu_notifier_invalidate_range_start(range: &range);
5531	last_addr_mask = hugetlb_mask_last_page(h);
5532	/ Prevent race with file truncation /
5533	hugetlb_vma_lock_write(vma);
5534	i_mmap_lock_write(mapping);
5535	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
5536	src_pte = hugetlb_walk(vma, addr: old_addr, sz);
5537	if (!src_pte) {
5538	old_addr \|= last_addr_mask;
5539	new_addr \|= last_addr_mask;
5540	continue;
5541	}
5542	if (huge_pte_none(pte: huge_ptep_get(ptep: src_pte)))
5543	continue;
5544
5545	if (huge_pmd_unshare(mm, vma, addr: old_addr, ptep: src_pte)) {
5546	shared_pmd = true;
5547	old_addr \|= last_addr_mask;
5548	new_addr \|= last_addr_mask;
5549	continue;
5550	}
5551
5552	dst_pte = huge_pte_alloc(mm, vma: new_vma, addr: new_addr, sz);
5553	if (!dst_pte)
5554	break;
5555
5556	move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
5557	}
5558
5559	if (shared_pmd)
5560	flush_hugetlb_tlb_range(vma, range.start, range.end);
5561	else
5562	flush_hugetlb_tlb_range(vma, old_end - len, old_end);
5563	mmu_notifier_invalidate_range_end(range: &range);
5564	i_mmap_unlock_write(mapping);
5565	hugetlb_vma_unlock_write(vma);
5566
5567	return len + old_addr - old_end;
5568	}
5569
5570	void __unmap_hugepage_range(struct mmu_gather tlb, struct* vm_area_struct *vma,
5571	unsigned long start, unsigned long end,
5572	struct page *ref_page, zap_flags_t zap_flags)
5573	{
5574	struct mm_struct *mm = vma->vm_mm;
5575	unsigned long address;
5576	pte_t *ptep;
5577	pte_t pte;
5578	spinlock_t *ptl;
5579	struct page *page;
5580	struct hstate *h = hstate_vma(vma);
5581	unsigned long sz = huge_page_size(h);
5582	unsigned long last_addr_mask;
5583	bool force_flush = false;
5584
5585	WARN_ON(!is_vm_hugetlb_page(vma));
5586	BUG_ON(start & ~huge_page_mask(h));
5587	BUG_ON(end & ~huge_page_mask(h));
5588
5589	/*
5590	* This is a hugetlb vma, all the pte entries should point
5591	* to huge page.
5592	*/
5593	tlb_change_page_size(tlb, page_size: sz);
5594	tlb_start_vma(tlb, vma);
5595
5596	last_addr_mask = hugetlb_mask_last_page(h);
5597	address = start;
5598	for (; address < end; address += sz) {
5599	ptep = hugetlb_walk(vma, addr: address, sz);
5600	if (!ptep) {
5601	address \|= last_addr_mask;
5602	continue;
5603	}
5604
5605	ptl = huge_pte_lock(h, mm, pte: ptep);
5606	if (huge_pmd_unshare(mm, vma, addr: address, ptep)) {
5607	spin_unlock(lock: ptl);
5608	tlb_flush_pmd_range(tlb, address: address & PUD_MASK, PUD_SIZE);
5609	force_flush = true;
5610	address \|= last_addr_mask;
5611	continue;
5612	}
5613
5614	pte = huge_ptep_get(ptep);
5615	if (huge_pte_none(pte)) {
5616	spin_unlock(lock: ptl);
5617	continue;
5618	}
5619
5620	/*
5621	* Migrating hugepage or HWPoisoned hugepage is already
5622	* unmapped and its refcount is dropped, so just clear pte here.
5623	*/
5624	if (unlikely(!pte_present(pte))) {
5625	/*
5626	* If the pte was wr-protected by uffd-wp in any of the
5627	* swap forms, meanwhile the caller does not want to
5628	* drop the uffd-wp bit in this zap, then replace the
5629	* pte with a marker.
5630	*/
5631	if (pte_swp_uffd_wp_any(pte) &&
5632	!(zap_flags & ZAP_FLAG_DROP_MARKER))
5633	set_huge_pte_at(mm, addr: address, ptep,
5634	pte: make_pte_marker(PTE_MARKER_UFFD_WP),
5635	sz);
5636	else
5637	huge_pte_clear(mm, addr: address, ptep, sz);
5638	spin_unlock(lock: ptl);
5639	continue;
5640	}
5641
5642	page = pte_page(pte);
5643	/*
5644	* If a reference page is supplied, it is because a specific
5645	* page is being unmapped, not a range. Ensure the page we
5646	* are about to unmap is the actual page of interest.
5647	*/
5648	if (ref_page) {
5649	if (page != ref_page) {
5650	spin_unlock(lock: ptl);
5651	continue;
5652	}
5653	/*
5654	* Mark the VMA as having unmapped its page so that
5655	* future faults in this VMA will fail rather than
5656	* looking like data was lost
5657	*/
5658	set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5659	}
5660
5661	pte = huge_ptep_get_and_clear(mm, addr: address, ptep);
5662	tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5663	if (huge_pte_dirty(pte))
5664	set_page_dirty(page);
5665	/ Leave a uffd-wp pte marker if needed /
5666	if (huge_pte_uffd_wp(pte) &&
5667	!(zap_flags & ZAP_FLAG_DROP_MARKER))
5668	set_huge_pte_at(mm, addr: address, ptep,
5669	pte: make_pte_marker(PTE_MARKER_UFFD_WP),
5670	sz);
5671	hugetlb_count_sub(l: pages_per_huge_page(h), mm);
5672	page_remove_rmap(page, vma, compound: true);
5673
5674	spin_unlock(lock: ptl);
5675	tlb_remove_page_size(tlb, page, page_size: huge_page_size(h));
5676	/*
5677	* Bail out after unmapping reference page if supplied
5678	*/
5679	if (ref_page)
5680	break;
5681	}
5682	tlb_end_vma(tlb, vma);
5683
5684	/*
5685	* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5686	* could defer the flush until now, since by holding i_mmap_rwsem we
5687	* guaranteed that the last refernece would not be dropped. But we must
5688	* do the flushing before we return, as otherwise i_mmap_rwsem will be
5689	* dropped and the last reference to the shared PMDs page might be
5690	* dropped as well.
5691	*
5692	* In theory we could defer the freeing of the PMD pages as well, but
5693	* huge_pmd_unshare() relies on the exact page_count for the PMD page to
5694	* detect sharing, so we cannot defer the release of the page either.
5695	* Instead, do flush now.
5696	*/
5697	if (force_flush)
5698	tlb_flush_mmu_tlbonly(tlb);
5699	}
5700
5701	void __hugetlb_zap_begin(struct vm_area_struct *vma,
5702	unsigned long start, unsigned* long *end)
5703	{
5704	if (!vma->vm_file) / hugetlbfs_file_mmap error /
5705	return;
5706
5707	adjust_range_if_pmd_sharing_possible(vma, start, end);
5708	hugetlb_vma_lock_write(vma);
5709	if (vma->vm_file)
5710	i_mmap_lock_write(mapping: vma->vm_file->f_mapping);
5711	}
5712
5713	void __hugetlb_zap_end(struct vm_area_struct *vma,
5714	struct zap_details *details)
5715	{
5716	zap_flags_t zap_flags = details ? details->zap_flags : `0`;
5717
5718	if (!vma->vm_file) / hugetlbfs_file_mmap error /
5719	return;
5720
5721	if (zap_flags & ZAP_FLAG_UNMAP) { / final unmap /
5722	/*
5723	* Unlock and free the vma lock before releasing i_mmap_rwsem.
5724	* When the vma_lock is freed, this makes the vma ineligible
5725	* for pmd sharing. And, i_mmap_rwsem is required to set up
5726	* pmd sharing. This is important as page tables for this
5727	* unmapped range will be asynchrously deleted. If the page
5728	* tables are shared, there will be issues when accessed by
5729	* someone else.
5730	*/
5731	__hugetlb_vma_unlock_write_free(vma);
5732	} else {
5733	hugetlb_vma_unlock_write(vma);
5734	}
5735
5736	if (vma->vm_file)
5737	i_mmap_unlock_write(mapping: vma->vm_file->f_mapping);
5738	}
5739
5740	void unmap_hugepage_range(struct vm_area_struct vma, unsigned* long start,
5741	unsigned long end, struct page *ref_page,
5742	zap_flags_t zap_flags)
5743	{
5744	struct mmu_notifier_range range;
5745	struct mmu_gather tlb;
5746
5747	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
5748	start, end);
5749	adjust_range_if_pmd_sharing_possible(vma, start: &range.start, end: &range.end);
5750	mmu_notifier_invalidate_range_start(range: &range);
5751	tlb_gather_mmu(tlb: &tlb, mm: vma->vm_mm);
5752
5753	__unmap_hugepage_range(tlb: &tlb, vma, start, end, ref_page, zap_flags);
5754
5755	mmu_notifier_invalidate_range_end(range: &range);
5756	tlb_finish_mmu(tlb: &tlb);
5757	}
5758
5759	/*
5760	* This is called when the original mapper is failing to COW a MAP_PRIVATE
5761	* mapping it owns the reserve page for. The intention is to unmap the page
5762	* from other VMAs and let the children be SIGKILLed if they are faulting the
5763	* same region.
5764	*/
5765	static void unmap_ref_private(struct mm_struct mm, struct* vm_area_struct *vma,
5766	struct page page, unsigned* long address)
5767	{
5768	struct hstate *h = hstate_vma(vma);
5769	struct vm_area_struct *iter_vma;
5770	struct address_space *mapping;
5771	pgoff_t pgoff;
5772
5773	/*
5774	* vm_pgoff is in PAGE_SIZE units, hence the different calculation
5775	* from page cache lookup which is in HPAGE_SIZE units.
5776	*/
5777	address = address & huge_page_mask(h);
5778	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5779	vma->vm_pgoff;
5780	mapping = vma->vm_file->f_mapping;
5781
5782	/*
5783	* Take the mapping lock for the duration of the table walk. As
5784	* this mapping should be shared between all the VMAs,
5785	* __unmap_hugepage_range() is called as the lock is already held
5786	*/
5787	i_mmap_lock_write(mapping);
5788	vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5789	/ Do not unmap the current VMA /
5790	if (iter_vma == vma)
5791	continue;
5792
5793	/*
5794	* Shared VMAs have their own reserves and do not affect
5795	* MAP_PRIVATE accounting but it is possible that a shared
5796	* VMA is using the same page so check and skip such VMAs.
5797	*/
5798	if (iter_vma->vm_flags & VM_MAYSHARE)
5799	continue;
5800
5801	/*
5802	* Unmap the page from other VMAs without their own reserves.
5803	* They get marked to be SIGKILLed if they fault in these
5804	* areas. This is because a future no-page fault on this VMA
5805	* could insert a zeroed page instead of the data existing
5806	* from the time of fork. This would look like data corruption
5807	*/
5808	if (!is_vma_resv_set(vma: iter_vma, HPAGE_RESV_OWNER))
5809	unmap_hugepage_range(vma: iter_vma, start: address,
5810	end: address + huge_page_size(h), ref_page: page, zap_flags: `0`);
5811	}
5812	i_mmap_unlock_write(mapping);
5813	}
5814
5815	/*
5816	* hugetlb_wp() should be called with page lock of the original hugepage held.
5817	* Called with hugetlb_fault_mutex_table held and pte_page locked so we
5818	* cannot race with other handlers or page migration.
5819	* Keep the pte_same checks anyway to make transition from the mutex easier.
5820	*/
5821	static vm_fault_t hugetlb_wp(struct mm_struct mm, struct* vm_area_struct *vma,
5822	unsigned long address, pte_t ptep, unsigned* int flags,
5823	struct folio pagecache_folio, spinlock_t ptl)
5824	{
5825	const bool unshare = flags & FAULT_FLAG_UNSHARE;
5826	pte_t pte = huge_ptep_get(ptep);
5827	struct hstate *h = hstate_vma(vma);
5828	struct folio *old_folio;
5829	struct folio *new_folio;
5830	int outside_reserve = `0`;
5831	vm_fault_t ret = `0`;
5832	unsigned long haddr = address & huge_page_mask(h);
5833	struct mmu_notifier_range range;
5834
5835	/*
5836	* Never handle CoW for uffd-wp protected pages. It should be only
5837	* handled when the uffd-wp protection is removed.
5838	*
5839	* Note that only the CoW optimization path (in hugetlb_no_page())
5840	* can trigger this, because hugetlb_fault() will always resolve
5841	* uffd-wp bit first.
5842	*/
5843	if (!unshare && huge_pte_uffd_wp(pte))
5844	return `0`;
5845
5846	/*
5847	* hugetlb does not support FOLL_FORCE-style write faults that keep the
5848	* PTE mapped R/O such as maybe_mkwrite() would do.
5849	*/
5850	if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5851	return VM_FAULT_SIGSEGV;
5852
5853	/ Let's take out MAP_SHARED mappings first. /
5854	if (vma->vm_flags & VM_MAYSHARE) {
5855	set_huge_ptep_writable(vma, address: haddr, ptep);
5856	return `0`;
5857	}
5858
5859	old_folio = page_folio(pte_page(pte));
5860
5861	delayacct_wpcopy_start();
5862
5863	retry_avoidcopy:
5864	/*
5865	* If no-one else is actually using this page, we're the exclusive
5866	* owner and can reuse this page.
5867	*/
5868	if (folio_mapcount(folio: old_folio) == `1` && folio_test_anon(folio: old_folio)) {
5869	if (!PageAnonExclusive(page: &old_folio->page)) {
5870	folio_move_anon_rmap(old_folio, vma);
5871	SetPageAnonExclusive(&old_folio->page);
5872	}
5873	if (likely(!unshare))
5874	set_huge_ptep_writable(vma, address: haddr, ptep);
5875
5876	delayacct_wpcopy_end();
5877	return `0`;
5878	}
5879	VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
5880	PageAnonExclusive(&old_folio->page), &old_folio->page);
5881
5882	/*
5883	* If the process that created a MAP_PRIVATE mapping is about to
5884	* perform a COW due to a shared page count, attempt to satisfy
5885	* the allocation without using the existing reserves. The pagecache
5886	* page is used to determine if the reserve at this address was
5887	* consumed or not. If reserves were used, a partial faulted mapping
5888	* at the time of fork() could consume its reserves on COW instead
5889	* of the full address range.
5890	*/
5891	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5892	old_folio != pagecache_folio)
5893	outside_reserve = `1`;
5894
5895	folio_get(folio: old_folio);
5896
5897	/*
5898	* Drop page table lock as buddy allocator may be called. It will
5899	* be acquired again before returning to the caller, as expected.
5900	*/
5901	spin_unlock(lock: ptl);
5902	new_folio = alloc_hugetlb_folio(vma, addr: haddr, avoid_reserve: outside_reserve);
5903
5904	if (IS_ERR(ptr: new_folio)) {
5905	/*
5906	* If a process owning a MAP_PRIVATE mapping fails to COW,
5907	* it is due to references held by a child and an insufficient
5908	* huge page pool. To guarantee the original mappers
5909	* reliability, unmap the page from child processes. The child
5910	* may get SIGKILLed if it later faults.
5911	*/
5912	if (outside_reserve) {
5913	struct address_space *mapping = vma->vm_file->f_mapping;
5914	pgoff_t idx;
5915	u32 hash;
5916
5917	folio_put(folio: old_folio);
5918	/*
5919	* Drop hugetlb_fault_mutex and vma_lock before
5920	* unmapping. unmapping needs to hold vma_lock
5921	* in write mode. Dropping vma_lock in read mode
5922	* here is OK as COW mappings do not interact with
5923	* PMD sharing.
5924	*
5925	* Reacquire both after unmap operation.
5926	*/
5927	idx = vma_hugecache_offset(h, vma, address: haddr);
5928	hash = hugetlb_fault_mutex_hash(mapping, idx);
5929	hugetlb_vma_unlock_read(vma);
5930	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
5931
5932	unmap_ref_private(mm, vma, page: &old_folio->page, address: haddr);
5933
5934	mutex_lock(&hugetlb_fault_mutex_table[hash]);
5935	hugetlb_vma_lock_read(vma);
5936	spin_lock(lock: ptl);
5937	ptep = hugetlb_walk(vma, addr: haddr, sz: huge_page_size(h));
5938	if (likely(ptep &&
5939	pte_same(huge_ptep_get(ptep), pte)))
5940	goto retry_avoidcopy;
5941	/*
5942	* race occurs while re-acquiring page table
5943	* lock, and our job is done.
5944	*/
5945	delayacct_wpcopy_end();
5946	return `0`;
5947	}
5948
5949	ret = vmf_error(err: PTR_ERR(ptr: new_folio));
5950	goto out_release_old;
5951	}
5952
5953	/*
5954	* When the original hugepage is shared one, it does not have
5955	* anon_vma prepared.
5956	*/
5957	if (unlikely(anon_vma_prepare(vma))) {
5958	ret = VM_FAULT_OOM;
5959	goto out_release_all;
5960	}
5961
5962	if (copy_user_large_folio(dst: new_folio, src: old_folio, addr_hint: address, vma)) {
5963	ret = VM_FAULT_HWPOISON_LARGE;
5964	goto out_release_all;
5965	}
5966	__folio_mark_uptodate(folio: new_folio);
5967
5968	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm, start: haddr,
5969	end: haddr + huge_page_size(h));
5970	mmu_notifier_invalidate_range_start(range: &range);
5971
5972	/*
5973	* Retake the page table lock to check for racing updates
5974	* before the page tables are altered
5975	*/
5976	spin_lock(lock: ptl);
5977	ptep = hugetlb_walk(vma, addr: haddr, sz: huge_page_size(h));
5978	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5979	pte_t newpte = make_huge_pte(vma, page: &new_folio->page, writable: !unshare);
5980
5981	/ Break COW or unshare /
5982	huge_ptep_clear_flush(vma, addr: haddr, ptep);
5983	page_remove_rmap(&old_folio->page, vma, compound: true);
5984	hugepage_add_new_anon_rmap(new_folio, vma, address: haddr);
5985	if (huge_pte_uffd_wp(pte))
5986	newpte = huge_pte_mkuffd_wp(pte: newpte);
5987	set_huge_pte_at(mm, addr: haddr, ptep, pte: newpte, sz: huge_page_size(h));
5988	folio_set_hugetlb_migratable(folio: new_folio);
5989	/ Make the old page be freed below /
5990	new_folio = old_folio;
5991	}
5992	spin_unlock(lock: ptl);
5993	mmu_notifier_invalidate_range_end(range: &range);
5994	out_release_all:
5995	/*
5996	* No restore in case of successful pagetable update (Break COW or
5997	* unshare)
5998	*/
5999	if (new_folio != old_folio)
6000	restore_reserve_on_error(h, vma, address: haddr, folio: new_folio);
6001	folio_put(folio: new_folio);
6002	out_release_old:
6003	folio_put(folio: old_folio);
6004
6005	spin_lock(lock: ptl); / Caller expects lock to be held /
6006
6007	delayacct_wpcopy_end();
6008	return ret;
6009	}
6010
6011	/*
6012	* Return whether there is a pagecache page to back given address within VMA.
6013	*/
6014	static bool hugetlbfs_pagecache_present(struct hstate *h,
6015	struct vm_area_struct vma, unsigned* long address)
6016	{
6017	struct address_space *mapping = vma->vm_file->f_mapping;
6018	pgoff_t idx = linear_page_index(vma, address);
6019	struct folio *folio;
6020
6021	folio = filemap_get_folio(mapping, index: idx);
6022	if (IS_ERR(ptr: folio))
6023	return false;
6024	folio_put(folio);
6025	return true;
6026	}
6027
6028	int hugetlb_add_to_page_cache(struct folio folio, struct* address_space *mapping,
6029	pgoff_t idx)
6030	{
6031	struct inode *inode = mapping->host;
6032	struct hstate *h = hstate_inode(i: inode);
6033	int err;
6034
6035	idx <<= huge_page_order(h);
6036	__folio_set_locked(folio);
6037	err = __filemap_add_folio(mapping, folio, index: idx, GFP_KERNEL, NULL);
6038
6039	if (unlikely(err)) {
6040	__folio_clear_locked(folio);
6041	return err;
6042	}
6043	folio_clear_hugetlb_restore_reserve(folio);
6044
6045	/*
6046	* mark folio dirty so that it will not be removed from cache/file
6047	* by non-hugetlbfs specific code paths.
6048	*/
6049	folio_mark_dirty(folio);
6050
6051	spin_lock(lock: &inode->i_lock);
6052	inode->i_blocks += blocks_per_huge_page(h);
6053	spin_unlock(lock: &inode->i_lock);
6054	return `0`;
6055	}
6056
6057	static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
6058	struct address_space *mapping,
6059	pgoff_t idx,
6060	unsigned int flags,
6061	unsigned long haddr,
6062	unsigned long addr,
6063	unsigned long reason)
6064	{
6065	u32 hash;
6066	struct vm_fault vmf = {
6067	.vma = vma,
6068	.address = haddr,
6069	.real_address = addr,
6070	.flags = flags,
6071
6072	/*
6073	* Hard to debug if it ends up being
6074	* used by a callee that assumes
6075	* something about the other
6076	* uninitialized fields... same as in
6077	* memory.c
6078	*/
6079	};
6080
6081	/*
6082	* vma_lock and hugetlb_fault_mutex must be dropped before handling
6083	* userfault. Also mmap_lock could be dropped due to handling
6084	* userfault, any vma operation should be careful from here.
6085	*/
6086	hugetlb_vma_unlock_read(vma);
6087	hash = hugetlb_fault_mutex_hash(mapping, idx);
6088	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6089	return handle_userfault(vmf: &vmf, reason);
6090	}
6091
6092	/*
6093	* Recheck pte with pgtable lock. Returns true if pte didn't change, or
6094	* false if pte changed or is changing.
6095	*/
6096	static bool hugetlb_pte_stable(struct hstate h, struct* mm_struct *mm,
6097	pte_t *ptep, pte_t old_pte)
6098	{
6099	spinlock_t *ptl;
6100	bool same;
6101
6102	ptl = huge_pte_lock(h, mm, pte: ptep);
6103	same = pte_same(a: huge_ptep_get(ptep), b: old_pte);
6104	spin_unlock(lock: ptl);
6105
6106	return same;
6107	}
6108
6109	static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
6110	struct vm_area_struct *vma,
6111	struct address_space *mapping, pgoff_t idx,
6112	unsigned long address, pte_t *ptep,
6113	pte_t old_pte, unsigned int flags)
6114	{
6115	struct hstate *h = hstate_vma(vma);
6116	vm_fault_t ret = VM_FAULT_SIGBUS;
6117	int anon_rmap = `0`;
6118	unsigned long size;
6119	struct folio *folio;
6120	pte_t new_pte;
6121	spinlock_t *ptl;
6122	unsigned long haddr = address & huge_page_mask(h);
6123	bool new_folio, new_pagecache_folio = false;
6124	u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
6125
6126	/*
6127	* Currently, we are forced to kill the process in the event the
6128	* original mapper has unmapped pages from the child due to a failed
6129	* COW/unsharing. Warn that such a situation has occurred as it may not
6130	* be obvious.
6131	*/
6132	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
6133	pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
6134	current->pid);
6135	goto out;
6136	}
6137
6138	/*
6139	* Use page lock to guard against racing truncation
6140	* before we get page_table_lock.
6141	*/
6142	new_folio = false;
6143	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6144	if (IS_ERR(ptr: folio)) {
6145	size = i_size_read(inode: mapping->host) >> huge_page_shift(h);
6146	if (idx >= size)
6147	goto out;
6148	/ Check for page in userfault range /
6149	if (userfaultfd_missing(vma)) {
6150	/*
6151	* Since hugetlb_no_page() was examining pte
6152	* without pgtable lock, we need to re-test under
6153	* lock because the pte may not be stable and could
6154	* have changed from under us. Try to detect
6155	* either changed or during-changing ptes and retry
6156	* properly when needed.
6157	*
6158	* Note that userfaultfd is actually fine with
6159	* false positives (e.g. caused by pte changed),
6160	* but not wrong logical events (e.g. caused by
6161	* reading a pte during changing). The latter can
6162	* confuse the userspace, so the strictness is very
6163	* much preferred. E.g., MISSING event should
6164	* never happen on the page after UFFDIO_COPY has
6165	* correctly installed the page and returned.
6166	*/
6167	if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
6168	ret = `0`;
6169	goto out;
6170	}
6171
6172	return hugetlb_handle_userfault(vma, mapping, idx, flags,
6173	haddr, addr: address,
6174	VM_UFFD_MISSING);
6175	}
6176
6177	folio = alloc_hugetlb_folio(vma, addr: haddr, avoid_reserve: `0`);
6178	if (IS_ERR(ptr: folio)) {
6179	/*
6180	* Returning error will result in faulting task being
6181	* sent SIGBUS. The hugetlb fault mutex prevents two
6182	* tasks from racing to fault in the same page which
6183	* could result in false unable to allocate errors.
6184	* Page migration does not take the fault mutex, but
6185	* does a clear then write of pte's under page table
6186	* lock. Page fault code could race with migration,
6187	* notice the clear pte and try to allocate a page
6188	* here. Before returning error, get ptl and make
6189	* sure there really is no pte entry.
6190	*/
6191	if (hugetlb_pte_stable(h, mm, ptep, old_pte))
6192	ret = vmf_error(err: PTR_ERR(ptr: folio));
6193	else
6194	ret = `0`;
6195	goto out;
6196	}
6197	clear_huge_page(page: &folio->page, addr_hint: address, pages_per_huge_page: pages_per_huge_page(h));
6198	__folio_mark_uptodate(folio);
6199	new_folio = true;
6200
6201	if (vma->vm_flags & VM_MAYSHARE) {
6202	int err = hugetlb_add_to_page_cache(folio, mapping, idx);
6203	if (err) {
6204	/*
6205	* err can't be -EEXIST which implies someone
6206	* else consumed the reservation since hugetlb
6207	* fault mutex is held when add a hugetlb page
6208	* to the page cache. So it's safe to call
6209	* restore_reserve_on_error() here.
6210	*/
6211	restore_reserve_on_error(h, vma, address: haddr, folio);
6212	folio_put(folio);
6213	goto out;
6214	}
6215	new_pagecache_folio = true;
6216	} else {
6217	folio_lock(folio);
6218	if (unlikely(anon_vma_prepare(vma))) {
6219	ret = VM_FAULT_OOM;
6220	goto backout_unlocked;
6221	}
6222	anon_rmap = `1`;
6223	}
6224	} else {
6225	/*
6226	* If memory error occurs between mmap() and fault, some process
6227	* don't have hwpoisoned swap entry for errored virtual address.
6228	* So we need to block hugepage fault by PG_hwpoison bit check.
6229	*/
6230	if (unlikely(folio_test_hwpoison(folio))) {
6231	ret = VM_FAULT_HWPOISON_LARGE \|
6232	VM_FAULT_SET_HINDEX(hstate_index(h));
6233	goto backout_unlocked;
6234	}
6235
6236	/ Check for page in userfault range. /
6237	if (userfaultfd_minor(vma)) {
6238	folio_unlock(folio);
6239	folio_put(folio);
6240	/ See comment in userfaultfd_missing() block above /
6241	if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
6242	ret = `0`;
6243	goto out;
6244	}
6245	return hugetlb_handle_userfault(vma, mapping, idx, flags,
6246	haddr, addr: address,
6247	VM_UFFD_MINOR);
6248	}
6249	}
6250
6251	/*
6252	* If we are going to COW a private mapping later, we examine the
6253	* pending reservations for this page now. This will ensure that
6254	* any allocations necessary to record that reservation occur outside
6255	* the spinlock.
6256	*/
6257	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6258	if (vma_needs_reservation(h, vma, addr: haddr) < `0`) {
6259	ret = VM_FAULT_OOM;
6260	goto backout_unlocked;
6261	}
6262	/ Just decrements count, does not deallocate /
6263	vma_end_reservation(h, vma, addr: haddr);
6264	}
6265
6266	ptl = huge_pte_lock(h, mm, pte: ptep);
6267	ret = `0`;
6268	/ If pte changed from under us, retry /
6269	if (!pte_same(a: huge_ptep_get(ptep), b: old_pte))
6270	goto backout;
6271
6272	if (anon_rmap)
6273	hugepage_add_new_anon_rmap(folio, vma, address: haddr);
6274	else
6275	page_dup_file_rmap(page: &folio->page, compound: true);
6276	new_pte = make_huge_pte(vma, page: &folio->page, writable: ((vma->vm_flags & VM_WRITE)
6277	&& (vma->vm_flags & VM_SHARED)));
6278	/*
6279	* If this pte was previously wr-protected, keep it wr-protected even
6280	* if populated.
6281	*/
6282	if (unlikely(pte_marker_uffd_wp(old_pte)))
6283	new_pte = huge_pte_mkuffd_wp(pte: new_pte);
6284	set_huge_pte_at(mm, addr: haddr, ptep, pte: new_pte, sz: huge_page_size(h));
6285
6286	hugetlb_count_add(l: pages_per_huge_page(h), mm);
6287	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6288	/ Optimization, do the COW without a second fault /
6289	ret = hugetlb_wp(mm, vma, address, ptep, flags, pagecache_folio: folio, ptl);
6290	}
6291
6292	spin_unlock(lock: ptl);
6293
6294	/*
6295	* Only set hugetlb_migratable in newly allocated pages. Existing pages
6296	* found in the pagecache may not have hugetlb_migratable if they have
6297	* been isolated for migration.
6298	*/
6299	if (new_folio)
6300	folio_set_hugetlb_migratable(folio);
6301
6302	folio_unlock(folio);
6303	out:
6304	hugetlb_vma_unlock_read(vma);
6305	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6306	return ret;
6307
6308	backout:
6309	spin_unlock(lock: ptl);
6310	backout_unlocked:
6311	if (new_folio && !new_pagecache_folio)
6312	restore_reserve_on_error(h, vma, address: haddr, folio);
6313
6314	folio_unlock(folio);
6315	folio_put(folio);
6316	goto out;
6317	}
6318
6319	#ifdef CONFIG_SMP
6320	u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6321	{
6322	unsigned long key[`2`];
6323	u32 hash;
6324
6325	key[`0`] = (unsigned long) mapping;
6326	key[`1`] = idx;
6327
6328	hash = jhash2(k: (u32 )&key, length: sizeof(key)/(sizeof*(u32)), initval: `0`);
6329
6330	return hash & (num_fault_mutexes - `1`);
6331	}
6332	#else
6333	/*
6334	* For uniprocessor systems we always use a single mutex, so just
6335	* return 0 and avoid the hashing overhead.
6336	*/
6337	u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
6338	{
6339	return `0`;
6340	}
6341	#endif
6342
6343	vm_fault_t hugetlb_fault(struct mm_struct mm, struct* vm_area_struct *vma,
6344	unsigned long address, unsigned int flags)
6345	{
6346	pte_t *ptep, entry;
6347	spinlock_t *ptl;
6348	vm_fault_t ret;
6349	u32 hash;
6350	pgoff_t idx;
6351	struct folio *folio = NULL;
6352	struct folio *pagecache_folio = NULL;
6353	struct hstate *h = hstate_vma(vma);
6354	struct address_space *mapping;
6355	int need_wait_lock = `0`;
6356	unsigned long haddr = address & huge_page_mask(h);
6357
6358	/ TODO: Handle faults under the VMA lock /
6359	if (flags & FAULT_FLAG_VMA_LOCK) {
6360	vma_end_read(vma);
6361	return VM_FAULT_RETRY;
6362	}
6363
6364	/*
6365	* Serialize hugepage allocation and instantiation, so that we don't
6366	* get spurious allocation failures if two CPUs race to instantiate
6367	* the same page in the page cache.
6368	*/
6369	mapping = vma->vm_file->f_mapping;
6370	idx = vma_hugecache_offset(h, vma, address: haddr);
6371	hash = hugetlb_fault_mutex_hash(mapping, idx);
6372	mutex_lock(&hugetlb_fault_mutex_table[hash]);
6373
6374	/*
6375	* Acquire vma lock before calling huge_pte_alloc and hold
6376	* until finished with ptep. This prevents huge_pmd_unshare from
6377	* being called elsewhere and making the ptep no longer valid.
6378	*/
6379	hugetlb_vma_lock_read(vma);
6380	ptep = huge_pte_alloc(mm, vma, addr: haddr, sz: huge_page_size(h));
6381	if (!ptep) {
6382	hugetlb_vma_unlock_read(vma);
6383	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6384	return VM_FAULT_OOM;
6385	}
6386
6387	entry = huge_ptep_get(ptep);
6388	if (huge_pte_none_mostly(pte: entry)) {
6389	if (is_pte_marker(pte: entry)) {
6390	pte_marker marker =
6391	pte_marker_get(entry: pte_to_swp_entry(pte: entry));
6392
6393	if (marker & PTE_MARKER_POISONED) {
6394	ret = VM_FAULT_HWPOISON_LARGE;
6395	goto out_mutex;
6396	}
6397	}
6398
6399	/*
6400	* Other PTE markers should be handled the same way as none PTE.
6401	*
6402	* hugetlb_no_page will drop vma lock and hugetlb fault
6403	* mutex internally, which make us return immediately.
6404	*/
6405	return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
6406	old_pte: entry, flags);
6407	}
6408
6409	ret = `0`;
6410
6411	/*
6412	* entry could be a migration/hwpoison entry at this point, so this
6413	* check prevents the kernel from going below assuming that we have
6414	* an active hugepage in pagecache. This goto expects the 2nd page
6415	* fault, and is_hugetlb_entry_(migration\|hwpoisoned) check will
6416	* properly handle it.
6417	*/
6418	if (!pte_present(a: entry)) {
6419	if (unlikely(is_hugetlb_entry_migration(entry))) {
6420	/*
6421	* Release the hugetlb fault lock now, but retain
6422	* the vma lock, because it is needed to guard the
6423	* huge_pte_lockptr() later in
6424	* migration_entry_wait_huge(). The vma lock will
6425	* be released there.
6426	*/
6427	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6428	migration_entry_wait_huge(vma, pte: ptep);
6429	return `0`;
6430	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
6431	ret = VM_FAULT_HWPOISON_LARGE \|
6432	VM_FAULT_SET_HINDEX(hstate_index(h));
6433	goto out_mutex;
6434	}
6435
6436	/*
6437	* If we are going to COW/unshare the mapping later, we examine the
6438	* pending reservations for this page now. This will ensure that any
6439	* allocations necessary to record that reservation occur outside the
6440	* spinlock. Also lookup the pagecache page now as it is used to
6441	* determine if a reservation has been consumed.
6442	*/
6443	if ((flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) &&
6444	!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(pte: entry)) {
6445	if (vma_needs_reservation(h, vma, addr: haddr) < `0`) {
6446	ret = VM_FAULT_OOM;
6447	goto out_mutex;
6448	}
6449	/ Just decrements count, does not deallocate /
6450	vma_end_reservation(h, vma, addr: haddr);
6451
6452	pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6453	if (IS_ERR(ptr: pagecache_folio))
6454	pagecache_folio = NULL;
6455	}
6456
6457	ptl = huge_pte_lock(h, mm, pte: ptep);
6458
6459	/ Check for a racing update before calling hugetlb_wp() /
6460	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
6461	goto out_ptl;
6462
6463	/ Handle userfault-wp first, before trying to lock more pages /
6464	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(pte: huge_ptep_get(ptep)) &&
6465	(flags & FAULT_FLAG_WRITE) && !huge_pte_write(pte: entry)) {
6466	if (!userfaultfd_wp_async(vma)) {
6467	struct vm_fault vmf = {
6468	.vma = vma,
6469	.address = haddr,
6470	.real_address = address,
6471	.flags = flags,
6472	};
6473
6474	spin_unlock(lock: ptl);
6475	if (pagecache_folio) {
6476	folio_unlock(folio: pagecache_folio);
6477	folio_put(folio: pagecache_folio);
6478	}
6479	hugetlb_vma_unlock_read(vma);
6480	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6481	return handle_userfault(vmf: &vmf, VM_UFFD_WP);
6482	}
6483
6484	entry = huge_pte_clear_uffd_wp(pte: entry);
6485	set_huge_pte_at(mm, addr: haddr, ptep, pte: entry,
6486	sz: huge_page_size(h: hstate_vma(vma)));
6487	/ Fallthrough to CoW /
6488	}
6489
6490	/*
6491	* hugetlb_wp() requires page locks of pte_page(entry) and
6492	* pagecache_folio, so here we need take the former one
6493	* when folio != pagecache_folio or !pagecache_folio.
6494	*/
6495	folio = page_folio(pte_page(entry));
6496	if (folio != pagecache_folio)
6497	if (!folio_trylock(folio)) {
6498	need_wait_lock = `1`;
6499	goto out_ptl;
6500	}
6501
6502	folio_get(folio);
6503
6504	if (flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) {
6505	if (!huge_pte_write(pte: entry)) {
6506	ret = hugetlb_wp(mm, vma, address, ptep, flags,
6507	pagecache_folio, ptl);
6508	goto out_put_page;
6509	} else if (likely(flags & FAULT_FLAG_WRITE)) {
6510	entry = huge_pte_mkdirty(pte: entry);
6511	}
6512	}
6513	entry = pte_mkyoung(pte: entry);
6514	if (huge_ptep_set_access_flags(vma, addr: haddr, ptep, pte: entry,
6515	dirty: flags & FAULT_FLAG_WRITE))
6516	update_mmu_cache(vma, addr: haddr, ptep);
6517	out_put_page:
6518	if (folio != pagecache_folio)
6519	folio_unlock(folio);
6520	folio_put(folio);
6521	out_ptl:
6522	spin_unlock(lock: ptl);
6523
6524	if (pagecache_folio) {
6525	folio_unlock(folio: pagecache_folio);
6526	folio_put(folio: pagecache_folio);
6527	}
6528	out_mutex:
6529	hugetlb_vma_unlock_read(vma);
6530	mutex_unlock(lock: &hugetlb_fault_mutex_table[hash]);
6531	/*
6532	* Generally it's safe to hold refcount during waiting page lock. But
6533	* here we just wait to defer the next page fault to avoid busy loop and
6534	* the page is not used after unlocked before returning from the current
6535	* page fault. So we are safe from accessing freed page, even if we wait
6536	* here without taking refcount.
6537	*/
6538	if (need_wait_lock)
6539	folio_wait_locked(folio);
6540	return ret;
6541	}
6542
6543	#ifdef CONFIG_USERFAULTFD
6544	/*
6545	* Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
6546	*/
6547	static struct folio alloc_hugetlb_folio_vma(struct* hstate *h,
6548	struct vm_area_struct vma, unsigned* long address)
6549	{
6550	struct mempolicy *mpol;
6551	nodemask_t *nodemask;
6552	struct folio *folio;
6553	gfp_t gfp_mask;
6554	int node;
6555
6556	gfp_mask = htlb_alloc_mask(h);
6557	node = huge_node(vma, addr: address, gfp_flags: gfp_mask, mpol: &mpol, nodemask: &nodemask);
6558	folio = alloc_hugetlb_folio_nodemask(h, preferred_nid: node, nmask: nodemask, gfp_mask);
6559	mpol_cond_put(pol: mpol);
6560
6561	return folio;
6562	}
6563
6564	/*
6565	* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
6566	* with modifications for hugetlb pages.
6567	*/
6568	int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
6569	struct vm_area_struct *dst_vma,
6570	unsigned long dst_addr,
6571	unsigned long src_addr,
6572	uffd_flags_t flags,
6573	struct folio **foliop)
6574	{
6575	struct mm_struct *dst_mm = dst_vma->vm_mm;
6576	bool is_continue = uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_CONTINUE);
6577	bool wp_enabled = (flags & MFILL_ATOMIC_WP);
6578	struct hstate *h = hstate_vma(vma: dst_vma);
6579	struct address_space *mapping = dst_vma->vm_file->f_mapping;
6580	pgoff_t idx = vma_hugecache_offset(h, vma: dst_vma, address: dst_addr);
6581	unsigned long size;
6582	int vm_shared = dst_vma->vm_flags & VM_SHARED;
6583	pte_t _dst_pte;
6584	spinlock_t *ptl;
6585	int ret = -ENOMEM;
6586	struct folio *folio;
6587	int writable;
6588	bool folio_in_pagecache = false;
6589
6590	if (uffd_flags_mode_is(flags, expected: MFILL_ATOMIC_POISON)) {
6591	ptl = huge_pte_lock(h, mm: dst_mm, pte: dst_pte);
6592
6593	/ Don't overwrite any existing PTEs (even markers) /
6594	if (!huge_pte_none(pte: huge_ptep_get(ptep: dst_pte))) {
6595	spin_unlock(lock: ptl);
6596	return -EEXIST;
6597	}
6598
6599	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
6600	set_huge_pte_at(mm: dst_mm, addr: dst_addr, ptep: dst_pte, pte: _dst_pte,
6601	sz: huge_page_size(h));
6602
6603	/ No need to invalidate - it was non-present before /
6604	update_mmu_cache(vma: dst_vma, addr: dst_addr, ptep: dst_pte);
6605
6606	spin_unlock(lock: ptl);
6607	return `0`;
6608	}
6609
6610	if (is_continue) {
6611	ret = -EFAULT;
6612	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
6613	if (IS_ERR(ptr: folio))
6614	goto out;
6615	folio_in_pagecache = true;
6616	} else if (!*foliop) {
6617	/ If a folio already exists, then it's UFFDIO_COPY for*
6618	* a non-missing case. Return -EEXIST.
6619	*/
6620	if (vm_shared &&
6621	hugetlbfs_pagecache_present(h, vma: dst_vma, address: dst_addr)) {
6622	ret = -EEXIST;
6623	goto out;
6624	}
6625
6626	folio = alloc_hugetlb_folio(vma: dst_vma, addr: dst_addr, avoid_reserve: `0`);
6627	if (IS_ERR(ptr: folio)) {
6628	ret = -ENOMEM;
6629	goto out;
6630	}
6631
6632	ret = copy_folio_from_user(dst_folio: folio, usr_src: (const void __user *) src_addr,
6633	allow_pagefault: false);
6634
6635	/ fallback to copy_from_user outside mmap_lock /
6636	if (unlikely(ret)) {
6637	ret = -ENOENT;
6638	/ Free the allocated folio which may have*
6639	* consumed a reservation.
6640	*/
6641	restore_reserve_on_error(h, vma: dst_vma, address: dst_addr, folio);
6642	folio_put(folio);
6643
6644	/ Allocate a temporary folio to hold the copied*
6645	* contents.
6646	*/
6647	folio = alloc_hugetlb_folio_vma(h, vma: dst_vma, address: dst_addr);
6648	if (!folio) {
6649	ret = -ENOMEM;
6650	goto out;
6651	}
6652	*foliop = folio;
6653	/ Set the outparam foliop and return to the caller to*
6654	* copy the contents outside the lock. Don't free the
6655	* folio.
6656	*/
6657	goto out;
6658	}
6659	} else {
6660	if (vm_shared &&
6661	hugetlbfs_pagecache_present(h, vma: dst_vma, address: dst_addr)) {
6662	folio_put(folio: *foliop);
6663	ret = -EEXIST;
6664	*foliop = NULL;
6665	goto out;
6666	}
6667
6668	folio = alloc_hugetlb_folio(vma: dst_vma, addr: dst_addr, avoid_reserve: `0`);
6669	if (IS_ERR(ptr: folio)) {
6670	folio_put(folio: *foliop);
6671	ret = -ENOMEM;
6672	*foliop = NULL;
6673	goto out;
6674	}
6675	ret = copy_user_large_folio(dst: folio, src: *foliop, addr_hint: dst_addr, vma: dst_vma);
6676	folio_put(folio: *foliop);
6677	*foliop = NULL;
6678	if (ret) {
6679	folio_put(folio);
6680	goto out;
6681	}
6682	}
6683
6684	/*
6685	* The memory barrier inside __folio_mark_uptodate makes sure that
6686	* preceding stores to the page contents become visible before
6687	* the set_pte_at() write.
6688	*/
6689	__folio_mark_uptodate(folio);
6690
6691	/ Add shared, newly allocated pages to the page cache. /
6692	if (vm_shared && !is_continue) {
6693	size = i_size_read(inode: mapping->host) >> huge_page_shift(h);
6694	ret = -EFAULT;
6695	if (idx >= size)
6696	goto out_release_nounlock;
6697
6698	/*
6699	* Serialization between remove_inode_hugepages() and
6700	* hugetlb_add_to_page_cache() below happens through the
6701	* hugetlb_fault_mutex_table that here must be hold by
6702	* the caller.
6703	*/
6704	ret = hugetlb_add_to_page_cache(folio, mapping, idx);
6705	if (ret)
6706	goto out_release_nounlock;
6707	folio_in_pagecache = true;
6708	}
6709
6710	ptl = huge_pte_lock(h, mm: dst_mm, pte: dst_pte);
6711
6712	ret = -EIO;
6713	if (folio_test_hwpoison(folio))
6714	goto out_release_unlock;
6715
6716	/*
6717	* We allow to overwrite a pte marker: consider when both MISSING\|WP
6718	* registered, we firstly wr-protect a none pte which has no page cache
6719	* page backing it, then access the page.
6720	*/
6721	ret = -EEXIST;
6722	if (!huge_pte_none_mostly(pte: huge_ptep_get(ptep: dst_pte)))
6723	goto out_release_unlock;
6724
6725	if (folio_in_pagecache)
6726	page_dup_file_rmap(page: &folio->page, compound: true);
6727	else
6728	hugepage_add_new_anon_rmap(folio, dst_vma, address: dst_addr);
6729
6730	/*
6731	* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
6732	* with wp flag set, don't set pte write bit.
6733	*/
6734	if (wp_enabled \|\| (is_continue && !vm_shared))
6735	writable = `0`;
6736	else
6737	writable = dst_vma->vm_flags & VM_WRITE;
6738
6739	_dst_pte = make_huge_pte(vma: dst_vma, page: &folio->page, writable);
6740	/*
6741	* Always mark UFFDIO_COPY page dirty; note that this may not be
6742	* extremely important for hugetlbfs for now since swapping is not
6743	* supported, but we should still be clear in that this page cannot be
6744	* thrown away at will, even if write bit not set.
6745	*/
6746	_dst_pte = huge_pte_mkdirty(pte: _dst_pte);
6747	_dst_pte = pte_mkyoung(pte: _dst_pte);
6748
6749	if (wp_enabled)
6750	_dst_pte = huge_pte_mkuffd_wp(pte: _dst_pte);
6751
6752	set_huge_pte_at(mm: dst_mm, addr: dst_addr, ptep: dst_pte, pte: _dst_pte, sz: huge_page_size(h));
6753
6754	hugetlb_count_add(l: pages_per_huge_page(h), mm: dst_mm);
6755
6756	/ No need to invalidate - it was non-present before /
6757	update_mmu_cache(vma: dst_vma, addr: dst_addr, ptep: dst_pte);
6758
6759	spin_unlock(lock: ptl);
6760	if (!is_continue)
6761	folio_set_hugetlb_migratable(folio);
6762	if (vm_shared \|\| is_continue)
6763	folio_unlock(folio);
6764	ret = `0`;
6765	out:
6766	return ret;
6767	out_release_unlock:
6768	spin_unlock(lock: ptl);
6769	if (vm_shared \|\| is_continue)
6770	folio_unlock(folio);
6771	out_release_nounlock:
6772	if (!folio_in_pagecache)
6773	restore_reserve_on_error(h, vma: dst_vma, address: dst_addr, folio);
6774	folio_put(folio);
6775	goto out;
6776	}
6777	#endif /* CONFIG_USERFAULTFD */
6778
6779	struct page hugetlb_follow_page_mask(struct* vm_area_struct *vma,
6780	unsigned long address, unsigned int flags,
6781	unsigned int *page_mask)
6782	{
6783	struct hstate *h = hstate_vma(vma);
6784	struct mm_struct *mm = vma->vm_mm;
6785	unsigned long haddr = address & huge_page_mask(h);
6786	struct page *page = NULL;
6787	spinlock_t *ptl;
6788	pte_t *pte, entry;
6789	int ret;
6790
6791	hugetlb_vma_lock_read(vma);
6792	pte = hugetlb_walk(vma, addr: haddr, sz: huge_page_size(h));
6793	if (!pte)
6794	goto out_unlock;
6795
6796	ptl = huge_pte_lock(h, mm, pte);
6797	entry = huge_ptep_get(ptep: pte);
6798	if (pte_present(a: entry)) {
6799	page = pte_page(entry);
6800
6801	if (!huge_pte_write(pte: entry)) {
6802	if (flags & FOLL_WRITE) {
6803	page = NULL;
6804	goto out;
6805	}
6806
6807	if (gup_must_unshare(vma, flags, page)) {
6808	/ Tell the caller to do unsharing /
6809	page = ERR_PTR(error: -EMLINK);
6810	goto out;
6811	}
6812	}
6813
6814	page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
6815
6816	/*
6817	* Note that page may be a sub-page, and with vmemmap
6818	* optimizations the page struct may be read only.
6819	* try_grab_page() will increase the ref count on the
6820	* head page, so this will be OK.
6821	*
6822	* try_grab_page() should always be able to get the page here,
6823	* because we hold the ptl lock and have verified pte_present().
6824	*/
6825	ret = try_grab_page(page, flags);
6826
6827	if (WARN_ON_ONCE(ret)) {
6828	page = ERR_PTR(error: ret);
6829	goto out;
6830	}
6831
6832	*page_mask = (`1U` << huge_page_order(h)) - `1`;
6833	}
6834	out:
6835	spin_unlock(lock: ptl);
6836	out_unlock:
6837	hugetlb_vma_unlock_read(vma);
6838
6839	/*
6840	* Fixup retval for dump requests: if pagecache doesn't exist,
6841	* don't try to allocate a new page but just skip it.
6842	*/
6843	if (!page && (flags & FOLL_DUMP) &&
6844	!hugetlbfs_pagecache_present(h, vma, address))
6845	page = ERR_PTR(error: -EFAULT);
6846
6847	return page;
6848	}
6849
6850	long hugetlb_change_protection(struct vm_area_struct *vma,
6851	unsigned long address, unsigned long end,
6852	pgprot_t newprot, unsigned long cp_flags)
6853	{
6854	struct mm_struct *mm = vma->vm_mm;
6855	unsigned long start = address;
6856	pte_t *ptep;
6857	pte_t pte;
6858	struct hstate *h = hstate_vma(vma);
6859	long pages = `0`, psize = huge_page_size(h);
6860	bool shared_pmd = false;
6861	struct mmu_notifier_range range;
6862	unsigned long last_addr_mask;
6863	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6864	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6865
6866	/*
6867	* In the case of shared PMDs, the area to flush could be beyond
6868	* start/end. Set range.start/range.end to cover the maximum possible
6869	* range if PMD sharing is possible.
6870	*/
6871	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_PROTECTION_VMA,
6872	flags: `0`, mm, start, end);
6873	adjust_range_if_pmd_sharing_possible(vma, start: &range.start, end: &range.end);
6874
6875	BUG_ON(address >= end);
6876	flush_cache_range(vma, start: range.start, end: range.end);
6877
6878	mmu_notifier_invalidate_range_start(range: &range);
6879	hugetlb_vma_lock_write(vma);
6880	i_mmap_lock_write(mapping: vma->vm_file->f_mapping);
6881	last_addr_mask = hugetlb_mask_last_page(h);
6882	for (; address < end; address += psize) {
6883	spinlock_t *ptl;
6884	ptep = hugetlb_walk(vma, addr: address, sz: psize);
6885	if (!ptep) {
6886	if (!uffd_wp) {
6887	address \|= last_addr_mask;
6888	continue;
6889	}
6890	/*
6891	* Userfaultfd wr-protect requires pgtable
6892	* pre-allocations to install pte markers.
6893	*/
6894	ptep = huge_pte_alloc(mm, vma, addr: address, sz: psize);
6895	if (!ptep) {
6896	pages = -ENOMEM;
6897	break;
6898	}
6899	}
6900	ptl = huge_pte_lock(h, mm, pte: ptep);
6901	if (huge_pmd_unshare(mm, vma, addr: address, ptep)) {
6902	/*
6903	* When uffd-wp is enabled on the vma, unshare
6904	* shouldn't happen at all. Warn about it if it
6905	* happened due to some reason.
6906	*/
6907	WARN_ON_ONCE(uffd_wp \|\| uffd_wp_resolve);
6908	pages++;
6909	spin_unlock(lock: ptl);
6910	shared_pmd = true;
6911	address \|= last_addr_mask;
6912	continue;
6913	}
6914	pte = huge_ptep_get(ptep);
6915	if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6916	/ Nothing to do. /
6917	} else if (unlikely(is_hugetlb_entry_migration(pte))) {
6918	swp_entry_t entry = pte_to_swp_entry(pte);
6919	struct page *page = pfn_swap_entry_to_page(entry);
6920	pte_t newpte = pte;
6921
6922	if (is_writable_migration_entry(entry)) {
6923	if (PageAnon(page))
6924	entry = make_readable_exclusive_migration_entry(
6925	offset: swp_offset(entry));
6926	else
6927	entry = make_readable_migration_entry(
6928	offset: swp_offset(entry));
6929	newpte = swp_entry_to_pte(entry);
6930	pages++;
6931	}
6932
6933	if (uffd_wp)
6934	newpte = pte_swp_mkuffd_wp(pte: newpte);
6935	else if (uffd_wp_resolve)
6936	newpte = pte_swp_clear_uffd_wp(pte: newpte);
6937	if (!pte_same(a: pte, b: newpte))
6938	set_huge_pte_at(mm, addr: address, ptep, pte: newpte, sz: psize);
6939	} else if (unlikely(is_pte_marker(pte))) {
6940	/ No other markers apply for now. /
6941	WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
6942	if (uffd_wp_resolve)
6943	/ Safe to modify directly (non-present->none). /
6944	huge_pte_clear(mm, addr: address, ptep, sz: psize);
6945	} else if (!huge_pte_none(pte)) {
6946	pte_t old_pte;
6947	unsigned int shift = huge_page_shift(h: hstate_vma(vma));
6948
6949	old_pte = huge_ptep_modify_prot_start(vma, addr: address, ptep);
6950	pte = huge_pte_modify(pte: old_pte, newprot);
6951	pte = arch_make_huge_pte(entry: pte, shift, flags: vma->vm_flags);
6952	if (uffd_wp)
6953	pte = huge_pte_mkuffd_wp(pte);
6954	else if (uffd_wp_resolve)
6955	pte = huge_pte_clear_uffd_wp(pte);
6956	huge_ptep_modify_prot_commit(vma, addr: address, ptep, old_pte, pte);
6957	pages++;
6958	} else {
6959	/ None pte /
6960	if (unlikely(uffd_wp))
6961	/ Safe to modify directly (none->non-present). /
6962	set_huge_pte_at(mm, addr: address, ptep,
6963	pte: make_pte_marker(PTE_MARKER_UFFD_WP),
6964	sz: psize);
6965	}
6966	spin_unlock(lock: ptl);
6967	}
6968	/*
6969	* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
6970	* may have cleared our pud entry and done put_page on the page table:
6971	* once we release i_mmap_rwsem, another task can do the final put_page
6972	* and that page table be reused and filled with junk. If we actually
6973	* did unshare a page of pmds, flush the range corresponding to the pud.
6974	*/
6975	if (shared_pmd)
6976	flush_hugetlb_tlb_range(vma, range.start, range.end);
6977	else
6978	flush_hugetlb_tlb_range(vma, start, end);
6979	/*
6980	* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
6981	* downgrading page table protection not changing it to point to a new
6982	* page.
6983	*
6984	* See Documentation/mm/mmu_notifier.rst
6985	*/
6986	i_mmap_unlock_write(mapping: vma->vm_file->f_mapping);
6987	hugetlb_vma_unlock_write(vma);
6988	mmu_notifier_invalidate_range_end(range: &range);
6989
6990	return pages > `0` ? (pages << h->order) : pages;
6991	}
6992
6993	/ Return true if reservation was successful, false otherwise. /
6994	bool hugetlb_reserve_pages(struct inode *inode,
6995	long from, long to,
6996	struct vm_area_struct *vma,
6997	vm_flags_t vm_flags)
6998	{
6999	long chg = -`1`, add = -`1`;
7000	struct hstate *h = hstate_inode(i: inode);
7001	struct hugepage_subpool *spool = subpool_inode(inode);
7002	struct resv_map *resv_map;
7003	struct hugetlb_cgroup *h_cg = NULL;
7004	long gbl_reserve, regions_needed = `0`;
7005
7006	/ This should never happen /
7007	if (from > to) {
7008	VM_WARN(`1`, "%s called with a negative range\n", __func__);
7009	return false;
7010	}
7011
7012	/*
7013	* vma specific semaphore used for pmd sharing and fault/truncation
7014	* synchronization
7015	*/
7016	hugetlb_vma_lock_alloc(vma);
7017
7018	/*
7019	* Only apply hugepage reservation if asked. At fault time, an
7020	* attempt will be made for VM_NORESERVE to allocate a page
7021	* without using reserves
7022	*/
7023	if (vm_flags & VM_NORESERVE)
7024	return true;
7025
7026	/*
7027	* Shared mappings base their reservation on the number of pages that
7028	* are already allocated on behalf of the file. Private mappings need
7029	* to reserve the full area even if read-only as mprotect() may be
7030	* called to make the mapping read-write. Assume !vma is a shm mapping
7031	*/
7032	if (!vma \|\| vma->vm_flags & VM_MAYSHARE) {
7033	/*
7034	* resv_map can not be NULL as hugetlb_reserve_pages is only
7035	* called for inodes for which resv_maps were created (see
7036	* hugetlbfs_get_inode).
7037	*/
7038	resv_map = inode_resv_map(inode);
7039
7040	chg = region_chg(resv: resv_map, f: from, t: to, out_regions_needed: &regions_needed);
7041	} else {
7042	/ Private mapping. /
7043	resv_map = resv_map_alloc();
7044	if (!resv_map)
7045	goto out_err;
7046
7047	chg = to - from;
7048
7049	set_vma_resv_map(vma, map: resv_map);
7050	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
7051	}
7052
7053	if (chg < `0`)
7054	goto out_err;
7055
7056	if (hugetlb_cgroup_charge_cgroup_rsvd(idx: hstate_index(h),
7057	nr_pages: chg * pages_per_huge_page(h), ptr: &h_cg) < `0`)
7058	goto out_err;
7059
7060	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
7061	/ For private mappings, the hugetlb_cgroup uncharge info hangs*
7062	* of the resv_map.
7063	*/
7064	resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
7065	}
7066
7067	/*
7068	* There must be enough pages in the subpool for the mapping. If
7069	* the subpool has a minimum size, there may be some global
7070	* reservations already in place (gbl_reserve).
7071	*/
7072	gbl_reserve = hugepage_subpool_get_pages(spool, delta: chg);
7073	if (gbl_reserve < `0`)
7074	goto out_uncharge_cgroup;
7075
7076	/*
7077	* Check enough hugepages are available for the reservation.
7078	* Hand the pages back to the subpool if there are not
7079	*/
7080	if (hugetlb_acct_memory(h, delta: gbl_reserve) < `0`)
7081	goto out_put_pages;
7082
7083	/*
7084	* Account for the reservations made. Shared mappings record regions
7085	* that have reservations as they are shared by multiple VMAs.
7086	* When the last VMA disappears, the region map says how much
7087	* the reservation was and the page cache tells how much of
7088	* the reservation was consumed. Private mappings are per-VMA and
7089	* only the consumed reservations are tracked. When the VMA
7090	* disappears, the original reservation is the VMA size and the
7091	* consumed reservations are stored in the map. Hence, nothing
7092	* else has to be done for private mappings here
7093	*/
7094	if (!vma \|\| vma->vm_flags & VM_MAYSHARE) {
7095	add = region_add(resv: resv_map, f: from, t: to, in_regions_needed: regions_needed, h, h_cg);
7096
7097	if (unlikely(add < `0`)) {
7098	hugetlb_acct_memory(h, delta: -gbl_reserve);
7099	goto out_put_pages;
7100	} else if (unlikely(chg > add)) {
7101	/*
7102	* pages in this range were added to the reserve
7103	* map between region_chg and region_add. This
7104	* indicates a race with alloc_hugetlb_folio. Adjust
7105	* the subpool and reserve counts modified above
7106	* based on the difference.
7107	*/
7108	long rsv_adjust;
7109
7110	/*
7111	* hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
7112	* reference to h_cg->css. See comment below for detail.
7113	*/
7114	hugetlb_cgroup_uncharge_cgroup_rsvd(
7115	idx: hstate_index(h),
7116	nr_pages: (chg - add) * pages_per_huge_page(h), h_cg);
7117
7118	rsv_adjust = hugepage_subpool_put_pages(spool,
7119	delta: chg - add);
7120	hugetlb_acct_memory(h, delta: -rsv_adjust);
7121	} else if (h_cg) {
7122	/*
7123	* The file_regions will hold their own reference to
7124	* h_cg->css. So we should release the reference held
7125	* via hugetlb_cgroup_charge_cgroup_rsvd() when we are
7126	* done.
7127	*/
7128	hugetlb_cgroup_put_rsvd_cgroup(h_cg);
7129	}
7130	}
7131	return true;
7132
7133	out_put_pages:
7134	/ put back original number of pages, chg /
7135	(void)hugepage_subpool_put_pages(spool, delta: chg);
7136	out_uncharge_cgroup:
7137	hugetlb_cgroup_uncharge_cgroup_rsvd(idx: hstate_index(h),
7138	nr_pages: chg * pages_per_huge_page(h), h_cg);
7139	out_err:
7140	hugetlb_vma_lock_free(vma);
7141	if (!vma \|\| vma->vm_flags & VM_MAYSHARE)
7142	/ Only call region_abort if the region_chg succeeded but the*
7143	* region_add failed or didn't run.
7144	*/
7145	if (chg >= `0` && add < `0`)
7146	region_abort(resv: resv_map, f: from, t: to, regions_needed);
7147	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
7148	kref_put(kref: &resv_map->refs, release: resv_map_release);
7149	set_vma_resv_map(vma, NULL);
7150	}
7151	return false;
7152	}
7153
7154	long hugetlb_unreserve_pages(struct inode inode, long* start, long end,
7155	long freed)
7156	{
7157	struct hstate *h = hstate_inode(i: inode);
7158	struct resv_map *resv_map = inode_resv_map(inode);
7159	long chg = `0`;
7160	struct hugepage_subpool *spool = subpool_inode(inode);
7161	long gbl_reserve;
7162
7163	/*
7164	* Since this routine can be called in the evict inode path for all
7165	* hugetlbfs inodes, resv_map could be NULL.
7166	*/
7167	if (resv_map) {
7168	chg = region_del(resv: resv_map, f: start, t: end);
7169	/*
7170	* region_del() can fail in the rare case where a region
7171	* must be split and another region descriptor can not be
7172	* allocated. If end == LONG_MAX, it will not fail.
7173	*/
7174	if (chg < `0`)
7175	return chg;
7176	}
7177
7178	spin_lock(lock: &inode->i_lock);
7179	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
7180	spin_unlock(lock: &inode->i_lock);
7181
7182	/*
7183	* If the subpool has a minimum size, the number of global
7184	* reservations to be released may be adjusted.
7185	*
7186	* Note that !resv_map implies freed == 0. So (chg - freed)
7187	* won't go negative.
7188	*/
7189	gbl_reserve = hugepage_subpool_put_pages(spool, delta: (chg - freed));
7190	hugetlb_acct_memory(h, delta: -gbl_reserve);
7191
7192	return `0`;
7193	}
7194
7195	#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
7196	static unsigned long page_table_shareable(struct vm_area_struct *svma,
7197	struct vm_area_struct *vma,
7198	unsigned long addr, pgoff_t idx)
7199	{
7200	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
7201	svma->vm_start;
7202	unsigned long sbase = saddr & PUD_MASK;
7203	unsigned long s_end = sbase + PUD_SIZE;
7204
7205	/ Allow segments to share if only one is marked locked /
7206	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
7207	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
7208
7209	/*
7210	* match the virtual addresses, permission and the alignment of the
7211	* page table page.
7212	*
7213	* Also, vma_lock (vm_private_data) is required for sharing.
7214	*/
7215	if (pmd_index(address: addr) != pmd_index(address: saddr) \|\|
7216	vm_flags != svm_flags \|\|
7217	!range_in_vma(vma: svma, start: sbase, end: s_end) \|\|
7218	!svma->vm_private_data)
7219	return `0`;
7220
7221	return saddr;
7222	}
7223
7224	bool want_pmd_share(struct vm_area_struct vma, unsigned* long addr)
7225	{
7226	unsigned long start = addr & PUD_MASK;
7227	unsigned long end = start + PUD_SIZE;
7228
7229	#ifdef CONFIG_USERFAULTFD
7230	if (uffd_disable_huge_pmd_share(vma))
7231	return false;
7232	#endif
7233	/*
7234	* check on proper vm_flags and page table alignment
7235	*/
7236	if (!(vma->vm_flags & VM_MAYSHARE))
7237	return false;
7238	if (!vma->vm_private_data) / vma lock required for sharing /
7239	return false;
7240	if (!range_in_vma(vma, start, end))
7241	return false;
7242	return true;
7243	}
7244
7245	/*
7246	* Determine if start,end range within vma could be mapped by shared pmd.
7247	* If yes, adjust start and end to cover range associated with possible
7248	* shared pmd mappings.
7249	*/
7250	void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7251	unsigned long start, unsigned* long *end)
7252	{
7253	unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
7254	v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7255
7256	/*
7257	* vma needs to span at least one aligned PUD size, and the range
7258	* must be at least partially within in.
7259	*/
7260	if (!(vma->vm_flags & VM_MAYSHARE) \|\| !(v_end > v_start) \|\|
7261	(end <= v_start) \|\| (start >= v_end))
7262	return;
7263
7264	/ Extend the range to be PUD aligned for a worst case scenario /
7265	if (*start > v_start)
7266	start = ALIGN_DOWN(start, PUD_SIZE);
7267
7268	if (*end < v_end)
7269	end = ALIGN(end, PUD_SIZE);
7270	}
7271
7272	/*
7273	* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
7274	* and returns the corresponding pte. While this is not necessary for the
7275	* !shared pmd case because we can allocate the pmd later as well, it makes the
7276	* code much cleaner. pmd allocation is essential for the shared case because
7277	* pud has to be populated inside the same i_mmap_rwsem section - otherwise
7278	* racing tasks could either miss the sharing (see huge_pte_offset) or select a
7279	* bad pmd for sharing.
7280	*/
7281	pte_t huge_pmd_share(struct* mm_struct mm, struct* vm_area_struct *vma,
7282	unsigned long addr, pud_t *pud)
7283	{
7284	struct address_space *mapping = vma->vm_file->f_mapping;
7285	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
7286	vma->vm_pgoff;
7287	struct vm_area_struct *svma;
7288	unsigned long saddr;
7289	pte_t *spte = NULL;
7290	pte_t *pte;
7291
7292	i_mmap_lock_read(mapping);
7293	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
7294	if (svma == vma)
7295	continue;
7296
7297	saddr = page_table_shareable(svma, vma, addr, idx);
7298	if (saddr) {
7299	spte = hugetlb_walk(vma: svma, addr: saddr,
7300	sz: vma_mmu_pagesize(vma: svma));
7301	if (spte) {
7302	get_page(virt_to_page(spte));
7303	break;
7304	}
7305	}
7306	}
7307
7308	if (!spte)
7309	goto out;
7310
7311	spin_lock(lock: &mm->page_table_lock);
7312	if (pud_none(pud: *pud)) {
7313	pud_populate(mm, pud,
7314	pmd: (pmd_t )((unsigned* long)spte & PAGE_MASK));
7315	mm_inc_nr_pmds(mm);
7316	} else {
7317	put_page(virt_to_page(spte));
7318	}
7319	spin_unlock(lock: &mm->page_table_lock);
7320	out:
7321	pte = (pte_t *)pmd_alloc(mm, pud, address: addr);
7322	i_mmap_unlock_read(mapping);
7323	return pte;
7324	}
7325
7326	/*
7327	* unmap huge page backed by shared pte.
7328	*
7329	* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
7330	* indicated by page_count > 1, unmap is achieved by clearing pud and
7331	* decrementing the ref count. If count == 1, the pte page is not shared.
7332	*
7333	* Called with page table lock held.
7334	*
7335	* returns: 1 successfully unmapped a shared pte page
7336	* 0 the underlying pte page is not shared, or it is the last user
7337	*/
7338	int huge_pmd_unshare(struct mm_struct mm, struct* vm_area_struct *vma,
7339	unsigned long addr, pte_t *ptep)
7340	{
7341	pgd_t *pgd = pgd_offset(mm, addr);
7342	p4d_t *p4d = p4d_offset(pgd, address: addr);
7343	pud_t *pud = pud_offset(p4d, address: addr);
7344
7345	i_mmap_assert_write_locked(mapping: vma->vm_file->f_mapping);
7346	hugetlb_vma_assert_locked(vma);
7347	BUG_ON(page_count(virt_to_page(ptep)) == `0`);
7348	if (page_count(virt_to_page(ptep)) == `1`)
7349	return `0`;
7350
7351	pud_clear(pudp: pud);
7352	put_page(virt_to_page(ptep));
7353	mm_dec_nr_pmds(mm);
7354	return `1`;
7355	}
7356
7357	#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
7358
7359	pte_t huge_pmd_share(struct* mm_struct mm, struct* vm_area_struct *vma,
7360	unsigned long addr, pud_t *pud)
7361	{
7362	return NULL;
7363	}
7364
7365	int huge_pmd_unshare(struct mm_struct mm, struct* vm_area_struct *vma,
7366	unsigned long addr, pte_t *ptep)
7367	{
7368	return `0`;
7369	}
7370
/* Stub for !CONFIG_ARCH_WANT_HUGE_PMD_SHARE: the range is left untouched. */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
				unsigned long *start, unsigned long *end)
{
}
7375
7376	bool want_pmd_share(struct vm_area_struct vma, unsigned* long addr)
7377	{
7378	return false;
7379	}
7380	#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
7381
7382	#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
7383	pte_t huge_pte_alloc(struct* mm_struct mm, struct* vm_area_struct *vma,
7384	unsigned long addr, unsigned long sz)
7385	{
7386	pgd_t *pgd;
7387	p4d_t *p4d;
7388	pud_t *pud;
7389	pte_t *pte = NULL;
7390
7391	pgd = pgd_offset(mm, addr);
7392	p4d = p4d_alloc(mm, pgd, address: addr);
7393	if (!p4d)
7394	return NULL;
7395	pud = pud_alloc(mm, p4d, address: addr);
7396	if (pud) {
7397	if (sz == PUD_SIZE) {
7398	pte = (pte_t *)pud;
7399	} else {
7400	BUG_ON(sz != PMD_SIZE);
7401	if (want_pmd_share(vma, addr) && pud_none(pud: *pud))
7402	pte = huge_pmd_share(mm, vma, addr, pud);
7403	else
7404	pte = (pte_t *)pmd_alloc(mm, pud, address: addr);
7405	}
7406	}
7407
7408	if (pte) {
7409	pte_t pteval = ptep_get_lockless(ptep: pte);
7410
7411	BUG_ON(pte_present(pteval) && !pte_huge(pteval));
7412	}
7413
7414	return pte;
7415	}
7416
7417	/*
7418	* huge_pte_offset() - Walk the page table to resolve the hugepage
7419	* entry at address @addr
7420	*
7421	* Return: Pointer to page table entry (PUD or PMD) for
7422	* address @addr, or NULL if a !p*d_present() entry is encountered and the
7423	* size @sz doesn't match the hugepage size at this level of the page
7424	* table.
7425	*/
7426	pte_t huge_pte_offset(struct* mm_struct *mm,
7427	unsigned long addr, unsigned long sz)
7428	{
7429	pgd_t *pgd;
7430	p4d_t *p4d;
7431	pud_t *pud;
7432	pmd_t *pmd;
7433
7434	pgd = pgd_offset(mm, addr);
7435	if (!pgd_present(pgd: *pgd))
7436	return NULL;
7437	p4d = p4d_offset(pgd, address: addr);
7438	if (!p4d_present(p4d: *p4d))
7439	return NULL;
7440
7441	pud = pud_offset(p4d, address: addr);
7442	if (sz == PUD_SIZE)
7443	/ must be pud huge, non-present or none /
7444	return (pte_t *)pud;
7445	if (!pud_present(pud: *pud))
7446	return NULL;
7447	/ must have a valid entry and size to go further /
7448
7449	pmd = pmd_offset(pud, address: addr);
7450	/ must be pmd huge, non-present or none /
7451	return (pte_t *)pmd;
7452	}
7453
7454	/*
7455	* Return a mask that can be used to update an address to the last huge
7456	* page in a page table page mapping size. Used to skip non-present
7457	* page table entries when linearly scanning address ranges. Architectures
7458	* with unique huge page to page table relationships can define their own
7459	* version of this routine.
7460	*/
7461	unsigned long hugetlb_mask_last_page(struct hstate *h)
7462	{
7463	unsigned long hp_size = huge_page_size(h);
7464
7465	if (hp_size == PUD_SIZE)
7466	return P4D_SIZE - PUD_SIZE;
7467	else if (hp_size == PMD_SIZE)
7468	return PUD_SIZE - PMD_SIZE;
7469	else
7470	return `0UL`;
7471	}
7472
7473	#else
7474
7475	/ See description above. Architectures can provide their own version. /
7476	__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
7477	{
7478	#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
7479	if (huge_page_size(h) == PMD_SIZE)
7480	return PUD_SIZE - PMD_SIZE;
7481	#endif
7482	return `0UL`;
7483	}
7484
7485	#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
7486
7487	/*
7488	* These functions are overwritable if your architecture needs its own
7489	* behavior.
7490	*/
7491	bool isolate_hugetlb(struct folio folio, struct* list_head *list)
7492	{
7493	bool ret = true;
7494
7495	spin_lock_irq(lock: &hugetlb_lock);
7496	if (!folio_test_hugetlb(folio) \|\|
7497	!folio_test_hugetlb_migratable(folio) \|\|
7498	!folio_try_get(folio)) {
7499	ret = false;
7500	goto unlock;
7501	}
7502	folio_clear_hugetlb_migratable(folio);
7503	list_move_tail(list: &folio->lru, head: list);
7504	unlock:
7505	spin_unlock_irq(lock: &hugetlb_lock);
7506	return ret;
7507	}
7508
7509	int get_hwpoison_hugetlb_folio(struct folio folio, bool hugetlb, bool unpoison)
7510	{
7511	int ret = `0`;
7512
7513	*hugetlb = false;
7514	spin_lock_irq(lock: &hugetlb_lock);
7515	if (folio_test_hugetlb(folio)) {
7516	*hugetlb = true;
7517	if (folio_test_hugetlb_freed(folio))
7518	ret = `0`;
7519	else if (folio_test_hugetlb_migratable(folio) \|\| unpoison)
7520	ret = folio_try_get(folio);
7521	else
7522	ret = -EBUSY;
7523	}
7524	spin_unlock_irq(lock: &hugetlb_lock);
7525	return ret;
7526	}
7527
7528	int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
7529	bool *migratable_cleared)
7530	{
7531	int ret;
7532
7533	spin_lock_irq(lock: &hugetlb_lock);
7534	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
7535	spin_unlock_irq(lock: &hugetlb_lock);
7536	return ret;
7537	}
7538
7539	void folio_putback_active_hugetlb(struct folio *folio)
7540	{
7541	spin_lock_irq(lock: &hugetlb_lock);
7542	folio_set_hugetlb_migratable(folio);
7543	list_move_tail(list: &folio->lru, head: &(folio_hstate(folio))->hugepage_activelist);
7544	spin_unlock_irq(lock: &hugetlb_lock);
7545	folio_put(folio);
7546	}
7547
7548	void move_hugetlb_state(struct folio old_folio, struct* folio new_folio, int* reason)
7549	{
7550	struct hstate *h = folio_hstate(folio: old_folio);
7551
7552	hugetlb_cgroup_migrate(old_folio, new_folio);
7553	set_page_owner_migrate_reason(page: &new_folio->page, reason);
7554
7555	/*
7556	* transfer temporary state of the new hugetlb folio. This is
7557	* reverse to other transitions because the newpage is going to
7558	* be final while the old one will be freed so it takes over
7559	* the temporary status.
7560	*
7561	* Also note that we have to transfer the per-node surplus state
7562	* here as well otherwise the global surplus count will not match
7563	* the per-node's.
7564	*/
7565	if (folio_test_hugetlb_temporary(folio: new_folio)) {
7566	int old_nid = folio_nid(folio: old_folio);
7567	int new_nid = folio_nid(folio: new_folio);
7568
7569	folio_set_hugetlb_temporary(folio: old_folio);
7570	folio_clear_hugetlb_temporary(folio: new_folio);
7571
7572
7573	/*
7574	* There is no need to transfer the per-node surplus state
7575	* when we do not cross the node.
7576	*/
7577	if (new_nid == old_nid)
7578	return;
7579	spin_lock_irq(lock: &hugetlb_lock);
7580	if (h->surplus_huge_pages_node[old_nid]) {
7581	h->surplus_huge_pages_node[old_nid]--;
7582	h->surplus_huge_pages_node[new_nid]++;
7583	}
7584	spin_unlock_irq(lock: &hugetlb_lock);
7585	}
7586	}
7587
7588	static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
7589	unsigned long start,
7590	unsigned long end)
7591	{
7592	struct hstate *h = hstate_vma(vma);
7593	unsigned long sz = huge_page_size(h);
7594	struct mm_struct *mm = vma->vm_mm;
7595	struct mmu_notifier_range range;
7596	unsigned long address;
7597	spinlock_t *ptl;
7598	pte_t *ptep;
7599
7600	if (!(vma->vm_flags & VM_MAYSHARE))
7601	return;
7602
7603	if (start >= end)
7604	return;
7605
7606	flush_cache_range(vma, start, end);
7607	/*
7608	* No need to call adjust_range_if_pmd_sharing_possible(), because
7609	* we have already done the PUD_SIZE alignment.
7610	*/
7611	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
7612	start, end);
7613	mmu_notifier_invalidate_range_start(range: &range);
7614	hugetlb_vma_lock_write(vma);
7615	i_mmap_lock_write(mapping: vma->vm_file->f_mapping);
7616	for (address = start; address < end; address += PUD_SIZE) {
7617	ptep = hugetlb_walk(vma, addr: address, sz);
7618	if (!ptep)
7619	continue;
7620	ptl = huge_pte_lock(h, mm, pte: ptep);
7621	huge_pmd_unshare(mm, vma, addr: address, ptep);
7622	spin_unlock(lock: ptl);
7623	}
7624	flush_hugetlb_tlb_range(vma, start, end);
7625	i_mmap_unlock_write(mapping: vma->vm_file->f_mapping);
7626	hugetlb_vma_unlock_write(vma);
7627	/*
7628	* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
7629	* Documentation/mm/mmu_notifier.rst.
7630	*/
7631	mmu_notifier_invalidate_range_end(range: &range);
7632	}
7633
7634	/*
7635	* This function will unconditionally remove all the shared pmd pgtable entries
7636	* within the specific vma for a hugetlbfs memory range.
7637	*/
7638	void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7639	{
7640	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
7641	ALIGN_DOWN(vma->vm_end, PUD_SIZE));
7642	}
7643
7644	#ifdef CONFIG_CMA
7645	static bool cma_reserve_called __initdata;
7646
7647	static int __init cmdline_parse_hugetlb_cma(char *p)
7648	{
7649	int nid, count = `0`;
7650	unsigned long tmp;
7651	char *s = p;
7652
7653	while (*s) {
7654	if (sscanf(s, "%lu%n", &tmp, &count) != `1`)
7655	break;
7656
7657	if (s[count] == `':'`) {
7658	if (tmp >= MAX_NUMNODES)
7659	break;
7660	nid = array_index_nospec(tmp, MAX_NUMNODES);
7661
7662	s += count + `1`;
7663	tmp = memparse(ptr: s, retptr: &s);
7664	hugetlb_cma_size_in_node[nid] = tmp;
7665	hugetlb_cma_size += tmp;
7666
7667	/*
7668	* Skip the separator if have one, otherwise
7669	* break the parsing.
7670	*/
7671	if (*s == `','`)
7672	s++;
7673	else
7674	break;
7675	} else {
7676	hugetlb_cma_size = memparse(ptr: p, retptr: &p);
7677	break;
7678	}
7679	}
7680
7681	return `0`;
7682	}
7683
7684	early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
7685
7686	void __init hugetlb_cma_reserve(int order)
7687	{
7688	unsigned long size, reserved, per_node;
7689	bool node_specific_cma_alloc = false;
7690	int nid;
7691
7692	cma_reserve_called = true;
7693
7694	if (!hugetlb_cma_size)
7695	return;
7696
7697	for (nid = `0`; nid < MAX_NUMNODES; nid++) {
7698	if (hugetlb_cma_size_in_node[nid] == `0`)
7699	continue;
7700
7701	if (!node_online(nid)) {
7702	pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7703	hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7704	hugetlb_cma_size_in_node[nid] = `0`;
7705	continue;
7706	}
7707
7708	if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7709	pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7710	nid, (PAGE_SIZE << order) / SZ_1M);
7711	hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7712	hugetlb_cma_size_in_node[nid] = `0`;
7713	} else {
7714	node_specific_cma_alloc = true;
7715	}
7716	}
7717
7718	/ Validate the CMA size again in case some invalid nodes specified. /
7719	if (!hugetlb_cma_size)
7720	return;
7721
7722	if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7723	pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7724	(PAGE_SIZE << order) / SZ_1M);
7725	hugetlb_cma_size = `0`;
7726	return;
7727	}
7728
7729	if (!node_specific_cma_alloc) {
7730	/*
7731	* If 3 GB area is requested on a machine with 4 numa nodes,
7732	* let's allocate 1 GB on first three nodes and ignore the last one.
7733	*/
7734	per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7735	pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7736	hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7737	}
7738
7739	reserved = `0`;
7740	for_each_online_node(nid) {
7741	int res;
7742	char name[CMA_MAX_NAME];
7743
7744	if (node_specific_cma_alloc) {
7745	if (hugetlb_cma_size_in_node[nid] == `0`)
7746	continue;
7747
7748	size = hugetlb_cma_size_in_node[nid];
7749	} else {
7750	size = min(per_node, hugetlb_cma_size - reserved);
7751	}
7752
7753	size = round_up(size, PAGE_SIZE << order);
7754
7755	snprintf(buf: name, size: sizeof(name), fmt: "hugetlb%d", nid);
7756	/*
7757	* Note that 'order per bit' is based on smallest size that
7758	* may be returned to CMA allocator in the case of
7759	* huge page demotion.
7760	*/
7761	res = cma_declare_contiguous_nid(base: `0`, size, limit: `0`,
7762	PAGE_SIZE << HUGETLB_PAGE_ORDER,
7763	order_per_bit: `0`, fixed: false, name,
7764	res_cma: &hugetlb_cma[nid], nid);
7765	if (res) {
7766	pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7767	res, nid);
7768	continue;
7769	}
7770
7771	reserved += size;
7772	pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7773	size / SZ_1M, nid);
7774
7775	if (reserved >= hugetlb_cma_size)
7776	break;
7777	}
7778
7779	if (!reserved)
7780	/*
7781	* hugetlb_cma_size is used to determine if allocations from
7782	* cma are possible. Set to zero if no cma regions are set up.
7783	*/
7784	hugetlb_cma_size = `0`;
7785	}
7786
7787	static void __init hugetlb_cma_check(void)
7788	{
7789	if (!hugetlb_cma_size \|\| cma_reserve_called)
7790	return;
7791
7792	pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7793	}
7794
7795	#endif /* CONFIG_CMA */
7796

source code of linux/mm/hugetlb.c