slab.c source code [linux/mm/slab.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/mm/slab.c
4	* Written by Mark Hemment, 1996/97.
5	* (markhe@nextd.demon.co.uk)
6	*
7	* kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
8	*
9	* Major cleanup, different bufctl logic, per-cpu arrays
10	* (c) 2000 Manfred Spraul
11	*
12	* Cleanup, make the head arrays unconditional, preparation for NUMA
13	* (c) 2002 Manfred Spraul
14	*
15	* An implementation of the Slab Allocator as described in outline in;
16	* UNIX Internals: The New Frontiers by Uresh Vahalia
17	* Pub: Prentice Hall ISBN 0-13-101908-2
18	* or with a little more detail in;
19	* The Slab Allocator: An Object-Caching Kernel Memory Allocator
20	* Jeff Bonwick (Sun Microsystems).
21	* Presented at: USENIX Summer 1994 Technical Conference
22	*
23	* The memory is organized in caches, one cache for each object type.
24	* (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
25	* Each cache consists out of many slabs (they are small (usually one
26	* page long) and always contiguous), and each slab contains multiple
27	* initialized objects.
28	*
29	* This means, that your constructor is used only for newly allocated
30	* slabs and you must pass objects with the same initializations to
31	* kmem_cache_free.
32	*
33	* Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
34	* normal). If you need a special memory type, then must create a new
35	* cache for that memory type.
36	*
37	* In order to reduce fragmentation, the slabs are sorted in 3 groups:
38	* full slabs with 0 free objects
39	* partial slabs
40	* empty slabs with no allocated objects
41	*
42	* If partial slabs exist, then new allocations come from these slabs,
43	* otherwise from empty slabs or new slabs are allocated.
44	*
45	* kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
46	* during kmem_cache_destroy(). The caller must prevent concurrent allocs.
47	*
48	* Each cache has a short per-cpu head array, most allocs
49	* and frees go into that array, and if that array overflows, then 1/2
50	* of the entries in the array are given back into the global cache.
51	* The head array is strictly LIFO and should improve the cache hit rates.
52	* On SMP, it additionally reduces the spinlock operations.
53	*
54	* The c_cpuarray may not be read with enabled local interrupts -
55	* it's changed with a smp_call_function().
56	*
57	* SMP synchronization:
58	* constructors and destructors are called without any locking.
59	* Several members in struct kmem_cache and struct slab never change, they
60	* are accessed without any locking.
61	* The per-cpu arrays are never accessed from the wrong cpu, no locking,
62	* and local interrupts are disabled so slab code is preempt-safe.
63	* The non-constant members are protected with a per-cache irq spinlock.
64	*
65	* Many thanks to Mark Hemment, who wrote another per-cpu slab patch
66	* in 2000 - many ideas in the current implementation are derived from
67	* his patch.
68	*
69	* Further notes from the original documentation:
70	*
71	* 11 April '97. Started multi-threading - markhe
72	* The global cache-chain is protected by the mutex 'slab_mutex'.
73	* The sem is only needed when accessing/extending the cache-chain, which
74	* can never happen inside an interrupt (kmem_cache_create(),
75	* kmem_cache_shrink() and kmem_cache_reap()).
76	*
77	* At present, each engine can be growing a cache. This should be blocked.
78	*
79	* 15 March 2005. NUMA slab allocator.
80	* Shai Fultheim <shai@scalex86.org>.
81	* Shobhit Dayal <shobhit@calsoftinc.com>
82	* Alok N Kataria <alokk@calsoftinc.com>
83	* Christoph Lameter <christoph@lameter.com>
84	*
85	* Modified the slab allocator to be node aware on NUMA systems.
86	* Each node has its own list of partial, free and full slabs.
87	* All object allocations for a node occur from node specific slab lists.
88	*/
89
90	#include <linux/slab.h>
91	#include <linux/mm.h>
92	#include <linux/poison.h>
93	#include <linux/swap.h>
94	#include <linux/cache.h>
95	#include <linux/interrupt.h>
96	#include <linux/init.h>
97	#include <linux/compiler.h>
98	#include <linux/cpuset.h>
99	#include <linux/proc_fs.h>
100	#include <linux/seq_file.h>
101	#include <linux/notifier.h>
102	#include <linux/kallsyms.h>
103	#include <linux/kfence.h>
104	#include <linux/cpu.h>
105	#include <linux/sysctl.h>
106	#include <linux/module.h>
107	#include <linux/rcupdate.h>
108	#include <linux/string.h>
109	#include <linux/uaccess.h>
110	#include <linux/nodemask.h>
111	#include <linux/kmemleak.h>
112	#include <linux/mempolicy.h>
113	#include <linux/mutex.h>
114	#include <linux/fault-inject.h>
115	#include <linux/rtmutex.h>
116	#include <linux/reciprocal_div.h>
117	#include <linux/debugobjects.h>
118	#include <linux/memory.h>
119	#include <linux/prefetch.h>
120	#include <linux/sched/task_stack.h>
121
122	#include <net/sock.h>
123
124	#include <asm/cacheflush.h>
125	#include <asm/tlbflush.h>
126	#include <asm/page.h>
127
128	#include <trace/events/kmem.h>
129
130	#include "internal.h"
131
132	#include "slab.h"
133
/*
 * Compile-time switches, all driven by CONFIG_DEBUG_SLAB:
 *
 * DEBUG	- 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif
153
/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
/* Alignment used for red zones: the larger of a word and unsigned long long. */
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

/* Default kmalloc cache flags unless something defined them earlier. */
#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
161
/*
 * A one-byte freelist index is used when (PAGE_SIZE >> BITS_PER_BYTE) does
 * not exceed SLAB_OBJ_MIN_SIZE; otherwise a two-byte index is needed.
 */
#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
				<= SLAB_OBJ_MIN_SIZE) ? 1 : 0)

#if FREELIST_BYTE_INDEX
typedef unsigned char freelist_idx_t;
#else
typedef unsigned short freelist_idx_t;
#endif

/* Largest value representable in a freelist_idx_t. */
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
172
/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 * @avail:	number of object pointers currently stored in @entry
 * @limit:	capacity of @entry
 * @batchcount:	batch size recorded by init_arraycache()
 * @touched:	flag cleared by init_arraycache()
 * @entry:	flexible array of object pointers, used as a stack
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};
196
197	struct alien_cache {
198	spinlock_t lock;
199	struct array_cache ac;
200	};
201
202	/*
203	* Need this for bootstrapping a per node allocator.
204	*/
205	#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
206	static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
207	#define CACHE_CACHE 0
208	#define SIZE_NODE (MAX_NUMNODES)
209
210	static int drain_freelist(struct kmem_cache *cache,
211	struct kmem_cache_node n, int* tofree);
212	static void free_block(struct kmem_cache cachep, void* *objpp, int* len,
213	int node, struct list_head *list);
214	static void slabs_destroy(struct kmem_cache cachep, struct* list_head *list);
215	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
216	static void cache_reap(struct work_struct *unused);
217
218	static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
219	void **list);
220	static inline void fixup_slab_list(struct kmem_cache *cachep,
221	struct kmem_cache_node n, struct* slab *slab,
222	void **list);
223
224	#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
225
226	static void kmem_cache_node_init(struct kmem_cache_node *parent)
227	{
228	INIT_LIST_HEAD(list: &parent->slabs_full);
229	INIT_LIST_HEAD(list: &parent->slabs_partial);
230	INIT_LIST_HEAD(list: &parent->slabs_free);
231	parent->total_slabs = `0`;
232	parent->free_slabs = `0`;
233	parent->shared = NULL;
234	parent->alien = NULL;
235	parent->colour_next = `0`;
236	raw_spin_lock_init(&parent->list_lock);
237	parent->free_objects = `0`;
238	parent->free_touched = `0`;
239	}
240
/*
 * MAKE_LIST - initialise @listp, then splice onto it the list_head member
 * named @slab of the kmem_cache_node returned by get_node(cachep, nodeid).
 */
#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&get_node(cachep, nodeid)->slab, listp);	\
	} while (0)

/* Apply MAKE_LIST to the slabs_full, slabs_partial and slabs_free of @ptr. */
#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)
253
/* Internal cache flag bits and the predicates that test them on a cache. */
#define CFLGS_OBJFREELIST_SLAB	((slab_flags_t __force)0x40000000U)
#define CFLGS_OFF_SLAB		((slab_flags_t __force)0x80000000U)
#define	OBJFREELIST_SLAB(x)	((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
/*
 * Optimization question: fewer reaps means less probability for unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_AC		(2*HZ)
#define REAPTIMEOUT_NODE	(4*HZ)
269
/*
 * Statistics hooks. With STATS enabled they update counters stored in the
 * cache (@x); otherwise they expand to empty statements. STATS_ADD_REAPED
 * still evaluates @y in the disabled case so the argument counts as used.
 */
#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x, y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\
			(x)->high_mark = (x)->num_active;		\
	} while (0)
#define	STATS_INC_ERR(x)	((x)->errors++)
#define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
#define STATS_INC_ACOVERFLOW(x)   ((x)->node_overflow++)
/* @i is parenthesised so that an expression argument keeps its meaning. */
#define	STATS_SET_FREEABLE(x, i)					\
	do {								\
		if ((x)->max_freeable < (i))				\
			(x)->max_freeable = (i);			\
	} while (0)
#define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
#define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
#define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x)	atomic_inc(&(x)->freemiss)
#else
#define	STATS_INC_ACTIVE(x)	do { } while (0)
#define	STATS_DEC_ACTIVE(x)	do { } while (0)
#define	STATS_INC_ALLOCED(x)	do { } while (0)
#define	STATS_INC_GROWN(x)	do { } while (0)
#define	STATS_ADD_REAPED(x, y)	do { (void)(y); } while (0)
#define	STATS_SET_HIGH(x)	do { } while (0)
#define	STATS_INC_ERR(x)	do { } while (0)
#define	STATS_INC_NODEALLOCS(x)	do { } while (0)
#define	STATS_INC_NODEFREES(x)	do { } while (0)
#define STATS_INC_ACOVERFLOW(x)   do { } while (0)
#define	STATS_SET_FREEABLE(x, i) do { } while (0)
#define STATS_INC_ALLOCHIT(x)	do { } while (0)
#define STATS_INC_ALLOCMISS(x)	do { } while (0)
#define STATS_INC_FREEHIT(x)	do { } while (0)
#define STATS_INC_FREEMISS(x)	do { } while (0)
#endif
311
312	#if DEBUG
313
314	/*
315	* memory layout of objects:
316	* 0 : objp
317	* 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
318	* the end of an object is aligned with the end of the real
319	* allocation. Catches writes behind the end of the allocation.
320	* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
321	* redzone word.
322	* cachep->obj_offset: The real object.
323	* cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
324	* cachep->size - 1* BYTES_PER_WORD: last caller address
325	* [BYTES_PER_WORD long]
326	*/
327	static int obj_offset(struct kmem_cache *cachep)
328	{
329	return cachep->obj_offset;
330	}
331
332	static unsigned long long dbg_redzone1(struct* kmem_cache cachep, void* *objp)
333	{
334	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
335	return (unsigned long long *) (objp + obj_offset(cachep) -
336	sizeof(unsigned long long));
337	}
338
339	static unsigned long long dbg_redzone2(struct* kmem_cache cachep, void* *objp)
340	{
341	BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
342	if (cachep->flags & SLAB_STORE_USER)
343	return (unsigned long long *)(objp + cachep->size -
344	sizeof(unsigned long long) -
345	REDZONE_ALIGN);
346	return (unsigned long long *) (objp + cachep->size -
347	sizeof(unsigned long long));
348	}
349
350	static void dbg_userword(struct** kmem_cache cachep, void* *objp)
351	{
352	BUG_ON(!(cachep->flags & SLAB_STORE_USER));
353	return (void **)(objp + cachep->size - BYTES_PER_WORD);
354	}
355
356	#else
357
358	#define obj_offset(x) 0
359	#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
360	#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
361	#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
362
363	#endif
364
365	/*
366	* Do not go above this order unless 0 objects fit into the slab or
367	* overridden on the command line.
368	*/
369	#define SLAB_MAX_ORDER_HI 1
370	#define SLAB_MAX_ORDER_LO 0
371	static int slab_max_order = SLAB_MAX_ORDER_LO;
372	static bool slab_max_order_set __initdata;
373
374	static inline void index_to_obj(struct* kmem_cache *cache,
375	const struct slab slab, unsigned* int idx)
376	{
377	return slab->s_mem + cache->size * idx;
378	}
379
380	#define BOOT_CPUCACHE_ENTRIES 1
381	/ internal cache of cache description objs /
382	static struct kmem_cache kmem_cache_boot = {
383	.batchcount = `1`,
384	.limit = BOOT_CPUCACHE_ENTRIES,
385	.shared = `1`,
386	.size = sizeof(struct kmem_cache),
387	.name = "kmem_cache",
388	};
389
390	static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
391
392	static inline struct array_cache cpu_cache_get(struct* kmem_cache *cachep)
393	{
394	return this_cpu_ptr(cachep->cpu_cache);
395	}
396
397	/*
398	* Calculate the number of objects and left-over bytes for a given buffer size.
399	*/
400	static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
401	slab_flags_t flags, size_t *left_over)
402	{
403	unsigned int num;
404	size_t slab_size = PAGE_SIZE << gfporder;
405
406	/*
407	* The slab management structure can be either off the slab or
408	* on it. For the latter case, the memory allocated for a
409	* slab is used for:
410	*
411	* - @buffer_size bytes for each object
412	* - One freelist_idx_t for each object
413	*
414	* We don't need to consider alignment of freelist because
415	* freelist will be at the end of slab page. The objects will be
416	* at the correct alignment.
417	*
418	* If the slab management structure is off the slab, then the
419	* alignment will already be calculated into the size. Because
420	* the slabs are all pages aligned, the objects will be at the
421	* correct alignment when allocated.
422	*/
423	if (flags & (CFLGS_OBJFREELIST_SLAB \| CFLGS_OFF_SLAB)) {
424	num = slab_size / buffer_size;
425	*left_over = slab_size % buffer_size;
426	} else {
427	num = slab_size / (buffer_size + sizeof(freelist_idx_t));
428	*left_over = slab_size %
429	(buffer_size + sizeof(freelist_idx_t));
430	}
431
432	return num;
433	}
434
#if DEBUG
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)

/*
 * Log a slab error naming the reporting function and the cache, dump the
 * current stack and taint the kernel with TAINT_BAD_PAGE.
 */
static void __slab_error(const char *function, struct kmem_cache *cachep,
			char *msg)
{
	pr_err("slab error in %s(): cache `%s': %s\n",
	       function, cachep->name, msg);
	dump_stack();
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
#endif
447
448	/*
449	* By default on NUMA we use alien caches to stage the freeing of
450	* objects allocated from other nodes. This causes massive memory
451	* inefficiencies when using fake NUMA setup to split memory into a
452	* large number of small nodes, so it can be disabled on the command
453	* line
454	*/
455
456	static int use_alien_caches __read_mostly = `1`;
457	static int __init noaliencache_setup(char *s)
458	{
459	use_alien_caches = `0`;
460	return `1`;
461	}
462	__setup("noaliencache", noaliencache_setup);
463
464	static int __init slab_max_order_setup(char *str)
465	{
466	get_option(str: &str, pint: &slab_max_order);
467	slab_max_order = slab_max_order < `0` ? `0` :
468	min(slab_max_order, MAX_ORDER);
469	slab_max_order_set = true;
470
471	return `1`;
472	}
473	__setup("slab_max_order=", slab_max_order_setup);
474
#ifdef CONFIG_NUMA
/*
 * Special reaping functions for NUMA systems called from cache_reap().
 * These take care of doing round robin flushing of alien caches (containing
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 */
static DEFINE_PER_CPU(unsigned long, slab_reap_node);

/* Seed @cpu's reap node with the online node following the cpu's own. */
static void init_reap_node(int cpu)
{
	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
						    node_online_map);
}

/* Advance this cpu's reap node to the next node in node_online_map. */
static void next_reap_node(void)
{
	int node = __this_cpu_read(slab_reap_node);

	node = next_node_in(node, node_online_map);
	__this_cpu_write(slab_reap_node, node);
}

#else
#define init_reap_node(cpu) do { } while (0)
#define next_reap_node(void) do { } while (0)
#endif
502
503	/*
504	* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
505	* via the workqueue/eventd.
506	* Add the CPU number into the expiration time to minimize the possibility of
507	* the CPUs getting into lockstep and contending for the global cache chain
508	* lock.
509	*/
510	static void start_cpu_timer(int cpu)
511	{
512	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
513
514	if (reap_work->work.func == NULL) {
515	init_reap_node(cpu);
516	INIT_DEFERRABLE_WORK(reap_work, cache_reap);
517	schedule_delayed_work_on(cpu, dwork: reap_work,
518	delay: __round_jiffies_relative(HZ, cpu));
519	}
520	}
521
522	static void init_arraycache(struct array_cache ac, int* limit, int batch)
523	{
524	if (ac) {
525	ac->avail = `0`;
526	ac->limit = limit;
527	ac->batchcount = batch;
528	ac->touched = `0`;
529	}
530	}
531
532	static struct array_cache alloc_arraycache(int* node, int entries,
533	int batchcount, gfp_t gfp)
534	{
535	size_t memsize = sizeof(void ) entries + sizeof(struct array_cache);
536	struct array_cache *ac = NULL;
537
538	ac = kmalloc_node(size: memsize, flags: gfp, node);
539	/*
540	* The array_cache structures contain pointers to free object.
541	* However, when such objects are allocated or transferred to another
542	* cache the pointers are not cleared and they could be counted as
543	* valid references during a kmemleak scan. Therefore, kmemleak must
544	* not scan such objects.
545	*/
546	kmemleak_no_scan(ptr: ac);
547	init_arraycache(ac, limit: entries, batch: batchcount);
548	return ac;
549	}
550
551	static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
552	struct slab slab, void* *objp)
553	{
554	struct kmem_cache_node *n;
555	int slab_node;
556	LIST_HEAD(list);
557
558	slab_node = slab_nid(slab);
559	n = get_node(s: cachep, node: slab_node);
560
561	raw_spin_lock(&n->list_lock);
562	free_block(cachep, objpp: &objp, len: `1`, node: slab_node, list: &list);
563	raw_spin_unlock(&n->list_lock);
564
565	slabs_destroy(cachep, list: &list);
566	}
567
568	/*
569	* Transfer objects in one arraycache to another.
570	* Locking must be handled by the caller.
571	*
572	* Return the number of entries transferred.
573	*/
574	static int transfer_objects(struct array_cache *to,
575	struct array_cache from, unsigned* int max)
576	{
577	/ Figure out how many entries to transfer /
578	int nr = min3(from->avail, max, to->limit - to->avail);
579
580	if (!nr)
581	return `0`;
582
583	memcpy(to->entry + to->avail, from->entry + from->avail - nr,
584	sizeof(void ) nr);
585
586	from->avail -= nr;
587	to->avail += nr;
588	return nr;
589	}
590
591	/ &alien->lock must be held by alien callers. /
592	static __always_inline void __free_one(struct array_cache ac, void* *objp)
593	{
594	/ Avoid trivial double-free. /
595	if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
596	WARN_ON_ONCE(ac->avail > `0` && ac->entry[ac->avail - `1`] == objp))
597	return;
598	ac->entry[ac->avail++] = objp;
599	}
600
601	#ifndef CONFIG_NUMA
602
603	#define drain_alien_cache(cachep, alien) do { } while (0)
604	#define reap_alien(cachep, n) do { } while (0)
605
606	static inline struct alien_cache *alloc_alien_cache(int* node,
607	int limit, gfp_t gfp)
608	{
609	return NULL;
610	}
611
612	static inline void free_alien_cache(struct alien_cache **ac_ptr)
613	{
614	}
615
616	static inline int cache_free_alien(struct kmem_cache cachep, void* *objp)
617	{
618	return `0`;
619	}
620
621	static inline gfp_t gfp_exact_node(gfp_t flags)
622	{
623	return flags & ~__GFP_NOFAIL;
624	}
625
626	#else /* CONFIG_NUMA */
627
628	static struct alien_cache __alloc_alien_cache(int* node, int entries,
629	int batch, gfp_t gfp)
630	{
631	size_t memsize = sizeof(void ) entries + sizeof(struct alien_cache);
632	struct alien_cache *alc = NULL;
633
634	alc = kmalloc_node(size: memsize, flags: gfp, node);
635	if (alc) {
636	kmemleak_no_scan(ptr: alc);
637	init_arraycache(ac: &alc->ac, limit: entries, batch);
638	spin_lock_init(&alc->lock);
639	}
640	return alc;
641	}
642
643	static struct alien_cache *alloc_alien_cache(int* node, int limit, gfp_t gfp)
644	{
645	struct alien_cache **alc_ptr;
646	int i;
647
648	if (limit > `1`)
649	limit = `12`;
650	alc_ptr = kcalloc_node(n: nr_node_ids, size: sizeof(void *), flags: gfp, node);
651	if (!alc_ptr)
652	return NULL;
653
654	for_each_node(i) {
655	if (i == node \|\| !node_online(i))
656	continue;
657	alc_ptr[i] = __alloc_alien_cache(node, entries: limit, batch: `0xbaadf00d`, gfp);
658	if (!alc_ptr[i]) {
659	for (i--; i >= `0`; i--)
660	kfree(objp: alc_ptr[i]);
661	kfree(objp: alc_ptr);
662	return NULL;
663	}
664	}
665	return alc_ptr;
666	}
667
668	static void free_alien_cache(struct alien_cache **alc_ptr)
669	{
670	int i;
671
672	if (!alc_ptr)
673	return;
674	for_each_node(i)
675	kfree(objp: alc_ptr[i]);
676	kfree(objp: alc_ptr);
677	}
678
679	static void __drain_alien_cache(struct kmem_cache *cachep,
680	struct array_cache ac, int* node,
681	struct list_head *list)
682	{
683	struct kmem_cache_node *n = get_node(s: cachep, node);
684
685	if (ac->avail) {
686	raw_spin_lock(&n->list_lock);
687	/*
688	* Stuff objects into the remote nodes shared array first.
689	* That way we could avoid the overhead of putting the objects
690	* into the free lists and getting them back later.
691	*/
692	if (n->shared)
693	transfer_objects(to: n->shared, from: ac, max: ac->limit);
694
695	free_block(cachep, objpp: ac->entry, len: ac->avail, node, list);
696	ac->avail = `0`;
697	raw_spin_unlock(&n->list_lock);
698	}
699	}
700
701	/*
702	* Called from cache_reap() to regularly drain alien caches round robin.
703	*/
704	static void reap_alien(struct kmem_cache cachep, struct* kmem_cache_node *n)
705	{
706	int node = __this_cpu_read(slab_reap_node);
707
708	if (n->alien) {
709	struct alien_cache *alc = n->alien[node];
710	struct array_cache *ac;
711
712	if (alc) {
713	ac = &alc->ac;
714	if (ac->avail && spin_trylock_irq(lock: &alc->lock)) {
715	LIST_HEAD(list);
716
717	__drain_alien_cache(cachep, ac, node, list: &list);
718	spin_unlock_irq(lock: &alc->lock);
719	slabs_destroy(cachep, list: &list);
720	}
721	}
722	}
723	}
724
725	static void drain_alien_cache(struct kmem_cache *cachep,
726	struct alien_cache **alien)
727	{
728	int i = `0`;
729	struct alien_cache *alc;
730	struct array_cache *ac;
731	unsigned long flags;
732
733	for_each_online_node(i) {
734	alc = alien[i];
735	if (alc) {
736	LIST_HEAD(list);
737
738	ac = &alc->ac;
739	spin_lock_irqsave(&alc->lock, flags);
740	__drain_alien_cache(cachep, ac, node: i, list: &list);
741	spin_unlock_irqrestore(lock: &alc->lock, flags);
742	slabs_destroy(cachep, list: &list);
743	}
744	}
745	}
746
747	static int __cache_free_alien(struct kmem_cache cachep, void* *objp,
748	int node, int slab_node)
749	{
750	struct kmem_cache_node *n;
751	struct alien_cache *alien = NULL;
752	struct array_cache *ac;
753	LIST_HEAD(list);
754
755	n = get_node(s: cachep, node);
756	STATS_INC_NODEFREES(cachep);
757	if (n->alien && n->alien[slab_node]) {
758	alien = n->alien[slab_node];
759	ac = &alien->ac;
760	spin_lock(lock: &alien->lock);
761	if (unlikely(ac->avail == ac->limit)) {
762	STATS_INC_ACOVERFLOW(cachep);
763	__drain_alien_cache(cachep, ac, node: slab_node, list: &list);
764	}
765	__free_one(ac, objp);
766	spin_unlock(lock: &alien->lock);
767	slabs_destroy(cachep, list: &list);
768	} else {
769	n = get_node(s: cachep, node: slab_node);
770	raw_spin_lock(&n->list_lock);
771	free_block(cachep, objpp: &objp, len: `1`, node: slab_node, list: &list);
772	raw_spin_unlock(&n->list_lock);
773	slabs_destroy(cachep, list: &list);
774	}
775	return `1`;
776	}
777
/*
 * Decide whether @objp must take the alien path.
 *
 * Returns 0 when the object's slab lives on the local memory node, so
 * nothing was done here; otherwise returns the result of
 * __cache_free_alien().
 */
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
	int slab_node = slab_nid(virt_to_slab(objp));
	int node = numa_mem_id();
	/*
	 * Make sure we are not freeing an object from another node to the array
	 * cache on this cpu.
	 */
	if (likely(node == slab_node))
		return 0;

	return __cache_free_alien(cachep, objp, node, slab_node);
}
791
792	/*
793	* Construct gfp mask to allocate from a specific node but do not reclaim or
794	* warn about failures.
795	*/
796	static inline gfp_t gfp_exact_node(gfp_t flags)
797	{
798	return (flags \| __GFP_THISNODE \| __GFP_NOWARN) & ~(__GFP_RECLAIM\|__GFP_NOFAIL);
799	}
800	#endif
801
802	static int init_cache_node(struct kmem_cache cachep, int* node, gfp_t gfp)
803	{
804	struct kmem_cache_node *n;
805
806	/*
807	* Set up the kmem_cache_node for cpu before we can
808	* begin anything. Make sure some other cpu on this
809	* node has not already allocated this
810	*/
811	n = get_node(s: cachep, node);
812	if (n) {
813	raw_spin_lock_irq(&n->list_lock);
814	n->free_limit = (`1` + nr_cpus_node(node)) * cachep->batchcount +
815	cachep->num;
816	raw_spin_unlock_irq(&n->list_lock);
817
818	return `0`;
819	}
820
821	n = kmalloc_node(size: sizeof(struct kmem_cache_node), flags: gfp, node);
822	if (!n)
823	return -ENOMEM;
824
825	kmem_cache_node_init(parent: n);
826	n->next_reap = jiffies + REAPTIMEOUT_NODE +
827	((unsigned long)cachep) % REAPTIMEOUT_NODE;
828
829	n->free_limit =
830	(`1` + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
831
832	/*
833	* The kmem_cache_nodes don't come and go as CPUs
834	* come and go. slab_mutex provides sufficient
835	* protection here.
836	*/
837	cachep->node[node] = n;
838
839	return `0`;
840	}
841
#if defined(CONFIG_NUMA) || defined(CONFIG_SMP)
/*
 * Allocates and initializes node for a node on each slab cache, used for
 * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
 * will be allocated off-node since memory is not yet online for the new node.
 * When hotplugging memory or a cpu, existing nodes are not replaced if
 * already in use.
 *
 * Must hold slab_mutex.
 *
 * Returns 0, or the first non-zero result of init_cache_node(); caches that
 * follow the failing one on slab_caches are not visited.
 */
static int init_cache_node_node(int node)
{
	int ret;
	struct kmem_cache *cachep;

	list_for_each_entry(cachep, &slab_caches, list) {
		ret = init_cache_node(cachep, node, GFP_KERNEL);
		if (ret)
			return ret;
	}

	return 0;
}
#endif
866
867	static int setup_kmem_cache_node(struct kmem_cache *cachep,
868	int node, gfp_t gfp, bool force_change)
869	{
870	int ret = -ENOMEM;
871	struct kmem_cache_node *n;
872	struct array_cache *old_shared = NULL;
873	struct array_cache *new_shared = NULL;
874	struct alien_cache **new_alien = NULL;
875	LIST_HEAD(list);
876
877	if (use_alien_caches) {
878	new_alien = alloc_alien_cache(node, limit: cachep->limit, gfp);
879	if (!new_alien)
880	goto fail;
881	}
882
883	if (cachep->shared) {
884	new_shared = alloc_arraycache(node,
885	entries: cachep->shared * cachep->batchcount, batchcount: `0xbaadf00d`, gfp);
886	if (!new_shared)
887	goto fail;
888	}
889
890	ret = init_cache_node(cachep, node, gfp);
891	if (ret)
892	goto fail;
893
894	n = get_node(s: cachep, node);
895	raw_spin_lock_irq(&n->list_lock);
896	if (n->shared && force_change) {
897	free_block(cachep, objpp: n->shared->entry,
898	len: n->shared->avail, node, list: &list);
899	n->shared->avail = `0`;
900	}
901
902	if (!n->shared \|\| force_change) {
903	old_shared = n->shared;
904	n->shared = new_shared;
905	new_shared = NULL;
906	}
907
908	if (!n->alien) {
909	n->alien = new_alien;
910	new_alien = NULL;
911	}
912
913	raw_spin_unlock_irq(&n->list_lock);
914	slabs_destroy(cachep, list: &list);
915
916	/*
917	* To protect lockless access to n->shared during irq disabled context.
918	* If n->shared isn't NULL in irq disabled context, accessing to it is
919	* guaranteed to be valid until irq is re-enabled, because it will be
920	* freed after synchronize_rcu().
921	*/
922	if (old_shared && force_change)
923	synchronize_rcu();
924
925	fail:
926	kfree(objp: old_shared);
927	kfree(objp: new_shared);
928	free_alien_cache(alc_ptr: new_alien);
929
930	return ret;
931	}
932
933	#ifdef CONFIG_SMP
934
935	static void cpuup_canceled(long cpu)
936	{
937	struct kmem_cache *cachep;
938	struct kmem_cache_node *n = NULL;
939	int node = cpu_to_mem(cpu);
940	const struct cpumask *mask = cpumask_of_node(node);
941
942	list_for_each_entry(cachep, &slab_caches, list) {
943	struct array_cache *nc;
944	struct array_cache *shared;
945	struct alien_cache **alien;
946	LIST_HEAD(list);
947
948	n = get_node(s: cachep, node);
949	if (!n)
950	continue;
951
952	raw_spin_lock_irq(&n->list_lock);
953
954	/ Free limit for this kmem_cache_node /
955	n->free_limit -= cachep->batchcount;
956
957	/ cpu is dead; no one can alloc from it. /
958	nc = per_cpu_ptr(cachep->cpu_cache, cpu);
959	free_block(cachep, objpp: nc->entry, len: nc->avail, node, list: &list);
960	nc->avail = `0`;
961
962	if (!cpumask_empty(srcp: mask)) {
963	raw_spin_unlock_irq(&n->list_lock);
964	goto free_slab;
965	}
966
967	shared = n->shared;
968	if (shared) {
969	free_block(cachep, objpp: shared->entry,
970	len: shared->avail, node, list: &list);
971	n->shared = NULL;
972	}
973
974	alien = n->alien;
975	n->alien = NULL;
976
977	raw_spin_unlock_irq(&n->list_lock);
978
979	kfree(objp: shared);
980	if (alien) {
981	drain_alien_cache(cachep, alien);
982	free_alien_cache(alc_ptr: alien);
983	}
984
985	free_slab:
986	slabs_destroy(cachep, list: &list);
987	}
988	/*
989	* In the previous loop, all the objects were freed to
990	* the respective cache's slabs, now we can go ahead and
991	* shrink each nodelist to its limit.
992	*/
993	list_for_each_entry(cachep, &slab_caches, list) {
994	n = get_node(s: cachep, node);
995	if (!n)
996	continue;
997	drain_freelist(cache: cachep, n, INT_MAX);
998	}
999	}
1000
1001	static int cpuup_prepare(long cpu)
1002	{
1003	struct kmem_cache *cachep;
1004	int node = cpu_to_mem(cpu);
1005	int err;
1006
1007	/*
1008	* We need to do this right in the beginning since
1009	* alloc_arraycache's are going to use this list.
1010	* kmalloc_node allows us to add the slab to the right
1011	* kmem_cache_node and not this cpu's kmem_cache_node
1012	*/
1013	err = init_cache_node_node(node);
1014	if (err < `0`)
1015	goto bad;
1016
1017	/*
1018	* Now we can go ahead with allocating the shared arrays and
1019	* array caches
1020	*/
1021	list_for_each_entry(cachep, &slab_caches, list) {
1022	err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, force_change: false);
1023	if (err)
1024	goto bad;
1025	}
1026
1027	return `0`;
1028	bad:
1029	cpuup_canceled(cpu);
1030	return -ENOMEM;
1031	}
1032
1033	int slab_prepare_cpu(unsigned int cpu)
1034	{
1035	int err;
1036
1037	mutex_lock(&slab_mutex);
1038	err = cpuup_prepare(cpu);
1039	mutex_unlock(&slab_mutex);
1040	return err;
1041	}
1042
1043	/*
1044	* This is called for a failed online attempt and for a successful
1045	* offline.
1046	*
1047	* Even if all the cpus of a node are down, we don't free the
1048	* kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
1049	* a kmalloc allocation from another cpu for memory from the node of
1050	* the cpu going down. The kmem_cache_node structure is usually allocated from
1051	* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
1052	*/
1053	int slab_dead_cpu(unsigned int cpu)
1054	{
1055	mutex_lock(&slab_mutex);
1056	cpuup_canceled(cpu);
1057	mutex_unlock(&slab_mutex);
1058	return `0`;
1059	}
1060	#endif
1061
/*
 * CPU hotplug startup callback (registered in cpucache_init()): starts the
 * per-cpu timer for @cpu via start_cpu_timer().  Always returns 0.
 */
static int slab_online_cpu(unsigned int cpu)
{
	start_cpu_timer(cpu);
	return 0;
}
1067
1068	static int slab_offline_cpu(unsigned int cpu)
1069	{
1070	/*
1071	* Shutdown cache reaper. Note that the slab_mutex is held so
1072	* that if cache_reap() is invoked it cannot do anything
1073	* expensive but will only modify reap_work and reschedule the
1074	* timer.
1075	*/
1076	cancel_delayed_work_sync(dwork: &per_cpu(slab_reap_work, cpu));
1077	/ Now the cache_reaper is guaranteed to be not running. /
1078	per_cpu(slab_reap_work, cpu).work.func = NULL;
1079	return `0`;
1080	}
1081
1082	#if defined(CONFIG_NUMA)
1083	/*
1084	* Drains freelist for a node on each slab cache, used for memory hot-remove.
1085	* Returns -EBUSY if all objects cannot be drained so that the node is not
1086	* removed.
1087	*
1088	* Must hold slab_mutex.
1089	*/
1090	static int __meminit drain_cache_node_node(int node)
1091	{
1092	struct kmem_cache *cachep;
1093	int ret = `0`;
1094
1095	list_for_each_entry(cachep, &slab_caches, list) {
1096	struct kmem_cache_node *n;
1097
1098	n = get_node(s: cachep, node);
1099	if (!n)
1100	continue;
1101
1102	drain_freelist(cache: cachep, n, INT_MAX);
1103
1104	if (!list_empty(head: &n->slabs_full) \|\|
1105	!list_empty(head: &n->slabs_partial)) {
1106	ret = -EBUSY;
1107	break;
1108	}
1109	}
1110	return ret;
1111	}
1112
1113	static int __meminit slab_memory_callback(struct notifier_block *self,
1114	unsigned long action, void *arg)
1115	{
1116	struct memory_notify *mnb = arg;
1117	int ret = `0`;
1118	int nid;
1119
1120	nid = mnb->status_change_nid;
1121	if (nid < `0`)
1122	goto out;
1123
1124	switch (action) {
1125	case MEM_GOING_ONLINE:
1126	mutex_lock(&slab_mutex);
1127	ret = init_cache_node_node(node: nid);
1128	mutex_unlock(lock: &slab_mutex);
1129	break;
1130	case MEM_GOING_OFFLINE:
1131	mutex_lock(&slab_mutex);
1132	ret = drain_cache_node_node(node: nid);
1133	mutex_unlock(lock: &slab_mutex);
1134	break;
1135	case MEM_ONLINE:
1136	case MEM_OFFLINE:
1137	case MEM_CANCEL_ONLINE:
1138	case MEM_CANCEL_OFFLINE:
1139	break;
1140	}
1141	out:
1142	return notifier_from_errno(err: ret);
1143	}
1144	#endif /* CONFIG_NUMA */
1145
1146	/*
1147	* swap the static kmem_cache_node with kmalloced memory
1148	*/
1149	static void __init init_list(struct kmem_cache cachep, struct* kmem_cache_node *list,
1150	int nodeid)
1151	{
1152	struct kmem_cache_node *ptr;
1153
1154	ptr = kmalloc_node(size: sizeof(struct kmem_cache_node), GFP_NOWAIT, node: nodeid);
1155	BUG_ON(!ptr);
1156
1157	memcpy(ptr, list, sizeof(struct kmem_cache_node));
1158	/*
1159	* Do not assume that spinlocks can be initialized via memcpy:
1160	*/
1161	raw_spin_lock_init(&ptr->list_lock);
1162
1163	MAKE_ALL_LISTS(cachep, ptr, nodeid);
1164	cachep->node[nodeid] = ptr;
1165	}
1166
1167	/*
1168	* For setting up all the kmem_cache_node for cache whose buffer_size is same as
1169	* size of kmem_cache_node.
1170	*/
1171	static void __init set_up_node(struct kmem_cache cachep, int* index)
1172	{
1173	int node;
1174
1175	for_each_online_node(node) {
1176	cachep->node[node] = &init_kmem_cache_node[index + node];
1177	cachep->node[node]->next_reap = jiffies +
1178	REAPTIMEOUT_NODE +
1179	((unsigned long)cachep) % REAPTIMEOUT_NODE;
1180	}
1181	}
1182
1183	/*
1184	* Initialisation. Called after the page allocator have been initialised and
1185	* before smp_init().
1186	*/
1187	void __init kmem_cache_init(void)
1188	{
1189	int i;
1190
1191	kmem_cache = &kmem_cache_boot;
1192
1193	if (!IS_ENABLED(CONFIG_NUMA) \|\| num_possible_nodes() == `1`)
1194	use_alien_caches = `0`;
1195
1196	for (i = `0`; i < NUM_INIT_LISTS; i++)
1197	kmem_cache_node_init(parent: &init_kmem_cache_node[i]);
1198
1199	/*
1200	* Fragmentation resistance on low memory - only use bigger
1201	* page orders on machines with more than 32MB of memory if
1202	* not overridden on the command line.
1203	*/
1204	if (!slab_max_order_set && totalram_pages() > (`32` << `20`) >> PAGE_SHIFT)
1205	slab_max_order = SLAB_MAX_ORDER_HI;
1206
1207	/ Bootstrap is tricky, because several objects are allocated*
1208	* from caches that do not exist yet:
1209	* 1) initialize the kmem_cache cache: it contains the struct
1210	* kmem_cache structures of all caches, except kmem_cache itself:
1211	* kmem_cache is statically allocated.
1212	* Initially an __init data area is used for the head array and the
1213	* kmem_cache_node structures, it's replaced with a kmalloc allocated
1214	* array at the end of the bootstrap.
1215	* 2) Create the first kmalloc cache.
1216	* The struct kmem_cache for the new cache is allocated normally.
1217	* An __init data area is used for the head array.
1218	* 3) Create the remaining kmalloc caches, with minimally sized
1219	* head arrays.
1220	* 4) Replace the __init data head arrays for kmem_cache and the first
1221	* kmalloc cache with kmalloc allocated arrays.
1222	* 5) Replace the __init data for kmem_cache_node for kmem_cache and
1223	* the other cache's with kmalloc allocated memory.
1224	* 6) Resize the head arrays of the kmalloc caches to their final sizes.
1225	*/
1226
1227	/ 1) create the kmem_cache /
1228
1229	/*
1230	* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1231	*/
1232	create_boot_cache(kmem_cache, name: "kmem_cache",
1233	offsetof(struct kmem_cache, node) +
1234	nr_node_ids * sizeof(struct kmem_cache_node *),
1235	SLAB_HWCACHE_ALIGN, useroffset: `0`, usersize: `0`);
1236	list_add(new: &kmem_cache->list, head: &slab_caches);
1237	slab_state = PARTIAL;
1238
1239	/*
1240	* Initialize the caches that provide memory for the kmem_cache_node
1241	* structures first. Without this, further allocations will bug.
1242	*/
1243	new_kmalloc_cache(INDEX_NODE, type: KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS);
1244	slab_state = PARTIAL_NODE;
1245	setup_kmalloc_cache_index_table();
1246
1247	/ 5) Replace the bootstrap kmem_cache_node /
1248	{
1249	int nid;
1250
1251	for_each_online_node(nid) {
1252	init_list(cachep: kmem_cache, list: &init_kmem_cache_node[CACHE_CACHE + nid], nodeid: nid);
1253
1254	init_list(cachep: kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
1255	list: &init_kmem_cache_node[SIZE_NODE + nid], nodeid: nid);
1256	}
1257	}
1258
1259	create_kmalloc_caches(ARCH_KMALLOC_FLAGS);
1260	}
1261
1262	void __init kmem_cache_init_late(void)
1263	{
1264	struct kmem_cache *cachep;
1265
1266	/ 6) resize the head arrays to their final sizes /
1267	mutex_lock(&slab_mutex);
1268	list_for_each_entry(cachep, &slab_caches, list)
1269	if (enable_cpucache(cachep, GFP_NOWAIT))
1270	BUG();
1271	mutex_unlock(lock: &slab_mutex);
1272
1273	/ Done! /
1274	slab_state = FULL;
1275
1276	#ifdef CONFIG_NUMA
1277	/*
1278	* Register a memory hotplug callback that initializes and frees
1279	* node.
1280	*/
1281	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
1282	#endif
1283
1284	/*
1285	* The reap timers are started later, with a module init call: That part
1286	* of the kernel is not yet operational.
1287	*/
1288	}
1289
1290	static int __init cpucache_init(void)
1291	{
1292	int ret;
1293
1294	/*
1295	* Register the timers that return unneeded pages to the page allocator
1296	*/
1297	ret = cpuhp_setup_state(state: CPUHP_AP_ONLINE_DYN, name: "SLAB online",
1298	startup: slab_online_cpu, teardown: slab_offline_cpu);
1299	WARN_ON(ret < `0`);
1300
1301	return `0`;
1302	}
1303	__initcall(cpucache_init);
1304
1305	static noinline void
1306	slab_out_of_memory(struct kmem_cache cachep, gfp_t gfpflags, int* nodeid)
1307	{
1308	#if DEBUG
1309	struct kmem_cache_node *n;
1310	unsigned long flags;
1311	int node;
1312	static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
1313	DEFAULT_RATELIMIT_BURST);
1314
1315	if ((gfpflags & __GFP_NOWARN) \|\| !__ratelimit(&slab_oom_rs))
1316	return;
1317
1318	pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
1319	nodeid, gfpflags, &gfpflags);
1320	pr_warn(" cache: %s, object size: %d, order: %d\n",
1321	cachep->name, cachep->size, cachep->gfporder);
1322
1323	for_each_kmem_cache_node(cachep, node, n) {
1324	unsigned long total_slabs, free_slabs, free_objs;
1325
1326	raw_spin_lock_irqsave(&n->list_lock, flags);
1327	total_slabs = n->total_slabs;
1328	free_slabs = n->free_slabs;
1329	free_objs = n->free_objects;
1330	raw_spin_unlock_irqrestore(&n->list_lock, flags);
1331
1332	pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
1333	node, total_slabs - free_slabs, total_slabs,
1334	(total_slabs * cachep->num) - free_objs,
1335	total_slabs * cachep->num);
1336	}
1337	#endif
1338	}
1339
1340	/*
1341	* Interface to system's page allocator. No need to hold the
1342	* kmem_cache_node ->list_lock.
1343	*
1344	* If we requested dmaable memory, we will get it. Even if we
1345	* did not request dmaable memory, we might get it, but that
1346	* would be relatively rare and ignorable.
1347	*/
1348	static struct slab kmem_getpages(struct* kmem_cache *cachep, gfp_t flags,
1349	int nodeid)
1350	{
1351	struct folio *folio;
1352	struct slab *slab;
1353
1354	flags \|= cachep->allocflags;
1355
1356	folio = (struct folio *) __alloc_pages_node(nid: nodeid, gfp_mask: flags, order: cachep->gfporder);
1357	if (!folio) {
1358	slab_out_of_memory(cachep, gfpflags: flags, nodeid);
1359	return NULL;
1360	}
1361
1362	slab = folio_slab(folio);
1363
1364	account_slab(slab, order: cachep->gfporder, s: cachep, gfp: flags);
1365	__folio_set_slab(folio);
1366	/ Make the flag visible before any changes to folio->mapping /
1367	smp_wmb();
1368	/ Record if ALLOC_NO_WATERMARKS was set when allocating the slab /
1369	if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
1370	slab_set_pfmemalloc(slab);
1371
1372	return slab;
1373	}
1374
1375	/*
1376	* Interface to system's page release.
1377	*/
1378	static void kmem_freepages(struct kmem_cache cachep, struct* slab *slab)
1379	{
1380	int order = cachep->gfporder;
1381	struct folio *folio = slab_folio(slab);
1382
1383	BUG_ON(!folio_test_slab(folio));
1384	__slab_clear_pfmemalloc(slab);
1385	page_mapcount_reset(page: &folio->page);
1386	folio->mapping = NULL;
1387	/ Make the mapping reset visible before clearing the flag /
1388	smp_wmb();
1389	__folio_clear_slab(folio);
1390
1391	mm_account_reclaimed_pages(pages: `1` << order);
1392	unaccount_slab(slab, order, s: cachep);
1393	__free_pages(page: &folio->page, order);
1394	}
1395
1396	static void kmem_rcu_free(struct rcu_head *head)
1397	{
1398	struct kmem_cache *cachep;
1399	struct slab *slab;
1400
1401	slab = container_of(head, struct slab, rcu_head);
1402	cachep = slab->slab_cache;
1403
1404	kmem_freepages(cachep, slab);
1405	}
1406
1407	#if DEBUG
1408	static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
1409	{
1410	return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
1411	((cachep->size % PAGE_SIZE) == `0`);
1412	}
1413
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * Pass the pages backing @objp to __kernel_map_pages() with @map, but only
 * for caches that is_debug_pagealloc_cache() accepts; a no-op otherwise.
 */
static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
{
	if (!is_debug_pagealloc_cache(cachep))
		return;

	__kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
}

#else
/* Without CONFIG_DEBUG_PAGEALLOC there is nothing to do. */
static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
				int map) {}

#endif
1428
1429	static void poison_obj(struct kmem_cache cachep, void* addr, unsigned* char val)
1430	{
1431	int size = cachep->object_size;
1432	addr = &((char *)addr)[obj_offset(cachep)];
1433
1434	memset(addr, val, size);
1435	(unsigned* char *)(addr + size - `1`) = POISON_END;
1436	}
1437
1438	static void dump_line(char data, int* offset, int limit)
1439	{
1440	int i;
1441	unsigned char error = `0`;
1442	int bad_count = `0`;
1443
1444	pr_err("%03x: ", offset);
1445	for (i = `0`; i < limit; i++) {
1446	if (data[offset + i] != POISON_FREE) {
1447	error = data[offset + i];
1448	bad_count++;
1449	}
1450	}
1451	print_hex_dump(KERN_CONT, "", `0`, `16`, `1`,
1452	&data[offset], limit, `1`);
1453
1454	if (bad_count == `1`) {
1455	error ^= POISON_FREE;
1456	if (!(error & (error - `1`))) {
1457	pr_err("Single bit error detected. Probably bad RAM.\n");
1458	#ifdef CONFIG_X86
1459	pr_err("Run memtest86+ or a similar memory test tool.\n");
1460	#else
1461	pr_err("Run a memory test tool.\n");
1462	#endif
1463	}
1464	}
1465	}
1466	#endif
1467
1468	#if DEBUG
1469
1470	static void print_objinfo(struct kmem_cache cachep, void* objp, int* lines)
1471	{
1472	int i, size;
1473	char *realobj;
1474
1475	if (cachep->flags & SLAB_RED_ZONE) {
1476	pr_err("Redzone: 0x%llx/0x%llx\n",
1477	*dbg_redzone1(cachep, objp),
1478	*dbg_redzone2(cachep, objp));
1479	}
1480
1481	if (cachep->flags & SLAB_STORE_USER)
1482	pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
1483	realobj = (char *)objp + obj_offset(cachep);
1484	size = cachep->object_size;
1485	for (i = `0`; i < size && lines; i += `16`, lines--) {
1486	int limit;
1487	limit = `16`;
1488	if (i + limit > size)
1489	limit = size - i;
1490	dump_line(realobj, i, limit);
1491	}
1492	}
1493
1494	static void check_poison_obj(struct kmem_cache cachep, void* *objp)
1495	{
1496	char *realobj;
1497	int size, i;
1498	int lines = `0`;
1499
1500	if (is_debug_pagealloc_cache(cachep))
1501	return;
1502
1503	realobj = (char *)objp + obj_offset(cachep);
1504	size = cachep->object_size;
1505
1506	for (i = `0`; i < size; i++) {
1507	char exp = POISON_FREE;
1508	if (i == size - `1`)
1509	exp = POISON_END;
1510	if (realobj[i] != exp) {
1511	int limit;
1512	/ Mismatch ! /
1513	/ Print header /
1514	if (lines == `0`) {
1515	pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
1516	print_tainted(), cachep->name,
1517	realobj, size);
1518	print_objinfo(cachep, objp, `0`);
1519	}
1520	/ Hexdump the affected line /
1521	i = (i / `16`) * `16`;
1522	limit = `16`;
1523	if (i + limit > size)
1524	limit = size - i;
1525	dump_line(realobj, i, limit);
1526	i += `16`;
1527	lines++;
1528	/ Limit to 5 lines /
1529	if (lines > `5`)
1530	break;
1531	}
1532	}
1533	if (lines != `0`) {
1534	/ Print some data about the neighboring objects, if they*
1535	* exist:
1536	*/
1537	struct slab *slab = virt_to_slab(objp);
1538	unsigned int objnr;
1539
1540	objnr = obj_to_index(cachep, slab, objp);
1541	if (objnr) {
1542	objp = index_to_obj(cachep, slab, objnr - `1`);
1543	realobj = (char *)objp + obj_offset(cachep);
1544	pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
1545	print_objinfo(cachep, objp, `2`);
1546	}
1547	if (objnr + `1` < cachep->num) {
1548	objp = index_to_obj(cachep, slab, objnr + `1`);
1549	realobj = (char *)objp + obj_offset(cachep);
1550	pr_err("Next obj: start=%px, len=%d\n", realobj, size);
1551	print_objinfo(cachep, objp, `2`);
1552	}
1553	}
1554	}
1555	#endif
1556
#if DEBUG
/*
 * Debug checks run on every object of @slab before the slab is destroyed:
 * poison verification (SLAB_POISON) and red zone verification
 * (SLAB_RED_ZONE).  For OBJFREELIST_SLAB caches with SLAB_POISON, the
 * object located at slab->freelist is re-poisoned first.
 */
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
						struct slab *slab)
{
	int i;

	if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
		poison_obj(cachep, slab->freelist - obj_offset(cachep),
			POISON_FREE);
	}

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slab, i);

		if (cachep->flags & SLAB_POISON) {
			check_poison_obj(cachep, objp);
			slab_kernel_map(cachep, objp, 1);
		}
		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "start of a freed object was overwritten");
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "end of a freed object was overwritten");
		}
	}
}
#else
/* Non-DEBUG builds perform no checks. */
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
						struct slab *slab)
{
}
#endif
1589
1590	/**
1591	* slab_destroy - destroy and release all objects in a slab
1592	* @cachep: cache pointer being destroyed
1593	* @slab: slab being destroyed
1594	*
1595	* Destroy all the objs in a slab, and release the mem back to the system.
1596	* Before calling the slab must have been unlinked from the cache. The
1597	* kmem_cache_node ->list_lock is not held/needed.
1598	*/
1599	static void slab_destroy(struct kmem_cache cachep, struct* slab *slab)
1600	{
1601	void *freelist;
1602
1603	freelist = slab->freelist;
1604	slab_destroy_debugcheck(cachep, slab);
1605	if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
1606	call_rcu(head: &slab->rcu_head, func: kmem_rcu_free);
1607	else
1608	kmem_freepages(cachep, slab);
1609
1610	/*
1611	* From now on, we don't use freelist
1612	* although actual page can be freed in rcu context
1613	*/
1614	if (OFF_SLAB(cachep))
1615	kfree(objp: freelist);
1616	}
1617
1618	/*
1619	* Update the size of the caches before calling slabs_destroy as it may
1620	* recursively call kfree.
1621	*/
1622	static void slabs_destroy(struct kmem_cache cachep, struct* list_head *list)
1623	{
1624	struct slab slab, n;
1625
1626	list_for_each_entry_safe(slab, n, list, slab_list) {
1627	list_del(entry: &slab->slab_list);
1628	slab_destroy(cachep, slab);
1629	}
1630	}
1631
1632	/**
1633	* calculate_slab_order - calculate size (page order) of slabs
1634	* @cachep: pointer to the cache that is being created
1635	* @size: size of objects to be created in this cache.
1636	* @flags: slab allocation flags
1637	*
1638	* Also calculates the number of objects per slab.
1639	*
1640	* This could be made much more intelligent. For now, try to avoid using
1641	* high order pages for slabs. When the gfp() functions are more friendly
1642	* towards high-order requests, this should be changed.
1643	*
1644	* Return: number of left-over bytes in a slab
1645	*/
1646	static size_t calculate_slab_order(struct kmem_cache *cachep,
1647	size_t size, slab_flags_t flags)
1648	{
1649	size_t left_over = `0`;
1650	int gfporder;
1651
1652	for (gfporder = `0`; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
1653	unsigned int num;
1654	size_t remainder;
1655
1656	num = cache_estimate(gfporder, buffer_size: size, flags, left_over: &remainder);
1657	if (!num)
1658	continue;
1659
1660	/ Can't handle number of objects more than SLAB_OBJ_MAX_NUM /
1661	if (num > SLAB_OBJ_MAX_NUM)
1662	break;
1663
1664	if (flags & CFLGS_OFF_SLAB) {
1665	struct kmem_cache *freelist_cache;
1666	size_t freelist_size;
1667	size_t freelist_cache_size;
1668
1669	freelist_size = num * sizeof(freelist_idx_t);
1670	if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
1671	freelist_cache_size = PAGE_SIZE << get_order(size: freelist_size);
1672	} else {
1673	freelist_cache = kmalloc_slab(size: freelist_size, flags: `0u`, _RET_IP_);
1674	if (!freelist_cache)
1675	continue;
1676	freelist_cache_size = freelist_cache->size;
1677
1678	/*
1679	* Needed to avoid possible looping condition
1680	* in cache_grow_begin()
1681	*/
1682	if (OFF_SLAB(freelist_cache))
1683	continue;
1684	}
1685
1686	/ check if off slab has enough benefit /
1687	if (freelist_cache_size > cachep->size / `2`)
1688	continue;
1689	}
1690
1691	/ Found something acceptable - save it away /
1692	cachep->num = num;
1693	cachep->gfporder = gfporder;
1694	left_over = remainder;
1695
1696	/*
1697	* A VFS-reclaimable slab tends to have most allocations
1698	* as GFP_NOFS and we really don't want to have to be allocating
1699	* higher-order pages when we are unable to shrink dcache.
1700	*/
1701	if (flags & SLAB_RECLAIM_ACCOUNT)
1702	break;
1703
1704	/*
1705	* Large number of objects is good, but very large slabs are
1706	* currently bad for the gfp()s.
1707	*/
1708	if (gfporder >= slab_max_order)
1709	break;
1710
1711	/*
1712	* Acceptable internal fragmentation?
1713	*/
1714	if (left_over * `8` <= (PAGE_SIZE << gfporder))
1715	break;
1716	}
1717	return left_over;
1718	}
1719
1720	static struct array_cache __percpu *alloc_kmem_cache_cpus(
1721	struct kmem_cache cachep, int* entries, int batchcount)
1722	{
1723	int cpu;
1724	size_t size;
1725	struct array_cache __percpu *cpu_cache;
1726
1727	size = sizeof(void ) entries + sizeof(struct array_cache);
1728	cpu_cache = __alloc_percpu(size, align: sizeof(void *));
1729
1730	if (!cpu_cache)
1731	return NULL;
1732
1733	for_each_possible_cpu(cpu) {
1734	init_arraycache(per_cpu_ptr(cpu_cache, cpu),
1735	limit: entries, batch: batchcount);
1736	}
1737
1738	return cpu_cache;
1739	}
1740
1741	static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
1742	{
1743	if (slab_state >= FULL)
1744	return enable_cpucache(cachep, gfp);
1745
1746	cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, entries: `1`, batchcount: `1`);
1747	if (!cachep->cpu_cache)
1748	return `1`;
1749
1750	if (slab_state == DOWN) {
1751	/ Creation of first cache (kmem_cache). /
1752	set_up_node(cachep: kmem_cache, CACHE_CACHE);
1753	} else if (slab_state == PARTIAL) {
1754	/ For kmem_cache_node /
1755	set_up_node(cachep, SIZE_NODE);
1756	} else {
1757	int node;
1758
1759	for_each_online_node(node) {
1760	cachep->node[node] = kmalloc_node(
1761	size: sizeof(struct kmem_cache_node), flags: gfp, node);
1762	BUG_ON(!cachep->node[node]);
1763	kmem_cache_node_init(parent: cachep->node[node]);
1764	}
1765	}
1766
1767	cachep->node[numa_mem_id()]->next_reap =
1768	jiffies + REAPTIMEOUT_NODE +
1769	((unsigned long)cachep) % REAPTIMEOUT_NODE;
1770
1771	cpu_cache_get(cachep)->avail = `0`;
1772	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1773	cpu_cache_get(cachep)->batchcount = `1`;
1774	cpu_cache_get(cachep)->touched = `0`;
1775	cachep->batchcount = `1`;
1776	cachep->limit = BOOT_CPUCACHE_ENTRIES;
1777	return `0`;
1778	}
1779
1780	slab_flags_t kmem_cache_flags(unsigned int object_size,
1781	slab_flags_t flags, const char *name)
1782	{
1783	return flags;
1784	}
1785
1786	struct kmem_cache *
1787	__kmem_cache_alias(const char name, unsigned* int size, unsigned int align,
1788	slab_flags_t flags, void (ctor)(void* *))
1789	{
1790	struct kmem_cache *cachep;
1791
1792	cachep = find_mergeable(size, align, flags, name, ctor);
1793	if (cachep) {
1794	cachep->refcount++;
1795
1796	/*
1797	* Adjust the object sizes so that we clear
1798	* the complete object on kzalloc.
1799	*/
1800	cachep->object_size = max_t(int, cachep->object_size, size);
1801	}
1802	return cachep;
1803	}
1804
1805	static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
1806	size_t size, slab_flags_t flags)
1807	{
1808	size_t left;
1809
1810	cachep->num = `0`;
1811
1812	/*
1813	* If slab auto-initialization on free is enabled, store the freelist
1814	* off-slab, so that its contents don't end up in one of the allocated
1815	* objects.
1816	*/
1817	if (unlikely(slab_want_init_on_free(cachep)))
1818	return false;
1819
1820	if (cachep->ctor \|\| flags & SLAB_TYPESAFE_BY_RCU)
1821	return false;
1822
1823	left = calculate_slab_order(cachep, size,
1824	flags: flags \| CFLGS_OBJFREELIST_SLAB);
1825	if (!cachep->num)
1826	return false;
1827
1828	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
1829	return false;
1830
1831	cachep->colour = left / cachep->colour_off;
1832
1833	return true;
1834	}
1835
1836	static bool set_off_slab_cache(struct kmem_cache *cachep,
1837	size_t size, slab_flags_t flags)
1838	{
1839	size_t left;
1840
1841	cachep->num = `0`;
1842
1843	/*
1844	* Always use on-slab management when SLAB_NOLEAKTRACE
1845	* to avoid recursive calls into kmemleak.
1846	*/
1847	if (flags & SLAB_NOLEAKTRACE)
1848	return false;
1849
1850	/*
1851	* Size is large, assume best to place the slab management obj
1852	* off-slab (should allow better packing of objs).
1853	*/
1854	left = calculate_slab_order(cachep, size, flags: flags \| CFLGS_OFF_SLAB);
1855	if (!cachep->num)
1856	return false;
1857
1858	/*
1859	* If the slab has been placed off-slab, and we have enough space then
1860	* move it on-slab. This is at the expense of any extra colouring.
1861	*/
1862	if (left >= cachep->num * sizeof(freelist_idx_t))
1863	return false;
1864
1865	cachep->colour = left / cachep->colour_off;
1866
1867	return true;
1868	}
1869
1870	static bool set_on_slab_cache(struct kmem_cache *cachep,
1871	size_t size, slab_flags_t flags)
1872	{
1873	size_t left;
1874
1875	cachep->num = `0`;
1876
1877	left = calculate_slab_order(cachep, size, flags);
1878	if (!cachep->num)
1879	return false;
1880
1881	cachep->colour = left / cachep->colour_off;
1882
1883	return true;
1884	}
1885
1886	/*
1887	* __kmem_cache_create - Create a cache.
1888	* @cachep: cache management descriptor
1889	* @flags: SLAB flags
1890	*
1891	* Returns zero on success, nonzero on failure.
1892	*
1893	* The flags are
1894	*
1895	* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1896	* to catch references to uninitialised memory.
1897	*
1898	* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1899	* for buffer overruns.
1900	*
1901	* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1902	* cacheline. This can be beneficial if you're counting cycles as closely
1903	* as davem.
1904	*/
1905	int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
1906	{
1907	size_t ralign = BYTES_PER_WORD;
1908	gfp_t gfp;
1909	int err;
1910	unsigned int size = cachep->size;
1911
1912	#if DEBUG
1913	#if FORCED_DEBUG
1914	/*
1915	* Enable redzoning and last user accounting, except for caches with
1916	* large objects, if the increased size would increase the object size
1917	* above the next power of two: caches with object sizes just above a
1918	* power of two have a significant amount of internal fragmentation.
1919	*/
1920	if (size < `4096` \|\| fls(size - `1`) == fls(size-`1` + REDZONE_ALIGN +
1921	`2` * sizeof(unsigned long long)))
1922	flags \|= SLAB_RED_ZONE \| SLAB_STORE_USER;
1923	if (!(flags & SLAB_TYPESAFE_BY_RCU))
1924	flags \|= SLAB_POISON;
1925	#endif
1926	#endif
1927
1928	/*
1929	* Check that size is in terms of words. This is needed to avoid
1930	* unaligned accesses for some archs when redzoning is used, and makes
1931	* sure any on-slab bufctl's are also correctly aligned.
1932	*/
1933	size = ALIGN(size, BYTES_PER_WORD);
1934
1935	if (flags & SLAB_RED_ZONE) {
1936	ralign = REDZONE_ALIGN;
1937	/ If redzoning, ensure that the second redzone is suitably*
1938	* aligned, by adjusting the object size accordingly. */
1939	size = ALIGN(size, REDZONE_ALIGN);
1940	}
1941
1942	/ 3) caller mandated alignment /
1943	if (ralign < cachep->align) {
1944	ralign = cachep->align;
1945	}
1946	/ disable debug if necessary /
1947	if (ralign > __alignof__(unsigned long long))
1948	flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
1949	/*
1950	* 4) Store it.
1951	*/
1952	cachep->align = ralign;
1953	cachep->colour_off = cache_line_size();
1954	/ Offset must be a multiple of the alignment. /
1955	if (cachep->colour_off < cachep->align)
1956	cachep->colour_off = cachep->align;
1957
1958	if (slab_is_available())
1959	gfp = GFP_KERNEL;
1960	else
1961	gfp = GFP_NOWAIT;
1962
1963	#if DEBUG
1964
1965	/*
1966	* Both debugging options require word-alignment which is calculated
1967	* into align above.
1968	*/
1969	if (flags & SLAB_RED_ZONE) {
1970	/ add space for red zone words /
1971	cachep->obj_offset += sizeof(unsigned long long);
1972	size += `2` * sizeof(unsigned long long);
1973	}
1974	if (flags & SLAB_STORE_USER) {
1975	/ user store requires one word storage behind the end of*
1976	* the real object. But if the second red zone needs to be
1977	* aligned to 64 bits, we must allow that much space.
1978	*/
1979	if (flags & SLAB_RED_ZONE)
1980	size += REDZONE_ALIGN;
1981	else
1982	size += BYTES_PER_WORD;
1983	}
1984	#endif
1985
1986	kasan_cache_create(cache: cachep, size: &size, flags: &flags);
1987
1988	size = ALIGN(size, cachep->align);
1989	/*
1990	* We should restrict the number of objects in a slab to implement
1991	* byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
1992	*/
1993	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
1994	size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
1995
1996	#if DEBUG
1997	/*
1998	* To activate debug pagealloc, off-slab management is necessary
1999	* requirement. In early phase of initialization, small sized slab
2000	* doesn't get initialized so it would not be possible. So, we need
2001	* to check size >= 256. It guarantees that all necessary small
2002	* sized slab is initialized in current slab initialization sequence.
2003	*/
2004	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
2005	size >= `256` && cachep->object_size > cache_line_size()) {
2006	if (size < PAGE_SIZE \|\| size % PAGE_SIZE == `0`) {
2007	size_t tmp_size = ALIGN(size, PAGE_SIZE);
2008
2009	if (set_off_slab_cache(cachep, tmp_size, flags)) {
2010	flags \|= CFLGS_OFF_SLAB;
2011	cachep->obj_offset += tmp_size - size;
2012	size = tmp_size;
2013	goto done;
2014	}
2015	}
2016	}
2017	#endif
2018
2019	if (set_objfreelist_slab_cache(cachep, size, flags)) {
2020	flags \|= CFLGS_OBJFREELIST_SLAB;
2021	goto done;
2022	}
2023
2024	if (set_off_slab_cache(cachep, size, flags)) {
2025	flags \|= CFLGS_OFF_SLAB;
2026	goto done;
2027	}
2028
2029	if (set_on_slab_cache(cachep, size, flags))
2030	goto done;
2031
2032	return -E2BIG;
2033
2034	done:
2035	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
2036	cachep->flags = flags;
2037	cachep->allocflags = __GFP_COMP;
2038	if (flags & SLAB_CACHE_DMA)
2039	cachep->allocflags \|= GFP_DMA;
2040	if (flags & SLAB_CACHE_DMA32)
2041	cachep->allocflags \|= GFP_DMA32;
2042	if (flags & SLAB_RECLAIM_ACCOUNT)
2043	cachep->allocflags \|= __GFP_RECLAIMABLE;
2044	cachep->size = size;
2045	cachep->reciprocal_buffer_size = reciprocal_value(d: size);
2046
2047	#if DEBUG
2048	/*
2049	* If we're going to use the generic kernel_map_pages()
2050	* poisoning, then it's going to smash the contents of
2051	* the redzone and userword anyhow, so switch them off.
2052	*/
2053	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
2054	(cachep->flags & SLAB_POISON) &&
2055	is_debug_pagealloc_cache(cachep))
2056	cachep->flags &= ~(SLAB_RED_ZONE \| SLAB_STORE_USER);
2057	#endif
2058
2059	err = setup_cpu_cache(cachep, gfp);
2060	if (err) {
2061	__kmem_cache_release(cachep);
2062	return err;
2063	}
2064
2065	return `0`;
2066	}
2067
#if DEBUG
/* Assert that interrupts are disabled. */
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

/* Assert that interrupts are enabled. */
static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

/* Assert that slab_mutex is locked. */
static void check_mutex_acquired(void)
{
	BUG_ON(!mutex_is_locked(&slab_mutex));
}

/*
 * On SMP, assert that interrupts are off and that the list_lock of the
 * local memory node of @cachep is held.
 */
static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}

/*
 * On SMP, assert that interrupts are off and that the list_lock of node
 * @node of @cachep is held.
 */
static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}

#else
/* Non-DEBUG builds compile all checks away. */
#define check_irq_off()	do { } while(0)
#define check_irq_on()	do { } while(0)
#define check_mutex_acquired()	do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif
2107
2108	static void drain_array_locked(struct kmem_cache cachep, struct* array_cache *ac,
2109	int node, bool free_all, struct list_head *list)
2110	{
2111	int tofree;
2112
2113	if (!ac \|\| !ac->avail)
2114	return;
2115
2116	tofree = free_all ? ac->avail : (ac->limit + `4`) / `5`;
2117	if (tofree > ac->avail)
2118	tofree = (ac->avail + `1`) / `2`;
2119
2120	free_block(cachep, objpp: ac->entry, len: tofree, node, list);
2121	ac->avail -= tofree;
2122	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void ) ac->avail);
2123	}
2124
2125	static void do_drain(void *arg)
2126	{
2127	struct kmem_cache *cachep = arg;
2128	struct array_cache *ac;
2129	int node = numa_mem_id();
2130	struct kmem_cache_node *n;
2131	LIST_HEAD(list);
2132
2133	check_irq_off();
2134	ac = cpu_cache_get(cachep);
2135	n = get_node(s: cachep, node);
2136	raw_spin_lock(&n->list_lock);
2137	free_block(cachep, objpp: ac->entry, len: ac->avail, node, list: &list);
2138	raw_spin_unlock(&n->list_lock);
2139	ac->avail = `0`;
2140	slabs_destroy(cachep, list: &list);
2141	}
2142
2143	static void drain_cpu_caches(struct kmem_cache *cachep)
2144	{
2145	struct kmem_cache_node *n;
2146	int node;
2147	LIST_HEAD(list);
2148
2149	on_each_cpu(func: do_drain, info: cachep, wait: `1`);
2150	check_irq_on();
2151	for_each_kmem_cache_node(cachep, node, n)
2152	if (n->alien)
2153	drain_alien_cache(cachep, alien: n->alien);
2154
2155	for_each_kmem_cache_node(cachep, node, n) {
2156	raw_spin_lock_irq(&n->list_lock);
2157	drain_array_locked(cachep, ac: n->shared, node, free_all: true, list: &list);
2158	raw_spin_unlock_irq(&n->list_lock);
2159
2160	slabs_destroy(cachep, list: &list);
2161	}
2162	}
2163
2164	/*
2165	* Remove slabs from the list of free slabs.
2166	* Specify the number of slabs to drain in tofree.
2167	*
2168	* Returns the actual number of slabs released.
2169	*/
2170	static int drain_freelist(struct kmem_cache *cache,
2171	struct kmem_cache_node n, int* tofree)
2172	{
2173	struct list_head *p;
2174	int nr_freed;
2175	struct slab *slab;
2176
2177	nr_freed = `0`;
2178	while (nr_freed < tofree && !list_empty(head: &n->slabs_free)) {
2179
2180	raw_spin_lock_irq(&n->list_lock);
2181	p = n->slabs_free.prev;
2182	if (p == &n->slabs_free) {
2183	raw_spin_unlock_irq(&n->list_lock);
2184	goto out;
2185	}
2186
2187	slab = list_entry(p, struct slab, slab_list);
2188	list_del(entry: &slab->slab_list);
2189	n->free_slabs--;
2190	n->total_slabs--;
2191	/*
2192	* Safe to drop the lock. The slab is no longer linked
2193	* to the cache.
2194	*/
2195	n->free_objects -= cache->num;
2196	raw_spin_unlock_irq(&n->list_lock);
2197	slab_destroy(cachep: cache, slab);
2198	nr_freed++;
2199
2200	cond_resched();
2201	}
2202	out:
2203	return nr_freed;
2204	}
2205
2206	bool __kmem_cache_empty(struct kmem_cache *s)
2207	{
2208	int node;
2209	struct kmem_cache_node *n;
2210
2211	for_each_kmem_cache_node(s, node, n)
2212	if (!list_empty(head: &n->slabs_full) \|\|
2213	!list_empty(head: &n->slabs_partial))
2214	return false;
2215	return true;
2216	}
2217
2218	int __kmem_cache_shrink(struct kmem_cache *cachep)
2219	{
2220	int ret = `0`;
2221	int node;
2222	struct kmem_cache_node *n;
2223
2224	drain_cpu_caches(cachep);
2225
2226	check_irq_on();
2227	for_each_kmem_cache_node(cachep, node, n) {
2228	drain_freelist(cache: cachep, n, INT_MAX);
2229
2230	ret += !list_empty(head: &n->slabs_full) \|\|
2231	!list_empty(head: &n->slabs_partial);
2232	}
2233	return (ret ? `1` : `0`);
2234	}
2235
/*
 * Shut down @cachep.  This is a plain shrink; the result of
 * __kmem_cache_shrink() is passed through unchanged.
 */
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
	int still_in_use = __kmem_cache_shrink(cachep);

	return still_in_use;
}
2240
2241	void __kmem_cache_release(struct kmem_cache *cachep)
2242	{
2243	int i;
2244	struct kmem_cache_node *n;
2245
2246	cache_random_seq_destroy(cachep);
2247
2248	free_percpu(pdata: cachep->cpu_cache);
2249
2250	/ NUMA: free the node structures /
2251	for_each_kmem_cache_node(cachep, i, n) {
2252	kfree(objp: n->shared);
2253	free_alien_cache(alc_ptr: n->alien);
2254	kfree(objp: n);
2255	cachep->node[i] = NULL;
2256	}
2257	}
2258
2259	/*
2260	* Get the memory for a slab management obj.
2261	*
2262	* For a slab cache when the slab descriptor is off-slab, the
2263	* slab descriptor can't come from the same cache which is being created,
2264	* Because if it is the case, that means we defer the creation of
2265	* the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
2266	* And we eventually call down to __kmem_cache_create(), which
2267	* in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
2268	* This is a "chicken-and-egg" problem.
2269	*
2270	* So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
2271	* which are all initialized during kmem_cache_init().
2272	*/
2273	static void alloc_slabmgmt(struct* kmem_cache *cachep,
2274	struct slab slab, int* colour_off,
2275	gfp_t local_flags, int nodeid)
2276	{
2277	void *freelist;
2278	void *addr = slab_address(slab);
2279
2280	slab->s_mem = addr + colour_off;
2281	slab->active = `0`;
2282
2283	if (OBJFREELIST_SLAB(cachep))
2284	freelist = NULL;
2285	else if (OFF_SLAB(cachep)) {
2286	/ Slab management obj is off-slab. /
2287	freelist = kmalloc_node(size: cachep->freelist_size,
2288	flags: local_flags, node: nodeid);
2289	} else {
2290	/ We will use last bytes at the slab for freelist /
2291	freelist = addr + (PAGE_SIZE << cachep->gfporder) -
2292	cachep->freelist_size;
2293	}
2294
2295	return freelist;
2296	}
2297
2298	static inline freelist_idx_t get_free_obj(struct slab slab, unsigned* int idx)
2299	{
2300	return ((freelist_idx_t *) slab->freelist)[idx];
2301	}
2302
2303	static inline void set_free_obj(struct slab *slab,
2304	unsigned int idx, freelist_idx_t val)
2305	{
2306	((freelist_idx_t *)(slab->freelist))[idx] = val;
2307	}
2308
/*
 * DEBUG-only initialization of every object in a new slab: clear the user
 * word, arm both red zones, run the constructor (only when the cache is not
 * poisoned), verify the constructor left the red zones intact, and poison
 * the object if SLAB_POISON is set.  Compiles to an empty function when
 * DEBUG is off.
 */
static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
{
#if DEBUG
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slab, i);

		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}
		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for.  Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
			kasan_unpoison_object_data(cachep,
						   objp + obj_offset(cachep));
			cachep->ctor(objp + obj_offset(cachep));
			kasan_poison_object_data(
				cachep, objp + obj_offset(cachep));
		}

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the start of an object");
		}
		/* need to poison the objs? */
		if (cachep->flags & SLAB_POISON) {
			poison_obj(cachep, objp, POISON_FREE);
			slab_kernel_map(cachep, objp, 0);
		}
	}
#endif
}
2351
2352	#ifdef CONFIG_SLAB_FREELIST_RANDOM
2353	/ Hold information during a freelist initialization /
2354	struct freelist_init_state {
2355	unsigned int pos;
2356	unsigned int *list;
2357	unsigned int count;
2358	};
2359
2360	/*
2361	* Initialize the state based on the randomization method available.
2362	* return true if the pre-computed list is available, false otherwise.
2363	*/
2364	static bool freelist_state_initialize(struct freelist_init_state *state,
2365	struct kmem_cache *cachep,
2366	unsigned int count)
2367	{
2368	bool ret;
2369	if (!cachep->random_seq) {
2370	ret = false;
2371	} else {
2372	state->list = cachep->random_seq;
2373	state->count = count;
2374	state->pos = get_random_u32_below(count);
2375	ret = true;
2376	}
2377	return ret;
2378	}
2379
2380	/ Get the next entry on the list and randomize it using a random shift /
2381	static freelist_idx_t next_random_slot(struct freelist_init_state *state)
2382	{
2383	if (state->pos >= state->count)
2384	state->pos = `0`;
2385	return state->list[state->pos++];
2386	}
2387
2388	/ Swap two freelist entries /
2389	static void swap_free_obj(struct slab slab, unsigned* int a, unsigned int b)
2390	{
2391	swap(((freelist_idx_t *) slab->freelist)[a],
2392	((freelist_idx_t *) slab->freelist)[b]);
2393	}
2394
2395	/*
2396	* Shuffle the freelist initialization state based on pre-computed lists.
2397	* return true if the list was successfully shuffled, false otherwise.
2398	*/
2399	static bool shuffle_freelist(struct kmem_cache cachep, struct* slab *slab)
2400	{
2401	unsigned int objfreelist = `0`, i, rand, count = cachep->num;
2402	struct freelist_init_state state;
2403	bool precomputed;
2404
2405	if (count < `2`)
2406	return false;
2407
2408	precomputed = freelist_state_initialize(&state, cachep, count);
2409
2410	/ Take a random entry as the objfreelist /
2411	if (OBJFREELIST_SLAB(cachep)) {
2412	if (!precomputed)
2413	objfreelist = count - `1`;
2414	else
2415	objfreelist = next_random_slot(&state);
2416	slab->freelist = index_to_obj(cachep, slab, objfreelist) +
2417	obj_offset(cachep);
2418	count--;
2419	}
2420
2421	/*
2422	* On early boot, generate the list dynamically.
2423	* Later use a pre-computed list for speed.
2424	*/
2425	if (!precomputed) {
2426	for (i = `0`; i < count; i++)
2427	set_free_obj(slab, i, i);
2428
2429	/ Fisher-Yates shuffle /
2430	for (i = count - `1`; i > `0`; i--) {
2431	rand = get_random_u32_below(i + `1`);
2432	swap_free_obj(slab, i, rand);
2433	}
2434	} else {
2435	for (i = `0`; i < count; i++)
2436	set_free_obj(slab, i, next_random_slot(&state));
2437	}
2438
2439	if (OBJFREELIST_SLAB(cachep))
2440	set_free_obj(slab, cachep->num - `1`, objfreelist);
2441
2442	return true;
2443	}
2444	#else
2445	static inline bool shuffle_freelist(struct kmem_cache *cachep,
2446	struct slab *slab)
2447	{
2448	return false;
2449	}
2450	#endif /* CONFIG_SLAB_FREELIST_RANDOM */
2451
2452	static void cache_init_objs(struct kmem_cache *cachep,
2453	struct slab *slab)
2454	{
2455	int i;
2456	void *objp;
2457	bool shuffled;
2458
2459	cache_init_objs_debug(cachep, slab);
2460
2461	/ Try to randomize the freelist if enabled /
2462	shuffled = shuffle_freelist(cachep, slab);
2463
2464	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
2465	slab->freelist = index_to_obj(cache: cachep, slab, idx: cachep->num - `1`) +
2466	obj_offset(cachep);
2467	}
2468
2469	for (i = `0`; i < cachep->num; i++) {
2470	objp = index_to_obj(cache: cachep, slab, idx: i);
2471	objp = kasan_init_slab_obj(cache: cachep, object: objp);
2472
2473	/ constructor could break poison info /
2474	if (DEBUG == `0` && cachep->ctor) {
2475	kasan_unpoison_object_data(cache: cachep, object: objp);
2476	cachep->ctor(objp);
2477	kasan_poison_object_data(cache: cachep, object: objp);
2478	}
2479
2480	if (!shuffled)
2481	set_free_obj(slab, idx: i, val: i);
2482	}
2483	}
2484
2485	static void slab_get_obj(struct* kmem_cache cachep, struct* slab *slab)
2486	{
2487	void *objp;
2488
2489	objp = index_to_obj(cache: cachep, slab, idx: get_free_obj(slab, idx: slab->active));
2490	slab->active++;
2491
2492	return objp;
2493	}
2494
2495	static void slab_put_obj(struct kmem_cache *cachep,
2496	struct slab slab, void* *objp)
2497	{
2498	unsigned int objnr = obj_to_index(cache: cachep, slab, obj: objp);
2499	#if DEBUG
2500	unsigned int i;
2501
2502	/ Verify double free bug /
2503	for (i = slab->active; i < cachep->num; i++) {
2504	if (get_free_obj(slab, i) == objnr) {
2505	pr_err("slab: double free detected in cache '%s', objp %px\n",
2506	cachep->name, objp);
2507	BUG();
2508	}
2509	}
2510	#endif
2511	slab->active--;
2512	if (!slab->freelist)
2513	slab->freelist = objp + obj_offset(cachep);
2514
2515	set_free_obj(slab, idx: slab->active, val: objnr);
2516	}
2517
2518	/*
2519	* Grow (by 1) the number of slabs within a cache. This is called by
2520	* kmem_cache_alloc() when there are no active objs left in a cache.
2521	*/
2522	static struct slab cache_grow_begin(struct* kmem_cache *cachep,
2523	gfp_t flags, int nodeid)
2524	{
2525	void *freelist;
2526	size_t offset;
2527	gfp_t local_flags;
2528	int slab_node;
2529	struct kmem_cache_node *n;
2530	struct slab *slab;
2531
2532	/*
2533	* Be lazy and only check for valid flags here, keeping it out of the
2534	* critical path in kmem_cache_alloc().
2535	*/
2536	if (unlikely(flags & GFP_SLAB_BUG_MASK))
2537	flags = kmalloc_fix_flags(flags);
2538
2539	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
2540	local_flags = flags & (GFP_CONSTRAINT_MASK\|GFP_RECLAIM_MASK);
2541
2542	check_irq_off();
2543	if (gfpflags_allow_blocking(gfp_flags: local_flags))
2544	local_irq_enable();
2545
2546	/*
2547	* Get mem for the objs. Attempt to allocate a physical page from
2548	* 'nodeid'.
2549	*/
2550	slab = kmem_getpages(cachep, flags: local_flags, nodeid);
2551	if (!slab)
2552	goto failed;
2553
2554	slab_node = slab_nid(slab);
2555	n = get_node(s: cachep, node: slab_node);
2556
2557	/ Get colour for the slab, and cal the next value. /
2558	n->colour_next++;
2559	if (n->colour_next >= cachep->colour)
2560	n->colour_next = `0`;
2561
2562	offset = n->colour_next;
2563	if (offset >= cachep->colour)
2564	offset = `0`;
2565
2566	offset *= cachep->colour_off;
2567
2568	/*
2569	* Call kasan_poison_slab() before calling alloc_slabmgmt(), so
2570	* page_address() in the latter returns a non-tagged pointer,
2571	* as it should be for slab pages.
2572	*/
2573	kasan_poison_slab(slab);
2574
2575	/ Get slab management. /
2576	freelist = alloc_slabmgmt(cachep, slab, colour_off: offset,
2577	local_flags: local_flags & ~GFP_CONSTRAINT_MASK, nodeid: slab_node);
2578	if (OFF_SLAB(cachep) && !freelist)
2579	goto opps1;
2580
2581	slab->slab_cache = cachep;
2582	slab->freelist = freelist;
2583
2584	cache_init_objs(cachep, slab);
2585
2586	if (gfpflags_allow_blocking(gfp_flags: local_flags))
2587	local_irq_disable();
2588
2589	return slab;
2590
2591	opps1:
2592	kmem_freepages(cachep, slab);
2593	failed:
2594	if (gfpflags_allow_blocking(gfp_flags: local_flags))
2595	local_irq_disable();
2596	return NULL;
2597	}
2598
2599	static void cache_grow_end(struct kmem_cache cachep, struct* slab *slab)
2600	{
2601	struct kmem_cache_node *n;
2602	void *list = NULL;
2603
2604	check_irq_off();
2605
2606	if (!slab)
2607	return;
2608
2609	INIT_LIST_HEAD(list: &slab->slab_list);
2610	n = get_node(s: cachep, node: slab_nid(slab));
2611
2612	raw_spin_lock(&n->list_lock);
2613	n->total_slabs++;
2614	if (!slab->active) {
2615	list_add_tail(new: &slab->slab_list, head: &n->slabs_free);
2616	n->free_slabs++;
2617	} else
2618	fixup_slab_list(cachep, n, slab, list: &list);
2619
2620	STATS_INC_GROWN(cachep);
2621	n->free_objects += cachep->num - slab->active;
2622	raw_spin_unlock(&n->list_lock);
2623
2624	fixup_objfreelist_debug(cachep, list: &list);
2625	}
2626
#if DEBUG

/*
 * Perform extra freeing checks:
 * - detect bad pointers.
 * - POISON/RED_ZONE checking
 */
static void kfree_debugcheck(const void *objp)
{
	if (!virt_addr_valid(objp)) {
		pr_err("kfree_debugcheck: out of range ptr %lxh\n",
		       (unsigned long)objp);
		BUG();
	}
}

/*
 * Check both red zones of an object that is being freed.  Both must be
 * RED_ACTIVE; anything else is reported as either a double free (both
 * RED_INACTIVE) or an overwrite outside the object.
 */
static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
{
	unsigned long long redzone1, redzone2;

	redzone1 = *dbg_redzone1(cache, obj);
	redzone2 = *dbg_redzone2(cache, obj);

	/*
	 * Redzone is ok.
	 */
	if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
		return;

	if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
		slab_error(cache, "double free detected");
	else
		slab_error(cache, "memory outside object was overwritten");

	pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
	       obj, redzone1, redzone2);
}

/*
 * Debug checks run when @objp is freed to @cachep.  Verifies the object
 * belongs to the cache, validates and disarms the red zones, records
 * @caller in the user word, sanity-checks the object index and poisons the
 * object, each step subject to the corresponding cache flag.
 *
 * Returns the object pointer moved back by obj_offset(), i.e. the start of
 * the debug-padded object.
 */
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
				   unsigned long caller)
{
	unsigned int objnr;
	struct slab *slab;

	BUG_ON(virt_to_cache(objp) != cachep);

	objp -= obj_offset(cachep);
	kfree_debugcheck(objp);
	slab = virt_to_slab(objp);

	if (cachep->flags & SLAB_RED_ZONE) {
		verify_redzone_free(cachep, objp);
		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = (void *)caller;

	objnr = obj_to_index(cachep, slab, objp);

	BUG_ON(objnr >= cachep->num);
	BUG_ON(objp != index_to_obj(cachep, slab, objnr));

	if (cachep->flags & SLAB_POISON) {
		poison_obj(cachep, objp, POISON_FREE);
		slab_kernel_map(cachep, objp, 0);
	}
	return objp;
}

#else
/* Without DEBUG the checks vanish and the object pointer passes through. */
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x, objp, z) (objp)
#endif
2701
/*
 * DEBUG only: walk the singly linked chain of former freelist objects that
 * fixup_slab_list() queued on @list and poison each one.  The link to the
 * next element is stored in the first word of each element, so it is read
 * before the object is poisoned.  Compiles to nothing without DEBUG.
 */
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
						void **list)
{
#if DEBUG
	void *next = *list;
	void *objp;

	while (next) {
		objp = next - obj_offset(cachep);
		next = *(void **)next;
		poison_obj(cachep, objp, POISON_FREE);
	}
#endif
}
2716
2717	static inline void fixup_slab_list(struct kmem_cache *cachep,
2718	struct kmem_cache_node n, struct* slab *slab,
2719	void **list)
2720	{
2721	/ move slabp to correct slabp list: /
2722	list_del(entry: &slab->slab_list);
2723	if (slab->active == cachep->num) {
2724	list_add(new: &slab->slab_list, head: &n->slabs_full);
2725	if (OBJFREELIST_SLAB(cachep)) {
2726	#if DEBUG
2727	/ Poisoning will be done without holding the lock /
2728	if (cachep->flags & SLAB_POISON) {
2729	void **objp = slab->freelist;
2730
2731	objp = list;
2732	*list = objp;
2733	}
2734	#endif
2735	slab->freelist = NULL;
2736	}
2737	} else
2738	list_add(new: &slab->slab_list, head: &n->slabs_partial);
2739	}
2740
2741	/ Try to find non-pfmemalloc slab if needed /
2742	static noinline struct slab get_valid_first_slab(struct* kmem_cache_node *n,
2743	struct slab *slab, bool pfmemalloc)
2744	{
2745	if (!slab)
2746	return NULL;
2747
2748	if (pfmemalloc)
2749	return slab;
2750
2751	if (!slab_test_pfmemalloc(slab))
2752	return slab;
2753
2754	/ No need to keep pfmemalloc slab if we have enough free objects /
2755	if (n->free_objects > n->free_limit) {
2756	slab_clear_pfmemalloc(slab);
2757	return slab;
2758	}
2759
2760	/ Move pfmemalloc slab to the end of list to speed up next search /
2761	list_del(entry: &slab->slab_list);
2762	if (!slab->active) {
2763	list_add_tail(new: &slab->slab_list, head: &n->slabs_free);
2764	n->free_slabs++;
2765	} else
2766	list_add_tail(new: &slab->slab_list, head: &n->slabs_partial);
2767
2768	list_for_each_entry(slab, &n->slabs_partial, slab_list) {
2769	if (!slab_test_pfmemalloc(slab))
2770	return slab;
2771	}
2772
2773	n->free_touched = `1`;
2774	list_for_each_entry(slab, &n->slabs_free, slab_list) {
2775	if (!slab_test_pfmemalloc(slab)) {
2776	n->free_slabs--;
2777	return slab;
2778	}
2779	}
2780
2781	return NULL;
2782	}
2783
2784	static struct slab get_first_slab(struct* kmem_cache_node *n, bool pfmemalloc)
2785	{
2786	struct slab *slab;
2787
2788	assert_raw_spin_locked(&n->list_lock);
2789	slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
2790	slab_list);
2791	if (!slab) {
2792	n->free_touched = `1`;
2793	slab = list_first_entry_or_null(&n->slabs_free, struct slab,
2794	slab_list);
2795	if (slab)
2796	n->free_slabs--;
2797	}
2798
2799	if (sk_memalloc_socks())
2800	slab = get_valid_first_slab(n, slab, pfmemalloc);
2801
2802	return slab;
2803	}
2804
2805	static noinline void cache_alloc_pfmemalloc(struct* kmem_cache *cachep,
2806	struct kmem_cache_node *n, gfp_t flags)
2807	{
2808	struct slab *slab;
2809	void *obj;
2810	void *list = NULL;
2811
2812	if (!gfp_pfmemalloc_allowed(gfp_mask: flags))
2813	return NULL;
2814
2815	raw_spin_lock(&n->list_lock);
2816	slab = get_first_slab(n, pfmemalloc: true);
2817	if (!slab) {
2818	raw_spin_unlock(&n->list_lock);
2819	return NULL;
2820	}
2821
2822	obj = slab_get_obj(cachep, slab);
2823	n->free_objects--;
2824
2825	fixup_slab_list(cachep, n, slab, list: &list);
2826
2827	raw_spin_unlock(&n->list_lock);
2828	fixup_objfreelist_debug(cachep, list: &list);
2829
2830	return obj;
2831	}
2832
2833	/*
2834	* Slab list should be fixed up by fixup_slab_list() for existing slab
2835	* or cache_grow_end() for new slab
2836	*/
2837	static __always_inline int alloc_block(struct kmem_cache *cachep,
2838	struct array_cache ac, struct* slab slab, int* batchcount)
2839	{
2840	/*
2841	* There must be at least one object available for
2842	* allocation.
2843	*/
2844	BUG_ON(slab->active >= cachep->num);
2845
2846	while (slab->active < cachep->num && batchcount--) {
2847	STATS_INC_ALLOCED(cachep);
2848	STATS_INC_ACTIVE(cachep);
2849	STATS_SET_HIGH(cachep);
2850
2851	ac->entry[ac->avail++] = slab_get_obj(cachep, slab);
2852	}
2853
2854	return batchcount;
2855	}
2856
2857	static void cache_alloc_refill(struct* kmem_cache *cachep, gfp_t flags)
2858	{
2859	int batchcount;
2860	struct kmem_cache_node *n;
2861	struct array_cache ac, shared;
2862	int node;
2863	void *list = NULL;
2864	struct slab *slab;
2865
2866	check_irq_off();
2867	node = numa_mem_id();
2868
2869	ac = cpu_cache_get(cachep);
2870	batchcount = ac->batchcount;
2871	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2872	/*
2873	* If there was little recent activity on this cache, then
2874	* perform only a partial refill. Otherwise we could generate
2875	* refill bouncing.
2876	*/
2877	batchcount = BATCHREFILL_LIMIT;
2878	}
2879	n = get_node(s: cachep, node);
2880
2881	BUG_ON(ac->avail > `0` \|\| !n);
2882	shared = READ_ONCE(n->shared);
2883	if (!n->free_objects && (!shared \|\| !shared->avail))
2884	goto direct_grow;
2885
2886	raw_spin_lock(&n->list_lock);
2887	shared = READ_ONCE(n->shared);
2888
2889	/ See if we can refill from the shared array /
2890	if (shared && transfer_objects(to: ac, from: shared, max: batchcount)) {
2891	shared->touched = `1`;
2892	goto alloc_done;
2893	}
2894
2895	while (batchcount > `0`) {
2896	/ Get slab alloc is to come from. /
2897	slab = get_first_slab(n, pfmemalloc: false);
2898	if (!slab)
2899	goto must_grow;
2900
2901	check_spinlock_acquired(cachep);
2902
2903	batchcount = alloc_block(cachep, ac, slab, batchcount);
2904	fixup_slab_list(cachep, n, slab, list: &list);
2905	}
2906
2907	must_grow:
2908	n->free_objects -= ac->avail;
2909	alloc_done:
2910	raw_spin_unlock(&n->list_lock);
2911	fixup_objfreelist_debug(cachep, list: &list);
2912
2913	direct_grow:
2914	if (unlikely(!ac->avail)) {
2915	/ Check if we can use obj in pfmemalloc slab /
2916	if (sk_memalloc_socks()) {
2917	void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
2918
2919	if (obj)
2920	return obj;
2921	}
2922
2923	slab = cache_grow_begin(cachep, flags: gfp_exact_node(flags), nodeid: node);
2924
2925	/*
2926	* cache_grow_begin() can reenable interrupts,
2927	* then ac could change.
2928	*/
2929	ac = cpu_cache_get(cachep);
2930	if (!ac->avail && slab)
2931	alloc_block(cachep, ac, slab, batchcount);
2932	cache_grow_end(cachep, slab);
2933
2934	if (!ac->avail)
2935	return NULL;
2936	}
2937	ac->touched = `1`;
2938
2939	return ac->entry[--ac->avail];
2940	}
2941
#if DEBUG
/*
 * Debug checks run on a freshly allocated object.  Depending on the cache
 * flags this verifies and replaces the poison pattern, records @caller in
 * the user word, checks and arms both red zones, and runs the constructor
 * for poisoned caches (whose constructor was skipped at slab creation).
 * NULL and kfence objects are passed through untouched.
 *
 * Returns the object pointer advanced by obj_offset(), i.e. the address
 * handed to the caller of the allocator.
 */
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
				gfp_t flags, void *objp, unsigned long caller)
{
	WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
	if (!objp || is_kfence_address(objp))
		return objp;
	if (cachep->flags & SLAB_POISON) {
		check_poison_obj(cachep, objp);
		slab_kernel_map(cachep, objp, 1);
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = (void *)caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside object was overwritten");
			pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
			       objp, *dbg_redzone1(cachep, objp),
			       *dbg_redzone2(cachep, objp));
		}
		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
	}

	objp += obj_offset(cachep);
	if (cachep->ctor && cachep->flags & SLAB_POISON)
		cachep->ctor(objp);
	if ((unsigned long)objp & (arch_slab_minalign() - 1)) {
		pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp,
		       arch_slab_minalign());
	}
	return objp;
}
#else
/* Without DEBUG the object pointer passes through unchanged. */
#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
#endif
2981
2982	static inline void ____cache_alloc(struct* kmem_cache *cachep, gfp_t flags)
2983	{
2984	void *objp;
2985	struct array_cache *ac;
2986
2987	check_irq_off();
2988
2989	ac = cpu_cache_get(cachep);
2990	if (likely(ac->avail)) {
2991	ac->touched = `1`;
2992	objp = ac->entry[--ac->avail];
2993
2994	STATS_INC_ALLOCHIT(cachep);
2995	goto out;
2996	}
2997
2998	STATS_INC_ALLOCMISS(cachep);
2999	objp = cache_alloc_refill(cachep, flags);
3000	/*
3001	* the 'ac' may be updated by cache_alloc_refill(),
3002	* and kmemleak_erase() requires its correct value.
3003	*/
3004	ac = cpu_cache_get(cachep);
3005
3006	out:
3007	/*
3008	* To avoid a false negative, if an object that is in one of the
3009	* per-CPU caches is leaked, we need to make sure kmemleak doesn't
3010	* treat the array pointers as a reference to the object.
3011	*/
3012	if (objp)
3013	kmemleak_erase(ptr: &ac->entry[ac->avail]);
3014	return objp;
3015	}
3016
3017	#ifdef CONFIG_NUMA
3018	static void ____cache_alloc_node(struct* kmem_cache , gfp_t, int*);
3019
3020	/*
3021	* Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
3022	*
3023	* If we are in_interrupt, then process context, including cpusets and
3024	* mempolicy, may not apply and should not be used for allocation policy.
3025	*/
3026	static void alternate_node_alloc(struct* kmem_cache *cachep, gfp_t flags)
3027	{
3028	int nid_alloc, nid_here;
3029
3030	if (in_interrupt() \|\| (flags & __GFP_THISNODE))
3031	return NULL;
3032	nid_alloc = nid_here = numa_mem_id();
3033	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3034	nid_alloc = cpuset_slab_spread_node();
3035	else if (current->mempolicy)
3036	nid_alloc = mempolicy_slab_node();
3037	if (nid_alloc != nid_here)
3038	return ____cache_alloc_node(cachep, flags, nid_alloc);
3039	return NULL;
3040	}
3041
3042	/*
3043	* Fallback function if there was no memory available and no objects on a
3044	* certain node and fall back is permitted. First we scan all the
3045	* available node for available objects. If that fails then we
3046	* perform an allocation without specifying a node. This allows the page
3047	* allocator to do its reclaim / fallback magic. We then insert the
3048	* slab into the proper nodelist and then allocate from it.
3049	*/
3050	static void fallback_alloc(struct* kmem_cache *cache, gfp_t flags)
3051	{
3052	struct zonelist *zonelist;
3053	struct zoneref *z;
3054	struct zone *zone;
3055	enum zone_type highest_zoneidx = gfp_zone(flags);
3056	void *obj = NULL;
3057	struct slab *slab;
3058	int nid;
3059	unsigned int cpuset_mems_cookie;
3060
3061	if (flags & __GFP_THISNODE)
3062	return NULL;
3063
3064	retry_cpuset:
3065	cpuset_mems_cookie = read_mems_allowed_begin();
3066	zonelist = node_zonelist(nid: mempolicy_slab_node(), flags);
3067
3068	retry:
3069	/*
3070	* Look through allowed nodes for objects available
3071	* from existing per node queues.
3072	*/
3073	for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
3074	nid = zone_to_nid(zone);
3075
3076	if (cpuset_zone_allowed(z: zone, gfp_mask: flags) &&
3077	get_node(s: cache, node: nid) &&
3078	get_node(s: cache, node: nid)->free_objects) {
3079	obj = ____cache_alloc_node(cache,
3080	gfp_exact_node(flags), nid);
3081	if (obj)
3082	break;
3083	}
3084	}
3085
3086	if (!obj) {
3087	/*
3088	* This allocation will be performed within the constraints
3089	* of the current cpuset / memory policy requirements.
3090	* We may trigger various forms of reclaim on the allowed
3091	* set and go into memory reserves if necessary.
3092	*/
3093	slab = cache_grow_begin(cachep: cache, flags, nodeid: numa_mem_id());
3094	cache_grow_end(cachep: cache, slab);
3095	if (slab) {
3096	nid = slab_nid(slab);
3097	obj = ____cache_alloc_node(cache,
3098	gfp_exact_node(flags), nid);
3099
3100	/*
3101	* Another processor may allocate the objects in
3102	* the slab since we are not holding any locks.
3103	*/
3104	if (!obj)
3105	goto retry;
3106	}
3107	}
3108
3109	if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
3110	goto retry_cpuset;
3111	return obj;
3112	}
3113
3114	/*
3115	* An interface to enable slab creation on nodeid
3116	*/
3117	static void ____cache_alloc_node(struct* kmem_cache *cachep, gfp_t flags,
3118	int nodeid)
3119	{
3120	struct slab *slab;
3121	struct kmem_cache_node *n;
3122	void *obj = NULL;
3123	void *list = NULL;
3124
3125	VM_BUG_ON(nodeid < `0` \|\| nodeid >= MAX_NUMNODES);
3126	n = get_node(s: cachep, node: nodeid);
3127	BUG_ON(!n);
3128
3129	check_irq_off();
3130	raw_spin_lock(&n->list_lock);
3131	slab = get_first_slab(n, pfmemalloc: false);
3132	if (!slab)
3133	goto must_grow;
3134
3135	check_spinlock_acquired_node(cachep, nodeid);
3136
3137	STATS_INC_NODEALLOCS(cachep);
3138	STATS_INC_ACTIVE(cachep);
3139	STATS_SET_HIGH(cachep);
3140
3141	BUG_ON(slab->active == cachep->num);
3142
3143	obj = slab_get_obj(cachep, slab);
3144	n->free_objects--;
3145
3146	fixup_slab_list(cachep, n, slab, list: &list);
3147
3148	raw_spin_unlock(&n->list_lock);
3149	fixup_objfreelist_debug(cachep, list: &list);
3150	return obj;
3151
3152	must_grow:
3153	raw_spin_unlock(&n->list_lock);
3154	slab = cache_grow_begin(cachep, flags: gfp_exact_node(flags), nodeid);
3155	if (slab) {
3156	/ This slab isn't counted yet so don't update free_objects /
3157	obj = slab_get_obj(cachep, slab);
3158	}
3159	cache_grow_end(cachep, slab);
3160
3161	return obj ? obj : fallback_alloc(cache: cachep, flags);
3162	}
3163
3164	static __always_inline void *
3165	__do_cache_alloc(struct kmem_cache cachep, gfp_t flags, int* nodeid)
3166	{
3167	void *objp = NULL;
3168	int slab_node = numa_mem_id();
3169
3170	if (nodeid == NUMA_NO_NODE) {
3171	if (current->mempolicy \|\| cpuset_do_slab_mem_spread()) {
3172	objp = alternate_node_alloc(cachep, flags);
3173	if (objp)
3174	goto out;
3175	}
3176	/*
3177	* Use the locally cached objects if possible.
3178	* However ____cache_alloc does not allow fallback
3179	* to other nodes. It may fail while we still have
3180	* objects on other nodes available.
3181	*/
3182	objp = ____cache_alloc(cachep, flags);
3183	nodeid = slab_node;
3184	} else if (nodeid == slab_node) {
3185	objp = ____cache_alloc(cachep, flags);
3186	} else if (!get_node(s: cachep, node: nodeid)) {
3187	/ Node not bootstrapped yet /
3188	objp = fallback_alloc(cache: cachep, flags);
3189	goto out;
3190	}
3191
3192	/*
3193	* We may just have run out of memory on the local node.
3194	* ____cache_alloc_node() knows how to locate memory on other nodes
3195	*/
3196	if (!objp)
3197	objp = ____cache_alloc_node(cachep, flags, nodeid);
3198	out:
3199	return objp;
3200	}
3201	#else
3202
3203	static __always_inline void *
3204	__do_cache_alloc(struct kmem_cache cachep, gfp_t flags, int* nodeid __maybe_unused)
3205	{
3206	return ____cache_alloc(cachep, flags);
3207	}
3208
3209	#endif /* CONFIG_NUMA */
3210
3211	static __always_inline void *
3212	slab_alloc_node(struct kmem_cache cachep, struct* list_lru *lru, gfp_t flags,
3213	int nodeid, size_t orig_size, unsigned long caller)
3214	{
3215	unsigned long save_flags;
3216	void *objp;
3217	struct obj_cgroup *objcg = NULL;
3218	bool init = false;
3219
3220	flags &= gfp_allowed_mask;
3221	cachep = slab_pre_alloc_hook(s: cachep, lru, objcgp: &objcg, size: `1`, flags);
3222	if (unlikely(!cachep))
3223	return NULL;
3224
3225	objp = kfence_alloc(s: cachep, size: orig_size, flags);
3226	if (unlikely(objp))
3227	goto out;
3228
3229	local_irq_save(save_flags);
3230	objp = __do_cache_alloc(cachep, flags, nodeid);
3231	local_irq_restore(save_flags);
3232	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3233	prefetchw(x: objp);
3234	init = slab_want_init_on_alloc(flags, c: cachep);
3235
3236	out:
3237	slab_post_alloc_hook(s: cachep, objcg, flags, size: `1`, p: &objp, init,
3238	orig_size: cachep->object_size);
3239	return objp;
3240	}
3241
3242	static __always_inline void *
3243	slab_alloc(struct kmem_cache cachep, struct* list_lru *lru, gfp_t flags,
3244	size_t orig_size, unsigned long caller)
3245	{
3246	return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
3247	caller);
3248	}
3249
3250	/*
3251	* Caller needs to acquire correct kmem_cache_node's list_lock
3252	* @list: List of detached free slabs should be freed by caller
3253	*/
3254	static void free_block(struct kmem_cache cachep, void* **objpp,
3255	int nr_objects, int node, struct list_head *list)
3256	{
3257	int i;
3258	struct kmem_cache_node *n = get_node(s: cachep, node);
3259	struct slab *slab;
3260
3261	n->free_objects += nr_objects;
3262
3263	for (i = `0`; i < nr_objects; i++) {
3264	void *objp;
3265	struct slab *slab;
3266
3267	objp = objpp[i];
3268
3269	slab = virt_to_slab(addr: objp);
3270	list_del(entry: &slab->slab_list);
3271	check_spinlock_acquired_node(cachep, node);
3272	slab_put_obj(cachep, slab, objp);
3273	STATS_DEC_ACTIVE(cachep);
3274
3275	/ fixup slab chains /
3276	if (slab->active == `0`) {
3277	list_add(new: &slab->slab_list, head: &n->slabs_free);
3278	n->free_slabs++;
3279	} else {
3280	/ Unconditionally move a slab to the end of the*
3281	* partial list on free - maximum time for the
3282	* other objects to be freed, too.
3283	*/
3284	list_add_tail(new: &slab->slab_list, head: &n->slabs_partial);
3285	}
3286	}
3287
3288	while (n->free_objects > n->free_limit && !list_empty(head: &n->slabs_free)) {
3289	n->free_objects -= cachep->num;
3290
3291	slab = list_last_entry(&n->slabs_free, struct slab, slab_list);
3292	list_move(list: &slab->slab_list, head: list);
3293	n->free_slabs--;
3294	n->total_slabs--;
3295	}
3296	}
3297
3298	static void cache_flusharray(struct kmem_cache cachep, struct* array_cache *ac)
3299	{
3300	int batchcount;
3301	struct kmem_cache_node *n;
3302	int node = numa_mem_id();
3303	LIST_HEAD(list);
3304
3305	batchcount = ac->batchcount;
3306
3307	check_irq_off();
3308	n = get_node(s: cachep, node);
3309	raw_spin_lock(&n->list_lock);
3310	if (n->shared) {
3311	struct array_cache *shared_array = n->shared;
3312	int max = shared_array->limit - shared_array->avail;
3313	if (max) {
3314	if (batchcount > max)
3315	batchcount = max;
3316	memcpy(&(shared_array->entry[shared_array->avail]),
3317	ac->entry, sizeof(void ) batchcount);
3318	shared_array->avail += batchcount;
3319	goto free_done;
3320	}
3321	}
3322
3323	free_block(cachep, objpp: ac->entry, nr_objects: batchcount, node, list: &list);
3324	free_done:
3325	#if STATS
3326	{
3327	int i = `0`;
3328	struct slab *slab;
3329
3330	list_for_each_entry(slab, &n->slabs_free, slab_list) {
3331	BUG_ON(slab->active);
3332
3333	i++;
3334	}
3335	STATS_SET_FREEABLE(cachep, i);
3336	}
3337	#endif
3338	raw_spin_unlock(&n->list_lock);
3339	ac->avail -= batchcount;
3340	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void )ac->avail);
3341	slabs_destroy(cachep, list: &list);
3342	}
3343
3344	/*
3345	* Release an obj back to its cache. If the obj has a constructed state, it must
3346	* be in this state _before_ it is released. Called with disabled ints.
3347	*/
3348	static __always_inline void __cache_free(struct kmem_cache cachep, void* *objp,
3349	unsigned long caller)
3350	{
3351	bool init;
3352
3353	memcg_slab_free_hook(s: cachep, slab: virt_to_slab(addr: objp), p: &objp, objects: `1`);
3354
3355	if (is_kfence_address(addr: objp)) {
3356	kmemleak_free_recursive(ptr: objp, flags: cachep->flags);
3357	__kfence_free(addr: objp);
3358	return;
3359	}
3360
3361	/*
3362	* As memory initialization might be integrated into KASAN,
3363	* kasan_slab_free and initialization memset must be
3364	* kept together to avoid discrepancies in behavior.
3365	*/
3366	init = slab_want_init_on_free(c: cachep);
3367	if (init && !kasan_has_integrated_init())
3368	memset(objp, `0`, cachep->object_size);
3369	/ KASAN might put objp into memory quarantine, delaying its reuse. /
3370	if (kasan_slab_free(s: cachep, object: objp, init))
3371	return;
3372
3373	/ Use KCSAN to help debug racy use-after-free. /
3374	if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
3375	__kcsan_check_access(ptr: objp, size: cachep->object_size,
3376	KCSAN_ACCESS_WRITE \| KCSAN_ACCESS_ASSERT);
3377
3378	___cache_free(cache: cachep, x: objp, addr: caller);
3379	}
3380
3381	void ___cache_free(struct kmem_cache cachep, void* *objp,
3382	unsigned long caller)
3383	{
3384	struct array_cache *ac = cpu_cache_get(cachep);
3385
3386	check_irq_off();
3387	kmemleak_free_recursive(ptr: objp, flags: cachep->flags);
3388	objp = cache_free_debugcheck(cachep, objp, caller);
3389
3390	/*
3391	* Skip calling cache_free_alien() when the platform is not numa.
3392	* This will avoid cache misses that happen while accessing slabp (which
3393	* is per page memory reference) to get nodeid. Instead use a global
3394	* variable to skip the call, which is mostly likely to be present in
3395	* the cache.
3396	*/
3397	if (nr_online_nodes > `1` && cache_free_alien(cachep, objp))
3398	return;
3399
3400	if (ac->avail < ac->limit) {
3401	STATS_INC_FREEHIT(cachep);
3402	} else {
3403	STATS_INC_FREEMISS(cachep);
3404	cache_flusharray(cachep, ac);
3405	}
3406
3407	if (sk_memalloc_socks()) {
3408	struct slab *slab = virt_to_slab(addr: objp);
3409
3410	if (unlikely(slab_test_pfmemalloc(slab))) {
3411	cache_free_pfmemalloc(cachep, slab, objp);
3412	return;
3413	}
3414	}
3415
3416	__free_one(ac, objp);
3417	}
3418
3419	static __always_inline
3420	void __kmem_cache_alloc_lru(struct* kmem_cache cachep, struct* list_lru *lru,
3421	gfp_t flags)
3422	{
3423	void *ret = slab_alloc(cachep, lru, flags, orig_size: cachep->object_size, _RET_IP_);
3424
3425	trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s: cachep, gfp_flags: flags, NUMA_NO_NODE);
3426
3427	return ret;
3428	}
3429
3430	void kmem_cache_alloc(struct* kmem_cache *cachep, gfp_t flags)
3431	{
3432	return __kmem_cache_alloc_lru(cachep, NULL, flags);
3433	}
3434	EXPORT_SYMBOL(kmem_cache_alloc);
3435
3436	void kmem_cache_alloc_lru(struct* kmem_cache cachep, struct* list_lru *lru,
3437	gfp_t flags)
3438	{
3439	return __kmem_cache_alloc_lru(cachep, lru, flags);
3440	}
3441	EXPORT_SYMBOL(kmem_cache_alloc_lru);
3442
3443	static __always_inline void
3444	cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
3445	size_t size, void *p, unsigned* long caller)
3446	{
3447	size_t i;
3448
3449	for (i = `0`; i < size; i++)
3450	p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
3451	}
3452
3453	int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
3454	void **p)
3455	{
3456	struct obj_cgroup *objcg = NULL;
3457	unsigned long irqflags;
3458	size_t i;
3459
3460	s = slab_pre_alloc_hook(s, NULL, objcgp: &objcg, size, flags);
3461	if (!s)
3462	return `0`;
3463
3464	local_irq_save(irqflags);
3465	for (i = `0`; i < size; i++) {
3466	void *objp = kfence_alloc(s, size: s->object_size, flags) ?:
3467	__do_cache_alloc(cachep: s, flags, NUMA_NO_NODE);
3468
3469	if (unlikely(!objp))
3470	goto error;
3471	p[i] = objp;
3472	}
3473	local_irq_restore(irqflags);
3474
3475	cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
3476
3477	/*
3478	* memcg and kmem_cache debug support and memory initialization.
3479	* Done outside of the IRQ disabled section.
3480	*/
3481	slab_post_alloc_hook(s, objcg, flags, size, p,
3482	init: slab_want_init_on_alloc(flags, c: s), orig_size: s->object_size);
3483	/ FIXME: Trace call missing. Christoph would like a bulk variant /
3484	return size;
3485	error:
3486	local_irq_restore(irqflags);
3487	cache_alloc_debugcheck_after_bulk(s, flags, size: i, p, _RET_IP_);
3488	slab_post_alloc_hook(s, objcg, flags, size: i, p, init: false, orig_size: s->object_size);
3489	kmem_cache_free_bulk(s, size: i, p);
3490	return `0`;
3491	}
3492	EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3493
3494	/**
3495	* kmem_cache_alloc_node - Allocate an object on the specified node
3496	* @cachep: The cache to allocate from.
3497	* @flags: See kmalloc().
3498	* @nodeid: node number of the target node.
3499	*
3500	* Identical to kmem_cache_alloc but it will allocate memory on the given
3501	* node, which can improve the performance for cpu bound structures.
3502	*
3503	* Fallback to other node is possible if __GFP_THISNODE is not set.
3504	*
3505	* Return: pointer to the new object or %NULL in case of error
3506	*/
3507	void kmem_cache_alloc_node(struct* kmem_cache cachep, gfp_t flags, int* nodeid)
3508	{
3509	void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, orig_size: cachep->object_size, _RET_IP_);
3510
3511	trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s: cachep, gfp_flags: flags, node: nodeid);
3512
3513	return ret;
3514	}
3515	EXPORT_SYMBOL(kmem_cache_alloc_node);
3516
3517	void __kmem_cache_alloc_node(struct* kmem_cache *cachep, gfp_t flags,
3518	int nodeid, size_t orig_size,
3519	unsigned long caller)
3520	{
3521	return slab_alloc_node(cachep, NULL, flags, nodeid,
3522	orig_size, caller);
3523	}
3524
#ifdef CONFIG_PRINTK
/*
 * Fill @kpp with information about the slab object containing @object:
 * the raw pointer, its slab and cache, the data offset within the object,
 * the start of the object and, when DEBUG is set and the cache has
 * SLAB_STORE_USER, the stored user word.
 */
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
	struct kmem_cache *cachep;
	unsigned int objnr;
	void *objp;

	kpp->kp_ptr = object;
	kpp->kp_slab = slab;
	cachep = slab->slab_cache;
	kpp->kp_slab_cache = cachep;
	objp = object - obj_offset(cachep);
	kpp->kp_data_offset = obj_offset(cachep);
	slab = virt_to_slab(objp);
	/* Round-trip through the index to get the object's start address. */
	objnr = obj_to_index(cachep, slab, objp);
	objp = index_to_obj(cachep, slab, objnr);
	kpp->kp_objp = objp;
	if (DEBUG && cachep->flags & SLAB_STORE_USER)
		kpp->kp_ret = *dbg_userword(cachep, objp);
}
#endif
3546
3547	static __always_inline
3548	void __do_kmem_cache_free(struct kmem_cache cachep, void* *objp,
3549	unsigned long caller)
3550	{
3551	unsigned long flags;
3552
3553	local_irq_save(flags);
3554	debug_check_no_locks_freed(from: objp, len: cachep->object_size);
3555	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3556	debug_check_no_obj_freed(address: objp, size: cachep->object_size);
3557	__cache_free(cachep, objp, caller);
3558	local_irq_restore(flags);
3559	}
3560
/*
 * Non-inline entry point around __do_kmem_cache_free() that takes an
 * explicit @caller address. Unlike kmem_cache_free() it neither validates
 * @cachep via cache_from_obj() nor emits a tracepoint.
 */
void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
		       unsigned long caller)
{
	__do_kmem_cache_free(cachep, objp, caller);
}
3566
3567	/**
3568	* kmem_cache_free - Deallocate an object
3569	* @cachep: The cache the allocation was from.
3570	* @objp: The previously allocated object.
3571	*
3572	* Free an object which was previously allocated from this
3573	* cache.
3574	*/
3575	void kmem_cache_free(struct kmem_cache cachep, void* *objp)
3576	{
3577	cachep = cache_from_obj(s: cachep, x: objp);
3578	if (!cachep)
3579	return;
3580
3581	trace_kmem_cache_free(_RET_IP_, ptr: objp, s: cachep);
3582	__do_kmem_cache_free(cachep, objp, _RET_IP_);
3583	}
3584	EXPORT_SYMBOL(kmem_cache_free);
3585
3586	void kmem_cache_free_bulk(struct kmem_cache orig_s, size_t size, void* **p)
3587	{
3588	unsigned long flags;
3589
3590	local_irq_save(flags);
3591	for (int i = `0`; i < size; i++) {
3592	void *objp = p[i];
3593	struct kmem_cache *s;
3594
3595	if (!orig_s) {
3596	struct folio *folio = virt_to_folio(x: objp);
3597
3598	/ called via kfree_bulk /
3599	if (!folio_test_slab(folio)) {
3600	local_irq_restore(flags);
3601	free_large_kmalloc(folio, object: objp);
3602	local_irq_save(flags);
3603	continue;
3604	}
3605	s = folio_slab(folio)->slab_cache;
3606	} else {
3607	s = cache_from_obj(s: orig_s, x: objp);
3608	}
3609
3610	if (!s)
3611	continue;
3612
3613	debug_check_no_locks_freed(from: objp, len: s->object_size);
3614	if (!(s->flags & SLAB_DEBUG_OBJECTS))
3615	debug_check_no_obj_freed(address: objp, size: s->object_size);
3616
3617	__cache_free(cachep: s, objp, _RET_IP_);
3618	}
3619	local_irq_restore(flags);
3620
3621	/ FIXME: add tracing /
3622	}
3623	EXPORT_SYMBOL(kmem_cache_free_bulk);
3624
3625	/*
3626	* This initializes kmem_cache_node or resizes various caches for all nodes.
3627	*/
3628	static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
3629	{
3630	int ret;
3631	int node;
3632	struct kmem_cache_node *n;
3633
3634	for_each_online_node(node) {
3635	ret = setup_kmem_cache_node(cachep, node, gfp, force_change: true);
3636	if (ret)
3637	goto fail;
3638
3639	}
3640
3641	return `0`;
3642
3643	fail:
3644	if (!cachep->list.next) {
3645	/ Cache is not active yet. Roll back what we did /
3646	node--;
3647	while (node >= `0`) {
3648	n = get_node(s: cachep, node);
3649	if (n) {
3650	kfree(objp: n->shared);
3651	free_alien_cache(alc_ptr: n->alien);
3652	kfree(objp: n);
3653	cachep->node[node] = NULL;
3654	}
3655	node--;
3656	}
3657	}
3658	return -ENOMEM;
3659	}
3660
3661	/ Always called with the slab_mutex held /
3662	static int do_tune_cpucache(struct kmem_cache cachep, int* limit,
3663	int batchcount, int shared, gfp_t gfp)
3664	{
3665	struct array_cache __percpu cpu_cache, prev;
3666	int cpu;
3667
3668	cpu_cache = alloc_kmem_cache_cpus(cachep, entries: limit, batchcount);
3669	if (!cpu_cache)
3670	return -ENOMEM;
3671
3672	prev = cachep->cpu_cache;
3673	cachep->cpu_cache = cpu_cache;
3674	/*
3675	* Without a previous cpu_cache there's no need to synchronize remote
3676	* cpus, so skip the IPIs.
3677	*/
3678	if (prev)
3679	kick_all_cpus_sync();
3680
3681	check_irq_on();
3682	cachep->batchcount = batchcount;
3683	cachep->limit = limit;
3684	cachep->shared = shared;
3685
3686	if (!prev)
3687	goto setup_node;
3688
3689	for_each_online_cpu(cpu) {
3690	LIST_HEAD(list);
3691	int node;
3692	struct kmem_cache_node *n;
3693	struct array_cache *ac = per_cpu_ptr(prev, cpu);
3694
3695	node = cpu_to_mem(cpu);
3696	n = get_node(s: cachep, node);
3697	raw_spin_lock_irq(&n->list_lock);
3698	free_block(cachep, objpp: ac->entry, nr_objects: ac->avail, node, list: &list);
3699	raw_spin_unlock_irq(&n->list_lock);
3700	slabs_destroy(cachep, list: &list);
3701	}
3702	free_percpu(pdata: prev);
3703
3704	setup_node:
3705	return setup_kmem_cache_nodes(cachep, gfp);
3706	}
3707
3708	/ Called with slab_mutex held always /
3709	static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3710	{
3711	int err;
3712	int limit = `0`;
3713	int shared = `0`;
3714	int batchcount = `0`;
3715
3716	err = cache_random_seq_create(cachep, count: cachep->num, gfp);
3717	if (err)
3718	goto end;
3719
3720	/*
3721	* The head array serves three purposes:
3722	* - create a LIFO ordering, i.e. return objects that are cache-warm
3723	* - reduce the number of spinlock operations.
3724	* - reduce the number of linked list operations on the slab and
3725	* bufctl chains: array operations are cheaper.
3726	* The numbers are guessed, we should auto-tune as described by
3727	* Bonwick.
3728	*/
3729	if (cachep->size > `131072`)
3730	limit = `1`;
3731	else if (cachep->size > PAGE_SIZE)
3732	limit = `8`;
3733	else if (cachep->size > `1024`)
3734	limit = `24`;
3735	else if (cachep->size > `256`)
3736	limit = `54`;
3737	else
3738	limit = `120`;
3739
3740	/*
3741	* CPU bound tasks (e.g. network routing) can exhibit cpu bound
3742	* allocation behaviour: Most allocs on one cpu, most free operations
3743	* on another cpu. For these cases, an efficient object passing between
3744	* cpus is necessary. This is provided by a shared array. The array
3745	* replaces Bonwick's magazine layer.
3746	* On uniprocessor, it's functionally equivalent (but less efficient)
3747	* to a larger limit. Thus disabled by default.
3748	*/
3749	shared = `0`;
3750	if (cachep->size <= PAGE_SIZE && num_possible_cpus() > `1`)
3751	shared = `8`;
3752
3753	#if DEBUG
3754	/*
3755	* With debugging enabled, large batchcount lead to excessively long
3756	* periods with disabled local interrupts. Limit the batchcount
3757	*/
3758	if (limit > `32`)
3759	limit = `32`;
3760	#endif
3761	batchcount = (limit + `1`) / `2`;
3762	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3763	end:
3764	if (err)
3765	pr_err("enable_cpucache failed for %s, error %d\n",
3766	cachep->name, -err);
3767	return err;
3768	}
3769
3770	/*
3771	* Drain an array if it contains any elements taking the node lock only if
3772	* necessary. Note that the node listlock also protects the array_cache
3773	* if drain_array() is used on the shared array.
3774	*/
3775	static void drain_array(struct kmem_cache cachep, struct* kmem_cache_node *n,
3776	struct array_cache ac, int* node)
3777	{
3778	LIST_HEAD(list);
3779
3780	/ ac from n->shared can be freed if we don't hold the slab_mutex. /
3781	check_mutex_acquired();
3782
3783	if (!ac \|\| !ac->avail)
3784	return;
3785
3786	if (ac->touched) {
3787	ac->touched = `0`;
3788	return;
3789	}
3790
3791	raw_spin_lock_irq(&n->list_lock);
3792	drain_array_locked(cachep, ac, node, free_all: false, list: &list);
3793	raw_spin_unlock_irq(&n->list_lock);
3794
3795	slabs_destroy(cachep, list: &list);
3796	}
3797
3798	/**
3799	* cache_reap - Reclaim memory from caches.
3800	* @w: work descriptor
3801	*
3802	* Called from workqueue/eventd every few seconds.
3803	* Purpose:
3804	* - clear the per-cpu caches for this CPU.
3805	* - return freeable pages to the main free memory pool.
3806	*
3807	* If we cannot acquire the cache chain mutex then just give up - we'll try
3808	* again on the next iteration.
3809	*/
3810	static void cache_reap(struct work_struct *w)
3811	{
3812	struct kmem_cache *searchp;
3813	struct kmem_cache_node *n;
3814	int node = numa_mem_id();
3815	struct delayed_work *work = to_delayed_work(work: w);
3816
3817	if (!mutex_trylock(lock: &slab_mutex))
3818	/ Give up. Setup the next iteration. /
3819	goto out;
3820
3821	list_for_each_entry(searchp, &slab_caches, list) {
3822	check_irq_on();
3823
3824	/*
3825	* We only take the node lock if absolutely necessary and we
3826	* have established with reasonable certainty that
3827	* we can do some work if the lock was obtained.
3828	*/
3829	n = get_node(s: searchp, node);
3830
3831	reap_alien(cachep: searchp, n);
3832
3833	drain_array(cachep: searchp, n, ac: cpu_cache_get(cachep: searchp), node);
3834
3835	/*
3836	* These are racy checks but it does not matter
3837	* if we skip one check or scan twice.
3838	*/
3839	if (time_after(n->next_reap, jiffies))
3840	goto next;
3841
3842	n->next_reap = jiffies + REAPTIMEOUT_NODE;
3843
3844	drain_array(cachep: searchp, n, ac: n->shared, node);
3845
3846	if (n->free_touched)
3847	n->free_touched = `0`;
3848	else {
3849	int freed;
3850
3851	freed = drain_freelist(cache: searchp, n, tofree: (n->free_limit +
3852	`5` * searchp->num - `1`) / (`5` * searchp->num));
3853	STATS_ADD_REAPED(searchp, freed);
3854	}
3855	next:
3856	cond_resched();
3857	}
3858	check_irq_on();
3859	mutex_unlock(lock: &slab_mutex);
3860	next_reap_node();
3861	out:
3862	/ Set up the next iteration /
3863	schedule_delayed_work_on(smp_processor_id(), dwork: work,
3864	delay: round_jiffies_relative(REAPTIMEOUT_AC));
3865	}
3866
3867	void get_slabinfo(struct kmem_cache cachep, struct* slabinfo *sinfo)
3868	{
3869	unsigned long active_objs, num_objs, active_slabs;
3870	unsigned long total_slabs = `0`, free_objs = `0`, shared_avail = `0`;
3871	unsigned long free_slabs = `0`;
3872	int node;
3873	struct kmem_cache_node *n;
3874
3875	for_each_kmem_cache_node(cachep, node, n) {
3876	check_irq_on();
3877	raw_spin_lock_irq(&n->list_lock);
3878
3879	total_slabs += n->total_slabs;
3880	free_slabs += n->free_slabs;
3881	free_objs += n->free_objects;
3882
3883	if (n->shared)
3884	shared_avail += n->shared->avail;
3885
3886	raw_spin_unlock_irq(&n->list_lock);
3887	}
3888	num_objs = total_slabs * cachep->num;
3889	active_slabs = total_slabs - free_slabs;
3890	active_objs = num_objs - free_objs;
3891
3892	sinfo->active_objs = active_objs;
3893	sinfo->num_objs = num_objs;
3894	sinfo->active_slabs = active_slabs;
3895	sinfo->num_slabs = total_slabs;
3896	sinfo->shared_avail = shared_avail;
3897	sinfo->limit = cachep->limit;
3898	sinfo->batchcount = cachep->batchcount;
3899	sinfo->shared = cachep->shared;
3900	sinfo->objects_per_slab = cachep->num;
3901	sinfo->cache_order = cachep->gfporder;
3902	}
3903
/*
 * Append the " : globalstat ..." and " : cpustat ..." columns for @cachep
 * to @m. Compiles to an empty function unless STATS is set.
 */
void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
{
#if STATS
	{			/* node stats */
		unsigned long high = cachep->high_mark;
		unsigned long allocs = cachep->num_allocations;
		unsigned long grown = cachep->grown;
		unsigned long reaped = cachep->reaped;
		unsigned long errors = cachep->errors;
		unsigned long max_freeable = cachep->max_freeable;
		unsigned long node_allocs = cachep->node_allocs;
		unsigned long node_frees = cachep->node_frees;
		unsigned long overflows = cachep->node_overflow;

		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
			   allocs, high, grown,
			   reaped, errors, max_freeable, node_allocs,
			   node_frees, overflows);
	}
	/* cpu stats */
	{
		unsigned long allochit = atomic_read(&cachep->allochit);
		unsigned long allocmiss = atomic_read(&cachep->allocmiss);
		unsigned long freehit = atomic_read(&cachep->freehit);
		unsigned long freemiss = atomic_read(&cachep->freemiss);

		seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
			   allochit, allocmiss, freehit, freemiss);
	}
#endif
}
3935
3936	#define MAX_SLABINFO_WRITE 128
3937	/**
3938	* slabinfo_write - Tuning for the slab allocator
3939	* @file: unused
3940	* @buffer: user buffer
3941	* @count: data length
3942	* @ppos: unused
3943	*
3944	* Return: %0 on success, negative error code otherwise.
3945	*/
3946	ssize_t slabinfo_write(struct file file, const* char __user *buffer,
3947	size_t count, loff_t *ppos)
3948	{
3949	char kbuf[MAX_SLABINFO_WRITE + `1`], *tmp;
3950	int limit, batchcount, shared, res;
3951	struct kmem_cache *cachep;
3952
3953	if (count > MAX_SLABINFO_WRITE)
3954	return -EINVAL;
3955	if (copy_from_user(to: &kbuf, from: buffer, n: count))
3956	return -EFAULT;
3957	kbuf[MAX_SLABINFO_WRITE] = `'\0'`;
3958
3959	tmp = strchr(kbuf, `' '`);
3960	if (!tmp)
3961	return -EINVAL;
3962	*tmp = `'\0'`;
3963	tmp++;
3964	if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != `3`)
3965	return -EINVAL;
3966
3967	/ Find the cache in the chain of caches. /
3968	mutex_lock(&slab_mutex);
3969	res = -EINVAL;
3970	list_for_each_entry(cachep, &slab_caches, list) {
3971	if (!strcmp(cachep->name, kbuf)) {
3972	if (limit < `1` \|\| batchcount < `1` \|\|
3973	batchcount > limit \|\| shared < `0`) {
3974	res = `0`;
3975	} else {
3976	res = do_tune_cpucache(cachep, limit,
3977	batchcount, shared,
3978	GFP_KERNEL);
3979	}
3980	break;
3981	}
3982	}
3983	mutex_unlock(lock: &slab_mutex);
3984	if (res >= `0`)
3985	res = count;
3986	return res;
3987	}
3988
#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * Returns normally if the check passes; otherwise calls usercopy_abort()
 * with the cache name, the offset within the object and the length.
 */
void __check_heap_object(const void *ptr, unsigned long n,
			 const struct slab *slab, bool to_user)
{
	struct kmem_cache *cachep;
	unsigned int objnr;
	unsigned long offset;

	ptr = kasan_reset_tag(ptr);

	/* Find and validate object. */
	cachep = slab->slab_cache;
	objnr = obj_to_index(cachep, slab, (void *)ptr);
	BUG_ON(objnr >= cachep->num);

	/* Find offset within object. */
	if (is_kfence_address(ptr))
		offset = ptr - kfence_object_start(ptr);
	else
		offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep);

	/* Allow address range falling entirely within usercopy region. */
	if (offset >= cachep->useroffset &&
	    offset - cachep->useroffset <= cachep->usersize &&
	    n <= cachep->useroffset - offset + cachep->usersize)
		return;

	usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
4027

source code of linux/mm/slab.c