1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #define _GNU_SOURCE |
3 | |
4 | #include <linux/limits.h> |
5 | #include <fcntl.h> |
6 | #include <stdio.h> |
7 | #include <stdlib.h> |
8 | #include <string.h> |
9 | #include <sys/stat.h> |
10 | #include <sys/types.h> |
11 | #include <unistd.h> |
12 | #include <sys/wait.h> |
13 | #include <errno.h> |
14 | #include <sys/sysinfo.h> |
15 | #include <pthread.h> |
16 | |
17 | #include "../kselftest.h" |
18 | #include "cgroup_util.h" |
19 | |
20 | |
/*
 * Memory cgroup charging is performed using percpu batches 64 pages
 * big (look at MEMCG_CHARGE_BATCH), whereas memory.stat is exact. So
 * the maximum discrepancy between charge and vmstat entries is number
 * of cpus multiplied by 64 pages.
 *
 * The value is in bytes (4096 bytes per page is assumed). The product is
 * evaluated as a long so that it cannot overflow int on machines with a
 * very large number of CPUs and so that it has the same type as the long
 * counters it is compared against.
 */
#define MAX_VMSTAT_ERROR (4096L * 64 * get_nprocs())
28 | |
29 | |
/*
 * Perform a series of stat() lookups on long, unique path names that are
 * not expected to exist (the callers use this to create negative dentries).
 *
 * @cgroup: unused; present so the function can be passed to cg_run().
 * @arg:    number of lookups to perform, encoded as an integer cast to a
 *          pointer. NULL therefore means "no lookups".
 *
 * The results of snprintf() and stat() are intentionally ignored: only the
 * lookups themselves matter, not their outcome.
 *
 * Always returns 0.
 */
static int alloc_dcache(const char *cgroup, void *arg)
{
	unsigned long nr_lookups = (unsigned long)arg;
	/* The pid cannot change inside the loop; query it once, not per path. */
	int pid = getpid();
	unsigned long i;
	struct stat st;
	char buf[128];

	for (i = 0; i < nr_lookups; i++) {
		snprintf(buf, sizeof(buf),
			 "/something-non-existent-with-a-long-name-%64lu-%d",
			 i, pid);
		stat(buf, &st);
	}

	return 0;
}
45 | |
46 | /* |
47 | * This test allocates 100000 of negative dentries with long names. |
48 | * Then it checks that "slab" in memory.stat is larger than 1M. |
49 | * Then it sets memory.high to 1M and checks that at least 1/2 |
50 | * of slab memory has been reclaimed. |
51 | */ |
52 | static int test_kmem_basic(const char *root) |
53 | { |
54 | int ret = KSFT_FAIL; |
55 | char *cg = NULL; |
56 | long slab0, slab1, current; |
57 | |
58 | cg = cg_name(root, name: "kmem_basic_test" ); |
59 | if (!cg) |
60 | goto cleanup; |
61 | |
62 | if (cg_create(cgroup: cg)) |
63 | goto cleanup; |
64 | |
65 | if (cg_run(cgroup: cg, fn: alloc_dcache, arg: (void *)100000)) |
66 | goto cleanup; |
67 | |
68 | slab0 = cg_read_key_long(cgroup: cg, control: "memory.stat" , key: "slab " ); |
69 | if (slab0 < (1 << 20)) |
70 | goto cleanup; |
71 | |
72 | cg_write(cgroup: cg, control: "memory.high" , buf: "1M" ); |
73 | |
74 | /* wait for RCU freeing */ |
75 | sleep(1); |
76 | |
77 | slab1 = cg_read_key_long(cgroup: cg, control: "memory.stat" , key: "slab " ); |
78 | if (slab1 < 0) |
79 | goto cleanup; |
80 | |
81 | current = cg_read_long(cgroup: cg, control: "memory.current" ); |
82 | if (current < 0) |
83 | goto cleanup; |
84 | |
85 | if (slab1 < slab0 / 2 && current < slab0 / 2) |
86 | ret = KSFT_PASS; |
87 | cleanup: |
88 | cg_destroy(cgroup: cg); |
89 | free(cg); |
90 | |
91 | return ret; |
92 | } |
93 | |
/*
 * Thread body used by alloc_kmem_smp(): asks alloc_dcache() for 100
 * lookups. The thread argument is not used and the thread always
 * returns NULL.
 */
static void *alloc_kmem_fn(void *arg)
{
	(void)arg;
	(void)alloc_dcache(NULL, (void *)100);

	return NULL;
}
99 | |
100 | static int alloc_kmem_smp(const char *cgroup, void *arg) |
101 | { |
102 | int nr_threads = 2 * get_nprocs(); |
103 | pthread_t *tinfo; |
104 | unsigned long i; |
105 | int ret = -1; |
106 | |
107 | tinfo = calloc(nr_threads, sizeof(pthread_t)); |
108 | if (tinfo == NULL) |
109 | return -1; |
110 | |
111 | for (i = 0; i < nr_threads; i++) { |
112 | if (pthread_create(&tinfo[i], NULL, &alloc_kmem_fn, |
113 | (void *)i)) { |
114 | free(tinfo); |
115 | return -1; |
116 | } |
117 | } |
118 | |
119 | for (i = 0; i < nr_threads; i++) { |
120 | ret = pthread_join(tinfo[i], NULL); |
121 | if (ret) |
122 | break; |
123 | } |
124 | |
125 | free(tinfo); |
126 | return ret; |
127 | } |
128 | |
/*
 * Sequentially create @times child cgroups of @parent (named via
 * cg_name_indexed(parent, "child", i)), run @fn in each one through
 * cg_run(), and destroy the child again before moving on to the next.
 *
 * @parent: cgroup under which the children are created.
 * @fn:     callback executed inside each child cgroup.
 * @arg:    opaque argument forwarded unchanged to @fn.
 * @times:  number of child cgroups to cycle through.
 *
 * Returns 0 if every iteration succeeded, -1 as soon as naming, creating
 * or running in a child fails. The child of the failing iteration is
 * still destroyed and its name freed.
 */
static int cg_run_in_subcgroups(const char *parent,
				int (*fn)(const char *cgroup, void *arg),
				void *arg, int times)
{
	char *child;
	int ret = 0;
	int i;

	for (i = 0; i < times && !ret; i++) {
		child = cg_name_indexed(parent, "child", i);
		if (!child)
			return -1;

		/*
		 * Forward the caller's argument: passing NULL here would
		 * make callbacks such as alloc_dcache() silently do nothing.
		 */
		if (cg_create(child) || cg_run(child, fn, arg))
			ret = -1;

		cg_destroy(child);
		free(child);
	}

	return ret;
}
159 | |
160 | /* |
161 | * The test creates and destroys a large number of cgroups. In each cgroup it |
162 | * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS |
163 | * threads. Then it checks the sanity of numbers on the parent level: |
164 | * the total size of the cgroups should be roughly equal to |
165 | * anon + file + kernel + sock. |
166 | */ |
167 | static int test_kmem_memcg_deletion(const char *root) |
168 | { |
169 | long current, anon, file, kernel, sock, sum; |
170 | int ret = KSFT_FAIL; |
171 | char *parent; |
172 | |
173 | parent = cg_name(root, name: "kmem_memcg_deletion_test" ); |
174 | if (!parent) |
175 | goto cleanup; |
176 | |
177 | if (cg_create(cgroup: parent)) |
178 | goto cleanup; |
179 | |
180 | if (cg_write(cgroup: parent, control: "cgroup.subtree_control" , buf: "+memory" )) |
181 | goto cleanup; |
182 | |
183 | if (cg_run_in_subcgroups(parent, fn: alloc_kmem_smp, NULL, times: 100)) |
184 | goto cleanup; |
185 | |
186 | current = cg_read_long(cgroup: parent, control: "memory.current" ); |
187 | anon = cg_read_key_long(cgroup: parent, control: "memory.stat" , key: "anon " ); |
188 | file = cg_read_key_long(cgroup: parent, control: "memory.stat" , key: "file " ); |
189 | kernel = cg_read_key_long(cgroup: parent, control: "memory.stat" , key: "kernel " ); |
190 | sock = cg_read_key_long(cgroup: parent, control: "memory.stat" , key: "sock " ); |
191 | if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0) |
192 | goto cleanup; |
193 | |
194 | sum = anon + file + kernel + sock; |
195 | if (abs(sum - current) < MAX_VMSTAT_ERROR) { |
196 | ret = KSFT_PASS; |
197 | } else { |
198 | printf("memory.current = %ld\n" , current); |
199 | printf("anon + file + kernel + sock = %ld\n" , sum); |
200 | printf("anon = %ld\n" , anon); |
201 | printf("file = %ld\n" , file); |
202 | printf("kernel = %ld\n" , kernel); |
203 | printf("sock = %ld\n" , sock); |
204 | } |
205 | |
206 | cleanup: |
207 | cg_destroy(cgroup: parent); |
208 | free(parent); |
209 | |
210 | return ret; |
211 | } |
212 | |
213 | /* |
214 | * The test reads the entire /proc/kpagecgroup. If the operation went |
215 | * successfully (and the kernel didn't panic), the test is treated as passed. |
216 | */ |
217 | static int test_kmem_proc_kpagecgroup(const char *root) |
218 | { |
219 | unsigned long buf[128]; |
220 | int ret = KSFT_FAIL; |
221 | ssize_t len; |
222 | int fd; |
223 | |
224 | fd = open("/proc/kpagecgroup" , O_RDONLY); |
225 | if (fd < 0) |
226 | return ret; |
227 | |
228 | do { |
229 | len = read(fd, buf, sizeof(buf)); |
230 | } while (len > 0); |
231 | |
232 | if (len == 0) |
233 | ret = KSFT_PASS; |
234 | |
235 | close(fd); |
236 | return ret; |
237 | } |
238 | |
/*
 * Thread body used by spawn_1000_threads(): sleeps for 100 seconds,
 * ignoring its argument, and then returns NULL.
 */
static void *pthread_wait_fn(void *arg)
{
	(void)arg;

	sleep(100);

	return NULL;
}
244 | |
245 | static int spawn_1000_threads(const char *cgroup, void *arg) |
246 | { |
247 | int nr_threads = 1000; |
248 | pthread_t *tinfo; |
249 | unsigned long i; |
250 | long stack; |
251 | int ret = -1; |
252 | |
253 | tinfo = calloc(nr_threads, sizeof(pthread_t)); |
254 | if (tinfo == NULL) |
255 | return -1; |
256 | |
257 | for (i = 0; i < nr_threads; i++) { |
258 | if (pthread_create(&tinfo[i], NULL, &pthread_wait_fn, |
259 | (void *)i)) { |
260 | free(tinfo); |
261 | return(-1); |
262 | } |
263 | } |
264 | |
265 | stack = cg_read_key_long(cgroup, control: "memory.stat" , key: "kernel_stack " ); |
266 | if (stack >= 4096 * 1000) |
267 | ret = 0; |
268 | |
269 | free(tinfo); |
270 | return ret; |
271 | } |
272 | |
273 | /* |
274 | * The test spawns a process, which spawns 1000 threads. Then it checks |
275 | * that memory.stat's kernel_stack is at least 1000 pages large. |
276 | */ |
277 | static int test_kmem_kernel_stacks(const char *root) |
278 | { |
279 | int ret = KSFT_FAIL; |
280 | char *cg = NULL; |
281 | |
282 | cg = cg_name(root, name: "kmem_kernel_stacks_test" ); |
283 | if (!cg) |
284 | goto cleanup; |
285 | |
286 | if (cg_create(cgroup: cg)) |
287 | goto cleanup; |
288 | |
289 | if (cg_run(cgroup: cg, fn: spawn_1000_threads, NULL)) |
290 | goto cleanup; |
291 | |
292 | ret = KSFT_PASS; |
293 | cleanup: |
294 | cg_destroy(cgroup: cg); |
295 | free(cg); |
296 | |
297 | return ret; |
298 | } |
299 | |
300 | /* |
301 | * This test sequentionally creates 30 child cgroups, allocates some |
302 | * kernel memory in each of them, and deletes them. Then it checks |
303 | * that the number of dying cgroups on the parent level is 0. |
304 | */ |
305 | static int test_kmem_dead_cgroups(const char *root) |
306 | { |
307 | int ret = KSFT_FAIL; |
308 | char *parent; |
309 | long dead; |
310 | int i; |
311 | |
312 | parent = cg_name(root, name: "kmem_dead_cgroups_test" ); |
313 | if (!parent) |
314 | goto cleanup; |
315 | |
316 | if (cg_create(cgroup: parent)) |
317 | goto cleanup; |
318 | |
319 | if (cg_write(cgroup: parent, control: "cgroup.subtree_control" , buf: "+memory" )) |
320 | goto cleanup; |
321 | |
322 | if (cg_run_in_subcgroups(parent, fn: alloc_dcache, arg: (void *)100, times: 30)) |
323 | goto cleanup; |
324 | |
325 | for (i = 0; i < 5; i++) { |
326 | dead = cg_read_key_long(cgroup: parent, control: "cgroup.stat" , |
327 | key: "nr_dying_descendants " ); |
328 | if (dead == 0) { |
329 | ret = KSFT_PASS; |
330 | break; |
331 | } |
332 | /* |
333 | * Reclaiming cgroups might take some time, |
334 | * let's wait a bit and repeat. |
335 | */ |
336 | sleep(1); |
337 | } |
338 | |
339 | cleanup: |
340 | cg_destroy(cgroup: parent); |
341 | free(parent); |
342 | |
343 | return ret; |
344 | } |
345 | |
346 | /* |
347 | * This test creates a sub-tree with 1000 memory cgroups. |
348 | * Then it checks that the memory.current on the parent level |
349 | * is greater than 0 and approximates matches the percpu value |
350 | * from memory.stat. |
351 | */ |
352 | static int test_percpu_basic(const char *root) |
353 | { |
354 | int ret = KSFT_FAIL; |
355 | char *parent, *child; |
356 | long current, percpu; |
357 | int i; |
358 | |
359 | parent = cg_name(root, name: "percpu_basic_test" ); |
360 | if (!parent) |
361 | goto cleanup; |
362 | |
363 | if (cg_create(cgroup: parent)) |
364 | goto cleanup; |
365 | |
366 | if (cg_write(cgroup: parent, control: "cgroup.subtree_control" , buf: "+memory" )) |
367 | goto cleanup; |
368 | |
369 | for (i = 0; i < 1000; i++) { |
370 | child = cg_name_indexed(root: parent, name: "child" , index: i); |
371 | if (!child) |
372 | return -1; |
373 | |
374 | if (cg_create(cgroup: child)) |
375 | goto cleanup_children; |
376 | |
377 | free(child); |
378 | } |
379 | |
380 | current = cg_read_long(cgroup: parent, control: "memory.current" ); |
381 | percpu = cg_read_key_long(cgroup: parent, control: "memory.stat" , key: "percpu " ); |
382 | |
383 | if (current > 0 && percpu > 0 && abs(current - percpu) < |
384 | MAX_VMSTAT_ERROR) |
385 | ret = KSFT_PASS; |
386 | else |
387 | printf("memory.current %ld\npercpu %ld\n" , |
388 | current, percpu); |
389 | |
390 | cleanup_children: |
391 | for (i = 0; i < 1000; i++) { |
392 | child = cg_name_indexed(root: parent, name: "child" , index: i); |
393 | cg_destroy(cgroup: child); |
394 | free(child); |
395 | } |
396 | |
397 | cleanup: |
398 | cg_destroy(cgroup: parent); |
399 | free(parent); |
400 | |
401 | return ret; |
402 | } |
403 | |
404 | #define T(x) { x, #x } |
405 | struct kmem_test { |
406 | int (*fn)(const char *root); |
407 | const char *name; |
408 | } tests[] = { |
409 | T(test_kmem_basic), |
410 | T(test_kmem_memcg_deletion), |
411 | T(test_kmem_proc_kpagecgroup), |
412 | T(test_kmem_kernel_stacks), |
413 | T(test_kmem_dead_cgroups), |
414 | T(test_percpu_basic), |
415 | }; |
416 | #undef T |
417 | |
418 | int main(int argc, char **argv) |
419 | { |
420 | char root[PATH_MAX]; |
421 | int i, ret = EXIT_SUCCESS; |
422 | |
423 | if (cg_find_unified_root(root, len: sizeof(root))) |
424 | ksft_exit_skip(msg: "cgroup v2 isn't mounted\n" ); |
425 | |
426 | /* |
427 | * Check that memory controller is available: |
428 | * memory is listed in cgroup.controllers |
429 | */ |
430 | if (cg_read_strstr(cgroup: root, control: "cgroup.controllers" , needle: "memory" )) |
431 | ksft_exit_skip(msg: "memory controller isn't available\n" ); |
432 | |
433 | if (cg_read_strstr(cgroup: root, control: "cgroup.subtree_control" , needle: "memory" )) |
434 | if (cg_write(cgroup: root, control: "cgroup.subtree_control" , buf: "+memory" )) |
435 | ksft_exit_skip(msg: "Failed to set memory controller\n" ); |
436 | |
437 | for (i = 0; i < ARRAY_SIZE(tests); i++) { |
438 | switch (tests[i].fn(root)) { |
439 | case KSFT_PASS: |
440 | ksft_test_result_pass(msg: "%s\n" , tests[i].name); |
441 | break; |
442 | case KSFT_SKIP: |
443 | ksft_test_result_skip(msg: "%s\n" , tests[i].name); |
444 | break; |
445 | default: |
446 | ret = EXIT_FAILURE; |
447 | ksft_test_result_fail(msg: "%s\n" , tests[i].name); |
448 | break; |
449 | } |
450 | } |
451 | |
452 | return ret; |
453 | } |
454 | |