1 | /* |
2 | * |
3 | * Copyright IBM Corporation, 2012 |
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> |
5 | * |
6 | * Cgroup v2 |
7 | * Copyright (C) 2019 Red Hat, Inc. |
8 | * Author: Giuseppe Scrivano <gscrivan@redhat.com> |
9 | * |
10 | * This program is free software; you can redistribute it and/or modify it |
11 | * under the terms of version 2.1 of the GNU Lesser General Public License |
12 | * as published by the Free Software Foundation. |
13 | * |
14 | * This program is distributed in the hope that it would be useful, but |
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
17 | * |
18 | */ |
19 | |
20 | #include <linux/cgroup.h> |
21 | #include <linux/page_counter.h> |
22 | #include <linux/slab.h> |
23 | #include <linux/hugetlb.h> |
24 | #include <linux/hugetlb_cgroup.h> |
25 | |
/*
 * cftype->private encoding: the hstate index lives in the upper 16 bits,
 * a RES_* attribute (or per-file flag) in the lower 16 bits.
 */
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)

/* The root hugetlb cgroup; set once by the first css_alloc call. */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
31 | |
32 | static inline struct page_counter * |
33 | __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, |
34 | bool rsvd) |
35 | { |
36 | if (rsvd) |
37 | return &h_cg->rsvd_hugepage[idx]; |
38 | return &h_cg->hugepage[idx]; |
39 | } |
40 | |
41 | static inline struct page_counter * |
42 | hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) |
43 | { |
44 | return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd: false); |
45 | } |
46 | |
47 | static inline struct page_counter * |
48 | hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) |
49 | { |
50 | return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd: true); |
51 | } |
52 | |
53 | static inline |
54 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) |
55 | { |
56 | return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; |
57 | } |
58 | |
59 | static inline |
60 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) |
61 | { |
62 | return hugetlb_cgroup_from_css(s: task_css(task, subsys_id: hugetlb_cgrp_id)); |
63 | } |
64 | |
65 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) |
66 | { |
67 | return (h_cg == root_h_cgroup); |
68 | } |
69 | |
70 | static inline struct hugetlb_cgroup * |
71 | parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) |
72 | { |
73 | return hugetlb_cgroup_from_css(s: h_cg->css.parent); |
74 | } |
75 | |
76 | static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) |
77 | { |
78 | struct hstate *h; |
79 | |
80 | for_each_hstate(h) { |
81 | if (page_counter_read( |
82 | counter: hugetlb_cgroup_counter_from_cgroup(h_cg, idx: hstate_index(h)))) |
83 | return true; |
84 | } |
85 | return false; |
86 | } |
87 | |
88 | static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, |
89 | struct hugetlb_cgroup *parent_h_cgroup) |
90 | { |
91 | int idx; |
92 | |
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { |
94 | struct page_counter *fault_parent = NULL; |
95 | struct page_counter *rsvd_parent = NULL; |
96 | unsigned long limit; |
97 | int ret; |
98 | |
99 | if (parent_h_cgroup) { |
100 | fault_parent = hugetlb_cgroup_counter_from_cgroup( |
101 | h_cg: parent_h_cgroup, idx); |
102 | rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( |
103 | h_cg: parent_h_cgroup, idx); |
104 | } |
105 | page_counter_init(counter: hugetlb_cgroup_counter_from_cgroup(h_cg: h_cgroup, |
106 | idx), |
107 | parent: fault_parent); |
108 | page_counter_init( |
109 | counter: hugetlb_cgroup_counter_from_cgroup_rsvd(h_cg: h_cgroup, idx), |
110 | parent: rsvd_parent); |
111 | |
112 | limit = round_down(PAGE_COUNTER_MAX, |
113 | pages_per_huge_page(&hstates[idx])); |
114 | |
115 | ret = page_counter_set_max( |
116 | counter: hugetlb_cgroup_counter_from_cgroup(h_cg: h_cgroup, idx), |
117 | nr_pages: limit); |
118 | VM_BUG_ON(ret); |
119 | ret = page_counter_set_max( |
120 | counter: hugetlb_cgroup_counter_from_cgroup_rsvd(h_cg: h_cgroup, idx), |
121 | nr_pages: limit); |
122 | VM_BUG_ON(ret); |
123 | } |
124 | } |
125 | |
126 | static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) |
127 | { |
128 | int node; |
129 | |
130 | for_each_node(node) |
131 | kfree(objp: h_cgroup->nodeinfo[node]); |
132 | kfree(objp: h_cgroup); |
133 | } |
134 | |
135 | static struct cgroup_subsys_state * |
136 | hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
137 | { |
138 | struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(s: parent_css); |
139 | struct hugetlb_cgroup *h_cgroup; |
140 | int node; |
141 | |
142 | h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), |
143 | GFP_KERNEL); |
144 | |
145 | if (!h_cgroup) |
146 | return ERR_PTR(error: -ENOMEM); |
147 | |
148 | if (!parent_h_cgroup) |
149 | root_h_cgroup = h_cgroup; |
150 | |
151 | /* |
152 | * TODO: this routine can waste much memory for nodes which will |
153 | * never be onlined. It's better to use memory hotplug callback |
154 | * function. |
155 | */ |
156 | for_each_node(node) { |
157 | /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ |
158 | int node_to_alloc = |
159 | node_state(node, state: N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; |
160 | h_cgroup->nodeinfo[node] = |
161 | kzalloc_node(size: sizeof(struct hugetlb_cgroup_per_node), |
162 | GFP_KERNEL, node: node_to_alloc); |
163 | if (!h_cgroup->nodeinfo[node]) |
164 | goto fail_alloc_nodeinfo; |
165 | } |
166 | |
167 | hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); |
168 | return &h_cgroup->css; |
169 | |
170 | fail_alloc_nodeinfo: |
171 | hugetlb_cgroup_free(h_cgroup); |
172 | return ERR_PTR(error: -ENOMEM); |
173 | } |
174 | |
/* css_free callback: release the hugetlb cgroup backing @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
179 | |
180 | /* |
181 | * Should be called with hugetlb_lock held. |
182 | * Since we are holding hugetlb_lock, pages cannot get moved from |
183 | * active list or uncharged from the cgroup, So no need to get |
184 | * page reference and test for page active here. This function |
185 | * cannot fail. |
186 | */ |
187 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, |
188 | struct page *page) |
189 | { |
190 | unsigned int nr_pages; |
191 | struct page_counter *counter; |
192 | struct hugetlb_cgroup *page_hcg; |
193 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); |
194 | struct folio *folio = page_folio(page); |
195 | |
196 | page_hcg = hugetlb_cgroup_from_folio(folio); |
197 | /* |
198 | * We can have pages in active list without any cgroup |
199 | * ie, hugepage with less than 3 pages. We can safely |
200 | * ignore those pages. |
201 | */ |
202 | if (!page_hcg || page_hcg != h_cg) |
203 | goto out; |
204 | |
205 | nr_pages = compound_nr(page); |
206 | if (!parent) { |
207 | parent = root_h_cgroup; |
208 | /* root has no limit */ |
209 | page_counter_charge(counter: &parent->hugepage[idx], nr_pages); |
210 | } |
211 | counter = &h_cg->hugepage[idx]; |
212 | /* Take the pages off the local counter */ |
213 | page_counter_cancel(counter, nr_pages); |
214 | |
215 | set_hugetlb_cgroup(folio, h_cg: parent); |
216 | out: |
217 | return; |
218 | } |
219 | |
220 | /* |
221 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to |
222 | * the parent cgroup. |
223 | */ |
224 | static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) |
225 | { |
226 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: css); |
227 | struct hstate *h; |
228 | struct page *page; |
229 | |
230 | do { |
231 | for_each_hstate(h) { |
232 | spin_lock_irq(lock: &hugetlb_lock); |
233 | list_for_each_entry(page, &h->hugepage_activelist, lru) |
234 | hugetlb_cgroup_move_parent(idx: hstate_index(h), h_cg, page); |
235 | |
236 | spin_unlock_irq(lock: &hugetlb_lock); |
237 | } |
238 | cond_resched(); |
239 | } while (hugetlb_cgroup_have_usage(h_cg)); |
240 | } |
241 | |
242 | static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, |
243 | enum hugetlb_memory_event event) |
244 | { |
245 | atomic_long_inc(v: &hugetlb->events_local[idx][event]); |
246 | cgroup_file_notify(cfile: &hugetlb->events_local_file[idx]); |
247 | |
248 | do { |
249 | atomic_long_inc(v: &hugetlb->events[idx][event]); |
250 | cgroup_file_notify(cfile: &hugetlb->events_file[idx]); |
251 | } while ((hugetlb = parent_hugetlb_cgroup(h_cg: hugetlb)) && |
252 | !hugetlb_cgroup_is_root(h_cg: hugetlb)); |
253 | } |
254 | |
255 | static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
256 | struct hugetlb_cgroup **ptr, |
257 | bool rsvd) |
258 | { |
259 | int ret = 0; |
260 | struct page_counter *counter; |
261 | struct hugetlb_cgroup *h_cg = NULL; |
262 | |
263 | if (hugetlb_cgroup_disabled()) |
264 | goto done; |
265 | again: |
266 | rcu_read_lock(); |
267 | h_cg = hugetlb_cgroup_from_task(current); |
268 | if (!css_tryget(css: &h_cg->css)) { |
269 | rcu_read_unlock(); |
270 | goto again; |
271 | } |
272 | rcu_read_unlock(); |
273 | |
274 | if (!page_counter_try_charge( |
275 | counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), |
276 | nr_pages, fail: &counter)) { |
277 | ret = -ENOMEM; |
278 | hugetlb_event(hugetlb: h_cg, idx, event: HUGETLB_MAX); |
279 | css_put(css: &h_cg->css); |
280 | goto done; |
281 | } |
282 | /* Reservations take a reference to the css because they do not get |
283 | * reparented. |
284 | */ |
285 | if (!rsvd) |
286 | css_put(css: &h_cg->css); |
287 | done: |
288 | *ptr = h_cg; |
289 | return ret; |
290 | } |
291 | |
292 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, |
293 | struct hugetlb_cgroup **ptr) |
294 | { |
295 | return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, rsvd: false); |
296 | } |
297 | |
298 | int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, |
299 | struct hugetlb_cgroup **ptr) |
300 | { |
301 | return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, rsvd: true); |
302 | } |
303 | |
304 | /* Should be called with hugetlb_lock held */ |
305 | static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, |
306 | struct hugetlb_cgroup *h_cg, |
307 | struct folio *folio, bool rsvd) |
308 | { |
309 | if (hugetlb_cgroup_disabled() || !h_cg) |
310 | return; |
311 | |
312 | __set_hugetlb_cgroup(folio, h_cg, rsvd); |
313 | if (!rsvd) { |
314 | unsigned long usage = |
315 | h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; |
316 | /* |
317 | * This write is not atomic due to fetching usage and writing |
318 | * to it, but that's fine because we call this with |
319 | * hugetlb_lock held anyway. |
320 | */ |
321 | WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], |
322 | usage + nr_pages); |
323 | } |
324 | } |
325 | |
326 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, |
327 | struct hugetlb_cgroup *h_cg, |
328 | struct folio *folio) |
329 | { |
330 | __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, rsvd: false); |
331 | } |
332 | |
333 | void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, |
334 | struct hugetlb_cgroup *h_cg, |
335 | struct folio *folio) |
336 | { |
337 | __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, rsvd: true); |
338 | } |
339 | |
340 | /* |
341 | * Should be called with hugetlb_lock held |
342 | */ |
343 | static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, |
344 | struct folio *folio, bool rsvd) |
345 | { |
346 | struct hugetlb_cgroup *h_cg; |
347 | |
348 | if (hugetlb_cgroup_disabled()) |
349 | return; |
350 | lockdep_assert_held(&hugetlb_lock); |
351 | h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); |
352 | if (unlikely(!h_cg)) |
353 | return; |
354 | __set_hugetlb_cgroup(folio, NULL, rsvd); |
355 | |
356 | page_counter_uncharge(counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, |
357 | rsvd), |
358 | nr_pages); |
359 | |
360 | if (rsvd) |
361 | css_put(css: &h_cg->css); |
362 | else { |
363 | unsigned long usage = |
364 | h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; |
365 | /* |
366 | * This write is not atomic due to fetching usage and writing |
367 | * to it, but that's fine because we call this with |
368 | * hugetlb_lock held anyway. |
369 | */ |
370 | WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], |
371 | usage - nr_pages); |
372 | } |
373 | } |
374 | |
375 | void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, |
376 | struct folio *folio) |
377 | { |
378 | __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, rsvd: false); |
379 | } |
380 | |
381 | void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, |
382 | struct folio *folio) |
383 | { |
384 | __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, rsvd: true); |
385 | } |
386 | |
387 | static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, |
388 | struct hugetlb_cgroup *h_cg, |
389 | bool rsvd) |
390 | { |
391 | if (hugetlb_cgroup_disabled() || !h_cg) |
392 | return; |
393 | |
394 | page_counter_uncharge(counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, |
395 | rsvd), |
396 | nr_pages); |
397 | |
398 | if (rsvd) |
399 | css_put(css: &h_cg->css); |
400 | } |
401 | |
402 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, |
403 | struct hugetlb_cgroup *h_cg) |
404 | { |
405 | __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, rsvd: false); |
406 | } |
407 | |
408 | void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, |
409 | struct hugetlb_cgroup *h_cg) |
410 | { |
411 | __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, rsvd: true); |
412 | } |
413 | |
414 | void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, |
415 | unsigned long end) |
416 | { |
417 | if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || |
418 | !resv->css) |
419 | return; |
420 | |
421 | page_counter_uncharge(counter: resv->reservation_counter, |
422 | nr_pages: (end - start) * resv->pages_per_hpage); |
423 | css_put(css: resv->css); |
424 | } |
425 | |
426 | void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, |
427 | struct file_region *rg, |
428 | unsigned long nr_pages, |
429 | bool region_del) |
430 | { |
431 | if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) |
432 | return; |
433 | |
434 | if (rg->reservation_counter && resv->pages_per_hpage && |
435 | !resv->reservation_counter) { |
436 | page_counter_uncharge(counter: rg->reservation_counter, |
437 | nr_pages: nr_pages * resv->pages_per_hpage); |
438 | /* |
439 | * Only do css_put(rg->css) when we delete the entire region |
440 | * because one file_region must hold exactly one css reference. |
441 | */ |
442 | if (region_del) |
443 | css_put(css: rg->css); |
444 | } |
445 | } |
446 | |
/* MEMFILE_ATTR() values: which resource a cgroup file reports or controls. */
enum {
	RES_USAGE,		/* current fault-charge usage */
	RES_RSVD_USAGE,		/* current reservation usage */
	RES_LIMIT,		/* fault-charge limit */
	RES_RSVD_LIMIT,		/* reservation limit */
	RES_MAX_USAGE,		/* fault usage high watermark */
	RES_RSVD_MAX_USAGE,	/* reservation usage high watermark */
	RES_FAILCNT,		/* fault charge failure count */
	RES_RSVD_FAILCNT,	/* reservation charge failure count */
};
457 | |
458 | static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) |
459 | { |
460 | int nid; |
461 | struct cftype *cft = seq_cft(seq); |
462 | int idx = MEMFILE_IDX(cft->private); |
463 | bool legacy = MEMFILE_ATTR(cft->private); |
464 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq)); |
465 | struct cgroup_subsys_state *css; |
466 | unsigned long usage; |
467 | |
468 | if (legacy) { |
469 | /* Add up usage across all nodes for the non-hierarchical total. */ |
470 | usage = 0; |
471 | for_each_node_state(nid, N_MEMORY) |
472 | usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); |
473 | seq_printf(m: seq, fmt: "total=%lu" , usage * PAGE_SIZE); |
474 | |
475 | /* Simply print the per-node usage for the non-hierarchical total. */ |
476 | for_each_node_state(nid, N_MEMORY) |
477 | seq_printf(m: seq, fmt: " N%d=%lu" , nid, |
478 | READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * |
479 | PAGE_SIZE); |
480 | seq_putc(m: seq, c: '\n'); |
481 | } |
482 | |
483 | /* |
484 | * The hierarchical total is pretty much the value recorded by the |
485 | * counter, so use that. |
486 | */ |
487 | seq_printf(m: seq, fmt: "%stotal=%lu" , legacy ? "hierarchical_" : "" , |
488 | page_counter_read(counter: &h_cg->hugepage[idx]) * PAGE_SIZE); |
489 | |
490 | /* |
491 | * For each node, transverse the css tree to obtain the hierarchical |
492 | * node usage. |
493 | */ |
494 | for_each_node_state(nid, N_MEMORY) { |
495 | usage = 0; |
496 | rcu_read_lock(); |
497 | css_for_each_descendant_pre(css, &h_cg->css) { |
498 | usage += READ_ONCE(hugetlb_cgroup_from_css(css) |
499 | ->nodeinfo[nid] |
500 | ->usage[idx]); |
501 | } |
502 | rcu_read_unlock(); |
503 | seq_printf(m: seq, fmt: " N%d=%lu" , nid, usage * PAGE_SIZE); |
504 | } |
505 | |
506 | seq_putc(m: seq, c: '\n'); |
507 | |
508 | return 0; |
509 | } |
510 | |
511 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, |
512 | struct cftype *cft) |
513 | { |
514 | struct page_counter *counter; |
515 | struct page_counter *rsvd_counter; |
516 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: css); |
517 | |
518 | counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; |
519 | rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; |
520 | |
521 | switch (MEMFILE_ATTR(cft->private)) { |
522 | case RES_USAGE: |
523 | return (u64)page_counter_read(counter) * PAGE_SIZE; |
524 | case RES_RSVD_USAGE: |
525 | return (u64)page_counter_read(counter: rsvd_counter) * PAGE_SIZE; |
526 | case RES_LIMIT: |
527 | return (u64)counter->max * PAGE_SIZE; |
528 | case RES_RSVD_LIMIT: |
529 | return (u64)rsvd_counter->max * PAGE_SIZE; |
530 | case RES_MAX_USAGE: |
531 | return (u64)counter->watermark * PAGE_SIZE; |
532 | case RES_RSVD_MAX_USAGE: |
533 | return (u64)rsvd_counter->watermark * PAGE_SIZE; |
534 | case RES_FAILCNT: |
535 | return counter->failcnt; |
536 | case RES_RSVD_FAILCNT: |
537 | return rsvd_counter->failcnt; |
538 | default: |
539 | BUG(); |
540 | } |
541 | } |
542 | |
543 | static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) |
544 | { |
545 | int idx; |
546 | u64 val; |
547 | struct cftype *cft = seq_cft(seq); |
548 | unsigned long limit; |
549 | struct page_counter *counter; |
550 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq)); |
551 | |
552 | idx = MEMFILE_IDX(cft->private); |
553 | counter = &h_cg->hugepage[idx]; |
554 | |
555 | limit = round_down(PAGE_COUNTER_MAX, |
556 | pages_per_huge_page(&hstates[idx])); |
557 | |
558 | switch (MEMFILE_ATTR(cft->private)) { |
559 | case RES_RSVD_USAGE: |
560 | counter = &h_cg->rsvd_hugepage[idx]; |
561 | fallthrough; |
562 | case RES_USAGE: |
563 | val = (u64)page_counter_read(counter); |
564 | seq_printf(m: seq, fmt: "%llu\n" , val * PAGE_SIZE); |
565 | break; |
566 | case RES_RSVD_LIMIT: |
567 | counter = &h_cg->rsvd_hugepage[idx]; |
568 | fallthrough; |
569 | case RES_LIMIT: |
570 | val = (u64)counter->max; |
571 | if (val == limit) |
572 | seq_puts(m: seq, s: "max\n" ); |
573 | else |
574 | seq_printf(m: seq, fmt: "%llu\n" , val * PAGE_SIZE); |
575 | break; |
576 | default: |
577 | BUG(); |
578 | } |
579 | |
580 | return 0; |
581 | } |
582 | |
/* Serialises writers to the per-hstate limit files. */
static DEFINE_MUTEX(hugetlb_limit_mutex);
584 | |
585 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, |
586 | char *buf, size_t nbytes, loff_t off, |
587 | const char *max) |
588 | { |
589 | int ret, idx; |
590 | unsigned long nr_pages; |
591 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: of_css(of)); |
592 | bool rsvd = false; |
593 | |
594 | if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ |
595 | return -EINVAL; |
596 | |
597 | buf = strstrip(str: buf); |
598 | ret = page_counter_memparse(buf, max, nr_pages: &nr_pages); |
599 | if (ret) |
600 | return ret; |
601 | |
602 | idx = MEMFILE_IDX(of_cft(of)->private); |
603 | nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); |
604 | |
605 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
606 | case RES_RSVD_LIMIT: |
607 | rsvd = true; |
608 | fallthrough; |
609 | case RES_LIMIT: |
610 | mutex_lock(&hugetlb_limit_mutex); |
611 | ret = page_counter_set_max( |
612 | counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), |
613 | nr_pages); |
614 | mutex_unlock(lock: &hugetlb_limit_mutex); |
615 | break; |
616 | default: |
617 | ret = -EINVAL; |
618 | break; |
619 | } |
620 | return ret ?: nbytes; |
621 | } |
622 | |
623 | static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, |
624 | char *buf, size_t nbytes, loff_t off) |
625 | { |
626 | return hugetlb_cgroup_write(of, buf, nbytes, off, max: "-1" ); |
627 | } |
628 | |
629 | static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, |
630 | char *buf, size_t nbytes, loff_t off) |
631 | { |
632 | return hugetlb_cgroup_write(of, buf, nbytes, off, max: "max" ); |
633 | } |
634 | |
635 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, |
636 | char *buf, size_t nbytes, loff_t off) |
637 | { |
638 | int ret = 0; |
639 | struct page_counter *counter, *rsvd_counter; |
640 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: of_css(of)); |
641 | |
642 | counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
643 | rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
644 | |
645 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
646 | case RES_MAX_USAGE: |
647 | page_counter_reset_watermark(counter); |
648 | break; |
649 | case RES_RSVD_MAX_USAGE: |
650 | page_counter_reset_watermark(counter: rsvd_counter); |
651 | break; |
652 | case RES_FAILCNT: |
653 | counter->failcnt = 0; |
654 | break; |
655 | case RES_RSVD_FAILCNT: |
656 | rsvd_counter->failcnt = 0; |
657 | break; |
658 | default: |
659 | ret = -EINVAL; |
660 | break; |
661 | } |
662 | return ret ?: nbytes; |
663 | } |
664 | |
665 | static char *mem_fmt(char *buf, int size, unsigned long hsize) |
666 | { |
667 | if (hsize >= SZ_1G) |
668 | snprintf(buf, size, fmt: "%luGB" , hsize / SZ_1G); |
669 | else if (hsize >= SZ_1M) |
670 | snprintf(buf, size, fmt: "%luMB" , hsize / SZ_1M); |
671 | else |
672 | snprintf(buf, size, fmt: "%luKB" , hsize / SZ_1K); |
673 | return buf; |
674 | } |
675 | |
676 | static int __hugetlb_events_show(struct seq_file *seq, bool local) |
677 | { |
678 | int idx; |
679 | long max; |
680 | struct cftype *cft = seq_cft(seq); |
681 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq)); |
682 | |
683 | idx = MEMFILE_IDX(cft->private); |
684 | |
685 | if (local) |
686 | max = atomic_long_read(v: &h_cg->events_local[idx][HUGETLB_MAX]); |
687 | else |
688 | max = atomic_long_read(v: &h_cg->events[idx][HUGETLB_MAX]); |
689 | |
690 | seq_printf(m: seq, fmt: "max %lu\n" , max); |
691 | |
692 | return 0; |
693 | } |
694 | |
695 | static int hugetlb_events_show(struct seq_file *seq, void *v) |
696 | { |
697 | return __hugetlb_events_show(seq, local: false); |
698 | } |
699 | |
700 | static int hugetlb_events_local_show(struct seq_file *seq, void *v) |
701 | { |
702 | return __hugetlb_events_show(seq, local: true); |
703 | } |
704 | |
705 | static void __init __hugetlb_cgroup_file_dfl_init(int idx) |
706 | { |
707 | char buf[32]; |
708 | struct cftype *cft; |
709 | struct hstate *h = &hstates[idx]; |
710 | |
711 | /* format the size */ |
712 | mem_fmt(buf, size: sizeof(buf), hsize: huge_page_size(h)); |
713 | |
714 | /* Add the limit file */ |
715 | cft = &h->cgroup_files_dfl[0]; |
716 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.max" , buf); |
717 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); |
718 | cft->seq_show = hugetlb_cgroup_read_u64_max; |
719 | cft->write = hugetlb_cgroup_write_dfl; |
720 | cft->flags = CFTYPE_NOT_ON_ROOT; |
721 | |
722 | /* Add the reservation limit file */ |
723 | cft = &h->cgroup_files_dfl[1]; |
724 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.max" , buf); |
725 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); |
726 | cft->seq_show = hugetlb_cgroup_read_u64_max; |
727 | cft->write = hugetlb_cgroup_write_dfl; |
728 | cft->flags = CFTYPE_NOT_ON_ROOT; |
729 | |
730 | /* Add the current usage file */ |
731 | cft = &h->cgroup_files_dfl[2]; |
732 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.current" , buf); |
733 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); |
734 | cft->seq_show = hugetlb_cgroup_read_u64_max; |
735 | cft->flags = CFTYPE_NOT_ON_ROOT; |
736 | |
737 | /* Add the current reservation usage file */ |
738 | cft = &h->cgroup_files_dfl[3]; |
739 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.current" , buf); |
740 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); |
741 | cft->seq_show = hugetlb_cgroup_read_u64_max; |
742 | cft->flags = CFTYPE_NOT_ON_ROOT; |
743 | |
744 | /* Add the events file */ |
745 | cft = &h->cgroup_files_dfl[4]; |
746 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.events" , buf); |
747 | cft->private = MEMFILE_PRIVATE(idx, 0); |
748 | cft->seq_show = hugetlb_events_show; |
749 | cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); |
750 | cft->flags = CFTYPE_NOT_ON_ROOT; |
751 | |
752 | /* Add the events.local file */ |
753 | cft = &h->cgroup_files_dfl[5]; |
754 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.events.local" , buf); |
755 | cft->private = MEMFILE_PRIVATE(idx, 0); |
756 | cft->seq_show = hugetlb_events_local_show; |
757 | cft->file_offset = offsetof(struct hugetlb_cgroup, |
758 | events_local_file[idx]); |
759 | cft->flags = CFTYPE_NOT_ON_ROOT; |
760 | |
761 | /* Add the numa stat file */ |
762 | cft = &h->cgroup_files_dfl[6]; |
763 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.numa_stat" , buf); |
764 | cft->private = MEMFILE_PRIVATE(idx, 0); |
765 | cft->seq_show = hugetlb_cgroup_read_numa_stat; |
766 | cft->flags = CFTYPE_NOT_ON_ROOT; |
767 | |
768 | /* NULL terminate the last cft */ |
769 | cft = &h->cgroup_files_dfl[7]; |
770 | memset(cft, 0, sizeof(*cft)); |
771 | |
772 | WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, |
773 | h->cgroup_files_dfl)); |
774 | } |
775 | |
776 | static void __init __hugetlb_cgroup_file_legacy_init(int idx) |
777 | { |
778 | char buf[32]; |
779 | struct cftype *cft; |
780 | struct hstate *h = &hstates[idx]; |
781 | |
782 | /* format the size */ |
783 | mem_fmt(buf, size: sizeof(buf), hsize: huge_page_size(h)); |
784 | |
785 | /* Add the limit file */ |
786 | cft = &h->cgroup_files_legacy[0]; |
787 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.limit_in_bytes" , buf); |
788 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); |
789 | cft->read_u64 = hugetlb_cgroup_read_u64; |
790 | cft->write = hugetlb_cgroup_write_legacy; |
791 | |
792 | /* Add the reservation limit file */ |
793 | cft = &h->cgroup_files_legacy[1]; |
794 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.limit_in_bytes" , buf); |
795 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); |
796 | cft->read_u64 = hugetlb_cgroup_read_u64; |
797 | cft->write = hugetlb_cgroup_write_legacy; |
798 | |
799 | /* Add the usage file */ |
800 | cft = &h->cgroup_files_legacy[2]; |
801 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.usage_in_bytes" , buf); |
802 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); |
803 | cft->read_u64 = hugetlb_cgroup_read_u64; |
804 | |
805 | /* Add the reservation usage file */ |
806 | cft = &h->cgroup_files_legacy[3]; |
807 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.usage_in_bytes" , buf); |
808 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); |
809 | cft->read_u64 = hugetlb_cgroup_read_u64; |
810 | |
811 | /* Add the MAX usage file */ |
812 | cft = &h->cgroup_files_legacy[4]; |
813 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.max_usage_in_bytes" , buf); |
814 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); |
815 | cft->write = hugetlb_cgroup_reset; |
816 | cft->read_u64 = hugetlb_cgroup_read_u64; |
817 | |
818 | /* Add the MAX reservation usage file */ |
819 | cft = &h->cgroup_files_legacy[5]; |
820 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.max_usage_in_bytes" , buf); |
821 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); |
822 | cft->write = hugetlb_cgroup_reset; |
823 | cft->read_u64 = hugetlb_cgroup_read_u64; |
824 | |
825 | /* Add the failcntfile */ |
826 | cft = &h->cgroup_files_legacy[6]; |
827 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.failcnt" , buf); |
828 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); |
829 | cft->write = hugetlb_cgroup_reset; |
830 | cft->read_u64 = hugetlb_cgroup_read_u64; |
831 | |
832 | /* Add the reservation failcntfile */ |
833 | cft = &h->cgroup_files_legacy[7]; |
834 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.rsvd.failcnt" , buf); |
835 | cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT); |
836 | cft->write = hugetlb_cgroup_reset; |
837 | cft->read_u64 = hugetlb_cgroup_read_u64; |
838 | |
839 | /* Add the numa stat file */ |
840 | cft = &h->cgroup_files_legacy[8]; |
841 | snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.numa_stat" , buf); |
842 | cft->private = MEMFILE_PRIVATE(idx, 1); |
843 | cft->seq_show = hugetlb_cgroup_read_numa_stat; |
844 | |
845 | /* NULL terminate the last cft */ |
846 | cft = &h->cgroup_files_legacy[9]; |
847 | memset(cft, 0, sizeof(*cft)); |
848 | |
849 | WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, |
850 | h->cgroup_files_legacy)); |
851 | } |
852 | |
/* Register both the v2 (default) and v1 (legacy) files for hstate @idx. */
static void __init __hugetlb_cgroup_file_init(int idx)
{
	__hugetlb_cgroup_file_dfl_init(idx);
	__hugetlb_cgroup_file_legacy_init(idx);
}
858 | |
859 | void __init hugetlb_cgroup_file_init(void) |
860 | { |
861 | struct hstate *h; |
862 | |
863 | for_each_hstate(h) |
864 | __hugetlb_cgroup_file_init(idx: hstate_index(h)); |
865 | } |
866 | |
867 | /* |
868 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen |
869 | * when we migrate hugepages |
870 | */ |
871 | void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) |
872 | { |
873 | struct hugetlb_cgroup *h_cg; |
874 | struct hugetlb_cgroup *h_cg_rsvd; |
875 | struct hstate *h = folio_hstate(folio: old_folio); |
876 | |
877 | if (hugetlb_cgroup_disabled()) |
878 | return; |
879 | |
880 | spin_lock_irq(lock: &hugetlb_lock); |
881 | h_cg = hugetlb_cgroup_from_folio(folio: old_folio); |
882 | h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(folio: old_folio); |
883 | set_hugetlb_cgroup(folio: old_folio, NULL); |
884 | set_hugetlb_cgroup_rsvd(folio: old_folio, NULL); |
885 | |
886 | /* move the h_cg details to new cgroup */ |
887 | set_hugetlb_cgroup(folio: new_folio, h_cg); |
888 | set_hugetlb_cgroup_rsvd(folio: new_folio, h_cg: h_cg_rsvd); |
889 | list_move(list: &new_folio->lru, head: &h->hugepage_activelist); |
890 | spin_unlock_irq(lock: &hugetlb_lock); |
891 | return; |
892 | } |
893 | |
/*
 * No static subsystem-wide files; the per-hstate files are generated and
 * registered at boot by hugetlb_cgroup_file_init().  Only the terminator
 * lives here.
 */
static struct cftype hugetlb_files[] = {
	{} /* terminate */
};
897 | |
/* hugetlb cgroup subsystem, registered on both cgroup v1 and v2. */
struct cgroup_subsys hugetlb_cgrp_subsys = {
	.css_alloc = hugetlb_cgroup_css_alloc,
	.css_offline = hugetlb_cgroup_css_offline,
	.css_free = hugetlb_cgroup_css_free,
	.dfl_cftypes = hugetlb_files,
	.legacy_cftypes = hugetlb_files,
};
905 | |