// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	long delta;

	if (!c->parent)
		return;

	if (c->min || atomic_long_read(&c->min_usage)) {
		if (usage <= c->min)
			protected = usage;
		else
			protected = 0;

		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	if (c->low || atomic_long_read(&c->low_usage)) {
		if (usage <= c->low)
			protected = usage;
		else
			protected = 0;

		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}
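
/*
 * A worked example of the propagation above, with hypothetical
 * numbers (not taken from any real workload): assume a child counter
 * with min == 100 whose usage drops from 120 pages to 80.
 *
 *	usage 120, min 100 -> protected = 0   (usage exceeds min)
 *	usage  80, min 100 -> protected = 80  (usage fully protected)
 *	delta = 80 - 0 = +80
 *	parent->children_min_usage += 80
 *
 * If usage later climbs back above 100, the xchg swaps min_usage back
 * to 0 and a delta of -80 is propagated, keeping the parent's view of
 * protected child usage consistent.
 */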

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	propagate_protected_usage(counter, new);
	/* More uncharges than charges? */
	WARN_ON_ONCE(new < 0);
}
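
/*
 * A minimal sketch of a direct caller (hypothetical; most code should
 * go through page_counter_uncharge() instead, which walks the whole
 * hierarchy):
 *
 *	page_counter_cancel(c, nr_pages);
 *
 * The failure path of page_counter_try_charge() below does exactly
 * this for each ancestor that had already been charged.
 */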

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->usage);
		propagate_protected_usage(c, new);
		/*
		 * This is indeed racy, but we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > c->watermark)
			c->watermark = new;
	}
}
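
/*
 * A minimal usage sketch (hypothetical caller; `counter' stands for
 * any counter initialized elsewhere): since limits are ignored here,
 * this suits charges that must not fail, with any overrun handled by
 * the caller, e.g. through later reclaim:
 *
 *	page_counter_charge(counter, nr_pages);
 *
 * Callers that need the limit enforced should use
 * page_counter_try_charge() instead.
 */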

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points to the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS. If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit. When racing with page_counter_set_max(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->usage);
		if (new > c->max) {
			atomic_long_sub(nr_pages, &c->usage);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt.
			 */
			c->failcnt++;
			*fail = c;
			goto failed;
		}
		propagate_protected_usage(c, new);
		/*
		 * Just like with failcnt, we can live with some
		 * inaccuracy in the watermark.
		 */
		if (new > c->watermark)
			c->watermark = new;
	}
	return true;

failed:
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}
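
/*
 * A usage sketch for the try-charge/reclaim cycle. This is
 * illustrative only: try_to_reclaim_pages() is a made-up stand-in
 * for a caller-provided reclaim routine, not a real API:
 *
 *	struct page_counter *fail;
 *
 *	while (!page_counter_try_charge(counter, nr_pages, &fail)) {
 *		if (!try_to_reclaim_pages(fail, nr_pages))
 *			return -ENOMEM;
 *	}
 *
 * On failure, @fail names the counter whose limit was hit, so reclaim
 * can be targeted at that level of the hierarchy.
 */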

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent)
		page_counter_cancel(c, nr_pages);
}
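
/*
 * Sketch of the expected pairing (hypothetical caller): a successful
 * charge must eventually be matched by an uncharge of the same size
 * against the same counter:
 *
 *	if (page_counter_try_charge(counter, nr_pages, &fail)) {
 *		... use the memory ...
 *		page_counter_uncharge(counter, nr_pages);
 *	}
 */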

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = atomic_long_read(&counter->usage);

		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		if (atomic_long_read(&counter->usage) <= usage)
			return 0;

		counter->max = old;
		cond_resched();
	}
}
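
/*
 * A usage sketch (hypothetical writer-side code behind a control
 * file; `set_max_lock' is a made-up mutex standing in for whatever
 * the caller uses to serialize invocations):
 *
 *	mutex_lock(&set_max_lock);
 *	err = page_counter_set_max(counter, new_limit);
 *	mutex_unlock(&set_max_lock);
 *
 * An -EBUSY return means usage already exceeds new_limit; a caller
 * that wants to force the limit down must reclaim pages first and
 * then retry.
 */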

/**
 * page_counter_set_min - set the amount of memory under hard protection
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	counter->min = nr_pages;

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_set_low - set the amount of memory under best-effort protection
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	counter->low = nr_pages;

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}
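
/*
 * A configuration sketch covering both setters above (hypothetical
 * values; `child' is a counter whose parent was set up with
 * page_counter_init()):
 *
 *	page_counter_set_min(&child, 256);	hard protection
 *	page_counter_set_low(&child, 512);	best-effort protection
 *
 * Afterwards, up to 256 (resp. 512) pages of the child's usage are
 * reported upward through children_min_usage/children_low_usage,
 * which the hierarchy consults when computing effective protection
 * during reclaim.
 */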

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}
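
/*
 * A worked example (assuming a 4K PAGE_SIZE; the strings are
 * illustrative):
 *
 *	unsigned long nr_pages;
 *
 *	page_counter_memparse("512M", "max", &nr_pages);
 *		-> returns 0, nr_pages == 131072  (512M / 4K)
 *	page_counter_memparse("max", "max", &nr_pages);
 *		-> returns 0, nr_pages == PAGE_COUNTER_MAX
 *	page_counter_memparse("512X", "max", &nr_pages);
 *		-> returns -EINVAL, nr_pages untouched
 */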