1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Lockless hierarchical page accounting & limiting |
4 | * |
5 | * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner |
6 | */ |
7 | |
8 | #include <linux/page_counter.h> |
9 | #include <linux/atomic.h> |
10 | #include <linux/kernel.h> |
11 | #include <linux/string.h> |
12 | #include <linux/sched.h> |
13 | #include <linux/bug.h> |
14 | #include <asm/page.h> |
15 | |
16 | static void propagate_protected_usage(struct page_counter *c, |
17 | unsigned long usage) |
18 | { |
19 | unsigned long protected, old_protected; |
20 | long delta; |
21 | |
22 | if (!c->parent) |
23 | return; |
24 | |
25 | protected = min(usage, READ_ONCE(c->min)); |
26 | old_protected = atomic_long_read(v: &c->min_usage); |
27 | if (protected != old_protected) { |
28 | old_protected = atomic_long_xchg(v: &c->min_usage, new: protected); |
29 | delta = protected - old_protected; |
30 | if (delta) |
31 | atomic_long_add(i: delta, v: &c->parent->children_min_usage); |
32 | } |
33 | |
34 | protected = min(usage, READ_ONCE(c->low)); |
35 | old_protected = atomic_long_read(v: &c->low_usage); |
36 | if (protected != old_protected) { |
37 | old_protected = atomic_long_xchg(v: &c->low_usage, new: protected); |
38 | delta = protected - old_protected; |
39 | if (delta) |
40 | atomic_long_add(i: delta, v: &c->parent->children_low_usage); |
41 | } |
42 | } |
43 | |
44 | /** |
45 | * page_counter_cancel - take pages out of the local counter |
46 | * @counter: counter |
47 | * @nr_pages: number of pages to cancel |
48 | */ |
49 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) |
50 | { |
51 | long new; |
52 | |
53 | new = atomic_long_sub_return(i: nr_pages, v: &counter->usage); |
54 | /* More uncharges than charges? */ |
55 | if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n" , |
56 | new, nr_pages)) { |
57 | new = 0; |
58 | atomic_long_set(v: &counter->usage, i: new); |
59 | } |
60 | propagate_protected_usage(c: counter, usage: new); |
61 | } |
62 | |
63 | /** |
64 | * page_counter_charge - hierarchically charge pages |
65 | * @counter: counter |
66 | * @nr_pages: number of pages to charge |
67 | * |
68 | * NOTE: This does not consider any configured counter limits. |
69 | */ |
70 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) |
71 | { |
72 | struct page_counter *c; |
73 | |
74 | for (c = counter; c; c = c->parent) { |
75 | long new; |
76 | |
77 | new = atomic_long_add_return(i: nr_pages, v: &c->usage); |
78 | propagate_protected_usage(c, usage: new); |
79 | /* |
80 | * This is indeed racy, but we can live with some |
81 | * inaccuracy in the watermark. |
82 | */ |
83 | if (new > READ_ONCE(c->watermark)) |
84 | WRITE_ONCE(c->watermark, new); |
85 | } |
86 | } |
87 | |
88 | /** |
89 | * page_counter_try_charge - try to hierarchically charge pages |
90 | * @counter: counter |
91 | * @nr_pages: number of pages to charge |
92 | * @fail: points first counter to hit its limit, if any |
93 | * |
94 | * Returns %true on success, or %false and @fail if the counter or one |
95 | * of its ancestors has hit its configured limit. |
96 | */ |
97 | bool page_counter_try_charge(struct page_counter *counter, |
98 | unsigned long nr_pages, |
99 | struct page_counter **fail) |
100 | { |
101 | struct page_counter *c; |
102 | |
103 | for (c = counter; c; c = c->parent) { |
104 | long new; |
105 | /* |
106 | * Charge speculatively to avoid an expensive CAS. If |
107 | * a bigger charge fails, it might falsely lock out a |
108 | * racing smaller charge and send it into reclaim |
109 | * early, but the error is limited to the difference |
110 | * between the two sizes, which is less than 2M/4M in |
111 | * case of a THP locking out a regular page charge. |
112 | * |
113 | * The atomic_long_add_return() implies a full memory |
114 | * barrier between incrementing the count and reading |
115 | * the limit. When racing with page_counter_set_max(), |
116 | * we either see the new limit or the setter sees the |
117 | * counter has changed and retries. |
118 | */ |
119 | new = atomic_long_add_return(i: nr_pages, v: &c->usage); |
120 | if (new > c->max) { |
121 | atomic_long_sub(i: nr_pages, v: &c->usage); |
122 | /* |
123 | * This is racy, but we can live with some |
124 | * inaccuracy in the failcnt which is only used |
125 | * to report stats. |
126 | */ |
127 | data_race(c->failcnt++); |
128 | *fail = c; |
129 | goto failed; |
130 | } |
131 | propagate_protected_usage(c, usage: new); |
132 | /* |
133 | * Just like with failcnt, we can live with some |
134 | * inaccuracy in the watermark. |
135 | */ |
136 | if (new > READ_ONCE(c->watermark)) |
137 | WRITE_ONCE(c->watermark, new); |
138 | } |
139 | return true; |
140 | |
141 | failed: |
142 | for (c = counter; c != *fail; c = c->parent) |
143 | page_counter_cancel(counter: c, nr_pages); |
144 | |
145 | return false; |
146 | } |
147 | |
148 | /** |
149 | * page_counter_uncharge - hierarchically uncharge pages |
150 | * @counter: counter |
151 | * @nr_pages: number of pages to uncharge |
152 | */ |
153 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) |
154 | { |
155 | struct page_counter *c; |
156 | |
157 | for (c = counter; c; c = c->parent) |
158 | page_counter_cancel(counter: c, nr_pages); |
159 | } |
160 | |
161 | /** |
162 | * page_counter_set_max - set the maximum number of pages allowed |
163 | * @counter: counter |
164 | * @nr_pages: limit to set |
165 | * |
166 | * Returns 0 on success, -EBUSY if the current number of pages on the |
167 | * counter already exceeds the specified limit. |
168 | * |
169 | * The caller must serialize invocations on the same counter. |
170 | */ |
171 | int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) |
172 | { |
173 | for (;;) { |
174 | unsigned long old; |
175 | long usage; |
176 | |
177 | /* |
178 | * Update the limit while making sure that it's not |
179 | * below the concurrently-changing counter value. |
180 | * |
181 | * The xchg implies two full memory barriers before |
182 | * and after, so the read-swap-read is ordered and |
183 | * ensures coherency with page_counter_try_charge(): |
184 | * that function modifies the count before checking |
185 | * the limit, so if it sees the old limit, we see the |
186 | * modified counter and retry. |
187 | */ |
188 | usage = page_counter_read(counter); |
189 | |
190 | if (usage > nr_pages) |
191 | return -EBUSY; |
192 | |
193 | old = xchg(&counter->max, nr_pages); |
194 | |
195 | if (page_counter_read(counter) <= usage || nr_pages >= old) |
196 | return 0; |
197 | |
198 | counter->max = old; |
199 | cond_resched(); |
200 | } |
201 | } |
202 | |
203 | /** |
204 | * page_counter_set_min - set the amount of protected memory |
205 | * @counter: counter |
206 | * @nr_pages: value to set |
207 | * |
208 | * The caller must serialize invocations on the same counter. |
209 | */ |
210 | void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) |
211 | { |
212 | struct page_counter *c; |
213 | |
214 | WRITE_ONCE(counter->min, nr_pages); |
215 | |
216 | for (c = counter; c; c = c->parent) |
217 | propagate_protected_usage(c, usage: atomic_long_read(v: &c->usage)); |
218 | } |
219 | |
220 | /** |
221 | * page_counter_set_low - set the amount of protected memory |
222 | * @counter: counter |
223 | * @nr_pages: value to set |
224 | * |
225 | * The caller must serialize invocations on the same counter. |
226 | */ |
227 | void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) |
228 | { |
229 | struct page_counter *c; |
230 | |
231 | WRITE_ONCE(counter->low, nr_pages); |
232 | |
233 | for (c = counter; c; c = c->parent) |
234 | propagate_protected_usage(c, usage: atomic_long_read(v: &c->usage)); |
235 | } |
236 | |
237 | /** |
238 | * page_counter_memparse - memparse() for page counter limits |
239 | * @buf: string to parse |
240 | * @max: string meaning maximum possible value |
241 | * @nr_pages: returns the result in number of pages |
242 | * |
243 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be |
244 | * limited to %PAGE_COUNTER_MAX. |
245 | */ |
246 | int page_counter_memparse(const char *buf, const char *max, |
247 | unsigned long *nr_pages) |
248 | { |
249 | char *end; |
250 | u64 bytes; |
251 | |
252 | if (!strcmp(buf, max)) { |
253 | *nr_pages = PAGE_COUNTER_MAX; |
254 | return 0; |
255 | } |
256 | |
257 | bytes = memparse(ptr: buf, retptr: &end); |
258 | if (*end != '\0') |
259 | return -EINVAL; |
260 | |
261 | *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); |
262 | |
263 | return 0; |
264 | } |
265 | |