1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | |
3 | /* |
4 | * rcuref - A scalable reference count implementation for RCU managed objects |
5 | * |
6 | * rcuref is provided to replace open coded reference count implementations |
7 | * based on atomic_t. It protects explicitely RCU managed objects which can |
8 | * be visible even after the last reference has been dropped and the object |
9 | * is heading towards destruction. |
10 | * |
11 | * A common usage pattern is: |
12 | * |
13 | * get() |
14 | * rcu_read_lock(); |
15 | * p = get_ptr(); |
16 | * if (p && !atomic_inc_not_zero(&p->refcnt)) |
17 | * p = NULL; |
18 | * rcu_read_unlock(); |
19 | * return p; |
20 | * |
21 | * put() |
22 | * if (!atomic_dec_return(&->refcnt)) { |
23 | * remove_ptr(p); |
24 | * kfree_rcu((p, rcu); |
25 | * } |
26 | * |
27 | * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has |
28 | * O(N^2) behaviour under contention with N concurrent operations. |
29 | * |
30 | * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales |
31 | * better under contention. |
32 | * |
33 | * Why not refcount? |
34 | * ================= |
35 | * |
36 | * In principle it should be possible to make refcount use the rcuref |
37 | * scheme, but the destruction race described below cannot be prevented |
38 | * unless the protected object is RCU managed. |
39 | * |
40 | * Theory of operation |
41 | * =================== |
42 | * |
43 | * rcuref uses an unsigned integer reference counter. As long as the |
44 | * counter value is greater than or equal to RCUREF_ONEREF and not larger |
45 | * than RCUREF_MAXREF the reference is alive: |
46 | * |
47 | * ONEREF MAXREF SATURATED RELEASED DEAD NOREF |
48 | * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF |
49 | * <---valid --------> <-------saturation zone-------> <-----dead zone-----> |
50 | * |
51 | * The get() and put() operations do unconditional increments and |
52 | * decrements. The result is checked after the operation. This optimizes |
53 | * for the fast path. |
54 | * |
55 | * If the reference count is saturated or dead, then the increments and |
56 | * decrements are not harmful as the reference count still stays in the |
57 | * respective zones and is always set back to STATURATED resp. DEAD. The |
58 | * zones have room for 2^28 racing operations in each direction, which |
59 | * makes it practically impossible to escape the zones. |
60 | * |
61 | * Once the last reference is dropped the reference count becomes |
62 | * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The |
63 | * slowpath then tries to set the reference count from RCUREF_NOREF to |
64 | * RCUREF_DEAD via a cmpxchg(). This opens a small window where a |
65 | * concurrent rcuref_get() can acquire the reference count and bring it |
66 | * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. |
67 | * |
68 | * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in |
69 | * DEAD + 1, which is inside the dead zone. If that happens the reference |
70 | * count is put back to DEAD. |
71 | * |
72 | * The actual race is possible due to the unconditional increment and |
73 | * decrements in rcuref_get() and rcuref_put(): |
74 | * |
75 | * T1 T2 |
76 | * get() put() |
77 | * if (atomic_add_negative(-1, &ref->refcnt)) |
78 | * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); |
79 | * |
80 | * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 |
81 | * |
82 | * As the result of T1's add is negative, the get() goes into the slow path |
83 | * and observes refcnt being in the dead zone which makes the operation fail. |
84 | * |
85 | * Possible critical states: |
86 | * |
87 | * Context Counter References Operation |
88 | * T1 0 1 init() |
89 | * T2 1 2 get() |
90 | * T1 0 1 put() |
91 | * T2 -1 0 put() tries to mark dead |
92 | * T1 0 1 get() |
93 | * T2 0 1 put() mark dead fails |
94 | * T1 -1 0 put() tries to mark dead |
95 | * T1 DEAD 0 put() mark dead succeeds |
96 | * T2 DEAD+1 0 get() fails and puts it back to DEAD |
97 | * |
98 | * Of course there are more complex scenarios, but the above illustrates |
99 | * the working principle. The rest is left to the imagination of the |
100 | * reader. |
101 | * |
102 | * Deconstruction race |
103 | * =================== |
104 | * |
105 | * The release operation must be protected by prohibiting a grace period in |
106 | * order to prevent a possible use after free: |
107 | * |
108 | * T1 T2 |
109 | * put() get() |
110 | * // ref->refcnt = ONEREF |
111 | * if (!atomic_add_negative(-1, &ref->refcnt)) |
112 | * return false; <- Not taken |
113 | * |
114 | * // ref->refcnt == NOREF |
115 | * --> preemption |
116 | * // Elevates ref->refcnt to ONEREF |
117 | * if (!atomic_add_negative(1, &ref->refcnt)) |
118 | * return true; <- taken |
119 | * |
120 | * if (put(&p->ref)) { <-- Succeeds |
121 | * remove_pointer(p); |
122 | * kfree_rcu(p, rcu); |
123 | * } |
124 | * |
125 | * RCU grace period ends, object is freed |
126 | * |
127 | * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF |
128 | * |
129 | * This is prevented by disabling preemption around the put() operation as |
130 | * that's in most kernel configurations cheaper than a rcu_read_lock() / |
131 | * rcu_read_unlock() pair and in many cases even a NOOP. In any case it |
132 | * prevents the grace period which keeps the object alive until all put() |
133 | * operations complete. |
134 | * |
135 | * Saturation protection |
136 | * ===================== |
137 | * |
138 | * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). |
139 | * Once this is exceedded the reference count becomes stale by setting it |
140 | * to RCUREF_SATURATED, which will cause a memory leak, but it prevents |
141 | * wrap arounds which obviously cause worse problems than a memory |
142 | * leak. When saturation is reached a warning is emitted. |
143 | * |
144 | * Race conditions |
145 | * =============== |
146 | * |
147 | * All reference count increment/decrement operations are unconditional and |
148 | * only verified after the fact. This optimizes for the good case and takes |
149 | * the occasional race vs. a dead or already saturated refcount into |
150 | * account. The saturation and dead zones are large enough to accomodate |
151 | * for that. |
152 | * |
153 | * Memory ordering |
154 | * =============== |
155 | * |
156 | * Memory ordering rules are slightly relaxed wrt regular atomic_t functions |
157 | * and provide only what is strictly required for refcounts. |
158 | * |
159 | * The increments are fully relaxed; these will not provide ordering. The |
160 | * rationale is that whatever is used to obtain the object to increase the |
161 | * reference count on will provide the ordering. For locked data |
162 | * structures, its the lock acquire, for RCU/lockless data structures its |
163 | * the dependent load. |
164 | * |
165 | * rcuref_get() provides a control dependency ordering future stores which |
166 | * ensures that the object is not modified when acquiring a reference |
167 | * fails. |
168 | * |
169 | * rcuref_put() provides release order, i.e. all prior loads and stores |
170 | * will be issued before. It also provides a control dependency ordering |
171 | * against the subsequent destruction of the object. |
172 | * |
173 | * If rcuref_put() successfully dropped the last reference and marked the |
174 | * object DEAD it also provides acquire ordering. |
175 | */ |
176 | |
177 | #include <linux/export.h> |
178 | #include <linux/rcuref.h> |
179 | |
180 | /** |
181 | * rcuref_get_slowpath - Slowpath of rcuref_get() |
182 | * @ref: Pointer to the reference count |
183 | * |
184 | * Invoked when the reference count is outside of the valid zone. |
185 | * |
186 | * Return: |
187 | * False if the reference count was already marked dead |
188 | * |
189 | * True if the reference count is saturated, which prevents the |
190 | * object from being deconstructed ever. |
191 | */ |
192 | bool rcuref_get_slowpath(rcuref_t *ref) |
193 | { |
194 | unsigned int cnt = atomic_read(v: &ref->refcnt); |
195 | |
196 | /* |
197 | * If the reference count was already marked dead, undo the |
198 | * increment so it stays in the middle of the dead zone and return |
199 | * fail. |
200 | */ |
201 | if (cnt >= RCUREF_RELEASED) { |
202 | atomic_set(v: &ref->refcnt, RCUREF_DEAD); |
203 | return false; |
204 | } |
205 | |
206 | /* |
207 | * If it was saturated, warn and mark it so. In case the increment |
208 | * was already on a saturated value restore the saturation |
209 | * marker. This keeps it in the middle of the saturation zone and |
210 | * prevents the reference count from overflowing. This leaks the |
211 | * object memory, but prevents the obvious reference count overflow |
212 | * damage. |
213 | */ |
214 | if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory" )) |
215 | atomic_set(v: &ref->refcnt, RCUREF_SATURATED); |
216 | return true; |
217 | } |
218 | EXPORT_SYMBOL_GPL(rcuref_get_slowpath); |
219 | |
220 | /** |
221 | * rcuref_put_slowpath - Slowpath of __rcuref_put() |
222 | * @ref: Pointer to the reference count |
223 | * |
224 | * Invoked when the reference count is outside of the valid zone. |
225 | * |
226 | * Return: |
227 | * True if this was the last reference with no future references |
228 | * possible. This signals the caller that it can safely schedule the |
229 | * object, which is protected by the reference counter, for |
230 | * deconstruction. |
231 | * |
232 | * False if there are still active references or the put() raced |
233 | * with a concurrent get()/put() pair. Caller is not allowed to |
234 | * deconstruct the protected object. |
235 | */ |
236 | bool rcuref_put_slowpath(rcuref_t *ref) |
237 | { |
238 | unsigned int cnt = atomic_read(v: &ref->refcnt); |
239 | |
240 | /* Did this drop the last reference? */ |
241 | if (likely(cnt == RCUREF_NOREF)) { |
242 | /* |
243 | * Carefully try to set the reference count to RCUREF_DEAD. |
244 | * |
245 | * This can fail if a concurrent get() operation has |
246 | * elevated it again or the corresponding put() even marked |
247 | * it dead already. Both are valid situations and do not |
248 | * require a retry. If this fails the caller is not |
249 | * allowed to deconstruct the object. |
250 | */ |
251 | if (!atomic_try_cmpxchg_release(v: &ref->refcnt, old: &cnt, RCUREF_DEAD)) |
252 | return false; |
253 | |
254 | /* |
255 | * The caller can safely schedule the object for |
256 | * deconstruction. Provide acquire ordering. |
257 | */ |
258 | smp_acquire__after_ctrl_dep(); |
259 | return true; |
260 | } |
261 | |
262 | /* |
263 | * If the reference count was already in the dead zone, then this |
264 | * put() operation is imbalanced. Warn, put the reference count back to |
265 | * DEAD and tell the caller to not deconstruct the object. |
266 | */ |
267 | if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()" )) { |
268 | atomic_set(v: &ref->refcnt, RCUREF_DEAD); |
269 | return false; |
270 | } |
271 | |
272 | /* |
273 | * This is a put() operation on a saturated refcount. Restore the |
274 | * mean saturation value and tell the caller to not deconstruct the |
275 | * object. |
276 | */ |
277 | if (cnt > RCUREF_MAXREF) |
278 | atomic_set(v: &ref->refcnt, RCUREF_SATURATED); |
279 | return false; |
280 | } |
281 | EXPORT_SYMBOL_GPL(rcuref_put_slowpath); |
282 | |