// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2018 Facebook
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/sock_diag.h>
#include <net/sock_reuseport.h>
#include <linux/btf_ids.h>

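/*
 * A reuseport array is a bpf_map header followed by a flexible array
 * of RCU-managed socket pointers, one slot per map element.
 */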
struct reuseport_array {
	struct bpf_map map;
	struct sock __rcu *ptrs[];
};

static struct reuseport_array *reuseport_array(struct bpf_map *map)
{
	return (struct reuseport_array *)map;
}

/* The caller must hold the reuseport_lock */
void bpf_sk_reuseport_detach(struct sock *sk)
{
	struct sock __rcu **socks;

	write_lock_bh(&sk->sk_callback_lock);
	socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF);
	if (socks) {
		WRITE_ONCE(sk->sk_user_data, NULL);
		/*
		 * Do not move this NULL assignment outside of
		 * sk->sk_callback_lock because there is
		 * a race with reuseport_array_free()
		 * which does not hold the reuseport_lock.
		 */
		RCU_INIT_POINTER(*socks, NULL);
	}
	write_unlock_bh(&sk->sk_callback_lock);
}

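/*
 * Map values are either 32-bit socket fds (on update) or 64-bit socket
 * cookies (on syscall-side lookup), so only 4- and 8-byte value sizes
 * are accepted.
 */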
static int reuseport_array_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32) &&
	    attr->value_size != sizeof(u64))
		return -EINVAL;

	return array_map_alloc_check(attr);
}

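/*
 * Returns the sock stored at @key, or NULL if the index is out of range
 * or the slot is empty. The caller must be in an RCU read-side critical
 * section.
 */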
static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return rcu_dereference(array->ptrs[index]);
}

/* Called from syscall only */
static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = *(u32 *)key;
	struct sock *sk;
	int err;

	if (index >= map->max_entries)
		return -E2BIG;

	if (!rcu_access_pointer(array->ptrs[index]))
		return -ENOENT;

	spin_lock_bh(&reuseport_lock);

	sk = rcu_dereference_protected(array->ptrs[index],
				       lockdep_is_held(&reuseport_lock));
	if (sk) {
		write_lock_bh(&sk->sk_callback_lock);
		WRITE_ONCE(sk->sk_user_data, NULL);
		RCU_INIT_POINTER(array->ptrs[index], NULL);
		write_unlock_bh(&sk->sk_callback_lock);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&reuseport_lock);

	return err;
}

static void reuseport_array_free(struct bpf_map *map)
{
	struct reuseport_array *array = reuseport_array(map);
	struct sock *sk;
	u32 i;

	/*
	 * ops->map_*_elem() will not be able to access this
	 * array now. Hence, this function only races with
	 * bpf_sk_reuseport_detach() which was triggered by
	 * close() or disconnect().
	 *
	 * This function and bpf_sk_reuseport_detach() are
	 * both removing sk from "array". Who removes it
	 * first does not matter.
	 *
	 * The only concern here is that bpf_sk_reuseport_detach()
	 * may access "array" while it is being freed here.
	 * bpf_sk_reuseport_detach() accesses this "array"
	 * through sk->sk_user_data _and_ with sk->sk_callback_lock
	 * held, which is enough because this "array" is not freed
	 * until every sk->sk_user_data has stopped referencing it.
	 *
	 * Hence, taking "reuseport_lock" is not needed here.
	 */

	/*
	 * Since reuseport_lock is not taken, sk is accessed under
	 * rcu_read_lock().
	 */
	rcu_read_lock();
	for (i = 0; i < map->max_entries; i++) {
		sk = rcu_dereference(array->ptrs[i]);
		if (sk) {
			write_lock_bh(&sk->sk_callback_lock);
			/*
			 * No need for WRITE_ONCE(). At this point,
			 * no one is reading it without taking the
			 * sk->sk_callback_lock.
			 */
			sk->sk_user_data = NULL;
			write_unlock_bh(&sk->sk_callback_lock);
			RCU_INIT_POINTER(array->ptrs[i], NULL);
		}
	}
	rcu_read_unlock();

	/*
	 * Once we reach here, no sk->sk_user_data is referencing this
	 * "array" anymore. "array" can be freed now.
	 */
	bpf_map_area_free(array);
}

static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
	int numa_node = bpf_map_attr_numa_node(attr);
	struct reuseport_array *array;

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
	if (!array)
		return ERR_PTR(-ENOMEM);

	/* copy mandatory map attributes */
	bpf_map_init_from_attr(&array->map, attr);

	return &array->map;
}

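/*
 * Called from syscall only. Userspace never sees the raw sock pointer;
 * the sock's cookie is copied out instead, which is why an 8-byte
 * value_size is required.
 */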
int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
				       void *value)
{
	struct sock *sk;
	int err;

	if (map->value_size != sizeof(u64))
		return -ENOSPC;

	rcu_read_lock();
	sk = reuseport_array_lookup_elem(map, key);
	if (sk) {
		*(u64 *)value = __sock_gen_cookie(sk);
		err = 0;
	} else {
		err = -ENOENT;
	}
	rcu_read_unlock();

	return err;
}

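/*
 * Checks that @nsk may be installed in a slot currently holding @osk:
 * the map_flags must be consistent with the slot's state, and nsk must
 * be a hashed TCP or UDP socket in a SO_REUSEPORT group.
 */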
static int
reuseport_array_update_check(const struct reuseport_array *array,
			     const struct sock *nsk,
			     const struct sock *osk,
			     const struct sock_reuseport *nsk_reuse,
			     u32 map_flags)
{
	if (osk && map_flags == BPF_NOEXIST)
		return -EEXIST;

	if (!osk && map_flags == BPF_EXIST)
		return -ENOENT;

	if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP)
		return -ENOTSUPP;

	if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6)
		return -ENOTSUPP;

	if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM)
		return -ENOTSUPP;

	/*
	 * sk must be hashed (i.e. listening in the TCP case or bound
	 * in the UDP case) and
	 * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL).
	 *
	 * Also, sk will be used in a bpf helper that is protected by
	 * rcu_read_lock().
	 */
	if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse)
		return -EINVAL;

	/* READ_ONCE because the sk->sk_callback_lock may not be held here */
	if (READ_ONCE(nsk->sk_user_data))
		return -EBUSY;

	return 0;
}

/*
 * Called from syscall only.
 * The "nsk" is pinned by the fd's refcnt.
 * The "osk" and "reuse" are protected by reuseport_lock.
 */
int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	struct reuseport_array *array = reuseport_array(map);
	struct sock *free_osk = NULL, *osk, *nsk;
	struct sock_reuseport *reuse;
	u32 index = *(u32 *)key;
	uintptr_t sk_user_data;
	struct socket *socket;
	int err, fd;

	if (map_flags > BPF_EXIST)
		return -EINVAL;

	if (index >= map->max_entries)
		return -E2BIG;

	if (map->value_size == sizeof(u64)) {
		u64 fd64 = *(u64 *)value;

		if (fd64 > S32_MAX)
			return -EINVAL;
		fd = fd64;
	} else {
		fd = *(int *)value;
	}

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	nsk = socket->sk;
	if (!nsk) {
		err = -EINVAL;
		goto put_file;
	}

	/* Quick checks before taking reuseport_lock */
	err = reuseport_array_update_check(array, nsk,
					   rcu_access_pointer(array->ptrs[index]),
					   rcu_access_pointer(nsk->sk_reuseport_cb),
					   map_flags);
	if (err)
		goto put_file;

	spin_lock_bh(&reuseport_lock);
	/*
	 * Some of the checks only need reuseport_lock,
	 * but they are also done under sk_callback_lock
	 * for simplicity.
	 */
	write_lock_bh(&nsk->sk_callback_lock);

	osk = rcu_dereference_protected(array->ptrs[index],
					lockdep_is_held(&reuseport_lock));
	reuse = rcu_dereference_protected(nsk->sk_reuseport_cb,
					  lockdep_is_held(&reuseport_lock));
	err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags);
	if (err)
		goto put_file_unlock;

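	/*
	 * Point sk_user_data back at this slot so that
	 * bpf_sk_reuseport_detach() can find and clear it on close() or
	 * disconnect(). SK_USER_DATA_NOCOPY and SK_USER_DATA_BPF tag the
	 * pointer as BPF-managed and not to be copied on clone.
	 */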
	sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY |
		SK_USER_DATA_BPF;
	WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data);
	rcu_assign_pointer(array->ptrs[index], nsk);
	free_osk = osk;
	err = 0;

put_file_unlock:
	write_unlock_bh(&nsk->sk_callback_lock);

	if (free_osk) {
		write_lock_bh(&free_osk->sk_callback_lock);
		WRITE_ONCE(free_osk->sk_user_data, NULL);
		write_unlock_bh(&free_osk->sk_callback_lock);
	}

	spin_unlock_bh(&reuseport_lock);
put_file:
	fput(socket->file);
	return err;
}
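
/*
 * Illustrative userspace flow (a sketch, not part of this file): store a
 * bound/listening SO_REUSEPORT socket fd at a given index, e.g. with
 * libbpf:
 *
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 *				    NULL, sizeof(__u32), sizeof(__u64),
 *				    max_entries, NULL);
 *	__u32 key = 0;
 *	__u64 val = listen_fd;	// bound/listening, SO_REUSEPORT set
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
 */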

/* Called from syscall */
static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
					void *next_key)
{
	struct reuseport_array *array = reuseport_array(map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}
334 | |
335 | static u64 reuseport_array_mem_usage(const struct bpf_map *map) |
336 | { |
337 | struct reuseport_array *array; |
338 | |
339 | return struct_size(array, ptrs, map->max_entries); |
340 | } |
341 | |
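/*
 * BPF program side (an illustrative sketch, not part of this file): an
 * sk_reuseport program selects a stored socket via the
 * bpf_sk_select_reuseport() helper, e.g.:
 *
 *	SEC("sk_reuseport")
 *	int select_sock(struct sk_reuseport_md *md)
 *	{
 *		__u32 key = 0;	// hypothetical selection policy
 *
 *		if (bpf_sk_select_reuseport(md, &reuse_map, &key, 0))
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 * where "reuse_map" is a BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map declared
 * by the program.
 */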
BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = reuseport_array_alloc_check,
	.map_alloc = reuseport_array_alloc,
	.map_free = reuseport_array_free,
	.map_lookup_elem = reuseport_array_lookup_elem,
	.map_get_next_key = reuseport_array_get_next_key,
	.map_delete_elem = reuseport_array_delete_elem,
	.map_mem_usage = reuseport_array_mem_usage,
	.map_btf_id = &reuseport_array_map_btf_ids[0],
};