// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanout of stale entries. */
#define NAPI_TIMEOUT (60 * SEC_CONVERSION)

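/*
 * One entry per tracked NAPI id. Entries are indexed two ways: by napi_id
 * in ctx->napi_ht for O(1) duplicate detection on add, and linked into
 * ctx->napi_list for cheap iteration while busy polling. Lookups run under
 * RCU; all updates are serialized by ctx->napi_lock.
 */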
struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		/* Hit: push out the stale timeout. */
		e->timeout = jiffies + NAPI_TIMEOUT;
		return e;
	}

	return NULL;
}

void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
	struct hlist_head *hash_list;
	unsigned int napi_id;
	struct sock *sk;
	struct io_napi_entry *e;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);

	/* Non-NAPI IDs can be rejected. */
	if (napi_id < MIN_NAPI_ID)
		return;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	rcu_read_lock();
	e = io_napi_hash_find(hash_list, napi_id);
	if (e) {
		e->timeout = jiffies + NAPI_TIMEOUT;
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

	/*
	 * Recheck under the lock: another task may have added this id
	 * between the RCU lookup above and taking napi_lock.
	 */
	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
}

static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		if (time_after(jiffies, e->timeout)) {
			list_del(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
	spin_unlock(&ctx->napi_lock);
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
					     unsigned long bp_usec)
{
	if (bp_usec) {
		unsigned long end_time = start_time + bp_usec;
		unsigned long now = busy_loop_current_time();

		return time_after(now, end_time);
	}

	return true;
}

static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
		return true;

	return false;
}

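/*
 * Walk the tracked napi ids under RCU and busy poll each one. With a
 * loop_end_arg the core napi loop keeps polling the (single) id until
 * io_napi_busy_loop_should_end() fires; without one, each id gets a
 * single polling pass.
 */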
static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
				   void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool (*loop_end)(void *, unsigned long) = NULL;
	bool is_stale = false;

	if (loop_end_arg)
		loop_end = io_napi_busy_loop_should_end;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, e->timeout))
			is_stale = true;
	}

	return is_stale;
}

static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/*
	 * Singular lists use a different napi loop end check function and
	 * are only executed once.
	 */
	if (list_is_singular(&ctx->napi_list))
		loop_end_arg = iowq;

	rcu_read_lock();
	do {
		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context. The busy poll timeout
 * defaults to the system-wide net.core.busy_poll sysctl value.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	spin_unlock(&ctx->napi_lock);
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context. The current settings are copied
 * back to userspace before the new ones take effect.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to = ctx->napi_busy_poll_to,
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};
	struct io_uring_napi napi;

	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
	WRITE_ONCE(ctx->napi_enabled, true);
	return 0;
}
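
/*
 * Userspace example (a minimal sketch using liburing; the 50 usec value
 * is arbitrary):
 *
 *	struct io_uring_napi napi = {
 *		.busy_poll_to = 50,	// busy poll for up to 50 usec
 *		.prefer_busy_poll = 1,
 *	};
 *	io_uring_register_napi(&ring, &napi);
 *
 * liburing's io_uring_register_napi() issues the IORING_REGISTER_NAPI
 * opcode handled here.
 */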

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg has been specified, copy the busy poll timeout
 * and prefer busy poll setting to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to = ctx->napi_busy_poll_to,
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_to, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_enabled, false);
	return 0;
}

/*
 * __io_napi_adjust_timeout() - adjust the busy poll timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @ts: pointer to timespec or NULL
 *
 * Adjust the busy loop timeout according to timespec and busy poll timeout.
 * If the caller's wait timeout is shorter than the busy poll timeout, busy
 * poll for at most the wait time and leave no blocking wait.
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
			      struct timespec64 *ts)
{
	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);

	if (ts) {
		struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);

		if (timespec64_compare(ts, &poll_to_ts) > 0) {
			*ts = timespec64_sub(*ts, poll_to_ts);
		} else {
			u64 to = timespec64_to_ns(ts);

			/* Cap the busy poll time to the caller's wait time. */
			do_div(to, 1000);
			poll_to = to;
			ts->tv_sec = 0;
			ts->tv_nsec = 0;
		}
	}

	iowq->napi_busy_poll_to = poll_to;
}
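
/*
 * Worked example: with napi_busy_poll_to = 100 usec and a caller timeout
 * of 1 ms, *ts drops to 900 usec and the busy poll runs for 100 usec.
 * With a caller timeout of 60 usec, the busy poll is capped at 60 usec
 * and the remaining blocking wait is zero.
 */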

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop before the task blocks waiting for
 * completions. SQPOLL rings poll from the sqpoll thread instead.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
		io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Execute the napi busy poll loop from the sqpoll thread. Returns 1 if
 * busy polling was done, 0 if there was nothing to poll.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_to))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	rcu_read_lock();
	is_stale = __io_napi_do_busy_loop(ctx, NULL);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif