1// SPDX-License-Identifier: GPL-2.0
2
3#include "io_uring.h"
4#include "napi.h"
5
6#ifdef CONFIG_NET_RX_BUSY_POLL
7
8/* Timeout for cleanout of stale entries. */
9#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
10
11struct io_napi_entry {
12 unsigned int napi_id;
13 struct list_head list;
14
15 unsigned long timeout;
16 struct hlist_node node;
17
18 struct rcu_head rcu;
19};
20
21static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
22 unsigned int napi_id)
23{
24 struct io_napi_entry *e;
25
26 hlist_for_each_entry_rcu(e, hash_list, node) {
27 if (e->napi_id != napi_id)
28 continue;
29 e->timeout = jiffies + NAPI_TIMEOUT;
30 return e;
31 }
32
33 return NULL;
34}
35
36void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
37{
38 struct hlist_head *hash_list;
39 unsigned int napi_id;
40 struct sock *sk;
41 struct io_napi_entry *e;
42
43 sk = sock->sk;
44 if (!sk)
45 return;
46
47 napi_id = READ_ONCE(sk->sk_napi_id);
48
49 /* Non-NAPI IDs can be rejected. */
50 if (napi_id < MIN_NAPI_ID)
51 return;
52
53 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
54
55 rcu_read_lock();
56 e = io_napi_hash_find(hash_list, napi_id);
57 if (e) {
58 e->timeout = jiffies + NAPI_TIMEOUT;
59 rcu_read_unlock();
60 return;
61 }
62 rcu_read_unlock();
63
64 e = kmalloc(size: sizeof(*e), GFP_NOWAIT);
65 if (!e)
66 return;
67
68 e->napi_id = napi_id;
69 e->timeout = jiffies + NAPI_TIMEOUT;
70
71 spin_lock(lock: &ctx->napi_lock);
72 if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
73 spin_unlock(lock: &ctx->napi_lock);
74 kfree(objp: e);
75 return;
76 }
77
78 hlist_add_tail_rcu(n: &e->node, h: hash_list);
79 list_add_tail(new: &e->list, head: &ctx->napi_list);
80 spin_unlock(lock: &ctx->napi_lock);
81}
82
83static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
84{
85 struct io_napi_entry *e;
86 unsigned int i;
87
88 spin_lock(lock: &ctx->napi_lock);
89 hash_for_each(ctx->napi_ht, i, e, node) {
90 if (time_after(jiffies, e->timeout)) {
91 list_del(entry: &e->list);
92 hash_del_rcu(node: &e->node);
93 kfree_rcu(e, rcu);
94 }
95 }
96 spin_unlock(lock: &ctx->napi_lock);
97}
98
99static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
100{
101 if (is_stale)
102 __io_napi_remove_stale(ctx);
103}
104
105static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
106 unsigned long bp_usec)
107{
108 if (bp_usec) {
109 unsigned long end_time = start_time + bp_usec;
110 unsigned long now = busy_loop_current_time();
111
112 return time_after(now, end_time);
113 }
114
115 return true;
116}
117
118static bool io_napi_busy_loop_should_end(void *data,
119 unsigned long start_time)
120{
121 struct io_wait_queue *iowq = data;
122
123 if (signal_pending(current))
124 return true;
125 if (io_should_wake(iowq) || io_has_work(ctx: iowq->ctx))
126 return true;
127 if (io_napi_busy_loop_timeout(start_time, bp_usec: iowq->napi_busy_poll_to))
128 return true;
129
130 return false;
131}
132
133static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
134 void *loop_end_arg)
135{
136 struct io_napi_entry *e;
137 bool (*loop_end)(void *, unsigned long) = NULL;
138 bool is_stale = false;
139
140 if (loop_end_arg)
141 loop_end = io_napi_busy_loop_should_end;
142
143 list_for_each_entry_rcu(e, &ctx->napi_list, list) {
144 napi_busy_loop_rcu(napi_id: e->napi_id, loop_end, loop_end_arg,
145 prefer_busy_poll: ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
146
147 if (time_after(jiffies, e->timeout))
148 is_stale = true;
149 }
150
151 return is_stale;
152}
153
154static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
155 struct io_wait_queue *iowq)
156{
157 unsigned long start_time = busy_loop_current_time();
158 void *loop_end_arg = NULL;
159 bool is_stale = false;
160
161 /* Singular lists use a different napi loop end check function and are
162 * only executed once.
163 */
164 if (list_is_singular(head: &ctx->napi_list))
165 loop_end_arg = iowq;
166
167 rcu_read_lock();
168 do {
169 is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
170 } while (!io_napi_busy_loop_should_end(data: iowq, start_time) && !loop_end_arg);
171 rcu_read_unlock();
172
173 io_napi_remove_stale(ctx, is_stale);
174}
175
176/*
177 * io_napi_init() - Init napi settings
178 * @ctx: pointer to io-uring context structure
179 *
180 * Init napi settings in the io-uring context.
181 */
182void io_napi_init(struct io_ring_ctx *ctx)
183{
184 INIT_LIST_HEAD(list: &ctx->napi_list);
185 spin_lock_init(&ctx->napi_lock);
186 ctx->napi_prefer_busy_poll = false;
187 ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
188}
189
190/*
191 * io_napi_free() - Deallocate napi
192 * @ctx: pointer to io-uring context structure
193 *
194 * Free the napi list and the hash table in the io-uring context.
195 */
196void io_napi_free(struct io_ring_ctx *ctx)
197{
198 struct io_napi_entry *e;
199 LIST_HEAD(napi_list);
200 unsigned int i;
201
202 spin_lock(lock: &ctx->napi_lock);
203 hash_for_each(ctx->napi_ht, i, e, node) {
204 hash_del_rcu(node: &e->node);
205 kfree_rcu(e, rcu);
206 }
207 spin_unlock(lock: &ctx->napi_lock);
208}
209
210/*
211 * io_napi_register() - Register napi with io-uring
212 * @ctx: pointer to io-uring context structure
213 * @arg: pointer to io_uring_napi structure
214 *
215 * Register napi in the io-uring context.
216 */
217int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
218{
219 const struct io_uring_napi curr = {
220 .busy_poll_to = ctx->napi_busy_poll_to,
221 .prefer_busy_poll = ctx->napi_prefer_busy_poll
222 };
223 struct io_uring_napi napi;
224
225 if (copy_from_user(to: &napi, from: arg, n: sizeof(napi)))
226 return -EFAULT;
227 if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
228 return -EINVAL;
229
230 if (copy_to_user(to: arg, from: &curr, n: sizeof(curr)))
231 return -EFAULT;
232
233 WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
234 WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
235 WRITE_ONCE(ctx->napi_enabled, true);
236 return 0;
237}
238
239/*
240 * io_napi_unregister() - Unregister napi with io-uring
241 * @ctx: pointer to io-uring context structure
242 * @arg: pointer to io_uring_napi structure
243 *
244 * Unregister napi. If arg has been specified copy the busy poll timeout and
245 * prefer busy poll setting to the passed in structure.
246 */
247int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
248{
249 const struct io_uring_napi curr = {
250 .busy_poll_to = ctx->napi_busy_poll_to,
251 .prefer_busy_poll = ctx->napi_prefer_busy_poll
252 };
253
254 if (arg && copy_to_user(to: arg, from: &curr, n: sizeof(curr)))
255 return -EFAULT;
256
257 WRITE_ONCE(ctx->napi_busy_poll_to, 0);
258 WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
259 WRITE_ONCE(ctx->napi_enabled, false);
260 return 0;
261}
262
263/*
264 * __io_napi_adjust_timeout() - Add napi id to the busy poll list
265 * @ctx: pointer to io-uring context structure
266 * @iowq: pointer to io wait queue
267 * @ts: pointer to timespec or NULL
268 *
269 * Adjust the busy loop timeout according to timespec and busy poll timeout.
270 */
271void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
272 struct timespec64 *ts)
273{
274 unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
275
276 if (ts) {
277 struct timespec64 poll_to_ts = ns_to_timespec64(nsec: 1000 * (s64)poll_to);
278
279 if (timespec64_compare(lhs: ts, rhs: &poll_to_ts) > 0) {
280 *ts = timespec64_sub(lhs: *ts, rhs: poll_to_ts);
281 } else {
282 u64 to = timespec64_to_ns(ts);
283
284 do_div(to, 1000);
285 ts->tv_sec = 0;
286 ts->tv_nsec = 0;
287 }
288 }
289
290 iowq->napi_busy_poll_to = poll_to;
291}
292
293/*
294 * __io_napi_busy_loop() - execute busy poll loop
295 * @ctx: pointer to io-uring context structure
296 * @iowq: pointer to io wait queue
297 *
298 * Execute the busy poll loop and merge the spliced off list.
299 */
300void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
301{
302 iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
303
304 if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
305 io_napi_blocking_busy_loop(ctx, iowq);
306}
307
308/*
309 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
310 * @ctx: pointer to io-uring context structure
311 *
312 * Splice of the napi list and execute the napi busy poll loop.
313 */
314int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
315{
316 LIST_HEAD(napi_list);
317 bool is_stale = false;
318
319 if (!READ_ONCE(ctx->napi_busy_poll_to))
320 return 0;
321 if (list_empty_careful(head: &ctx->napi_list))
322 return 0;
323
324 rcu_read_lock();
325 is_stale = __io_napi_do_busy_loop(ctx, NULL);
326 rcu_read_unlock();
327
328 io_napi_remove_stale(ctx, is_stale);
329 return 1;
330}
331
332#endif
333

/* source code of linux/io_uring/napi.c */