#ifndef IOU_CORE_H
#define IOU_CORE_H

#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"

#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif

enum {
        IOU_OK                  = 0,
        IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,

        /*
         * Requeue the task_work to restart operations on this request. The
         * actual value isn't important, it should just not be an otherwise
         * valid error code, yet less than -MAX_ERRNO and valid internally.
         */
        IOU_REQUEUE             = -3072,

        /*
         * Intended only when IO_URING_F_MULTISHOT is passed: tells the poll
         * runner that the multishot request should be removed, with the
         * result set on req->cqe.res.
         */
        IOU_STOP_MULTISHOT      = -ECANCELED,
};

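/*
 * Per-waiter state for a task sleeping in cqring wait: the wait queue
 * entry, the CQ tail it is waiting for, the timeout count observed when
 * the wait started, and the absolute wake-up time.
 */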
struct io_wait_queue {
        struct wait_queue_entry wq;
        struct io_ring_ctx *ctx;
        unsigned cq_tail;
        unsigned nr_timeouts;
        ktime_t timeout;

#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int napi_busy_poll_to;
        bool napi_prefer_busy_poll;
#endif
};

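/*
 * The tail comparison below is done with signed arithmetic on purpose: it
 * stays correct when the 32-bit CQ tail wraps around, as long as the
 * distance between the two values is below 2^31 entries.
 */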
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
        struct io_ring_ctx *ctx = iowq->ctx;
        int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;

        /*
         * Wake up if we have enough events, or if a timeout occurred since we
         * started waiting. For timeouts, we always want to return to userspace,
         * regardless of event count.
         */
        return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);

struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
                               unsigned issue_flags);

void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
                                struct io_ring_ctx *ctx);

int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
                                int start, int end);

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);
int io_req_prep_async(struct io_kiocb *req);

struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);

void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);

bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                        bool cancel_all);

void *io_mem_alloc(size_t size);
void io_mem_free(void *ptr);

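/*
 * Operation bits for an io_uring eventfd context; set atomically and
 * acted upon (signal or free) from the RCU callback in io_eventfd_ops().
 */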
enum {
        IO_EVENTFD_OP_SIGNAL_BIT,
        IO_EVENTFD_OP_FREE_BIT,
};

void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);

#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
        lockdep_assert(in_task());

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                lockdep_assert_held(&ctx->uring_lock);
        } else if (!ctx->task_complete) {
                lockdep_assert_held(&ctx->completion_lock);
        } else if (ctx->submitter_task) {
                /*
                 * ->submitter_task may be NULL and we can still post a CQE,
                 * if the ring has been setup with IORING_SETUP_R_DISABLED.
                 * Not from an SQE, as those cannot be submitted, but via
                 * updating tagged resources.
                 */
                if (ctx->submitter_task->flags & PF_EXITING)
                        lockdep_assert(current_work());
                else
                        lockdep_assert(current == ctx->submitter_task);
        }
}
#else
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
}
#endif

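/* Queue task_work for the request with no special flags. */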
static inline void io_req_task_work_add(struct io_kiocb *req)
{
        __io_req_task_work_add(req, 0);
}

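/* Walk a request and every request linked behind it. */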
#define io_for_each_link(pos, head) \
        for (pos = (head); pos; pos = pos->link)

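/*
 * Hand out the next free CQE from the cached range, refilling the cache
 * from the CQ ring when it runs dry. A CQE32 ring consumes two regular
 * slots per completion, hence the extra cqe_cached increment.
 */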
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
                                       struct io_uring_cqe **ret,
                                       bool overflow)
{
        io_lockdep_assert_cq_locked(ctx);

        if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
                if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
                        return false;
        }
        *ret = ctx->cqe_cached;
        ctx->cached_cq_tail++;
        ctx->cqe_cached++;
        if (ctx->flags & IORING_SETUP_CQE32)
                ctx->cqe_cached++;
        return true;
}

static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{
        return io_get_cqe_overflow(ctx, ret, false);
}

static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
                                            struct io_kiocb *req)
{
        struct io_uring_cqe *cqe;

        /*
         * If we can't get a cq entry, userspace overflowed the
         * submission (by quite a lot). Increment the overflow count in
         * the ring.
         */
        if (unlikely(!io_get_cqe(ctx, &cqe)))
                return false;

        if (trace_io_uring_complete_enabled())
                trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
                                        req->cqe.res, req->cqe.flags,
                                        req->big_cqe.extra1, req->big_cqe.extra2);

        memcpy(cqe, &req->cqe, sizeof(*cqe));
        if (ctx->flags & IORING_SETUP_CQE32) {
                memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
                memset(&req->big_cqe, 0, sizeof(req->big_cqe));
        }
        return true;
}

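/*
 * Mark the request as failed. A failed request must always post a CQE, so
 * if CQE skipping was requested, shift the skip over to the link CQEs
 * instead of suppressing this one.
 */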
static inline void req_set_fail(struct io_kiocb *req)
{
        req->flags |= REQ_F_FAIL;
        if (req->flags & REQ_F_CQE_SKIP) {
                req->flags &= ~REQ_F_CQE_SKIP;
                req->flags |= REQ_F_SKIP_LINK_CQES;
        }
}

static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
        req->cqe.res = res;
        req->cqe.flags = cflags;
}

static inline bool req_has_async_data(struct io_kiocb *req)
{
        return req->flags & REQ_F_ASYNC_DATA;
}

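/*
 * Fixed files are owned by the ring's registered file table, so only drop
 * a reference for normal files obtained via fget().
 */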
static inline void io_put_file(struct io_kiocb *req)
{
        if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
                fput(req->file);
}

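/*
 * Pairs with io_ring_submit_lock(): only drop uring_lock if this issue
 * path had to take it in the first place.
 */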
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
                                         unsigned issue_flags)
{
        lockdep_assert_held(&ctx->uring_lock);
        if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
                mutex_unlock(&ctx->uring_lock);
}

static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
                                       unsigned issue_flags)
{
        /*
         * "Normal" inline submissions always hold the uring_lock, since we
         * grab it from the system call. Same is true for the SQPOLL offload.
         * The only exception is when we've detached the request and issue it
         * from an async worker thread; grab the lock for that case.
         */
        if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
                mutex_lock(&ctx->uring_lock);
        lockdep_assert_held(&ctx->uring_lock);
}

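/*
 * Illustrative only (not a function in this file): the canonical posting
 * sequence under the CQ lock is roughly
 *
 *      struct io_uring_cqe *cqe;
 *
 *      if (io_get_cqe(ctx, &cqe))
 *              WRITE_ONCE(cqe->user_data, ...);        // fill the CQE
 *      io_commit_cqring(ctx);                          // publish the tail
 *      io_cqring_wake(ctx);                            // wake CQ waiters
 *
 * with io_commit_cqring() providing the release ordering that makes the
 * CQE contents visible before the tail update.
 */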
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

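/* Wake tasks that are polling the ring fd itself, if the pollwq is active. */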
static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
        if (wq_has_sleeper(&ctx->poll_wq))
                __wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
                                poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
        /*
         * Trigger waitqueue handler on all waiters on our waitqueue. This
         * won't necessarily wake up all the tasks, io_should_wake() will make
         * that decision.
         *
         * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
         * set in the mask so that if we recurse back into our own poll
         * waitqueue handlers, we know we have a dependency between eventfd or
         * epoll and should terminate multishot poll at that point.
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
                                poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

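/*
 * The SQ ring is full when the producer tail is exactly sq_entries ahead
 * of our cached head; unsigned subtraction keeps this correct across
 * index wraparound.
 */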
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
        struct io_rings *r = ctx->rings;

        return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}

static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;
        unsigned int entries;

        /* make sure SQ entry isn't read before tail */
        entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
        return min(entries, ctx->sq_entries);
}

static inline int io_run_task_work(void)
{
        bool ret = false;

        /*
         * Always check-and-clear the task_work notification signal. With how
         * signaling works for task_work, we can find it set with nothing to
         * run. We need to clear it for that case, like get_signal() does.
         */
        if (test_thread_flag(TIF_NOTIFY_SIGNAL))
                clear_notify_signal();
        /*
         * PF_IO_WORKER never returns to userspace, so check here if we have
         * notify work that needs processing.
         */
        if (current->flags & PF_IO_WORKER) {
                if (test_thread_flag(TIF_NOTIFY_RESUME)) {
                        __set_current_state(TASK_RUNNING);
                        resume_user_mode_work(NULL);
                }
                if (current->io_uring) {
                        unsigned int count = 0;

                        tctx_task_work_run(current->io_uring, UINT_MAX, &count);
                        if (count)
                                ret = true;
                }
        }
        if (task_work_pending(current)) {
                __set_current_state(TASK_RUNNING);
                task_work_run();
                ret = true;
        }

        return ret;
}

static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
        return task_work_pending(current) || !llist_empty(&ctx->work_llist);
}

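/* Take uring_lock on behalf of a task_work handler that doesn't hold it yet. */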
static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
        if (!ts->locked) {
                mutex_lock(&ctx->uring_lock);
                ts->locked = true;
        }
}

/*
 * Don't complete immediately but use deferred completion infrastructure.
 * Protected by ->uring_lock and can only be used either with
 * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
 */
static inline void io_req_complete_defer(struct io_kiocb *req)
        __must_hold(&req->ctx->uring_lock)
{
        struct io_submit_state *state = &req->ctx->submit_state;

        lockdep_assert_held(&req->ctx->uring_lock);

        wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}

static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
                     ctx->has_evfd || ctx->poll_activated))
                __io_commit_cqring_flush(ctx);
}

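/*
 * Take nr references from the per-task cached counter; dip into the slow
 * refill path only when the cache goes negative.
 */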
static inline void io_get_task_refs(int nr)
{
        struct io_uring_task *tctx = current->io_uring;

        tctx->cached_refs -= nr;
        if (unlikely(tctx->cached_refs < 0))
                io_task_refs_refill(tctx);
}

static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
        return !ctx->submit_state.free_list.next;
}

extern struct kmem_cache *req_cachep;
extern struct kmem_cache *io_buf_cachep;

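/*
 * Pop a request off the free list. The caller must have checked that the
 * cache is non-empty, as io_alloc_req() below does.
 */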
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
        struct io_kiocb *req;

        req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
        wq_stack_extract(&ctx->submit_state.free_list);
        return req;
}

static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
        if (unlikely(io_req_cache_empty(ctx))) {
                if (!__io_alloc_req_refill(ctx))
                        return false;
        }
        *req = io_extract_req(ctx);
        return true;
}

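/*
 * With IORING_SETUP_DEFER_TASKRUN, deferred task_work may only be run by
 * the task that submitted the ring; these helpers encode that rule.
 */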
static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
{
        return likely(ctx->submitter_task == current);
}

static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
{
        return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) ||
                      ctx->submitter_task == current);
}

static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
{
        io_req_set_res(req, res, 0);
        req->io_task_work.func = io_req_task_complete;
        io_req_task_work_add(req);
}

/*
 * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
 * slot.
 */
static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
{
        if (ctx->flags & IORING_SETUP_SQE128)
                return 2 * sizeof(struct io_uring_sqe);
        return sizeof(struct io_uring_sqe);
}

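/*
 * Cache the "file supports poll" answer in req->flags so repeated issue
 * attempts don't have to dereference f_op every time.
 */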
static inline bool io_file_can_poll(struct io_kiocb *req)
{
        if (req->flags & REQ_F_CAN_POLL)
                return true;
        if (file_can_poll(req->file)) {
                req->flags |= REQ_F_CAN_POLL;
                return true;
        }
        return false;
}

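/* Bits for ctx->check_cq, set when the CQ needs attention before waiting. */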
enum {
        IO_CHECK_CQ_OVERFLOW_BIT,
        IO_CHECK_CQ_DROPPED_BIT,
};

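/* True if the ring has overflow CQEs or deferred task_work outstanding. */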
static inline bool io_has_work(struct io_ring_ctx *ctx)
{
        return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
               !llist_empty(&ctx->work_llist);
}
#endif