rw.c source code [linux/io_uring/rw.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/kernel.h>
3	#include <linux/errno.h>
4	#include <linux/fs.h>
5	#include <linux/file.h>
6	#include <linux/blk-mq.h>
7	#include <linux/mm.h>
8	#include <linux/slab.h>
9	#include <linux/fsnotify.h>
10	#include <linux/poll.h>
11	#include <linux/nospec.h>
12	#include <linux/compat.h>
13	#include <linux/io_uring/cmd.h>
14	#include <linux/indirect_call_wrapper.h>
15
16	#include <uapi/linux/io_uring.h>
17
18	#include "io_uring.h"
19	#include "opdef.h"
20	#include "kbuf.h"
21	#include "rsrc.h"
22	#include "poll.h"
23	#include "rw.h"
24
25	struct io_rw {
26	/ NOTE: kiocb has the file as the first member, so don't do it here /
27	struct kiocb kiocb;
28	u64 addr;
29	u32 len;
30	rwf_t flags;
31	};
32
33	static inline bool io_file_supports_nowait(struct io_kiocb *req)
34	{
35	return req->flags & REQ_F_SUPPORT_NOWAIT;
36	}
37
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the buffer-select prep: read iov_len from the single
 * user-supplied compat_iovec at rw->addr and store it in rw->len.
 *
 * Returns 0 on success, -EFAULT if the user memory cannot be read, or
 * -EINVAL if the length is negative.
 */
static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;

	uiov = u64_to_user_ptr(rw->addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	rw->len = clen;
	return 0;
}
#endif
56
57	static int io_iov_buffer_select_prep(struct io_kiocb *req)
58	{
59	struct iovec __user *uiov;
60	struct iovec iov;
61	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
62
63	if (rw->len != `1`)
64	return -EINVAL;
65
66	#ifdef CONFIG_COMPAT
67	if (req->ctx->compat)
68	return io_iov_compat_buffer_select_prep(rw);
69	#endif
70
71	uiov = u64_to_user_ptr(rw->addr);
72	if (copy_from_user(to: &iov, from: uiov, n: sizeof(*uiov)))
73	return -EFAULT;
74	rw->len = iov.iov_len;
75	return `0`;
76	}
77
78	int io_prep_rw(struct io_kiocb req, const* struct io_uring_sqe *sqe)
79	{
80	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
81	unsigned ioprio;
82	int ret;
83
84	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
85	/ used for fixed read/write too - just read unconditionally /
86	req->buf_index = READ_ONCE(sqe->buf_index);
87
88	ioprio = READ_ONCE(sqe->ioprio);
89	if (ioprio) {
90	ret = ioprio_check_cap(ioprio);
91	if (ret)
92	return ret;
93
94	rw->kiocb.ki_ioprio = ioprio;
95	} else {
96	rw->kiocb.ki_ioprio = get_current_ioprio();
97	}
98	rw->kiocb.dio_complete = NULL;
99
100	rw->addr = READ_ONCE(sqe->addr);
101	rw->len = READ_ONCE(sqe->len);
102	rw->flags = READ_ONCE(sqe->rw_flags);
103	return `0`;
104	}
105
106	int io_prep_rwv(struct io_kiocb req, const* struct io_uring_sqe *sqe)
107	{
108	int ret;
109
110	ret = io_prep_rw(req, sqe);
111	if (unlikely(ret))
112	return ret;
113
114	/*
115	* Have to do this validation here, as this is in io_read() rw->len
116	* might have chanaged due to buffer selection
117	*/
118	if (req->flags & REQ_F_BUFFER_SELECT)
119	return io_iov_buffer_select_prep(req);
120
121	return `0`;
122	}
123
124	int io_prep_rw_fixed(struct io_kiocb req, const* struct io_uring_sqe *sqe)
125	{
126	struct io_ring_ctx *ctx = req->ctx;
127	u16 index;
128	int ret;
129
130	ret = io_prep_rw(req, sqe);
131	if (unlikely(ret))
132	return ret;
133
134	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
135	return -EFAULT;
136	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
137	req->imu = ctx->user_bufs[index];
138	io_req_set_rsrc_node(req, ctx, issue_flags: `0`);
139	return `0`;
140	}
141
142	/*
143	* Multishot read is prepared just like a normal read/write request, only
144	* difference is that we set the MULTISHOT flag.
145	*/
146	int io_read_mshot_prep(struct io_kiocb req, const* struct io_uring_sqe *sqe)
147	{
148	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
149	int ret;
150
151	/ must be used with provided buffers /
152	if (!(req->flags & REQ_F_BUFFER_SELECT))
153	return -EINVAL;
154
155	ret = io_prep_rw(req, sqe);
156	if (unlikely(ret))
157	return ret;
158
159	if (rw->addr \|\| rw->len)
160	return -EINVAL;
161
162	req->flags \|= REQ_F_APOLL_MULTISHOT;
163	return `0`;
164	}
165
166	void io_readv_writev_cleanup(struct io_kiocb *req)
167	{
168	struct io_async_rw *io = req->async_data;
169
170	kfree(objp: io->free_iovec);
171	}
172
173	static inline loff_t io_kiocb_update_pos(struct* io_kiocb *req)
174	{
175	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
176
177	if (rw->kiocb.ki_pos != -`1`)
178	return &rw->kiocb.ki_pos;
179
180	if (!(req->file->f_mode & FMODE_STREAM)) {
181	req->flags \|= REQ_F_CUR_POS;
182	rw->kiocb.ki_pos = req->file->f_pos;
183	return &rw->kiocb.ki_pos;
184	}
185
186	rw->kiocb.ki_pos = `0`;
187	return NULL;
188	}
189
190	static void io_req_task_queue_reissue(struct io_kiocb *req)
191	{
192	req->io_task_work.func = io_queue_iowq;
193	io_req_task_work_add(req);
194	}
195
196	#ifdef CONFIG_BLOCK
197	static bool io_resubmit_prep(struct io_kiocb *req)
198	{
199	struct io_async_rw *io = req->async_data;
200
201	if (!req_has_async_data(req))
202	return !io_req_prep_async(req);
203	iov_iter_restore(i: &io->s.iter, state: &io->s.iter_state);
204	return true;
205	}
206
207	static bool io_rw_should_reissue(struct io_kiocb *req)
208	{
209	umode_t mode = file_inode(f: req->file)->i_mode;
210	struct io_ring_ctx *ctx = req->ctx;
211
212	if (!S_ISBLK(mode) && !S_ISREG(mode))
213	return false;
214	if ((req->flags & REQ_F_NOWAIT) \|\| (io_wq_current_is_worker() &&
215	!(ctx->flags & IORING_SETUP_IOPOLL)))
216	return false;
217	/*
218	* If ref is dying, we might be running poll reap from the exit work.
219	* Don't attempt to reissue from that path, just let it fail with
220	* -EAGAIN.
221	*/
222	if (percpu_ref_is_dying(ref: &ctx->refs))
223	return false;
224	/*
225	* Play it safe and assume not safe to re-import and reissue if we're
226	* not in the original thread group (or in task context).
227	*/
228	if (!same_thread_group(p1: req->task, current) \|\| !in_task())
229	return false;
230	return true;
231	}
232	#else
233	static bool io_resubmit_prep(struct io_kiocb *req)
234	{
235	return false;
236	}
237	static bool io_rw_should_reissue(struct io_kiocb *req)
238	{
239	return false;
240	}
241	#endif
242
243	static void io_req_end_write(struct io_kiocb *req)
244	{
245	if (req->flags & REQ_F_ISREG) {
246	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
247
248	kiocb_end_write(iocb: &rw->kiocb);
249	}
250	}
251
252	/*
253	* Trigger the notifications after having done some IO, and finish the write
254	* accounting, if any.
255	*/
256	static void io_req_io_end(struct io_kiocb *req)
257	{
258	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
259
260	if (rw->kiocb.ki_flags & IOCB_WRITE) {
261	io_req_end_write(req);
262	fsnotify_modify(file: req->file);
263	} else {
264	fsnotify_access(file: req->file);
265	}
266	}
267
268	static bool __io_complete_rw_common(struct io_kiocb req, long* res)
269	{
270	if (unlikely(res != req->cqe.res)) {
271	if ((res == -EAGAIN \|\| res == -EOPNOTSUPP) &&
272	io_rw_should_reissue(req)) {
273	/*
274	* Reissue will start accounting again, finish the
275	* current cycle.
276	*/
277	io_req_io_end(req);
278	req->flags \|= REQ_F_REISSUE \| REQ_F_BL_NO_RECYCLE;
279	return true;
280	}
281	req_set_fail(req);
282	req->cqe.res = res;
283	}
284	return false;
285	}
286
287	static inline int io_fixup_rw_res(struct io_kiocb req, long* res)
288	{
289	struct io_async_rw *io = req->async_data;
290
291	/ add previously done IO, if any /
292	if (req_has_async_data(req) && io->bytes_done > `0`) {
293	if (res < `0`)
294	res = io->bytes_done;
295	else
296	res += io->bytes_done;
297	}
298	return res;
299	}
300
301	void io_req_rw_complete(struct io_kiocb req, struct* io_tw_state *ts)
302	{
303	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
304	struct kiocb *kiocb = &rw->kiocb;
305
306	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
307	long res = kiocb->dio_complete(rw->kiocb.private);
308
309	io_req_set_res(req, res: io_fixup_rw_res(req, res), cflags: `0`);
310	}
311
312	io_req_io_end(req);
313
314	if (req->flags & (REQ_F_BUFFER_SELECTED\|REQ_F_BUFFER_RING)) {
315	unsigned issue_flags = ts->locked ? `0` : IO_URING_F_UNLOCKED;
316
317	req->cqe.flags \|= io_put_kbuf(req, issue_flags);
318	}
319	io_req_task_complete(req, ts);
320	}
321
322	static void io_complete_rw(struct kiocb kiocb, long* res)
323	{
324	struct io_rw rw = container_of(kiocb, struct* io_rw, kiocb);
325	struct io_kiocb *req = cmd_to_io_kiocb(rw);
326
327	if (!kiocb->dio_complete \|\| !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
328	if (__io_complete_rw_common(req, res))
329	return;
330	io_req_set_res(req, res: io_fixup_rw_res(req, res), cflags: `0`);
331	}
332	req->io_task_work.func = io_req_rw_complete;
333	__io_req_task_work_add(req, flags: IOU_F_TWQ_LAZY_WAKE);
334	}
335
336	static void io_complete_rw_iopoll(struct kiocb kiocb, long* res)
337	{
338	struct io_rw rw = container_of(kiocb, struct* io_rw, kiocb);
339	struct io_kiocb *req = cmd_to_io_kiocb(rw);
340
341	if (kiocb->ki_flags & IOCB_WRITE)
342	io_req_end_write(req);
343	if (unlikely(res != req->cqe.res)) {
344	if (res == -EAGAIN && io_rw_should_reissue(req)) {
345	req->flags \|= REQ_F_REISSUE \| REQ_F_BL_NO_RECYCLE;
346	return;
347	}
348	req->cqe.res = res;
349	}
350
351	/ order with io_iopoll_complete() checking ->iopoll_completed /
352	smp_store_release(&req->iopoll_completed, `1`);
353	}
354
355	static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
356	{
357	/ IO was queued async, completion will happen later /
358	if (ret == -EIOCBQUEUED)
359	return;
360
361	/ transform internal restart error codes /
362	if (unlikely(ret < `0`)) {
363	switch (ret) {
364	case -ERESTARTSYS:
365	case -ERESTARTNOINTR:
366	case -ERESTARTNOHAND:
367	case -ERESTART_RESTARTBLOCK:
368	/*
369	* We can't just restart the syscall, since previously
370	* submitted sqes may already be in progress. Just fail
371	* this IO with EINTR.
372	*/
373	ret = -EINTR;
374	break;
375	}
376	}
377
378	INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll,
379	io_complete_rw, kiocb, ret);
380	}
381
382	static int kiocb_done(struct io_kiocb *req, ssize_t ret,
383	unsigned int issue_flags)
384	{
385	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
386	unsigned final_ret = io_fixup_rw_res(req, res: ret);
387
388	if (ret >= `0` && req->flags & REQ_F_CUR_POS)
389	req->file->f_pos = rw->kiocb.ki_pos;
390	if (ret >= `0` && (rw->kiocb.ki_complete == io_complete_rw)) {
391	if (!__io_complete_rw_common(req, res: ret)) {
392	/*
393	* Safe to call io_end from here as we're inline
394	* from the submission path.
395	*/
396	io_req_io_end(req);
397	io_req_set_res(req, res: final_ret,
398	cflags: io_put_kbuf(req, issue_flags));
399	return IOU_OK;
400	}
401	} else {
402	io_rw_done(kiocb: &rw->kiocb, ret);
403	}
404
405	if (req->flags & REQ_F_REISSUE) {
406	req->flags &= ~REQ_F_REISSUE;
407	if (io_resubmit_prep(req))
408	io_req_task_queue_reissue(req);
409	else
410	io_req_task_queue_fail(req, ret: final_ret);
411	}
412	return IOU_ISSUE_SKIP_COMPLETE;
413	}
414
415	static struct iovec __io_import_iovec(int* ddir, struct io_kiocb *req,
416	struct io_rw_state *s,
417	unsigned int issue_flags)
418	{
419	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
420	struct iov_iter *iter = &s->iter;
421	u8 opcode = req->opcode;
422	struct iovec *iovec;
423	void __user *buf;
424	size_t sqe_len;
425	ssize_t ret;
426
427	if (opcode == IORING_OP_READ_FIXED \|\| opcode == IORING_OP_WRITE_FIXED) {
428	ret = io_import_fixed(ddir, iter, imu: req->imu, buf_addr: rw->addr, len: rw->len);
429	if (ret)
430	return ERR_PTR(error: ret);
431	return NULL;
432	}
433
434	buf = u64_to_user_ptr(rw->addr);
435	sqe_len = rw->len;
436
437	if (!io_issue_defs[opcode].vectored \|\| req->flags & REQ_F_BUFFER_SELECT) {
438	if (io_do_buffer_select(req)) {
439	buf = io_buffer_select(req, len: &sqe_len, issue_flags);
440	if (!buf)
441	return ERR_PTR(error: -ENOBUFS);
442	rw->addr = (unsigned long) buf;
443	rw->len = sqe_len;
444	}
445
446	ret = import_ubuf(type: ddir, buf, len: sqe_len, i: iter);
447	if (ret)
448	return ERR_PTR(error: ret);
449	return NULL;
450	}
451
452	iovec = s->fast_iov;
453	ret = __import_iovec(type: ddir, uvec: buf, nr_segs: sqe_len, UIO_FASTIOV, iovp: &iovec, i: iter,
454	compat: req->ctx->compat);
455	if (unlikely(ret < `0`))
456	return ERR_PTR(error: ret);
457	return iovec;
458	}
459
460	static inline int io_import_iovec(int rw, struct io_kiocb *req,
461	struct iovec iovec, struct** io_rw_state *s,
462	unsigned int issue_flags)
463	{
464	*iovec = __io_import_iovec(ddir: rw, req, s, issue_flags);
465	if (IS_ERR(ptr: *iovec))
466	return PTR_ERR(ptr: *iovec);
467
468	iov_iter_save_state(iter: &s->iter, state: &s->iter_state);
469	return `0`;
470	}
471
472	static inline loff_t io_kiocb_ppos(struct* kiocb *kiocb)
473	{
474	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
475	}
476
477	/*
478	* For files that don't have ->read_iter() and ->write_iter(), handle them
479	* by looping over ->read() or ->write() manually.
480	*/
481	static ssize_t loop_rw_iter(int ddir, struct io_rw rw, struct* iov_iter *iter)
482	{
483	struct kiocb *kiocb = &rw->kiocb;
484	struct file *file = kiocb->ki_filp;
485	ssize_t ret = `0`;
486	loff_t *ppos;
487
488	/*
489	* Don't support polled IO through this interface, and we can't
490	* support non-blocking either. For the latter, this just causes
491	* the kiocb to be handled from an async context.
492	*/
493	if (kiocb->ki_flags & IOCB_HIPRI)
494	return -EOPNOTSUPP;
495	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
496	!(kiocb->ki_filp->f_flags & O_NONBLOCK))
497	return -EAGAIN;
498
499	ppos = io_kiocb_ppos(kiocb);
500
501	while (iov_iter_count(i: iter)) {
502	void __user *addr;
503	size_t len;
504	ssize_t nr;
505
506	if (iter_is_ubuf(i: iter)) {
507	addr = iter->ubuf + iter->iov_offset;
508	len = iov_iter_count(i: iter);
509	} else if (!iov_iter_is_bvec(i: iter)) {
510	addr = iter_iov_addr(iter);
511	len = iter_iov_len(iter);
512	} else {
513	addr = u64_to_user_ptr(rw->addr);
514	len = rw->len;
515	}
516
517	if (ddir == READ)
518	nr = file->f_op->read(file, addr, len, ppos);
519	else
520	nr = file->f_op->write(file, addr, len, ppos);
521
522	if (nr < `0`) {
523	if (!ret)
524	ret = nr;
525	break;
526	}
527	ret += nr;
528	if (!iov_iter_is_bvec(i: iter)) {
529	iov_iter_advance(i: iter, bytes: nr);
530	} else {
531	rw->addr += nr;
532	rw->len -= nr;
533	if (!rw->len)
534	break;
535	}
536	if (nr != len)
537	break;
538	}
539
540	return ret;
541	}
542
543	static void io_req_map_rw(struct io_kiocb req, const* struct iovec *iovec,
544	const struct iovec fast_iov, struct* iov_iter *iter)
545	{
546	struct io_async_rw *io = req->async_data;
547
548	memcpy(&io->s.iter, iter, sizeof(*iter));
549	io->free_iovec = iovec;
550	io->bytes_done = `0`;
551	/ can only be fixed buffers, no need to do anything /
552	if (iov_iter_is_bvec(i: iter) \|\| iter_is_ubuf(i: iter))
553	return;
554	if (!iovec) {
555	unsigned iov_off = `0`;
556
557	io->s.iter.__iov = io->s.fast_iov;
558	if (iter->__iov != fast_iov) {
559	iov_off = iter_iov(iter) - fast_iov;
560	io->s.iter.__iov += iov_off;
561	}
562	if (io->s.fast_iov != fast_iov)
563	memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
564	sizeof(struct iovec) * iter->nr_segs);
565	} else {
566	req->flags \|= REQ_F_NEED_CLEANUP;
567	}
568	}
569
570	static int io_setup_async_rw(struct io_kiocb req, const* struct iovec *iovec,
571	struct io_rw_state *s, bool force)
572	{
573	if (!force && !io_cold_defs[req->opcode].prep_async)
574	return `0`;
575	/ opcode type doesn't need async data /
576	if (!io_cold_defs[req->opcode].async_size)
577	return `0`;
578	if (!req_has_async_data(req)) {
579	struct io_async_rw *iorw;
580
581	if (io_alloc_async_data(req)) {
582	kfree(objp: iovec);
583	return -ENOMEM;
584	}
585
586	io_req_map_rw(req, iovec, fast_iov: s->fast_iov, iter: &s->iter);
587	iorw = req->async_data;
588	/ we've copied and mapped the iter, ensure state is saved /
589	iov_iter_save_state(iter: &iorw->s.iter, state: &iorw->s.iter_state);
590	}
591	return `0`;
592	}
593
594	static inline int io_rw_prep_async(struct io_kiocb req, int* rw)
595	{
596	struct io_async_rw *iorw = req->async_data;
597	struct iovec *iov;
598	int ret;
599
600	iorw->bytes_done = `0`;
601	iorw->free_iovec = NULL;
602
603	/ submission path, ->uring_lock should already be taken /
604	ret = io_import_iovec(rw, req, iovec: &iov, s: &iorw->s, issue_flags: `0`);
605	if (unlikely(ret < `0`))
606	return ret;
607
608	if (iov) {
609	iorw->free_iovec = iov;
610	req->flags \|= REQ_F_NEED_CLEANUP;
611	}
612
613	return `0`;
614	}
615
616	int io_readv_prep_async(struct io_kiocb *req)
617	{
618	return io_rw_prep_async(req, ITER_DEST);
619	}
620
621	int io_writev_prep_async(struct io_kiocb *req)
622	{
623	return io_rw_prep_async(req, ITER_SOURCE);
624	}
625
626	/*
627	* This is our waitqueue callback handler, registered through __folio_lock_async()
628	* when we initially tried to do the IO with the iocb armed our waitqueue.
629	* This gets called when the page is unlocked, and we generally expect that to
630	* happen when the page IO is completed and the page is now uptodate. This will
631	* queue a task_work based retry of the operation, attempting to copy the data
632	* again. If the latter fails because the page was NOT uptodate, then we will
633	* do a thread based blocking retry of the operation. That's the unexpected
634	* slow path.
635	*/
636	static int io_async_buf_func(struct wait_queue_entry wait, unsigned* mode,
637	int sync, void *arg)
638	{
639	struct wait_page_queue *wpq;
640	struct io_kiocb *req = wait->private;
641	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
642	struct wait_page_key *key = arg;
643
644	wpq = container_of(wait, struct wait_page_queue, wait);
645
646	if (!wake_page_match(wait_page: wpq, key))
647	return `0`;
648
649	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
650	list_del_init(entry: &wait->entry);
651	io_req_task_queue(req);
652	return `1`;
653	}
654
655	/*
656	* This controls whether a given IO request should be armed for async page
657	* based retry. If we return false here, the request is handed to the async
658	* worker threads for retry. If we're doing buffered reads on a regular file,
659	* we prepare a private wait_page_queue entry and retry the operation. This
660	* will either succeed because the page is now uptodate and unlocked, or it
661	* will register a callback when the page is unlocked at IO completion. Through
662	* that callback, io_uring uses task_work to setup a retry of the operation.
663	* That retry will attempt the buffered read again. The retry will generally
664	* succeed, or in rare cases where it fails, we then fall back to using the
665	* async worker threads for a blocking retry.
666	*/
667	static bool io_rw_should_retry(struct io_kiocb *req)
668	{
669	struct io_async_rw *io = req->async_data;
670	struct wait_page_queue *wait = &io->wpq;
671	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
672	struct kiocb *kiocb = &rw->kiocb;
673
674	/ never retry for NOWAIT, we just complete with -EAGAIN /
675	if (req->flags & REQ_F_NOWAIT)
676	return false;
677
678	/ Only for buffered IO /
679	if (kiocb->ki_flags & (IOCB_DIRECT \| IOCB_HIPRI))
680	return false;
681
682	/*
683	* just use poll if we can, and don't attempt if the fs doesn't
684	* support callback based unlocks
685	*/
686	if (io_file_can_poll(req) \|\| !(req->file->f_mode & FMODE_BUF_RASYNC))
687	return false;
688
689	wait->wait.func = io_async_buf_func;
690	wait->wait.private = req;
691	wait->wait.flags = `0`;
692	INIT_LIST_HEAD(list: &wait->wait.entry);
693	kiocb->ki_flags \|= IOCB_WAITQ;
694	kiocb->ki_flags &= ~IOCB_NOWAIT;
695	kiocb->ki_waitq = wait;
696	return true;
697	}
698
699	static inline int io_iter_do_read(struct io_rw rw, struct* iov_iter *iter)
700	{
701	struct file *file = rw->kiocb.ki_filp;
702
703	if (likely(file->f_op->read_iter))
704	return call_read_iter(file, kio: &rw->kiocb, iter);
705	else if (file->f_op->read)
706	return loop_rw_iter(READ, rw, iter);
707	else
708	return -EINVAL;
709	}
710
711	static bool need_complete_io(struct io_kiocb *req)
712	{
713	return req->flags & REQ_F_ISREG \|\|
714	S_ISBLK(file_inode(req->file)->i_mode);
715	}
716
717	static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
718	{
719	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
720	struct kiocb *kiocb = &rw->kiocb;
721	struct io_ring_ctx *ctx = req->ctx;
722	struct file *file = req->file;
723	int ret;
724
725	if (unlikely(!(file->f_mode & mode)))
726	return -EBADF;
727
728	if (!(req->flags & REQ_F_FIXED_FILE))
729	req->flags \|= io_file_get_flags(file);
730
731	kiocb->ki_flags = file->f_iocb_flags;
732	ret = kiocb_set_rw_flags(ki: kiocb, flags: rw->flags);
733	if (unlikely(ret))
734	return ret;
735	kiocb->ki_flags \|= IOCB_ALLOC_CACHE;
736
737	/*
738	* If the file is marked O_NONBLOCK, still allow retry for it if it
739	* supports async. Otherwise it's impossible to use O_NONBLOCK files
740	* reliably. If not, or it IOCB_NOWAIT is set, don't retry.
741	*/
742	if ((kiocb->ki_flags & IOCB_NOWAIT) \|\|
743	((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
744	req->flags \|= REQ_F_NOWAIT;
745
746	if (ctx->flags & IORING_SETUP_IOPOLL) {
747	if (!(kiocb->ki_flags & IOCB_DIRECT) \|\| !file->f_op->iopoll)
748	return -EOPNOTSUPP;
749
750	kiocb->private = NULL;
751	kiocb->ki_flags \|= IOCB_HIPRI;
752	kiocb->ki_complete = io_complete_rw_iopoll;
753	req->iopoll_completed = `0`;
754	} else {
755	if (kiocb->ki_flags & IOCB_HIPRI)
756	return -EINVAL;
757	kiocb->ki_complete = io_complete_rw;
758	}
759
760	return `0`;
761	}
762
763	static int __io_read(struct io_kiocb req, unsigned* int issue_flags)
764	{
765	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
766	struct io_rw_state __s, *s = &__s;
767	struct iovec *iovec;
768	struct kiocb *kiocb = &rw->kiocb;
769	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
770	struct io_async_rw *io;
771	ssize_t ret, ret2;
772	loff_t *ppos;
773
774	if (!req_has_async_data(req)) {
775	ret = io_import_iovec(ITER_DEST, req, iovec: &iovec, s, issue_flags);
776	if (unlikely(ret < `0`))
777	return ret;
778	} else {
779	io = req->async_data;
780	s = &io->s;
781
782	/*
783	* Safe and required to re-import if we're using provided
784	* buffers, as we dropped the selected one before retry.
785	*/
786	if (io_do_buffer_select(req)) {
787	ret = io_import_iovec(ITER_DEST, req, iovec: &iovec, s, issue_flags);
788	if (unlikely(ret < `0`))
789	return ret;
790	}
791
792	/*
793	* We come here from an earlier attempt, restore our state to
794	* match in case it doesn't. It's cheap enough that we don't
795	* need to make this conditional.
796	*/
797	iov_iter_restore(i: &s->iter, state: &s->iter_state);
798	iovec = NULL;
799	}
800	ret = io_rw_init_file(req, FMODE_READ);
801	if (unlikely(ret)) {
802	kfree(objp: iovec);
803	return ret;
804	}
805	req->cqe.res = iov_iter_count(i: &s->iter);
806
807	if (force_nonblock) {
808	/ If the file doesn't support async, just async punt /
809	if (unlikely(!io_file_supports_nowait(req))) {
810	ret = io_setup_async_rw(req, iovec, s, force: true);
811	return ret ?: -EAGAIN;
812	}
813	kiocb->ki_flags \|= IOCB_NOWAIT;
814	} else {
815	/ Ensure we clear previously set non-block flag /
816	kiocb->ki_flags &= ~IOCB_NOWAIT;
817	}
818
819	ppos = io_kiocb_update_pos(req);
820
821	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
822	if (unlikely(ret)) {
823	kfree(objp: iovec);
824	return ret;
825	}
826
827	ret = io_iter_do_read(rw, iter: &s->iter);
828
829	if (ret == -EAGAIN \|\| (req->flags & REQ_F_REISSUE)) {
830	req->flags &= ~REQ_F_REISSUE;
831	/*
832	* If we can poll, just do that. For a vectored read, we'll
833	* need to copy state first.
834	*/
835	if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
836	return -EAGAIN;
837	/ IOPOLL retry should happen for io-wq threads /
838	if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
839	goto done;
840	/ no retry on NONBLOCK nor RWF_NOWAIT /
841	if (req->flags & REQ_F_NOWAIT)
842	goto done;
843	ret = `0`;
844	} else if (ret == -EIOCBQUEUED) {
845	if (iovec)
846	kfree(objp: iovec);
847	return IOU_ISSUE_SKIP_COMPLETE;
848	} else if (ret == req->cqe.res \|\| ret <= `0` \|\| !force_nonblock \|\|
849	(req->flags & REQ_F_NOWAIT) \|\| !need_complete_io(req)) {
850	/ read all, failed, already did sync or don't want to retry /
851	goto done;
852	}
853
854	/*
855	* Don't depend on the iter state matching what was consumed, or being
856	* untouched in case of error. Restore it and we'll advance it
857	* manually if we need to.
858	*/
859	iov_iter_restore(i: &s->iter, state: &s->iter_state);
860
861	ret2 = io_setup_async_rw(req, iovec, s, force: true);
862	iovec = NULL;
863	if (ret2) {
864	ret = ret > `0` ? ret : ret2;
865	goto done;
866	}
867
868	io = req->async_data;
869	s = &io->s;
870	/*
871	* Now use our persistent iterator and state, if we aren't already.
872	* We've restored and mapped the iter to match.
873	*/
874
875	do {
876	/*
877	* We end up here because of a partial read, either from
878	* above or inside this loop. Advance the iter by the bytes
879	* that were consumed.
880	*/
881	iov_iter_advance(i: &s->iter, bytes: ret);
882	if (!iov_iter_count(i: &s->iter))
883	break;
884	io->bytes_done += ret;
885	iov_iter_save_state(iter: &s->iter, state: &s->iter_state);
886
887	/ if we can retry, do so with the callbacks armed /
888	if (!io_rw_should_retry(req)) {
889	kiocb->ki_flags &= ~IOCB_WAITQ;
890	return -EAGAIN;
891	}
892
893	req->cqe.res = iov_iter_count(i: &s->iter);
894	/*
895	* Now retry read with the IOCB_WAITQ parts set in the iocb. If
896	* we get -EIOCBQUEUED, then we'll get a notification when the
897	* desired page gets unlocked. We can also get a partial read
898	* here, and if we do, then just retry at the new offset.
899	*/
900	ret = io_iter_do_read(rw, iter: &s->iter);
901	if (ret == -EIOCBQUEUED)
902	return IOU_ISSUE_SKIP_COMPLETE;
903	/ we got some bytes, but not all. retry. /
904	kiocb->ki_flags &= ~IOCB_WAITQ;
905	iov_iter_restore(i: &s->iter, state: &s->iter_state);
906	} while (ret > `0`);
907	done:
908	/ it's faster to check here then delegate to kfree /
909	if (iovec)
910	kfree(objp: iovec);
911	return ret;
912	}
913
/*
 * Issue handler for reads: run __io_read() and hand non-negative results to
 * kiocb_done(); negative results are returned unchanged.
 */
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = __io_read(req, issue_flags);
	if (ret >= 0)
		return kiocb_done(req, ret, issue_flags);

	return ret;
}
924
925	int io_read_mshot(struct io_kiocb req, unsigned* int issue_flags)
926	{
927	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
928	unsigned int cflags = `0`;
929	int ret;
930
931	/*
932	* Multishot MUST be used on a pollable file
933	*/
934	if (!io_file_can_poll(req))
935	return -EBADFD;
936
937	ret = __io_read(req, issue_flags);
938
939	/*
940	* If the file doesn't support proper NOWAIT, then disable multishot
941	* and stay in single shot mode.
942	*/
943	if (!io_file_supports_nowait(req))
944	req->flags &= ~REQ_F_APOLL_MULTISHOT;
945
946	/*
947	* If we get -EAGAIN, recycle our buffer and just let normal poll
948	* handling arm it.
949	*/
950	if (ret == -EAGAIN) {
951	/*
952	* Reset rw->len to 0 again to avoid clamping future mshot
953	* reads, in case the buffer size varies.
954	*/
955	if (io_kbuf_recycle(req, issue_flags))
956	rw->len = `0`;
957	if (issue_flags & IO_URING_F_MULTISHOT)
958	return IOU_ISSUE_SKIP_COMPLETE;
959	return -EAGAIN;
960	}
961
962	/*
963	* Any successful return value will keep the multishot read armed.
964	*/
965	if (ret > `0` && req->flags & REQ_F_APOLL_MULTISHOT) {
966	/*
967	* Put our buffer and post a CQE. If we fail to post a CQE, then
968	* jump to the termination path. This request is then done.
969	*/
970	cflags = io_put_kbuf(req, issue_flags);
971	rw->len = `0`; / similarly to above, reset len to 0 /
972
973	if (io_fill_cqe_req_aux(req,
974	defer: issue_flags & IO_URING_F_COMPLETE_DEFER,
975	res: ret, cflags: cflags \| IORING_CQE_F_MORE)) {
976	if (issue_flags & IO_URING_F_MULTISHOT) {
977	/*
978	* Force retry, as we might have more data to
979	* be read and otherwise it won't get retried
980	* until (if ever) another poll is triggered.
981	*/
982	io_poll_multishot_retry(req);
983	return IOU_ISSUE_SKIP_COMPLETE;
984	}
985	return -EAGAIN;
986	}
987	}
988
989	/*
990	* Either an error, or we've hit overflow posting the CQE. For any
991	* multishot request, hitting overflow will terminate it.
992	*/
993	io_req_set_res(req, res: ret, cflags);
994	if (issue_flags & IO_URING_F_MULTISHOT)
995	return IOU_STOP_MULTISHOT;
996	return IOU_OK;
997	}
998
999	int io_write(struct io_kiocb req, unsigned* int issue_flags)
1000	{
1001	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
1002	struct io_rw_state __s, *s = &__s;
1003	struct iovec *iovec;
1004	struct kiocb *kiocb = &rw->kiocb;
1005	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1006	ssize_t ret, ret2;
1007	loff_t *ppos;
1008
1009	if (!req_has_async_data(req)) {
1010	ret = io_import_iovec(ITER_SOURCE, req, iovec: &iovec, s, issue_flags);
1011	if (unlikely(ret < `0`))
1012	return ret;
1013	} else {
1014	struct io_async_rw *io = req->async_data;
1015
1016	s = &io->s;
1017	iov_iter_restore(i: &s->iter, state: &s->iter_state);
1018	iovec = NULL;
1019	}
1020	ret = io_rw_init_file(req, FMODE_WRITE);
1021	if (unlikely(ret)) {
1022	kfree(objp: iovec);
1023	return ret;
1024	}
1025	req->cqe.res = iov_iter_count(i: &s->iter);
1026
1027	if (force_nonblock) {
1028	/ If the file doesn't support async, just async punt /
1029	if (unlikely(!io_file_supports_nowait(req)))
1030	goto copy_iov;
1031
1032	/ File path supports NOWAIT for non-direct_IO only for block devices. /
1033	if (!(kiocb->ki_flags & IOCB_DIRECT) &&
1034	!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
1035	(req->flags & REQ_F_ISREG))
1036	goto copy_iov;
1037
1038	kiocb->ki_flags \|= IOCB_NOWAIT;
1039	} else {
1040	/ Ensure we clear previously set non-block flag /
1041	kiocb->ki_flags &= ~IOCB_NOWAIT;
1042	}
1043
1044	ppos = io_kiocb_update_pos(req);
1045
1046	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
1047	if (unlikely(ret)) {
1048	kfree(objp: iovec);
1049	return ret;
1050	}
1051
1052	if (req->flags & REQ_F_ISREG)
1053	kiocb_start_write(iocb: kiocb);
1054	kiocb->ki_flags \|= IOCB_WRITE;
1055
1056	if (likely(req->file->f_op->write_iter))
1057	ret2 = call_write_iter(file: req->file, kio: kiocb, iter: &s->iter);
1058	else if (req->file->f_op->write)
1059	ret2 = loop_rw_iter(WRITE, rw, iter: &s->iter);
1060	else
1061	ret2 = -EINVAL;
1062
1063	if (req->flags & REQ_F_REISSUE) {
1064	req->flags &= ~REQ_F_REISSUE;
1065	ret2 = -EAGAIN;
1066	}
1067
1068	/*
1069	* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
1070	* retry them without IOCB_NOWAIT.
1071	*/
1072	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
1073	ret2 = -EAGAIN;
1074	/ no retry on NONBLOCK nor RWF_NOWAIT /
1075	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
1076	goto done;
1077	if (!force_nonblock \|\| ret2 != -EAGAIN) {
1078	/ IOPOLL retry should happen for io-wq threads /
1079	if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
1080	goto copy_iov;
1081
1082	if (ret2 != req->cqe.res && ret2 >= `0` && need_complete_io(req)) {
1083	struct io_async_rw *io;
1084
1085	trace_io_uring_short_write(ctx: req->ctx, fpos: kiocb->ki_pos - ret2,
1086	wanted: req->cqe.res, got: ret2);
1087
1088	/ This is a partial write. The file pos has already been*
1089	* updated, setup the async struct to complete the request
1090	* in the worker. Also update bytes_done to account for
1091	* the bytes already written.
1092	*/
1093	iov_iter_save_state(iter: &s->iter, state: &s->iter_state);
1094	ret = io_setup_async_rw(req, iovec, s, force: true);
1095
1096	io = req->async_data;
1097	if (io)
1098	io->bytes_done += ret2;
1099
1100	if (kiocb->ki_flags & IOCB_WRITE)
1101	io_req_end_write(req);
1102	return ret ? ret : -EAGAIN;
1103	}
1104	done:
1105	ret = kiocb_done(req, ret: ret2, issue_flags);
1106	} else {
1107	copy_iov:
1108	iov_iter_restore(i: &s->iter, state: &s->iter_state);
1109	ret = io_setup_async_rw(req, iovec, s, force: false);
1110	if (!ret) {
1111	if (kiocb->ki_flags & IOCB_WRITE)
1112	io_req_end_write(req);
1113	return -EAGAIN;
1114	}
1115	return ret;
1116	}
1117	/ it's reportedly faster than delegating the null check to kfree() /
1118	if (iovec)
1119	kfree(objp: iovec);
1120	return ret;
1121	}
1122
1123	void io_rw_fail(struct io_kiocb *req)
1124	{
1125	int res;
1126
1127	res = io_fixup_rw_res(req, res: req->cqe.res);
1128	io_req_set_res(req, res, cflags: req->cqe.flags);
1129	}
1130
1131	int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
1132	{
1133	struct io_wq_work_node pos, start, *prev;
1134	unsigned int poll_flags = `0`;
1135	DEFINE_IO_COMP_BATCH(iob);
1136	int nr_events = `0`;
1137
1138	/*
1139	* Only spin for completions if we don't have multiple devices hanging
1140	* off our complete list.
1141	*/
1142	if (ctx->poll_multi_queue \|\| force_nonspin)
1143	poll_flags \|= BLK_POLL_ONESHOT;
1144
1145	wq_list_for_each(pos, start, &ctx->iopoll_list) {
1146	struct io_kiocb req = container_of(pos, struct* io_kiocb, comp_list);
1147	struct file *file = req->file;
1148	int ret;
1149
1150	/*
1151	* Move completed and retryable entries to our local lists.
1152	* If we find a request that requires polling, break out
1153	* and complete those lists first, if we have entries there.
1154	*/
1155	if (READ_ONCE(req->iopoll_completed))
1156	break;
1157
1158	if (req->opcode == IORING_OP_URING_CMD) {
1159	struct io_uring_cmd *ioucmd;
1160
1161	ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
1162	ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
1163	poll_flags);
1164	} else {
1165	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
1166
1167	ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
1168	}
1169	if (unlikely(ret < `0`))
1170	return ret;
1171	else if (ret)
1172	poll_flags \|= BLK_POLL_ONESHOT;
1173
1174	/ iopoll may have completed current req /
1175	if (!rq_list_empty(iob.req_list) \|\|
1176	READ_ONCE(req->iopoll_completed))
1177	break;
1178	}
1179
1180	if (!rq_list_empty(iob.req_list))
1181	iob.complete(&iob);
1182	else if (!pos)
1183	return `0`;
1184
1185	prev = start;
1186	wq_list_for_each_resume(pos, prev) {
1187	struct io_kiocb req = container_of(pos, struct* io_kiocb, comp_list);
1188
1189	/ order with io_complete_rw_iopoll(), e.g. ->result updates /
1190	if (!smp_load_acquire(&req->iopoll_completed))
1191	break;
1192	nr_events++;
1193	req->cqe.flags = io_put_kbuf(req, issue_flags: `0`);
1194	}
1195	if (unlikely(!nr_events))
1196	return `0`;
1197
1198	pos = start ? start->next : ctx->iopoll_list.first;
1199	wq_list_cut(list: &ctx->iopoll_list, last: prev, prev: start);
1200
1201	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
1202	return `0`;
1203	ctx->submit_state.compl_reqs.first = pos;
1204	__io_submit_flush_completions(ctx);
1205	return nr_events;
1206	}
1207

source code of linux/io_uring/rw.c