// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

static const struct io_mapped_ubuf dummy_ubuf = {
	/* set invalid range, so io_import_fixed() fails meeting it */
	.ubuf = -1UL,
	.ubuf_end = 0,
};

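/*
 * Charge @nr_pages against the RLIMIT_MEMLOCK budget of @user. The counter
 * is updated with a lockless cmpxchg loop so concurrent registrations can't
 * overshoot the limit; returns -ENOMEM if the new total would exceed it.
 */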
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

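/*
 * Copy the @index'th iovec of the registration array at @arg from userspace,
 * converting from the 32-bit compat layout when the ring was set up by a
 * compat task.
 */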
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
		       void __user *arg, unsigned index)
{
	struct iovec __user *src;

#ifdef CONFIG_COMPAT
	if (ctx->compat) {
		struct compat_iovec __user *ciovs;
		struct compat_iovec ciov;

		ciovs = (struct compat_iovec __user *) arg;
		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
			return -EFAULT;

		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
		dst->iov_len = ciov.iov_len;
		return 0;
	}
#endif
	src = (struct iovec __user *) arg;
	if (copy_from_user(dst, &src[index], sizeof(*dst)))
		return -EFAULT;
	return 0;
}

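/*
 * Minimal sanity checks on a buffer about to be registered: a NULL base is
 * only allowed for an empty (sparse) entry, the length is capped at 1GB, and
 * base plus the page-rounded length must not wrap the address space.
 */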
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
{
	struct io_mapped_ubuf *imu = *slot;
	unsigned int i;

	if (imu != &dummy_ubuf) {
		for (i = 0; i < imu->nr_bvecs; i++)
			unpin_user_page(imu->bvec[i].bv_page);
		if (imu->acct_pages)
			io_unaccount_mem(ctx, imu->acct_pages);
		kvfree(imu);
	}
	*slot = NULL;
}

static void io_rsrc_put_work(struct io_rsrc_node *node)
{
	struct io_rsrc_put *prsrc = &node->item;

	if (prsrc->tag)
		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(prsrc->file);
		break;
	case IORING_RSRC_BUFFER:
		io_rsrc_buf_put(node->ctx, prsrc);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
}

void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
		kfree(node);
}

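/*
 * Called when a node's reference count drops to zero. Nodes must be retired
 * in list order, so walk rsrc_ref_list from the front and stop at the first
 * node that still has references; drained nodes post their tag CQE, drop
 * their resource and are recycled. Wake up a pending quiesce once the list
 * is empty.
 */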
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
	__must_hold(&node->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = node->ctx;

	while (!list_empty(&ctx->rsrc_ref_list)) {
		node = list_first_entry(&ctx->rsrc_ref_list,
					struct io_rsrc_node, node);
		/* recycle ref nodes in order */
		if (node->refs)
			break;
		list_del(&node->node);

		if (likely(!node->empty))
			io_rsrc_put_work(node);
		io_rsrc_node_destroy(ctx, node);
	}
	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
		wake_up_all(&ctx->rsrc_quiesce_wq);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *ref_node;
	struct io_cache_entry *entry;

	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
	if (entry) {
		ref_node = container_of(entry, struct io_rsrc_node, cache);
	} else {
		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
		if (!ref_node)
			return NULL;
	}

	ref_node->ctx = ctx;
	ref_node->empty = 0;
	ref_node->refs = 1;
	return ref_node;
}

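/*
 * Wait for all resource nodes queued so far to be released. The current
 * node is retired as an empty placeholder and replaced with a fresh one,
 * then we drop ->uring_lock and run task work until rsrc_ref_list drains
 * (or a signal gets in the way). Callers must hold ->uring_lock, and it is
 * held again on return.
 */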
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
				      struct io_ring_ctx *ctx)
{
	struct io_rsrc_node *backup;
	DEFINE_WAIT(we);
	int ret;

	/* As we may drop ->uring_lock, other task may have started quiesce */
	if (data->quiesce)
		return -ENXIO;

	backup = io_rsrc_node_alloc(ctx);
	if (!backup)
		return -ENOMEM;
	ctx->rsrc_node->empty = true;
	ctx->rsrc_node->type = -1;
	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, ctx->rsrc_node);
	ctx->rsrc_node = backup;

	if (list_empty(&ctx->rsrc_ref_list))
		return 0;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	ctx->rsrc_quiesce++;
	data->quiesce = true;
	do {
		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
		mutex_unlock(&ctx->uring_lock);

		ret = io_run_task_work_sig(ctx);
		if (ret < 0) {
			mutex_lock(&ctx->uring_lock);
			if (list_empty(&ctx->rsrc_ref_list))
				ret = 0;
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);
		mutex_lock(&ctx->uring_lock);
		ret = 0;
	} while (!list_empty(&ctx->rsrc_ref_list));

	finish_wait(&ctx->rsrc_quiesce_wq, &we);
	data->quiesce = false;
	ctx->rsrc_quiesce--;

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 0);
		smp_mb();
	}
	return ret;
}

static void io_free_page_table(void **table, size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);

	for (i = 0; i < nr_tables; i++)
		kfree(table[i]);
	kfree(table);
}

static void io_rsrc_data_free(struct io_rsrc_data *data)
{
	size_t size = data->nr * sizeof(data->tags[0][0]);

	if (data->tags)
		io_free_page_table((void **)data->tags, size);
	kfree(data);
}

static __cold void **io_alloc_page_table(size_t size)
{
	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
	size_t init_size = size;
	void **table;

	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
	if (!table)
		return NULL;

	for (i = 0; i < nr_tables; i++) {
		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);

		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
		if (!table[i]) {
			io_free_page_table(table, init_size);
			return NULL;
		}
		size -= this_size;
	}
	return table;
}

__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
				     u64 __user *utags,
				     unsigned nr, struct io_rsrc_data **pdata)
{
	struct io_rsrc_data *data;
	int ret = 0;
	unsigned i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;
	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
	if (!data->tags) {
		kfree(data);
		return -ENOMEM;
	}

	data->nr = nr;
	data->ctx = ctx;
	data->rsrc_type = type;
	if (utags) {
		ret = -EFAULT;
		for (i = 0; i < nr; i++) {
			u64 *tag_slot = io_get_tag_slot(data, i);

			if (copy_from_user(tag_slot, &utags[i],
					   sizeof(*tag_slot)))
				goto fail;
		}
	}
	*pdata = data;
	return 0;
fail:
	io_rsrc_data_free(data);
	return ret;
}

static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	struct io_rsrc_data *data = ctx->file_data;
	struct io_fixed_file *file_slot;
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_files)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
		file_slot = io_fixed_file_slot(&ctx->file_table, i);

		if (file_slot->file_ptr) {
			err = io_queue_rsrc_removal(data, i,
						    io_slot_file(file_slot));
			if (err)
				break;
			file_slot->file_ptr = 0;
			io_file_bitmap_clear(&ctx->file_table, i);
		}
		if (fd != -1) {
			struct file *file = fget(fd);

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			*io_get_tag_slot(data, i) = tag;
			io_fixed_file_set(file_slot, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
	struct page *last_hpage = NULL;
	__u32 done;
	int i, err;

	if (!ctx->buf_data)
		return -ENXIO;
	if (up->offset + nr_args > ctx->nr_user_bufs)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_mapped_ubuf *imu;
		u64 tag = 0;

		err = io_copy_iov(ctx, &iov, iovs, done);
		if (err)
			break;
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(&iov);
		if (err)
			break;
		if (!iov.iov_base && tag) {
			err = -EINVAL;
			break;
		}
		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
		if (err)
			break;

		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
		if (ctx->user_bufs[i] != &dummy_ubuf) {
			err = io_queue_rsrc_removal(ctx->buf_data, i,
						    ctx->user_bufs[i]);
			if (unlikely(err)) {
				io_buffer_unmap(ctx, &imu);
				break;
			}
			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
		}

		ctx->user_bufs[i] = imu;
		*io_get_tag_slot(ctx->buf_data, i) = tag;
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_data)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

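/*
 * Queue @rsrc for deferred release: the resource and its tag are handed to
 * the ctx's current rsrc node, a fresh node is installed in its place, and
 * the old node is put. The actual fput()/buffer unmap happens from
 * io_rsrc_put_work() once the node's last reference goes away.
 */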
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
	struct io_ring_ctx *ctx = data->ctx;
	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);

	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
	if (unlikely(!ctx->rsrc_node)) {
		ctx->rsrc_node = node;
		return -ENOMEM;
	}

	node->item.rsrc = rsrc;
	node->type = data->rsrc_type;
	node->item.tag = *tag_slot;
	*tag_slot = 0;
	list_add_tail(&node->node, &ctx->rsrc_ref_list);
	io_put_rsrc_node(ctx, node);
	return 0;
}

void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	int i;

	for (i = 0; i < ctx->nr_user_files; i++) {
		struct file *file = io_file_from_index(&ctx->file_table, i);

		if (!file)
			continue;
		io_file_bitmap_clear(&ctx->file_table, i);
		fput(file);
	}

	io_free_file_tables(&ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	io_rsrc_data_free(ctx->file_data);
	ctx->file_data = NULL;
	ctx->nr_user_files = 0;
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_files;
	int ret;

	if (!ctx->file_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_files = 0;
	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
	ctx->nr_user_files = nr;
	if (!ret)
		__io_sqe_files_unregister(ctx);
	return ret;
}

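/*
 * Register an array of file descriptors as the ring's fixed file table
 * (IORING_REGISTER_FILES). Sparse sets are allowed: a NULL @arg or an fd of
 * -1 leaves the slot empty. From userspace this roughly corresponds to a
 * liburing call along these lines (illustrative sketch only, not part of
 * this file):
 *
 *	int fds[2] = { sock_fd, -1 };
 *	io_uring_register_files(&ring, fds, 2);
 */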
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
				 &ctx->file_data);
	if (ret)
		return ret;

	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
		io_rsrc_data_free(ctx->file_data);
		ctx->file_data = NULL;
		return -ENOMEM;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct io_fixed_file *file_slot;

		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto fail;
		}
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		file_slot = io_fixed_file_slot(&ctx->file_table, i);
		io_fixed_file_set(file_slot, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
	return 0;
fail:
	__io_sqe_files_unregister(ctx);
	return ret;
}

static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	io_buffer_unmap(ctx, &prsrc->buf);
	prsrc->buf = NULL;
}

void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned int i;

	for (i = 0; i < ctx->nr_user_bufs; i++)
		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
	kfree(ctx->user_bufs);
	io_rsrc_data_free(ctx->buf_data);
	ctx->user_bufs = NULL;
	ctx->buf_data = NULL;
	ctx->nr_user_bufs = 0;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	unsigned nr = ctx->nr_user_bufs;
	int ret;

	if (!ctx->buf_data)
		return -ENXIO;

	/*
	 * Quiesce may unlock ->uring_lock, and while it's not held
	 * prevent new requests using the table.
	 */
	ctx->nr_user_bufs = 0;
	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
	ctx->nr_user_bufs = nr;
	if (!ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

/*
 * Not super efficient, but this only happens at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *imu = ctx->user_bufs[i];

		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

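/*
 * Work out how many pages to charge for this buffer. Normal pages count as
 * one each; for a compound (huge) page the full huge page is charged once,
 * skipping heads already seen via @last_hpage or headpage_already_acct().
 */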
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

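/*
 * Pin the user memory range [@ubuf, @ubuf + @len) with FOLL_LONGTERM and
 * return the page array (or an ERR_PTR). A partial pin is treated as
 * failure: whatever was pinned is released and -EFAULT is returned.
 */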
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
	unsigned long start, end, nr_pages;
	struct page **pages = NULL;
	int ret;

	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	start = ubuf >> PAGE_SHIFT;
	nr_pages = end - start;
	WARN_ON(!nr_pages);

	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	mmap_read_lock(current->mm);
	ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
	mmap_read_unlock(current->mm);

	/* success, mapped all pages */
	if (ret == nr_pages) {
		*npages = nr_pages;
		return pages;
	}

	/* partial map, or didn't map anything */
	if (ret >= 0) {
		/* if we did partial map, release any pages we did get */
		if (ret)
			unpin_user_pages(pages, ret);
		ret = -EFAULT;
	}
	kvfree(pages);
	return ERR_PTR(ret);
}

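/*
 * Pin and map a single registered buffer described by @iov into an
 * io_mapped_ubuf. A NULL iov_base installs the dummy entry for a sparse
 * slot. Pages that all sit in one folio (e.g. a huge page) are collapsed
 * into a single bvec; otherwise one bvec per page is set up.
 */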
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
				  struct io_mapped_ubuf **pimu,
				  struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct folio *folio = NULL;

	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
	if (!iov->iov_base)
		return 0;

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's a huge page, try to coalesce them into a single bvec entry */
	if (nr_pages > 1) {
		folio = page_folio(pages[0]);
		for (i = 1; i < nr_pages; i++) {
			/*
			 * Pages must be consecutive and on the same folio for
			 * this to work
			 */
			if (page_folio(pages[i]) != folio ||
			    pages[i] != pages[i - 1] + 1) {
				folio = NULL;
				break;
			}
		}
		if (folio) {
			/*
			 * The pages are bound to the folio, it doesn't
			 * actually unpin them but drops all but one reference,
			 * which is usually put down by io_buffer_unmap().
			 * Note, needs a better helper.
			 */
			unpin_user_pages(&pages[1], nr_pages - 1);
			nr_pages = 1;
		}
	}

	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
	if (!imu)
		goto done;

	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->ubuf_end = imu->ubuf + iov->iov_len;
	imu->nr_bvecs = nr_pages;
	*pimu = imu;
	ret = 0;

	if (folio) {
		bvec_set_page(&imu->bvec[0], pages[0], size, off);
		goto done;
	}
	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, PAGE_SIZE - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret)
		kvfree(imu);
	kvfree(pages);
	return ret;
}

static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
	return ctx->user_bufs ? 0 : -ENOMEM;
}

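/*
 * Register up to IORING_MAX_REG_BUFFERS user buffers (IORING_REGISTER_BUFFERS
 * and friends). With a NULL @arg the table is created sparse, to be filled in
 * later via buffer updates. From userspace this is roughly (illustrative
 * liburing-style sketch, not part of this file):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *	io_uring_register_buffers(&ring, &iov, 1);
 */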
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data *data;
	int i, ret;
	struct iovec iov;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
	if (ret)
		return ret;
	ret = io_buffers_map_alloc(ctx, nr_args);
	if (ret) {
		io_rsrc_data_free(data);
		return ret;
	}

	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
		if (arg) {
			ret = io_copy_iov(ctx, &iov, arg, i);
			if (ret)
				break;
			ret = io_buffer_validate(&iov);
			if (ret)
				break;
		} else {
			memset(&iov, 0, sizeof(iov));
		}

		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
			ret = -EINVAL;
			break;
		}

		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
					     &last_hpage);
		if (ret)
			break;
	}

	WARN_ON_ONCE(ctx->buf_data);

	ctx->buf_data = data;
	if (ret)
		__io_sqe_buffers_unregister(ctx);
	return ret;
}

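/*
 * Set up @iter to read/write @len bytes at @buf_addr from a previously
 * registered buffer. The requested range must lie fully inside the mapped
 * region; the iterator is then positioned at the right bvec and offset
 * without paying the cost of iov_iter_advance() on large buffers.
 */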
int io_import_fixed(int ddir, struct iov_iter *iter,
		    struct io_mapped_ubuf *imu,
		    u64 buf_addr, size_t len)
{
	u64 buf_end;
	size_t offset;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset < bvec->bv_len) {
			/*
			 * Note, huge page buffers consist of one large
			 * bvec entry and should always go this way. The other
			 * branch doesn't expect non PAGE_SIZE'd chunks.
			 */
			iter->bvec = bvec;
			iter->nr_segs = bvec->bv_len;
			iter->count -= offset;
			iter->iov_offset = offset;
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}