1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include <linux/export.h> |
3 | #include <linux/bvec.h> |
4 | #include <linux/fault-inject-usercopy.h> |
5 | #include <linux/uio.h> |
6 | #include <linux/pagemap.h> |
7 | #include <linux/highmem.h> |
8 | #include <linux/slab.h> |
9 | #include <linux/vmalloc.h> |
10 | #include <linux/splice.h> |
11 | #include <linux/compat.h> |
12 | #include <linux/scatterlist.h> |
13 | #include <linux/instrumented.h> |
14 | #include <linux/iov_iter.h> |
15 | |
16 | static __always_inline |
17 | size_t copy_to_user_iter(void __user *iter_to, size_t progress, |
18 | size_t len, void *from, void *priv2) |
19 | { |
20 | if (should_fail_usercopy()) |
21 | return len; |
22 | if (access_ok(iter_to, len)) { |
23 | from += progress; |
24 | instrument_copy_to_user(iter_to, from, len); |
25 | len = raw_copy_to_user(iter_to, from, len); |
26 | } |
27 | return len; |
28 | } |
29 | |
30 | static __always_inline |
31 | size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, |
32 | size_t len, void *from, void *priv2) |
33 | { |
34 | ssize_t res; |
35 | |
36 | if (should_fail_usercopy()) |
37 | return len; |
38 | |
39 | from += progress; |
40 | res = copy_to_user_nofault(iter_to, from, len); |
41 | return res < 0 ? len : res; |
42 | } |
43 | |
44 | static __always_inline |
45 | size_t copy_from_user_iter(void __user *iter_from, size_t progress, |
46 | size_t len, void *to, void *priv2) |
47 | { |
48 | size_t res = len; |
49 | |
50 | if (should_fail_usercopy()) |
51 | return len; |
52 | if (access_ok(iter_from, len)) { |
53 | to += progress; |
54 | instrument_copy_from_user_before(to, iter_from, len); |
55 | res = raw_copy_from_user(to, iter_from, len); |
56 | instrument_copy_from_user_after(to, iter_from, len, res); |
57 | } |
58 | return res; |
59 | } |
60 | |
61 | static __always_inline |
62 | size_t memcpy_to_iter(void *iter_to, size_t progress, |
63 | size_t len, void *from, void *priv2) |
64 | { |
65 | memcpy(iter_to, from + progress, len); |
66 | return 0; |
67 | } |
68 | |
69 | static __always_inline |
70 | size_t memcpy_from_iter(void *iter_from, size_t progress, |
71 | size_t len, void *to, void *priv2) |
72 | { |
73 | memcpy(to + progress, iter_from, len); |
74 | return 0; |
75 | } |
76 | |
77 | /* |
78 | * fault_in_iov_iter_readable - fault in iov iterator for reading |
79 | * @i: iterator |
80 | * @size: maximum length |
81 | * |
82 | * Fault in one or more iovecs of the given iov_iter, to a maximum length of |
83 | * @size. For each iovec, fault in each page that constitutes the iovec. |
84 | * |
85 | * Returns the number of bytes not faulted in (like copy_to_user() and |
86 | * copy_from_user()). |
87 | * |
88 | * Always returns 0 for non-userspace iterators. |
89 | */ |
90 | size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) |
91 | { |
92 | if (iter_is_ubuf(i)) { |
93 | size_t n = min(size, iov_iter_count(i)); |
94 | n -= fault_in_readable(i->ubuf + i->iov_offset, n); |
95 | return size - n; |
96 | } else if (iter_is_iovec(i)) { |
97 | size_t count = min(size, iov_iter_count(i)); |
98 | const struct iovec *p; |
99 | size_t skip; |
100 | |
101 | size -= count; |
102 | for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { |
103 | size_t len = min(count, p->iov_len - skip); |
104 | size_t ret; |
105 | |
106 | if (unlikely(!len)) |
107 | continue; |
108 | ret = fault_in_readable(p->iov_base + skip, len); |
109 | count -= len - ret; |
110 | if (ret) |
111 | break; |
112 | } |
113 | return count + size; |
114 | } |
115 | return 0; |
116 | } |
117 | EXPORT_SYMBOL(fault_in_iov_iter_readable); |
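/*
 * Usage sketch (illustrative only, not part of this file): buffered-write
 * paths typically pre-fault the source iterator before taking page locks,
 * then copy with page faults disabled and retry on a short copy.  The
 * variables below (from, bytes) are hypothetical.
 *
 *	if (unlikely(fault_in_iov_iter_readable(from, bytes) == bytes))
 *		return -EFAULT;		// not a single byte could be faulted in
 *	// ...lock the destination page, copy with pagefaults disabled,
 *	// shrink 'bytes' and retry if the copy came up short...
 */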
118 | |
119 | /* |
120 | * fault_in_iov_iter_writeable - fault in iov iterator for writing |
121 | * @i: iterator |
122 | * @size: maximum length |
123 | * |
124 | * Faults in the iterator using get_user_pages(), i.e., without triggering |
125 | * hardware page faults. This is primarily useful when we already know that |
126 | * some or all of the pages in @i aren't in memory. |
127 | * |
128 | * Returns the number of bytes not faulted in, like copy_to_user() and |
129 | * copy_from_user(). |
130 | * |
131 | * Always returns 0 for non-user-space iterators. |
132 | */ |
133 | size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) |
134 | { |
135 | if (iter_is_ubuf(i)) { |
136 | size_t n = min(size, iov_iter_count(i)); |
137 | n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); |
138 | return size - n; |
139 | } else if (iter_is_iovec(i)) { |
140 | size_t count = min(size, iov_iter_count(i)); |
141 | const struct iovec *p; |
142 | size_t skip; |
143 | |
144 | size -= count; |
145 | for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { |
146 | size_t len = min(count, p->iov_len - skip); |
147 | size_t ret; |
148 | |
149 | if (unlikely(!len)) |
150 | continue; |
151 | ret = fault_in_safe_writeable(p->iov_base + skip, len); |
152 | count -= len - ret; |
153 | if (ret) |
154 | break; |
155 | } |
156 | return count + size; |
157 | } |
158 | return 0; |
159 | } |
160 | EXPORT_SYMBOL(fault_in_iov_iter_writeable); |
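/*
 * Usage sketch (illustrative only, not part of this file): read paths that
 * copy to a user iterator with page faults disabled can call this after a
 * short copy to make the destination resident, then retry:
 *
 *	if (fault_in_iov_iter_writeable(to, len) == len)
 *		return -EFAULT;		// nothing could be made resident
 *	// otherwise loop back and retry the nofault copy_to_iter()
 */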
161 | |
162 | void iov_iter_init(struct iov_iter *i, unsigned int direction, |
163 | const struct iovec *iov, unsigned long nr_segs, |
164 | size_t count) |
165 | { |
166 | WARN_ON(direction & ~(READ | WRITE)); |
167 | *i = (struct iov_iter) { |
168 | .iter_type = ITER_IOVEC, |
169 | .copy_mc = false, |
170 | .nofault = false, |
171 | .data_source = direction, |
172 | .__iov = iov, |
173 | .nr_segs = nr_segs, |
174 | .iov_offset = 0, |
175 | .count = count |
176 | }; |
177 | } |
178 | EXPORT_SYMBOL(iov_iter_init); |
179 | |
180 | size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) |
181 | { |
182 | if (WARN_ON_ONCE(i->data_source)) |
183 | return 0; |
184 | if (user_backed_iter(i)) |
185 | might_fault(); |
186 | return iterate_and_advance(i, bytes, (void *)addr, |
187 | copy_to_user_iter, memcpy_to_iter); |
188 | } |
189 | EXPORT_SYMBOL(_copy_to_iter); |
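/*
 * Usage sketch (illustrative only, not part of this file): most callers go
 * through the copy_to_iter() wrapper in <linux/uio.h>, which ends up here.
 * struct foo_stats and the surrounding handler are hypothetical.
 *
 *	struct foo_stats st = { .version = 1 };
 *
 *	if (copy_to_iter(&st, sizeof(st), iter) != sizeof(st))
 *		return -EFAULT;		// short copy: bad user address or count
 */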
190 | |
191 | #ifdef CONFIG_ARCH_HAS_COPY_MC |
192 | static __always_inline |
193 | size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, |
194 | size_t len, void *from, void *priv2) |
195 | { |
196 | if (access_ok(iter_to, len)) { |
197 | from += progress; |
198 | instrument_copy_to_user(iter_to, from, len); |
199 | len = copy_mc_to_user(iter_to, from, len); |
200 | } |
201 | return len; |
202 | } |
203 | |
204 | static __always_inline |
205 | size_t memcpy_to_iter_mc(void *iter_to, size_t progress, |
206 | size_t len, void *from, void *priv2) |
207 | { |
208 | return copy_mc_to_kernel(iter_to, from + progress, len); |
209 | } |
210 | |
211 | /** |
212 | * _copy_mc_to_iter - copy to iter with source memory error exception handling |
213 | * @addr: source kernel address |
214 | * @bytes: total transfer length |
215 | * @i: destination iterator |
216 | * |
217 | * The pmem driver deploys this for the dax operation |
218 | * (dax_copy_to_iter()) for dax reads (bypass page-cache and the |
219 | * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes |
220 | * successfully copied. |
221 | * |
222 | * The main differences between this and typical _copy_to_iter() are: |
223 | * |
224 | * * Typical tail/residue handling after a fault retries the copy |
225 | * byte-by-byte until the fault happens again. Re-triggering machine |
226 | * checks is potentially fatal so the implementation uses source |
227 | * alignment and poison alignment assumptions to avoid re-triggering |
228 | * hardware exceptions. |
229 | * |
230 | * * ITER_KVEC and ITER_BVEC can return short copies. Compare to |
231 | * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. |
232 | * |
233 | * Return: number of bytes copied (may be %0) |
234 | */ |
235 | size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) |
236 | { |
237 | if (WARN_ON_ONCE(i->data_source)) |
238 | return 0; |
239 | if (user_backed_iter(i)) |
240 | might_fault(); |
241 | return iterate_and_advance(i, bytes, (void *)addr, |
242 | copy_to_user_iter_mc, memcpy_to_iter_mc); |
243 | } |
244 | EXPORT_SYMBOL_GPL(_copy_mc_to_iter); |
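/*
 * Usage sketch (illustrative only, not part of this file): pmem/dax read
 * paths use this instead of _copy_to_iter() so that poisoned source memory
 * results in a short copy rather than a fatal machine check.  'pmem_addr'
 * is a hypothetical kernel mapping of persistent memory.
 *
 *	size_t copied = _copy_mc_to_iter(pmem_addr, len, iter);
 *
 *	if (copied != len)
 *		return -EIO;		// part of the source was poisoned
 */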
245 | #endif /* CONFIG_ARCH_HAS_COPY_MC */ |
246 | |
247 | static __always_inline |
248 | size_t memcpy_from_iter_mc(void *iter_from, size_t progress, |
249 | size_t len, void *to, void *priv2) |
250 | { |
251 | return copy_mc_to_kernel(to + progress, iter_from, len); |
252 | } |
253 | |
254 | static size_t __copy_from_iter_mc(void *addr, size_t bytes, struct iov_iter *i) |
255 | { |
256 | if (unlikely(i->count < bytes)) |
257 | bytes = i->count; |
258 | if (unlikely(!bytes)) |
259 | return 0; |
260 | return iterate_bvec(i, bytes, addr, NULL, memcpy_from_iter_mc); |
261 | } |
262 | |
263 | static __always_inline |
264 | size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) |
265 | { |
266 | if (unlikely(iov_iter_is_copy_mc(i))) |
267 | return __copy_from_iter_mc(addr, bytes, i); |
268 | return iterate_and_advance(i, bytes, addr, |
269 | copy_from_user_iter, memcpy_from_iter); |
270 | } |
271 | |
272 | size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) |
273 | { |
274 | if (WARN_ON_ONCE(!i->data_source)) |
275 | return 0; |
276 | |
277 | if (user_backed_iter(i)) |
278 | might_fault(); |
279 | return __copy_from_iter(addr, bytes, i); |
280 | } |
281 | EXPORT_SYMBOL(_copy_from_iter); |
282 | |
283 | static __always_inline |
284 | size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, |
285 | size_t len, void *to, void *priv2) |
286 | { |
287 | return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); |
288 | } |
289 | |
290 | size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) |
291 | { |
292 | if (WARN_ON_ONCE(!i->data_source)) |
293 | return 0; |
294 | |
295 | return iterate_and_advance(i, bytes, addr, |
296 | copy_from_user_iter_nocache, |
297 | memcpy_from_iter); |
298 | } |
299 | EXPORT_SYMBOL(_copy_from_iter_nocache); |
300 | |
301 | #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE |
302 | static __always_inline |
303 | size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, |
304 | size_t len, void *to, void *priv2) |
305 | { |
306 | return __copy_from_user_flushcache(to + progress, iter_from, len); |
307 | } |
308 | |
309 | static __always_inline |
310 | size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, |
311 | size_t len, void *to, void *priv2) |
312 | { |
313 | memcpy_flushcache(to + progress, iter_from, len); |
314 | return 0; |
315 | } |
316 | |
317 | /** |
318 | * _copy_from_iter_flushcache - write destination through cpu cache |
319 | * @addr: destination kernel address |
320 | * @bytes: total transfer length |
321 | * @i: source iterator |
322 | * |
323 | * The pmem driver arranges for filesystem-dax to use this facility via |
324 | * dax_copy_from_iter() for ensuring that writes to persistent memory |
325 | * are flushed through the CPU cache. It is differentiated from |
326 | * _copy_from_iter_nocache() in that it guarantees all data is flushed for |
327 | * all iterator types. _copy_from_iter_nocache() only attempts to |
328 | * bypass the cache for the ITER_IOVEC case, and on some archs may use |
329 | * instructions that strand dirty-data in the cache. |
330 | * |
331 | * Return: number of bytes copied (may be %0) |
332 | */ |
333 | size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) |
334 | { |
335 | if (WARN_ON_ONCE(!i->data_source)) |
336 | return 0; |
337 | |
338 | return iterate_and_advance(i, bytes, addr, |
339 | copy_from_user_iter_flushcache, |
340 | memcpy_from_iter_flushcache); |
341 | } |
342 | EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); |
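/*
 * Usage sketch (illustrative only, not part of this file): a dax write path
 * can use this so the copied data is pushed out of (or written around) the
 * CPU cache and is durable on persistent memory without a separate flush
 * loop.  'pmem_addr' is a hypothetical kernel mapping.
 *
 *	size_t copied = _copy_from_iter_flushcache(pmem_addr, len, iter);
 */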
343 | #endif |
344 | |
345 | static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) |
346 | { |
347 | struct page *head; |
348 | size_t v = n + offset; |
349 | |
350 | /* |
351 | * The general case needs to access the page order in order |
352 | * to compute the page size. |
353 | * However, we mostly deal with order-0 pages and thus can |
354 | * avoid a possible cache line miss for requests that fit all |
355 | * page orders. |
356 | */ |
357 | if (n <= v && v <= PAGE_SIZE) |
358 | return true; |
359 | |
360 | head = compound_head(page); |
361 | v += (page - head) << PAGE_SHIFT; |
362 | |
363 | if (WARN_ON(n > v || v > page_size(head))) |
364 | return false; |
365 | return true; |
366 | } |
367 | |
368 | size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, |
369 | struct iov_iter *i) |
370 | { |
371 | size_t res = 0; |
372 | if (!page_copy_sane(page, offset, bytes)) |
373 | return 0; |
374 | if (WARN_ON_ONCE(i->data_source)) |
375 | return 0; |
376 | page += offset / PAGE_SIZE; // first subpage |
377 | offset %= PAGE_SIZE; |
378 | while (1) { |
379 | void *kaddr = kmap_local_page(page); |
380 | size_t n = min(bytes, (size_t)PAGE_SIZE - offset); |
381 | n = _copy_to_iter(kaddr + offset, n, i); |
382 | kunmap_local(kaddr); |
383 | res += n; |
384 | bytes -= n; |
385 | if (!bytes || !n) |
386 | break; |
387 | offset += n; |
388 | if (offset == PAGE_SIZE) { |
389 | page++; |
390 | offset = 0; |
391 | } |
392 | } |
393 | return res; |
394 | } |
395 | EXPORT_SYMBOL(copy_page_to_iter); |
396 | |
397 | size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, |
398 | struct iov_iter *i) |
399 | { |
400 | size_t res = 0; |
401 | |
402 | if (!page_copy_sane(page, offset, bytes)) |
403 | return 0; |
404 | if (WARN_ON_ONCE(i->data_source)) |
405 | return 0; |
406 | page += offset / PAGE_SIZE; // first subpage |
407 | offset %= PAGE_SIZE; |
408 | while (1) { |
409 | void *kaddr = kmap_local_page(page); |
410 | size_t n = min(bytes, (size_t)PAGE_SIZE - offset); |
411 | |
412 | n = iterate_and_advance(i, n, kaddr + offset, |
413 | copy_to_user_iter_nofault, |
414 | memcpy_to_iter); |
415 | kunmap_local(kaddr); |
416 | res += n; |
417 | bytes -= n; |
418 | if (!bytes || !n) |
419 | break; |
420 | offset += n; |
421 | if (offset == PAGE_SIZE) { |
422 | page++; |
423 | offset = 0; |
424 | } |
425 | } |
426 | return res; |
427 | } |
428 | EXPORT_SYMBOL(copy_page_to_iter_nofault); |
429 | |
430 | size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, |
431 | struct iov_iter *i) |
432 | { |
433 | size_t res = 0; |
434 | if (!page_copy_sane(page, offset, bytes)) |
435 | return 0; |
436 | page += offset / PAGE_SIZE; // first subpage |
437 | offset %= PAGE_SIZE; |
438 | while (1) { |
439 | void *kaddr = kmap_local_page(page); |
440 | size_t n = min(bytes, (size_t)PAGE_SIZE - offset); |
441 | n = _copy_from_iter(kaddr + offset, n, i); |
442 | kunmap_local(kaddr); |
443 | res += n; |
444 | bytes -= n; |
445 | if (!bytes || !n) |
446 | break; |
447 | offset += n; |
448 | if (offset == PAGE_SIZE) { |
449 | page++; |
450 | offset = 0; |
451 | } |
452 | } |
453 | return res; |
454 | } |
455 | EXPORT_SYMBOL(copy_page_from_iter); |
456 | |
457 | static __always_inline |
458 | size_t zero_to_user_iter(void __user *iter_to, size_t progress, |
459 | size_t len, void *priv, void *priv2) |
460 | { |
461 | return clear_user(iter_to, len); |
462 | } |
463 | |
464 | static __always_inline |
465 | size_t zero_to_iter(void *iter_to, size_t progress, |
466 | size_t len, void *priv, void *priv2) |
467 | { |
468 | memset(iter_to, 0, len); |
469 | return 0; |
470 | } |
471 | |
472 | size_t iov_iter_zero(size_t bytes, struct iov_iter *i) |
473 | { |
474 | return iterate_and_advance(i, bytes, NULL, |
475 | zero_to_user_iter, zero_to_iter); |
476 | } |
477 | EXPORT_SYMBOL(iov_iter_zero); |
478 | |
479 | size_t copy_page_from_iter_atomic(struct page *page, size_t offset, |
480 | size_t bytes, struct iov_iter *i) |
481 | { |
482 | size_t n, copied = 0; |
483 | |
484 | if (!page_copy_sane(page, offset, bytes)) |
485 | return 0; |
486 | if (WARN_ON_ONCE(!i->data_source)) |
487 | return 0; |
488 | |
489 | do { |
490 | char *p; |
491 | |
492 | n = bytes - copied; |
493 | if (PageHighMem(page)) { |
494 | page += offset / PAGE_SIZE; |
495 | offset %= PAGE_SIZE; |
496 | n = min_t(size_t, n, PAGE_SIZE - offset); |
497 | } |
498 | |
499 | p = kmap_atomic(page) + offset; |
500 | n = __copy_from_iter(p, n, i); |
501 | kunmap_atomic(p); |
502 | copied += n; |
503 | offset += n; |
504 | } while (PageHighMem(page) && copied != bytes && n > 0); |
505 | |
506 | return copied; |
507 | } |
508 | EXPORT_SYMBOL(copy_page_from_iter_atomic); |
509 | |
510 | static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) |
511 | { |
512 | const struct bio_vec *bvec, *end; |
513 | |
514 | if (!i->count) |
515 | return; |
516 | i->count -= size; |
517 | |
518 | size += i->iov_offset; |
519 | |
520 | for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { |
521 | if (likely(size < bvec->bv_len)) |
522 | break; |
523 | size -= bvec->bv_len; |
524 | } |
525 | i->iov_offset = size; |
526 | i->nr_segs -= bvec - i->bvec; |
527 | i->bvec = bvec; |
528 | } |
529 | |
530 | static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) |
531 | { |
532 | const struct iovec *iov, *end; |
533 | |
534 | if (!i->count) |
535 | return; |
536 | i->count -= size; |
537 | |
538 | size += i->iov_offset; // from beginning of current segment |
539 | for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { |
540 | if (likely(size < iov->iov_len)) |
541 | break; |
542 | size -= iov->iov_len; |
543 | } |
544 | i->iov_offset = size; |
545 | i->nr_segs -= iov - iter_iov(i); |
546 | i->__iov = iov; |
547 | } |
548 | |
549 | void iov_iter_advance(struct iov_iter *i, size_t size) |
550 | { |
551 | if (unlikely(i->count < size)) |
552 | size = i->count; |
553 | if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { |
554 | i->iov_offset += size; |
555 | i->count -= size; |
556 | } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { |
557 | /* iovec and kvec have identical layouts */ |
558 | iov_iter_iovec_advance(i, size); |
559 | } else if (iov_iter_is_bvec(i)) { |
560 | iov_iter_bvec_advance(i, size); |
561 | } else if (iov_iter_is_discard(i)) { |
562 | i->count -= size; |
563 | } |
564 | } |
565 | EXPORT_SYMBOL(iov_iter_advance); |
566 | |
567 | void iov_iter_revert(struct iov_iter *i, size_t unroll) |
568 | { |
569 | if (!unroll) |
570 | return; |
571 | if (WARN_ON(unroll > MAX_RW_COUNT)) |
572 | return; |
573 | i->count += unroll; |
574 | if (unlikely(iov_iter_is_discard(i))) |
575 | return; |
576 | if (unroll <= i->iov_offset) { |
577 | i->iov_offset -= unroll; |
578 | return; |
579 | } |
580 | unroll -= i->iov_offset; |
581 | if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { |
582 | BUG(); /* We should never go beyond the start of the specified |
583 | * range since we might then be straying into pages that |
584 | * aren't pinned. |
585 | */ |
586 | } else if (iov_iter_is_bvec(i)) { |
587 | const struct bio_vec *bvec = i->bvec; |
588 | while (1) { |
589 | size_t n = (--bvec)->bv_len; |
590 | i->nr_segs++; |
591 | if (unroll <= n) { |
592 | i->bvec = bvec; |
593 | i->iov_offset = n - unroll; |
594 | return; |
595 | } |
596 | unroll -= n; |
597 | } |
598 | } else { /* same logic for iovec and kvec */ |
599 | const struct iovec *iov = iter_iov(i); |
600 | while (1) { |
601 | size_t n = (--iov)->iov_len; |
602 | i->nr_segs++; |
603 | if (unroll <= n) { |
604 | i->__iov = iov; |
605 | i->iov_offset = n - unroll; |
606 | return; |
607 | } |
608 | unroll -= n; |
609 | } |
610 | } |
611 | } |
612 | EXPORT_SYMBOL(iov_iter_revert); |
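/*
 * Usage sketch (illustrative only, not part of this file): a caller that
 * hands the iterator to a helper which may fail midway can wind it back by
 * exactly the amount that was consumed.  some_send_helper() is hypothetical.
 *
 *	size_t before = iov_iter_count(iter);
 *	int ret = some_send_helper(iter);
 *
 *	if (ret < 0)
 *		iov_iter_revert(iter, before - iov_iter_count(iter));
 */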
613 | |
614 | /* |
615 | * Return the count of just the current iov_iter segment. |
616 | */ |
617 | size_t iov_iter_single_seg_count(const struct iov_iter *i) |
618 | { |
619 | if (i->nr_segs > 1) { |
620 | if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) |
621 | return min(i->count, iter_iov(i)->iov_len - i->iov_offset); |
622 | if (iov_iter_is_bvec(i)) |
623 | return min(i->count, i->bvec->bv_len - i->iov_offset); |
624 | } |
625 | return i->count; |
626 | } |
627 | EXPORT_SYMBOL(iov_iter_single_seg_count); |
628 | |
629 | void iov_iter_kvec(struct iov_iter *i, unsigned int direction, |
630 | const struct kvec *kvec, unsigned long nr_segs, |
631 | size_t count) |
632 | { |
633 | WARN_ON(direction & ~(READ | WRITE)); |
634 | *i = (struct iov_iter){ |
635 | .iter_type = ITER_KVEC, |
636 | .copy_mc = false, |
637 | .data_source = direction, |
638 | .kvec = kvec, |
639 | .nr_segs = nr_segs, |
640 | .iov_offset = 0, |
641 | .count = count |
642 | }; |
643 | } |
644 | EXPORT_SYMBOL(iov_iter_kvec); |
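/*
 * Usage sketch (illustrative only, not part of this file): wrapping a plain
 * kernel buffer so it can be handed to code that expects an iov_iter, e.g.
 * an in-kernel read into 'buf' of 'len' bytes:
 *
 *	struct kvec kv = { .iov_base = buf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, len);
 */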
645 | |
646 | void iov_iter_bvec(struct iov_iter *i, unsigned int direction, |
647 | const struct bio_vec *bvec, unsigned long nr_segs, |
648 | size_t count) |
649 | { |
650 | WARN_ON(direction & ~(READ | WRITE)); |
651 | *i = (struct iov_iter){ |
652 | .iter_type = ITER_BVEC, |
653 | .copy_mc = false, |
654 | .data_source = direction, |
655 | .bvec = bvec, |
656 | .nr_segs = nr_segs, |
657 | .iov_offset = 0, |
658 | .count = count |
659 | }; |
660 | } |
661 | EXPORT_SYMBOL(iov_iter_bvec); |
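/*
 * Usage sketch (illustrative only, not part of this file): describing one
 * page fragment as the source of a transfer:
 *
 *	struct bio_vec bv = {
 *		.bv_page = page, .bv_len = len, .bv_offset = off,
 *	};
 *	struct iov_iter iter;
 *
 *	iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, len);
 */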
662 | |
663 | /** |
664 | * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray |
665 | * @i: The iterator to initialise. |
666 | * @direction: The direction of the transfer. |
667 | * @xarray: The xarray to access. |
668 | * @start: The start file position. |
669 | * @count: The size of the I/O buffer in bytes. |
670 | * |
671 | * Set up an I/O iterator to either draw data out of the pages attached to an |
672 | * inode or to inject data into those pages. The pages *must* be prevented |
673 | * from evaporation, either by taking a ref on them or locking them by the |
674 | * caller. |
675 | */ |
676 | void iov_iter_xarray(struct iov_iter *i, unsigned int direction, |
677 | struct xarray *xarray, loff_t start, size_t count) |
678 | { |
679 | BUG_ON(direction & ~1); |
680 | *i = (struct iov_iter) { |
681 | .iter_type = ITER_XARRAY, |
682 | .copy_mc = false, |
683 | .data_source = direction, |
684 | .xarray = xarray, |
685 | .xarray_start = start, |
686 | .count = count, |
687 | .iov_offset = 0 |
688 | }; |
689 | } |
690 | EXPORT_SYMBOL(iov_iter_xarray); |
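/*
 * Usage sketch (illustrative only, not part of this file): network
 * filesystems use this to transfer data directly against an inode's page
 * cache; the caller must keep the pages from being evicted (refs or locks):
 *
 *	iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, pos, len);
 */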
691 | |
692 | /** |
693 | * iov_iter_discard - Initialise an I/O iterator that discards data |
694 | * @i: The iterator to initialise. |
695 | * @direction: The direction of the transfer. |
696 | * @count: The size of the I/O buffer in bytes. |
697 | * |
698 | * Set up an I/O iterator that just discards everything that's written to it. |
699 | * It's only available as a READ iterator. |
700 | */ |
701 | void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) |
702 | { |
703 | BUG_ON(direction != READ); |
704 | *i = (struct iov_iter){ |
705 | .iter_type = ITER_DISCARD, |
706 | .copy_mc = false, |
707 | .data_source = false, |
708 | .count = count, |
709 | .iov_offset = 0 |
710 | }; |
711 | } |
712 | EXPORT_SYMBOL(iov_iter_discard); |
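/*
 * Usage sketch (illustrative only, not part of this file): draining data the
 * caller does not want, e.g. skipping part of an incoming stream by feeding
 * a discard iterator to the normal receive path:
 *
 *	iov_iter_discard(&iter, READ, bytes_to_skip);
 */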
713 | |
714 | static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, |
715 | unsigned len_mask) |
716 | { |
717 | size_t size = i->count; |
718 | size_t skip = i->iov_offset; |
719 | unsigned k; |
720 | |
721 | for (k = 0; k < i->nr_segs; k++, skip = 0) { |
722 | const struct iovec *iov = iter_iov(i) + k; |
723 | size_t len = iov->iov_len - skip; |
724 | |
725 | if (len > size) |
726 | len = size; |
727 | if (len & len_mask) |
728 | return false; |
729 | if ((unsigned long)(iov->iov_base + skip) & addr_mask) |
730 | return false; |
731 | |
732 | size -= len; |
733 | if (!size) |
734 | break; |
735 | } |
736 | return true; |
737 | } |
738 | |
739 | static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, |
740 | unsigned len_mask) |
741 | { |
742 | size_t size = i->count; |
743 | unsigned skip = i->iov_offset; |
744 | unsigned k; |
745 | |
746 | for (k = 0; k < i->nr_segs; k++, skip = 0) { |
747 | size_t len = i->bvec[k].bv_len - skip; |
748 | |
749 | if (len > size) |
750 | len = size; |
751 | if (len & len_mask) |
752 | return false; |
753 | if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) |
754 | return false; |
755 | |
756 | size -= len; |
757 | if (!size) |
758 | break; |
759 | } |
760 | return true; |
761 | } |
762 | |
763 | /** |
764 | * iov_iter_is_aligned() - Check if the addresses and lengths of each segment |
765 | * are aligned to the parameters. |
766 | * |
767 | * @i: &struct iov_iter to check |
768 | * @addr_mask: bit mask to check against the iov element's addresses |
769 | * @len_mask: bit mask to check against the iov element's lengths |
770 | * |
771 | * Return: false if any addresses or lengths intersect with the provided masks |
772 | */ |
773 | bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, |
774 | unsigned len_mask) |
775 | { |
776 | if (likely(iter_is_ubuf(i))) { |
777 | if (i->count & len_mask) |
778 | return false; |
779 | if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) |
780 | return false; |
781 | return true; |
782 | } |
783 | |
784 | if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) |
785 | return iov_iter_aligned_iovec(i, addr_mask, len_mask); |
786 | |
787 | if (iov_iter_is_bvec(i)) |
788 | return iov_iter_aligned_bvec(i, addr_mask, len_mask); |
789 | |
790 | if (iov_iter_is_xarray(i)) { |
791 | if (i->count & len_mask) |
792 | return false; |
793 | if ((i->xarray_start + i->iov_offset) & addr_mask) |
794 | return false; |
795 | } |
796 | |
797 | return true; |
798 | } |
799 | EXPORT_SYMBOL_GPL(iov_iter_is_aligned); |
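/*
 * Usage sketch (illustrative only, not part of this file): direct-I/O code
 * can validate every segment against the device's alignment requirement in
 * one pass, e.g. for a 512-byte logical block size:
 *
 *	if (!iov_iter_is_aligned(iter, 511, 511))
 *		return -EINVAL;		// some address or length is misaligned
 */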
800 | |
801 | static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) |
802 | { |
803 | unsigned long res = 0; |
804 | size_t size = i->count; |
805 | size_t skip = i->iov_offset; |
806 | unsigned k; |
807 | |
808 | for (k = 0; k < i->nr_segs; k++, skip = 0) { |
809 | const struct iovec *iov = iter_iov(i) + k; |
810 | size_t len = iov->iov_len - skip; |
811 | if (len) { |
812 | res |= (unsigned long)iov->iov_base + skip; |
813 | if (len > size) |
814 | len = size; |
815 | res |= len; |
816 | size -= len; |
817 | if (!size) |
818 | break; |
819 | } |
820 | } |
821 | return res; |
822 | } |
823 | |
824 | static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) |
825 | { |
826 | unsigned res = 0; |
827 | size_t size = i->count; |
828 | unsigned skip = i->iov_offset; |
829 | unsigned k; |
830 | |
831 | for (k = 0; k < i->nr_segs; k++, skip = 0) { |
832 | size_t len = i->bvec[k].bv_len - skip; |
833 | res |= (unsigned long)i->bvec[k].bv_offset + skip; |
834 | if (len > size) |
835 | len = size; |
836 | res |= len; |
837 | size -= len; |
838 | if (!size) |
839 | break; |
840 | } |
841 | return res; |
842 | } |
843 | |
844 | unsigned long iov_iter_alignment(const struct iov_iter *i) |
845 | { |
846 | if (likely(iter_is_ubuf(i))) { |
847 | size_t size = i->count; |
848 | if (size) |
849 | return ((unsigned long)i->ubuf + i->iov_offset) | size; |
850 | return 0; |
851 | } |
852 | |
853 | /* iovec and kvec have identical layouts */ |
854 | if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) |
855 | return iov_iter_alignment_iovec(i); |
856 | |
857 | if (iov_iter_is_bvec(i)) |
858 | return iov_iter_alignment_bvec(i); |
859 | |
860 | if (iov_iter_is_xarray(i)) |
861 | return (i->xarray_start + i->iov_offset) | i->count; |
862 | |
863 | return 0; |
864 | } |
865 | EXPORT_SYMBOL(iov_iter_alignment); |
866 | |
867 | unsigned long iov_iter_gap_alignment(const struct iov_iter *i) |
868 | { |
869 | unsigned long res = 0; |
870 | unsigned long v = 0; |
871 | size_t size = i->count; |
872 | unsigned k; |
873 | |
874 | if (iter_is_ubuf(i)) |
875 | return 0; |
876 | |
877 | if (WARN_ON(!iter_is_iovec(i))) |
878 | return ~0U; |
879 | |
880 | for (k = 0; k < i->nr_segs; k++) { |
881 | const struct iovec *iov = iter_iov(i) + k; |
882 | if (iov->iov_len) { |
883 | unsigned long base = (unsigned long)iov->iov_base; |
884 | if (v) // if not the first one |
885 | res |= base | v; // this start | previous end |
886 | v = base + iov->iov_len; |
887 | if (size <= iov->iov_len) |
888 | break; |
889 | size -= iov->iov_len; |
890 | } |
891 | } |
892 | return res; |
893 | } |
894 | EXPORT_SYMBOL(iov_iter_gap_alignment); |
895 | |
896 | static int want_pages_array(struct page ***res, size_t size, |
897 | size_t start, unsigned int maxpages) |
898 | { |
899 | unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); |
900 | |
901 | if (count > maxpages) |
902 | count = maxpages; |
903 | WARN_ON(!count); // caller should've prevented that |
904 | if (!*res) { |
905 | *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); |
906 | if (!*res) |
907 | return 0; |
908 | } |
909 | return count; |
910 | } |
911 | |
912 | static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, |
913 | pgoff_t index, unsigned int nr_pages) |
914 | { |
915 | XA_STATE(xas, xa, index); |
916 | struct page *page; |
917 | unsigned int ret = 0; |
918 | |
919 | rcu_read_lock(); |
920 | for (page = xas_load(&xas); page; page = xas_next(&xas)) { |
921 | if (xas_retry(&xas, page)) |
922 | continue; |
923 | |
924 | /* Has the page moved or been split? */ |
925 | if (unlikely(page != xas_reload(&xas))) { |
926 | xas_reset(&xas); |
927 | continue; |
928 | } |
929 | |
930 | pages[ret] = find_subpage(page, xas.xa_index); |
931 | get_page(pages[ret]); |
932 | if (++ret == nr_pages) |
933 | break; |
934 | } |
935 | rcu_read_unlock(); |
936 | return ret; |
937 | } |
938 | |
939 | static ssize_t iter_xarray_get_pages(struct iov_iter *i, |
940 | struct page ***pages, size_t maxsize, |
941 | unsigned maxpages, size_t *_start_offset) |
942 | { |
943 | unsigned nr, offset, count; |
944 | pgoff_t index; |
945 | loff_t pos; |
946 | |
947 | pos = i->xarray_start + i->iov_offset; |
948 | index = pos >> PAGE_SHIFT; |
949 | offset = pos & ~PAGE_MASK; |
950 | *_start_offset = offset; |
951 | |
952 | count = want_pages_array(pages, maxsize, offset, maxpages); |
953 | if (!count) |
954 | return -ENOMEM; |
955 | nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); |
956 | if (nr == 0) |
957 | return 0; |
958 | |
959 | maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); |
960 | i->iov_offset += maxsize; |
961 | i->count -= maxsize; |
962 | return maxsize; |
963 | } |
964 | |
965 | /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ |
966 | static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) |
967 | { |
968 | size_t skip; |
969 | long k; |
970 | |
971 | if (iter_is_ubuf(i)) |
972 | return (unsigned long)i->ubuf + i->iov_offset; |
973 | |
974 | for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { |
975 | const struct iovec *iov = iter_iov(i) + k; |
976 | size_t len = iov->iov_len - skip; |
977 | |
978 | if (unlikely(!len)) |
979 | continue; |
980 | if (*size > len) |
981 | *size = len; |
982 | return (unsigned long)iov->iov_base + skip; |
983 | } |
984 | BUG(); // if it had been empty, we wouldn't get called |
985 | } |
986 | |
987 | /* must be done on non-empty ITER_BVEC one */ |
988 | static struct page *first_bvec_segment(const struct iov_iter *i, |
989 | size_t *size, size_t *start) |
990 | { |
991 | struct page *page; |
992 | size_t skip = i->iov_offset, len; |
993 | |
994 | len = i->bvec->bv_len - skip; |
995 | if (*size > len) |
996 | *size = len; |
997 | skip += i->bvec->bv_offset; |
998 | page = i->bvec->bv_page + skip / PAGE_SIZE; |
999 | *start = skip % PAGE_SIZE; |
1000 | return page; |
1001 | } |
1002 | |
1003 | static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, |
1004 | struct page ***pages, size_t maxsize, |
1005 | unsigned int maxpages, size_t *start) |
1006 | { |
1007 | unsigned int n, gup_flags = 0; |
1008 | |
1009 | if (maxsize > i->count) |
1010 | maxsize = i->count; |
1011 | if (!maxsize) |
1012 | return 0; |
1013 | if (maxsize > MAX_RW_COUNT) |
1014 | maxsize = MAX_RW_COUNT; |
1015 | |
1016 | if (likely(user_backed_iter(i))) { |
1017 | unsigned long addr; |
1018 | int res; |
1019 | |
1020 | if (iov_iter_rw(i) != WRITE) |
1021 | gup_flags |= FOLL_WRITE; |
1022 | if (i->nofault) |
1023 | gup_flags |= FOLL_NOFAULT; |
1024 | |
1025 | addr = first_iovec_segment(i, &maxsize); |
1026 | *start = addr % PAGE_SIZE; |
1027 | addr &= PAGE_MASK; |
1028 | n = want_pages_array(pages, maxsize, *start, maxpages); |
1029 | if (!n) |
1030 | return -ENOMEM; |
1031 | res = get_user_pages_fast(addr, n, gup_flags, *pages); |
1032 | if (unlikely(res <= 0)) |
1033 | return res; |
1034 | maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); |
1035 | iov_iter_advance(i, maxsize); |
1036 | return maxsize; |
1037 | } |
1038 | if (iov_iter_is_bvec(i)) { |
1039 | struct page **p; |
1040 | struct page *page; |
1041 | |
1042 | page = first_bvec_segment(i, &maxsize, start); |
1043 | n = want_pages_array(pages, maxsize, *start, maxpages); |
1044 | if (!n) |
1045 | return -ENOMEM; |
1046 | p = *pages; |
1047 | for (int k = 0; k < n; k++) |
1048 | get_page(p[k] = page + k); |
1049 | maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); |
1050 | i->count -= maxsize; |
1051 | i->iov_offset += maxsize; |
1052 | if (i->iov_offset == i->bvec->bv_len) { |
1053 | i->iov_offset = 0; |
1054 | i->bvec++; |
1055 | i->nr_segs--; |
1056 | } |
1057 | return maxsize; |
1058 | } |
1059 | if (iov_iter_is_xarray(i)) |
1060 | return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); |
1061 | return -EFAULT; |
1062 | } |
1063 | |
1064 | ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, |
1065 | size_t maxsize, unsigned maxpages, size_t *start) |
1066 | { |
1067 | if (!maxpages) |
1068 | return 0; |
1069 | BUG_ON(!pages); |
1070 | |
1071 | return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); |
1072 | } |
1073 | EXPORT_SYMBOL(iov_iter_get_pages2); |
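/*
 * Usage sketch (illustrative only, not part of this file): grabbing up to a
 * page worth of the iterator and dropping the page references afterwards
 * (iov_iter_get_pages2() advances the iterator by the amount returned):
 *
 *	struct page *pages[1];
 *	size_t off;
 *	ssize_t n = iov_iter_get_pages2(iter, pages, PAGE_SIZE, 1, &off);
 *
 *	if (n > 0) {
 *		// use pages[0] starting at offset 'off' for 'n' bytes
 *		put_page(pages[0]);
 *	}
 */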
1074 | |
1075 | ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, |
1076 | struct page ***pages, size_t maxsize, size_t *start) |
1077 | { |
1078 | ssize_t len; |
1079 | |
1080 | *pages = NULL; |
1081 | |
1082 | len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); |
1083 | if (len <= 0) { |
1084 | kvfree(*pages); |
1085 | *pages = NULL; |
1086 | } |
1087 | return len; |
1088 | } |
1089 | EXPORT_SYMBOL(iov_iter_get_pages_alloc2); |
1090 | |
1091 | static int iov_npages(const struct iov_iter *i, int maxpages) |
1092 | { |
1093 | size_t skip = i->iov_offset, size = i->count; |
1094 | const struct iovec *p; |
1095 | int npages = 0; |
1096 | |
1097 | for (p = iter_iov(i); size; skip = 0, p++) { |
1098 | unsigned offs = offset_in_page(p->iov_base + skip); |
1099 | size_t len = min(p->iov_len - skip, size); |
1100 | |
1101 | if (len) { |
1102 | size -= len; |
1103 | npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); |
1104 | if (unlikely(npages > maxpages)) |
1105 | return maxpages; |
1106 | } |
1107 | } |
1108 | return npages; |
1109 | } |
1110 | |
1111 | static int bvec_npages(const struct iov_iter *i, int maxpages) |
1112 | { |
1113 | size_t skip = i->iov_offset, size = i->count; |
1114 | const struct bio_vec *p; |
1115 | int npages = 0; |
1116 | |
1117 | for (p = i->bvec; size; skip = 0, p++) { |
1118 | unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; |
1119 | size_t len = min(p->bv_len - skip, size); |
1120 | |
1121 | size -= len; |
1122 | npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); |
1123 | if (unlikely(npages > maxpages)) |
1124 | return maxpages; |
1125 | } |
1126 | return npages; |
1127 | } |
1128 | |
1129 | int iov_iter_npages(const struct iov_iter *i, int maxpages) |
1130 | { |
1131 | if (unlikely(!i->count)) |
1132 | return 0; |
1133 | if (likely(iter_is_ubuf(i))) { |
1134 | unsigned offs = offset_in_page(i->ubuf + i->iov_offset); |
1135 | int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); |
1136 | return min(npages, maxpages); |
1137 | } |
1138 | /* iovec and kvec have identical layouts */ |
1139 | if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) |
1140 | return iov_npages(i, maxpages); |
1141 | if (iov_iter_is_bvec(i)) |
1142 | return bvec_npages(i, maxpages); |
1143 | if (iov_iter_is_xarray(i)) { |
1144 | unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; |
1145 | int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); |
1146 | return min(npages, maxpages); |
1147 | } |
1148 | return 0; |
1149 | } |
1150 | EXPORT_SYMBOL(iov_iter_npages); |
1151 | |
1152 | const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) |
1153 | { |
1154 | *new = *old; |
1155 | if (iov_iter_is_bvec(new)) |
1156 | return new->bvec = kmemdup(new->bvec, |
1157 | new->nr_segs * sizeof(struct bio_vec), |
1158 | flags); |
1159 | else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) |
1160 | /* iovec and kvec have identical layout */ |
1161 | return new->__iov = kmemdup(new->__iov, |
1162 | new->nr_segs * sizeof(struct iovec), |
1163 | flags); |
1164 | return NULL; |
1165 | } |
1166 | EXPORT_SYMBOL(dup_iter); |
1167 | |
1168 | static __noclone int copy_compat_iovec_from_user(struct iovec *iov, |
1169 | const struct iovec __user *uvec, unsigned long nr_segs) |
1170 | { |
1171 | const struct compat_iovec __user *uiov = |
1172 | (const struct compat_iovec __user *)uvec; |
1173 | int ret = -EFAULT, i; |
1174 | |
1175 | if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) |
1176 | return -EFAULT; |
1177 | |
1178 | for (i = 0; i < nr_segs; i++) { |
1179 | compat_uptr_t buf; |
1180 | compat_ssize_t len; |
1181 | |
1182 | unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); |
1183 | unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); |
1184 | |
1185 | /* check for compat_size_t not fitting in compat_ssize_t .. */ |
1186 | if (len < 0) { |
1187 | ret = -EINVAL; |
1188 | goto uaccess_end; |
1189 | } |
1190 | iov[i].iov_base = compat_ptr(buf); |
1191 | iov[i].iov_len = len; |
1192 | } |
1193 | |
1194 | ret = 0; |
1195 | uaccess_end: |
1196 | user_access_end(); |
1197 | return ret; |
1198 | } |
1199 | |
1200 | static __noclone int copy_iovec_from_user(struct iovec *iov, |
1201 | const struct iovec __user *uiov, unsigned long nr_segs) |
1202 | { |
1203 | int ret = -EFAULT; |
1204 | |
1205 | if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) |
1206 | return -EFAULT; |
1207 | |
1208 | do { |
1209 | void __user *buf; |
1210 | ssize_t len; |
1211 | |
1212 | unsafe_get_user(len, &uiov->iov_len, uaccess_end); |
1213 | unsafe_get_user(buf, &uiov->iov_base, uaccess_end); |
1214 | |
1215 | /* check for size_t not fitting in ssize_t .. */ |
1216 | if (unlikely(len < 0)) { |
1217 | ret = -EINVAL; |
1218 | goto uaccess_end; |
1219 | } |
1220 | iov->iov_base = buf; |
1221 | iov->iov_len = len; |
1222 | |
1223 | uiov++; iov++; |
1224 | } while (--nr_segs); |
1225 | |
1226 | ret = 0; |
1227 | uaccess_end: |
1228 | user_access_end(); |
1229 | return ret; |
1230 | } |
1231 | |
1232 | struct iovec *iovec_from_user(const struct iovec __user *uvec, |
1233 | unsigned long nr_segs, unsigned long fast_segs, |
1234 | struct iovec *fast_iov, bool compat) |
1235 | { |
1236 | struct iovec *iov = fast_iov; |
1237 | int ret; |
1238 | |
1239 | /* |
1240 | * SuS says "The readv() function *may* fail if the iovcnt argument was |
1241 | * less than or equal to 0, or greater than {IOV_MAX}. Linux has |
1242 | * traditionally returned zero for zero segments, so... |
1243 | */ |
1244 | if (nr_segs == 0) |
1245 | return iov; |
1246 | if (nr_segs > UIO_MAXIOV) |
1247 | return ERR_PTR(-EINVAL); |
1248 | if (nr_segs > fast_segs) { |
1249 | iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); |
1250 | if (!iov) |
1251 | return ERR_PTR(-ENOMEM); |
1252 | } |
1253 | |
1254 | if (unlikely(compat)) |
1255 | ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); |
1256 | else |
1257 | ret = copy_iovec_from_user(iov, uvec, nr_segs); |
1258 | if (ret) { |
1259 | if (iov != fast_iov) |
1260 | kfree(iov); |
1261 | return ERR_PTR(ret); |
1262 | } |
1263 | |
1264 | return iov; |
1265 | } |
1266 | |
1267 | /* |
1268 | * Single segment iovec supplied by the user, import it as ITER_UBUF. |
1269 | */ |
1270 | static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, |
1271 | struct iovec **iovp, struct iov_iter *i, |
1272 | bool compat) |
1273 | { |
1274 | struct iovec *iov = *iovp; |
1275 | ssize_t ret; |
1276 | |
1277 | if (compat) |
1278 | ret = copy_compat_iovec_from_user(iov, uvec, 1); |
1279 | else |
1280 | ret = copy_iovec_from_user(iov, uvec, 1); |
1281 | if (unlikely(ret)) |
1282 | return ret; |
1283 | |
1284 | ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); |
1285 | if (unlikely(ret)) |
1286 | return ret; |
1287 | *iovp = NULL; |
1288 | return i->count; |
1289 | } |
1290 | |
1291 | ssize_t __import_iovec(int type, const struct iovec __user *uvec, |
1292 | unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, |
1293 | struct iov_iter *i, bool compat) |
1294 | { |
1295 | ssize_t total_len = 0; |
1296 | unsigned long seg; |
1297 | struct iovec *iov; |
1298 | |
1299 | if (nr_segs == 1) |
1300 | return __import_iovec_ubuf(type, uvec, iovp, i, compat); |
1301 | |
1302 | iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); |
1303 | if (IS_ERR(iov)) { |
1304 | *iovp = NULL; |
1305 | return PTR_ERR(iov); |
1306 | } |
1307 | |
1308 | /* |
1309 | * According to the Single Unix Specification we should return EINVAL if |
1310 | * an element length is < 0 when cast to ssize_t or if the total length |
1311 | * would overflow the ssize_t return value of the system call. |
1312 | * |
1313 | * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the |
1314 | * overflow case. |
1315 | */ |
1316 | for (seg = 0; seg < nr_segs; seg++) { |
1317 | ssize_t len = (ssize_t)iov[seg].iov_len; |
1318 | |
1319 | if (!access_ok(iov[seg].iov_base, len)) { |
1320 | if (iov != *iovp) |
1321 | kfree(iov); |
1322 | *iovp = NULL; |
1323 | return -EFAULT; |
1324 | } |
1325 | |
1326 | if (len > MAX_RW_COUNT - total_len) { |
1327 | len = MAX_RW_COUNT - total_len; |
1328 | iov[seg].iov_len = len; |
1329 | } |
1330 | total_len += len; |
1331 | } |
1332 | |
1333 | iov_iter_init(i, type, iov, nr_segs, total_len); |
1334 | if (iov == *iovp) |
1335 | *iovp = NULL; |
1336 | else |
1337 | *iovp = iov; |
1338 | return total_len; |
1339 | } |
1340 | |
1341 | /** |
1342 | * import_iovec() - Copy an array of &struct iovec from userspace |
1343 | * into the kernel, check that it is valid, and initialize a new |
1344 | * &struct iov_iter iterator to access it. |
1345 | * |
1346 | * @type: One of %READ or %WRITE. |
1347 | * @uvec: Pointer to the userspace array. |
1348 | * @nr_segs: Number of elements in userspace array. |
1349 | * @fast_segs: Number of elements in @iov. |
1350 | * @iovp: (input and output parameter) Pointer to pointer to (usually small |
1351 | * on-stack) kernel array. |
1352 | * @i: Pointer to iterator that will be initialized on success. |
1353 | * |
1354 | * If the array pointed to by *@iov is large enough to hold all @nr_segs, |
1355 | * then this function places %NULL in *@iov on return. Otherwise, a new |
1356 | * array will be allocated and the result placed in *@iov. This means that |
1357 | * the caller may call kfree() on *@iov regardless of whether the small |
1358 | * on-stack array was used or not (and regardless of whether this function |
1359 | * returns an error or not). |
1360 | * |
1361 | * Return: Negative error code on error, bytes imported on success |
1362 | */ |
1363 | ssize_t import_iovec(int type, const struct iovec __user *uvec, |
1364 | unsigned nr_segs, unsigned fast_segs, |
1365 | struct iovec **iovp, struct iov_iter *i) |
1366 | { |
1367 | return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, |
1368 | in_compat_syscall()); |
1369 | } |
1370 | EXPORT_SYMBOL(import_iovec); |
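/*
 * Usage sketch (illustrative only, not part of this file): a readv()-style
 * handler imports the user iovec array, does its I/O through the iterator,
 * and frees whatever import_iovec() may have allocated.  do_the_read() is a
 * hypothetical helper.
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(ITER_DEST, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_read(&iter);
 *	kfree(iov);		// NULL if the on-stack array was used, so safe
 *	return ret;
 */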
1371 | |
1372 | int import_single_range(int rw, void __user *buf, size_t len, |
1373 | struct iovec *iov, struct iov_iter *i) |
1374 | { |
1375 | if (len > MAX_RW_COUNT) |
1376 | len = MAX_RW_COUNT; |
1377 | if (unlikely(!access_ok(buf, len))) |
1378 | return -EFAULT; |
1379 | |
1380 | iov_iter_ubuf(i, rw, buf, len); |
1381 | return 0; |
1382 | } |
1383 | EXPORT_SYMBOL(import_single_range); |
1384 | |
1385 | int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) |
1386 | { |
1387 | if (len > MAX_RW_COUNT) |
1388 | len = MAX_RW_COUNT; |
1389 | if (unlikely(!access_ok(buf, len))) |
1390 | return -EFAULT; |
1391 | |
1392 | iov_iter_ubuf(i, rw, buf, len); |
1393 | return 0; |
1394 | } |
1395 | EXPORT_SYMBOL_GPL(import_ubuf); |
1396 | |
1397 | /** |
1398 | * iov_iter_restore() - Restore a &struct iov_iter to the same state as when |
1399 | * iov_iter_save_state() was called. |
1400 | * |
1401 | * @i: &struct iov_iter to restore |
1402 | * @state: state to restore from |
1403 | * |
1404 | * Used after iov_iter_save_state() to restore @i, if operations may |
1405 | * have advanced it. |
1406 | * |
1407 | * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC |
1408 | */ |
1409 | void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) |
1410 | { |
1411 | if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && |
1412 | !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) |
1413 | return; |
1414 | i->iov_offset = state->iov_offset; |
1415 | i->count = state->count; |
1416 | if (iter_is_ubuf(i)) |
1417 | return; |
1418 | /* |
1419 | * For the *vec iters, nr_segs + iov is constant - if we increment |
1420 | * the vec, then we also decrement the nr_segs count. Hence we don't |
1421 | * need to track both of these, just one is enough and we can deduct |
1422 | * the other from that. ITER_KVEC and ITER_IOVEC are the same struct |
1423 | * size, so we can just increment the iov pointer as they are unionized. |
1424 | * ITER_BVEC _may_ be the same size on some archs, but on others it is |
1425 | * not. Be safe and handle it separately. |
1426 | */ |
1427 | BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); |
1428 | if (iov_iter_is_bvec(i)) |
1429 | i->bvec -= state->nr_segs - i->nr_segs; |
1430 | else |
1431 | i->__iov -= state->nr_segs - i->nr_segs; |
1432 | i->nr_segs = state->nr_segs; |
1433 | } |
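/*
 * Usage sketch (illustrative only, not part of this file): callers that may
 * have to retry an operation snapshot the iterator first and roll it back on
 * failure.  try_the_copy() is a hypothetical helper that advances 'iter'.
 *
 *	struct iov_iter_state state;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = try_the_copy(iter);
 *	if (ret == -EAGAIN)
 *		iov_iter_restore(iter, &state);
 */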
1434 | |
1435 | /* |
1436 | * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not |
1437 | * get references on the pages, nor does it get a pin on them. |
1438 | */ |
1439 | static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, |
1440 | struct page ***pages, size_t maxsize, |
1441 | unsigned int maxpages, |
1442 | iov_iter_extraction_t extraction_flags, |
1443 | size_t *offset0) |
1444 | { |
1445 | struct page *page, **p; |
1446 | unsigned int nr = 0, offset; |
1447 | loff_t pos = i->xarray_start + i->iov_offset; |
1448 | pgoff_t index = pos >> PAGE_SHIFT; |
1449 | XA_STATE(xas, i->xarray, index); |
1450 | |
1451 | offset = pos & ~PAGE_MASK; |
1452 | *offset0 = offset; |
1453 | |
1454 | maxpages = want_pages_array(pages, maxsize, offset, maxpages); |
1455 | if (!maxpages) |
1456 | return -ENOMEM; |
1457 | p = *pages; |
1458 | |
1459 | rcu_read_lock(); |
1460 | for (page = xas_load(&xas); page; page = xas_next(&xas)) { |
1461 | if (xas_retry(&xas, page)) |
1462 | continue; |
1463 | |
1464 | /* Has the page moved or been split? */ |
1465 | if (unlikely(page != xas_reload(&xas))) { |
1466 | xas_reset(&xas); |
1467 | continue; |
1468 | } |
1469 | |
1470 | p[nr++] = find_subpage(page, xas.xa_index); |
1471 | if (nr == maxpages) |
1472 | break; |
1473 | } |
1474 | rcu_read_unlock(); |
1475 | |
1476 | maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); |
1477 | iov_iter_advance(i, maxsize); |
1478 | return maxsize; |
1479 | } |
1480 | |
1481 | /* |
1482 | * Extract a list of contiguous pages from an ITER_BVEC iterator. This does |
1483 | * not get references on the pages, nor does it get a pin on them. |
1484 | */ |
1485 | static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, |
1486 | struct page ***pages, size_t maxsize, |
1487 | unsigned int maxpages, |
1488 | iov_iter_extraction_t extraction_flags, |
1489 | size_t *offset0) |
1490 | { |
1491 | struct page **p, *page; |
1492 | size_t skip = i->iov_offset, offset, size; |
1493 | int k; |
1494 | |
1495 | for (;;) { |
1496 | if (i->nr_segs == 0) |
1497 | return 0; |
1498 | size = min(maxsize, i->bvec->bv_len - skip); |
1499 | if (size) |
1500 | break; |
1501 | i->iov_offset = 0; |
1502 | i->nr_segs--; |
1503 | i->bvec++; |
1504 | skip = 0; |
1505 | } |
1506 | |
1507 | skip += i->bvec->bv_offset; |
1508 | page = i->bvec->bv_page + skip / PAGE_SIZE; |
1509 | offset = skip % PAGE_SIZE; |
1510 | *offset0 = offset; |
1511 | |
1512 | maxpages = want_pages_array(pages, size, offset, maxpages); |
1513 | if (!maxpages) |
1514 | return -ENOMEM; |
1515 | p = *pages; |
1516 | for (k = 0; k < maxpages; k++) |
1517 | p[k] = page + k; |
1518 | |
1519 | size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); |
1520 | iov_iter_advance(i, size); |
1521 | return size; |
1522 | } |
1523 | |
1524 | /* |
1525 | * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. |
1526 | * This does not get references on the pages, nor does it get a pin on them. |
1527 | */ |
1528 | static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, |
1529 | struct page ***pages, size_t maxsize, |
1530 | unsigned int maxpages, |
1531 | iov_iter_extraction_t extraction_flags, |
1532 | size_t *offset0) |
1533 | { |
1534 | struct page **p, *page; |
1535 | const void *kaddr; |
1536 | size_t skip = i->iov_offset, offset, len, size; |
1537 | int k; |
1538 | |
1539 | for (;;) { |
1540 | if (i->nr_segs == 0) |
1541 | return 0; |
1542 | size = min(maxsize, i->kvec->iov_len - skip); |
1543 | if (size) |
1544 | break; |
1545 | i->iov_offset = 0; |
1546 | i->nr_segs--; |
1547 | i->kvec++; |
1548 | skip = 0; |
1549 | } |
1550 | |
1551 | kaddr = i->kvec->iov_base + skip; |
1552 | offset = (unsigned long)kaddr & ~PAGE_MASK; |
1553 | *offset0 = offset; |
1554 | |
1555 | maxpages = want_pages_array(pages, size, offset, maxpages); |
1556 | if (!maxpages) |
1557 | return -ENOMEM; |
1558 | p = *pages; |
1559 | |
1560 | kaddr -= offset; |
1561 | len = offset + size; |
1562 | for (k = 0; k < maxpages; k++) { |
1563 | size_t seg = min_t(size_t, len, PAGE_SIZE); |
1564 | |
1565 | if (is_vmalloc_or_module_addr(kaddr)) |
1566 | page = vmalloc_to_page(kaddr); |
1567 | else |
1568 | page = virt_to_page(kaddr); |
1569 | |
1570 | p[k] = page; |
1571 | len -= seg; |
1572 | kaddr += PAGE_SIZE; |
1573 | } |
1574 | |
1575 | size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); |
1576 | iov_iter_advance(i, size); |
1577 | return size; |
1578 | } |
1579 | |
1580 | /* |
1581 | * Extract a list of contiguous pages from a user iterator and get a pin on |
1582 | * each of them. This should only be used if the iterator is user-backed |
1583 | * (IOBUF/UBUF). |
1584 | * |
1585 | * It does not get refs on the pages, but the pages must be unpinned by the |
1586 | * caller once the transfer is complete. |
1587 | * |
1588 | * This is safe to be used where background IO/DMA *is* going to be modifying |
1589 | * the buffer; using a pin rather than a ref forces fork() to give the |
1590 | * child a copy of the page. |
1591 | */ |
1592 | static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, |
1593 | struct page ***pages, |
1594 | size_t maxsize, |
1595 | unsigned int maxpages, |
1596 | iov_iter_extraction_t extraction_flags, |
1597 | size_t *offset0) |
1598 | { |
1599 | unsigned long addr; |
1600 | unsigned int gup_flags = 0; |
1601 | size_t offset; |
1602 | int res; |
1603 | |
1604 | if (i->data_source == ITER_DEST) |
1605 | gup_flags |= FOLL_WRITE; |
1606 | if (extraction_flags & ITER_ALLOW_P2PDMA) |
1607 | gup_flags |= FOLL_PCI_P2PDMA; |
1608 | if (i->nofault) |
1609 | gup_flags |= FOLL_NOFAULT; |
1610 | |
1611 | addr = first_iovec_segment(i, &maxsize); |
1612 | *offset0 = offset = addr % PAGE_SIZE; |
1613 | addr &= PAGE_MASK; |
1614 | maxpages = want_pages_array(pages, maxsize, offset, maxpages); |
1615 | if (!maxpages) |
1616 | return -ENOMEM; |
1617 | res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); |
1618 | if (unlikely(res <= 0)) |
1619 | return res; |
1620 | maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); |
1621 | iov_iter_advance(i, maxsize); |
1622 | return maxsize; |
1623 | } |
1624 | |
1625 | /** |
1626 | * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator |
1627 | * @i: The iterator to extract from |
1628 | * @pages: Where to return the list of pages |
1629 | * @maxsize: The maximum amount of iterator to extract |
1630 | * @maxpages: The maximum size of the list of pages |
1631 | * @extraction_flags: Flags to qualify request |
1632 | * @offset0: Where to return the starting offset into (*@pages)[0] |
1633 | * |
1634 | * Extract a list of contiguous pages from the current point of the iterator, |
1635 | * advancing the iterator. The maximum number of pages and the maximum amount |
1636 | * of page contents can be set. |
1637 | * |
1638 | * If *@pages is NULL, a page list will be allocated to the required size and |
1639 | * *@pages will be set to its base. If *@pages is not NULL, it will be assumed |
1640 | * that the caller allocated a page list at least @maxpages in size and this |
1641 | * will be filled in. |
1642 | * |
1643 | * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA |
1644 | * be allowed on the pages extracted. |
1645 | * |
1646 | * The iov_iter_extract_will_pin() function can be used to query how cleanup |
1647 | * should be performed. |
1648 | * |
1649 | * Extra refs or pins on the pages may be obtained as follows: |
1650 | * |
1651 | * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be |
1652 | * added to the pages, but refs will not be taken. |
1653 | * iov_iter_extract_will_pin() will return true. |
1654 | * |
1655 | * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are |
1656 | * merely listed; no extra refs or pins are obtained. |
1657 | * iov_iter_extract_will_pin() will return false. |
1658 | * |
1659 | * Note also: |
1660 | * |
1661 | * (*) Use with ITER_DISCARD is not supported as that has no content. |
1662 | * |
1663 | * On success, the function sets *@pages to the new pagelist, if allocated, and |
1664 | * sets *offset0 to the offset into the first page. |
1665 | * |
1666 | * It may also return -ENOMEM and -EFAULT. |
1667 | */ |
1668 | ssize_t iov_iter_extract_pages(struct iov_iter *i, |
1669 | struct page ***pages, |
1670 | size_t maxsize, |
1671 | unsigned int maxpages, |
1672 | iov_iter_extraction_t extraction_flags, |
1673 | size_t *offset0) |
1674 | { |
1675 | maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); |
1676 | if (!maxsize) |
1677 | return 0; |
1678 | |
1679 | if (likely(user_backed_iter(i))) |
1680 | return iov_iter_extract_user_pages(i, pages, maxsize, |
1681 | maxpages, extraction_flags, |
1682 | offset0); |
1683 | if (iov_iter_is_kvec(i)) |
1684 | return iov_iter_extract_kvec_pages(i, pages, maxsize, |
1685 | maxpages, extraction_flags, |
1686 | offset0); |
1687 | if (iov_iter_is_bvec(i)) |
1688 | return iov_iter_extract_bvec_pages(i, pages, maxsize, |
1689 | maxpages, extraction_flags, |
1690 | offset0); |
1691 | if (iov_iter_is_xarray(i)) |
1692 | return iov_iter_extract_xarray_pages(i, pages, maxsize, |
1693 | maxpages, extraction_flags, |
1694 | offset0); |
1695 | return -EFAULT; |
1696 | } |
1697 | EXPORT_SYMBOL_GPL(iov_iter_extract_pages); |
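/*
 * Usage sketch (illustrative only, not part of this file): extracting pages
 * for DMA and cleaning up according to iov_iter_extract_will_pin():
 *
 *	struct page **pages = NULL;
 *	size_t off;
 *	bool pinned = iov_iter_extract_will_pin(iter);
 *	ssize_t n = iov_iter_extract_pages(iter, &pages, len, maxpages, 0, &off);
 *
 *	if (n > 0) {
 *		// ...map pages[] for I/O; the data starts at offset 'off'...
 *		if (pinned)
 *			unpin_user_pages(pages, DIV_ROUND_UP(off + n, PAGE_SIZE));
 *		kvfree(pages);
 *	}
 */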
1698 | |