// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2003 Sistina Software
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"

#include <linux/device-mapper.h>

#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/dm-io.h>

#define DM_MSG_PREFIX "io"

#define DM_IO_MAX_REGIONS	BITS_PER_LONG

struct dm_io_client {
	mempool_t pool;
	struct bio_set bios;
};

/*
 * Aligning 'struct io' reduces the number of bits required to store
 * its address.  Refer to store_io_and_region_in_bio() below.
 */
struct io {
	unsigned long error_bits;
	atomic_t count;
	struct dm_io_client *client;
	io_notify_fn callback;
	void *context;
	void *vma_invalidate_address;
	unsigned long vma_invalidate_size;
} __aligned(DM_IO_MAX_REGIONS);

static struct kmem_cache *_dm_io_cache;

/*
 * Create a client with mempool and bioset.
 */
struct dm_io_client *dm_io_client_create(void)
{
	struct dm_io_client *client;
	unsigned int min_ios = dm_get_reserved_bio_based_ios();
	int ret;

	client = kzalloc(sizeof(*client), GFP_KERNEL);
	if (!client)
		return ERR_PTR(-ENOMEM);

	ret = mempool_init_slab_pool(&client->pool, min_ios, _dm_io_cache);
	if (ret)
		goto bad;

	ret = bioset_init(&client->bios, min_ios, 0, BIOSET_NEED_BVECS);
	if (ret)
		goto bad;

	return client;

bad:
	mempool_exit(&client->pool);
	kfree(client);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(dm_io_client_create);

void dm_io_client_destroy(struct dm_io_client *client)
{
	mempool_exit(&client->pool);
	bioset_exit(&client->bios);
	kfree(client);
}
EXPORT_SYMBOL(dm_io_client_destroy);
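
/*
 * Example (a sketch, not taken from any particular target): a target would
 * typically create one client in its constructor and destroy it in its
 * destructor; 'ioc' is a hypothetical local variable.
 *
 *	struct dm_io_client *ioc = dm_io_client_create();
 *
 *	if (IS_ERR(ioc))
 *		return PTR_ERR(ioc);
 *	...
 *	dm_io_client_destroy(ioc);
 */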

/*
 *-------------------------------------------------------------------
 * We need to keep track of which region a bio is doing io for.
 * To avoid a memory allocation to store just 5 or 6 bits, we
 * ensure the 'struct io' pointer is aligned so enough low bits are
 * always zero and then combine it with the region number directly in
 * bi_private.
 *-------------------------------------------------------------------
 */
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
				       unsigned int region)
{
	if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
		DMCRIT("Unaligned struct io pointer %p", io);
		BUG();
	}

	bio->bi_private = (void *)((unsigned long)io | region);
}

static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
					    unsigned int *region)
{
	unsigned long val = (unsigned long)bio->bi_private;

	*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
	*region = val & (DM_IO_MAX_REGIONS - 1);
}
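
/*
 * Example: with BITS_PER_LONG == 64, DM_IO_MAX_REGIONS is 64 and 'struct io'
 * is 64-byte aligned, so the low six bits of its address are always zero and
 * can carry a region number in the range 0..63.  store_io_and_region_in_bio()
 * ORs the region into those bits; retrieve_io_and_region_from_bio() masks the
 * two values apart again.
 */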

/*
 *--------------------------------------------------------------
 * We need an io object to keep track of the number of bios that
 * have been dispatched for a particular io.
 *--------------------------------------------------------------
 */
static void complete_io(struct io *io)
{
	unsigned long error_bits = io->error_bits;
	io_notify_fn fn = io->callback;
	void *context = io->context;

	if (io->vma_invalidate_size)
		invalidate_kernel_vmap_range(io->vma_invalidate_address,
					     io->vma_invalidate_size);

	mempool_free(io, &io->client->pool);
	fn(error_bits, context);
}

static void dec_count(struct io *io, unsigned int region, blk_status_t error)
{
	if (error)
		set_bit(region, &io->error_bits);

	if (atomic_dec_and_test(&io->count))
		complete_io(io);
}

static void endio(struct bio *bio)
{
	struct io *io;
	unsigned int region;
	blk_status_t error;

	if (bio->bi_status && bio_data_dir(bio) == READ)
		zero_fill_bio(bio);

	/*
	 * The bio destructor in bio_put() may use the io object.
	 */
	retrieve_io_and_region_from_bio(bio, &io, &region);

	error = bio->bi_status;
	bio_put(bio);

	dec_count(io, region, error);
}

/*
 *--------------------------------------------------------------
 * These little objects provide an abstraction for getting a new
 * destination page for io.
 *--------------------------------------------------------------
 */
struct dpages {
	void (*get_page)(struct dpages *dp,
			 struct page **p, unsigned long *len, unsigned int *offset);
	void (*next_page)(struct dpages *dp);

	union {
		unsigned int context_u;
		struct bvec_iter context_bi;
	};
	void *context_ptr;

	void *vma_invalidate_address;
	unsigned long vma_invalidate_size;
};
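
/*
 * A consumer walks the destination pages with repeated get_page()/next_page()
 * calls, roughly as do_region() below does:
 *
 *	dp->get_page(dp, &page, &len, &offset);
 *	(use up to 'len' bytes of 'page' starting at 'offset')
 *	dp->next_page(dp);
 */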

/*
 * Functions for getting the pages from a list.
 */
static void list_get_page(struct dpages *dp,
			  struct page **p, unsigned long *len, unsigned int *offset)
{
	unsigned int o = dp->context_u;
	struct page_list *pl = dp->context_ptr;

	*p = pl->page;
	*len = PAGE_SIZE - o;
	*offset = o;
}

static void list_next_page(struct dpages *dp)
{
	struct page_list *pl = dp->context_ptr;

	dp->context_ptr = pl->next;
	dp->context_u = 0;
}

static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned int offset)
{
	dp->get_page = list_get_page;
	dp->next_page = list_next_page;
	dp->context_u = offset;
	dp->context_ptr = pl;
}

/*
 * Functions for getting the pages from a bvec.
 */
static void bio_get_page(struct dpages *dp, struct page **p,
			 unsigned long *len, unsigned int *offset)
{
	struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr,
					     dp->context_bi);

	*p = bvec.bv_page;
	*len = bvec.bv_len;
	*offset = bvec.bv_offset;

	/* avoid figuring it out again in bio_next_page() */
	dp->context_bi.bi_sector = (sector_t)bvec.bv_len;
}

static void bio_next_page(struct dpages *dp)
{
	unsigned int len = (unsigned int)dp->context_bi.bi_sector;

	bvec_iter_advance((struct bio_vec *)dp->context_ptr,
			  &dp->context_bi, len);
}

static void bio_dp_init(struct dpages *dp, struct bio *bio)
{
	dp->get_page = bio_get_page;
	dp->next_page = bio_next_page;

	/*
	 * We just use the bvec iterator to retrieve pages, so it is ok to
	 * access the bvec table directly here.
	 */
	dp->context_ptr = bio->bi_io_vec;
	dp->context_bi = bio->bi_iter;
}

/*
 * Functions for getting the pages from a VMA.
 */
static void vm_get_page(struct dpages *dp,
			struct page **p, unsigned long *len, unsigned int *offset)
{
	*p = vmalloc_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void vm_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void vm_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = vm_get_page;
	dp->next_page = vm_next_page;
	dp->context_u = offset_in_page(data);
	dp->context_ptr = data;
}

/*
 * Functions for getting the pages from kernel memory.
 */
static void km_get_page(struct dpages *dp, struct page **p, unsigned long *len,
			unsigned int *offset)
{
	*p = virt_to_page(dp->context_ptr);
	*offset = dp->context_u;
	*len = PAGE_SIZE - dp->context_u;
}

static void km_next_page(struct dpages *dp)
{
	dp->context_ptr += PAGE_SIZE - dp->context_u;
	dp->context_u = 0;
}

static void km_dp_init(struct dpages *dp, void *data)
{
	dp->get_page = km_get_page;
	dp->next_page = km_next_page;
	dp->context_u = offset_in_page(data);
	dp->context_ptr = data;
}
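
/*
 * Note: km_get_page() uses virt_to_page(), so DM_IO_KMEM is only suitable
 * for directly mapped, physically contiguous buffers (e.g. kmalloc'd
 * memory).  vmalloc'd buffers must use DM_IO_VMA instead, which also gets
 * the cache flush/invalidate handling in dp_init() and complete_io().
 */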

/*
 *---------------------------------------------------------------
 * IO routines that accept a list of pages.
 *---------------------------------------------------------------
 */
static void do_region(const blk_opf_t opf, unsigned int region,
		      struct dm_io_region *where, struct dpages *dp,
		      struct io *io)
{
	struct bio *bio;
	struct page *page;
	unsigned long len;
	unsigned int offset;
	unsigned int num_bvecs;
	sector_t remaining = where->count;
	struct request_queue *q = bdev_get_queue(where->bdev);
	sector_t num_sectors;
	unsigned int special_cmd_max_sectors;
	const enum req_op op = opf & REQ_OP_MASK;

	/*
	 * Reject unsupported discard and write zeroes requests.
	 */
	if (op == REQ_OP_DISCARD)
		special_cmd_max_sectors = bdev_max_discard_sectors(where->bdev);
	else if (op == REQ_OP_WRITE_ZEROES)
		special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
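	/*
	 * If the op is unsupported, record the error bit for this region with
	 * a paired atomic_inc()/dec_count(); the io cannot complete here
	 * because the initial reference taken in sync_io()/async_io() is not
	 * dropped until the end of dispatch_io().
	 */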
	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) &&
	    special_cmd_max_sectors == 0) {
		atomic_inc(&io->count);
		dec_count(io, region, BLK_STS_NOTSUPP);
		return;
	}

	/*
	 * where->count may be zero if opf holds a flush and we need to
	 * send a zero-sized flush.
	 */
	do {
		/*
		 * Allocate a suitably sized bio.
		 */
		switch (op) {
		case REQ_OP_DISCARD:
		case REQ_OP_WRITE_ZEROES:
			num_bvecs = 0;
			break;
		default:
			num_bvecs = bio_max_segs(dm_sector_div_up(remaining,
						(PAGE_SIZE >> SECTOR_SHIFT)));
		}

		bio = bio_alloc_bioset(where->bdev, num_bvecs, opf, GFP_NOIO,
				       &io->client->bios);
		bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
		bio->bi_end_io = endio;
		store_io_and_region_in_bio(bio, io, region);

		if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
			remaining -= num_sectors;
		} else {
			while (remaining) {
				/*
				 * Try and add as many pages as possible.
				 */
				dp->get_page(dp, &page, &len, &offset);
				len = min(len, to_bytes(remaining));
				if (!bio_add_page(bio, page, len, offset))
					break;

				offset = 0;
				remaining -= to_sector(len);
				dp->next_page(dp);
			}
		}

		atomic_inc(&io->count);
		submit_bio(bio);
	} while (remaining);
}

static void dispatch_io(blk_opf_t opf, unsigned int num_regions,
			struct dm_io_region *where, struct dpages *dp,
			struct io *io, int sync)
{
	int i;
	struct dpages old_pages = *dp;

	BUG_ON(num_regions > DM_IO_MAX_REGIONS);

	if (sync)
		opf |= REQ_SYNC;

	/*
	 * For multiple regions we need to be careful to rewind
	 * the dp object for each call to do_region.
	 */
	for (i = 0; i < num_regions; i++) {
		*dp = old_pages;
		if (where[i].count || (opf & REQ_PREFLUSH))
			do_region(opf, i, where + i, dp, io);
	}

	/*
	 * Drop the extra reference that we were holding to avoid
	 * the io being completed too early.
	 */
	dec_count(io, 0, 0);
}

struct sync_io {
	unsigned long error_bits;
	struct completion wait;
};

static void sync_io_complete(unsigned long error, void *context)
{
	struct sync_io *sio = context;

	sio->error_bits = error;
	complete(&sio->wait);
}

static int sync_io(struct dm_io_client *client, unsigned int num_regions,
		   struct dm_io_region *where, blk_opf_t opf, struct dpages *dp,
		   unsigned long *error_bits)
{
	struct io *io;
	struct sync_io sio;

	if (num_regions > 1 && !op_is_write(opf)) {
		WARN_ON(1);
		return -EIO;
	}

	init_completion(&sio.wait);

	io = mempool_alloc(&client->pool, GFP_NOIO);
	io->error_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->client = client;
	io->callback = sync_io_complete;
	io->context = &sio;

	io->vma_invalidate_address = dp->vma_invalidate_address;
	io->vma_invalidate_size = dp->vma_invalidate_size;

	dispatch_io(opf, num_regions, where, dp, io, 1);

	wait_for_completion_io(&sio.wait);

	if (error_bits)
		*error_bits = sio.error_bits;

	return sio.error_bits ? -EIO : 0;
}

static int async_io(struct dm_io_client *client, unsigned int num_regions,
		    struct dm_io_region *where, blk_opf_t opf,
		    struct dpages *dp, io_notify_fn fn, void *context)
{
	struct io *io;

	if (num_regions > 1 && !op_is_write(opf)) {
		WARN_ON(1);
		fn(1, context);
		return -EIO;
	}

	io = mempool_alloc(&client->pool, GFP_NOIO);
	io->error_bits = 0;
	atomic_set(&io->count, 1); /* see dispatch_io() */
	io->client = client;
	io->callback = fn;
	io->context = context;

	io->vma_invalidate_address = dp->vma_invalidate_address;
	io->vma_invalidate_size = dp->vma_invalidate_size;

	dispatch_io(opf, num_regions, where, dp, io, 0);
	return 0;
}

static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
		   unsigned long size)
{
	/* Set up dpages based on memory type */

	dp->vma_invalidate_address = NULL;
	dp->vma_invalidate_size = 0;

	switch (io_req->mem.type) {
	case DM_IO_PAGE_LIST:
		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
		break;

	case DM_IO_BIO:
		bio_dp_init(dp, io_req->mem.ptr.bio);
		break;

	case DM_IO_VMA:
		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
		if ((io_req->bi_opf & REQ_OP_MASK) == REQ_OP_READ) {
			dp->vma_invalidate_address = io_req->mem.ptr.vma;
			dp->vma_invalidate_size = size;
		}
		vm_dp_init(dp, io_req->mem.ptr.vma);
		break;

	case DM_IO_KMEM:
		km_dp_init(dp, io_req->mem.ptr.addr);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

int dm_io(struct dm_io_request *io_req, unsigned int num_regions,
	  struct dm_io_region *where, unsigned long *sync_error_bits)
{
	int r;
	struct dpages dp;

	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
	if (r)
		return r;

	if (!io_req->notify.fn)
		return sync_io(io_req->client, num_regions, where,
			       io_req->bi_opf, &dp, sync_error_bits);

	return async_io(io_req->client, num_regions, where,
			io_req->bi_opf, &dp, io_req->notify.fn,
			io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
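
/*
 * Example (a sketch, not taken from any real caller): synchronously read
 * eight sectors into a kernel buffer.  'bdev', 'buf' and 'ioc' stand in for
 * the caller's block device, kmalloc'd buffer and dm_io_client.
 *
 *	struct dm_io_region where = {
 *		.bdev = bdev,
 *		.sector = 0,
 *		.count = 8,
 *	};
 *	struct dm_io_request req = {
 *		.bi_opf = REQ_OP_READ,
 *		.mem.type = DM_IO_KMEM,
 *		.mem.ptr.addr = buf,
 *		.notify.fn = NULL,	(NULL selects the synchronous path)
 *		.client = ioc,
 *	};
 *	unsigned long error_bits;
 *	int r = dm_io(&req, 1, &where, &error_bits);
 */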

int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);
	if (!_dm_io_cache)
		return -ENOMEM;

	return 0;
}

void dm_io_exit(void)
{
	kmem_cache_destroy(_dm_io_cache);
	_dm_io_cache = NULL;
}