1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2014 Facebook. All rights reserved. |
4 | * |
5 | * This file is released under the GPL. |
6 | */ |
7 | |
8 | #include <linux/device-mapper.h> |
9 | |
10 | #include <linux/module.h> |
11 | #include <linux/init.h> |
12 | #include <linux/blkdev.h> |
13 | #include <linux/bio.h> |
14 | #include <linux/dax.h> |
15 | #include <linux/slab.h> |
16 | #include <linux/kthread.h> |
17 | #include <linux/freezer.h> |
18 | #include <linux/uio.h> |
19 | |
20 | #define DM_MSG_PREFIX "log-writes" |
21 | |
22 | /* |
23 | * This target will sequentially log all writes to the target device onto the |
24 | * log device. This is helpful for replaying writes to check for fs consistency |
25 | * at all times. This target provides a mechanism to mark specific events to |
26 | * check data at a later time. So for example you would: |
27 | * |
28 | * write data |
29 | * fsync |
30 | * dmsetup message /dev/whatever mark mymark |
31 | * unmount /mnt/test |
32 | * |
33 | * Then replay the log up to mymark and check the contents of the replay to |
34 | * verify it matches what was written. |
35 | * |
 * We log writes only after they have been flushed, which makes the log
 * describe close to the order in which the data hits the actual disk, not
 * its cache. So, for example, the following sequence (W means write, C means
 * complete)
39 | * |
40 | * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd |
41 | * |
42 | * Would result in the log looking like this: |
43 | * |
44 | * c,a,b,flush,fuad,<other writes>,<next flush> |
45 | * |
 * This is meant to help expose problems where file systems do not properly
 * wait on data being written before invoking a FLUSH. FUA bypasses the
 * cache, so once it completes it is added to the log as it should be on
 * disk.
49 | * |
50 | * We treat DISCARDs as if they don't bypass cache so that they are logged in |
51 | * order of completion along with the normal writes. If we didn't do it this |
52 | * way we would process all the discards first and then write all the data, when |
53 | * in fact we want to do the data and the discard in the order that they |
54 | * completed. |
55 | */ |
56 | #define LOG_FLUSH_FLAG (1 << 0) |
57 | #define LOG_FUA_FLAG (1 << 1) |
58 | #define LOG_DISCARD_FLAG (1 << 2) |
59 | #define LOG_MARK_FLAG (1 << 3) |
60 | #define LOG_METADATA_FLAG (1 << 4) |
61 | |
62 | #define WRITE_LOG_VERSION 1ULL |
63 | #define WRITE_LOG_MAGIC 0x6a736677736872ULL |
64 | #define WRITE_LOG_SUPER_SECTOR 0 |
65 | |
66 | /* |
67 | * The disk format for this is braindead simple. |
68 | * |
69 | * At byte 0 we have our super, followed by the following sequence for |
70 | * nr_entries: |
71 | * |
72 | * [ 1 sector ][ entry->nr_sectors ] |
73 | * [log_write_entry][ data written ] |
74 | * |
75 | * The log_write_entry takes up a full sector so we can have arbitrary length |
76 | * marks and it leaves us room for extra content in the future. |
77 | */ |
78 | |
79 | /* |
80 | * Basic info about the log for userspace. |
81 | */ |
82 | struct log_write_super { |
83 | __le64 magic; |
84 | __le64 version; |
85 | __le64 nr_entries; |
86 | __le32 sectorsize; |
87 | }; |
88 | |
89 | /* |
90 | * sector - the sector we wrote. |
91 | * nr_sectors - the number of sectors we wrote. |
92 | * flags - flags for this log entry. |
 * data_len - the size of the data in this log entry; this is for private log
 *            entry payloads, for example the MARK data provided by userspace.
95 | */ |
96 | struct log_write_entry { |
97 | __le64 sector; |
98 | __le64 nr_sectors; |
99 | __le64 flags; |
100 | __le64 data_len; |
101 | }; |
102 | |
103 | struct log_writes_c { |
104 | struct dm_dev *dev; |
105 | struct dm_dev *logdev; |
106 | u64 logged_entries; |
107 | u32 sectorsize; |
108 | u32 sectorshift; |
109 | atomic_t io_blocks; |
110 | atomic_t pending_blocks; |
111 | sector_t next_sector; |
112 | sector_t end_sector; |
113 | bool logging_enabled; |
114 | bool device_supports_discard; |
115 | spinlock_t blocks_lock; |
116 | struct list_head unflushed_blocks; |
117 | struct list_head logging_blocks; |
118 | wait_queue_head_t wait; |
119 | struct task_struct *log_kthread; |
120 | struct completion super_done; |
121 | }; |
122 | |
123 | struct pending_block { |
124 | int vec_cnt; |
125 | u64 flags; |
126 | sector_t sector; |
127 | sector_t nr_sectors; |
128 | char *data; |
129 | u32 datalen; |
130 | struct list_head list; |
131 | struct bio_vec vecs[]; |
132 | }; |
133 | |
134 | struct per_bio_data { |
135 | struct pending_block *block; |
136 | }; |
137 | |
138 | static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc, |
139 | sector_t sectors) |
140 | { |
141 | return sectors >> (lc->sectorshift - SECTOR_SHIFT); |
142 | } |
143 | |
144 | static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc, |
145 | sector_t sectors) |
146 | { |
147 | return sectors << (lc->sectorshift - SECTOR_SHIFT); |
148 | } |
149 | |
150 | static void put_pending_block(struct log_writes_c *lc) |
151 | { |
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
156 | } |
157 | } |
158 | |
159 | static void put_io_block(struct log_writes_c *lc) |
160 | { |
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
165 | } |
166 | } |
167 | |
168 | static void log_end_io(struct bio *bio) |
169 | { |
170 | struct log_writes_c *lc = bio->bi_private; |
171 | |
172 | if (bio->bi_status) { |
173 | unsigned long flags; |
174 | |
		DMERR("Error writing log block, error=%d", bio->bi_status);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
179 | } |
180 | |
181 | bio_free_pages(bio); |
182 | put_io_block(lc); |
183 | bio_put(bio); |
184 | } |
185 | |
186 | static void log_end_super(struct bio *bio) |
187 | { |
188 | struct log_writes_c *lc = bio->bi_private; |
189 | |
190 | complete(&lc->super_done); |
191 | log_end_io(bio); |
192 | } |
193 | |
194 | /* |
195 | * Meant to be called if there is an error, it will free all the pages |
196 | * associated with the block. |
197 | */ |
198 | static void free_pending_block(struct log_writes_c *lc, |
199 | struct pending_block *block) |
200 | { |
201 | int i; |
202 | |
203 | for (i = 0; i < block->vec_cnt; i++) { |
204 | if (block->vecs[i].bv_page) |
205 | __free_page(block->vecs[i].bv_page); |
206 | } |
	kfree(block->data);
	kfree(block);
209 | put_pending_block(lc); |
210 | } |
211 | |
212 | static int write_metadata(struct log_writes_c *lc, void *entry, |
213 | size_t entrylen, void *data, size_t datalen, |
214 | sector_t sector) |
215 | { |
216 | struct bio *bio; |
217 | struct page *page; |
218 | void *ptr; |
219 | size_t ret; |
220 | |
	bio = bio_alloc(lc->logdev->bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
222 | bio->bi_iter.bi_size = 0; |
223 | bio->bi_iter.bi_sector = sector; |
224 | bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? |
225 | log_end_super : log_end_io; |
226 | bio->bi_private = lc; |
227 | |
228 | page = alloc_page(GFP_KERNEL); |
229 | if (!page) { |
		DMERR("Couldn't alloc log page");
231 | bio_put(bio); |
232 | goto error; |
233 | } |
234 | |
235 | ptr = kmap_local_page(page); |
236 | memcpy(ptr, entry, entrylen); |
237 | if (datalen) |
238 | memcpy(ptr + entrylen, data, datalen); |
239 | memset(ptr + entrylen + datalen, 0, |
240 | lc->sectorsize - entrylen - datalen); |
241 | kunmap_local(ptr); |
242 | |
	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
246 | goto error_bio; |
247 | } |
248 | submit_bio(bio); |
249 | return 0; |
250 | error_bio: |
251 | bio_put(bio); |
252 | __free_page(page); |
253 | error: |
254 | put_io_block(lc); |
255 | return -1; |
256 | } |
257 | |
258 | static int write_inline_data(struct log_writes_c *lc, void *entry, |
259 | size_t entrylen, void *data, size_t datalen, |
260 | sector_t sector) |
261 | { |
262 | int bio_pages, pg_datalen, pg_sectorlen, i; |
263 | struct page *page; |
264 | struct bio *bio; |
265 | size_t ret; |
266 | void *ptr; |
267 | |
268 | while (datalen) { |
269 | bio_pages = bio_max_segs(DIV_ROUND_UP(datalen, PAGE_SIZE)); |
270 | |
		atomic_inc(&lc->io_blocks);

		bio = bio_alloc(lc->logdev->bdev, bio_pages, REQ_OP_WRITE,
				GFP_KERNEL);
275 | bio->bi_iter.bi_size = 0; |
276 | bio->bi_iter.bi_sector = sector; |
277 | bio->bi_end_io = log_end_io; |
278 | bio->bi_private = lc; |
279 | |
280 | for (i = 0; i < bio_pages; i++) { |
281 | pg_datalen = min_t(int, datalen, PAGE_SIZE); |
282 | pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize); |
283 | |
284 | page = alloc_page(GFP_KERNEL); |
285 | if (!page) { |
				DMERR("Couldn't alloc inline data page");
287 | goto error_bio; |
288 | } |
289 | |
290 | ptr = kmap_local_page(page); |
291 | memcpy(ptr, data, pg_datalen); |
292 | if (pg_sectorlen > pg_datalen) |
293 | memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen); |
294 | kunmap_local(ptr); |
295 | |
			ret = bio_add_page(bio, page, pg_sectorlen, 0);
			if (ret != pg_sectorlen) {
				DMERR("Couldn't add page of inline data");
299 | __free_page(page); |
300 | goto error_bio; |
301 | } |
302 | |
303 | datalen -= pg_datalen; |
304 | data += pg_datalen; |
305 | } |
306 | submit_bio(bio); |
307 | |
308 | sector += bio_pages * PAGE_SECTORS; |
309 | } |
310 | return 0; |
311 | error_bio: |
312 | bio_free_pages(bio); |
313 | bio_put(bio); |
314 | put_io_block(lc); |
315 | return -1; |
316 | } |
317 | |
318 | static int log_one_block(struct log_writes_c *lc, |
319 | struct pending_block *block, sector_t sector) |
320 | { |
321 | struct bio *bio; |
322 | struct log_write_entry entry; |
323 | size_t metadatalen, ret; |
324 | int i; |
325 | |
326 | entry.sector = cpu_to_le64(block->sector); |
327 | entry.nr_sectors = cpu_to_le64(block->nr_sectors); |
328 | entry.flags = cpu_to_le64(block->flags); |
329 | entry.data_len = cpu_to_le64(block->datalen); |
330 | |
331 | metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0; |
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   metadatalen, sector)) {
334 | free_pending_block(lc, block); |
335 | return -1; |
336 | } |
337 | |
	sector += dev_to_bio_sectors(lc, 1);
339 | |
340 | if (block->datalen && metadatalen == 0) { |
		if (write_inline_data(lc, &entry, sizeof(entry), block->data,
				      block->datalen, sector)) {
343 | free_pending_block(lc, block); |
344 | return -1; |
345 | } |
346 | /* we don't support both inline data & bio data */ |
347 | goto out; |
348 | } |
349 | |
350 | if (!block->vec_cnt) |
351 | goto out; |
352 | |
	atomic_inc(&lc->io_blocks);
	bio = bio_alloc(lc->logdev->bdev, bio_max_segs(block->vec_cnt),
			REQ_OP_WRITE, GFP_KERNEL);
356 | bio->bi_iter.bi_size = 0; |
357 | bio->bi_iter.bi_sector = sector; |
358 | bio->bi_end_io = log_end_io; |
359 | bio->bi_private = lc; |
360 | |
361 | for (i = 0; i < block->vec_cnt; i++) { |
362 | /* |
363 | * The page offset is always 0 because we allocate a new page |
		 * for every bvec in the original bio for simplicity's sake.
365 | */ |
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
368 | if (ret != block->vecs[i].bv_len) { |
			atomic_inc(&lc->io_blocks);
			submit_bio(bio);
			bio = bio_alloc(lc->logdev->bdev,
					bio_max_segs(block->vec_cnt - i),
					REQ_OP_WRITE, GFP_KERNEL);
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
383 | bio_put(bio); |
384 | goto error; |
385 | } |
386 | } |
387 | sector += block->vecs[i].bv_len >> SECTOR_SHIFT; |
388 | } |
389 | submit_bio(bio); |
390 | out: |
	kfree(block->data);
	kfree(block);
393 | put_pending_block(lc); |
394 | return 0; |
395 | error: |
396 | free_pending_block(lc, block); |
397 | put_io_block(lc); |
398 | return -1; |
399 | } |
400 | |
401 | static int log_super(struct log_writes_c *lc) |
402 | { |
403 | struct log_write_super super; |
404 | |
405 | super.magic = cpu_to_le64(WRITE_LOG_MAGIC); |
406 | super.version = cpu_to_le64(WRITE_LOG_VERSION); |
407 | super.nr_entries = cpu_to_le64(lc->logged_entries); |
408 | super.sectorsize = cpu_to_le32(lc->sectorsize); |
409 | |
	if (write_metadata(lc, &super, sizeof(super), NULL, 0,
			   WRITE_LOG_SUPER_SECTOR)) {
		DMERR("Couldn't write super");
413 | return -1; |
414 | } |
415 | |
416 | /* |
	 * Super sector should be written in-order, otherwise the
418 | * nr_entries could be rewritten incorrectly by an old bio. |
419 | */ |
420 | wait_for_completion_io(&lc->super_done); |
421 | |
422 | return 0; |
423 | } |
424 | |
425 | static inline sector_t logdev_last_sector(struct log_writes_c *lc) |
426 | { |
	return bdev_nr_sectors(lc->logdev->bdev);
428 | } |
429 | |
430 | static int log_writes_kthread(void *arg) |
431 | { |
432 | struct log_writes_c *lc = arg; |
433 | sector_t sector = 0; |
434 | |
435 | while (!kthread_should_stop()) { |
436 | bool super = false; |
437 | bool logging_enabled; |
438 | struct pending_block *block = NULL; |
439 | int ret; |
440 | |
		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
446 | if (!lc->logging_enabled) |
447 | goto next; |
448 | |
449 | sector = lc->next_sector; |
			if (!(block->flags & LOG_DISCARD_FLAG))
				lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
			lc->next_sector += dev_to_bio_sectors(lc, 1);
453 | |
454 | /* |
455 | * Apparently the size of the device may not be known |
456 | * right away, so handle this properly. |
457 | */ |
458 | if (!lc->end_sector) |
459 | lc->end_sector = logdev_last_sector(lc); |
460 | if (lc->end_sector && |
461 | lc->next_sector >= lc->end_sector) { |
				DMERR("Ran out of space on the logdev");
463 | lc->logging_enabled = false; |
464 | goto next; |
465 | } |
466 | lc->logged_entries++; |
			atomic_inc(&lc->io_blocks);
468 | |
469 | super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG)); |
470 | if (super) |
				atomic_inc(&lc->io_blocks);
472 | } |
473 | next: |
474 | logging_enabled = lc->logging_enabled; |
		spin_unlock_irq(&lc->blocks_lock);
476 | if (block) { |
477 | if (logging_enabled) { |
478 | ret = log_one_block(lc, block, sector); |
479 | if (!ret && super) |
480 | ret = log_super(lc); |
481 | if (ret) { |
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
485 | } |
486 | } else |
487 | free_pending_block(lc, block); |
488 | continue; |
489 | } |
490 | |
491 | if (!try_to_freeze()) { |
492 | set_current_state(TASK_INTERRUPTIBLE); |
493 | if (!kthread_should_stop() && |
			    list_empty(&lc->logging_blocks))
495 | schedule(); |
496 | __set_current_state(TASK_RUNNING); |
497 | } |
498 | } |
499 | return 0; |
500 | } |
501 | |
502 | /* |
503 | * Construct a log-writes mapping: |
504 | * log-writes <dev_path> <log_dev_path> |
505 | */ |
506 | static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
507 | { |
508 | struct log_writes_c *lc; |
509 | struct dm_arg_set as; |
510 | const char *devname, *logdevname; |
511 | int ret; |
512 | |
513 | as.argc = argc; |
514 | as.argv = argv; |
515 | |
516 | if (argc < 2) { |
		ti->error = "Invalid argument count";
518 | return -EINVAL; |
519 | } |
520 | |
	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
524 | return -ENOMEM; |
525 | } |
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	init_completion(&lc->super_done);
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);
533 | |
	devname = dm_shift_arg(&as);
	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
538 | goto bad; |
539 | } |
540 | |
	logdevname = dm_shift_arg(&as);
	ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
			    &lc->logdev);
	if (ret) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
547 | goto bad; |
548 | } |
549 | |
	lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
	lc->sectorshift = ilog2(lc->sectorsize);
	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (IS_ERR(lc->log_kthread)) {
		ret = PTR_ERR(lc->log_kthread);
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
558 | goto bad; |
559 | } |
560 | |
561 | /* |
562 | * next_sector is in 512b sectors to correspond to what bi_sector expects. |
563 | * The super starts at sector 0, and the next_sector is the next logical |
564 | * one based on the sectorsize of the device. |
565 | */ |
566 | lc->next_sector = lc->sectorsize >> SECTOR_SHIFT; |
567 | lc->logging_enabled = true; |
568 | lc->end_sector = logdev_last_sector(lc); |
569 | lc->device_supports_discard = true; |
570 | |
571 | ti->num_flush_bios = 1; |
572 | ti->flush_supported = true; |
573 | ti->num_discard_bios = 1; |
574 | ti->discards_supported = true; |
575 | ti->per_io_data_size = sizeof(struct per_bio_data); |
576 | ti->private = lc; |
577 | return 0; |
578 | |
579 | bad: |
	kfree(lc);
581 | return ret; |
582 | } |
583 | |
584 | static int log_mark(struct log_writes_c *lc, char *data) |
585 | { |
586 | struct pending_block *block; |
587 | size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry); |
588 | |
	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
592 | return -ENOMEM; |
593 | } |
594 | |
	block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
599 | return -ENOMEM; |
600 | } |
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
608 | return 0; |
609 | } |
610 | |
611 | static void log_writes_dtr(struct dm_target *ti) |
612 | { |
613 | struct log_writes_c *lc = ti->private; |
614 | |
	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
618 | |
619 | /* |
	 * This is just nice to have since it'll update the super to include the
	 * unflushed blocks; if it fails we don't really care.
622 | */ |
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
625 | wait_event(lc->wait, !atomic_read(&lc->io_blocks) && |
626 | !atomic_read(&lc->pending_blocks)); |
	kthread_stop(lc->log_kthread);
628 | |
629 | WARN_ON(!list_empty(&lc->logging_blocks)); |
630 | WARN_ON(!list_empty(&lc->unflushed_blocks)); |
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
634 | } |
635 | |
636 | static void normal_map_bio(struct dm_target *ti, struct bio *bio) |
637 | { |
638 | struct log_writes_c *lc = ti->private; |
639 | |
	bio_set_dev(bio, lc->dev->bdev);
641 | } |
642 | |
643 | static int log_writes_map(struct dm_target *ti, struct bio *bio) |
644 | { |
645 | struct log_writes_c *lc = ti->private; |
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
647 | struct pending_block *block; |
648 | struct bvec_iter iter; |
649 | struct bio_vec bv; |
650 | size_t alloc_size; |
651 | int i = 0; |
652 | bool flush_bio = (bio->bi_opf & REQ_PREFLUSH); |
653 | bool fua_bio = (bio->bi_opf & REQ_FUA); |
654 | bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD); |
655 | bool meta_bio = (bio->bi_opf & REQ_META); |
656 | |
657 | pb->block = NULL; |
658 | |
659 | /* Don't bother doing anything if logging has been disabled */ |
660 | if (!lc->logging_enabled) |
661 | goto map_bio; |
662 | |
663 | /* |
664 | * Map reads as normal. |
665 | */ |
666 | if (bio_data_dir(bio) == READ) |
667 | goto map_bio; |
668 | |
669 | /* No sectors and not a flush? Don't care */ |
670 | if (!bio_sectors(bio) && !flush_bio) |
671 | goto map_bio; |
672 | |
673 | /* |
674 | * Discards will have bi_size set but there's no actual data, so just |
675 | * allocate the size of the pending block. |
676 | */ |
677 | if (discard_bio) |
678 | alloc_size = sizeof(struct pending_block); |
679 | else |
680 | alloc_size = struct_size(block, vecs, bio_segments(bio)); |
681 | |
	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return DM_MAPIO_KILL;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);
693 | |
694 | if (flush_bio) |
695 | block->flags |= LOG_FLUSH_FLAG; |
696 | if (fua_bio) |
697 | block->flags |= LOG_FUA_FLAG; |
698 | if (discard_bio) |
699 | block->flags |= LOG_DISCARD_FLAG; |
700 | if (meta_bio) |
701 | block->flags |= LOG_METADATA_FLAG; |
702 | |
	block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
704 | block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio)); |
705 | |
706 | /* We don't need the data, just submit */ |
707 | if (discard_bio) { |
708 | WARN_ON(flush_bio || fua_bio); |
709 | if (lc->device_supports_discard) |
710 | goto map_bio; |
711 | bio_endio(bio); |
712 | return DM_MAPIO_SUBMITTED; |
713 | } |
714 | |
715 | /* Flush bio, splice the unflushed blocks onto this list and submit */ |
716 | if (flush_bio && !bio_sectors(bio)) { |
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
720 | goto map_bio; |
721 | } |
722 | |
723 | /* |
	 * We will write this bio somewhere else way later, so we need to copy
	 * the actual contents into new pages to guarantee the data is still
	 * there when we go to log it.
727 | * |
728 | * We do this because this could be a bio from O_DIRECT in which case we |
729 | * can't just hold onto the page until some later point, we have to |
730 | * manually copy the contents. |
731 | */ |
732 | bio_for_each_segment(bv, bio, iter) { |
733 | struct page *page; |
734 | void *dst; |
735 | |
736 | page = alloc_page(GFP_NOIO); |
737 | if (!page) { |
			DMERR("Error allocating page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
743 | return DM_MAPIO_KILL; |
744 | } |
745 | |
746 | dst = kmap_local_page(page); |
		memcpy_from_bvec(dst, &bv);
748 | kunmap_local(dst); |
749 | block->vecs[i].bv_page = page; |
750 | block->vecs[i].bv_len = bv.bv_len; |
751 | block->vec_cnt++; |
752 | i++; |
753 | } |
754 | |
755 | /* Had a flush with data in it, weird */ |
756 | if (flush_bio) { |
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
760 | } |
761 | map_bio: |
762 | normal_map_bio(ti, bio); |
763 | return DM_MAPIO_REMAPPED; |
764 | } |
765 | |
766 | static int normal_end_io(struct dm_target *ti, struct bio *bio, |
767 | blk_status_t *error) |
768 | { |
769 | struct log_writes_c *lc = ti->private; |
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
771 | |
772 | if (bio_data_dir(bio) == WRITE && pb->block) { |
773 | struct pending_block *block = pb->block; |
774 | unsigned long flags; |
775 | |
776 | spin_lock_irqsave(&lc->blocks_lock, flags); |
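		/*
		 * Completed writes park on unflushed_blocks until a FLUSH
		 * completes, at which point they (and the flush itself) move
		 * to logging_blocks. FUA writes are durable on completion,
		 * so they are queued for logging immediately.
		 */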
777 | if (block->flags & LOG_FLUSH_FLAG) { |
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
787 | } |
788 | |
789 | return DM_ENDIO_DONE; |
790 | } |
791 | |
792 | /* |
793 | * INFO format: <logged entries> <highest allocated sector> |
794 | */ |
795 | static void log_writes_status(struct dm_target *ti, status_type_t type, |
796 | unsigned int status_flags, char *result, |
797 | unsigned int maxlen) |
798 | { |
799 | unsigned int sz = 0; |
800 | struct log_writes_c *lc = ti->private; |
801 | |
802 | switch (type) { |
803 | case STATUSTYPE_INFO: |
		DMEMIT("%llu %llu", lc->logged_entries,
805 | (unsigned long long)lc->next_sector - 1); |
806 | if (!lc->logging_enabled) |
			DMEMIT(" logging_disabled");
808 | break; |
809 | |
810 | case STATUSTYPE_TABLE: |
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
812 | break; |
813 | |
814 | case STATUSTYPE_IMA: |
815 | *result = '\0'; |
816 | break; |
817 | } |
818 | } |
819 | |
820 | static int log_writes_prepare_ioctl(struct dm_target *ti, |
821 | struct block_device **bdev) |
822 | { |
823 | struct log_writes_c *lc = ti->private; |
824 | struct dm_dev *dev = lc->dev; |
825 | |
826 | *bdev = dev->bdev; |
827 | /* |
828 | * Only pass ioctls through if the device sizes match exactly. |
829 | */ |
	if (ti->len != bdev_nr_sectors(dev->bdev))
831 | return 1; |
832 | return 0; |
833 | } |
834 | |
835 | static int log_writes_iterate_devices(struct dm_target *ti, |
836 | iterate_devices_callout_fn fn, |
837 | void *data) |
838 | { |
839 | struct log_writes_c *lc = ti->private; |
840 | |
841 | return fn(ti, lc->dev, 0, ti->len, data); |
842 | } |
843 | |
844 | /* |
845 | * Messages supported: |
846 | * mark <mark data> - specify the marked data. |
847 | */ |
848 | static int log_writes_message(struct dm_target *ti, unsigned int argc, char **argv, |
849 | char *result, unsigned int maxlen) |
850 | { |
851 | int r = -EINVAL; |
852 | struct log_writes_c *lc = ti->private; |
853 | |
854 | if (argc != 2) { |
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
856 | return r; |
857 | } |
858 | |
	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);
863 | |
864 | return r; |
865 | } |
866 | |
867 | static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) |
868 | { |
869 | struct log_writes_c *lc = ti->private; |
870 | |
	if (!bdev_max_discard_sectors(lc->dev->bdev)) {
872 | lc->device_supports_discard = false; |
873 | limits->discard_granularity = lc->sectorsize; |
874 | limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); |
875 | } |
	limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
	limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
878 | limits->io_min = limits->physical_block_size; |
879 | limits->dma_alignment = limits->logical_block_size - 1; |
880 | } |
881 | |
882 | #if IS_ENABLED(CONFIG_FS_DAX) |
883 | static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, |
884 | pgoff_t *pgoff) |
885 | { |
886 | struct log_writes_c *lc = ti->private; |
887 | |
	*pgoff += (get_start_sect(lc->dev->bdev) >> PAGE_SECTORS_SHIFT);
889 | return lc->dev->dax_dev; |
890 | } |
891 | |
892 | static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, |
893 | long nr_pages, enum dax_access_mode mode, void **kaddr, |
894 | pfn_t *pfn) |
895 | { |
	struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
897 | |
898 | return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); |
899 | } |
900 | |
901 | static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, |
902 | size_t nr_pages) |
903 | { |
	struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);

	return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
907 | } |
908 | |
909 | static size_t log_writes_dax_recovery_write(struct dm_target *ti, |
910 | pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) |
911 | { |
	struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
913 | |
914 | return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); |
915 | } |
916 | |
917 | #else |
918 | #define log_writes_dax_direct_access NULL |
919 | #define log_writes_dax_zero_page_range NULL |
920 | #define log_writes_dax_recovery_write NULL |
921 | #endif |
922 | |
923 | static struct target_type log_writes_target = { |
	.name = "log-writes",
925 | .version = {1, 1, 0}, |
926 | .module = THIS_MODULE, |
927 | .ctr = log_writes_ctr, |
928 | .dtr = log_writes_dtr, |
929 | .map = log_writes_map, |
930 | .end_io = normal_end_io, |
931 | .status = log_writes_status, |
932 | .prepare_ioctl = log_writes_prepare_ioctl, |
933 | .message = log_writes_message, |
934 | .iterate_devices = log_writes_iterate_devices, |
935 | .io_hints = log_writes_io_hints, |
936 | .direct_access = log_writes_dax_direct_access, |
937 | .dax_zero_page_range = log_writes_dax_zero_page_range, |
938 | .dax_recovery_write = log_writes_dax_recovery_write, |
939 | }; |
940 | module_dm(log_writes); |
941 | |
MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");
945 | |