// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2007 Red Hat GmbH
 *
 * A target that delays reads and/or writes and can send
 * them to different devices.
 *
 * This file is released under the GPL.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "delay"

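/*
 * Per-class (read/write/flush) settings: the backing device, the start
 * offset on it, the delay in milliseconds and the number of delayed bios
 * currently outstanding for this class.
 */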
struct delay_class {
	struct dm_dev *dev;
	sector_t start;
	unsigned int delay;
	unsigned int ops;
};

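/*
 * Per-target context.  Short delays (under 50 ms) are serviced by a
 * dedicated worker kthread; longer delays use a timer and the kdelayd
 * workqueue.
 */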
struct delay_c {
	struct timer_list delay_timer;
	struct mutex timer_lock;
	struct workqueue_struct *kdelayd_wq;
	struct work_struct flush_expired_bios;
	struct list_head delayed_bios;
	struct task_struct *worker;
	atomic_t may_delay;

	struct delay_class read;
	struct delay_class write;
	struct delay_class flush;

	int argc;
};

struct dm_delay_info {
	struct delay_c *context;
	struct delay_class *class;
	struct list_head list;
	unsigned long expires;
};

static DEFINE_MUTEX(delayed_bios_lock);

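/* Timer callback: hand expired bios off to the workqueue for submission. */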
static void handle_delayed_timer(struct timer_list *t)
{
	struct delay_c *dc = from_timer(dc, t, delay_timer);

	queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}

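/* (Re)arm the delay timer so that it fires no later than @expires. */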
static void queue_timeout(struct delay_c *dc, unsigned long expires)
{
	mutex_lock(&dc->timer_lock);

	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
		mod_timer(&dc->delay_timer, expires);

	mutex_unlock(&dc->timer_lock);
}

static inline bool delay_is_fast(struct delay_c *dc)
{
	return !!dc->worker;
}

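/*
 * Kthread path: submit every delayed bio whose delay has elapsed, or all of
 * them if @flush_all is set.
 */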
static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all)
{
	struct dm_delay_info *delayed, *next;

	mutex_lock(&delayed_bios_lock);
	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
			struct bio *bio = dm_bio_from_per_bio_data(delayed,
						sizeof(struct dm_delay_info));
			list_del(&delayed->list);
			dm_submit_bio_remap(bio, NULL);
			delayed->class->ops--;
		}
	}
	mutex_unlock(&delayed_bios_lock);
}

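/* Worker kthread for short delays: poll the delayed list, sleep when it is empty. */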
static int flush_worker_fn(void *data)
{
	struct delay_c *dc = data;

	while (1) {
		flush_delayed_bios_fast(dc, false);
		if (unlikely(list_empty(&dc->delayed_bios))) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
		} else
			cond_resched();
	}

	return 0;
}

static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		dm_submit_bio_remap(bio, NULL);
		bio = n;
	}
}

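/*
 * Timer/workqueue path: collect the bios whose delay has elapsed and re-arm
 * the timer for the earliest remaining expiry.
 */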
static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all)
{
	struct dm_delay_info *delayed, *next;
	unsigned long next_expires = 0;
	unsigned long start_timer = 0;
	struct bio_list flush_bios = { };

	mutex_lock(&delayed_bios_lock);
	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
			struct bio *bio = dm_bio_from_per_bio_data(delayed,
						sizeof(struct dm_delay_info));
			list_del(&delayed->list);
			bio_list_add(&flush_bios, bio);
			delayed->class->ops--;
			continue;
		}

		if (!start_timer) {
			start_timer = 1;
			next_expires = delayed->expires;
		} else
			next_expires = min(next_expires, delayed->expires);
	}
	mutex_unlock(&delayed_bios_lock);

	if (start_timer)
		queue_timeout(dc, next_expires);

	return bio_list_get(&flush_bios);
}

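/* Workqueue handler: submit whatever has expired via the appropriate path. */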
static void flush_expired_bios(struct work_struct *work)
{
	struct delay_c *dc;

	dc = container_of(work, struct delay_c, flush_expired_bios);
	if (delay_is_fast(dc))
		flush_delayed_bios_fast(dc, false);
	else
		flush_bios(flush_delayed_bios(dc, false));
}

static void delay_dtr(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	if (dc->kdelayd_wq)
		destroy_workqueue(dc->kdelayd_wq);

	if (dc->read.dev)
		dm_put_device(ti, dc->read.dev);
	if (dc->write.dev)
		dm_put_device(ti, dc->write.dev);
	if (dc->flush.dev)
		dm_put_device(ti, dc->flush.dev);
	if (dc->worker)
		kthread_stop(dc->worker);

	if (!delay_is_fast(dc))
		mutex_destroy(&dc->timer_lock);

	kfree(dc);
}

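/* Parse one <device> <offset> <delay> triplet and take a reference on the device. */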
static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **argv)
{
	int ret;
	unsigned long long tmpll;
	char dummy;

	if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
		ti->error = "Invalid device sector";
		return -EINVAL;
	}
	c->start = tmpll;

	if (sscanf(argv[2], "%u%c", &c->delay, &dummy) != 1) {
		ti->error = "Invalid delay";
		return -EINVAL;
	}

	ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &c->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		return ret;
	}

	return 0;
}

/*
 * Mapping parameters:
 *    <device> <offset> <delay> [<write_device> <write_offset> <write_delay>
 *				 [<flush_device> <flush_offset> <flush_delay>]]
 *
 * With separate write parameters, the first set is only used for reads.
 * Offsets are specified in sectors.
 * Delays are specified in milliseconds.
 */
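/*
 * Example (illustrative; the device path is a placeholder): a table line
 * that passes reads through undelayed but delays writes and flushes by
 * 500 ms:
 *
 *   echo "0 `blockdev --getsz /dev/sdX` delay /dev/sdX 0 0 /dev/sdX 0 500" \
 *	| dmsetup create delayed
 */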
static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct delay_c *dc;
	int ret;
	unsigned int max_delay;

	if (argc != 3 && argc != 6 && argc != 9) {
		ti->error = "Requires exactly 3, 6 or 9 arguments";
		return -EINVAL;
	}

	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
	if (!dc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}

	ti->private = dc;
	INIT_LIST_HEAD(&dc->delayed_bios);
	atomic_set(&dc->may_delay, 1);
	dc->argc = argc;

	ret = delay_class_ctr(ti, &dc->read, argv);
	if (ret)
		goto bad;
	max_delay = dc->read.delay;

	if (argc == 3) {
		ret = delay_class_ctr(ti, &dc->write, argv);
		if (ret)
			goto bad;
		ret = delay_class_ctr(ti, &dc->flush, argv);
		if (ret)
			goto bad;
		max_delay = max(max_delay, dc->write.delay);
		max_delay = max(max_delay, dc->flush.delay);
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->write, argv + 3);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->write.delay);
	if (argc == 6) {
		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
		if (ret)
			goto bad;
		max_delay = max(max_delay, dc->flush.delay);
		goto out;
	}

	ret = delay_class_ctr(ti, &dc->flush, argv + 6);
	if (ret)
		goto bad;
	max_delay = max(max_delay, dc->flush.delay);

out:
	if (max_delay < 50) {
		/*
		 * In case of small requested delays, use kthread instead of
		 * timers and workqueue to achieve better latency.
		 */
		dc->worker = kthread_create(&flush_worker_fn, dc,
					    "dm-delay-flush-worker");
		if (IS_ERR(dc->worker)) {
			ret = PTR_ERR(dc->worker);
			goto bad;
		}
	} else {
		timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
		INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
		mutex_init(&dc->timer_lock);
		dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
		if (!dc->kdelayd_wq) {
			ret = -EINVAL;
			DMERR("Couldn't start kdelayd");
			goto bad;
		}
	}

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->accounts_remapped_io = true;
	ti->per_io_data_size = sizeof(struct dm_delay_info);
	return 0;

bad:
	delay_dtr(ti);
	return ret;
}

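/*
 * Queue @bio with its expiry time, or return DM_MAPIO_REMAPPED so the caller
 * submits it immediately when this class has no delay or delaying is disabled.
 */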
static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
{
	struct dm_delay_info *delayed;
	unsigned long expires = 0;

	if (!c->delay || !atomic_read(&dc->may_delay))
		return DM_MAPIO_REMAPPED;

	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	delayed->context = dc;
	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);

	mutex_lock(&delayed_bios_lock);
	c->ops++;
	list_add_tail(&delayed->list, &dc->delayed_bios);
	mutex_unlock(&delayed_bios_lock);

	if (delay_is_fast(dc))
		wake_up_process(dc->worker);
	else
		queue_timeout(dc, expires);

	return DM_MAPIO_SUBMITTED;
}

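/* Stop delaying new bios and flush out everything that is already queued. */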
static void delay_presuspend(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	atomic_set(&dc->may_delay, 0);

	if (delay_is_fast(dc))
		flush_delayed_bios_fast(dc, true);
	else {
		del_timer_sync(&dc->delay_timer);
		flush_bios(flush_delayed_bios(dc, true));
	}
}

static void delay_resume(struct dm_target *ti)
{
	struct delay_c *dc = ti->private;

	atomic_set(&dc->may_delay, 1);
}

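/*
 * Pick the delay class from the bio's direction and flags, remap it to the
 * class's device and delay it if the class has a delay configured.
 */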
static int delay_map(struct dm_target *ti, struct bio *bio)
{
	struct delay_c *dc = ti->private;
	struct delay_class *c;
	struct dm_delay_info *delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));

	if (bio_data_dir(bio) == WRITE) {
		if (unlikely(bio->bi_opf & REQ_PREFLUSH))
			c = &dc->flush;
		else
			c = &dc->write;
	} else {
		c = &dc->read;
	}
	delayed->class = c;
	bio_set_dev(bio, c->dev->bdev);
	bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	return delay_bio(dc, c, bio);
}

#define DMEMIT_DELAY_CLASS(c) \
	DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay)

static void delay_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct delay_c *dc = ti->private;
	int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%u %u %u", dc->read.ops, dc->write.ops, dc->flush.ops);
		break;

	case STATUSTYPE_TABLE:
		DMEMIT_DELAY_CLASS(&dc->read);
		if (dc->argc >= 6) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->write);
		}
		if (dc->argc >= 9) {
			DMEMIT(" ");
			DMEMIT_DELAY_CLASS(&dc->flush);
		}
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
}

static int delay_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	struct delay_c *dc = ti->private;
	int ret = 0;

	ret = fn(ti, dc->read.dev, dc->read.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->write.dev, dc->write.start, ti->len, data);
	if (ret)
		goto out;
	ret = fn(ti, dc->flush.dev, dc->flush.start, ti->len, data);
	if (ret)
		goto out;

out:
	return ret;
}

static struct target_type delay_target = {
	.name = "delay",
	.version = {1, 4, 0},
	.features = DM_TARGET_PASSES_INTEGRITY,
	.module = THIS_MODULE,
	.ctr = delay_ctr,
	.dtr = delay_dtr,
	.map = delay_map,
	.presuspend = delay_presuspend,
	.resume = delay_resume,
	.status = delay_status,
	.iterate_devices = delay_iterate_devices,
};
module_dm(delay);

MODULE_DESCRIPTION(DM_NAME " delay target");
MODULE_AUTHOR("Heinz Mauelshagen <mauelshagen@redhat.com>");
MODULE_LICENSE("GPL");