1/*
2 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
3 * Copyright (C) 2011-2013 Red Hat, Inc.
4 *
5 * This file is released under the GPL.
6 *
7 * dm-switch is a device-mapper target that maps IO to underlying block
8 * devices efficiently when there are a large number of fixed-sized
9 * address regions but there is no simple pattern to allow for a compact
10 * mapping representation such as dm-stripe.
11 */
12
13#include <linux/device-mapper.h>
14
15#include <linux/module.h>
16#include <linux/init.h>
17#include <linux/vmalloc.h>
18
19#define DM_MSG_PREFIX "switch"
20
/*
 * Storage unit of the region table.
 *
 * Every slot packs <region_entries_per_slot> entries, each
 * <region_table_entry_bits> wide.
 */
typedef unsigned long region_table_slot_t;
26
27/*
28 * A device with the offset to its start sector.
29 */
30struct switch_path {
31 struct dm_dev *dmdev;
32 sector_t start;
33};
34
35/*
36 * Context block for a dm switch device.
37 */
38struct switch_ctx {
39 struct dm_target *ti;
40
41 unsigned nr_paths; /* Number of paths in path_list. */
42
43 unsigned region_size; /* Region size in 512-byte sectors */
44 unsigned long nr_regions; /* Number of regions making up the device */
45 signed char region_size_bits; /* log2 of region_size or -1 */
46
47 unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
48 unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
49 signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
50
51 region_table_slot_t *region_table; /* Region table */
52
53 /*
54 * Array of dm devices to switch between.
55 */
56 struct switch_path path_list[0];
57};
58
59static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
60 unsigned region_size)
61{
62 struct switch_ctx *sctx;
63
64 sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
65 if (!sctx)
66 return NULL;
67
68 sctx->ti = ti;
69 sctx->region_size = region_size;
70
71 ti->private = sctx;
72
73 return sctx;
74}
75
76static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
77{
78 struct switch_ctx *sctx = ti->private;
79 sector_t nr_regions = ti->len;
80 sector_t nr_slots;
81
82 if (!(sctx->region_size & (sctx->region_size - 1)))
83 sctx->region_size_bits = __ffs(sctx->region_size);
84 else
85 sctx->region_size_bits = -1;
86
87 sctx->region_table_entry_bits = 1;
88 while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
89 (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
90 sctx->region_table_entry_bits++;
91
92 sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
93 if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
94 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
95 else
96 sctx->region_entries_per_slot_bits = -1;
97
98 if (sector_div(nr_regions, sctx->region_size))
99 nr_regions++;
100
101 if (nr_regions >= ULONG_MAX) {
102 ti->error = "Region table too large";
103 return -EINVAL;
104 }
105 sctx->nr_regions = nr_regions;
106
107 nr_slots = nr_regions;
108 if (sector_div(nr_slots, sctx->region_entries_per_slot))
109 nr_slots++;
110
111 if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
112 ti->error = "Region table too large";
113 return -EINVAL;
114 }
115
116 sctx->region_table = vmalloc(array_size(nr_slots,
117 sizeof(region_table_slot_t)));
118 if (!sctx->region_table) {
119 ti->error = "Cannot allocate region table";
120 return -ENOMEM;
121 }
122
123 return 0;
124}
125
126static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
127 unsigned long *region_index, unsigned *bit)
128{
129 if (sctx->region_entries_per_slot_bits >= 0) {
130 *region_index = region_nr >> sctx->region_entries_per_slot_bits;
131 *bit = region_nr & (sctx->region_entries_per_slot - 1);
132 } else {
133 *region_index = region_nr / sctx->region_entries_per_slot;
134 *bit = region_nr % sctx->region_entries_per_slot;
135 }
136
137 *bit *= sctx->region_table_entry_bits;
138}
139
140static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
141{
142 unsigned long region_index;
143 unsigned bit;
144
145 switch_get_position(sctx, region_nr, &region_index, &bit);
146
147 return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
148 ((1 << sctx->region_table_entry_bits) - 1);
149}
150
151/*
152 * Find which path to use at given offset.
153 */
154static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
155{
156 unsigned path_nr;
157 sector_t p;
158
159 p = offset;
160 if (sctx->region_size_bits >= 0)
161 p >>= sctx->region_size_bits;
162 else
163 sector_div(p, sctx->region_size);
164
165 path_nr = switch_region_table_read(sctx, p);
166
167 /* This can only happen if the processor uses non-atomic stores. */
168 if (unlikely(path_nr >= sctx->nr_paths))
169 path_nr = 0;
170
171 return path_nr;
172}
173
174static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
175 unsigned value)
176{
177 unsigned long region_index;
178 unsigned bit;
179 region_table_slot_t pte;
180
181 switch_get_position(sctx, region_nr, &region_index, &bit);
182
183 pte = sctx->region_table[region_index];
184 pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
185 pte |= (region_table_slot_t)value << bit;
186 sctx->region_table[region_index] = pte;
187}
188
189/*
190 * Fill the region table with an initial round robin pattern.
191 */
192static void initialise_region_table(struct switch_ctx *sctx)
193{
194 unsigned path_nr = 0;
195 unsigned long region_nr;
196
197 for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
198 switch_region_table_write(sctx, region_nr, path_nr);
199 if (++path_nr >= sctx->nr_paths)
200 path_nr = 0;
201 }
202}
203
204static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
205{
206 struct switch_ctx *sctx = ti->private;
207 unsigned long long start;
208 int r;
209
210 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
211 &sctx->path_list[sctx->nr_paths].dmdev);
212 if (r) {
213 ti->error = "Device lookup failed";
214 return r;
215 }
216
217 if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
218 ti->error = "Invalid device starting offset";
219 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
220 return -EINVAL;
221 }
222
223 sctx->path_list[sctx->nr_paths].start = start;
224
225 sctx->nr_paths++;
226
227 return 0;
228}
229
230/*
231 * Destructor: Don't free the dm_target, just the ti->private data (if any).
232 */
233static void switch_dtr(struct dm_target *ti)
234{
235 struct switch_ctx *sctx = ti->private;
236
237 while (sctx->nr_paths--)
238 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
239
240 vfree(sctx->region_table);
241 kfree(sctx);
242}
243
244/*
245 * Constructor arguments:
246 * <num_paths> <region_size> <num_optional_args> [<optional_args>...]
247 * [<dev_path> <offset>]+
248 *
249 * Optional args are to allow for future extension: currently this
250 * parameter must be 0.
251 */
252static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
253{
254 static const struct dm_arg _args[] = {
255 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
256 {1, UINT_MAX, "Invalid region size"},
257 {0, 0, "Invalid number of optional args"},
258 };
259
260 struct switch_ctx *sctx;
261 struct dm_arg_set as;
262 unsigned nr_paths, region_size, nr_optional_args;
263 int r;
264
265 as.argc = argc;
266 as.argv = argv;
267
268 r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
269 if (r)
270 return -EINVAL;
271
272 r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
273 if (r)
274 return r;
275
276 r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
277 if (r)
278 return r;
279 /* parse optional arguments here, if we add any */
280
281 if (as.argc != nr_paths * 2) {
282 ti->error = "Incorrect number of path arguments";
283 return -EINVAL;
284 }
285
286 sctx = alloc_switch_ctx(ti, nr_paths, region_size);
287 if (!sctx) {
288 ti->error = "Cannot allocate redirection context";
289 return -ENOMEM;
290 }
291
292 r = dm_set_target_max_io_len(ti, region_size);
293 if (r)
294 goto error;
295
296 while (as.argc) {
297 r = parse_path(&as, ti);
298 if (r)
299 goto error;
300 }
301
302 r = alloc_region_table(ti, nr_paths);
303 if (r)
304 goto error;
305
306 initialise_region_table(sctx);
307
308 /* For UNMAP, sending the request down any path is sufficient */
309 ti->num_discard_bios = 1;
310
311 return 0;
312
313error:
314 switch_dtr(ti);
315
316 return r;
317}
318
319static int switch_map(struct dm_target *ti, struct bio *bio)
320{
321 struct switch_ctx *sctx = ti->private;
322 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
323 unsigned path_nr = switch_get_path_nr(sctx, offset);
324
325 bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
326 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
327
328 return DM_MAPIO_REMAPPED;
329}
330
/*
 * Hex numbers in messages have to be parsed as quickly as possible.
 *
 * This lookup table maps each byte value to its hexadecimal digit value
 * (0-15, upper and lower case letters accepted) or to 0xff if the byte is
 * not a hex digit.  Time to load 1000000 entries, compared with a
 * condition-based parser:
 *
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
	/* 0x00 - 0x0f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x10 - 0x1f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x20 - 0x2f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x30 - 0x3f: '0' .. '9' */
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x40 - 0x4f: 'A' .. 'F' */
	0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x50 - 0x5f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x60 - 0x6f: 'a' .. 'f' */
	0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x70 - 0x7f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x80 - 0x8f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0x90 - 0x9f */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xa0 - 0xaf */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xb0 - 0xbf */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xc0 - 0xcf */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xd0 - 0xdf */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xe0 - 0xef */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	/* 0xf0 - 0xff */
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
359
360static __always_inline unsigned long parse_hex(const char **string)
361{
362 unsigned char d;
363 unsigned long r = 0;
364
365 while ((d = hex_table[(unsigned char)**string]) < 16) {
366 r = (r << 4) | d;
367 (*string)++;
368 }
369
370 return r;
371}
372
373static int process_set_region_mappings(struct switch_ctx *sctx,
374 unsigned argc, char **argv)
375{
376 unsigned i;
377 unsigned long region_index = 0;
378
379 for (i = 1; i < argc; i++) {
380 unsigned long path_nr;
381 const char *string = argv[i];
382
383 if ((*string & 0xdf) == 'R') {
384 unsigned long cycle_length, num_write;
385
386 string++;
387 if (unlikely(*string == ',')) {
388 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
389 return -EINVAL;
390 }
391 cycle_length = parse_hex(&string);
392 if (unlikely(*string != ',')) {
393 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
394 return -EINVAL;
395 }
396 string++;
397 if (unlikely(!*string)) {
398 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
399 return -EINVAL;
400 }
401 num_write = parse_hex(&string);
402 if (unlikely(*string)) {
403 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
404 return -EINVAL;
405 }
406
407 if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
408 DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
409 cycle_length - 1, region_index);
410 return -EINVAL;
411 }
412 if (unlikely(region_index + num_write < region_index) ||
413 unlikely(region_index + num_write >= sctx->nr_regions)) {
414 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
415 region_index, num_write, sctx->nr_regions);
416 return -EINVAL;
417 }
418
419 while (num_write--) {
420 region_index++;
421 path_nr = switch_region_table_read(sctx, region_index - cycle_length);
422 switch_region_table_write(sctx, region_index, path_nr);
423 }
424
425 continue;
426 }
427
428 if (*string == ':')
429 region_index++;
430 else {
431 region_index = parse_hex(&string);
432 if (unlikely(*string != ':')) {
433 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
434 return -EINVAL;
435 }
436 }
437
438 string++;
439 if (unlikely(!*string)) {
440 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
441 return -EINVAL;
442 }
443
444 path_nr = parse_hex(&string);
445 if (unlikely(*string)) {
446 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
447 return -EINVAL;
448 }
449 if (unlikely(region_index >= sctx->nr_regions)) {
450 DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
451 return -EINVAL;
452 }
453 if (unlikely(path_nr >= sctx->nr_paths)) {
454 DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
455 return -EINVAL;
456 }
457
458 switch_region_table_write(sctx, region_index, path_nr);
459 }
460
461 return 0;
462}
463
464/*
465 * Messages are processed one-at-a-time.
466 *
467 * Only set_region_mappings is supported.
468 */
469static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
470 char *result, unsigned maxlen)
471{
472 static DEFINE_MUTEX(message_mutex);
473
474 struct switch_ctx *sctx = ti->private;
475 int r = -EINVAL;
476
477 mutex_lock(&message_mutex);
478
479 if (!strcasecmp(argv[0], "set_region_mappings"))
480 r = process_set_region_mappings(sctx, argc, argv);
481 else
482 DMWARN("Unrecognised message received.");
483
484 mutex_unlock(&message_mutex);
485
486 return r;
487}
488
489static void switch_status(struct dm_target *ti, status_type_t type,
490 unsigned status_flags, char *result, unsigned maxlen)
491{
492 struct switch_ctx *sctx = ti->private;
493 unsigned sz = 0;
494 int path_nr;
495
496 switch (type) {
497 case STATUSTYPE_INFO:
498 result[0] = '\0';
499 break;
500
501 case STATUSTYPE_TABLE:
502 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
503 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
504 DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
505 (unsigned long long)sctx->path_list[path_nr].start);
506 break;
507 }
508}
509
510/*
511 * Switch ioctl:
512 *
513 * Passthrough all ioctls to the path for sector 0
514 */
515static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
516{
517 struct switch_ctx *sctx = ti->private;
518 unsigned path_nr;
519
520 path_nr = switch_get_path_nr(sctx, 0);
521
522 *bdev = sctx->path_list[path_nr].dmdev->bdev;
523
524 /*
525 * Only pass ioctls through if the device sizes match exactly.
526 */
527 if (ti->len + sctx->path_list[path_nr].start !=
528 i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
529 return 1;
530 return 0;
531}
532
533static int switch_iterate_devices(struct dm_target *ti,
534 iterate_devices_callout_fn fn, void *data)
535{
536 struct switch_ctx *sctx = ti->private;
537 int path_nr;
538 int r;
539
540 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
541 r = fn(ti, sctx->path_list[path_nr].dmdev,
542 sctx->path_list[path_nr].start, ti->len, data);
543 if (r)
544 return r;
545 }
546
547 return 0;
548}
549
550static struct target_type switch_target = {
551 .name = "switch",
552 .version = {1, 1, 0},
553 .module = THIS_MODULE,
554 .ctr = switch_ctr,
555 .dtr = switch_dtr,
556 .map = switch_map,
557 .message = switch_message,
558 .status = switch_status,
559 .prepare_ioctl = switch_prepare_ioctl,
560 .iterate_devices = switch_iterate_devices,
561};
562
563static int __init dm_switch_init(void)
564{
565 int r;
566
567 r = dm_register_target(&switch_target);
568 if (r < 0)
569 DMERR("dm_register_target() failed %d", r);
570
571 return r;
572}
573
574static void __exit dm_switch_exit(void)
575{
576 dm_unregister_target(&switch_target);
577}
578
579module_init(dm_switch_init);
580module_exit(dm_switch_exit);
581
582MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
583MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
584MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
585MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
586MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
587MODULE_LICENSE("GPL");
588