// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
 * Copyright (C) 2011-2013 Red Hat, Inc.
 *
 * This file is released under the GPL.
 *
 * dm-switch is a device-mapper target that maps IO to underlying block
 * devices efficiently when there are a large number of fixed-sized
 * address regions but there is no simple pattern to allow for a compact
 * mapping representation such as dm-stripe.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "switch"

/*
 * One region_table_slot_t holds <region_entries_per_slot> region table
 * entries each of which is <region_table_entry_bits> in size.
 */
typedef unsigned long region_table_slot_t;

/*
 * A device with the offset to its start sector.
 */
struct switch_path {
	struct dm_dev *dmdev;
	sector_t start;
};

/*
 * Context block for a dm switch device.
 */
struct switch_ctx {
	struct dm_target *ti;

	unsigned int nr_paths;		/* Number of paths in path_list. */

	unsigned int region_size;	/* Region size in 512-byte sectors */
	unsigned long nr_regions;	/* Number of regions making up the device */
	signed char region_size_bits;	/* log2 of region_size or -1 */

	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */

	region_table_slot_t *region_table;	/* Region table */

	/*
	 * Array of dm devices to switch between.
	 */
	struct switch_path path_list[];
};
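
/*
 * Illustrative layout (example numbers, not from the original source):
 * with 4 paths, region_table_entry_bits is 2, so a 64-bit
 * region_table_slot_t packs region_entries_per_slot = 32 entries;
 * region N is stored in slot N / 32 at bit offset (N % 32) * 2.
 */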

static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned int nr_paths,
					   unsigned int region_size)
{
	struct switch_ctx *sctx;

	sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
	if (!sctx)
		return NULL;

	sctx->ti = ti;
	sctx->region_size = region_size;

	ti->private = sctx;

	return sctx;
}

static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths)
{
	struct switch_ctx *sctx = ti->private;
	sector_t nr_regions = ti->len;
	sector_t nr_slots;

	if (!(sctx->region_size & (sctx->region_size - 1)))
		sctx->region_size_bits = __ffs(sctx->region_size);
	else
		sctx->region_size_bits = -1;

	sctx->region_table_entry_bits = 1;
	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
		sctx->region_table_entry_bits++;

	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
	else
		sctx->region_entries_per_slot_bits = -1;

	if (sector_div(nr_regions, sctx->region_size))
		nr_regions++;

	if (nr_regions >= ULONG_MAX) {
		ti->error = "Region table too large";
		return -EINVAL;
	}
	sctx->nr_regions = nr_regions;

	nr_slots = nr_regions;
	if (sector_div(nr_slots, sctx->region_entries_per_slot))
		nr_slots++;

	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
		ti->error = "Region table too large";
		return -EINVAL;
	}

	sctx->region_table = vmalloc(array_size(nr_slots,
						sizeof(region_table_slot_t)));
	if (!sctx->region_table) {
		ti->error = "Cannot allocate region table";
		return -ENOMEM;
	}

	return 0;
}
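
/*
 * Sizing sketch (illustrative numbers): a 1 TiB target (2^31 sectors)
 * with region_size 2048 (2^11) has 2^20 regions; with 2-bit entries
 * packed 32 per 64-bit slot, the table needs 2^15 slots, i.e. 256 KiB
 * of vmalloc'ed memory.
 */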

static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
				unsigned long *region_index, unsigned int *bit)
{
	if (sctx->region_entries_per_slot_bits >= 0) {
		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
		*bit = region_nr & (sctx->region_entries_per_slot - 1);
	} else {
		*region_index = region_nr / sctx->region_entries_per_slot;
		*bit = region_nr % sctx->region_entries_per_slot;
	}

	*bit *= sctx->region_table_entry_bits;
}
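
/*
 * Worked example: with 32 entries per slot and 2-bit entries,
 * region_nr 100 yields *region_index = 100 >> 5 = 3 and
 * *bit = (100 & 31) * 2 = 8.
 */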

static unsigned int switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
	unsigned long region_index;
	unsigned int bit;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
		((1 << sctx->region_table_entry_bits) - 1);
}

/*
 * Find which path to use at given offset.
 */
static unsigned int switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
	unsigned int path_nr;
	sector_t p;

	p = offset;
	if (sctx->region_size_bits >= 0)
		p >>= sctx->region_size_bits;
	else
		sector_div(p, sctx->region_size);

	path_nr = switch_region_table_read(sctx, p);

	/* This can only happen if the processor uses non-atomic stores. */
	if (unlikely(path_nr >= sctx->nr_paths))
		path_nr = 0;

	return path_nr;
}
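
/*
 * Example (illustrative): with region_size 2048 (region_size_bits 11),
 * sector offset 1000000 falls in region 1000000 >> 11 = 488, and the
 * table entry for region 488 selects the path.
 */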

static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
				      unsigned int value)
{
	unsigned long region_index;
	unsigned int bit;
	region_table_slot_t pte;

	switch_get_position(sctx, region_nr, &region_index, &bit);

	pte = sctx->region_table[region_index];
	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
	pte |= (region_table_slot_t)value << bit;
	sctx->region_table[region_index] = pte;
}

/*
 * Fill the region table with an initial round robin pattern.
 */
static void initialise_region_table(struct switch_ctx *sctx)
{
	unsigned int path_nr = 0;
	unsigned long region_nr;

	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
		switch_region_table_write(sctx, region_nr, path_nr);
		if (++path_nr >= sctx->nr_paths)
			path_nr = 0;
	}
}
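
/*
 * E.g. with 3 paths the initial table maps regions 0, 1, 2, 3, 4, ...
 * to paths 0, 1, 2, 0, 1, ...
 */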

static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;
	unsigned long long start;
	int r;

	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
			  &sctx->path_list[sctx->nr_paths].dmdev);
	if (r) {
		ti->error = "Device lookup failed";
		return r;
	}

	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
		ti->error = "Invalid device starting offset";
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
		return -EINVAL;
	}

	sctx->path_list[sctx->nr_paths].start = start;

	sctx->nr_paths++;

	return 0;
}

/*
 * Destructor: Don't free the dm_target, just the ti->private data (if any).
 */
static void switch_dtr(struct dm_target *ti)
{
	struct switch_ctx *sctx = ti->private;

	while (sctx->nr_paths--)
		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);

	vfree(sctx->region_table);
	kfree(sctx);
}

/*
 * Constructor arguments:
 *    <num_paths> <region_size> <num_optional_args> [<optional_args>...]
 *    [<dev_path> <offset>]+
 *
 * Optional args are to allow for future extension: currently this
 * parameter must be 0.
 */
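
/*
 * Example (hypothetical device paths): a 2-path switch over 4194304
 * sectors with 2048-sector regions and no optional args could be
 * loaded with:
 *
 *   dmsetup create myswitch --table \
 *     "0 4194304 switch 2 2048 0 /dev/mapper/pathA 0 /dev/mapper/pathB 0"
 */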
static int switch_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	static const struct dm_arg _args[] = {
		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
		{1, UINT_MAX, "Invalid region size"},
		{0, 0, "Invalid number of optional args"},
	};

	struct switch_ctx *sctx;
	struct dm_arg_set as;
	unsigned int nr_paths, region_size, nr_optional_args;
	int r;

	as.argc = argc;
	as.argv = argv;

	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
	if (r)
		return -EINVAL;

	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
	if (r)
		return r;

	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
	if (r)
		return r;
	/* parse optional arguments here, if we add any */

	if (as.argc != nr_paths * 2) {
		ti->error = "Incorrect number of path arguments";
		return -EINVAL;
	}

	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
	if (!sctx) {
		ti->error = "Cannot allocate redirection context";
		return -ENOMEM;
	}

	r = dm_set_target_max_io_len(ti, region_size);
	if (r)
		goto error;

	while (as.argc) {
		r = parse_path(&as, ti);
		if (r)
			goto error;
	}

	r = alloc_region_table(ti, nr_paths);
	if (r)
		goto error;

	initialise_region_table(sctx);

	/* For UNMAP, sending the request down any path is sufficient */
	ti->num_discard_bios = 1;

	return 0;

error:
	switch_dtr(ti);

	return r;
}

static int switch_map(struct dm_target *ti, struct bio *bio)
{
	struct switch_ctx *sctx = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned int path_nr = switch_get_path_nr(sctx, offset);

	bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;

	return DM_MAPIO_REMAPPED;
}
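
/*
 * E.g. (illustrative): an IO at target offset 5000 with region_size
 * 2048 falls in region 2; if the table maps region 2 to path 1, the
 * bio is sent to path 1's device at sector path_list[1].start + 5000.
 */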

/*
 * We need to parse hex numbers in the message as quickly as possible.
 *
 * This table-based hex parser improves performance.
 * It reduces the time to load 1000000 entries compared to the
 * condition-based parser:
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};

static __always_inline unsigned long parse_hex(const char **string)
{
	unsigned char d;
	unsigned long r = 0;

	while ((d = hex_table[(unsigned char)**string]) < 16) {
		r = (r << 4) | d;
		(*string)++;
	}

	return r;
}
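
/*
 * E.g. parse_hex() on "1a:" returns 0x1a (26) and leaves *string
 * pointing at the ':' terminator.
 */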

static int process_set_region_mappings(struct switch_ctx *sctx,
				       unsigned int argc, char **argv)
{
	unsigned int i;
	unsigned long region_index = 0;

	for (i = 1; i < argc; i++) {
		unsigned long path_nr;
		const char *string = argv[i];

		if ((*string & 0xdf) == 'R') {
			unsigned long cycle_length, num_write;

			string++;
			if (unlikely(*string == ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			cycle_length = parse_hex(&string);
			if (unlikely(*string != ',')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			string++;
			if (unlikely(!*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
			num_write = parse_hex(&string);
			if (unlikely(*string)) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}

			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
				       cycle_length - 1, region_index);
				return -EINVAL;
			}
			if (unlikely(region_index + num_write < region_index) ||
			    unlikely(region_index + num_write >= sctx->nr_regions)) {
				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
				       region_index, num_write, sctx->nr_regions);
				return -EINVAL;
			}

			while (num_write--) {
				region_index++;
				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
				switch_region_table_write(sctx, region_index, path_nr);
			}

			continue;
		}

		if (*string == ':')
			region_index++;
		else {
			region_index = parse_hex(&string);
			if (unlikely(*string != ':')) {
				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
				return -EINVAL;
			}
		}

		string++;
		if (unlikely(!*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}

		path_nr = parse_hex(&string);
		if (unlikely(*string)) {
			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
			return -EINVAL;
		}
		if (unlikely(region_index >= sctx->nr_regions)) {
			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
			return -EINVAL;
		}
		if (unlikely(path_nr >= sctx->nr_paths)) {
			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
			return -EINVAL;
		}

		switch_region_table_write(sctx, region_index, path_nr);
	}

	return 0;
}
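
/*
 * Example messages (hypothetical device name; numbers are hexadecimal,
 * matching parse_hex() above):
 *
 *   dmsetup message myswitch 0 set_region_mappings 100:1 :2
 *	sets region 0x100 to path 1 and region 0x101 to path 2
 *
 *   dmsetup message myswitch 0 set_region_mappings R2,10
 *	repeats the last 2 mappings over the next 0x10 regions
 */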

/*
 * Messages are processed one-at-a-time.
 *
 * Only set_region_mappings is supported.
 */
static int switch_message(struct dm_target *ti, unsigned int argc, char **argv,
			  char *result, unsigned int maxlen)
{
	static DEFINE_MUTEX(message_mutex);

	struct switch_ctx *sctx = ti->private;
	int r = -EINVAL;

	mutex_lock(&message_mutex);

	if (!strcasecmp(argv[0], "set_region_mappings"))
		r = process_set_region_mappings(sctx, argc, argv);
	else
		DMWARN("Unrecognised message received.");

	mutex_unlock(&message_mutex);

	return r;
}

static void switch_status(struct dm_target *ti, status_type_t type,
			  unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct switch_ctx *sctx = ti->private;
	unsigned int sz = 0;
	int path_nr;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
			       (unsigned long long)sctx->path_list[path_nr].start);
		break;

	case STATUSTYPE_IMA:
		result[0] = '\0';
		break;
	}
}
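
/*
 * E.g. (illustrative; underlying devices are typically reported as
 * major:minor): a 2-path table might emit "2 2048 0 8:16 0 8:32 0".
 */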

/*
 * Switch ioctl:
 *
 * Passthrough all ioctls to the path for sector 0
 */
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct switch_ctx *sctx = ti->private;
	unsigned int path_nr;

	path_nr = switch_get_path_nr(sctx, 0);

	*bdev = sctx->path_list[path_nr].dmdev->bdev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len + sctx->path_list[path_nr].start !=
	    bdev_nr_sectors(*bdev))
		return 1;
	return 0;
}

static int switch_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct switch_ctx *sctx = ti->private;
	int path_nr;
	int r;

	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
		r = fn(ti, sctx->path_list[path_nr].dmdev,
		       sctx->path_list[path_nr].start, ti->len, data);
		if (r)
			return r;
	}

	return 0;
}

static struct target_type switch_target = {
	.name = "switch",
	.version = {1, 1, 0},
	.features = DM_TARGET_NOWAIT,
	.module = THIS_MODULE,
	.ctr = switch_ctr,
	.dtr = switch_dtr,
	.map = switch_map,
	.message = switch_message,
	.status = switch_status,
	.prepare_ioctl = switch_prepare_ioctl,
	.iterate_devices = switch_iterate_devices,
};
module_dm(switch);

MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");