// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */
#include "dm-block-manager.h"
#include "dm-persistent-data-internal.h"

#include <linux/dm-bufio.h>
#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rwsem.h>
#include <linux/device-mapper.h>
#include <linux/stacktrace.h>
#include <linux/sched/task.h>

#define DM_MSG_PREFIX "block manager"

/*----------------------------------------------------------------*/

#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING

/*
 * This is a read/write semaphore with a couple of differences.
 *
 * i) There is a restriction on the number of concurrent read locks that
 * may be held at once. This is just an implementation detail.
 *
 * ii) Recursive locking attempts are detected and return EINVAL. A stack
 * trace is also emitted for the previous lock acquisition.
 *
 * iii) Priority is given to write locks.
 */
#define MAX_HOLDERS 4
#define MAX_STACK 10

struct stack_store {
	unsigned int nr_entries;
	unsigned long entries[MAX_STACK];
};

struct block_lock {
	spinlock_t lock;
	__s32 count;
	struct list_head waiters;
	struct task_struct *holders[MAX_HOLDERS];

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	struct stack_store traces[MAX_HOLDERS];
#endif
};

struct waiter {
	struct list_head list;
	struct task_struct *task;
	int wants_write;
};

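/*
 * Finds the slot in lock->holders[] used by @task; pass task == NULL to
 * find the first free slot. BUGs if no matching slot exists.
 */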
static unsigned int __find_holder(struct block_lock *lock,
				  struct task_struct *task)
{
	unsigned int i;

	for (i = 0; i < MAX_HOLDERS; i++)
		if (lock->holders[i] == task)
			break;

	BUG_ON(i == MAX_HOLDERS);
	return i;
}

/* call this *after* you increment lock->count */
static void __add_holder(struct block_lock *lock, struct task_struct *task)
{
	unsigned int h = __find_holder(lock, NULL);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	struct stack_store *t;
#endif

	get_task_struct(task);
	lock->holders[h] = task;

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	t = lock->traces + h;
	t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2);
#endif
}

/* call this *before* you decrement lock->count */
static void __del_holder(struct block_lock *lock, struct task_struct *task)
{
	unsigned int h = __find_holder(lock, task);

	lock->holders[h] = NULL;
	put_task_struct(task);
}

static int __check_holder(struct block_lock *lock)
{
	unsigned int i;

	for (i = 0; i < MAX_HOLDERS; i++) {
		if (lock->holders[i] == current) {
			DMERR("recursive lock detected in metadata");
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
			DMERR("previously held here:");
			stack_trace_print(lock->traces[i].entries,
					  lock->traces[i].nr_entries, 4);

			DMERR("subsequent acquisition attempted here:");
			dump_stack();
#endif
			return -EINVAL;
		}
	}

	return 0;
}

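/*
 * Park the calling task until a waker clears w->task (see
 * __wake_waiter() below).
 */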
static void __wait(struct waiter *w)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);

		if (!w->task)
			break;

		schedule();
	}

	set_current_state(TASK_RUNNING);
}

static void __wake_waiter(struct waiter *w)
{
	struct task_struct *task;

	list_del(&w->list);
	task = w->task;
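	/*
	 * Make sure we're finished with the waiter entry before clearing
	 * w->task: once the waiter sees NULL it may return from __wait()
	 * and the stack frame holding 'w' disappears.
	 */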
	smp_mb();
	w->task = NULL;
	wake_up_process(task);
}

/*
 * We either wake a few readers or a single writer.
 */
static void __wake_many(struct block_lock *lock)
{
	struct waiter *w, *tmp;

	BUG_ON(lock->count < 0);
	list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
		if (lock->count >= MAX_HOLDERS)
			return;

		if (w->wants_write) {
			if (lock->count > 0)
				return; /* still read locked */

			lock->count = -1;
			__add_holder(lock, w->task);
			__wake_waiter(w);
			return;
		}

		lock->count++;
		__add_holder(lock, w->task);
		__wake_waiter(w);
	}
}

static void bl_init(struct block_lock *lock)
{
	int i;

	spin_lock_init(&lock->lock);
	lock->count = 0;
	INIT_LIST_HEAD(&lock->waiters);
	for (i = 0; i < MAX_HOLDERS; i++)
		lock->holders[i] = NULL;
}

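/*
 * A read lock may be taken only if the lock isn't write locked
 * (count >= 0), the holder limit hasn't been hit, and nobody is queued
 * waiting (which is what gives writers their priority).
 */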
static int __available_for_read(struct block_lock *lock)
{
	return lock->count >= 0 &&
	       lock->count < MAX_HOLDERS &&
	       list_empty(&lock->waiters);
}

static int bl_down_read(struct block_lock *lock)
{
	int r;
	struct waiter w;

	spin_lock(&lock->lock);
	r = __check_holder(lock);
	if (r) {
		spin_unlock(&lock->lock);
		return r;
	}

	if (__available_for_read(lock)) {
		lock->count++;
		__add_holder(lock, current);
		spin_unlock(&lock->lock);
		return 0;
	}

	get_task_struct(current);

	w.task = current;
	w.wants_write = 0;
	list_add_tail(&w.list, &lock->waiters);
	spin_unlock(&lock->lock);

	__wait(&w);
	put_task_struct(current);
	return 0;
}

static int bl_down_read_nonblock(struct block_lock *lock)
{
	int r;

	spin_lock(&lock->lock);
	r = __check_holder(lock);
	if (r)
		goto out;

	if (__available_for_read(lock)) {
		lock->count++;
		__add_holder(lock, current);
		r = 0;
	} else
		r = -EWOULDBLOCK;

out:
	spin_unlock(&lock->lock);
	return r;
}

static void bl_up_read(struct block_lock *lock)
{
	spin_lock(&lock->lock);
	BUG_ON(lock->count <= 0);
	__del_holder(lock, current);
	--lock->count;
	if (!list_empty(&lock->waiters))
		__wake_many(lock);
	spin_unlock(&lock->lock);
}

static int bl_down_write(struct block_lock *lock)
{
	int r;
	struct waiter w;

	spin_lock(&lock->lock);
	r = __check_holder(lock);
	if (r) {
		spin_unlock(&lock->lock);
		return r;
	}

	if (lock->count == 0 && list_empty(&lock->waiters)) {
		lock->count = -1;
		__add_holder(lock, current);
		spin_unlock(&lock->lock);
		return 0;
	}

	get_task_struct(current);
	w.task = current;
	w.wants_write = 1;

	/*
	 * Writers are given priority: add them to the head of the wait
	 * list. We know there's only one mutator in the system, so the
	 * FIFO ordering reversal is acceptable.
	 */
	list_add(&w.list, &lock->waiters);
	spin_unlock(&lock->lock);

	__wait(&w);
	put_task_struct(current);

	return 0;
}

static void bl_up_write(struct block_lock *lock)
{
	spin_lock(&lock->lock);
	__del_holder(lock, current);
	lock->count = 0;
	if (!list_empty(&lock->waiters))
		__wake_many(lock);
	spin_unlock(&lock->lock);
}

static void report_recursive_bug(dm_block_t b, int r)
{
	if (r == -EINVAL)
		DMERR("recursive acquisition of block %llu requested.",
		      (unsigned long long) b);
}

#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */

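/*
 * With the debug option off, lock validation compiles away entirely and
 * the helpers below become no-ops.
 */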
#define bl_init(x) do { } while (0)
#define bl_down_read(x) 0
#define bl_down_read_nonblock(x) 0
#define bl_up_read(x) do { } while (0)
#define bl_down_write(x) 0
#define bl_up_write(x) do { } while (0)
#define report_recursive_bug(x, y) do { } while (0)

#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */

/*----------------------------------------------------------------*/

/*
 * Block manager is currently implemented using dm-bufio. struct
 * dm_block_manager and struct dm_block map directly onto a couple of
 * structs in the bufio interface. I want to retain the freedom to move
 * away from bufio in the future. So these structs are just cast within
 * this .c file, rather than making it through to the public interface.
 */
static struct dm_buffer *to_buffer(struct dm_block *b)
{
	return (struct dm_buffer *) b;
}

dm_block_t dm_block_location(struct dm_block *b)
{
	return dm_bufio_get_block_number(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_block_location);

void *dm_block_data(struct dm_block *b)
{
	return dm_bufio_get_block_data(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_block_data);

struct buffer_aux {
	struct dm_block_validator *validator;
	int write_locked;

#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
	struct block_lock lock;
#endif
};

static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
{
	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);

	aux->validator = NULL;
	bl_init(&aux->lock);
}

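/*
 * Called by bufio just before a buffer is written back to disk. This is
 * the validator's chance to finalise the block contents, e.g. by filling
 * in a checksum (see dm_bm_checksum() below).
 */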
static void dm_block_manager_write_callback(struct dm_buffer *buf)
{
	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);

	if (aux->validator) {
		aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
						  dm_bufio_get_block_size(dm_bufio_get_client(buf)));
	}
}

/*
 * -------------------------------------------------------------
 * Public interface
 *--------------------------------------------------------------
 */
struct dm_block_manager {
	struct dm_bufio_client *bufio;
	bool read_only:1;
};

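/*
 * max_held_per_thread becomes bufio's reserved buffer count, so a thread
 * holding that many blocks at once cannot deadlock waiting for buffers.
 */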
struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
						 unsigned int block_size,
						 unsigned int max_held_per_thread)
{
	int r;
	struct dm_block_manager *bm;

	bm = kmalloc(sizeof(*bm), GFP_KERNEL);
	if (!bm) {
		r = -ENOMEM;
		goto bad;
	}

	bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread,
					   sizeof(struct buffer_aux),
					   dm_block_manager_alloc_callback,
					   dm_block_manager_write_callback,
					   0);
	if (IS_ERR(bm->bufio)) {
		r = PTR_ERR(bm->bufio);
		kfree(bm);
		goto bad;
	}

	bm->read_only = false;

	return bm;

bad:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_block_manager_create);

void dm_block_manager_destroy(struct dm_block_manager *bm)
{
	dm_bufio_client_destroy(bm->bufio);
	kfree(bm);
}
EXPORT_SYMBOL_GPL(dm_block_manager_destroy);

void dm_block_manager_reset(struct dm_block_manager *bm)
{
	dm_bufio_client_reset(bm->bufio);
}
EXPORT_SYMBOL_GPL(dm_block_manager_reset);

unsigned int dm_bm_block_size(struct dm_block_manager *bm)
{
	return dm_bufio_get_block_size(bm->bufio);
}
EXPORT_SYMBOL_GPL(dm_bm_block_size);

dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
{
	return dm_bufio_get_device_size(bm->bufio);
}

static int dm_bm_validate_buffer(struct dm_block_manager *bm,
				 struct dm_buffer *buf,
				 struct buffer_aux *aux,
				 struct dm_block_validator *v)
{
	if (unlikely(!aux->validator)) {
		int r;

		if (!v)
			return 0;
		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio));
		if (unlikely(r)) {
			DMERR_LIMIT("%s validator check failed for block %llu", v->name,
				    (unsigned long long) dm_bufio_get_block_number(buf));
			return r;
		}
		aux->validator = v;
	} else {
		if (unlikely(aux->validator != v)) {
			DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu",
				    aux->validator->name, v ? v->name : "NULL",
				    (unsigned long long) dm_bufio_get_block_number(buf));
			return -EINVAL;
		}
	}

	return 0;
}
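
/*
 * A sketch of typical caller usage (the names 'loc', 'validator' and
 * 'examine' here are illustrative, not part of this interface):
 *
 *	struct dm_block *blk;
 *	int r = dm_bm_read_lock(bm, loc, validator, &blk);
 *
 *	if (!r) {
 *		examine(dm_block_data(blk));
 *		dm_bm_unlock(blk);
 *	}
 */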
int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
		    struct dm_block_validator *v,
		    struct dm_block **result)
{
	struct buffer_aux *aux;
	void *p;
	int r;

	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
	if (IS_ERR(p))
		return PTR_ERR(p);

	aux = dm_bufio_get_aux_data(to_buffer(*result));
	r = bl_down_read(&aux->lock);
	if (unlikely(r)) {
		dm_bufio_release(to_buffer(*result));
		report_recursive_bug(b, r);
		return r;
	}

	aux->write_locked = 0;

	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
	if (unlikely(r)) {
		bl_up_read(&aux->lock);
		dm_bufio_release(to_buffer(*result));
		return r;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_read_lock);

int dm_bm_write_lock(struct dm_block_manager *bm,
		     dm_block_t b, struct dm_block_validator *v,
		     struct dm_block **result)
{
	struct buffer_aux *aux;
	void *p;
	int r;

	if (dm_bm_is_read_only(bm))
		return -EPERM;

	p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
	if (IS_ERR(p))
		return PTR_ERR(p);

	aux = dm_bufio_get_aux_data(to_buffer(*result));
	r = bl_down_write(&aux->lock);
	if (r) {
		dm_bufio_release(to_buffer(*result));
		report_recursive_bug(b, r);
		return r;
	}

	aux->write_locked = 1;

	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
	if (unlikely(r)) {
		bl_up_write(&aux->lock);
		dm_bufio_release(to_buffer(*result));
		return r;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock);

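/*
 * Non-blocking variant: dm_bufio_get() only returns buffers that are
 * already resident in the cache, so this fails with -EWOULDBLOCK rather
 * than issuing I/O or sleeping on the block lock.
 */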
int dm_bm_read_try_lock(struct dm_block_manager *bm,
			dm_block_t b, struct dm_block_validator *v,
			struct dm_block **result)
{
	struct buffer_aux *aux;
	void *p;
	int r;

	p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
	if (IS_ERR(p))
		return PTR_ERR(p);
	if (unlikely(!p))
		return -EWOULDBLOCK;

	aux = dm_bufio_get_aux_data(to_buffer(*result));
	r = bl_down_read_nonblock(&aux->lock);
	if (r < 0) {
		dm_bufio_release(to_buffer(*result));
		report_recursive_bug(b, r);
		return r;
	}
	aux->write_locked = 0;

	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
	if (unlikely(r)) {
		bl_up_read(&aux->lock);
		dm_bufio_release(to_buffer(*result));
		return r;
	}

	return 0;
}

int dm_bm_write_lock_zero(struct dm_block_manager *bm,
			  dm_block_t b, struct dm_block_validator *v,
			  struct dm_block **result)
{
	int r;
	struct buffer_aux *aux;
	void *p;

	if (dm_bm_is_read_only(bm))
		return -EPERM;

	p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
	if (IS_ERR(p))
		return PTR_ERR(p);

	memset(p, 0, dm_bm_block_size(bm));

	aux = dm_bufio_get_aux_data(to_buffer(*result));
	r = bl_down_write(&aux->lock);
	if (r) {
		dm_bufio_release(to_buffer(*result));
		return r;
	}

	aux->write_locked = 1;
	aux->validator = v;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero);

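/*
 * Dropping a write lock marks the buffer dirty, which guarantees the
 * validator's prepare_for_write() runs before the block reaches disk.
 */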
void dm_bm_unlock(struct dm_block *b)
{
	struct buffer_aux *aux = dm_bufio_get_aux_data(to_buffer(b));

	if (aux->write_locked) {
		dm_bufio_mark_buffer_dirty(to_buffer(b));
		bl_up_write(&aux->lock);
	} else
		bl_up_read(&aux->lock);

	dm_bufio_release(to_buffer(b));
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);

int dm_bm_flush(struct dm_block_manager *bm)
{
	if (dm_bm_is_read_only(bm))
		return -EPERM;

	return dm_bufio_write_dirty_buffers(bm->bufio);
}
EXPORT_SYMBOL_GPL(dm_bm_flush);

void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
{
	dm_bufio_prefetch(bm->bufio, b, 1);
}

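/* A NULL block manager is treated as read-only. */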
bool dm_bm_is_read_only(struct dm_block_manager *bm)
{
	return bm ? bm->read_only : true;
}
EXPORT_SYMBOL_GPL(dm_bm_is_read_only);

void dm_bm_set_read_only(struct dm_block_manager *bm)
{
	if (bm)
		bm->read_only = true;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_only);

void dm_bm_set_read_write(struct dm_block_manager *bm)
{
	if (bm)
		bm->read_only = false;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_write);

u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
{
	return crc32c(~(u32) 0, data, len) ^ init_xor;
}
EXPORT_SYMBOL_GPL(dm_bm_checksum);
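
/*
 * Example validator usage (MY_CSUM_XOR and the 'disk' layout are
 * hypothetical, not part of this file): checksum everything after the
 * csum field itself, seeding with a per-structure XOR value:
 *
 *	disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
 *						block_size - sizeof(__le32),
 *						MY_CSUM_XOR));
 */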

/*----------------------------------------------------------------*/

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_DESCRIPTION("Immutable metadata library for dm");

/*----------------------------------------------------------------*/