dm-bufio.c source code [linux/drivers/md/dm-bufio.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2009-2011 Red Hat, Inc.
4	*
5	* Author: Mikulas Patocka <mpatocka@redhat.com>
6	*
7	* This file is released under the GPL.
8	*/
9
10	#include <linux/dm-bufio.h>
11
12	#include <linux/device-mapper.h>
13	#include <linux/dm-io.h>
14	#include <linux/slab.h>
15	#include <linux/sched/mm.h>
16	#include <linux/jiffies.h>
17	#include <linux/vmalloc.h>
18	#include <linux/shrinker.h>
19	#include <linux/module.h>
20	#include <linux/rbtree.h>
21	#include <linux/stacktrace.h>
22	#include <linux/jump_label.h>
23
24	#include "dm.h"
25
26	#define DM_MSG_PREFIX "bufio"
27
/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Background writeback of dirty buffers is tuned by
 *	DM_BUFIO_WRITEBACK_RATIO; DM_BUFIO_LOW_WATERMARK_RATIO is presumably
 *	the matching low-watermark tunable (both are consumed outside this
 *	part of the file -- confirm against their users).
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_RATIO	3
#define DM_BUFIO_LOW_WATERMARK_RATIO	16

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The number of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
#define DM_BUFIO_WRITE_ALIGN		4096

/*
 * dm_buffer->list_mode: index of the lru a buffer lives on.
 * LIST_SIZE is the number of lists, not a valid mode.
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2
70
/*--------------------------------------------------------------*/
72
73	/*
74	* Rather than use an LRU list, we use a clock algorithm where entries
75	* are held in a circular list. When an entry is 'hit' a reference bit
76	* is set. The least recently used entry is approximated by running a
77	* cursor around the list selecting unreferenced entries. Referenced
78	* entries have their reference bit cleared as the cursor passes them.
79	*/
80	struct lru_entry {
81	struct list_head list;
82	atomic_t referenced;
83	};
84
85	struct lru_iter {
86	struct lru *lru;
87	struct list_head list;
88	struct lru_entry *stop;
89	struct lru_entry *e;
90	};
91
92	struct lru {
93	struct list_head *cursor;
94	unsigned long count;
95
96	struct list_head iterators;
97	};
98
/*--------------*/
100
101	static void lru_init(struct lru *lru)
102	{
103	lru->cursor = NULL;
104	lru->count = `0`;
105	INIT_LIST_HEAD(list: &lru->iterators);
106	}
107
108	static void lru_destroy(struct lru *lru)
109	{
110	WARN_ON_ONCE(lru->cursor);
111	WARN_ON_ONCE(!list_empty(&lru->iterators));
112	}
113
114	/*
115	* Insert a new entry into the lru.
116	*/
117	static void lru_insert(struct lru lru, struct* lru_entry *le)
118	{
119	/*
120	* Don't be tempted to set to 1, makes the lru aspect
121	* perform poorly.
122	*/
123	atomic_set(v: &le->referenced, i: `0`);
124
125	if (lru->cursor) {
126	list_add_tail(new: &le->list, head: lru->cursor);
127	} else {
128	INIT_LIST_HEAD(list: &le->list);
129	lru->cursor = &le->list;
130	}
131	lru->count++;
132	}
133
/*--------------*/
135
136	/*
137	* Convert a list_head pointer to an lru_entry pointer.
138	*/
139	static inline struct lru_entry to_le(struct* list_head *l)
140	{
141	return container_of(l, struct lru_entry, list);
142	}
143
144	/*
145	* Initialize an lru_iter and add it to the list of cursors in the lru.
146	*/
147	static void lru_iter_begin(struct lru lru, struct* lru_iter *it)
148	{
149	it->lru = lru;
150	it->stop = lru->cursor ? to_le(l: lru->cursor->prev) : NULL;
151	it->e = lru->cursor ? to_le(l: lru->cursor) : NULL;
152	list_add(new: &it->list, head: &lru->iterators);
153	}
154
155	/*
156	* Remove an lru_iter from the list of cursors in the lru.
157	*/
158	static inline void lru_iter_end(struct lru_iter *it)
159	{
160	list_del(entry: &it->list);
161	}
162
/* Predicate function type to be used with lru_iter_next */
typedef bool (*iter_predicate)(struct lru_entry *le, void *context);
165
166	/*
167	* Advance the cursor to the next entry that passes the
168	* predicate, and return that entry. Returns NULL if the
169	* iteration is complete.
170	*/
171	static struct lru_entry lru_iter_next(struct* lru_iter *it,
172	iter_predicate pred, void *context)
173	{
174	struct lru_entry *e;
175
176	while (it->e) {
177	e = it->e;
178
179	/ advance the cursor /
180	if (it->e == it->stop)
181	it->e = NULL;
182	else
183	it->e = to_le(l: it->e->list.next);
184
185	if (pred(e, context))
186	return e;
187	}
188
189	return NULL;
190	}
191
192	/*
193	* Invalidate a specific lru_entry and update all cursors in
194	* the lru accordingly.
195	*/
196	static void lru_iter_invalidate(struct lru lru, struct* lru_entry *e)
197	{
198	struct lru_iter *it;
199
200	list_for_each_entry(it, &lru->iterators, list) {
201	/ Move c->e forwards if necc. /
202	if (it->e == e) {
203	it->e = to_le(l: it->e->list.next);
204	if (it->e == e)
205	it->e = NULL;
206	}
207
208	/ Move it->stop backwards if necc. /
209	if (it->stop == e) {
210	it->stop = to_le(l: it->stop->list.prev);
211	if (it->stop == e)
212	it->stop = NULL;
213	}
214	}
215	}
216
/*--------------*/
218
219	/*
220	* Remove a specific entry from the lru.
221	*/
222	static void lru_remove(struct lru lru, struct* lru_entry *le)
223	{
224	lru_iter_invalidate(lru, e: le);
225	if (lru->count == `1`) {
226	lru->cursor = NULL;
227	} else {
228	if (lru->cursor == &le->list)
229	lru->cursor = lru->cursor->next;
230	list_del(entry: &le->list);
231	}
232	lru->count--;
233	}
234
235	/*
236	* Mark as referenced.
237	*/
238	static inline void lru_reference(struct lru_entry *le)
239	{
240	atomic_set(v: &le->referenced, i: `1`);
241	}
242
/*--------------*/
244
/*
 * Verdict returned by an eviction predicate for one candidate entry.
 */
enum evict_result {
	ER_EVICT,	/* remove this entry and return it */
	ER_DONT_EVICT,	/* skip this entry, keep looking */
	ER_STOP,	/* stop looking for something to evict */
};

/* Predicate function type to be used with lru_evict */
typedef enum evict_result (*le_predicate)(struct lru_entry *le, void *context);
256
257	static struct lru_entry lru_evict(struct* lru lru, le_predicate pred, void* *context)
258	{
259	unsigned long tested = `0`;
260	struct list_head *h = lru->cursor;
261	struct lru_entry *le;
262
263	if (!h)
264	return NULL;
265	/*
266	* In the worst case we have to loop around twice. Once to clear
267	* the reference flags, and then again to discover the predicate
268	* fails for all entries.
269	*/
270	while (tested < lru->count) {
271	le = container_of(h, struct lru_entry, list);
272
273	if (atomic_read(v: &le->referenced)) {
274	atomic_set(v: &le->referenced, i: `0`);
275	} else {
276	tested++;
277	switch (pred(le, context)) {
278	case ER_EVICT:
279	/*
280	* Adjust the cursor, so we start the next
281	* search from here.
282	*/
283	lru->cursor = le->list.next;
284	lru_remove(lru, le);
285	return le;
286
287	case ER_DONT_EVICT:
288	break;
289
290	case ER_STOP:
291	lru->cursor = le->list.next;
292	return NULL;
293	}
294	}
295
296	h = h->next;
297
298	cond_resched();
299	}
300
301	return NULL;
302	}
303
/*--------------------------------------------------------------*/
305
/*
 * Buffer state bits (bit numbers within dm_buffer->state).
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2
312
/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 *
 * DATA_MODE_LIMIT is the number of modes, not a valid mode.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};
324
325	struct dm_buffer {
326	/ protected by the locks in dm_buffer_cache /
327	struct rb_node node;
328
329	/ immutable, so don't need protecting /
330	sector_t block;
331	void *data;
332	unsigned char data_mode; / DATA_MODE_* /
333
334	/*
335	* These two fields are used in isolation, so do not need
336	* a surrounding lock.
337	*/
338	atomic_t hold_count;
339	unsigned long last_accessed;
340
341	/*
342	* Everything else is protected by the mutex in
343	* dm_bufio_client
344	*/
345	unsigned long state;
346	struct lru_entry lru;
347	unsigned char list_mode; / LIST_* /
348	blk_status_t read_error;
349	blk_status_t write_error;
350	unsigned int dirty_start;
351	unsigned int dirty_end;
352	unsigned int write_start;
353	unsigned int write_end;
354	struct list_head write_list;
355	struct dm_bufio_client *c;
356	void (end_io)(struct* dm_buffer *b, blk_status_t bs);
357	#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
358	#define MAX_STACK 10
359	unsigned int stack_len;
360	unsigned long stack_entries[MAX_STACK];
361	#endif
362	};
363
/*--------------------------------------------------------------*/
365
366	/*
367	* The buffer cache manages buffers, particularly:
368	* - inc/dec of holder count
369	* - setting the last_accessed field
370	* - maintains clean/dirty state along with lru
371	* - selecting buffers that match predicates
372	*
373	* It does not handle:
374	* - allocation/freeing of buffers.
375	* - IO
376	* - Eviction or cache sizing.
377	*
378	* cache_get() and cache_put() are threadsafe, you do not need to
379	* protect these calls with a surrounding mutex. All the other
380	* methods are not threadsafe; they do use locking primitives, but
381	* only enough to ensure get/put are threadsafe.
382	*/
383
384	struct buffer_tree {
385	struct rw_semaphore lock;
386	struct rb_root root;
387	} ____cacheline_aligned_in_smp;
388
389	struct dm_buffer_cache {
390	struct lru lru[LIST_SIZE];
391	/*
392	* We spread entries across multiple trees to reduce contention
393	* on the locks.
394	*/
395	unsigned int num_locks;
396	struct buffer_tree trees[];
397	};
398
399	static inline unsigned int cache_index(sector_t block, unsigned int num_locks)
400	{
401	return dm_hash_locks_index(block, num_locks);
402	}
403
404	static inline void cache_read_lock(struct dm_buffer_cache *bc, sector_t block)
405	{
406	down_read(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock);
407	}
408
409	static inline void cache_read_unlock(struct dm_buffer_cache *bc, sector_t block)
410	{
411	up_read(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock);
412	}
413
414	static inline void cache_write_lock(struct dm_buffer_cache *bc, sector_t block)
415	{
416	down_write(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock);
417	}
418
419	static inline void cache_write_unlock(struct dm_buffer_cache *bc, sector_t block)
420	{
421	up_write(sem: &bc->trees[cache_index(block, num_locks: bc->num_locks)].lock);
422	}
423
424	/*
425	* Sometimes we want to repeatedly get and drop locks as part of an iteration.
426	* This struct helps avoid redundant drop and gets of the same lock.
427	*/
428	struct lock_history {
429	struct dm_buffer_cache *cache;
430	bool write;
431	unsigned int previous;
432	unsigned int no_previous;
433	};
434
435	static void lh_init(struct lock_history lh, struct* dm_buffer_cache *cache, bool write)
436	{
437	lh->cache = cache;
438	lh->write = write;
439	lh->no_previous = cache->num_locks;
440	lh->previous = lh->no_previous;
441	}
442
443	static void __lh_lock(struct lock_history lh, unsigned* int index)
444	{
445	if (lh->write)
446	down_write(sem: &lh->cache->trees[index].lock);
447	else
448	down_read(sem: &lh->cache->trees[index].lock);
449	}
450
451	static void __lh_unlock(struct lock_history lh, unsigned* int index)
452	{
453	if (lh->write)
454	up_write(sem: &lh->cache->trees[index].lock);
455	else
456	up_read(sem: &lh->cache->trees[index].lock);
457	}
458
459	/*
460	* Make sure you call this since it will unlock the final lock.
461	*/
462	static void lh_exit(struct lock_history *lh)
463	{
464	if (lh->previous != lh->no_previous) {
465	__lh_unlock(lh, index: lh->previous);
466	lh->previous = lh->no_previous;
467	}
468	}
469
470	/*
471	* Named 'next' because there is no corresponding
472	* 'up/unlock' call since it's done automatically.
473	*/
474	static void lh_next(struct lock_history *lh, sector_t b)
475	{
476	unsigned int index = cache_index(block: b, num_locks: lh->no_previous); / no_previous is num_locks /
477
478	if (lh->previous != lh->no_previous) {
479	if (lh->previous != index) {
480	__lh_unlock(lh, index: lh->previous);
481	__lh_lock(lh, index);
482	lh->previous = index;
483	}
484	} else {
485	__lh_lock(lh, index);
486	lh->previous = index;
487	}
488	}
489
490	static inline struct dm_buffer le_to_buffer(struct* lru_entry *le)
491	{
492	return container_of(le, struct dm_buffer, lru);
493	}
494
495	static struct dm_buffer list_to_buffer(struct* list_head *l)
496	{
497	struct lru_entry le = list_entry(l, struct* lru_entry, list);
498
499	if (!le)
500	return NULL;
501
502	return le_to_buffer(le);
503	}
504
505	static void cache_init(struct dm_buffer_cache bc, unsigned* int num_locks)
506	{
507	unsigned int i;
508
509	bc->num_locks = num_locks;
510
511	for (i = `0`; i < bc->num_locks; i++) {
512	init_rwsem(&bc->trees[i].lock);
513	bc->trees[i].root = RB_ROOT;
514	}
515
516	lru_init(lru: &bc->lru[LIST_CLEAN]);
517	lru_init(lru: &bc->lru[LIST_DIRTY]);
518	}
519
520	static void cache_destroy(struct dm_buffer_cache *bc)
521	{
522	unsigned int i;
523
524	for (i = `0`; i < bc->num_locks; i++)
525	WARN_ON_ONCE(!RB_EMPTY_ROOT(&bc->trees[i].root));
526
527	lru_destroy(lru: &bc->lru[LIST_CLEAN]);
528	lru_destroy(lru: &bc->lru[LIST_DIRTY]);
529	}
530
/*--------------*/
532
533	/*
534	* not threadsafe, or racey depending how you look at it
535	*/
536	static inline unsigned long cache_count(struct dm_buffer_cache bc, int* list_mode)
537	{
538	return bc->lru[list_mode].count;
539	}
540
541	static inline unsigned long cache_total(struct dm_buffer_cache *bc)
542	{
543	return cache_count(bc, LIST_CLEAN) + cache_count(bc, LIST_DIRTY);
544	}
545
/*--------------*/
547
548	/*
549	* Gets a specific buffer, indexed by block.
550	* If the buffer is found then its holder count will be incremented and
551	* lru_reference will be called.
552	*
553	* threadsafe
554	*/
555	static struct dm_buffer __cache_get(const* struct rb_root *root, sector_t block)
556	{
557	struct rb_node *n = root->rb_node;
558	struct dm_buffer *b;
559
560	while (n) {
561	b = container_of(n, struct dm_buffer, node);
562
563	if (b->block == block)
564	return b;
565
566	n = block < b->block ? n->rb_left : n->rb_right;
567	}
568
569	return NULL;
570	}
571
572	static void __cache_inc_buffer(struct dm_buffer *b)
573	{
574	atomic_inc(v: &b->hold_count);
575	WRITE_ONCE(b->last_accessed, jiffies);
576	}
577
578	static struct dm_buffer cache_get(struct* dm_buffer_cache *bc, sector_t block)
579	{
580	struct dm_buffer *b;
581
582	cache_read_lock(bc, block);
583	b = __cache_get(root: &bc->trees[cache_index(block, num_locks: bc->num_locks)].root, block);
584	if (b) {
585	lru_reference(le: &b->lru);
586	__cache_inc_buffer(b);
587	}
588	cache_read_unlock(bc, block);
589
590	return b;
591	}
592
/*--------------*/
594
595	/*
596	* Returns true if the hold count hits zero.
597	* threadsafe
598	*/
599	static bool cache_put(struct dm_buffer_cache bc, struct* dm_buffer *b)
600	{
601	bool r;
602
603	cache_read_lock(bc, block: b->block);
604	BUG_ON(!atomic_read(&b->hold_count));
605	r = atomic_dec_and_test(v: &b->hold_count);
606	cache_read_unlock(bc, block: b->block);
607
608	return r;
609	}
610
/*--------------*/
612
/* Buffer-level eviction predicate: decides the fate of one candidate buffer. */
typedef enum evict_result (*b_predicate)(struct dm_buffer *, void *);
614
615	/*
616	* Evicts a buffer based on a predicate. The oldest buffer that
617	* matches the predicate will be selected. In addition to the
618	* predicate the hold_count of the selected buffer will be zero.
619	*/
620	struct evict_wrapper {
621	struct lock_history *lh;
622	b_predicate pred;
623	void *context;
624	};
625
626	/*
627	* Wraps the buffer predicate turning it into an lru predicate. Adds
628	* extra test for hold_count.
629	*/
630	static enum evict_result __evict_pred(struct lru_entry le, void* *context)
631	{
632	struct evict_wrapper *w = context;
633	struct dm_buffer *b = le_to_buffer(le);
634
635	lh_next(lh: w->lh, b: b->block);
636
637	if (atomic_read(v: &b->hold_count))
638	return ER_DONT_EVICT;
639
640	return w->pred(b, w->context);
641	}
642
643	static struct dm_buffer __cache_evict(struct* dm_buffer_cache bc, int* list_mode,
644	b_predicate pred, void *context,
645	struct lock_history *lh)
646	{
647	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
648	struct lru_entry *le;
649	struct dm_buffer *b;
650
651	le = lru_evict(lru: &bc->lru[list_mode], pred: __evict_pred, context: &w);
652	if (!le)
653	return NULL;
654
655	b = le_to_buffer(le);
656	/ __evict_pred will have locked the appropriate tree. /
657	rb_erase(&b->node, &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root);
658
659	return b;
660	}
661
662	static struct dm_buffer cache_evict(struct* dm_buffer_cache bc, int* list_mode,
663	b_predicate pred, void *context)
664	{
665	struct dm_buffer *b;
666	struct lock_history lh;
667
668	lh_init(lh: &lh, cache: bc, write: true);
669	b = __cache_evict(bc, list_mode, pred, context, lh: &lh);
670	lh_exit(lh: &lh);
671
672	return b;
673	}
674
/*--------------*/
676
677	/*
678	* Mark a buffer as clean or dirty. Not threadsafe.
679	*/
680	static void cache_mark(struct dm_buffer_cache bc, struct* dm_buffer b, int* list_mode)
681	{
682	cache_write_lock(bc, block: b->block);
683	if (list_mode != b->list_mode) {
684	lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru);
685	b->list_mode = list_mode;
686	lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru);
687	}
688	cache_write_unlock(bc, block: b->block);
689	}
690
/*--------------*/
692
693	/*
694	* Runs through the lru associated with 'old_mode', if the predicate matches then
695	* it moves them to 'new_mode'. Not threadsafe.
696	*/
697	static void __cache_mark_many(struct dm_buffer_cache bc, int* old_mode, int new_mode,
698	b_predicate pred, void context, struct* lock_history *lh)
699	{
700	struct lru_entry *le;
701	struct dm_buffer *b;
702	struct evict_wrapper w = {.lh = lh, .pred = pred, .context = context};
703
704	while (true) {
705	le = lru_evict(lru: &bc->lru[old_mode], pred: __evict_pred, context: &w);
706	if (!le)
707	break;
708
709	b = le_to_buffer(le);
710	b->list_mode = new_mode;
711	lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru);
712	}
713	}
714
715	static void cache_mark_many(struct dm_buffer_cache bc, int* old_mode, int new_mode,
716	b_predicate pred, void *context)
717	{
718	struct lock_history lh;
719
720	lh_init(lh: &lh, cache: bc, write: true);
721	__cache_mark_many(bc, old_mode, new_mode, pred, context, lh: &lh);
722	lh_exit(lh: &lh);
723	}
724
/*--------------*/
726
/*
 * Iterates through all clean or dirty entries calling a function for each
 * entry.  The callback may terminate the iteration early.  Not threadsafe.
 */

/*
 * Iterator functions should return one of these actions to indicate
 * how the iteration should proceed.
 */
enum it_action {
	IT_NEXT,	/* carry on with the next buffer */
	IT_COMPLETE,	/* stop iterating */
};

typedef enum it_action (*iter_fn)(struct dm_buffer *b, void *context);
742
743	static void __cache_iterate(struct dm_buffer_cache bc, int* list_mode,
744	iter_fn fn, void context, struct* lock_history *lh)
745	{
746	struct lru *lru = &bc->lru[list_mode];
747	struct lru_entry le, first;
748
749	if (!lru->cursor)
750	return;
751
752	first = le = to_le(l: lru->cursor);
753	do {
754	struct dm_buffer *b = le_to_buffer(le);
755
756	lh_next(lh, b: b->block);
757
758	switch (fn(b, context)) {
759	case IT_NEXT:
760	break;
761
762	case IT_COMPLETE:
763	return;
764	}
765	cond_resched();
766
767	le = to_le(l: le->list.next);
768	} while (le != first);
769	}
770
771	static void cache_iterate(struct dm_buffer_cache bc, int* list_mode,
772	iter_fn fn, void *context)
773	{
774	struct lock_history lh;
775
776	lh_init(lh: &lh, cache: bc, write: false);
777	__cache_iterate(bc, list_mode, fn, context, lh: &lh);
778	lh_exit(lh: &lh);
779	}
780
/*--------------*/
782
783	/*
784	* Passes ownership of the buffer to the cache. Returns false if the
785	* buffer was already present (in which case ownership does not pass).
786	* eg, a race with another thread.
787	*
788	* Holder count should be 1 on insertion.
789	*
790	* Not threadsafe.
791	*/
792	static bool __cache_insert(struct rb_root root, struct* dm_buffer *b)
793	{
794	struct rb_node *new = &root->rb_node, parent = NULL;
795	struct dm_buffer *found;
796
797	while (*new) {
798	found = container_of(new, struct* dm_buffer, node);
799
800	if (found->block == b->block)
801	return false;
802
803	parent = *new;
804	new = b->block < found->block ?
805	&found->node.rb_left : &found->node.rb_right;
806	}
807
808	rb_link_node(node: &b->node, parent, rb_link: new);
809	rb_insert_color(&b->node, root);
810
811	return true;
812	}
813
814	static bool cache_insert(struct dm_buffer_cache bc, struct* dm_buffer *b)
815	{
816	bool r;
817
818	if (WARN_ON_ONCE(b->list_mode >= LIST_SIZE))
819	return false;
820
821	cache_write_lock(bc, block: b->block);
822	BUG_ON(atomic_read(&b->hold_count) != `1`);
823	r = __cache_insert(root: &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root, b);
824	if (r)
825	lru_insert(lru: &bc->lru[b->list_mode], le: &b->lru);
826	cache_write_unlock(bc, block: b->block);
827
828	return r;
829	}
830
/*--------------*/
832
833	/*
834	* Removes buffer from cache, ownership of the buffer passes back to the caller.
835	* Fails if the hold_count is not one (ie. the caller holds the only reference).
836	*
837	* Not threadsafe.
838	*/
839	static bool cache_remove(struct dm_buffer_cache bc, struct* dm_buffer *b)
840	{
841	bool r;
842
843	cache_write_lock(bc, block: b->block);
844
845	if (atomic_read(v: &b->hold_count) != `1`) {
846	r = false;
847	} else {
848	r = true;
849	rb_erase(&b->node, &bc->trees[cache_index(block: b->block, num_locks: bc->num_locks)].root);
850	lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru);
851	}
852
853	cache_write_unlock(bc, block: b->block);
854
855	return r;
856	}
857
/*--------------*/
859
/* Callback that disposes of a buffer removed by cache_remove_range(). */
typedef void (*b_release)(struct dm_buffer *);
861
862	static struct dm_buffer __find_next(struct* rb_root *root, sector_t block)
863	{
864	struct rb_node *n = root->rb_node;
865	struct dm_buffer *b;
866	struct dm_buffer *best = NULL;
867
868	while (n) {
869	b = container_of(n, struct dm_buffer, node);
870
871	if (b->block == block)
872	return b;
873
874	if (block <= b->block) {
875	n = n->rb_left;
876	best = b;
877	} else {
878	n = n->rb_right;
879	}
880	}
881
882	return best;
883	}
884
885	static void __remove_range(struct dm_buffer_cache *bc,
886	struct rb_root *root,
887	sector_t begin, sector_t end,
888	b_predicate pred, b_release release)
889	{
890	struct dm_buffer *b;
891
892	while (true) {
893	cond_resched();
894
895	b = __find_next(root, block: begin);
896	if (!b \|\| (b->block >= end))
897	break;
898
899	begin = b->block + `1`;
900
901	if (atomic_read(v: &b->hold_count))
902	continue;
903
904	if (pred(b, NULL) == ER_EVICT) {
905	rb_erase(&b->node, root);
906	lru_remove(lru: &bc->lru[b->list_mode], le: &b->lru);
907	release(b);
908	}
909	}
910	}
911
912	static void cache_remove_range(struct dm_buffer_cache *bc,
913	sector_t begin, sector_t end,
914	b_predicate pred, b_release release)
915	{
916	unsigned int i;
917
918	for (i = `0`; i < bc->num_locks; i++) {
919	down_write(sem: &bc->trees[i].lock);
920	__remove_range(bc, root: &bc->trees[i].root, begin, end, pred, release);
921	up_write(sem: &bc->trees[i].lock);
922	}
923	}
924
/*----------------------------------------------------------------*/
926
927	/*
928	* Linking of buffers:
929	* All buffers are linked to buffer_cache with their node field.
930	*
931	* Clean buffers that are not being written (B_WRITING not set)
932	* are linked to lru[LIST_CLEAN] with their lru_list field.
933	*
934	* Dirty and clean buffers that are being written are linked to
935	* lru[LIST_DIRTY] with their lru_list field. When the write
936	* finishes, the buffer cannot be relinked immediately (because we
937	* are in an interrupt context and relinking requires process
938	* context), so some clean-not-writing buffers can be held on
939	* dirty_lru too. They are later added to lru in the process
940	* context.
941	*/
942	struct dm_bufio_client {
943	struct block_device *bdev;
944	unsigned int block_size;
945	s8 sectors_per_block_bits;
946
947	bool no_sleep;
948	struct mutex lock;
949	spinlock_t spinlock;
950
951	int async_write_error;
952
953	void (alloc_callback)(struct* dm_buffer *buf);
954	void (write_callback)(struct* dm_buffer *buf);
955	struct kmem_cache *slab_buffer;
956	struct kmem_cache *slab_cache;
957	struct dm_io_client *dm_io;
958
959	struct list_head reserved_buffers;
960	unsigned int need_reserved_buffers;
961
962	unsigned int minimum_buffers;
963
964	sector_t start;
965
966	struct shrinker *shrinker;
967	struct work_struct shrink_work;
968	atomic_long_t need_shrink;
969
970	wait_queue_head_t free_buffer_wait;
971
972	struct list_head client_list;
973
974	/*
975	* Used by global_cleanup to sort the clients list.
976	*/
977	unsigned long oldest_buffer;
978
979	struct dm_buffer_cache cache; / must be last member /
980	};
981
/* Static key consulted together with c->no_sleep when choosing the client lock. */
static DEFINE_STATIC_KEY_FALSE(no_sleep_enabled);

/*----------------------------------------------------------------*/

/* True when the current task has a non-NULL bio_list. */
#define dm_bufio_in_request()	(!!current->bio_list)
987
988	static void dm_bufio_lock(struct dm_bufio_client *c)
989	{
990	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
991	spin_lock_bh(lock: &c->spinlock);
992	else
993	mutex_lock_nested(lock: &c->lock, dm_bufio_in_request());
994	}
995
996	static void dm_bufio_unlock(struct dm_bufio_client *c)
997	{
998	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
999	spin_unlock_bh(lock: &c->spinlock);
1000	else
1001	mutex_unlock(lock: &c->lock);
1002	}
1003
/*----------------------------------------------------------------*/
1005
1006	/*
1007	* Default cache size: available memory divided by the ratio.
1008	*/
1009	static unsigned long dm_bufio_default_cache_size;
1010
1011	/*
1012	* Total cache size set by the user.
1013	*/
1014	static unsigned long dm_bufio_cache_size;
1015
1016	/*
1017	* A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
1018	* at any time. If it disagrees, the user has changed cache size.
1019	*/
1020	static unsigned long dm_bufio_cache_size_latch;
1021
1022	static DEFINE_SPINLOCK(global_spinlock);
1023
1024	/*
1025	* Buffers are freed after this timeout
1026	*/
1027	static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
1028	static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
1029
1030	static unsigned long dm_bufio_peak_allocated;
1031	static unsigned long dm_bufio_allocated_kmem_cache;
1032	static unsigned long dm_bufio_allocated_get_free_pages;
1033	static unsigned long dm_bufio_allocated_vmalloc;
1034	static unsigned long dm_bufio_current_allocated;
1035
/*----------------------------------------------------------------*/
1037
1038	/*
1039	* The current number of clients.
1040	*/
1041	static int dm_bufio_client_count;
1042
1043	/*
1044	* The list of all clients.
1045	*/
1046	static LIST_HEAD(dm_bufio_all_clients);
1047
1048	/*
1049	* This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
1050	*/
1051	static DEFINE_MUTEX(dm_bufio_clients_lock);
1052
1053	static struct workqueue_struct *dm_bufio_wq;
1054	static struct delayed_work dm_bufio_cleanup_old_work;
1055	static struct work_struct dm_bufio_replacement_work;
1056
1057
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
/*
 * Save up to MAX_STACK entries of the current stack trace into the
 * buffer, skipping the two innermost frames.
 */
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif
1064
/*----------------------------------------------------------------*/
1066
1067	static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
1068	{
1069	unsigned char data_mode;
1070	long diff;
1071
1072	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
1073	&dm_bufio_allocated_kmem_cache,
1074	&dm_bufio_allocated_get_free_pages,
1075	&dm_bufio_allocated_vmalloc,
1076	};
1077
1078	data_mode = b->data_mode;
1079	diff = (long)b->c->block_size;
1080	if (unlink)
1081	diff = -diff;
1082
1083	spin_lock(lock: &global_spinlock);
1084
1085	*class_ptr[data_mode] += diff;
1086
1087	dm_bufio_current_allocated += diff;
1088
1089	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
1090	dm_bufio_peak_allocated = dm_bufio_current_allocated;
1091
1092	if (!unlink) {
1093	if (dm_bufio_current_allocated > dm_bufio_cache_size)
1094	queue_work(wq: dm_bufio_wq, work: &dm_bufio_replacement_work);
1095	}
1096
1097	spin_unlock(lock: &global_spinlock);
1098	}
1099
1100	/*
1101	* Change the number of clients and recalculate per-client limit.
1102	*/
1103	static void __cache_size_refresh(void)
1104	{
1105	if (WARN_ON(!mutex_is_locked(&dm_bufio_clients_lock)))
1106	return;
1107	if (WARN_ON(dm_bufio_client_count < `0`))
1108	return;
1109
1110	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
1111
1112	/*
1113	* Use default if set to 0 and report the actual cache size used.
1114	*/
1115	if (!dm_bufio_cache_size_latch) {
1116	(void)cmpxchg(&dm_bufio_cache_size, `0`,
1117	dm_bufio_default_cache_size);
1118	dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
1119	}
1120	}
1121
1122	/*
1123	* Allocating buffer data.
1124	*
1125	* Small buffers are allocated with kmem_cache, to use space optimally.
1126	*
1127	* For large buffers, we choose between get_free_pages and vmalloc.
1128	* Each has advantages and disadvantages.
1129	*
1130	* __get_free_pages can randomly fail if the memory is fragmented.
1131	* __vmalloc won't randomly fail, but vmalloc space is limited (it may be
1132	* as low as 128M) so using it for caching is not appropriate.
1133	*
1134	* If the allocation may fail we use __get_free_pages. Memory fragmentation
1135	* won't have a fatal effect here, but it just causes flushes of some other
1136	* buffers and more I/O will be performed. Don't use __get_free_pages if it
1137	* always fails (i.e. order > MAX_ORDER).
1138	*
1139	* If the allocation shouldn't fail we use __vmalloc. This is only for the
1140	* initial reserve allocation, so there's no risk of wasting all vmalloc
1141	* space.
1142	*/
1143	static void alloc_buffer_data(struct* dm_bufio_client *c, gfp_t gfp_mask,
1144	unsigned char *data_mode)
1145	{
1146	if (unlikely(c->slab_cache != NULL)) {
1147	*data_mode = DATA_MODE_SLAB;
1148	return kmem_cache_alloc(cachep: c->slab_cache, flags: gfp_mask);
1149	}
1150
1151	if (c->block_size <= KMALLOC_MAX_SIZE &&
1152	gfp_mask & __GFP_NORETRY) {
1153	*data_mode = DATA_MODE_GET_FREE_PAGES;
1154	return (void *)__get_free_pages(gfp_mask,
1155	order: c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1156	}
1157
1158	*data_mode = DATA_MODE_VMALLOC;
1159
1160	return __vmalloc(size: c->block_size, gfp_mask);
1161	}
1162
1163	/*
1164	* Free buffer's data.
1165	*/
1166	static void free_buffer_data(struct dm_bufio_client *c,
1167	void data, unsigned* char data_mode)
1168	{
1169	switch (data_mode) {
1170	case DATA_MODE_SLAB:
1171	kmem_cache_free(s: c->slab_cache, objp: data);
1172	break;
1173
1174	case DATA_MODE_GET_FREE_PAGES:
1175	free_pages(addr: (unsigned long)data,
1176	order: c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
1177	break;
1178
1179	case DATA_MODE_VMALLOC:
1180	vfree(addr: data);
1181	break;
1182
1183	default:
1184	DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
1185	data_mode);
1186	BUG();
1187	}
1188	}
1189
1190	/*
1191	* Allocate buffer and its data.
1192	*/
1193	static struct dm_buffer alloc_buffer(struct* dm_bufio_client *c, gfp_t gfp_mask)
1194	{
1195	struct dm_buffer *b = kmem_cache_alloc(cachep: c->slab_buffer, flags: gfp_mask);
1196
1197	if (!b)
1198	return NULL;
1199
1200	b->c = c;
1201
1202	b->data = alloc_buffer_data(c, gfp_mask, data_mode: &b->data_mode);
1203	if (!b->data) {
1204	kmem_cache_free(s: c->slab_buffer, objp: b);
1205	return NULL;
1206	}
1207	adjust_total_allocated(b, unlink: false);
1208
1209	#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1210	b->stack_len = `0`;
1211	#endif
1212	return b;
1213	}
1214
1215	/*
1216	* Free buffer and its data.
1217	*/
1218	static void free_buffer(struct dm_buffer *b)
1219	{
1220	struct dm_bufio_client *c = b->c;
1221
1222	adjust_total_allocated(b, unlink: true);
1223	free_buffer_data(c, data: b->data, data_mode: b->data_mode);
1224	kmem_cache_free(s: c->slab_buffer, objp: b);
1225	}
1226
1227	/*
1228	*--------------------------------------------------------------------------
1229	* Submit I/O on the buffer.
1230	*
1231	* Bio interface is faster but it has some problems:
1232	* the vector list is limited (increasing this limit increases
1233	* memory-consumption per buffer, so it is not viable);
1234	*
1235	* the memory must be direct-mapped, not vmalloced;
1236	*
1237	* If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
1238	* it is not vmalloced, try using the bio interface.
1239	*
1240	* If the buffer is big, if it is vmalloced or if the underlying device
1241	* rejects the bio because it is too large, use dm-io layer to do the I/O.
1242	* The dm-io layer splits the I/O into multiple requests, avoiding the above
1243	* shortcomings.
1244	*--------------------------------------------------------------------------
1245	*/
1246
1247	/*
1248	* dm-io completion routine. It just calls b->bio.bi_end_io, pretending
1249	* that the request was handled directly with bio interface.
1250	*/
1251	static void dmio_complete(unsigned long error, void *context)
1252	{
1253	struct dm_buffer *b = context;
1254
1255	b->end_io(b, unlikely(error != `0`) ? BLK_STS_IOERR : `0`);
1256	}
1257
1258	static void use_dmio(struct dm_buffer b, enum* req_op op, sector_t sector,
1259	unsigned int n_sectors, unsigned int offset)
1260	{
1261	int r;
1262	struct dm_io_request io_req = {
1263	.bi_opf = op,
1264	.notify.fn = dmio_complete,
1265	.notify.context = b,
1266	.client = b->c->dm_io,
1267	};
1268	struct dm_io_region region = {
1269	.bdev = b->c->bdev,
1270	.sector = sector,
1271	.count = n_sectors,
1272	};
1273
1274	if (b->data_mode != DATA_MODE_VMALLOC) {
1275	io_req.mem.type = DM_IO_KMEM;
1276	io_req.mem.ptr.addr = (char *)b->data + offset;
1277	} else {
1278	io_req.mem.type = DM_IO_VMA;
1279	io_req.mem.ptr.vma = (char *)b->data + offset;
1280	}
1281
1282	r = dm_io(io_req: &io_req, num_regions: `1`, region: &region, NULL);
1283	if (unlikely(r))
1284	b->end_io(b, errno_to_blk_status(errno: r));
1285	}
1286
1287	static void bio_complete(struct bio *bio)
1288	{
1289	struct dm_buffer *b = bio->bi_private;
1290	blk_status_t status = bio->bi_status;
1291
1292	bio_uninit(bio);
1293	kfree(objp: bio);
1294	b->end_io(b, status);
1295	}
1296
1297	static void use_bio(struct dm_buffer b, enum* req_op op, sector_t sector,
1298	unsigned int n_sectors, unsigned int offset)
1299	{
1300	struct bio *bio;
1301	char *ptr;
1302	unsigned int len;
1303
1304	bio = bio_kmalloc(nr_vecs: `1`, GFP_NOWAIT \| __GFP_NORETRY \| __GFP_NOWARN);
1305	if (!bio) {
1306	use_dmio(b, op, sector, n_sectors, offset);
1307	return;
1308	}
1309	bio_init(bio, bdev: b->c->bdev, table: bio->bi_inline_vecs, max_vecs: `1`, opf: op);
1310	bio->bi_iter.bi_sector = sector;
1311	bio->bi_end_io = bio_complete;
1312	bio->bi_private = b;
1313
1314	ptr = (char *)b->data + offset;
1315	len = n_sectors << SECTOR_SHIFT;
1316
1317	__bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
1318
1319	submit_bio(bio);
1320	}
1321
1322	static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
1323	{
1324	sector_t sector;
1325
1326	if (likely(c->sectors_per_block_bits >= `0`))
1327	sector = block << c->sectors_per_block_bits;
1328	else
1329	sector = block * (c->block_size >> SECTOR_SHIFT);
1330	sector += c->start;
1331
1332	return sector;
1333	}
1334
1335	static void submit_io(struct dm_buffer b, enum* req_op op,
1336	void (end_io)(struct* dm_buffer *, blk_status_t))
1337	{
1338	unsigned int n_sectors;
1339	sector_t sector;
1340	unsigned int offset, end;
1341
1342	b->end_io = end_io;
1343
1344	sector = block_to_sector(c: b->c, block: b->block);
1345
1346	if (op != REQ_OP_WRITE) {
1347	n_sectors = b->c->block_size >> SECTOR_SHIFT;
1348	offset = `0`;
1349	} else {
1350	if (b->c->write_callback)
1351	b->c->write_callback(b);
1352	offset = b->write_start;
1353	end = b->write_end;
1354	offset &= -DM_BUFIO_WRITE_ALIGN;
1355	end += DM_BUFIO_WRITE_ALIGN - `1`;
1356	end &= -DM_BUFIO_WRITE_ALIGN;
1357	if (unlikely(end > b->c->block_size))
1358	end = b->c->block_size;
1359
1360	sector += offset >> SECTOR_SHIFT;
1361	n_sectors = (end - offset) >> SECTOR_SHIFT;
1362	}
1363
1364	if (b->data_mode != DATA_MODE_VMALLOC)
1365	use_bio(b, op, sector, n_sectors, offset);
1366	else
1367	use_dmio(b, op, sector, n_sectors, offset);
1368	}
1369
1370	/*
1371	*--------------------------------------------------------------
1372	* Writing dirty buffers
1373	*--------------------------------------------------------------
1374	*/
1375
1376	/*
1377	* The endio routine for write.
1378	*
1379	* Set the error, clear B_WRITING bit and wake anyone who was waiting on
1380	* it.
1381	*/
1382	static void write_endio(struct dm_buffer *b, blk_status_t status)
1383	{
1384	b->write_error = status;
1385	if (unlikely(status)) {
1386	struct dm_bufio_client *c = b->c;
1387
1388	(void)cmpxchg(&c->async_write_error, `0`,
1389	blk_status_to_errno(status));
1390	}
1391
1392	BUG_ON(!test_bit(B_WRITING, &b->state));
1393
1394	smp_mb__before_atomic();
1395	clear_bit(B_WRITING, addr: &b->state);
1396	smp_mb__after_atomic();
1397
1398	wake_up_bit(word: &b->state, B_WRITING);
1399	}
1400
1401	/*
1402	* Initiate a write on a dirty buffer, but don't wait for it.
1403	*
1404	* - If the buffer is not dirty, exit.
1405	* - If there some previous write going on, wait for it to finish (we can't
1406	* have two writes on the same buffer simultaneously).
1407	* - Submit our write and don't wait on it. We set B_WRITING indicating
1408	* that there is a write in progress.
1409	*/
1410	static void __write_dirty_buffer(struct dm_buffer *b,
1411	struct list_head *write_list)
1412	{
1413	if (!test_bit(B_DIRTY, &b->state))
1414	return;
1415
1416	clear_bit(B_DIRTY, addr: &b->state);
1417	wait_on_bit_lock_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1418
1419	b->write_start = b->dirty_start;
1420	b->write_end = b->dirty_end;
1421
1422	if (!write_list)
1423	submit_io(b, op: REQ_OP_WRITE, end_io: write_endio);
1424	else
1425	list_add_tail(new: &b->write_list, head: write_list);
1426	}
1427
1428	static void __flush_write_list(struct list_head *write_list)
1429	{
1430	struct blk_plug plug;
1431
1432	blk_start_plug(&plug);
1433	while (!list_empty(head: write_list)) {
1434	struct dm_buffer *b =
1435	list_entry(write_list->next, struct dm_buffer, write_list);
1436	list_del(entry: &b->write_list);
1437	submit_io(b, op: REQ_OP_WRITE, end_io: write_endio);
1438	cond_resched();
1439	}
1440	blk_finish_plug(&plug);
1441	}
1442
1443	/*
1444	* Wait until any activity on the buffer finishes. Possibly write the
1445	* buffer if it is dirty. When this function finishes, there is no I/O
1446	* running on the buffer and the buffer is not dirty.
1447	*/
1448	static void __make_buffer_clean(struct dm_buffer *b)
1449	{
1450	BUG_ON(atomic_read(&b->hold_count));
1451
1452	/ smp_load_acquire() pairs with read_endio()'s smp_mb__before_atomic() /
1453	if (!smp_load_acquire(&b->state)) / fast case /
1454	return;
1455
1456	wait_on_bit_io(word: &b->state, B_READING, TASK_UNINTERRUPTIBLE);
1457	__write_dirty_buffer(b, NULL);
1458	wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
1459	}
1460
1461	static enum evict_result is_clean(struct dm_buffer b, void* *context)
1462	{
1463	struct dm_bufio_client *c = context;
1464
1465	/ These should never happen /
1466	if (WARN_ON_ONCE(test_bit(B_WRITING, &b->state)))
1467	return ER_DONT_EVICT;
1468	if (WARN_ON_ONCE(test_bit(B_DIRTY, &b->state)))
1469	return ER_DONT_EVICT;
1470	if (WARN_ON_ONCE(b->list_mode != LIST_CLEAN))
1471	return ER_DONT_EVICT;
1472
1473	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep &&
1474	unlikely(test_bit(B_READING, &b->state)))
1475	return ER_DONT_EVICT;
1476
1477	return ER_EVICT;
1478	}
1479
1480	static enum evict_result is_dirty(struct dm_buffer b, void* *context)
1481	{
1482	/ These should never happen /
1483	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1484	return ER_DONT_EVICT;
1485	if (WARN_ON_ONCE(b->list_mode != LIST_DIRTY))
1486	return ER_DONT_EVICT;
1487
1488	return ER_EVICT;
1489	}
1490
1491	/*
1492	* Find some buffer that is not held by anybody, clean it, unlink it and
1493	* return it.
1494	*/
1495	static struct dm_buffer __get_unclaimed_buffer(struct* dm_bufio_client *c)
1496	{
1497	struct dm_buffer *b;
1498
1499	b = cache_evict(bc: &c->cache, LIST_CLEAN, pred: is_clean, context: c);
1500	if (b) {
1501	/ this also waits for pending reads /
1502	__make_buffer_clean(b);
1503	return b;
1504	}
1505
1506	if (static_branch_unlikely(&no_sleep_enabled) && c->no_sleep)
1507	return NULL;
1508
1509	b = cache_evict(bc: &c->cache, LIST_DIRTY, pred: is_dirty, NULL);
1510	if (b) {
1511	__make_buffer_clean(b);
1512	return b;
1513	}
1514
1515	return NULL;
1516	}
1517
1518	/*
1519	* Wait until some other threads free some buffer or release hold count on
1520	* some buffer.
1521	*
1522	* This function is entered with c->lock held, drops it and regains it
1523	* before exiting.
1524	*/
1525	static void __wait_for_free_buffer(struct dm_bufio_client *c)
1526	{
1527	DECLARE_WAITQUEUE(wait, current);
1528
1529	add_wait_queue(wq_head: &c->free_buffer_wait, wq_entry: &wait);
1530	set_current_state(TASK_UNINTERRUPTIBLE);
1531	dm_bufio_unlock(c);
1532
1533	/*
1534	* It's possible to miss a wake up event since we don't always
1535	* hold c->lock when wake_up is called. So we have a timeout here,
1536	* just in case.
1537	*/
1538	io_schedule_timeout(timeout: `5` * HZ);
1539
1540	remove_wait_queue(wq_head: &c->free_buffer_wait, wq_entry: &wait);
1541
1542	dm_bufio_lock(c);
1543	}
1544
/*
 * How a buffer is being requested; passed to __bufio_new(), new_read()
 * and the allocation helpers.
 */
enum new_flag {
	NF_FRESH = 0,		/* used by dm_bufio_new() */
	NF_READ = 1,		/* used by dm_bufio_read() */
	NF_GET = 2,		/* used by dm_bufio_get() */
	NF_PREFETCH = 3		/* used by dm_bufio_prefetch() */
};
1551
1552	/*
1553	* Allocate a new buffer. If the allocation is not possible, wait until
1554	* some other thread frees a buffer.
1555	*
1556	* May drop the lock and regain it.
1557	*/
1558	static struct dm_buffer __alloc_buffer_wait_no_callback(struct* dm_bufio_client c, enum* new_flag nf)
1559	{
1560	struct dm_buffer *b;
1561	bool tried_noio_alloc = false;
1562
1563	/*
1564	* dm-bufio is resistant to allocation failures (it just keeps
1565	* one buffer reserved in cases all the allocations fail).
1566	* So set flags to not try too hard:
1567	* GFP_NOWAIT: don't wait; if we need to sleep we'll release our
1568	* mutex and wait ourselves.
1569	* __GFP_NORETRY: don't retry and rather return failure
1570	* __GFP_NOMEMALLOC: don't use emergency reserves
1571	* __GFP_NOWARN: don't print a warning in case of failure
1572	*
1573	* For debugging, if we set the cache size to 1, no new buffers will
1574	* be allocated.
1575	*/
1576	while (`1`) {
1577	if (dm_bufio_cache_size_latch != `1`) {
1578	b = alloc_buffer(c, GFP_NOWAIT \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
1579	if (b)
1580	return b;
1581	}
1582
1583	if (nf == NF_PREFETCH)
1584	return NULL;
1585
1586	if (dm_bufio_cache_size_latch != `1` && !tried_noio_alloc) {
1587	dm_bufio_unlock(c);
1588	b = alloc_buffer(c, GFP_NOIO \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
1589	dm_bufio_lock(c);
1590	if (b)
1591	return b;
1592	tried_noio_alloc = true;
1593	}
1594
1595	if (!list_empty(head: &c->reserved_buffers)) {
1596	b = list_to_buffer(l: c->reserved_buffers.next);
1597	list_del(entry: &b->lru.list);
1598	c->need_reserved_buffers++;
1599
1600	return b;
1601	}
1602
1603	b = __get_unclaimed_buffer(c);
1604	if (b)
1605	return b;
1606
1607	__wait_for_free_buffer(c);
1608	}
1609	}
1610
1611	static struct dm_buffer __alloc_buffer_wait(struct* dm_bufio_client c, enum* new_flag nf)
1612	{
1613	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
1614
1615	if (!b)
1616	return NULL;
1617
1618	if (c->alloc_callback)
1619	c->alloc_callback(b);
1620
1621	return b;
1622	}
1623
1624	/*
1625	* Free a buffer and wake other threads waiting for free buffers.
1626	*/
1627	static void __free_buffer_wake(struct dm_buffer *b)
1628	{
1629	struct dm_bufio_client *c = b->c;
1630
1631	b->block = -`1`;
1632	if (!c->need_reserved_buffers)
1633	free_buffer(b);
1634	else {
1635	list_add(new: &b->lru.list, head: &c->reserved_buffers);
1636	c->need_reserved_buffers--;
1637	}
1638
1639	/*
1640	* We hold the bufio lock here, so no one can add entries to the
1641	* wait queue anyway.
1642	*/
1643	if (unlikely(waitqueue_active(&c->free_buffer_wait)))
1644	wake_up(&c->free_buffer_wait);
1645	}
1646
1647	static enum evict_result cleaned(struct dm_buffer b, void* *context)
1648	{
1649	if (WARN_ON_ONCE(test_bit(B_READING, &b->state)))
1650	return ER_DONT_EVICT; / should never happen /
1651
1652	if (test_bit(B_DIRTY, &b->state) \|\| test_bit(B_WRITING, &b->state))
1653	return ER_DONT_EVICT;
1654	else
1655	return ER_EVICT;
1656	}
1657
1658	static void __move_clean_buffers(struct dm_bufio_client *c)
1659	{
1660	cache_mark_many(bc: &c->cache, LIST_DIRTY, LIST_CLEAN, pred: cleaned, NULL);
1661	}
1662
/*
 * Context handed to write_one() while iterating over the dirty list.
 */
struct write_context {
	/* non-zero: stop iterating at the first buffer with B_WRITING set */
	int no_wait;
	/* passed to __write_dirty_buffer(); NULL means submit immediately */
	struct list_head *write_list;
};
1667
1668	static enum it_action write_one(struct dm_buffer b, void* *context)
1669	{
1670	struct write_context *wc = context;
1671
1672	if (wc->no_wait && test_bit(B_WRITING, &b->state))
1673	return IT_COMPLETE;
1674
1675	__write_dirty_buffer(b, write_list: wc->write_list);
1676	return IT_NEXT;
1677	}
1678
1679	static void __write_dirty_buffers_async(struct dm_bufio_client c, int* no_wait,
1680	struct list_head *write_list)
1681	{
1682	struct write_context wc = {.no_wait = no_wait, .write_list = write_list};
1683
1684	__move_clean_buffers(c);
1685	cache_iterate(bc: &c->cache, LIST_DIRTY, fn: write_one, context: &wc);
1686	}
1687
1688	/*
1689	* Check if we're over watermark.
1690	* If we are over threshold_buffers, start freeing buffers.
1691	* If we're over "limit_buffers", block until we get under the limit.
1692	*/
1693	static void __check_watermark(struct dm_bufio_client *c,
1694	struct list_head *write_list)
1695	{
1696	if (cache_count(bc: &c->cache, LIST_DIRTY) >
1697	cache_count(bc: &c->cache, LIST_CLEAN) * DM_BUFIO_WRITEBACK_RATIO)
1698	__write_dirty_buffers_async(c, no_wait: `1`, write_list);
1699	}
1700
1701	/*
1702	*--------------------------------------------------------------
1703	* Getting a buffer
1704	*--------------------------------------------------------------
1705	*/
1706
1707	static void cache_put_and_wake(struct dm_bufio_client c, struct* dm_buffer *b)
1708	{
1709	/*
1710	* Relying on waitqueue_active() is racey, but we sleep
1711	* with schedule_timeout anyway.
1712	*/
1713	if (cache_put(bc: &c->cache, b) &&
1714	unlikely(waitqueue_active(&c->free_buffer_wait)))
1715	wake_up(&c->free_buffer_wait);
1716	}
1717
1718	/*
1719	* This assumes you have already checked the cache to see if the buffer
1720	* is already present (it will recheck after dropping the lock for allocation).
1721	*/
1722	static struct dm_buffer __bufio_new(struct* dm_bufio_client *c, sector_t block,
1723	enum new_flag nf, int *need_submit,
1724	struct list_head *write_list)
1725	{
1726	struct dm_buffer b, new_b = NULL;
1727
1728	*need_submit = `0`;
1729
1730	/ This can't be called with NF_GET /
1731	if (WARN_ON_ONCE(nf == NF_GET))
1732	return NULL;
1733
1734	new_b = __alloc_buffer_wait(c, nf);
1735	if (!new_b)
1736	return NULL;
1737
1738	/*
1739	* We've had a period where the mutex was unlocked, so need to
1740	* recheck the buffer tree.
1741	*/
1742	b = cache_get(bc: &c->cache, block);
1743	if (b) {
1744	__free_buffer_wake(b: new_b);
1745	goto found_buffer;
1746	}
1747
1748	__check_watermark(c, write_list);
1749
1750	b = new_b;
1751	atomic_set(v: &b->hold_count, i: `1`);
1752	WRITE_ONCE(b->last_accessed, jiffies);
1753	b->block = block;
1754	b->read_error = `0`;
1755	b->write_error = `0`;
1756	b->list_mode = LIST_CLEAN;
1757
1758	if (nf == NF_FRESH)
1759	b->state = `0`;
1760	else {
1761	b->state = `1` << B_READING;
1762	*need_submit = `1`;
1763	}
1764
1765	/*
1766	* We mustn't insert into the cache until the B_READING state
1767	* is set. Otherwise another thread could get it and use
1768	* it before it had been read.
1769	*/
1770	cache_insert(bc: &c->cache, b);
1771
1772	return b;
1773
1774	found_buffer:
1775	if (nf == NF_PREFETCH) {
1776	cache_put_and_wake(c, b);
1777	return NULL;
1778	}
1779
1780	/*
1781	* Note: it is essential that we don't wait for the buffer to be
1782	* read if dm_bufio_get function is used. Both dm_bufio_get and
1783	* dm_bufio_prefetch can be used in the driver request routine.
1784	* If the user called both dm_bufio_prefetch and dm_bufio_get on
1785	* the same buffer, it would deadlock if we waited.
1786	*/
1787	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1788	cache_put_and_wake(c, b);
1789	return NULL;
1790	}
1791
1792	return b;
1793	}
1794
1795	/*
1796	* The endio routine for reading: set the error, clear the bit and wake up
1797	* anyone waiting on the buffer.
1798	*/
1799	static void read_endio(struct dm_buffer *b, blk_status_t status)
1800	{
1801	b->read_error = status;
1802
1803	BUG_ON(!test_bit(B_READING, &b->state));
1804
1805	smp_mb__before_atomic();
1806	clear_bit(B_READING, addr: &b->state);
1807	smp_mb__after_atomic();
1808
1809	wake_up_bit(word: &b->state, B_READING);
1810	}
1811
1812	/*
1813	* A common routine for dm_bufio_new and dm_bufio_read. Operation of these
1814	* functions is similar except that dm_bufio_new doesn't read the
1815	* buffer from the disk (assuming that the caller overwrites all the data
1816	* and uses dm_bufio_mark_buffer_dirty to write new data back).
1817	*/
1818	static void new_read(struct* dm_bufio_client *c, sector_t block,
1819	enum new_flag nf, struct dm_buffer **bp)
1820	{
1821	int need_submit = `0`;
1822	struct dm_buffer *b;
1823
1824	LIST_HEAD(write_list);
1825
1826	*bp = NULL;
1827
1828	/*
1829	* Fast path, hopefully the block is already in the cache. No need
1830	* to get the client lock for this.
1831	*/
1832	b = cache_get(bc: &c->cache, block);
1833	if (b) {
1834	if (nf == NF_PREFETCH) {
1835	cache_put_and_wake(c, b);
1836	return NULL;
1837	}
1838
1839	/*
1840	* Note: it is essential that we don't wait for the buffer to be
1841	* read if dm_bufio_get function is used. Both dm_bufio_get and
1842	* dm_bufio_prefetch can be used in the driver request routine.
1843	* If the user called both dm_bufio_prefetch and dm_bufio_get on
1844	* the same buffer, it would deadlock if we waited.
1845	*/
1846	if (nf == NF_GET && unlikely(test_bit_acquire(B_READING, &b->state))) {
1847	cache_put_and_wake(c, b);
1848	return NULL;
1849	}
1850	}
1851
1852	if (!b) {
1853	if (nf == NF_GET)
1854	return NULL;
1855
1856	dm_bufio_lock(c);
1857	b = __bufio_new(c, block, nf, need_submit: &need_submit, write_list: &write_list);
1858	dm_bufio_unlock(c);
1859	}
1860
1861	#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1862	if (b && (atomic_read(v: &b->hold_count) == `1`))
1863	buffer_record_stack(b);
1864	#endif
1865
1866	__flush_write_list(write_list: &write_list);
1867
1868	if (!b)
1869	return NULL;
1870
1871	if (need_submit)
1872	submit_io(b, op: REQ_OP_READ, end_io: read_endio);
1873
1874	wait_on_bit_io(word: &b->state, B_READING, TASK_UNINTERRUPTIBLE);
1875
1876	if (b->read_error) {
1877	int error = blk_status_to_errno(status: b->read_error);
1878
1879	dm_bufio_release(b);
1880
1881	return ERR_PTR(error);
1882	}
1883
1884	*bp = b;
1885
1886	return b->data;
1887	}
1888
1889	void dm_bufio_get(struct* dm_bufio_client *c, sector_t block,
1890	struct dm_buffer **bp)
1891	{
1892	return new_read(c, block, nf: NF_GET, bp);
1893	}
1894	EXPORT_SYMBOL_GPL(dm_bufio_get);
1895
1896	void dm_bufio_read(struct* dm_bufio_client *c, sector_t block,
1897	struct dm_buffer **bp)
1898	{
1899	if (WARN_ON_ONCE(dm_bufio_in_request()))
1900	return ERR_PTR(error: -EINVAL);
1901
1902	return new_read(c, block, nf: NF_READ, bp);
1903	}
1904	EXPORT_SYMBOL_GPL(dm_bufio_read);
1905
1906	void dm_bufio_new(struct* dm_bufio_client *c, sector_t block,
1907	struct dm_buffer **bp)
1908	{
1909	if (WARN_ON_ONCE(dm_bufio_in_request()))
1910	return ERR_PTR(error: -EINVAL);
1911
1912	return new_read(c, block, nf: NF_FRESH, bp);
1913	}
1914	EXPORT_SYMBOL_GPL(dm_bufio_new);
1915
1916	void dm_bufio_prefetch(struct dm_bufio_client *c,
1917	sector_t block, unsigned int n_blocks)
1918	{
1919	struct blk_plug plug;
1920
1921	LIST_HEAD(write_list);
1922
1923	if (WARN_ON_ONCE(dm_bufio_in_request()))
1924	return; / should never happen /
1925
1926	blk_start_plug(&plug);
1927
1928	for (; n_blocks--; block++) {
1929	int need_submit;
1930	struct dm_buffer *b;
1931
1932	b = cache_get(bc: &c->cache, block);
1933	if (b) {
1934	/ already in cache /
1935	cache_put_and_wake(c, b);
1936	continue;
1937	}
1938
1939	dm_bufio_lock(c);
1940	b = __bufio_new(c, block, nf: NF_PREFETCH, need_submit: &need_submit,
1941	write_list: &write_list);
1942	if (unlikely(!list_empty(&write_list))) {
1943	dm_bufio_unlock(c);
1944	blk_finish_plug(&plug);
1945	__flush_write_list(write_list: &write_list);
1946	blk_start_plug(&plug);
1947	dm_bufio_lock(c);
1948	}
1949	if (unlikely(b != NULL)) {
1950	dm_bufio_unlock(c);
1951
1952	if (need_submit)
1953	submit_io(b, op: REQ_OP_READ, end_io: read_endio);
1954	dm_bufio_release(b);
1955
1956	cond_resched();
1957
1958	if (!n_blocks)
1959	goto flush_plug;
1960	dm_bufio_lock(c);
1961	}
1962	dm_bufio_unlock(c);
1963	}
1964
1965	flush_plug:
1966	blk_finish_plug(&plug);
1967	}
1968	EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1969
1970	void dm_bufio_release(struct dm_buffer *b)
1971	{
1972	struct dm_bufio_client *c = b->c;
1973
1974	/*
1975	* If there were errors on the buffer, and the buffer is not
1976	* to be written, free the buffer. There is no point in caching
1977	* invalid buffer.
1978	*/
1979	if ((b->read_error \|\| b->write_error) &&
1980	!test_bit_acquire(B_READING, &b->state) &&
1981	!test_bit(B_WRITING, &b->state) &&
1982	!test_bit(B_DIRTY, &b->state)) {
1983	dm_bufio_lock(c);
1984
1985	/ cache remove can fail if there are other holders /
1986	if (cache_remove(bc: &c->cache, b)) {
1987	__free_buffer_wake(b);
1988	dm_bufio_unlock(c);
1989	return;
1990	}
1991
1992	dm_bufio_unlock(c);
1993	}
1994
1995	cache_put_and_wake(c, b);
1996	}
1997	EXPORT_SYMBOL_GPL(dm_bufio_release);
1998
1999	void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
2000	unsigned int start, unsigned int end)
2001	{
2002	struct dm_bufio_client *c = b->c;
2003
2004	BUG_ON(start >= end);
2005	BUG_ON(end > b->c->block_size);
2006
2007	dm_bufio_lock(c);
2008
2009	BUG_ON(test_bit(B_READING, &b->state));
2010
2011	if (!test_and_set_bit(B_DIRTY, addr: &b->state)) {
2012	b->dirty_start = start;
2013	b->dirty_end = end;
2014	cache_mark(bc: &c->cache, b, LIST_DIRTY);
2015	} else {
2016	if (start < b->dirty_start)
2017	b->dirty_start = start;
2018	if (end > b->dirty_end)
2019	b->dirty_end = end;
2020	}
2021
2022	dm_bufio_unlock(c);
2023	}
2024	EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
2025
2026	void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
2027	{
2028	dm_bufio_mark_partial_buffer_dirty(b, `0`, b->c->block_size);
2029	}
2030	EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
2031
2032	void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
2033	{
2034	LIST_HEAD(write_list);
2035
2036	if (WARN_ON_ONCE(dm_bufio_in_request()))
2037	return; / should never happen /
2038
2039	dm_bufio_lock(c);
2040	__write_dirty_buffers_async(c, no_wait: `0`, write_list: &write_list);
2041	dm_bufio_unlock(c);
2042	__flush_write_list(write_list: &write_list);
2043	}
2044	EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
2045
2046	/*
2047	* For performance, it is essential that the buffers are written asynchronously
2048	* and simultaneously (so that the block layer can merge the writes) and then
2049	* waited upon.
2050	*
2051	* Finally, we flush hardware disk cache.
2052	*/
2053	static bool is_writing(struct lru_entry e, void* *context)
2054	{
2055	struct dm_buffer *b = le_to_buffer(le: e);
2056
2057	return test_bit(B_WRITING, &b->state);
2058	}
2059
2060	int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
2061	{
2062	int a, f;
2063	unsigned long nr_buffers;
2064	struct lru_entry *e;
2065	struct lru_iter it;
2066
2067	LIST_HEAD(write_list);
2068
2069	dm_bufio_lock(c);
2070	__write_dirty_buffers_async(c, no_wait: `0`, write_list: &write_list);
2071	dm_bufio_unlock(c);
2072	__flush_write_list(write_list: &write_list);
2073	dm_bufio_lock(c);
2074
2075	nr_buffers = cache_count(bc: &c->cache, LIST_DIRTY);
2076	lru_iter_begin(lru: &c->cache.lru[LIST_DIRTY], it: &it);
2077	while ((e = lru_iter_next(it: &it, pred: is_writing, context: c))) {
2078	struct dm_buffer *b = le_to_buffer(le: e);
2079	__cache_inc_buffer(b);
2080
2081	BUG_ON(test_bit(B_READING, &b->state));
2082
2083	if (nr_buffers) {
2084	nr_buffers--;
2085	dm_bufio_unlock(c);
2086	wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2087	dm_bufio_lock(c);
2088	} else {
2089	wait_on_bit_io(word: &b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
2090	}
2091
2092	if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
2093	cache_mark(bc: &c->cache, b, LIST_CLEAN);
2094
2095	cache_put_and_wake(c, b);
2096
2097	cond_resched();
2098	}
2099	lru_iter_end(it: &it);
2100
2101	wake_up(&c->free_buffer_wait);
2102	dm_bufio_unlock(c);
2103
2104	a = xchg(&c->async_write_error, `0`);
2105	f = dm_bufio_issue_flush(c);
2106	if (a)
2107	return a;
2108
2109	return f;
2110	}
2111	EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
2112
2113	/*
2114	* Use dm-io to send an empty barrier to flush the device.
2115	*/
2116	int dm_bufio_issue_flush(struct dm_bufio_client *c)
2117	{
2118	struct dm_io_request io_req = {
2119	.bi_opf = REQ_OP_WRITE \| REQ_PREFLUSH \| REQ_SYNC,
2120	.mem.type = DM_IO_KMEM,
2121	.mem.ptr.addr = NULL,
2122	.client = c->dm_io,
2123	};
2124	struct dm_io_region io_reg = {
2125	.bdev = c->bdev,
2126	.sector = `0`,
2127	.count = `0`,
2128	};
2129
2130	if (WARN_ON_ONCE(dm_bufio_in_request()))
2131	return -EINVAL;
2132
2133	return dm_io(io_req: &io_req, num_regions: `1`, region: &io_reg, NULL);
2134	}
2135	EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
2136
2137	/*
2138	* Use dm-io to send a discard request to flush the device.
2139	*/
2140	int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
2141	{
2142	struct dm_io_request io_req = {
2143	.bi_opf = REQ_OP_DISCARD \| REQ_SYNC,
2144	.mem.type = DM_IO_KMEM,
2145	.mem.ptr.addr = NULL,
2146	.client = c->dm_io,
2147	};
2148	struct dm_io_region io_reg = {
2149	.bdev = c->bdev,
2150	.sector = block_to_sector(c, block),
2151	.count = block_to_sector(c, block: count),
2152	};
2153
2154	if (WARN_ON_ONCE(dm_bufio_in_request()))
2155	return -EINVAL; / discards are optional /
2156
2157	return dm_io(io_req: &io_req, num_regions: `1`, region: &io_reg, NULL);
2158	}
2159	EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
2160
2161	static bool forget_buffer(struct dm_bufio_client *c, sector_t block)
2162	{
2163	struct dm_buffer *b;
2164
2165	b = cache_get(bc: &c->cache, block);
2166	if (b) {
2167	if (likely(!smp_load_acquire(&b->state))) {
2168	if (cache_remove(bc: &c->cache, b))
2169	__free_buffer_wake(b);
2170	else
2171	cache_put_and_wake(c, b);
2172	} else {
2173	cache_put_and_wake(c, b);
2174	}
2175	}
2176
2177	return b ? true : false;
2178	}
2179
2180	/*
2181	* Free the given buffer.
2182	*
2183	* This is just a hint, if the buffer is in use or dirty, this function
2184	* does nothing.
2185	*/
2186	void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
2187	{
2188	dm_bufio_lock(c);
2189	forget_buffer(c, block);
2190	dm_bufio_unlock(c);
2191	}
2192	EXPORT_SYMBOL_GPL(dm_bufio_forget);
2193
2194	static enum evict_result idle(struct dm_buffer b, void* *context)
2195	{
2196	return b->state ? ER_DONT_EVICT : ER_EVICT;
2197	}
2198
2199	void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
2200	{
2201	dm_bufio_lock(c);
2202	cache_remove_range(bc: &c->cache, begin: block, end: block + n_blocks, pred: idle, release: __free_buffer_wake);
2203	dm_bufio_unlock(c);
2204	}
2205	EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
2206
2207	void dm_bufio_set_minimum_buffers(struct dm_bufio_client c, unsigned* int n)
2208	{
2209	c->minimum_buffers = n;
2210	}
2211	EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
2212
2213	unsigned int dm_bufio_get_block_size(struct dm_bufio_client *c)
2214	{
2215	return c->block_size;
2216	}
2217	EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
2218
2219	sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
2220	{
2221	sector_t s = bdev_nr_sectors(bdev: c->bdev);
2222
2223	if (s >= c->start)
2224	s -= c->start;
2225	else
2226	s = `0`;
2227	if (likely(c->sectors_per_block_bits >= `0`))
2228	s >>= c->sectors_per_block_bits;
2229	else
2230	sector_div(s, c->block_size >> SECTOR_SHIFT);
2231	return s;
2232	}
2233	EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
2234
2235	struct dm_io_client dm_bufio_get_dm_io_client(struct* dm_bufio_client *c)
2236	{
2237	return c->dm_io;
2238	}
2239	EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
2240
2241	sector_t dm_bufio_get_block_number(struct dm_buffer *b)
2242	{
2243	return b->block;
2244	}
2245	EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
2246
2247	void dm_bufio_get_block_data(struct* dm_buffer *b)
2248	{
2249	return b->data;
2250	}
2251	EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
2252
2253	void dm_bufio_get_aux_data(struct* dm_buffer *b)
2254	{
2255	return b + `1`;
2256	}
2257	EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
2258
2259	struct dm_bufio_client dm_bufio_get_client(struct* dm_buffer *b)
2260	{
2261	return b->c;
2262	}
2263	EXPORT_SYMBOL_GPL(dm_bufio_get_client);
2264
2265	static enum it_action warn_leak(struct dm_buffer b, void* *context)
2266	{
2267	bool *warned = context;
2268
2269	WARN_ON(!(*warned));
2270	*warned = true;
2271	DMERR("leaked buffer %llx, hold count %u, list %d",
2272	(unsigned long long)b->block, atomic_read(&b->hold_count), b->list_mode);
2273	#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2274	stack_trace_print(trace: b->stack_entries, nr_entries: b->stack_len, spaces: `1`);
2275	/ mark unclaimed to avoid WARN_ON at end of drop_buffers() /
2276	atomic_set(v: &b->hold_count, i: `0`);
2277	#endif
2278	return IT_NEXT;
2279	}
2280
2281	static void drop_buffers(struct dm_bufio_client *c)
2282	{
2283	int i;
2284	struct dm_buffer *b;
2285
2286	if (WARN_ON(dm_bufio_in_request()))
2287	return; / should never happen /
2288
2289	/*
2290	* An optimization so that the buffers are not written one-by-one.
2291	*/
2292	dm_bufio_write_dirty_buffers_async(c);
2293
2294	dm_bufio_lock(c);
2295
2296	while ((b = __get_unclaimed_buffer(c)))
2297	__free_buffer_wake(b);
2298
2299	for (i = `0`; i < LIST_SIZE; i++) {
2300	bool warned = false;
2301
2302	cache_iterate(bc: &c->cache, list_mode: i, fn: warn_leak, context: &warned);
2303	}
2304
2305	#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
2306	while ((b = __get_unclaimed_buffer(c)))
2307	__free_buffer_wake(b);
2308	#endif
2309
2310	for (i = `0`; i < LIST_SIZE; i++)
2311	WARN_ON(cache_count(&c->cache, i));
2312
2313	dm_bufio_unlock(c);
2314	}
2315
2316	static unsigned long get_retain_buffers(struct dm_bufio_client *c)
2317	{
2318	unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
2319
2320	if (likely(c->sectors_per_block_bits >= `0`))
2321	retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
2322	else
2323	retain_bytes /= c->block_size;
2324
2325	return retain_bytes;
2326	}
2327
2328	static void __scan(struct dm_bufio_client *c)
2329	{
2330	int l;
2331	struct dm_buffer *b;
2332	unsigned long freed = `0`;
2333	unsigned long retain_target = get_retain_buffers(c);
2334	unsigned long count = cache_total(bc: &c->cache);
2335
2336	for (l = `0`; l < LIST_SIZE; l++) {
2337	while (true) {
2338	if (count - freed <= retain_target)
2339	atomic_long_set(v: &c->need_shrink, i: `0`);
2340	if (!atomic_long_read(v: &c->need_shrink))
2341	break;
2342
2343	b = cache_evict(bc: &c->cache, list_mode: l,
2344	pred: l == LIST_CLEAN ? is_clean : is_dirty, context: c);
2345	if (!b)
2346	break;
2347
2348	__make_buffer_clean(b);
2349	__free_buffer_wake(b);
2350
2351	atomic_long_dec(v: &c->need_shrink);
2352	freed++;
2353	cond_resched();
2354	}
2355	}
2356	}
2357
2358	static void shrink_work(struct work_struct *w)
2359	{
2360	struct dm_bufio_client c = container_of(w, struct* dm_bufio_client, shrink_work);
2361
2362	dm_bufio_lock(c);
2363	__scan(c);
2364	dm_bufio_unlock(c);
2365	}
2366
2367	static unsigned long dm_bufio_shrink_scan(struct shrinker shrink, struct* shrink_control *sc)
2368	{
2369	struct dm_bufio_client *c;
2370
2371	c = shrink->private_data;
2372	atomic_long_add(i: sc->nr_to_scan, v: &c->need_shrink);
2373	queue_work(wq: dm_bufio_wq, work: &c->shrink_work);
2374
2375	return sc->nr_to_scan;
2376	}
2377
2378	static unsigned long dm_bufio_shrink_count(struct shrinker shrink, struct* shrink_control *sc)
2379	{
2380	struct dm_bufio_client *c = shrink->private_data;
2381	unsigned long count = cache_total(bc: &c->cache);
2382	unsigned long retain_target = get_retain_buffers(c);
2383	unsigned long queued_for_cleanup = atomic_long_read(v: &c->need_shrink);
2384
2385	if (unlikely(count < retain_target))
2386	count = `0`;
2387	else
2388	count -= retain_target;
2389
2390	if (unlikely(count < queued_for_cleanup))
2391	count = `0`;
2392	else
2393	count -= queued_for_cleanup;
2394
2395	return count;
2396	}
2397
2398	/*
2399	* Create the buffering interface
2400	*/
2401	struct dm_bufio_client dm_bufio_client_create(struct* block_device bdev, unsigned* int block_size,
2402	unsigned int reserved_buffers, unsigned int aux_size,
2403	void (alloc_callback)(struct* dm_buffer *),
2404	void (write_callback)(struct* dm_buffer *),
2405	unsigned int flags)
2406	{
2407	int r;
2408	unsigned int num_locks;
2409	struct dm_bufio_client *c;
2410	char slab_name[`27`];
2411
2412	if (!block_size \|\| block_size & ((`1` << SECTOR_SHIFT) - `1`)) {
2413	DMERR("%s: block size not specified or is not multiple of 512b", __func__);
2414	r = -EINVAL;
2415	goto bad_client;
2416	}
2417
2418	num_locks = dm_num_hash_locks();
2419	c = kzalloc(size: sizeof(c) + (num_locks sizeof(struct buffer_tree)), GFP_KERNEL);
2420	if (!c) {
2421	r = -ENOMEM;
2422	goto bad_client;
2423	}
2424	cache_init(bc: &c->cache, num_locks);
2425
2426	c->bdev = bdev;
2427	c->block_size = block_size;
2428	if (is_power_of_2(n: block_size))
2429	c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
2430	else
2431	c->sectors_per_block_bits = -`1`;
2432
2433	c->alloc_callback = alloc_callback;
2434	c->write_callback = write_callback;
2435
2436	if (flags & DM_BUFIO_CLIENT_NO_SLEEP) {
2437	c->no_sleep = true;
2438	static_branch_inc(&no_sleep_enabled);
2439	}
2440
2441	mutex_init(&c->lock);
2442	spin_lock_init(&c->spinlock);
2443	INIT_LIST_HEAD(list: &c->reserved_buffers);
2444	c->need_reserved_buffers = reserved_buffers;
2445
2446	dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
2447
2448	init_waitqueue_head(&c->free_buffer_wait);
2449	c->async_write_error = `0`;
2450
2451	c->dm_io = dm_io_client_create();
2452	if (IS_ERR(ptr: c->dm_io)) {
2453	r = PTR_ERR(ptr: c->dm_io);
2454	goto bad_dm_io;
2455	}
2456
2457	if (block_size <= KMALLOC_MAX_SIZE &&
2458	(block_size < PAGE_SIZE \|\| !is_power_of_2(n: block_size))) {
2459	unsigned int align = min(`1U` << __ffs(block_size), (unsigned int)PAGE_SIZE);
2460
2461	snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_cache-%u", block_size);
2462	c->slab_cache = kmem_cache_create(name: slab_name, size: block_size, align,
2463	SLAB_RECLAIM_ACCOUNT, NULL);
2464	if (!c->slab_cache) {
2465	r = -ENOMEM;
2466	goto bad;
2467	}
2468	}
2469	if (aux_size)
2470	snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_buffer-%u", aux_size);
2471	else
2472	snprintf(buf: slab_name, size: sizeof(slab_name), fmt: "dm_bufio_buffer");
2473	c->slab_buffer = kmem_cache_create(name: slab_name, size: sizeof(struct dm_buffer) + aux_size,
2474	align: `0`, SLAB_RECLAIM_ACCOUNT, NULL);
2475	if (!c->slab_buffer) {
2476	r = -ENOMEM;
2477	goto bad;
2478	}
2479
2480	while (c->need_reserved_buffers) {
2481	struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
2482
2483	if (!b) {
2484	r = -ENOMEM;
2485	goto bad;
2486	}
2487	__free_buffer_wake(b);
2488	}
2489
2490	INIT_WORK(&c->shrink_work, shrink_work);
2491	atomic_long_set(v: &c->need_shrink, i: `0`);
2492
2493	c->shrinker = shrinker_alloc(flags: `0`, fmt: "dm-bufio:(%u:%u)",
2494	MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
2495	if (!c->shrinker) {
2496	r = -ENOMEM;
2497	goto bad;
2498	}
2499
2500	c->shrinker->count_objects = dm_bufio_shrink_count;
2501	c->shrinker->scan_objects = dm_bufio_shrink_scan;
2502	c->shrinker->seeks = `1`;
2503	c->shrinker->batch = `0`;
2504	c->shrinker->private_data = c;
2505
2506	shrinker_register(shrinker: c->shrinker);
2507
2508	mutex_lock(&dm_bufio_clients_lock);
2509	dm_bufio_client_count++;
2510	list_add(new: &c->client_list, head: &dm_bufio_all_clients);
2511	__cache_size_refresh();
2512	mutex_unlock(lock: &dm_bufio_clients_lock);
2513
2514	return c;
2515
2516	bad:
2517	while (!list_empty(head: &c->reserved_buffers)) {
2518	struct dm_buffer *b = list_to_buffer(l: c->reserved_buffers.next);
2519
2520	list_del(entry: &b->lru.list);
2521	free_buffer(b);
2522	}
2523	kmem_cache_destroy(s: c->slab_cache);
2524	kmem_cache_destroy(s: c->slab_buffer);
2525	dm_io_client_destroy(client: c->dm_io);
2526	bad_dm_io:
2527	mutex_destroy(lock: &c->lock);
2528	if (c->no_sleep)
2529	static_branch_dec(&no_sleep_enabled);
2530	kfree(objp: c);
2531	bad_client:
2532	return ERR_PTR(error: r);
2533	}
2534	EXPORT_SYMBOL_GPL(dm_bufio_client_create);
2535
2536	/*
2537	* Free the buffering interface.
2538	* It is required that there are no references on any buffers.
2539	*/
2540	void dm_bufio_client_destroy(struct dm_bufio_client *c)
2541	{
2542	unsigned int i;
2543
2544	drop_buffers(c);
2545
2546	shrinker_free(shrinker: c->shrinker);
2547	flush_work(work: &c->shrink_work);
2548
2549	mutex_lock(&dm_bufio_clients_lock);
2550
2551	list_del(entry: &c->client_list);
2552	dm_bufio_client_count--;
2553	__cache_size_refresh();
2554
2555	mutex_unlock(lock: &dm_bufio_clients_lock);
2556
2557	WARN_ON(c->need_reserved_buffers);
2558
2559	while (!list_empty(head: &c->reserved_buffers)) {
2560	struct dm_buffer *b = list_to_buffer(l: c->reserved_buffers.next);
2561
2562	list_del(entry: &b->lru.list);
2563	free_buffer(b);
2564	}
2565
2566	for (i = `0`; i < LIST_SIZE; i++)
2567	if (cache_count(bc: &c->cache, list_mode: i))
2568	DMERR("leaked buffer count %d: %lu", i, cache_count(&c->cache, i));
2569
2570	for (i = `0`; i < LIST_SIZE; i++)
2571	WARN_ON(cache_count(&c->cache, i));
2572
2573	cache_destroy(bc: &c->cache);
2574	kmem_cache_destroy(s: c->slab_cache);
2575	kmem_cache_destroy(s: c->slab_buffer);
2576	dm_io_client_destroy(client: c->dm_io);
2577	mutex_destroy(lock: &c->lock);
2578	if (c->no_sleep)
2579	static_branch_dec(&no_sleep_enabled);
2580	kfree(objp: c);
2581	}
2582	EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
2583
2584	void dm_bufio_client_reset(struct dm_bufio_client *c)
2585	{
2586	drop_buffers(c);
2587	flush_work(work: &c->shrink_work);
2588	}
2589	EXPORT_SYMBOL_GPL(dm_bufio_client_reset);
2590
2591	void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
2592	{
2593	c->start = start;
2594	}
2595	EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
2596
/*--------------------------------------------------------------*/
2598
2599	static unsigned int get_max_age_hz(void)
2600	{
2601	unsigned int max_age = READ_ONCE(dm_bufio_max_age);
2602
2603	if (max_age > UINT_MAX / HZ)
2604	max_age = UINT_MAX / HZ;
2605
2606	return max_age * HZ;
2607	}
2608
2609	static bool older_than(struct dm_buffer b, unsigned* long age_hz)
2610	{
2611	return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
2612	}
2613
2614	struct evict_params {
2615	gfp_t gfp;
2616	unsigned long age_hz;
2617
2618	/*
2619	* This gets updated with the largest last_accessed (ie. most
2620	* recently used) of the evicted buffers. It will not be reinitialised
2621	* by __evict_many(), so you can use it across multiple invocations.
2622	*/
2623	unsigned long last_accessed;
2624	};
2625
2626	/*
2627	* We may not be able to evict this buffer if IO pending or the client
2628	* is still using it.
2629	*
2630	* And if GFP_NOFS is used, we must not do any I/O because we hold
2631	* dm_bufio_clients_lock and we would risk deadlock if the I/O gets
2632	* rerouted to different bufio client.
2633	*/
2634	static enum evict_result select_for_evict(struct dm_buffer b, void* *context)
2635	{
2636	struct evict_params *params = context;
2637
2638	if (!(params->gfp & __GFP_FS) \|\|
2639	(static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
2640	if (test_bit_acquire(B_READING, &b->state) \|\|
2641	test_bit(B_WRITING, &b->state) \|\|
2642	test_bit(B_DIRTY, &b->state))
2643	return ER_DONT_EVICT;
2644	}
2645
2646	return older_than(b, age_hz: params->age_hz) ? ER_EVICT : ER_STOP;
2647	}
2648
2649	static unsigned long __evict_many(struct dm_bufio_client *c,
2650	struct evict_params *params,
2651	int list_mode, unsigned long max_count)
2652	{
2653	unsigned long count;
2654	unsigned long last_accessed;
2655	struct dm_buffer *b;
2656
2657	for (count = `0`; count < max_count; count++) {
2658	b = cache_evict(bc: &c->cache, list_mode, pred: select_for_evict, context: params);
2659	if (!b)
2660	break;
2661
2662	last_accessed = READ_ONCE(b->last_accessed);
2663	if (time_after_eq(params->last_accessed, last_accessed))
2664	params->last_accessed = last_accessed;
2665
2666	__make_buffer_clean(b);
2667	__free_buffer_wake(b);
2668
2669	cond_resched();
2670	}
2671
2672	return count;
2673	}
2674
2675	static void evict_old_buffers(struct dm_bufio_client c, unsigned* long age_hz)
2676	{
2677	struct evict_params params = {.gfp = `0`, .age_hz = age_hz, .last_accessed = `0`};
2678	unsigned long retain = get_retain_buffers(c);
2679	unsigned long count;
2680	LIST_HEAD(write_list);
2681
2682	dm_bufio_lock(c);
2683
2684	__check_watermark(c, write_list: &write_list);
2685	if (unlikely(!list_empty(&write_list))) {
2686	dm_bufio_unlock(c);
2687	__flush_write_list(write_list: &write_list);
2688	dm_bufio_lock(c);
2689	}
2690
2691	count = cache_total(bc: &c->cache);
2692	if (count > retain)
2693	__evict_many(c, params: &params, LIST_CLEAN, max_count: count - retain);
2694
2695	dm_bufio_unlock(c);
2696	}
2697
2698	static void cleanup_old_buffers(void)
2699	{
2700	unsigned long max_age_hz = get_max_age_hz();
2701	struct dm_bufio_client *c;
2702
2703	mutex_lock(&dm_bufio_clients_lock);
2704
2705	__cache_size_refresh();
2706
2707	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
2708	evict_old_buffers(c, age_hz: max_age_hz);
2709
2710	mutex_unlock(lock: &dm_bufio_clients_lock);
2711	}
2712
2713	static void work_fn(struct work_struct *w)
2714	{
2715	cleanup_old_buffers();
2716
2717	queue_delayed_work(wq: dm_bufio_wq, dwork: &dm_bufio_cleanup_old_work,
2718	DM_BUFIO_WORK_TIMER_SECS * HZ);
2719	}
2720
/*--------------------------------------------------------------*/
2722
2723	/*
2724	* Global cleanup tries to evict the oldest buffers from across _all_
2725	* the clients. It does this by repeatedly evicting a few buffers from
2726	* the client that holds the oldest buffer. It's approximate, but hopefully
2727	* good enough.
2728	*/
2729	static struct dm_bufio_client __pop_client(void*)
2730	{
2731	struct list_head *h;
2732
2733	if (list_empty(head: &dm_bufio_all_clients))
2734	return NULL;
2735
2736	h = dm_bufio_all_clients.next;
2737	list_del(entry: h);
2738	return container_of(h, struct dm_bufio_client, client_list);
2739	}
2740
2741	/*
2742	* Inserts the client in the global client list based on its
2743	* 'oldest_buffer' field.
2744	*/
2745	static void __insert_client(struct dm_bufio_client *new_client)
2746	{
2747	struct dm_bufio_client *c;
2748	struct list_head *h = dm_bufio_all_clients.next;
2749
2750	while (h != &dm_bufio_all_clients) {
2751	c = container_of(h, struct dm_bufio_client, client_list);
2752	if (time_after_eq(c->oldest_buffer, new_client->oldest_buffer))
2753	break;
2754	h = h->next;
2755	}
2756
2757	list_add_tail(new: &new_client->client_list, head: h);
2758	}
2759
2760	static unsigned long __evict_a_few(unsigned long nr_buffers)
2761	{
2762	unsigned long count;
2763	struct dm_bufio_client *c;
2764	struct evict_params params = {
2765	.gfp = GFP_KERNEL,
2766	.age_hz = `0`,
2767	/ set to jiffies in case there are no buffers in this client /
2768	.last_accessed = jiffies
2769	};
2770
2771	c = __pop_client();
2772	if (!c)
2773	return `0`;
2774
2775	dm_bufio_lock(c);
2776	count = __evict_many(c, params: &params, LIST_CLEAN, max_count: nr_buffers);
2777	dm_bufio_unlock(c);
2778
2779	if (count)
2780	c->oldest_buffer = params.last_accessed;
2781	__insert_client(new_client: c);
2782
2783	return count;
2784	}
2785
2786	static void check_watermarks(void)
2787	{
2788	LIST_HEAD(write_list);
2789	struct dm_bufio_client *c;
2790
2791	mutex_lock(&dm_bufio_clients_lock);
2792	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
2793	dm_bufio_lock(c);
2794	__check_watermark(c, write_list: &write_list);
2795	dm_bufio_unlock(c);
2796	}
2797	mutex_unlock(lock: &dm_bufio_clients_lock);
2798
2799	__flush_write_list(write_list: &write_list);
2800	}
2801
2802	static void evict_old(void)
2803	{
2804	unsigned long threshold = dm_bufio_cache_size -
2805	dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
2806
2807	mutex_lock(&dm_bufio_clients_lock);
2808	while (dm_bufio_current_allocated > threshold) {
2809	if (!__evict_a_few(nr_buffers: `64`))
2810	break;
2811	cond_resched();
2812	}
2813	mutex_unlock(lock: &dm_bufio_clients_lock);
2814	}
2815
/*
 * Work handler for the global cleanup: check the watermarks of all
 * clients first, then evict old buffers.
 */
static void do_global_cleanup(struct work_struct *w)
{
	check_watermarks();
	evict_old();
}
2821
2822	/*
2823	*--------------------------------------------------------------
2824	* Module setup
2825	*--------------------------------------------------------------
2826	*/
2827
2828	/*
2829	* This is called only once for the whole dm_bufio module.
2830	* It initializes memory limit.
2831	*/
2832	static int __init dm_bufio_init(void)
2833	{
2834	__u64 mem;
2835
2836	dm_bufio_allocated_kmem_cache = `0`;
2837	dm_bufio_allocated_get_free_pages = `0`;
2838	dm_bufio_allocated_vmalloc = `0`;
2839	dm_bufio_current_allocated = `0`;
2840
2841	mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2842	DM_BUFIO_MEMORY_PERCENT, `100`) << PAGE_SHIFT;
2843
2844	if (mem > ULONG_MAX)
2845	mem = ULONG_MAX;
2846
2847	#ifdef CONFIG_MMU
2848	if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, `100`))
2849	mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, `100`);
2850	#endif
2851
2852	dm_bufio_default_cache_size = mem;
2853
2854	mutex_lock(&dm_bufio_clients_lock);
2855	__cache_size_refresh();
2856	mutex_unlock(lock: &dm_bufio_clients_lock);
2857
2858	dm_bufio_wq = alloc_workqueue(fmt: "dm_bufio_cache", flags: WQ_MEM_RECLAIM, max_active: `0`);
2859	if (!dm_bufio_wq)
2860	return -ENOMEM;
2861
2862	INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2863	INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2864	queue_delayed_work(wq: dm_bufio_wq, dwork: &dm_bufio_cleanup_old_work,
2865	DM_BUFIO_WORK_TIMER_SECS * HZ);
2866
2867	return `0`;
2868	}
2869
2870	/*
2871	* This is called once when unloading the dm_bufio module.
2872	*/
2873	static void __exit dm_bufio_exit(void)
2874	{
2875	int bug = `0`;
2876
2877	cancel_delayed_work_sync(dwork: &dm_bufio_cleanup_old_work);
2878	destroy_workqueue(wq: dm_bufio_wq);
2879
2880	if (dm_bufio_client_count) {
2881	DMCRIT("%s: dm_bufio_client_count leaked: %d",
2882	__func__, dm_bufio_client_count);
2883	bug = `1`;
2884	}
2885
2886	if (dm_bufio_current_allocated) {
2887	DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2888	__func__, dm_bufio_current_allocated);
2889	bug = `1`;
2890	}
2891
2892	if (dm_bufio_allocated_get_free_pages) {
2893	DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2894	__func__, dm_bufio_allocated_get_free_pages);
2895	bug = `1`;
2896	}
2897
2898	if (dm_bufio_allocated_vmalloc) {
2899	DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2900	__func__, dm_bufio_allocated_vmalloc);
2901	bug = `1`;
2902	}
2903
2904	WARN_ON(bug); / leaks are not worth crashing the system /
2905	}
2906
2907	module_init(dm_bufio_init)
2908	module_exit(dm_bufio_exit)
2909
2910	module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, `0644`);
2911	MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2912
2913	module_param_named(max_age_seconds, dm_bufio_max_age, uint, `0644`);
2914	MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2915
2916	module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, `0644`);
2917	MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2918
2919	module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, `0644`);
2920	MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2921
2922	module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, `0444`);
2923	MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2924
2925	module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, `0444`);
2926	MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2927
2928	module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, `0444`);
2929	MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2930
2931	module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, `0444`);
2932	MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2933
2934	MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2935	MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2936	MODULE_LICENSE("GPL");
2937

source code of linux/drivers/md/dm-bufio.c