dm-cache-policy-smq.c source code [linux/drivers/md/dm-cache-policy-smq.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2015 Red Hat. All rights reserved.
4	*
5	* This file is released under the GPL.
6	*/
7
8	#include "dm-cache-background-tracker.h"
9	#include "dm-cache-policy-internal.h"
10	#include "dm-cache-policy.h"
11	#include "dm.h"
12
13	#include <linux/hash.h>
14	#include <linux/jiffies.h>
15	#include <linux/module.h>
16	#include <linux/mutex.h>
17	#include <linux/vmalloc.h>
18	#include <linux/math64.h>
19
20	#define DM_MSG_PREFIX "cache-policy-smq"
21
22	/----------------------------------------------------------------/
23
24	/*
25	* Safe division functions that return zero on divide by zero.
26	*/
27	static unsigned int safe_div(unsigned int n, unsigned int d)
28	{
29	return d ? n / d : `0u`;
30	}
31
32	static unsigned int safe_mod(unsigned int n, unsigned int d)
33	{
34	return d ? n % d : `0u`;
35	}
36
37	/----------------------------------------------------------------/
38
39	struct entry {
40	unsigned int hash_next:`28`;
41	unsigned int prev:`28`;
42	unsigned int next:`28`;
43	unsigned int level:`6`;
44	bool dirty:`1`;
45	bool allocated:`1`;
46	bool sentinel:`1`;
47	bool pending_work:`1`;
48
49	dm_oblock_t oblock;
50	};
51
52	/----------------------------------------------------------------/
53
54	#define INDEXER_NULL ((1u << 28u) - 1u)
55
56	/*
57	* An entry_space manages a set of entries that we use for the queues.
58	* The clean and dirty queues share entries, so this object is separate
59	* from the queue itself.
60	*/
61	struct entry_space {
62	struct entry *begin;
63	struct entry *end;
64	};
65
66	static int space_init(struct entry_space es, unsigned* int nr_entries)
67	{
68	if (!nr_entries) {
69	es->begin = es->end = NULL;
70	return `0`;
71	}
72
73	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
74	if (!es->begin)
75	return -ENOMEM;
76
77	es->end = es->begin + nr_entries;
78	return `0`;
79	}
80
81	static void space_exit(struct entry_space *es)
82	{
83	vfree(addr: es->begin);
84	}
85
86	static struct entry __get_entry(struct* entry_space es, unsigned* int block)
87	{
88	struct entry *e;
89
90	e = es->begin + block;
91	BUG_ON(e >= es->end);
92
93	return e;
94	}
95
96	static unsigned int to_index(struct entry_space es, struct* entry *e)
97	{
98	BUG_ON(e < es->begin \|\| e >= es->end);
99	return e - es->begin;
100	}
101
102	static struct entry to_entry(struct* entry_space es, unsigned* int block)
103	{
104	if (block == INDEXER_NULL)
105	return NULL;
106
107	return __get_entry(es, block);
108	}
109
110	/----------------------------------------------------------------/
111
112	struct ilist {
113	unsigned int nr_elts; / excluding sentinel entries /
114	unsigned int head, tail;
115	};
116
117	static void l_init(struct ilist *l)
118	{
119	l->nr_elts = `0`;
120	l->head = l->tail = INDEXER_NULL;
121	}
122
123	static struct entry l_head(struct* entry_space es, struct* ilist *l)
124	{
125	return to_entry(es, block: l->head);
126	}
127
128	static struct entry l_tail(struct* entry_space es, struct* ilist *l)
129	{
130	return to_entry(es, block: l->tail);
131	}
132
133	static struct entry l_next(struct* entry_space es, struct* entry *e)
134	{
135	return to_entry(es, block: e->next);
136	}
137
138	static struct entry l_prev(struct* entry_space es, struct* entry *e)
139	{
140	return to_entry(es, block: e->prev);
141	}
142
143	static bool l_empty(struct ilist *l)
144	{
145	return l->head == INDEXER_NULL;
146	}
147
148	static void l_add_head(struct entry_space es, struct* ilist l, struct* entry *e)
149	{
150	struct entry *head = l_head(es, l);
151
152	e->next = l->head;
153	e->prev = INDEXER_NULL;
154
155	if (head)
156	head->prev = l->head = to_index(es, e);
157	else
158	l->head = l->tail = to_index(es, e);
159
160	if (!e->sentinel)
161	l->nr_elts++;
162	}
163
164	static void l_add_tail(struct entry_space es, struct* ilist l, struct* entry *e)
165	{
166	struct entry *tail = l_tail(es, l);
167
168	e->next = INDEXER_NULL;
169	e->prev = l->tail;
170
171	if (tail)
172	tail->next = l->tail = to_index(es, e);
173	else
174	l->head = l->tail = to_index(es, e);
175
176	if (!e->sentinel)
177	l->nr_elts++;
178	}
179
180	static void l_add_before(struct entry_space es, struct* ilist *l,
181	struct entry old, struct* entry *e)
182	{
183	struct entry *prev = l_prev(es, e: old);
184
185	if (!prev)
186	l_add_head(es, l, e);
187
188	else {
189	e->prev = old->prev;
190	e->next = to_index(es, e: old);
191	prev->next = old->prev = to_index(es, e);
192
193	if (!e->sentinel)
194	l->nr_elts++;
195	}
196	}
197
198	static void l_del(struct entry_space es, struct* ilist l, struct* entry *e)
199	{
200	struct entry *prev = l_prev(es, e);
201	struct entry *next = l_next(es, e);
202
203	if (prev)
204	prev->next = e->next;
205	else
206	l->head = e->next;
207
208	if (next)
209	next->prev = e->prev;
210	else
211	l->tail = e->prev;
212
213	if (!e->sentinel)
214	l->nr_elts--;
215	}
216
217	static struct entry l_pop_head(struct* entry_space es, struct* ilist *l)
218	{
219	struct entry *e;
220
221	for (e = l_head(es, l); e; e = l_next(es, e))
222	if (!e->sentinel) {
223	l_del(es, l, e);
224	return e;
225	}
226
227	return NULL;
228	}
229
230	static struct entry l_pop_tail(struct* entry_space es, struct* ilist *l)
231	{
232	struct entry *e;
233
234	for (e = l_tail(es, l); e; e = l_prev(es, e))
235	if (!e->sentinel) {
236	l_del(es, l, e);
237	return e;
238	}
239
240	return NULL;
241	}
242
243	/----------------------------------------------------------------/
244
245	/*
246	* The stochastic-multi-queue is a set of lru lists stacked into levels.
247	* Entries are moved up levels when they are used, which loosely orders the
248	* most accessed entries in the top levels and least in the bottom. This
249	* structure is much better than a single lru list.
250	*/
251	#define MAX_LEVELS 64u
252
253	struct queue {
254	struct entry_space *es;
255
256	unsigned int nr_elts;
257	unsigned int nr_levels;
258	struct ilist qs[MAX_LEVELS];
259
260	/*
261	* We maintain a count of the number of entries we would like in each
262	* level.
263	*/
264	unsigned int last_target_nr_elts;
265	unsigned int nr_top_levels;
266	unsigned int nr_in_top_levels;
267	unsigned int target_count[MAX_LEVELS];
268	};
269
270	static void q_init(struct queue q, struct* entry_space es, unsigned* int nr_levels)
271	{
272	unsigned int i;
273
274	q->es = es;
275	q->nr_elts = `0`;
276	q->nr_levels = nr_levels;
277
278	for (i = `0`; i < q->nr_levels; i++) {
279	l_init(l: q->qs + i);
280	q->target_count[i] = `0u`;
281	}
282
283	q->last_target_nr_elts = `0u`;
284	q->nr_top_levels = `0u`;
285	q->nr_in_top_levels = `0u`;
286	}
287
288	static unsigned int q_size(struct queue *q)
289	{
290	return q->nr_elts;
291	}
292
293	/*
294	* Insert an entry to the back of the given level.
295	*/
296	static void q_push(struct queue q, struct* entry *e)
297	{
298	BUG_ON(e->pending_work);
299
300	if (!e->sentinel)
301	q->nr_elts++;
302
303	l_add_tail(es: q->es, l: q->qs + e->level, e);
304	}
305
306	static void q_push_front(struct queue q, struct* entry *e)
307	{
308	BUG_ON(e->pending_work);
309
310	if (!e->sentinel)
311	q->nr_elts++;
312
313	l_add_head(es: q->es, l: q->qs + e->level, e);
314	}
315
316	static void q_push_before(struct queue q, struct* entry old, struct* entry *e)
317	{
318	BUG_ON(e->pending_work);
319
320	if (!e->sentinel)
321	q->nr_elts++;
322
323	l_add_before(es: q->es, l: q->qs + e->level, old, e);
324	}
325
326	static void q_del(struct queue q, struct* entry *e)
327	{
328	l_del(es: q->es, l: q->qs + e->level, e);
329	if (!e->sentinel)
330	q->nr_elts--;
331	}
332
333	/*
334	* Return the oldest entry of the lowest populated level.
335	*/
336	static struct entry q_peek(struct* queue q, unsigned* int max_level, bool can_cross_sentinel)
337	{
338	unsigned int level;
339	struct entry *e;
340
341	max_level = min(max_level, q->nr_levels);
342
343	for (level = `0`; level < max_level; level++)
344	for (e = l_head(es: q->es, l: q->qs + level); e; e = l_next(es: q->es, e)) {
345	if (e->sentinel) {
346	if (can_cross_sentinel)
347	continue;
348	else
349	break;
350	}
351
352	return e;
353	}
354
355	return NULL;
356	}
357
358	static struct entry q_pop(struct* queue *q)
359	{
360	struct entry *e = q_peek(q, max_level: q->nr_levels, can_cross_sentinel: true);
361
362	if (e)
363	q_del(q, e);
364
365	return e;
366	}
367
368	/*
369	* This function assumes there is a non-sentinel entry to pop. It's only
370	* used by redistribute, so we know this is true. It also doesn't adjust
371	* the q->nr_elts count.
372	*/
373	static struct entry __redist_pop_from(struct* queue q, unsigned* int level)
374	{
375	struct entry *e;
376
377	for (; level < q->nr_levels; level++)
378	for (e = l_head(es: q->es, l: q->qs + level); e; e = l_next(es: q->es, e))
379	if (!e->sentinel) {
380	l_del(es: q->es, l: q->qs + e->level, e);
381	return e;
382	}
383
384	return NULL;
385	}
386
387	static void q_set_targets_subrange_(struct queue q, unsigned* int nr_elts,
388	unsigned int lbegin, unsigned int lend)
389	{
390	unsigned int level, nr_levels, entries_per_level, remainder;
391
392	BUG_ON(lbegin > lend);
393	BUG_ON(lend > q->nr_levels);
394	nr_levels = lend - lbegin;
395	entries_per_level = safe_div(n: nr_elts, d: nr_levels);
396	remainder = safe_mod(n: nr_elts, d: nr_levels);
397
398	for (level = lbegin; level < lend; level++)
399	q->target_count[level] =
400	(level < (lbegin + remainder)) ? entries_per_level + `1u` : entries_per_level;
401	}
402
403	/*
404	* Typically we have fewer elements in the top few levels which allows us
405	* to adjust the promote threshold nicely.
406	*/
407	static void q_set_targets(struct queue *q)
408	{
409	if (q->last_target_nr_elts == q->nr_elts)
410	return;
411
412	q->last_target_nr_elts = q->nr_elts;
413
414	if (q->nr_top_levels > q->nr_levels)
415	q_set_targets_subrange_(q, nr_elts: q->nr_elts, lbegin: `0`, lend: q->nr_levels);
416
417	else {
418	q_set_targets_subrange_(q, nr_elts: q->nr_in_top_levels,
419	lbegin: q->nr_levels - q->nr_top_levels, lend: q->nr_levels);
420
421	if (q->nr_in_top_levels < q->nr_elts)
422	q_set_targets_subrange_(q, nr_elts: q->nr_elts - q->nr_in_top_levels,
423	lbegin: `0`, lend: q->nr_levels - q->nr_top_levels);
424	else
425	q_set_targets_subrange_(q, nr_elts: `0`, lbegin: `0`, lend: q->nr_levels - q->nr_top_levels);
426	}
427	}
428
429	static void q_redistribute(struct queue *q)
430	{
431	unsigned int target, level;
432	struct ilist l, l_above;
433	struct entry *e;
434
435	q_set_targets(q);
436
437	for (level = `0u`; level < q->nr_levels - `1u`; level++) {
438	l = q->qs + level;
439	target = q->target_count[level];
440
441	/*
442	* Pull down some entries from the level above.
443	*/
444	while (l->nr_elts < target) {
445	e = __redist_pop_from(q, level: level + `1u`);
446	if (!e) {
447	/ bug in nr_elts /
448	break;
449	}
450
451	e->level = level;
452	l_add_tail(es: q->es, l, e);
453	}
454
455	/*
456	* Push some entries up.
457	*/
458	l_above = q->qs + level + `1u`;
459	while (l->nr_elts > target) {
460	e = l_pop_tail(es: q->es, l);
461
462	if (!e)
463	/ bug in nr_elts /
464	break;
465
466	e->level = level + `1u`;
467	l_add_tail(es: q->es, l: l_above, e);
468	}
469	}
470	}
471
472	static void q_requeue(struct queue q, struct* entry e, unsigned* int extra_levels,
473	struct entry s1, struct* entry *s2)
474	{
475	struct entry *de;
476	unsigned int sentinels_passed = `0`;
477	unsigned int new_level = min(q->nr_levels - `1u`, e->level + extra_levels);
478
479	/ try and find an entry to swap with /
480	if (extra_levels && (e->level < q->nr_levels - `1u`)) {
481	for (de = l_head(es: q->es, l: q->qs + new_level); de && de->sentinel; de = l_next(es: q->es, e: de))
482	sentinels_passed++;
483
484	if (de) {
485	q_del(q, e: de);
486	de->level = e->level;
487	if (s1) {
488	switch (sentinels_passed) {
489	case `0`:
490	q_push_before(q, old: s1, e: de);
491	break;
492
493	case `1`:
494	q_push_before(q, old: s2, e: de);
495	break;
496
497	default:
498	q_push(q, e: de);
499	}
500	} else
501	q_push(q, e: de);
502	}
503	}
504
505	q_del(q, e);
506	e->level = new_level;
507	q_push(q, e);
508	}
509
510	/----------------------------------------------------------------/
511
512	#define FP_SHIFT 8
513	#define SIXTEENTH (1u << (FP_SHIFT - 4u))
514	#define EIGHTH (1u << (FP_SHIFT - 3u))
515
516	struct stats {
517	unsigned int hit_threshold;
518	unsigned int hits;
519	unsigned int misses;
520	};
521
522	enum performance {
523	Q_POOR,
524	Q_FAIR,
525	Q_WELL
526	};
527
528	static void stats_init(struct stats s, unsigned* int nr_levels)
529	{
530	s->hit_threshold = (nr_levels * `3u`) / `4u`;
531	s->hits = `0u`;
532	s->misses = `0u`;
533	}
534
535	static void stats_reset(struct stats *s)
536	{
537	s->hits = s->misses = `0u`;
538	}
539
540	static void stats_level_accessed(struct stats s, unsigned* int level)
541	{
542	if (level >= s->hit_threshold)
543	s->hits++;
544	else
545	s->misses++;
546	}
547
548	static void stats_miss(struct stats *s)
549	{
550	s->misses++;
551	}
552
553	/*
554	* There are times when we don't have any confidence in the hotspot queue.
555	* Such as when a fresh cache is created and the blocks have been spread
556	* out across the levels, or if an io load changes. We detect this by
557	* seeing how often a lookup is in the top levels of the hotspot queue.
558	*/
559	static enum performance stats_assess(struct stats *s)
560	{
561	unsigned int confidence = safe_div(n: s->hits << FP_SHIFT, d: s->hits + s->misses);
562
563	if (confidence < SIXTEENTH)
564	return Q_POOR;
565
566	else if (confidence < EIGHTH)
567	return Q_FAIR;
568
569	else
570	return Q_WELL;
571	}
572
573	/----------------------------------------------------------------/
574
575	struct smq_hash_table {
576	struct entry_space *es;
577	unsigned long long hash_bits;
578	unsigned int *buckets;
579	};
580
581	/*
582	* All cache entries are stored in a chained hash table. To save space we
583	* use indexing again, and only store indexes to the next entry.
584	*/
585	static int h_init(struct smq_hash_table ht, struct* entry_space es, unsigned* int nr_entries)
586	{
587	unsigned int i, nr_buckets;
588
589	ht->es = es;
590	nr_buckets = roundup_pow_of_two(max(nr_entries / `4u`, `16u`));
591	ht->hash_bits = __ffs(nr_buckets);
592
593	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
594	if (!ht->buckets)
595	return -ENOMEM;
596
597	for (i = `0`; i < nr_buckets; i++)
598	ht->buckets[i] = INDEXER_NULL;
599
600	return `0`;
601	}
602
603	static void h_exit(struct smq_hash_table *ht)
604	{
605	vfree(addr: ht->buckets);
606	}
607
608	static struct entry h_head(struct* smq_hash_table ht, unsigned* int bucket)
609	{
610	return to_entry(es: ht->es, block: ht->buckets[bucket]);
611	}
612
613	static struct entry h_next(struct* smq_hash_table ht, struct* entry *e)
614	{
615	return to_entry(es: ht->es, block: e->hash_next);
616	}
617
618	static void __h_insert(struct smq_hash_table ht, unsigned* int bucket, struct entry *e)
619	{
620	e->hash_next = ht->buckets[bucket];
621	ht->buckets[bucket] = to_index(es: ht->es, e);
622	}
623
624	static void h_insert(struct smq_hash_table ht, struct* entry *e)
625	{
626	unsigned int h = hash_64(val: from_oblock(b: e->oblock), bits: ht->hash_bits);
627
628	__h_insert(ht, bucket: h, e);
629	}
630
631	static struct entry __h_lookup(struct* smq_hash_table ht, unsigned* int h, dm_oblock_t oblock,
632	struct entry **prev)
633	{
634	struct entry *e;
635
636	*prev = NULL;
637	for (e = h_head(ht, bucket: h); e; e = h_next(ht, e)) {
638	if (e->oblock == oblock)
639	return e;
640
641	*prev = e;
642	}
643
644	return NULL;
645	}
646
647	static void __h_unlink(struct smq_hash_table ht, unsigned* int h,
648	struct entry e, struct* entry *prev)
649	{
650	if (prev)
651	prev->hash_next = e->hash_next;
652	else
653	ht->buckets[h] = e->hash_next;
654	}
655
656	/*
657	* Also moves each entry to the front of the bucket.
658	*/
659	static struct entry h_lookup(struct* smq_hash_table *ht, dm_oblock_t oblock)
660	{
661	struct entry e, prev;
662	unsigned int h = hash_64(val: from_oblock(b: oblock), bits: ht->hash_bits);
663
664	e = __h_lookup(ht, h, oblock, prev: &prev);
665	if (e && prev) {
666	/*
667	* Move to the front because this entry is likely
668	* to be hit again.
669	*/
670	__h_unlink(ht, h, e, prev);
671	__h_insert(ht, bucket: h, e);
672	}
673
674	return e;
675	}
676
677	static void h_remove(struct smq_hash_table ht, struct* entry *e)
678	{
679	unsigned int h = hash_64(val: from_oblock(b: e->oblock), bits: ht->hash_bits);
680	struct entry *prev;
681
682	/*
683	* The down side of using a singly linked list is we have to
684	* iterate the bucket to remove an item.
685	*/
686	e = __h_lookup(ht, h, oblock: e->oblock, prev: &prev);
687	if (e)
688	__h_unlink(ht, h, e, prev);
689	}
690
691	/----------------------------------------------------------------/
692
693	struct entry_alloc {
694	struct entry_space *es;
695	unsigned int begin;
696
697	unsigned int nr_allocated;
698	struct ilist free;
699	};
700
701	static void init_allocator(struct entry_alloc ea, struct* entry_space *es,
702	unsigned int begin, unsigned int end)
703	{
704	unsigned int i;
705
706	ea->es = es;
707	ea->nr_allocated = `0u`;
708	ea->begin = begin;
709
710	l_init(l: &ea->free);
711	for (i = begin; i != end; i++)
712	l_add_tail(es: ea->es, l: &ea->free, e: __get_entry(es: ea->es, block: i));
713	}
714
715	static void init_entry(struct entry *e)
716	{
717	/*
718	* We can't memset because that would clear the hotspot and
719	* sentinel bits which remain constant.
720	*/
721	e->hash_next = INDEXER_NULL;
722	e->next = INDEXER_NULL;
723	e->prev = INDEXER_NULL;
724	e->level = `0u`;
725	e->dirty = true; / FIXME: audit /
726	e->allocated = true;
727	e->sentinel = false;
728	e->pending_work = false;
729	}
730
731	static struct entry alloc_entry(struct* entry_alloc *ea)
732	{
733	struct entry *e;
734
735	if (l_empty(l: &ea->free))
736	return NULL;
737
738	e = l_pop_head(es: ea->es, l: &ea->free);
739	init_entry(e);
740	ea->nr_allocated++;
741
742	return e;
743	}
744
745	/*
746	* This assumes the cblock hasn't already been allocated.
747	*/
748	static struct entry alloc_particular_entry(struct* entry_alloc ea, unsigned* int i)
749	{
750	struct entry *e = __get_entry(es: ea->es, block: ea->begin + i);
751
752	BUG_ON(e->allocated);
753
754	l_del(es: ea->es, l: &ea->free, e);
755	init_entry(e);
756	ea->nr_allocated++;
757
758	return e;
759	}
760
761	static void free_entry(struct entry_alloc ea, struct* entry *e)
762	{
763	BUG_ON(!ea->nr_allocated);
764	BUG_ON(!e->allocated);
765
766	ea->nr_allocated--;
767	e->allocated = false;
768	l_add_tail(es: ea->es, l: &ea->free, e);
769	}
770
771	static bool allocator_empty(struct entry_alloc *ea)
772	{
773	return l_empty(l: &ea->free);
774	}
775
776	static unsigned int get_index(struct entry_alloc ea, struct* entry *e)
777	{
778	return to_index(es: ea->es, e) - ea->begin;
779	}
780
781	static struct entry get_entry(struct* entry_alloc ea, unsigned* int index)
782	{
783	return __get_entry(es: ea->es, block: ea->begin + index);
784	}
785
786	/----------------------------------------------------------------/
787
788	#define NR_HOTSPOT_LEVELS 64u
789	#define NR_CACHE_LEVELS 64u
790
791	#define WRITEBACK_PERIOD (10ul * HZ)
792	#define DEMOTE_PERIOD (60ul * HZ)
793
794	#define HOTSPOT_UPDATE_PERIOD (HZ)
795	#define CACHE_UPDATE_PERIOD (60ul * HZ)
796
797	struct smq_policy {
798	struct dm_cache_policy policy;
799
800	/ protects everything /
801	spinlock_t lock;
802	dm_cblock_t cache_size;
803	sector_t cache_block_size;
804
805	sector_t hotspot_block_size;
806	unsigned int nr_hotspot_blocks;
807	unsigned int cache_blocks_per_hotspot_block;
808	unsigned int hotspot_level_jump;
809
810	struct entry_space es;
811	struct entry_alloc writeback_sentinel_alloc;
812	struct entry_alloc demote_sentinel_alloc;
813	struct entry_alloc hotspot_alloc;
814	struct entry_alloc cache_alloc;
815
816	unsigned long *hotspot_hit_bits;
817	unsigned long *cache_hit_bits;
818
819	/*
820	* We maintain three queues of entries. The cache proper,
821	* consisting of a clean and dirty queue, containing the currently
822	* active mappings. The hotspot queue uses a larger block size to
823	* track blocks that are being hit frequently and potential
824	* candidates for promotion to the cache.
825	*/
826	struct queue hotspot;
827	struct queue clean;
828	struct queue dirty;
829
830	struct stats hotspot_stats;
831	struct stats cache_stats;
832
833	/*
834	* Keeps track of time, incremented by the core. We use this to
835	* avoid attributing multiple hits within the same tick.
836	*/
837	unsigned int tick;
838
839	/*
840	* The hash tables allows us to quickly find an entry by origin
841	* block.
842	*/
843	struct smq_hash_table table;
844	struct smq_hash_table hotspot_table;
845
846	bool current_writeback_sentinels;
847	unsigned long next_writeback_period;
848
849	bool current_demote_sentinels;
850	unsigned long next_demote_period;
851
852	unsigned int write_promote_level;
853	unsigned int read_promote_level;
854
855	unsigned long next_hotspot_period;
856	unsigned long next_cache_period;
857
858	struct background_tracker *bg_work;
859
860	bool migrations_allowed:`1`;
861
862	/*
863	* If this is set the policy will try and clean the whole cache
864	* even if the device is not idle.
865	*/
866	bool cleaner:`1`;
867	};
868
869	/----------------------------------------------------------------/
870
871	static struct entry get_sentinel(struct* entry_alloc ea, unsigned* int level, bool which)
872	{
873	return get_entry(ea, index: which ? level : NR_CACHE_LEVELS + level);
874	}
875
876	static struct entry writeback_sentinel(struct* smq_policy mq, unsigned* int level)
877	{
878	return get_sentinel(ea: &mq->writeback_sentinel_alloc, level, which: mq->current_writeback_sentinels);
879	}
880
881	static struct entry demote_sentinel(struct* smq_policy mq, unsigned* int level)
882	{
883	return get_sentinel(ea: &mq->demote_sentinel_alloc, level, which: mq->current_demote_sentinels);
884	}
885
886	static void __update_writeback_sentinels(struct smq_policy *mq)
887	{
888	unsigned int level;
889	struct queue *q = &mq->dirty;
890	struct entry *sentinel;
891
892	for (level = `0`; level < q->nr_levels; level++) {
893	sentinel = writeback_sentinel(mq, level);
894	q_del(q, e: sentinel);
895	q_push(q, e: sentinel);
896	}
897	}
898
899	static void __update_demote_sentinels(struct smq_policy *mq)
900	{
901	unsigned int level;
902	struct queue *q = &mq->clean;
903	struct entry *sentinel;
904
905	for (level = `0`; level < q->nr_levels; level++) {
906	sentinel = demote_sentinel(mq, level);
907	q_del(q, e: sentinel);
908	q_push(q, e: sentinel);
909	}
910	}
911
912	static void update_sentinels(struct smq_policy *mq)
913	{
914	if (time_after(jiffies, mq->next_writeback_period)) {
915	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
916	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
917	__update_writeback_sentinels(mq);
918	}
919
920	if (time_after(jiffies, mq->next_demote_period)) {
921	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
922	mq->current_demote_sentinels = !mq->current_demote_sentinels;
923	__update_demote_sentinels(mq);
924	}
925	}
926
927	static void __sentinels_init(struct smq_policy *mq)
928	{
929	unsigned int level;
930	struct entry *sentinel;
931
932	for (level = `0`; level < NR_CACHE_LEVELS; level++) {
933	sentinel = writeback_sentinel(mq, level);
934	sentinel->level = level;
935	q_push(q: &mq->dirty, e: sentinel);
936
937	sentinel = demote_sentinel(mq, level);
938	sentinel->level = level;
939	q_push(q: &mq->clean, e: sentinel);
940	}
941	}
942
943	static void sentinels_init(struct smq_policy *mq)
944	{
945	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
946	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
947
948	mq->current_writeback_sentinels = false;
949	mq->current_demote_sentinels = false;
950	__sentinels_init(mq);
951
952	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
953	mq->current_demote_sentinels = !mq->current_demote_sentinels;
954	__sentinels_init(mq);
955	}
956
957	/----------------------------------------------------------------/
958
959	static void del_queue(struct smq_policy mq, struct* entry *e)
960	{
961	q_del(q: e->dirty ? &mq->dirty : &mq->clean, e);
962	}
963
964	static void push_queue(struct smq_policy mq, struct* entry *e)
965	{
966	if (e->dirty)
967	q_push(q: &mq->dirty, e);
968	else
969	q_push(q: &mq->clean, e);
970	}
971
972	// !h, !q, a -> h, q, a
973	static void push(struct smq_policy mq, struct* entry *e)
974	{
975	h_insert(ht: &mq->table, e);
976	if (!e->pending_work)
977	push_queue(mq, e);
978	}
979
980	static void push_queue_front(struct smq_policy mq, struct* entry *e)
981	{
982	if (e->dirty)
983	q_push_front(q: &mq->dirty, e);
984	else
985	q_push_front(q: &mq->clean, e);
986	}
987
988	static void push_front(struct smq_policy mq, struct* entry *e)
989	{
990	h_insert(ht: &mq->table, e);
991	if (!e->pending_work)
992	push_queue_front(mq, e);
993	}
994
995	static dm_cblock_t infer_cblock(struct smq_policy mq, struct* entry *e)
996	{
997	return to_cblock(b: get_index(ea: &mq->cache_alloc, e));
998	}
999
1000	static void requeue(struct smq_policy mq, struct* entry *e)
1001	{
1002	/*
1003	* Pending work has temporarily been taken out of the queues.
1004	*/
1005	if (e->pending_work)
1006	return;
1007
1008	if (!test_and_set_bit(nr: from_cblock(b: infer_cblock(mq, e)), addr: mq->cache_hit_bits)) {
1009	if (!e->dirty) {
1010	q_requeue(q: &mq->clean, e, extra_levels: `1u`, NULL, NULL);
1011	return;
1012	}
1013
1014	q_requeue(q: &mq->dirty, e, extra_levels: `1u`,
1015	s1: get_sentinel(ea: &mq->writeback_sentinel_alloc, level: e->level, which: !mq->current_writeback_sentinels),
1016	s2: get_sentinel(ea: &mq->writeback_sentinel_alloc, level: e->level, which: mq->current_writeback_sentinels));
1017	}
1018	}
1019
1020	static unsigned int default_promote_level(struct smq_policy *mq)
1021	{
1022	/*
1023	* The promote level depends on the current performance of the
1024	* cache.
1025	*
1026	* If the cache is performing badly, then we can't afford
1027	* to promote much without causing performance to drop below that
1028	* of the origin device.
1029	*
1030	* If the cache is performing well, then we don't need to promote
1031	* much. If it isn't broken, don't fix it.
1032	*
1033	* If the cache is middling then we promote more.
1034	*
1035	* This scheme reminds me of a graph of entropy vs probability of a
1036	* binary variable.
1037	*/
1038	static const unsigned int table[] = {
1039	`1`, `1`, `1`, `2`, `4`, `6`, `7`, `8`, `7`, `6`, `4`, `4`, `3`, `3`, `2`, `2`, `1`
1040	};
1041
1042	unsigned int hits = mq->cache_stats.hits;
1043	unsigned int misses = mq->cache_stats.misses;
1044	unsigned int index = safe_div(n: hits << `4u`, d: hits + misses);
1045	return table[index];
1046	}
1047
1048	static void update_promote_levels(struct smq_policy *mq)
1049	{
1050	/*
1051	* If there are unused cache entries then we want to be really
1052	* eager to promote.
1053	*/
1054	unsigned int threshold_level = allocator_empty(ea: &mq->cache_alloc) ?
1055	default_promote_level(mq) : (NR_HOTSPOT_LEVELS / `2u`);
1056
1057	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1058
1059	/*
1060	* If the hotspot queue is performing badly then we have little
1061	* confidence that we know which blocks to promote. So we cut down
1062	* the amount of promotions.
1063	*/
1064	switch (stats_assess(s: &mq->hotspot_stats)) {
1065	case Q_POOR:
1066	threshold_level /= `4u`;
1067	break;
1068
1069	case Q_FAIR:
1070	threshold_level /= `2u`;
1071	break;
1072
1073	case Q_WELL:
1074	break;
1075	}
1076
1077	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1078	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1079	}
1080
1081	/*
1082	* If the hotspot queue is performing badly, then we try and move entries
1083	* around more quickly.
1084	*/
1085	static void update_level_jump(struct smq_policy *mq)
1086	{
1087	switch (stats_assess(s: &mq->hotspot_stats)) {
1088	case Q_POOR:
1089	mq->hotspot_level_jump = `4u`;
1090	break;
1091
1092	case Q_FAIR:
1093	mq->hotspot_level_jump = `2u`;
1094	break;
1095
1096	case Q_WELL:
1097	mq->hotspot_level_jump = `1u`;
1098	break;
1099	}
1100	}
1101
1102	static void end_hotspot_period(struct smq_policy *mq)
1103	{
1104	clear_bitset(bitset: mq->hotspot_hit_bits, nr_entries: mq->nr_hotspot_blocks);
1105	update_promote_levels(mq);
1106
1107	if (time_after(jiffies, mq->next_hotspot_period)) {
1108	update_level_jump(mq);
1109	q_redistribute(q: &mq->hotspot);
1110	stats_reset(s: &mq->hotspot_stats);
1111	mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1112	}
1113	}
1114
1115	static void end_cache_period(struct smq_policy *mq)
1116	{
1117	if (time_after(jiffies, mq->next_cache_period)) {
1118	clear_bitset(bitset: mq->cache_hit_bits, nr_entries: from_cblock(b: mq->cache_size));
1119
1120	q_redistribute(q: &mq->dirty);
1121	q_redistribute(q: &mq->clean);
1122	stats_reset(s: &mq->cache_stats);
1123
1124	mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1125	}
1126	}
1127
1128	/----------------------------------------------------------------/
1129
1130	/*
1131	* Targets are given as a percentage.
1132	*/
1133	#define CLEAN_TARGET 25u
1134	#define FREE_TARGET 25u
1135
1136	static unsigned int percent_to_target(struct smq_policy mq, unsigned* int p)
1137	{
1138	return from_cblock(b: mq->cache_size) * p / `100u`;
1139	}
1140
1141	static bool clean_target_met(struct smq_policy *mq, bool idle)
1142	{
1143	/*
1144	* Cache entries may not be populated. So we cannot rely on the
1145	* size of the clean queue.
1146	*/
1147	if (idle \|\| mq->cleaner) {
1148	/*
1149	* We'd like to clean everything.
1150	*/
1151	return q_size(q: &mq->dirty) == `0u`;
1152	}
1153
1154	/*
1155	* If we're busy we don't worry about cleaning at all.
1156	*/
1157	return true;
1158	}
1159
1160	static bool free_target_met(struct smq_policy *mq)
1161	{
1162	unsigned int nr_free;
1163
1164	nr_free = from_cblock(b: mq->cache_size) - mq->cache_alloc.nr_allocated;
1165	return (nr_free + btracker_nr_demotions_queued(b: mq->bg_work)) >=
1166	percent_to_target(mq, FREE_TARGET);
1167	}
1168
1169	/----------------------------------------------------------------/
1170
1171	static void mark_pending(struct smq_policy mq, struct* entry *e)
1172	{
1173	BUG_ON(e->sentinel);
1174	BUG_ON(!e->allocated);
1175	BUG_ON(e->pending_work);
1176	e->pending_work = true;
1177	}
1178
1179	static void clear_pending(struct smq_policy mq, struct* entry *e)
1180	{
1181	BUG_ON(!e->pending_work);
1182	e->pending_work = false;
1183	}
1184
1185	static void queue_writeback(struct smq_policy *mq, bool idle)
1186	{
1187	int r;
1188	struct policy_work work;
1189	struct entry *e;
1190
1191	e = q_peek(q: &mq->dirty, max_level: mq->dirty.nr_levels, can_cross_sentinel: idle);
1192	if (e) {
1193	mark_pending(mq, e);
1194	q_del(q: &mq->dirty, e);
1195
1196	work.op = POLICY_WRITEBACK;
1197	work.oblock = e->oblock;
1198	work.cblock = infer_cblock(mq, e);
1199
1200	r = btracker_queue(b: mq->bg_work, work: &work, NULL);
1201	if (r) {
1202	clear_pending(mq, e);
1203	q_push_front(q: &mq->dirty, e);
1204	}
1205	}
1206	}
1207
1208	static void queue_demotion(struct smq_policy *mq)
1209	{
1210	int r;
1211	struct policy_work work;
1212	struct entry *e;
1213
1214	if (WARN_ON_ONCE(!mq->migrations_allowed))
1215	return;
1216
1217	e = q_peek(q: &mq->clean, max_level: mq->clean.nr_levels / `2`, can_cross_sentinel: true);
1218	if (!e) {
1219	if (!clean_target_met(mq, idle: true))
1220	queue_writeback(mq, idle: false);
1221	return;
1222	}
1223
1224	mark_pending(mq, e);
1225	q_del(q: &mq->clean, e);
1226
1227	work.op = POLICY_DEMOTE;
1228	work.oblock = e->oblock;
1229	work.cblock = infer_cblock(mq, e);
1230	r = btracker_queue(b: mq->bg_work, work: &work, NULL);
1231	if (r) {
1232	clear_pending(mq, e);
1233	q_push_front(q: &mq->clean, e);
1234	}
1235	}
1236
1237	static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1238	struct policy_work **workp)
1239	{
1240	int r;
1241	struct entry *e;
1242	struct policy_work work;
1243
1244	if (!mq->migrations_allowed)
1245	return;
1246
1247	if (allocator_empty(ea: &mq->cache_alloc)) {
1248	/*
1249	* We always claim to be 'idle' to ensure some demotions happen
1250	* with continuous loads.
1251	*/
1252	if (!free_target_met(mq))
1253	queue_demotion(mq);
1254	return;
1255	}
1256
1257	if (btracker_promotion_already_present(b: mq->bg_work, oblock))
1258	return;
1259
1260	/*
1261	* We allocate the entry now to reserve the cblock. If the
1262	* background work is aborted we must remember to free it.
1263	*/
1264	e = alloc_entry(ea: &mq->cache_alloc);
1265	BUG_ON(!e);
1266	e->pending_work = true;
1267	work.op = POLICY_PROMOTE;
1268	work.oblock = oblock;
1269	work.cblock = infer_cblock(mq, e);
1270	r = btracker_queue(b: mq->bg_work, work: &work, pwork: workp);
1271	if (r)
1272	free_entry(ea: &mq->cache_alloc, e);
1273	}
1274
1275	/----------------------------------------------------------------/
1276
1277	enum promote_result {
1278	PROMOTE_NOT,
1279	PROMOTE_TEMPORARY,
1280	PROMOTE_PERMANENT
1281	};
1282
1283	/*
1284	* Converts a boolean into a promote result.
1285	*/
1286	static enum promote_result maybe_promote(bool promote)
1287	{
1288	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1289	}
1290
1291	static enum promote_result should_promote(struct smq_policy mq, struct* entry *hs_e,
1292	int data_dir, bool fast_promote)
1293	{
1294	if (data_dir == WRITE) {
1295	if (!allocator_empty(ea: &mq->cache_alloc) && fast_promote)
1296	return PROMOTE_TEMPORARY;
1297
1298	return maybe_promote(promote: hs_e->level >= mq->write_promote_level);
1299	} else
1300	return maybe_promote(promote: hs_e->level >= mq->read_promote_level);
1301	}
1302
1303	static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1304	{
1305	sector_t r = from_oblock(b);
1306	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1307	return to_oblock(b: r);
1308	}
1309
1310	static struct entry update_hotspot_queue(struct* smq_policy *mq, dm_oblock_t b)
1311	{
1312	unsigned int hi;
1313	dm_oblock_t hb = to_hblock(mq, b);
1314	struct entry *e = h_lookup(ht: &mq->hotspot_table, oblock: hb);
1315
1316	if (e) {
1317	stats_level_accessed(s: &mq->hotspot_stats, level: e->level);
1318
1319	hi = get_index(ea: &mq->hotspot_alloc, e);
1320	q_requeue(q: &mq->hotspot, e,
1321	extra_levels: test_and_set_bit(nr: hi, addr: mq->hotspot_hit_bits) ?
1322	`0u` : mq->hotspot_level_jump,
1323	NULL, NULL);
1324
1325	} else {
1326	stats_miss(s: &mq->hotspot_stats);
1327
1328	e = alloc_entry(ea: &mq->hotspot_alloc);
1329	if (!e) {
1330	e = q_pop(q: &mq->hotspot);
1331	if (e) {
1332	h_remove(ht: &mq->hotspot_table, e);
1333	hi = get_index(ea: &mq->hotspot_alloc, e);
1334	clear_bit(nr: hi, addr: mq->hotspot_hit_bits);
1335	}
1336
1337	}
1338
1339	if (e) {
1340	e->oblock = hb;
1341	q_push(q: &mq->hotspot, e);
1342	h_insert(ht: &mq->hotspot_table, e);
1343	}
1344	}
1345
1346	return e;
1347	}
1348
1349	/----------------------------------------------------------------/
1350
1351	/*
1352	* Public interface, via the policy struct. See dm-cache-policy.h for a
1353	* description of these.
1354	*/
1355
1356	static struct smq_policy to_smq_policy(struct* dm_cache_policy *p)
1357	{
1358	return container_of(p, struct smq_policy, policy);
1359	}
1360
1361	static void smq_destroy(struct dm_cache_policy *p)
1362	{
1363	struct smq_policy *mq = to_smq_policy(p);
1364
1365	btracker_destroy(b: mq->bg_work);
1366	h_exit(ht: &mq->hotspot_table);
1367	h_exit(ht: &mq->table);
1368	free_bitset(bits: mq->hotspot_hit_bits);
1369	free_bitset(bits: mq->cache_hit_bits);
1370	space_exit(es: &mq->es);
1371	kfree(objp: mq);
1372	}
1373
1374	/----------------------------------------------------------------/
1375
1376	static int __lookup(struct smq_policy mq, dm_oblock_t oblock, dm_cblock_t cblock,
1377	int data_dir, bool fast_copy,
1378	struct policy_work *work, bool background_work)
1379	{
1380	struct entry e, hs_e;
1381	enum promote_result pr;
1382
1383	*background_work = false;
1384
1385	e = h_lookup(ht: &mq->table, oblock);
1386	if (e) {
1387	stats_level_accessed(s: &mq->cache_stats, level: e->level);
1388
1389	requeue(mq, e);
1390	*cblock = infer_cblock(mq, e);
1391	return `0`;
1392
1393	} else {
1394	stats_miss(s: &mq->cache_stats);
1395
1396	/*
1397	* The hotspot queue only gets updated with misses.
1398	*/
1399	hs_e = update_hotspot_queue(mq, b: oblock);
1400
1401	pr = should_promote(mq, hs_e, data_dir, fast_promote: fast_copy);
1402	if (pr != PROMOTE_NOT) {
1403	queue_promotion(mq, oblock, workp: work);
1404	*background_work = true;
1405	}
1406
1407	return -ENOENT;
1408	}
1409	}
1410
1411	static int smq_lookup(struct dm_cache_policy p, dm_oblock_t oblock, dm_cblock_t cblock,
1412	int data_dir, bool fast_copy,
1413	bool *background_work)
1414	{
1415	int r;
1416	unsigned long flags;
1417	struct smq_policy *mq = to_smq_policy(p);
1418
1419	spin_lock_irqsave(&mq->lock, flags);
1420	r = __lookup(mq, oblock, cblock,
1421	data_dir, fast_copy,
1422	NULL, background_work);
1423	spin_unlock_irqrestore(lock: &mq->lock, flags);
1424
1425	return r;
1426	}
1427
1428	static int smq_lookup_with_work(struct dm_cache_policy *p,
1429	dm_oblock_t oblock, dm_cblock_t *cblock,
1430	int data_dir, bool fast_copy,
1431	struct policy_work **work)
1432	{
1433	int r;
1434	bool background_queued;
1435	unsigned long flags;
1436	struct smq_policy *mq = to_smq_policy(p);
1437
1438	spin_lock_irqsave(&mq->lock, flags);
1439	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, background_work: &background_queued);
1440	spin_unlock_irqrestore(lock: &mq->lock, flags);
1441
1442	return r;
1443	}
1444
1445	static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1446	struct policy_work **result)
1447	{
1448	int r;
1449	unsigned long flags;
1450	struct smq_policy *mq = to_smq_policy(p);
1451
1452	spin_lock_irqsave(&mq->lock, flags);
1453	r = btracker_issue(b: mq->bg_work, work: result);
1454	if (r == -ENODATA) {
1455	if (!clean_target_met(mq, idle)) {
1456	queue_writeback(mq, idle);
1457	r = btracker_issue(b: mq->bg_work, work: result);
1458	}
1459	}
1460	spin_unlock_irqrestore(lock: &mq->lock, flags);
1461
1462	return r;
1463	}
1464
1465	/*
1466	* We need to clear any pending work flags that have been set, and in the
1467	* case of promotion free the entry for the destination cblock.
1468	*/
1469	static void __complete_background_work(struct smq_policy *mq,
1470	struct policy_work *work,
1471	bool success)
1472	{
1473	struct entry *e = get_entry(ea: &mq->cache_alloc,
1474	index: from_cblock(b: work->cblock));
1475
1476	switch (work->op) {
1477	case POLICY_PROMOTE:
1478	// !h, !q, a
1479	clear_pending(mq, e);
1480	if (success) {
1481	e->oblock = work->oblock;
1482	e->level = NR_CACHE_LEVELS - `1`;
1483	push(mq, e);
1484	// h, q, a
1485	} else {
1486	free_entry(ea: &mq->cache_alloc, e);
1487	// !h, !q, !a
1488	}
1489	break;
1490
1491	case POLICY_DEMOTE:
1492	// h, !q, a
1493	if (success) {
1494	h_remove(ht: &mq->table, e);
1495	free_entry(ea: &mq->cache_alloc, e);
1496	// !h, !q, !a
1497	} else {
1498	clear_pending(mq, e);
1499	push_queue(mq, e);
1500	// h, q, a
1501	}
1502	break;
1503
1504	case POLICY_WRITEBACK:
1505	// h, !q, a
1506	clear_pending(mq, e);
1507	push_queue(mq, e);
1508	// h, q, a
1509	break;
1510	}
1511
1512	btracker_complete(b: mq->bg_work, op: work);
1513	}
1514
1515	static void smq_complete_background_work(struct dm_cache_policy *p,
1516	struct policy_work *work,
1517	bool success)
1518	{
1519	unsigned long flags;
1520	struct smq_policy *mq = to_smq_policy(p);
1521
1522	spin_lock_irqsave(&mq->lock, flags);
1523	__complete_background_work(mq, work, success);
1524	spin_unlock_irqrestore(lock: &mq->lock, flags);
1525	}
1526
1527	// in_hash(oblock) -> in_hash(oblock)
1528	static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1529	{
1530	struct entry *e = get_entry(ea: &mq->cache_alloc, index: from_cblock(b: cblock));
1531
1532	if (e->pending_work)
1533	e->dirty = set;
1534	else {
1535	del_queue(mq, e);
1536	e->dirty = set;
1537	push_queue(mq, e);
1538	}
1539	}
1540
1541	static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1542	{
1543	unsigned long flags;
1544	struct smq_policy *mq = to_smq_policy(p);
1545
1546	spin_lock_irqsave(&mq->lock, flags);
1547	__smq_set_clear_dirty(mq, cblock, set: true);
1548	spin_unlock_irqrestore(lock: &mq->lock, flags);
1549	}
1550
1551	static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1552	{
1553	struct smq_policy *mq = to_smq_policy(p);
1554	unsigned long flags;
1555
1556	spin_lock_irqsave(&mq->lock, flags);
1557	__smq_set_clear_dirty(mq, cblock, set: false);
1558	spin_unlock_irqrestore(lock: &mq->lock, flags);
1559	}
1560
1561	static unsigned int random_level(dm_cblock_t cblock)
1562	{
1563	return hash_32(val: from_cblock(b: cblock), bits: `9`) & (NR_CACHE_LEVELS - `1`);
1564	}
1565
1566	static int smq_load_mapping(struct dm_cache_policy *p,
1567	dm_oblock_t oblock, dm_cblock_t cblock,
1568	bool dirty, uint32_t hint, bool hint_valid)
1569	{
1570	struct smq_policy *mq = to_smq_policy(p);
1571	struct entry *e;
1572
1573	e = alloc_particular_entry(ea: &mq->cache_alloc, i: from_cblock(b: cblock));
1574	e->oblock = oblock;
1575	e->dirty = dirty;
1576	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - `1`) : random_level(cblock);
1577	e->pending_work = false;
1578
1579	/*
1580	* When we load mappings we push ahead of both sentinels in order to
1581	* allow demotions and cleaning to occur immediately.
1582	*/
1583	push_front(mq, e);
1584
1585	return `0`;
1586	}
1587
1588	static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1589	{
1590	struct smq_policy *mq = to_smq_policy(p);
1591	struct entry *e = get_entry(ea: &mq->cache_alloc, index: from_cblock(b: cblock));
1592
1593	if (!e->allocated)
1594	return -ENODATA;
1595
1596	// FIXME: what if this block has pending background work?
1597	del_queue(mq, e);
1598	h_remove(ht: &mq->table, e);
1599	free_entry(ea: &mq->cache_alloc, e);
1600	return `0`;
1601	}
1602
1603	static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1604	{
1605	struct smq_policy *mq = to_smq_policy(p);
1606	struct entry *e = get_entry(ea: &mq->cache_alloc, index: from_cblock(b: cblock));
1607
1608	if (!e->allocated)
1609	return `0`;
1610
1611	return e->level;
1612	}
1613
1614	static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1615	{
1616	dm_cblock_t r;
1617	unsigned long flags;
1618	struct smq_policy *mq = to_smq_policy(p);
1619
1620	spin_lock_irqsave(&mq->lock, flags);
1621	r = to_cblock(b: mq->cache_alloc.nr_allocated);
1622	spin_unlock_irqrestore(lock: &mq->lock, flags);
1623
1624	return r;
1625	}
1626
1627	static void smq_tick(struct dm_cache_policy *p, bool can_block)
1628	{
1629	struct smq_policy *mq = to_smq_policy(p);
1630	unsigned long flags;
1631
1632	spin_lock_irqsave(&mq->lock, flags);
1633	mq->tick++;
1634	update_sentinels(mq);
1635	end_hotspot_period(mq);
1636	end_cache_period(mq);
1637	spin_unlock_irqrestore(lock: &mq->lock, flags);
1638	}
1639
1640	static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1641	{
1642	struct smq_policy *mq = to_smq_policy(p);
1643
1644	mq->migrations_allowed = allow;
1645	}
1646
1647	/*
1648	* smq has no config values, but the old mq policy did. To avoid breaking
1649	* software we continue to accept these configurables for the mq policy,
1650	* but they have no effect.
1651	*/
1652	static int mq_set_config_value(struct dm_cache_policy *p,
1653	const char key, const* char *value)
1654	{
1655	unsigned long tmp;
1656
1657	if (kstrtoul(s: value, base: `10`, res: &tmp))
1658	return -EINVAL;
1659
1660	if (!strcasecmp(s1: key, s2: "random_threshold") \|\|
1661	!strcasecmp(s1: key, s2: "sequential_threshold") \|\|
1662	!strcasecmp(s1: key, s2: "discard_promote_adjustment") \|\|
1663	!strcasecmp(s1: key, s2: "read_promote_adjustment") \|\|
1664	!strcasecmp(s1: key, s2: "write_promote_adjustment")) {
1665	DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1666	return `0`;
1667	}
1668
1669	return -EINVAL;
1670	}
1671
1672	static int mq_emit_config_values(struct dm_cache_policy p, char* *result,
1673	unsigned int maxlen, ssize_t *sz_ptr)
1674	{
1675	ssize_t sz = *sz_ptr;
1676
1677	DMEMIT("10 random_threshold 0 "
1678	"sequential_threshold 0 "
1679	"discard_promote_adjustment 0 "
1680	"read_promote_adjustment 0 "
1681	"write_promote_adjustment 0 ");
1682
1683	*sz_ptr = sz;
1684	return `0`;
1685	}
1686
1687	/ Init the policy plugin interface function pointers. /
1688	static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1689	{
1690	mq->policy.destroy = smq_destroy;
1691	mq->policy.lookup = smq_lookup;
1692	mq->policy.lookup_with_work = smq_lookup_with_work;
1693	mq->policy.get_background_work = smq_get_background_work;
1694	mq->policy.complete_background_work = smq_complete_background_work;
1695	mq->policy.set_dirty = smq_set_dirty;
1696	mq->policy.clear_dirty = smq_clear_dirty;
1697	mq->policy.load_mapping = smq_load_mapping;
1698	mq->policy.invalidate_mapping = smq_invalidate_mapping;
1699	mq->policy.get_hint = smq_get_hint;
1700	mq->policy.residency = smq_residency;
1701	mq->policy.tick = smq_tick;
1702	mq->policy.allow_migrations = smq_allow_migrations;
1703
1704	if (mimic_mq) {
1705	mq->policy.set_config_value = mq_set_config_value;
1706	mq->policy.emit_config_values = mq_emit_config_values;
1707	}
1708	}
1709
1710	static bool too_many_hotspot_blocks(sector_t origin_size,
1711	sector_t hotspot_block_size,
1712	unsigned int nr_hotspot_blocks)
1713	{
1714	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1715	}
1716
1717	static void calc_hotspot_params(sector_t origin_size,
1718	sector_t cache_block_size,
1719	unsigned int nr_cache_blocks,
1720	sector_t *hotspot_block_size,
1721	unsigned int *nr_hotspot_blocks)
1722	{
1723	hotspot_block_size = cache_block_size `16u`;
1724	*nr_hotspot_blocks = max(nr_cache_blocks / `4u`, `1024u`);
1725
1726	while ((*hotspot_block_size > cache_block_size) &&
1727	too_many_hotspot_blocks(origin_size, hotspot_block_size: hotspot_block_size, nr_hotspot_blocks: nr_hotspot_blocks))
1728	*hotspot_block_size /= `2u`;
1729	}
1730
1731	static struct dm_cache_policy *
1732	__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
1733	bool mimic_mq, bool migrations_allowed, bool cleaner)
1734	{
1735	unsigned int i;
1736	unsigned int nr_sentinels_per_queue = `2u` * NR_CACHE_LEVELS;
1737	unsigned int total_sentinels = `2u` * nr_sentinels_per_queue;
1738	struct smq_policy mq = kzalloc(size: sizeof(mq), GFP_KERNEL);
1739
1740	if (!mq)
1741	return NULL;
1742
1743	init_policy_functions(mq, mimic_mq);
1744	mq->cache_size = cache_size;
1745	mq->cache_block_size = cache_block_size;
1746
1747	calc_hotspot_params(origin_size, cache_block_size, nr_cache_blocks: from_cblock(b: cache_size),
1748	hotspot_block_size: &mq->hotspot_block_size, nr_hotspot_blocks: &mq->nr_hotspot_blocks);
1749
1750	mq->cache_blocks_per_hotspot_block = div64_u64(dividend: mq->hotspot_block_size, divisor: mq->cache_block_size);
1751	mq->hotspot_level_jump = `1u`;
1752	if (space_init(es: &mq->es, nr_entries: total_sentinels + mq->nr_hotspot_blocks + from_cblock(b: cache_size))) {
1753	DMERR("couldn't initialize entry space");
1754	goto bad_pool_init;
1755	}
1756
1757	init_allocator(ea: &mq->writeback_sentinel_alloc, es: &mq->es, begin: `0`, end: nr_sentinels_per_queue);
1758	for (i = `0`; i < nr_sentinels_per_queue; i++)
1759	get_entry(ea: &mq->writeback_sentinel_alloc, index: i)->sentinel = true;
1760
1761	init_allocator(ea: &mq->demote_sentinel_alloc, es: &mq->es, begin: nr_sentinels_per_queue, end: total_sentinels);
1762	for (i = `0`; i < nr_sentinels_per_queue; i++)
1763	get_entry(ea: &mq->demote_sentinel_alloc, index: i)->sentinel = true;
1764
1765	init_allocator(ea: &mq->hotspot_alloc, es: &mq->es, begin: total_sentinels,
1766	end: total_sentinels + mq->nr_hotspot_blocks);
1767
1768	init_allocator(ea: &mq->cache_alloc, es: &mq->es,
1769	begin: total_sentinels + mq->nr_hotspot_blocks,
1770	end: total_sentinels + mq->nr_hotspot_blocks + from_cblock(b: cache_size));
1771
1772	mq->hotspot_hit_bits = alloc_bitset(nr_entries: mq->nr_hotspot_blocks);
1773	if (!mq->hotspot_hit_bits) {
1774	DMERR("couldn't allocate hotspot hit bitset");
1775	goto bad_hotspot_hit_bits;
1776	}
1777	clear_bitset(bitset: mq->hotspot_hit_bits, nr_entries: mq->nr_hotspot_blocks);
1778
1779	if (from_cblock(b: cache_size)) {
1780	mq->cache_hit_bits = alloc_bitset(nr_entries: from_cblock(b: cache_size));
1781	if (!mq->cache_hit_bits) {
1782	DMERR("couldn't allocate cache hit bitset");
1783	goto bad_cache_hit_bits;
1784	}
1785	clear_bitset(bitset: mq->cache_hit_bits, nr_entries: from_cblock(b: mq->cache_size));
1786	} else
1787	mq->cache_hit_bits = NULL;
1788
1789	mq->tick = `0`;
1790	spin_lock_init(&mq->lock);
1791
1792	q_init(q: &mq->hotspot, es: &mq->es, NR_HOTSPOT_LEVELS);
1793	mq->hotspot.nr_top_levels = `8`;
1794	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1795	from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1796
1797	q_init(q: &mq->clean, es: &mq->es, NR_CACHE_LEVELS);
1798	q_init(q: &mq->dirty, es: &mq->es, NR_CACHE_LEVELS);
1799
1800	stats_init(s: &mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1801	stats_init(s: &mq->cache_stats, NR_CACHE_LEVELS);
1802
1803	if (h_init(ht: &mq->table, es: &mq->es, nr_entries: from_cblock(b: cache_size)))
1804	goto bad_alloc_table;
1805
1806	if (h_init(ht: &mq->hotspot_table, es: &mq->es, nr_entries: mq->nr_hotspot_blocks))
1807	goto bad_alloc_hotspot_table;
1808
1809	sentinels_init(mq);
1810	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1811
1812	mq->next_hotspot_period = jiffies;
1813	mq->next_cache_period = jiffies;
1814
1815	mq->bg_work = btracker_create(max_work: `4096`); / FIXME: hard coded value /
1816	if (!mq->bg_work)
1817	goto bad_btracker;
1818
1819	mq->migrations_allowed = migrations_allowed;
1820	mq->cleaner = cleaner;
1821
1822	return &mq->policy;
1823
1824	bad_btracker:
1825	h_exit(ht: &mq->hotspot_table);
1826	bad_alloc_hotspot_table:
1827	h_exit(ht: &mq->table);
1828	bad_alloc_table:
1829	free_bitset(bits: mq->cache_hit_bits);
1830	bad_cache_hit_bits:
1831	free_bitset(bits: mq->hotspot_hit_bits);
1832	bad_hotspot_hit_bits:
1833	space_exit(es: &mq->es);
1834	bad_pool_init:
1835	kfree(objp: mq);
1836
1837	return NULL;
1838	}
1839
1840	static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1841	sector_t origin_size,
1842	sector_t cache_block_size)
1843	{
1844	return __smq_create(cache_size, origin_size, cache_block_size,
1845	mimic_mq: false, migrations_allowed: true, cleaner: false);
1846	}
1847
1848	static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1849	sector_t origin_size,
1850	sector_t cache_block_size)
1851	{
1852	return __smq_create(cache_size, origin_size, cache_block_size,
1853	mimic_mq: true, migrations_allowed: true, cleaner: false);
1854	}
1855
1856	static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1857	sector_t origin_size,
1858	sector_t cache_block_size)
1859	{
1860	return __smq_create(cache_size, origin_size, cache_block_size,
1861	mimic_mq: false, migrations_allowed: false, cleaner: true);
1862	}
1863
1864	/----------------------------------------------------------------/
1865
1866	static struct dm_cache_policy_type smq_policy_type = {
1867	.name = "smq",
1868	.version = {`2`, `0`, `0`},
1869	.hint_size = `4`,
1870	.owner = THIS_MODULE,
1871	.create = smq_create
1872	};
1873
1874	static struct dm_cache_policy_type mq_policy_type = {
1875	.name = "mq",
1876	.version = {`2`, `0`, `0`},
1877	.hint_size = `4`,
1878	.owner = THIS_MODULE,
1879	.create = mq_create,
1880	};
1881
1882	static struct dm_cache_policy_type cleaner_policy_type = {
1883	.name = "cleaner",
1884	.version = {`2`, `0`, `0`},
1885	.hint_size = `4`,
1886	.owner = THIS_MODULE,
1887	.create = cleaner_create,
1888	};
1889
1890	static struct dm_cache_policy_type default_policy_type = {
1891	.name = "default",
1892	.version = {`2`, `0`, `0`},
1893	.hint_size = `4`,
1894	.owner = THIS_MODULE,
1895	.create = smq_create,
1896	.real = &smq_policy_type
1897	};
1898
1899	static int __init smq_init(void)
1900	{
1901	int r;
1902
1903	r = dm_cache_policy_register(type: &smq_policy_type);
1904	if (r) {
1905	DMERR("register failed %d", r);
1906	return -ENOMEM;
1907	}
1908
1909	r = dm_cache_policy_register(type: &mq_policy_type);
1910	if (r) {
1911	DMERR("register failed (as mq) %d", r);
1912	goto out_mq;
1913	}
1914
1915	r = dm_cache_policy_register(type: &cleaner_policy_type);
1916	if (r) {
1917	DMERR("register failed (as cleaner) %d", r);
1918	goto out_cleaner;
1919	}
1920
1921	r = dm_cache_policy_register(type: &default_policy_type);
1922	if (r) {
1923	DMERR("register failed (as default) %d", r);
1924	goto out_default;
1925	}
1926
1927	return `0`;
1928
1929	out_default:
1930	dm_cache_policy_unregister(type: &cleaner_policy_type);
1931	out_cleaner:
1932	dm_cache_policy_unregister(type: &mq_policy_type);
1933	out_mq:
1934	dm_cache_policy_unregister(type: &smq_policy_type);
1935
1936	return -ENOMEM;
1937	}
1938
1939	static void __exit smq_exit(void)
1940	{
1941	dm_cache_policy_unregister(type: &cleaner_policy_type);
1942	dm_cache_policy_unregister(type: &smq_policy_type);
1943	dm_cache_policy_unregister(type: &mq_policy_type);
1944	dm_cache_policy_unregister(type: &default_policy_type);
1945	}
1946
1947	module_init(smq_init);
1948	module_exit(smq_exit);
1949
1950	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1951	MODULE_LICENSE("GPL");
1952	MODULE_DESCRIPTION("smq cache policy");
1953
1954	MODULE_ALIAS("dm-cache-default");
1955	MODULE_ALIAS("dm-cache-mq");
1956	MODULE_ALIAS("dm-cache-cleaner");
1957

source code of linux/drivers/md/dm-cache-policy-smq.c