dm-cache-target.c source code [linux/drivers/md/dm-cache-target.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2012 Red Hat. All rights reserved.
4	*
5	* This file is released under the GPL.
6	*/
7
8	#include "dm.h"
9	#include "dm-bio-prison-v2.h"
10	#include "dm-bio-record.h"
11	#include "dm-cache-metadata.h"
12	#include "dm-io-tracker.h"
13
14	#include <linux/dm-io.h>
15	#include <linux/dm-kcopyd.h>
16	#include <linux/jiffies.h>
17	#include <linux/init.h>
18	#include <linux/mempool.h>
19	#include <linux/module.h>
20	#include <linux/rwsem.h>
21	#include <linux/slab.h>
22	#include <linux/vmalloc.h>
23
24	#define DM_MSG_PREFIX "cache"
25
26	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
27	"A percentage of time allocated for copying to and/or from cache");
28
/*----------------------------------------------------------------*/
30
31	/*
32	* Glossary:
33	*
34	* oblock: index of an origin block
35	* cblock: index of a cache block
36	* promotion: movement of a block from origin to cache
37	* demotion: movement of a block from cache to origin
38	* migration: movement of a block between the origin and cache device,
39	* either direction
40	*/
41
/*----------------------------------------------------------------*/
43
/*
 * Represents a chunk of future work.  'input' allows continuations to pass
 * values between themselves, typically error values.
 */
struct continuation {
	/* Work item that is queued on a workqueue to run this continuation. */
	struct work_struct ws;
	/* Status handed to the continuation; 0 means no error was recorded. */
	blk_status_t input;
};
52
53	static inline void init_continuation(struct continuation *k,
54	void (fn)(struct* work_struct *))
55	{
56	INIT_WORK(&k->ws, fn);
57	k->input = `0`;
58	}
59
60	static inline void queue_continuation(struct workqueue_struct *wq,
61	struct continuation *k)
62	{
63	queue_work(wq, work: &k->ws);
64	}
65
/*----------------------------------------------------------------*/
67
68	/*
69	* The batcher collects together pieces of work that need a particular
70	* operation to occur before they can proceed (typically a commit).
71	*/
72	struct batcher {
73	/*
74	* The operation that everyone is waiting for.
75	*/
76	blk_status_t (commit_op)(void* *context);
77	void *commit_context;
78
79	/*
80	* This is how bios should be issued once the commit op is complete
81	* (accounted_request).
82	*/
83	void (issue_op)(struct* bio bio, void* *context);
84	void *issue_context;
85
86	/*
87	* Queued work gets put on here after commit.
88	*/
89	struct workqueue_struct *wq;
90
91	spinlock_t lock;
92	struct list_head work_items;
93	struct bio_list bios;
94	struct work_struct commit_work;
95
96	bool commit_scheduled;
97	};
98
99	static void __commit(struct work_struct *_ws)
100	{
101	struct batcher b = container_of(_ws, struct* batcher, commit_work);
102	blk_status_t r;
103	struct list_head work_items;
104	struct work_struct ws, tmp;
105	struct continuation *k;
106	struct bio *bio;
107	struct bio_list bios;
108
109	INIT_LIST_HEAD(list: &work_items);
110	bio_list_init(bl: &bios);
111
112	/*
113	* We have to grab these before the commit_op to avoid a race
114	* condition.
115	*/
116	spin_lock_irq(lock: &b->lock);
117	list_splice_init(list: &b->work_items, head: &work_items);
118	bio_list_merge(bl: &bios, bl2: &b->bios);
119	bio_list_init(bl: &b->bios);
120	b->commit_scheduled = false;
121	spin_unlock_irq(lock: &b->lock);
122
123	r = b->commit_op(b->commit_context);
124
125	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
126	k = container_of(ws, struct continuation, ws);
127	k->input = r;
128	INIT_LIST_HEAD(list: &ws->entry); / to avoid a WARN_ON /
129	queue_work(wq: b->wq, work: ws);
130	}
131
132	while ((bio = bio_list_pop(bl: &bios))) {
133	if (r) {
134	bio->bi_status = r;
135	bio_endio(bio);
136	} else
137	b->issue_op(bio, b->issue_context);
138	}
139	}
140
141	static void batcher_init(struct batcher *b,
142	blk_status_t (commit_op)(void* *),
143	void *commit_context,
144	void (issue_op)(struct* bio bio, void* *),
145	void *issue_context,
146	struct workqueue_struct *wq)
147	{
148	b->commit_op = commit_op;
149	b->commit_context = commit_context;
150	b->issue_op = issue_op;
151	b->issue_context = issue_context;
152	b->wq = wq;
153
154	spin_lock_init(&b->lock);
155	INIT_LIST_HEAD(list: &b->work_items);
156	bio_list_init(bl: &b->bios);
157	INIT_WORK(&b->commit_work, __commit);
158	b->commit_scheduled = false;
159	}
160
161	static void async_commit(struct batcher *b)
162	{
163	queue_work(wq: b->wq, work: &b->commit_work);
164	}
165
166	static void continue_after_commit(struct batcher b, struct* continuation *k)
167	{
168	bool commit_scheduled;
169
170	spin_lock_irq(lock: &b->lock);
171	commit_scheduled = b->commit_scheduled;
172	list_add_tail(new: &k->ws.entry, head: &b->work_items);
173	spin_unlock_irq(lock: &b->lock);
174
175	if (commit_scheduled)
176	async_commit(b);
177	}
178
179	/*
180	* Bios are errored if commit failed.
181	*/
182	static void issue_after_commit(struct batcher b, struct* bio *bio)
183	{
184	bool commit_scheduled;
185
186	spin_lock_irq(lock: &b->lock);
187	commit_scheduled = b->commit_scheduled;
188	bio_list_add(bl: &b->bios, bio);
189	spin_unlock_irq(lock: &b->lock);
190
191	if (commit_scheduled)
192	async_commit(b);
193	}
194
195	/*
196	* Call this if some urgent work is waiting for the commit to complete.
197	*/
198	static void schedule_commit(struct batcher *b)
199	{
200	bool immediate;
201
202	spin_lock_irq(lock: &b->lock);
203	immediate = !list_empty(head: &b->work_items) \|\| !bio_list_empty(bl: &b->bios);
204	b->commit_scheduled = true;
205	spin_unlock_irq(lock: &b->lock);
206
207	if (immediate)
208	async_commit(b);
209	}
210
/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	/* The bio's original bi_end_io, saved so it can be restored later. */
	bio_end_io_t *bi_end_io;
};
219
220	static void dm_hook_bio(struct dm_hook_info h, struct* bio *bio,
221	bio_end_io_t bi_end_io, void* *bi_private)
222	{
223	h->bi_end_io = bio->bi_end_io;
224
225	bio->bi_end_io = bi_end_io;
226	bio->bi_private = bi_private;
227	}
228
229	static void dm_unhook_bio(struct dm_hook_info h, struct* bio *bio)
230	{
231	bio->bi_end_io = h->bi_end_io;
232	}
233
/*----------------------------------------------------------------*/
235
#define MIGRATION_POOL_SIZE 128
/* HZ jiffies, i.e. one second. */
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.  Both limits are expressed in sectors.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
246
/*
 * Metadata access modes, ordered from least to most restrictive: code in
 * this file tests for "read-only or worse" with '>= CM_READ_ONLY'.
 */
enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};
252
enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};
275
276	struct cache_features {
277	enum cache_metadata_mode mode;
278	enum cache_io_mode io_mode;
279	unsigned int metadata_version;
280	bool discard_passdown:`1`;
281	};
282
/*
 * Event counters for a cache.  All are atomic so they can be updated
 * without taking a lock.
 */
struct cache_stats {
	/* The four hit/miss counters are loaded from and saved to metadata. */
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	/* Incremented by update_stats() for the matching policy operation. */
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	/* Incremented each time set_discard() marks a discard block. */
	atomic_t discard_count;
};
296
297	struct cache {
298	struct dm_target *ti;
299	spinlock_t lock;
300
301	/*
302	* Fields for converting from sectors to blocks.
303	*/
304	int sectors_per_block_shift;
305	sector_t sectors_per_block;
306
307	struct dm_cache_metadata *cmd;
308
309	/*
310	* Metadata is written to this device.
311	*/
312	struct dm_dev *metadata_dev;
313
314	/*
315	* The slower of the two data devices. Typically a spindle.
316	*/
317	struct dm_dev *origin_dev;
318
319	/*
320	* The faster of the two data devices. Typically an SSD.
321	*/
322	struct dm_dev *cache_dev;
323
324	/*
325	* Size of the origin device in _complete_ blocks and native sectors.
326	*/
327	dm_oblock_t origin_blocks;
328	sector_t origin_sectors;
329
330	/*
331	* Size of the cache device in blocks.
332	*/
333	dm_cblock_t cache_size;
334
335	/*
336	* Invalidation fields.
337	*/
338	spinlock_t invalidation_lock;
339	struct list_head invalidation_requests;
340
341	sector_t migration_threshold;
342	wait_queue_head_t migration_wait;
343	atomic_t nr_allocated_migrations;
344
345	/*
346	* The number of in flight migrations that are performing
347	* background io. eg, promotion, writeback.
348	*/
349	atomic_t nr_io_migrations;
350
351	struct bio_list deferred_bios;
352
353	struct rw_semaphore quiesce_lock;
354
355	/*
356	* origin_blocks entries, discarded if set.
357	*/
358	dm_dblock_t discard_nr_blocks;
359	unsigned long *discard_bitset;
360	uint32_t discard_block_size; / a power of 2 times sectors per block /
361
362	/*
363	* Rather than reconstructing the table line for the status we just
364	* save it and regurgitate.
365	*/
366	unsigned int nr_ctr_args;
367	const char **ctr_args;
368
369	struct dm_kcopyd_client *copier;
370	struct work_struct deferred_bio_worker;
371	struct work_struct migration_worker;
372	struct workqueue_struct *wq;
373	struct delayed_work waker;
374	struct dm_bio_prison_v2 *prison;
375
376	/*
377	* cache_size entries, dirty if set
378	*/
379	unsigned long *dirty_bitset;
380	atomic_t nr_dirty;
381
382	unsigned int policy_nr_args;
383	struct dm_cache_policy *policy;
384
385	/*
386	* Cache features such as write-through.
387	*/
388	struct cache_features features;
389
390	struct cache_stats stats;
391
392	bool need_tick_bio:`1`;
393	bool sized:`1`;
394	bool invalidate:`1`;
395	bool commit_requested:`1`;
396	bool loaded_mappings:`1`;
397	bool loaded_discards:`1`;
398
399	struct rw_semaphore background_work_lock;
400
401	struct batcher committer;
402	struct work_struct commit_ws;
403
404	struct dm_io_tracker tracker;
405
406	mempool_t migration_pool;
407
408	struct bio_set bs;
409	};
410
411	struct per_bio_data {
412	bool tick:`1`;
413	unsigned int req_nr:`2`;
414	struct dm_bio_prison_cell_v2 *cell;
415	struct dm_hook_info hook_info;
416	sector_t len;
417	};
418
/*
 * State for one in-flight migration.
 */
struct dm_cache_migration {
	/* Continuation used to chain the steps of the migration. */
	struct continuation k;
	struct cache *cache;

	struct policy_work *op;
	/* When set, this bio is issued in place of a copy (see overwrite()). */
	struct bio *overwrite_bio;
	struct dm_bio_prison_cell_v2 *cell;

	dm_cblock_t invalidate_cblock;
	dm_oblock_t invalidate_oblock;
};
430
/*----------------------------------------------------------------*/
432
433	static bool writethrough_mode(struct cache *cache)
434	{
435	return cache->features.io_mode == CM_IO_WRITETHROUGH;
436	}
437
438	static bool writeback_mode(struct cache *cache)
439	{
440	return cache->features.io_mode == CM_IO_WRITEBACK;
441	}
442
443	static inline bool passthrough_mode(struct cache *cache)
444	{
445	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
446	}
447
/*----------------------------------------------------------------*/
449
450	static void wake_deferred_bio_worker(struct cache *cache)
451	{
452	queue_work(wq: cache->wq, work: &cache->deferred_bio_worker);
453	}
454
455	static void wake_migration_worker(struct cache *cache)
456	{
457	if (passthrough_mode(cache))
458	return;
459
460	queue_work(wq: cache->wq, work: &cache->migration_worker);
461	}
462
/*----------------------------------------------------------------*/
464
465	static struct dm_bio_prison_cell_v2 alloc_prison_cell(struct* cache *cache)
466	{
467	return dm_bio_prison_alloc_cell_v2(prison: cache->prison, GFP_NOIO);
468	}
469
470	static void free_prison_cell(struct cache cache, struct* dm_bio_prison_cell_v2 *cell)
471	{
472	dm_bio_prison_free_cell_v2(prison: cache->prison, cell);
473	}
474
475	static struct dm_cache_migration alloc_migration(struct* cache *cache)
476	{
477	struct dm_cache_migration *mg;
478
479	mg = mempool_alloc(pool: &cache->migration_pool, GFP_NOIO);
480
481	memset(mg, `0`, sizeof(*mg));
482
483	mg->cache = cache;
484	atomic_inc(v: &cache->nr_allocated_migrations);
485
486	return mg;
487	}
488
489	static void free_migration(struct dm_cache_migration *mg)
490	{
491	struct cache *cache = mg->cache;
492
493	if (atomic_dec_and_test(v: &cache->nr_allocated_migrations))
494	wake_up(&cache->migration_wait);
495
496	mempool_free(element: mg, pool: &cache->migration_pool);
497	}
498
/*----------------------------------------------------------------*/
500
501	static inline dm_oblock_t oblock_succ(dm_oblock_t b)
502	{
503	return to_oblock(b: from_oblock(b) + `1ull`);
504	}
505
506	static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
507	{
508	key->virtual = `0`;
509	key->dev = `0`;
510	key->block_begin = from_oblock(b: begin);
511	key->block_end = from_oblock(b: end);
512	}
513
/*
 * We have two lock levels.  Level 0, which is used to prevent WRITEs, and
 * level 1 which prevents both READs and WRITEs.
 *
 * lock_level() maps a bio to the level it takes: WRITE bios take level 0,
 * all other bios take level 1.
 */
#define WRITE_LOCK_LEVEL 0
#define READ_WRITE_LOCK_LEVEL 1
520
521	static unsigned int lock_level(struct bio *bio)
522	{
523	return bio_data_dir(bio) == WRITE ?
524	WRITE_LOCK_LEVEL :
525	READ_WRITE_LOCK_LEVEL;
526	}
527
528	/*
529	*--------------------------------------------------------------
530	* Per bio data
531	*--------------------------------------------------------------
532	*/
533
534	static struct per_bio_data get_per_bio_data(struct* bio *bio)
535	{
536	struct per_bio_data pb = dm_per_bio_data(bio, data_size: sizeof(struct* per_bio_data));
537
538	BUG_ON(!pb);
539	return pb;
540	}
541
542	static struct per_bio_data init_per_bio_data(struct* bio *bio)
543	{
544	struct per_bio_data *pb = get_per_bio_data(bio);
545
546	pb->tick = false;
547	pb->req_nr = dm_bio_get_target_bio_nr(bio);
548	pb->cell = NULL;
549	pb->len = `0`;
550
551	return pb;
552	}
553
/*----------------------------------------------------------------*/
555
556	static void defer_bio(struct cache cache, struct* bio *bio)
557	{
558	spin_lock_irq(lock: &cache->lock);
559	bio_list_add(bl: &cache->deferred_bios, bio);
560	spin_unlock_irq(lock: &cache->lock);
561
562	wake_deferred_bio_worker(cache);
563	}
564
565	static void defer_bios(struct cache cache, struct* bio_list *bios)
566	{
567	spin_lock_irq(lock: &cache->lock);
568	bio_list_merge(bl: &cache->deferred_bios, bl2: bios);
569	bio_list_init(bl: bios);
570	spin_unlock_irq(lock: &cache->lock);
571
572	wake_deferred_bio_worker(cache);
573	}
574
/*----------------------------------------------------------------*/
576
577	static bool bio_detain_shared(struct cache cache, dm_oblock_t oblock, struct* bio *bio)
578	{
579	bool r;
580	struct per_bio_data *pb;
581	struct dm_cell_key_v2 key;
582	dm_oblock_t end = to_oblock(b: from_oblock(b: oblock) + `1ULL`);
583	struct dm_bio_prison_cell_v2 cell_prealloc, cell;
584
585	cell_prealloc = alloc_prison_cell(cache); / FIXME: allow wait if calling from worker /
586
587	build_key(begin: oblock, end, key: &key);
588	r = dm_cell_get_v2(prison: cache->prison, key: &key, lock_level: lock_level(bio), inmate: bio, cell_prealloc, cell_result: &cell);
589	if (!r) {
590	/*
591	* Failed to get the lock.
592	*/
593	free_prison_cell(cache, cell: cell_prealloc);
594	return r;
595	}
596
597	if (cell != cell_prealloc)
598	free_prison_cell(cache, cell: cell_prealloc);
599
600	pb = get_per_bio_data(bio);
601	pb->cell = cell;
602
603	return r;
604	}
605
/*----------------------------------------------------------------*/
607
608	static bool is_dirty(struct cache *cache, dm_cblock_t b)
609	{
610	return test_bit(from_cblock(b), cache->dirty_bitset);
611	}
612
613	static void set_dirty(struct cache *cache, dm_cblock_t cblock)
614	{
615	if (!test_and_set_bit(nr: from_cblock(b: cblock), addr: cache->dirty_bitset)) {
616	atomic_inc(v: &cache->nr_dirty);
617	policy_set_dirty(p: cache->policy, cblock);
618	}
619	}
620
621	/*
622	* These two are called when setting after migrations to force the policy
623	* and dirty bitset to be in sync.
624	*/
625	static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
626	{
627	if (!test_and_set_bit(nr: from_cblock(b: cblock), addr: cache->dirty_bitset))
628	atomic_inc(v: &cache->nr_dirty);
629	policy_set_dirty(p: cache->policy, cblock);
630	}
631
632	static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
633	{
634	if (test_and_clear_bit(nr: from_cblock(b: cblock), addr: cache->dirty_bitset)) {
635	if (atomic_dec_return(v: &cache->nr_dirty) == `0`)
636	dm_table_event(t: cache->ti->table);
637	}
638
639	policy_clear_dirty(p: cache->policy, cblock);
640	}
641
/*----------------------------------------------------------------*/
643
644	static bool block_size_is_power_of_two(struct cache *cache)
645	{
646	return cache->sectors_per_block_shift >= `0`;
647	}
648
649	static dm_block_t block_div(dm_block_t b, uint32_t n)
650	{
651	do_div(b, n);
652
653	return b;
654	}
655
656	static dm_block_t oblocks_per_dblock(struct cache *cache)
657	{
658	dm_block_t oblocks = cache->discard_block_size;
659
660	if (block_size_is_power_of_two(cache))
661	oblocks >>= cache->sectors_per_block_shift;
662	else
663	oblocks = block_div(b: oblocks, n: cache->sectors_per_block);
664
665	return oblocks;
666	}
667
668	static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
669	{
670	return to_dblock(b: block_div(b: from_oblock(b: oblock),
671	n: oblocks_per_dblock(cache)));
672	}
673
674	static void set_discard(struct cache *cache, dm_dblock_t b)
675	{
676	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
677	atomic_inc(v: &cache->stats.discard_count);
678
679	spin_lock_irq(lock: &cache->lock);
680	set_bit(nr: from_dblock(b), addr: cache->discard_bitset);
681	spin_unlock_irq(lock: &cache->lock);
682	}
683
684	static void clear_discard(struct cache *cache, dm_dblock_t b)
685	{
686	spin_lock_irq(lock: &cache->lock);
687	clear_bit(nr: from_dblock(b), addr: cache->discard_bitset);
688	spin_unlock_irq(lock: &cache->lock);
689	}
690
691	static bool is_discarded(struct cache *cache, dm_dblock_t b)
692	{
693	int r;
694
695	spin_lock_irq(lock: &cache->lock);
696	r = test_bit(from_dblock(b), cache->discard_bitset);
697	spin_unlock_irq(lock: &cache->lock);
698
699	return r;
700	}
701
702	static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
703	{
704	int r;
705
706	spin_lock_irq(lock: &cache->lock);
707	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
708	cache->discard_bitset);
709	spin_unlock_irq(lock: &cache->lock);
710
711	return r;
712	}
713
714	/*
715	* -------------------------------------------------------------
716	* Remapping
717	*--------------------------------------------------------------
718	*/
719	static void remap_to_origin(struct cache cache, struct* bio *bio)
720	{
721	bio_set_dev(bio, bdev: cache->origin_dev->bdev);
722	}
723
724	static void remap_to_cache(struct cache cache, struct* bio *bio,
725	dm_cblock_t cblock)
726	{
727	sector_t bi_sector = bio->bi_iter.bi_sector;
728	sector_t block = from_cblock(b: cblock);
729
730	bio_set_dev(bio, bdev: cache->cache_dev->bdev);
731	if (!block_size_is_power_of_two(cache))
732	bio->bi_iter.bi_sector =
733	(block * cache->sectors_per_block) +
734	sector_div(bi_sector, cache->sectors_per_block);
735	else
736	bio->bi_iter.bi_sector =
737	(block << cache->sectors_per_block_shift) \|
738	(bi_sector & (cache->sectors_per_block - `1`));
739	}
740
741	static void check_if_tick_bio_needed(struct cache cache, struct* bio *bio)
742	{
743	struct per_bio_data *pb;
744
745	spin_lock_irq(lock: &cache->lock);
746	if (cache->need_tick_bio && !op_is_flush(op: bio->bi_opf) &&
747	bio_op(bio) != REQ_OP_DISCARD) {
748	pb = get_per_bio_data(bio);
749	pb->tick = true;
750	cache->need_tick_bio = false;
751	}
752	spin_unlock_irq(lock: &cache->lock);
753	}
754
755	static void remap_to_origin_clear_discard(struct cache cache, struct* bio *bio,
756	dm_oblock_t oblock)
757	{
758	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
759	check_if_tick_bio_needed(cache, bio);
760	remap_to_origin(cache, bio);
761	if (bio_data_dir(bio) == WRITE)
762	clear_discard(cache, b: oblock_to_dblock(cache, oblock));
763	}
764
765	static void remap_to_cache_dirty(struct cache cache, struct* bio *bio,
766	dm_oblock_t oblock, dm_cblock_t cblock)
767	{
768	check_if_tick_bio_needed(cache, bio);
769	remap_to_cache(cache, bio, cblock);
770	if (bio_data_dir(bio) == WRITE) {
771	set_dirty(cache, cblock);
772	clear_discard(cache, b: oblock_to_dblock(cache, oblock));
773	}
774	}
775
776	static dm_oblock_t get_bio_block(struct cache cache, struct* bio *bio)
777	{
778	sector_t block_nr = bio->bi_iter.bi_sector;
779
780	if (!block_size_is_power_of_two(cache))
781	(void) sector_div(block_nr, cache->sectors_per_block);
782	else
783	block_nr >>= cache->sectors_per_block_shift;
784
785	return to_oblock(b: block_nr);
786	}
787
788	static bool accountable_bio(struct cache cache, struct* bio *bio)
789	{
790	return bio_op(bio) != REQ_OP_DISCARD;
791	}
792
793	static void accounted_begin(struct cache cache, struct* bio *bio)
794	{
795	struct per_bio_data *pb;
796
797	if (accountable_bio(cache, bio)) {
798	pb = get_per_bio_data(bio);
799	pb->len = bio_sectors(bio);
800	dm_iot_io_begin(iot: &cache->tracker, len: pb->len);
801	}
802	}
803
804	static void accounted_complete(struct cache cache, struct* bio *bio)
805	{
806	struct per_bio_data *pb = get_per_bio_data(bio);
807
808	dm_iot_io_end(iot: &cache->tracker, len: pb->len);
809	}
810
811	static void accounted_request(struct cache cache, struct* bio *bio)
812	{
813	accounted_begin(cache, bio);
814	dm_submit_bio_remap(clone: bio, NULL);
815	}
816
/*
 * Batcher issue callback: @context is the struct cache; the bio is
 * submitted with accounting.
 */
static void issue_op(struct bio *bio, void *context)
{
	struct cache *cache = context;

	accounted_request(cache, bio);
}
823
824	/*
825	* When running in writethrough mode we need to send writes to clean blocks
826	* to both the cache and origin devices. Clone the bio and send them in parallel.
827	*/
828	static void remap_to_origin_and_cache(struct cache cache, struct* bio *bio,
829	dm_oblock_t oblock, dm_cblock_t cblock)
830	{
831	struct bio *origin_bio = bio_alloc_clone(bdev: cache->origin_dev->bdev, bio_src: bio,
832	GFP_NOIO, bs: &cache->bs);
833
834	BUG_ON(!origin_bio);
835
836	bio_chain(origin_bio, bio);
837
838	if (bio_data_dir(origin_bio) == WRITE)
839	clear_discard(cache, b: oblock_to_dblock(cache, oblock));
840	submit_bio(bio: origin_bio);
841
842	remap_to_cache(cache, bio, cblock);
843	}
844
845	/*
846	*--------------------------------------------------------------
847	* Failure modes
848	*--------------------------------------------------------------
849	*/
850	static enum cache_metadata_mode get_cache_mode(struct cache *cache)
851	{
852	return cache->features.mode;
853	}
854
855	static const char cache_device_name(struct* cache *cache)
856	{
857	return dm_table_device_name(t: cache->ti->table);
858	}
859
860	static void notify_mode_switch(struct cache cache, enum* cache_metadata_mode mode)
861	{
862	static const char *descs[] = {
863	"write",
864	"read-only",
865	"fail"
866	};
867
868	dm_table_event(t: cache->ti->table);
869	DMINFO("%s: switching cache to %s mode",
870	cache_device_name(cache), descs[(int)mode]);
871	}
872
873	static void set_cache_mode(struct cache cache, enum* cache_metadata_mode new_mode)
874	{
875	bool needs_check;
876	enum cache_metadata_mode old_mode = get_cache_mode(cache);
877
878	if (dm_cache_metadata_needs_check(cmd: cache->cmd, result: &needs_check)) {
879	DMERR("%s: unable to read needs_check flag, setting failure mode.",
880	cache_device_name(cache));
881	new_mode = CM_FAIL;
882	}
883
884	if (new_mode == CM_WRITE && needs_check) {
885	DMERR("%s: unable to switch cache to write mode until repaired.",
886	cache_device_name(cache));
887	if (old_mode != new_mode)
888	new_mode = old_mode;
889	else
890	new_mode = CM_READ_ONLY;
891	}
892
893	/ Never move out of fail mode /
894	if (old_mode == CM_FAIL)
895	new_mode = CM_FAIL;
896
897	switch (new_mode) {
898	case CM_FAIL:
899	case CM_READ_ONLY:
900	dm_cache_metadata_set_read_only(cmd: cache->cmd);
901	break;
902
903	case CM_WRITE:
904	dm_cache_metadata_set_read_write(cmd: cache->cmd);
905	break;
906	}
907
908	cache->features.mode = new_mode;
909
910	if (new_mode != old_mode)
911	notify_mode_switch(cache, mode: new_mode);
912	}
913
914	static void abort_transaction(struct cache *cache)
915	{
916	const char *dev_name = cache_device_name(cache);
917
918	if (get_cache_mode(cache) >= CM_READ_ONLY)
919	return;
920
921	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
922	if (dm_cache_metadata_abort(cmd: cache->cmd)) {
923	DMERR("%s: failed to abort metadata transaction", dev_name);
924	set_cache_mode(cache, new_mode: CM_FAIL);
925	}
926
927	if (dm_cache_metadata_set_needs_check(cmd: cache->cmd)) {
928	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
929	set_cache_mode(cache, new_mode: CM_FAIL);
930	}
931	}
932
933	static void metadata_operation_failed(struct cache cache, const* char op, int* r)
934	{
935	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
936	cache_device_name(cache), op, r);
937	abort_transaction(cache);
938	set_cache_mode(cache, new_mode: CM_READ_ONLY);
939	}
940
/*----------------------------------------------------------------*/
942
943	static void load_stats(struct cache *cache)
944	{
945	struct dm_cache_statistics stats;
946
947	dm_cache_metadata_get_stats(cmd: cache->cmd, stats: &stats);
948	atomic_set(v: &cache->stats.read_hit, i: stats.read_hits);
949	atomic_set(v: &cache->stats.read_miss, i: stats.read_misses);
950	atomic_set(v: &cache->stats.write_hit, i: stats.write_hits);
951	atomic_set(v: &cache->stats.write_miss, i: stats.write_misses);
952	}
953
954	static void save_stats(struct cache *cache)
955	{
956	struct dm_cache_statistics stats;
957
958	if (get_cache_mode(cache) >= CM_READ_ONLY)
959	return;
960
961	stats.read_hits = atomic_read(v: &cache->stats.read_hit);
962	stats.read_misses = atomic_read(v: &cache->stats.read_miss);
963	stats.write_hits = atomic_read(v: &cache->stats.write_hit);
964	stats.write_misses = atomic_read(v: &cache->stats.write_miss);
965
966	dm_cache_metadata_set_stats(cmd: cache->cmd, stats: &stats);
967	}
968
969	static void update_stats(struct cache_stats stats, enum* policy_operation op)
970	{
971	switch (op) {
972	case POLICY_PROMOTE:
973	atomic_inc(v: &stats->promotion);
974	break;
975
976	case POLICY_DEMOTE:
977	atomic_inc(v: &stats->demotion);
978	break;
979
980	case POLICY_WRITEBACK:
981	atomic_inc(v: &stats->writeback);
982	break;
983	}
984	}
985
986	/*
987	*---------------------------------------------------------------------
988	* Migration processing
989	*
990	* Migration covers moving data from the origin device to the cache, or
991	* vice versa.
992	*---------------------------------------------------------------------
993	*/
994	static void inc_io_migrations(struct cache *cache)
995	{
996	atomic_inc(v: &cache->nr_io_migrations);
997	}
998
999	static void dec_io_migrations(struct cache *cache)
1000	{
1001	atomic_dec(v: &cache->nr_io_migrations);
1002	}
1003
1004	static bool discard_or_flush(struct bio *bio)
1005	{
1006	return bio_op(bio) == REQ_OP_DISCARD \|\| op_is_flush(op: bio->bi_opf);
1007	}
1008
1009	static void calc_discard_block_range(struct cache cache, struct* bio *bio,
1010	dm_dblock_t b, dm_dblock_t e)
1011	{
1012	sector_t sb = bio->bi_iter.bi_sector;
1013	sector_t se = bio_end_sector(bio);
1014
1015	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1016
1017	if (se - sb < cache->discard_block_size)
1018	e = b;
1019	else
1020	*e = to_dblock(b: block_div(b: se, n: cache->discard_block_size));
1021	}
1022
/*----------------------------------------------------------------*/
1024
1025	static void prevent_background_work(struct cache *cache)
1026	{
1027	lockdep_off();
1028	down_write(sem: &cache->background_work_lock);
1029	lockdep_on();
1030	}
1031
1032	static void allow_background_work(struct cache *cache)
1033	{
1034	lockdep_off();
1035	up_write(sem: &cache->background_work_lock);
1036	lockdep_on();
1037	}
1038
1039	static bool background_work_begin(struct cache *cache)
1040	{
1041	bool r;
1042
1043	lockdep_off();
1044	r = down_read_trylock(sem: &cache->background_work_lock);
1045	lockdep_on();
1046
1047	return r;
1048	}
1049
1050	static void background_work_end(struct cache *cache)
1051	{
1052	lockdep_off();
1053	up_read(sem: &cache->background_work_lock);
1054	lockdep_on();
1055	}
1056
/*----------------------------------------------------------------*/
1058
1059	static bool bio_writes_complete_block(struct cache cache, struct* bio *bio)
1060	{
1061	return (bio_data_dir(bio) == WRITE) &&
1062	(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1063	}
1064
1065	static bool optimisable_bio(struct cache cache, struct* bio *bio, dm_oblock_t block)
1066	{
1067	return writeback_mode(cache) &&
1068	(is_discarded_oblock(cache, b: block) \|\| bio_writes_complete_block(cache, bio));
1069	}
1070
1071	static void quiesce(struct dm_cache_migration *mg,
1072	void (continuation)(struct* work_struct *))
1073	{
1074	init_continuation(k: &mg->k, fn: continuation);
1075	dm_cell_quiesce_v2(prison: mg->cache->prison, cell: mg->cell, continuation: &mg->k.ws);
1076	}
1077
1078	static struct dm_cache_migration ws_to_mg(struct* work_struct *ws)
1079	{
1080	struct continuation k = container_of(ws, struct* continuation, ws);
1081
1082	return container_of(k, struct dm_cache_migration, k);
1083	}
1084
1085	static void copy_complete(int read_err, unsigned long write_err, void *context)
1086	{
1087	struct dm_cache_migration mg = container_of(context, struct* dm_cache_migration, k);
1088
1089	if (read_err \|\| write_err)
1090	mg->k.input = BLK_STS_IOERR;
1091
1092	queue_continuation(wq: mg->cache->wq, k: &mg->k);
1093	}
1094
1095	static void copy(struct dm_cache_migration *mg, bool promote)
1096	{
1097	struct dm_io_region o_region, c_region;
1098	struct cache *cache = mg->cache;
1099
1100	o_region.bdev = cache->origin_dev->bdev;
1101	o_region.sector = from_oblock(b: mg->op->oblock) * cache->sectors_per_block;
1102	o_region.count = cache->sectors_per_block;
1103
1104	c_region.bdev = cache->cache_dev->bdev;
1105	c_region.sector = from_cblock(b: mg->op->cblock) * cache->sectors_per_block;
1106	c_region.count = cache->sectors_per_block;
1107
1108	if (promote)
1109	dm_kcopyd_copy(kc: cache->copier, from: &o_region, num_dests: `1`, dests: &c_region, flags: `0`, fn: copy_complete, context: &mg->k);
1110	else
1111	dm_kcopyd_copy(kc: cache->copier, from: &c_region, num_dests: `1`, dests: &o_region, flags: `0`, fn: copy_complete, context: &mg->k);
1112	}
1113
1114	static void bio_drop_shared_lock(struct cache cache, struct* bio *bio)
1115	{
1116	struct per_bio_data *pb = get_per_bio_data(bio);
1117
1118	if (pb->cell && dm_cell_put_v2(prison: cache->prison, cell: pb->cell))
1119	free_prison_cell(cache, cell: pb->cell);
1120	pb->cell = NULL;
1121	}
1122
1123	static void overwrite_endio(struct bio *bio)
1124	{
1125	struct dm_cache_migration *mg = bio->bi_private;
1126	struct cache *cache = mg->cache;
1127	struct per_bio_data *pb = get_per_bio_data(bio);
1128
1129	dm_unhook_bio(h: &pb->hook_info, bio);
1130
1131	if (bio->bi_status)
1132	mg->k.input = bio->bi_status;
1133
1134	queue_continuation(wq: cache->wq, k: &mg->k);
1135	}
1136
1137	static void overwrite(struct dm_cache_migration *mg,
1138	void (continuation)(struct* work_struct *))
1139	{
1140	struct bio *bio = mg->overwrite_bio;
1141	struct per_bio_data *pb = get_per_bio_data(bio);
1142
1143	dm_hook_bio(h: &pb->hook_info, bio, bi_end_io: overwrite_endio, bi_private: mg);
1144
1145	/*
1146	* The overwrite bio is part of the copy operation, as such it does
1147	* not set/clear discard or dirty flags.
1148	*/
1149	if (mg->op->op == POLICY_PROMOTE)
1150	remap_to_cache(cache: mg->cache, bio, cblock: mg->op->cblock);
1151	else
1152	remap_to_origin(cache: mg->cache, bio);
1153
1154	init_continuation(k: &mg->k, fn: continuation);
1155	accounted_request(cache: mg->cache, bio);
1156	}
1157
1158	/*
1159	* Migration steps:
1160	*
1161	* 1) exclusive lock preventing WRITEs
1162	* 2) quiesce
1163	* 3) copy or issue overwrite bio
1164	* 4) upgrade to exclusive lock preventing READs and WRITEs
1165	* 5) quiesce
1166	* 6) update metadata and commit
1167	* 7) unlock
1168	*/
1169	static void mg_complete(struct dm_cache_migration *mg, bool success)
1170	{
1171	struct bio_list bios;
1172	struct cache *cache = mg->cache;
1173	struct policy_work *op = mg->op;
1174	dm_cblock_t cblock = op->cblock;
1175
1176	if (success)
1177	update_stats(stats: &cache->stats, op: op->op);
1178
1179	switch (op->op) {
1180	case POLICY_PROMOTE:
1181	clear_discard(cache, b: oblock_to_dblock(cache, oblock: op->oblock));
1182	policy_complete_background_work(p: cache->policy, work: op, success);
1183
1184	if (mg->overwrite_bio) {
1185	if (success)
1186	force_set_dirty(cache, cblock);
1187	else if (mg->k.input)
1188	mg->overwrite_bio->bi_status = mg->k.input;
1189	else
1190	mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1191	bio_endio(mg->overwrite_bio);
1192	} else {
1193	if (success)
1194	force_clear_dirty(cache, cblock);
1195	dec_io_migrations(cache);
1196	}
1197	break;
1198
1199	case POLICY_DEMOTE:
1200	/*
1201	* We clear dirty here to update the nr_dirty counter.
1202	*/
1203	if (success)
1204	force_clear_dirty(cache, cblock);
1205	policy_complete_background_work(p: cache->policy, work: op, success);
1206	dec_io_migrations(cache);
1207	break;
1208
1209	case POLICY_WRITEBACK:
1210	if (success)
1211	force_clear_dirty(cache, cblock);
1212	policy_complete_background_work(p: cache->policy, work: op, success);
1213	dec_io_migrations(cache);
1214	break;
1215	}
1216
1217	bio_list_init(bl: &bios);
1218	if (mg->cell) {
1219	if (dm_cell_unlock_v2(prison: cache->prison, cell: mg->cell, bios: &bios))
1220	free_prison_cell(cache, cell: mg->cell);
1221	}
1222
1223	free_migration(mg);
1224	defer_bios(cache, bios: &bios);
1225	wake_migration_worker(cache);
1226
1227	background_work_end(cache);
1228	}
1229
1230	static void mg_success(struct work_struct *ws)
1231	{
1232	struct dm_cache_migration *mg = ws_to_mg(ws);
1233
1234	mg_complete(mg, success: mg->k.input == `0`);
1235	}
1236
1237	static void mg_update_metadata(struct work_struct *ws)
1238	{
1239	int r;
1240	struct dm_cache_migration *mg = ws_to_mg(ws);
1241	struct cache *cache = mg->cache;
1242	struct policy_work *op = mg->op;
1243
1244	switch (op->op) {
1245	case POLICY_PROMOTE:
1246	r = dm_cache_insert_mapping(cmd: cache->cmd, cblock: op->cblock, oblock: op->oblock);
1247	if (r) {
1248	DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1249	cache_device_name(cache));
1250	metadata_operation_failed(cache, op: "dm_cache_insert_mapping", r);
1251
1252	mg_complete(mg, success: false);
1253	return;
1254	}
1255	mg_complete(mg, success: true);
1256	break;
1257
1258	case POLICY_DEMOTE:
1259	r = dm_cache_remove_mapping(cmd: cache->cmd, cblock: op->cblock);
1260	if (r) {
1261	DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1262	cache_device_name(cache));
1263	metadata_operation_failed(cache, op: "dm_cache_remove_mapping", r);
1264
1265	mg_complete(mg, success: false);
1266	return;
1267	}
1268
1269	/*
1270	* It would be nice if we only had to commit when a REQ_FLUSH
1271	* comes through. But there's one scenario that we have to
1272	* look out for:
1273	*
1274	* - vblock x in a cache block
1275	* - domotion occurs
1276	* - cache block gets reallocated and over written
1277	* - crash
1278	*
1279	* When we recover, because there was no commit the cache will
1280	* rollback to having the data for vblock x in the cache block.
1281	* But the cache block has since been overwritten, so it'll end
1282	* up pointing to data that was never in 'x' during the history
1283	* of the device.
1284	*
1285	* To avoid this issue we require a commit as part of the
1286	* demotion operation.
1287	*/
1288	init_continuation(k: &mg->k, fn: mg_success);
1289	continue_after_commit(b: &cache->committer, k: &mg->k);
1290	schedule_commit(b: &cache->committer);
1291	break;
1292
1293	case POLICY_WRITEBACK:
1294	mg_complete(mg, success: true);
1295	break;
1296	}
1297	}
1298
1299	static void mg_update_metadata_after_copy(struct work_struct *ws)
1300	{
1301	struct dm_cache_migration *mg = ws_to_mg(ws);
1302
1303	/*
1304	* Did the copy succeed?
1305	*/
1306	if (mg->k.input)
1307	mg_complete(mg, success: false);
1308	else
1309	mg_update_metadata(ws);
1310	}
1311
1312	static void mg_upgrade_lock(struct work_struct *ws)
1313	{
1314	int r;
1315	struct dm_cache_migration *mg = ws_to_mg(ws);
1316
1317	/*
1318	* Did the copy succeed?
1319	*/
1320	if (mg->k.input)
1321	mg_complete(mg, success: false);
1322
1323	else {
1324	/*
1325	* Now we want the lock to prevent both reads and writes.
1326	*/
1327	r = dm_cell_lock_promote_v2(prison: mg->cache->prison, cell: mg->cell,
1328	READ_WRITE_LOCK_LEVEL);
1329	if (r < `0`)
1330	mg_complete(mg, success: false);
1331
1332	else if (r)
1333	quiesce(mg, continuation: mg_update_metadata);
1334
1335	else
1336	mg_update_metadata(ws);
1337	}
1338	}
1339
1340	static void mg_full_copy(struct work_struct *ws)
1341	{
1342	struct dm_cache_migration *mg = ws_to_mg(ws);
1343	struct cache *cache = mg->cache;
1344	struct policy_work *op = mg->op;
1345	bool is_policy_promote = (op->op == POLICY_PROMOTE);
1346
1347	if ((!is_policy_promote && !is_dirty(cache, b: op->cblock)) \|\|
1348	is_discarded_oblock(cache, b: op->oblock)) {
1349	mg_upgrade_lock(ws);
1350	return;
1351	}
1352
1353	init_continuation(k: &mg->k, fn: mg_upgrade_lock);
1354	copy(mg, promote: is_policy_promote);
1355	}
1356
1357	static void mg_copy(struct work_struct *ws)
1358	{
1359	struct dm_cache_migration *mg = ws_to_mg(ws);
1360
1361	if (mg->overwrite_bio) {
1362	/*
1363	* No exclusive lock was held when we last checked if the bio
1364	* was optimisable. So we have to check again in case things
1365	* have changed (eg, the block may no longer be discarded).
1366	*/
1367	if (!optimisable_bio(cache: mg->cache, bio: mg->overwrite_bio, block: mg->op->oblock)) {
1368	/*
1369	* Fallback to a real full copy after doing some tidying up.
1370	*/
1371	bool rb = bio_detain_shared(cache: mg->cache, oblock: mg->op->oblock, bio: mg->overwrite_bio);
1372
1373	BUG_ON(rb); / An exclussive lock must _not_ be held for this block /
1374	mg->overwrite_bio = NULL;
1375	inc_io_migrations(cache: mg->cache);
1376	mg_full_copy(ws);
1377	return;
1378	}
1379
1380	/*
1381	* It's safe to do this here, even though it's new data
1382	* because all IO has been locked out of the block.
1383	*
1384	* mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
1385	* so _not_ using mg_upgrade_lock() as continutation.
1386	*/
1387	overwrite(mg, continuation: mg_update_metadata_after_copy);
1388
1389	} else
1390	mg_full_copy(ws);
1391	}
1392
1393	static int mg_lock_writes(struct dm_cache_migration *mg)
1394	{
1395	int r;
1396	struct dm_cell_key_v2 key;
1397	struct cache *cache = mg->cache;
1398	struct dm_bio_prison_cell_v2 *prealloc;
1399
1400	prealloc = alloc_prison_cell(cache);
1401
1402	/*
1403	* Prevent writes to the block, but allow reads to continue.
1404	* Unless we're using an overwrite bio, in which case we lock
1405	* everything.
1406	*/
1407	build_key(begin: mg->op->oblock, end: oblock_succ(b: mg->op->oblock), key: &key);
1408	r = dm_cell_lock_v2(prison: cache->prison, key: &key,
1409	lock_level: mg->overwrite_bio ? READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1410	cell_prealloc: prealloc, cell_result: &mg->cell);
1411	if (r < `0`) {
1412	free_prison_cell(cache, cell: prealloc);
1413	mg_complete(mg, success: false);
1414	return r;
1415	}
1416
1417	if (mg->cell != prealloc)
1418	free_prison_cell(cache, cell: prealloc);
1419
1420	if (r == `0`)
1421	mg_copy(ws: &mg->k.ws);
1422	else
1423	quiesce(mg, continuation: mg_copy);
1424
1425	return `0`;
1426	}
1427
1428	static int mg_start(struct cache cache, struct* policy_work op, struct* bio *bio)
1429	{
1430	struct dm_cache_migration *mg;
1431
1432	if (!background_work_begin(cache)) {
1433	policy_complete_background_work(p: cache->policy, work: op, success: false);
1434	return -EPERM;
1435	}
1436
1437	mg = alloc_migration(cache);
1438
1439	mg->op = op;
1440	mg->overwrite_bio = bio;
1441
1442	if (!bio)
1443	inc_io_migrations(cache);
1444
1445	return mg_lock_writes(mg);
1446	}
1447
1448	/*
1449	*--------------------------------------------------------------
1450	* invalidation processing
1451	*--------------------------------------------------------------
1452	*/
1453
1454	static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1455	{
1456	struct bio_list bios;
1457	struct cache *cache = mg->cache;
1458
1459	bio_list_init(bl: &bios);
1460	if (dm_cell_unlock_v2(prison: cache->prison, cell: mg->cell, bios: &bios))
1461	free_prison_cell(cache, cell: mg->cell);
1462
1463	if (!success && mg->overwrite_bio)
1464	bio_io_error(bio: mg->overwrite_bio);
1465
1466	free_migration(mg);
1467	defer_bios(cache, bios: &bios);
1468
1469	background_work_end(cache);
1470	}
1471
1472	static void invalidate_completed(struct work_struct *ws)
1473	{
1474	struct dm_cache_migration *mg = ws_to_mg(ws);
1475
1476	invalidate_complete(mg, success: !mg->k.input);
1477	}
1478
1479	static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1480	{
1481	int r;
1482
1483	r = policy_invalidate_mapping(p: cache->policy, cblock);
1484	if (!r) {
1485	r = dm_cache_remove_mapping(cmd: cache->cmd, cblock);
1486	if (r) {
1487	DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1488	cache_device_name(cache));
1489	metadata_operation_failed(cache, op: "dm_cache_remove_mapping", r);
1490	}
1491
1492	} else if (r == -ENODATA) {
1493	/*
1494	* Harmless, already unmapped.
1495	*/
1496	r = `0`;
1497
1498	} else
1499	DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1500
1501	return r;
1502	}
1503
1504	static void invalidate_remove(struct work_struct *ws)
1505	{
1506	int r;
1507	struct dm_cache_migration *mg = ws_to_mg(ws);
1508	struct cache *cache = mg->cache;
1509
1510	r = invalidate_cblock(cache, cblock: mg->invalidate_cblock);
1511	if (r) {
1512	invalidate_complete(mg, success: false);
1513	return;
1514	}
1515
1516	init_continuation(k: &mg->k, fn: invalidate_completed);
1517	continue_after_commit(b: &cache->committer, k: &mg->k);
1518	remap_to_origin_clear_discard(cache, bio: mg->overwrite_bio, oblock: mg->invalidate_oblock);
1519	mg->overwrite_bio = NULL;
1520	schedule_commit(b: &cache->committer);
1521	}
1522
1523	static int invalidate_lock(struct dm_cache_migration *mg)
1524	{
1525	int r;
1526	struct dm_cell_key_v2 key;
1527	struct cache *cache = mg->cache;
1528	struct dm_bio_prison_cell_v2 *prealloc;
1529
1530	prealloc = alloc_prison_cell(cache);
1531
1532	build_key(begin: mg->invalidate_oblock, end: oblock_succ(b: mg->invalidate_oblock), key: &key);
1533	r = dm_cell_lock_v2(prison: cache->prison, key: &key,
1534	READ_WRITE_LOCK_LEVEL, cell_prealloc: prealloc, cell_result: &mg->cell);
1535	if (r < `0`) {
1536	free_prison_cell(cache, cell: prealloc);
1537	invalidate_complete(mg, success: false);
1538	return r;
1539	}
1540
1541	if (mg->cell != prealloc)
1542	free_prison_cell(cache, cell: prealloc);
1543
1544	if (r)
1545	quiesce(mg, continuation: invalidate_remove);
1546
1547	else {
1548	/*
1549	* We can't call invalidate_remove() directly here because we
1550	* might still be in request context.
1551	*/
1552	init_continuation(k: &mg->k, fn: invalidate_remove);
1553	queue_work(wq: cache->wq, work: &mg->k.ws);
1554	}
1555
1556	return `0`;
1557	}
1558
1559	static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1560	dm_oblock_t oblock, struct bio *bio)
1561	{
1562	struct dm_cache_migration *mg;
1563
1564	if (!background_work_begin(cache))
1565	return -EPERM;
1566
1567	mg = alloc_migration(cache);
1568
1569	mg->overwrite_bio = bio;
1570	mg->invalidate_cblock = cblock;
1571	mg->invalidate_oblock = oblock;
1572
1573	return invalidate_lock(mg);
1574	}
1575
1576	/*
1577	*--------------------------------------------------------------
1578	* bio processing
1579	*--------------------------------------------------------------
1580	*/
1581
/*
 * Result of spare_migration_bandwidth(): whether there is currently room
 * for more background migration work.
 */
enum busy {
	IDLE,
	BUSY
};
1586
1587	static enum busy spare_migration_bandwidth(struct cache *cache)
1588	{
1589	bool idle = dm_iot_idle_for(iot: &cache->tracker, HZ);
1590	sector_t current_volume = (atomic_read(v: &cache->nr_io_migrations) + `1`) *
1591	cache->sectors_per_block;
1592
1593	if (idle && current_volume <= cache->migration_threshold)
1594	return IDLE;
1595	else
1596	return BUSY;
1597	}
1598
1599	static void inc_hit_counter(struct cache cache, struct* bio *bio)
1600	{
1601	atomic_inc(bio_data_dir(bio) == READ ?
1602	&cache->stats.read_hit : &cache->stats.write_hit);
1603	}
1604
1605	static void inc_miss_counter(struct cache cache, struct* bio *bio)
1606	{
1607	atomic_inc(bio_data_dir(bio) == READ ?
1608	&cache->stats.read_miss : &cache->stats.write_miss);
1609	}
1610
/*----------------------------------------------------------------*/
1612
1613	static int map_bio(struct cache cache, struct* bio *bio, dm_oblock_t block,
1614	bool *commit_needed)
1615	{
1616	int r, data_dir;
1617	bool rb, background_queued;
1618	dm_cblock_t cblock;
1619
1620	*commit_needed = false;
1621
1622	rb = bio_detain_shared(cache, oblock: block, bio);
1623	if (!rb) {
1624	/*
1625	* An exclusive lock is held for this block, so we have to
1626	* wait. We set the commit_needed flag so the current
1627	* transaction will be committed asap, allowing this lock
1628	* to be dropped.
1629	*/
1630	*commit_needed = true;
1631	return DM_MAPIO_SUBMITTED;
1632	}
1633
1634	data_dir = bio_data_dir(bio);
1635
1636	if (optimisable_bio(cache, bio, block)) {
1637	struct policy_work *op = NULL;
1638
1639	r = policy_lookup_with_work(p: cache->policy, oblock: block, cblock: &cblock, data_dir, fast_copy: true, work: &op);
1640	if (unlikely(r && r != -ENOENT)) {
1641	DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1642	cache_device_name(cache), r);
1643	bio_io_error(bio);
1644	return DM_MAPIO_SUBMITTED;
1645	}
1646
1647	if (r == -ENOENT && op) {
1648	bio_drop_shared_lock(cache, bio);
1649	BUG_ON(op->op != POLICY_PROMOTE);
1650	mg_start(cache, op, bio);
1651	return DM_MAPIO_SUBMITTED;
1652	}
1653	} else {
1654	r = policy_lookup(p: cache->policy, oblock: block, cblock: &cblock, data_dir, fast_copy: false, background_queued: &background_queued);
1655	if (unlikely(r && r != -ENOENT)) {
1656	DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1657	cache_device_name(cache), r);
1658	bio_io_error(bio);
1659	return DM_MAPIO_SUBMITTED;
1660	}
1661
1662	if (background_queued)
1663	wake_migration_worker(cache);
1664	}
1665
1666	if (r == -ENOENT) {
1667	struct per_bio_data *pb = get_per_bio_data(bio);
1668
1669	/*
1670	* Miss.
1671	*/
1672	inc_miss_counter(cache, bio);
1673	if (pb->req_nr == `0`) {
1674	accounted_begin(cache, bio);
1675	remap_to_origin_clear_discard(cache, bio, oblock: block);
1676	} else {
1677	/*
1678	* This is a duplicate writethrough io that is no
1679	* longer needed because the block has been demoted.
1680	*/
1681	bio_endio(bio);
1682	return DM_MAPIO_SUBMITTED;
1683	}
1684	} else {
1685	/*
1686	* Hit.
1687	*/
1688	inc_hit_counter(cache, bio);
1689
1690	/*
1691	* Passthrough always maps to the origin, invalidating any
1692	* cache blocks that are written to.
1693	*/
1694	if (passthrough_mode(cache)) {
1695	if (bio_data_dir(bio) == WRITE) {
1696	bio_drop_shared_lock(cache, bio);
1697	atomic_inc(v: &cache->stats.demotion);
1698	invalidate_start(cache, cblock, oblock: block, bio);
1699	} else
1700	remap_to_origin_clear_discard(cache, bio, oblock: block);
1701	} else {
1702	if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1703	!is_dirty(cache, b: cblock)) {
1704	remap_to_origin_and_cache(cache, bio, oblock: block, cblock);
1705	accounted_begin(cache, bio);
1706	} else
1707	remap_to_cache_dirty(cache, bio, oblock: block, cblock);
1708	}
1709	}
1710
1711	/*
1712	* dm core turns FUA requests into a separate payload and FLUSH req.
1713	*/
1714	if (bio->bi_opf & REQ_FUA) {
1715	/*
1716	* issue_after_commit will call accounted_begin a second time. So
1717	* we call accounted_complete() to avoid double accounting.
1718	*/
1719	accounted_complete(cache, bio);
1720	issue_after_commit(b: &cache->committer, bio);
1721	*commit_needed = true;
1722	return DM_MAPIO_SUBMITTED;
1723	}
1724
1725	return DM_MAPIO_REMAPPED;
1726	}
1727
1728	static bool process_bio(struct cache cache, struct* bio *bio)
1729	{
1730	bool commit_needed;
1731
1732	if (map_bio(cache, bio, block: get_bio_block(cache, bio), commit_needed: &commit_needed) == DM_MAPIO_REMAPPED)
1733	dm_submit_bio_remap(clone: bio, NULL);
1734
1735	return commit_needed;
1736	}
1737
1738	/*
1739	* A non-zero return indicates read_only or fail_io mode.
1740	*/
1741	static int commit(struct cache *cache, bool clean_shutdown)
1742	{
1743	int r;
1744
1745	if (get_cache_mode(cache) >= CM_READ_ONLY)
1746	return -EINVAL;
1747
1748	atomic_inc(v: &cache->stats.commit_count);
1749	r = dm_cache_commit(cmd: cache->cmd, clean_shutdown);
1750	if (r)
1751	metadata_operation_failed(cache, op: "dm_cache_commit", r);
1752
1753	return r;
1754	}
1755
1756	/*
1757	* Used by the batcher.
1758	*/
1759	static blk_status_t commit_op(void *context)
1760	{
1761	struct cache *cache = context;
1762
1763	if (dm_cache_changed_this_transaction(cmd: cache->cmd))
1764	return errno_to_blk_status(errno: commit(cache, clean_shutdown: false));
1765
1766	return `0`;
1767	}
1768
/*----------------------------------------------------------------*/
1770
1771	static bool process_flush_bio(struct cache cache, struct* bio *bio)
1772	{
1773	struct per_bio_data *pb = get_per_bio_data(bio);
1774
1775	if (!pb->req_nr)
1776	remap_to_origin(cache, bio);
1777	else
1778	remap_to_cache(cache, bio, cblock: `0`);
1779
1780	issue_after_commit(b: &cache->committer, bio);
1781	return true;
1782	}
1783
1784	static bool process_discard_bio(struct cache cache, struct* bio *bio)
1785	{
1786	dm_dblock_t b, e;
1787
1788	/*
1789	* FIXME: do we need to lock the region? Or can we just assume the
1790	* user wont be so foolish as to issue discard concurrently with
1791	* other IO?
1792	*/
1793	calc_discard_block_range(cache, bio, b: &b, e: &e);
1794	while (b != e) {
1795	set_discard(cache, b);
1796	b = to_dblock(b: from_dblock(b) + `1`);
1797	}
1798
1799	if (cache->features.discard_passdown) {
1800	remap_to_origin(cache, bio);
1801	dm_submit_bio_remap(clone: bio, NULL);
1802	} else
1803	bio_endio(bio);
1804
1805	return false;
1806	}
1807
1808	static void process_deferred_bios(struct work_struct *ws)
1809	{
1810	struct cache cache = container_of(ws, struct* cache, deferred_bio_worker);
1811
1812	bool commit_needed = false;
1813	struct bio_list bios;
1814	struct bio *bio;
1815
1816	bio_list_init(bl: &bios);
1817
1818	spin_lock_irq(lock: &cache->lock);
1819	bio_list_merge(bl: &bios, bl2: &cache->deferred_bios);
1820	bio_list_init(bl: &cache->deferred_bios);
1821	spin_unlock_irq(lock: &cache->lock);
1822
1823	while ((bio = bio_list_pop(bl: &bios))) {
1824	if (bio->bi_opf & REQ_PREFLUSH)
1825	commit_needed = process_flush_bio(cache, bio) \|\| commit_needed;
1826
1827	else if (bio_op(bio) == REQ_OP_DISCARD)
1828	commit_needed = process_discard_bio(cache, bio) \|\| commit_needed;
1829
1830	else
1831	commit_needed = process_bio(cache, bio) \|\| commit_needed;
1832	cond_resched();
1833	}
1834
1835	if (commit_needed)
1836	schedule_commit(b: &cache->committer);
1837	}
1838
1839	/*
1840	*--------------------------------------------------------------
1841	* Main worker loop
1842	*--------------------------------------------------------------
1843	*/
1844	static void requeue_deferred_bios(struct cache *cache)
1845	{
1846	struct bio *bio;
1847	struct bio_list bios;
1848
1849	bio_list_init(bl: &bios);
1850	bio_list_merge(bl: &bios, bl2: &cache->deferred_bios);
1851	bio_list_init(bl: &cache->deferred_bios);
1852
1853	while ((bio = bio_list_pop(bl: &bios))) {
1854	bio->bi_status = BLK_STS_DM_REQUEUE;
1855	bio_endio(bio);
1856	cond_resched();
1857	}
1858	}
1859
1860	/*
1861	* We want to commit periodically so that not too much
1862	* unwritten metadata builds up.
1863	*/
1864	static void do_waker(struct work_struct *ws)
1865	{
1866	struct cache cache = container_of(to_delayed_work(ws), struct* cache, waker);
1867
1868	policy_tick(p: cache->policy, can_block: true);
1869	wake_migration_worker(cache);
1870	schedule_commit(b: &cache->committer);
1871	queue_delayed_work(wq: cache->wq, dwork: &cache->waker, COMMIT_PERIOD);
1872	}
1873
1874	static void check_migrations(struct work_struct *ws)
1875	{
1876	int r;
1877	struct policy_work *op;
1878	struct cache cache = container_of(ws, struct* cache, migration_worker);
1879	enum busy b;
1880
1881	for (;;) {
1882	b = spare_migration_bandwidth(cache);
1883
1884	r = policy_get_background_work(p: cache->policy, idle: b == IDLE, result: &op);
1885	if (r == -ENODATA)
1886	break;
1887
1888	if (r) {
1889	DMERR_LIMIT("%s: policy_background_work failed",
1890	cache_device_name(cache));
1891	break;
1892	}
1893
1894	r = mg_start(cache, op, NULL);
1895	if (r)
1896	break;
1897
1898	cond_resched();
1899	}
1900	}
1901
1902	/*
1903	*--------------------------------------------------------------
1904	* Target methods
1905	*--------------------------------------------------------------
1906	*/
1907
1908	/*
1909	* This function gets called on the error paths of the constructor, so we
1910	* have to cope with a partially initialised struct.
1911	*/
1912	static void destroy(struct cache *cache)
1913	{
1914	unsigned int i;
1915
1916	mempool_exit(pool: &cache->migration_pool);
1917
1918	if (cache->prison)
1919	dm_bio_prison_destroy_v2(prison: cache->prison);
1920
1921	cancel_delayed_work_sync(dwork: &cache->waker);
1922	if (cache->wq)
1923	destroy_workqueue(wq: cache->wq);
1924
1925	if (cache->dirty_bitset)
1926	free_bitset(bits: cache->dirty_bitset);
1927
1928	if (cache->discard_bitset)
1929	free_bitset(bits: cache->discard_bitset);
1930
1931	if (cache->copier)
1932	dm_kcopyd_client_destroy(kc: cache->copier);
1933
1934	if (cache->cmd)
1935	dm_cache_metadata_close(cmd: cache->cmd);
1936
1937	if (cache->metadata_dev)
1938	dm_put_device(ti: cache->ti, d: cache->metadata_dev);
1939
1940	if (cache->origin_dev)
1941	dm_put_device(ti: cache->ti, d: cache->origin_dev);
1942
1943	if (cache->cache_dev)
1944	dm_put_device(ti: cache->ti, d: cache->cache_dev);
1945
1946	if (cache->policy)
1947	dm_cache_policy_destroy(p: cache->policy);
1948
1949	for (i = `0`; i < cache->nr_ctr_args ; i++)
1950	kfree(objp: cache->ctr_args[i]);
1951	kfree(objp: cache->ctr_args);
1952
1953	bioset_exit(&cache->bs);
1954
1955	kfree(objp: cache);
1956	}
1957
1958	static void cache_dtr(struct dm_target *ti)
1959	{
1960	struct cache *cache = ti->private;
1961
1962	destroy(cache);
1963	}
1964
1965	static sector_t get_dev_size(struct dm_dev *dev)
1966	{
1967	return bdev_nr_sectors(bdev: dev->bdev);
1968	}
1969
/*----------------------------------------------------------------*/
1971
1972	/*
1973	* Construct a cache device mapping.
1974	*
1975	* cache <metadata dev> <cache dev> <origin dev> <block size>
1976	* <#feature args> [<feature arg>]*
1977	* <policy> <#policy args> [<policy arg>]*
1978	*
1979	* metadata dev : fast device holding the persistent metadata
1980	* cache dev : fast device holding cached data blocks
1981	* origin dev : slow device holding original data blocks
1982	* block size : cache unit size in sectors
1983	*
1984	* #feature args : number of feature arguments passed
1985	* feature args : writethrough. (The default is writeback.)
1986	*
1987	* policy : the replacement policy to use
1988	* #policy args : an even number of policy arguments corresponding
1989	* to key/value pairs passed to the policy
1990	* policy args : key/value pairs passed to the policy
1991	* E.g. 'sequential_threshold 1024'
1992	* See cache-policies.txt for details.
1993	*
1994	* Optional feature arguments are:
1995	* writethrough : write through caching that prohibits cache block
1996	* content from being different from origin block content.
1997	* Without this argument, the default behaviour is to write
1998	* back cache block contents later for performance reasons,
1999	* so they may differ from the corresponding origin blocks.
2000	*/
2001	struct cache_args {
2002	struct dm_target *ti;
2003
2004	struct dm_dev *metadata_dev;
2005
2006	struct dm_dev *cache_dev;
2007	sector_t cache_sectors;
2008
2009	struct dm_dev *origin_dev;
2010	sector_t origin_sectors;
2011
2012	uint32_t block_size;
2013
2014	const char *policy_name;
2015	int policy_argc;
2016	const char **policy_argv;
2017
2018	struct cache_features features;
2019	};
2020
2021	static void destroy_cache_args(struct cache_args *ca)
2022	{
2023	if (ca->metadata_dev)
2024	dm_put_device(ti: ca->ti, d: ca->metadata_dev);
2025
2026	if (ca->cache_dev)
2027	dm_put_device(ti: ca->ti, d: ca->cache_dev);
2028
2029	if (ca->origin_dev)
2030	dm_put_device(ti: ca->ti, d: ca->origin_dev);
2031
2032	kfree(objp: ca);
2033	}
2034
2035	static bool at_least_one_arg(struct dm_arg_set as, char* **error)
2036	{
2037	if (!as->argc) {
2038	*error = "Insufficient args";
2039	return false;
2040	}
2041
2042	return true;
2043	}
2044
2045	static int parse_metadata_dev(struct cache_args ca, struct* dm_arg_set *as,
2046	char **error)
2047	{
2048	int r;
2049	sector_t metadata_dev_size;
2050
2051	if (!at_least_one_arg(as, error))
2052	return -EINVAL;
2053
2054	r = dm_get_device(ti: ca->ti, path: dm_shift_arg(as),
2055	BLK_OPEN_READ \| BLK_OPEN_WRITE, result: &ca->metadata_dev);
2056	if (r) {
2057	*error = "Error opening metadata device";
2058	return r;
2059	}
2060
2061	metadata_dev_size = get_dev_size(dev: ca->metadata_dev);
2062	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2063	DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
2064	ca->metadata_dev->bdev, THIN_METADATA_MAX_SECTORS);
2065
2066	return `0`;
2067	}
2068
2069	static int parse_cache_dev(struct cache_args ca, struct* dm_arg_set *as,
2070	char **error)
2071	{
2072	int r;
2073
2074	if (!at_least_one_arg(as, error))
2075	return -EINVAL;
2076
2077	r = dm_get_device(ti: ca->ti, path: dm_shift_arg(as),
2078	BLK_OPEN_READ \| BLK_OPEN_WRITE, result: &ca->cache_dev);
2079	if (r) {
2080	*error = "Error opening cache device";
2081	return r;
2082	}
2083	ca->cache_sectors = get_dev_size(dev: ca->cache_dev);
2084
2085	return `0`;
2086	}
2087
2088	static int parse_origin_dev(struct cache_args ca, struct* dm_arg_set *as,
2089	char **error)
2090	{
2091	int r;
2092
2093	if (!at_least_one_arg(as, error))
2094	return -EINVAL;
2095
2096	r = dm_get_device(ti: ca->ti, path: dm_shift_arg(as),
2097	BLK_OPEN_READ \| BLK_OPEN_WRITE, result: &ca->origin_dev);
2098	if (r) {
2099	*error = "Error opening origin device";
2100	return r;
2101	}
2102
2103	ca->origin_sectors = get_dev_size(dev: ca->origin_dev);
2104	if (ca->ti->len > ca->origin_sectors) {
2105	*error = "Device size larger than cached device";
2106	return -EINVAL;
2107	}
2108
2109	return `0`;
2110	}
2111
2112	static int parse_block_size(struct cache_args ca, struct* dm_arg_set *as,
2113	char **error)
2114	{
2115	unsigned long block_size;
2116
2117	if (!at_least_one_arg(as, error))
2118	return -EINVAL;
2119
2120	if (kstrtoul(s: dm_shift_arg(as), base: `10`, res: &block_size) \|\| !block_size \|\|
2121	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
2122	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
2123	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - `1`)) {
2124	*error = "Invalid data block size";
2125	return -EINVAL;
2126	}
2127
2128	if (block_size > ca->cache_sectors) {
2129	*error = "Data block size is larger than the cache device";
2130	return -EINVAL;
2131	}
2132
2133	ca->block_size = block_size;
2134
2135	return `0`;
2136	}
2137
2138	static void init_features(struct cache_features *cf)
2139	{
2140	cf->mode = CM_WRITE;
2141	cf->io_mode = CM_IO_WRITEBACK;
2142	cf->metadata_version = `1`;
2143	cf->discard_passdown = true;
2144	}
2145
2146	static int parse_features(struct cache_args ca, struct* dm_arg_set *as,
2147	char **error)
2148	{
2149	static const struct dm_arg _args[] = {
2150	{`0`, `3`, "Invalid number of cache feature arguments"},
2151	};
2152
2153	int r, mode_ctr = `0`;
2154	unsigned int argc;
2155	const char *arg;
2156	struct cache_features *cf = &ca->features;
2157
2158	init_features(cf);
2159
2160	r = dm_read_arg_group(arg: _args, arg_set: as, num_args: &argc, error);
2161	if (r)
2162	return -EINVAL;
2163
2164	while (argc--) {
2165	arg = dm_shift_arg(as);
2166
2167	if (!strcasecmp(s1: arg, s2: "writeback")) {
2168	cf->io_mode = CM_IO_WRITEBACK;
2169	mode_ctr++;
2170	}
2171
2172	else if (!strcasecmp(s1: arg, s2: "writethrough")) {
2173	cf->io_mode = CM_IO_WRITETHROUGH;
2174	mode_ctr++;
2175	}
2176
2177	else if (!strcasecmp(s1: arg, s2: "passthrough")) {
2178	cf->io_mode = CM_IO_PASSTHROUGH;
2179	mode_ctr++;
2180	}
2181
2182	else if (!strcasecmp(s1: arg, s2: "metadata2"))
2183	cf->metadata_version = `2`;
2184
2185	else if (!strcasecmp(s1: arg, s2: "no_discard_passdown"))
2186	cf->discard_passdown = false;
2187
2188	else {
2189	*error = "Unrecognised cache feature requested";
2190	return -EINVAL;
2191	}
2192	}
2193
2194	if (mode_ctr > `1`) {
2195	*error = "Duplicate cache io_mode features requested";
2196	return -EINVAL;
2197	}
2198
2199	return `0`;
2200	}
2201
2202	static int parse_policy(struct cache_args ca, struct* dm_arg_set *as,
2203	char **error)
2204	{
2205	static const struct dm_arg _args[] = {
2206	{`0`, `1024`, "Invalid number of policy arguments"},
2207	};
2208
2209	int r;
2210
2211	if (!at_least_one_arg(as, error))
2212	return -EINVAL;
2213
2214	ca->policy_name = dm_shift_arg(as);
2215
2216	r = dm_read_arg_group(arg: _args, arg_set: as, num_args: &ca->policy_argc, error);
2217	if (r)
2218	return -EINVAL;
2219
2220	ca->policy_argv = (const char **)as->argv;
2221	dm_consume_args(as, num_args: ca->policy_argc);
2222
2223	return `0`;
2224	}
2225
2226	static int parse_cache_args(struct cache_args ca, int* argc, char **argv,
2227	char **error)
2228	{
2229	int r;
2230	struct dm_arg_set as;
2231
2232	as.argc = argc;
2233	as.argv = argv;
2234
2235	r = parse_metadata_dev(ca, as: &as, error);
2236	if (r)
2237	return r;
2238
2239	r = parse_cache_dev(ca, as: &as, error);
2240	if (r)
2241	return r;
2242
2243	r = parse_origin_dev(ca, as: &as, error);
2244	if (r)
2245	return r;
2246
2247	r = parse_block_size(ca, as: &as, error);
2248	if (r)
2249	return r;
2250
2251	r = parse_features(ca, as: &as, error);
2252	if (r)
2253	return r;
2254
2255	r = parse_policy(ca, as: &as, error);
2256	if (r)
2257	return r;
2258
2259	return `0`;
2260	}
2261
/*----------------------------------------------------------------*/
2263
/*
 * Slab cache for migrations.
 * NOTE(review): presumably backs cache->migration_pool; its setup is
 * outside this section of the file.
 */
static struct kmem_cache *migration_cache;

/*
 * Returned by process_config_option() when the key is not one it handles,
 * so that set_config_value() passes the key/value pair on to the policy.
 */
#define NOT_CORE_OPTION 1
2267
2268	static int process_config_option(struct cache cache, const* char key, const* char *value)
2269	{
2270	unsigned long tmp;
2271
2272	if (!strcasecmp(s1: key, s2: "migration_threshold")) {
2273	if (kstrtoul(s: value, base: `10`, res: &tmp))
2274	return -EINVAL;
2275
2276	cache->migration_threshold = tmp;
2277	return `0`;
2278	}
2279
2280	return NOT_CORE_OPTION;
2281	}
2282
2283	static int set_config_value(struct cache cache, const* char key, const* char *value)
2284	{
2285	int r = process_config_option(cache, key, value);
2286
2287	if (r == NOT_CORE_OPTION)
2288	r = policy_set_config_value(p: cache->policy, key, value);
2289
2290	if (r)
2291	DMWARN("bad config value for %s: %s", key, value);
2292
2293	return r;
2294	}
2295
2296	static int set_config_values(struct cache cache, int* argc, const char **argv)
2297	{
2298	int r = `0`;
2299
2300	if (argc & `1`) {
2301	DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2302	return -EINVAL;
2303	}
2304
2305	while (argc) {
2306	r = set_config_value(cache, key: argv[`0`], value: argv[`1`]);
2307	if (r)
2308	break;
2309
2310	argc -= `2`;
2311	argv += `2`;
2312	}
2313
2314	return r;
2315	}
2316
2317	static int create_cache_policy(struct cache cache, struct* cache_args *ca,
2318	char **error)
2319	{
2320	struct dm_cache_policy *p = dm_cache_policy_create(name: ca->policy_name,
2321	cache_size: cache->cache_size,
2322	origin_size: cache->origin_sectors,
2323	block_size: cache->sectors_per_block);
2324	if (IS_ERR(ptr: p)) {
2325	*error = "Error creating cache's policy";
2326	return PTR_ERR(ptr: p);
2327	}
2328	cache->policy = p;
2329	BUG_ON(!cache->policy);
2330
2331	return `0`;
2332	}
2333
2334	/*
2335	* We want the discard block size to be at least the size of the cache
2336	* block size and have no more than 2^14 discard blocks across the origin.
2337	*/
2338	#define MAX_DISCARD_BLOCKS (1 << 14)
2339
2340	static bool too_many_discard_blocks(sector_t discard_block_size,
2341	sector_t origin_size)
2342	{
2343	(void) sector_div(origin_size, discard_block_size);
2344
2345	return origin_size > MAX_DISCARD_BLOCKS;
2346	}
2347
2348	static sector_t calculate_discard_block_size(sector_t cache_block_size,
2349	sector_t origin_size)
2350	{
2351	sector_t discard_block_size = cache_block_size;
2352
2353	if (origin_size)
2354	while (too_many_discard_blocks(discard_block_size, origin_size))
2355	discard_block_size *= `2`;
2356
2357	return discard_block_size;
2358	}
2359
2360	static void set_cache_size(struct cache *cache, dm_cblock_t size)
2361	{
2362	dm_block_t nr_blocks = from_cblock(b: size);
2363
2364	if (nr_blocks > (`1` << `20`) && cache->cache_size != size)
2365	DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2366	"All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2367	"Please consider increasing the cache block size to reduce the overall cache block count.",
2368	(unsigned long long) nr_blocks);
2369
2370	cache->cache_size = size;
2371	}
2372
2373	#define DEFAULT_MIGRATION_THRESHOLD 2048
2374
2375	static int cache_create(struct cache_args ca, struct* cache **result)
2376	{
2377	int r = `0`;
2378	char **error = &ca->ti->error;
2379	struct cache *cache;
2380	struct dm_target *ti = ca->ti;
2381	dm_block_t origin_blocks;
2382	struct dm_cache_metadata *cmd;
2383	bool may_format = ca->features.mode == CM_WRITE;
2384
2385	cache = kzalloc(size: sizeof(*cache), GFP_KERNEL);
2386	if (!cache)
2387	return -ENOMEM;
2388
2389	cache->ti = ca->ti;
2390	ti->private = cache;
2391	ti->accounts_remapped_io = true;
2392	ti->num_flush_bios = `2`;
2393	ti->flush_supported = true;
2394
2395	ti->num_discard_bios = `1`;
2396	ti->discards_supported = true;
2397
2398	ti->per_io_data_size = sizeof(struct per_bio_data);
2399
2400	cache->features = ca->features;
2401	if (writethrough_mode(cache)) {
2402	/ Create bioset for writethrough bios issued to origin /
2403	r = bioset_init(&cache->bs, BIO_POOL_SIZE, `0`, flags: `0`);
2404	if (r)
2405	goto bad;
2406	}
2407
2408	cache->metadata_dev = ca->metadata_dev;
2409	cache->origin_dev = ca->origin_dev;
2410	cache->cache_dev = ca->cache_dev;
2411
2412	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2413
2414	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2415	origin_blocks = block_div(b: origin_blocks, n: ca->block_size);
2416	cache->origin_blocks = to_oblock(b: origin_blocks);
2417
2418	cache->sectors_per_block = ca->block_size;
2419	if (dm_set_target_max_io_len(ti, len: cache->sectors_per_block)) {
2420	r = -EINVAL;
2421	goto bad;
2422	}
2423
2424	if (ca->block_size & (ca->block_size - `1`)) {
2425	dm_block_t cache_size = ca->cache_sectors;
2426
2427	cache->sectors_per_block_shift = -`1`;
2428	cache_size = block_div(b: cache_size, n: ca->block_size);
2429	set_cache_size(cache, size: to_cblock(b: cache_size));
2430	} else {
2431	cache->sectors_per_block_shift = __ffs(ca->block_size);
2432	set_cache_size(cache, size: to_cblock(b: ca->cache_sectors >> cache->sectors_per_block_shift));
2433	}
2434
2435	r = create_cache_policy(cache, ca, error);
2436	if (r)
2437	goto bad;
2438
2439	cache->policy_nr_args = ca->policy_argc;
2440	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2441
2442	r = set_config_values(cache, argc: ca->policy_argc, argv: ca->policy_argv);
2443	if (r) {
2444	*error = "Error setting cache policy's config values";
2445	goto bad;
2446	}
2447
2448	cmd = dm_cache_metadata_open(bdev: cache->metadata_dev->bdev,
2449	data_block_size: ca->block_size, may_format_device: may_format,
2450	policy_hint_size: dm_cache_policy_get_hint_size(p: cache->policy),
2451	metadata_version: ca->features.metadata_version);
2452	if (IS_ERR(ptr: cmd)) {
2453	*error = "Error creating metadata object";
2454	r = PTR_ERR(ptr: cmd);
2455	goto bad;
2456	}
2457	cache->cmd = cmd;
2458	set_cache_mode(cache, new_mode: CM_WRITE);
2459	if (get_cache_mode(cache) != CM_WRITE) {
2460	*error = "Unable to get write access to metadata, please check/repair metadata.";
2461	r = -EINVAL;
2462	goto bad;
2463	}
2464
2465	if (passthrough_mode(cache)) {
2466	bool all_clean;
2467
2468	r = dm_cache_metadata_all_clean(cmd: cache->cmd, result: &all_clean);
2469	if (r) {
2470	*error = "dm_cache_metadata_all_clean() failed";
2471	goto bad;
2472	}
2473
2474	if (!all_clean) {
2475	*error = "Cannot enter passthrough mode unless all blocks are clean";
2476	r = -EINVAL;
2477	goto bad;
2478	}
2479
2480	policy_allow_migrations(p: cache->policy, allow: false);
2481	}
2482
2483	spin_lock_init(&cache->lock);
2484	bio_list_init(bl: &cache->deferred_bios);
2485	atomic_set(v: &cache->nr_allocated_migrations, i: `0`);
2486	atomic_set(v: &cache->nr_io_migrations, i: `0`);
2487	init_waitqueue_head(&cache->migration_wait);
2488
2489	r = -ENOMEM;
2490	atomic_set(v: &cache->nr_dirty, i: `0`);
2491	cache->dirty_bitset = alloc_bitset(nr_entries: from_cblock(b: cache->cache_size));
2492	if (!cache->dirty_bitset) {
2493	*error = "could not allocate dirty bitset";
2494	goto bad;
2495	}
2496	clear_bitset(bitset: cache->dirty_bitset, nr_entries: from_cblock(b: cache->cache_size));
2497
2498	cache->discard_block_size =
2499	calculate_discard_block_size(cache_block_size: cache->sectors_per_block,
2500	origin_size: cache->origin_sectors);
2501	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2502	cache->discard_block_size));
2503	cache->discard_bitset = alloc_bitset(nr_entries: from_dblock(b: cache->discard_nr_blocks));
2504	if (!cache->discard_bitset) {
2505	*error = "could not allocate discard bitset";
2506	goto bad;
2507	}
2508	clear_bitset(bitset: cache->discard_bitset, nr_entries: from_dblock(b: cache->discard_nr_blocks));
2509
2510	cache->copier = dm_kcopyd_client_create(throttle: &dm_kcopyd_throttle);
2511	if (IS_ERR(ptr: cache->copier)) {
2512	*error = "could not create kcopyd client";
2513	r = PTR_ERR(ptr: cache->copier);
2514	goto bad;
2515	}
2516
2517	cache->wq = alloc_workqueue(fmt: "dm-" DM_MSG_PREFIX, flags: WQ_MEM_RECLAIM, max_active: `0`);
2518	if (!cache->wq) {
2519	*error = "could not create workqueue for metadata object";
2520	goto bad;
2521	}
2522	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2523	INIT_WORK(&cache->migration_worker, check_migrations);
2524	INIT_DELAYED_WORK(&cache->waker, do_waker);
2525
2526	cache->prison = dm_bio_prison_create_v2(wq: cache->wq);
2527	if (!cache->prison) {
2528	*error = "could not create bio prison";
2529	goto bad;
2530	}
2531
2532	r = mempool_init_slab_pool(pool: &cache->migration_pool, MIGRATION_POOL_SIZE,
2533	kc: migration_cache);
2534	if (r) {
2535	*error = "Error creating cache's migration mempool";
2536	goto bad;
2537	}
2538
2539	cache->need_tick_bio = true;
2540	cache->sized = false;
2541	cache->invalidate = false;
2542	cache->commit_requested = false;
2543	cache->loaded_mappings = false;
2544	cache->loaded_discards = false;
2545
2546	load_stats(cache);
2547
2548	atomic_set(v: &cache->stats.demotion, i: `0`);
2549	atomic_set(v: &cache->stats.promotion, i: `0`);
2550	atomic_set(v: &cache->stats.copies_avoided, i: `0`);
2551	atomic_set(v: &cache->stats.cache_cell_clash, i: `0`);
2552	atomic_set(v: &cache->stats.commit_count, i: `0`);
2553	atomic_set(v: &cache->stats.discard_count, i: `0`);
2554
2555	spin_lock_init(&cache->invalidation_lock);
2556	INIT_LIST_HEAD(list: &cache->invalidation_requests);
2557
2558	batcher_init(b: &cache->committer, commit_op, commit_context: cache,
2559	issue_op, issue_context: cache, wq: cache->wq);
2560	dm_iot_init(iot: &cache->tracker);
2561
2562	init_rwsem(&cache->background_work_lock);
2563	prevent_background_work(cache);
2564
2565	*result = cache;
2566	return `0`;
2567	bad:
2568	destroy(cache);
2569	return r;
2570	}
2571
2572	static int copy_ctr_args(struct cache cache, int* argc, const char **argv)
2573	{
2574	unsigned int i;
2575	const char **copy;
2576
2577	copy = kcalloc(n: argc, size: sizeof(*copy), GFP_KERNEL);
2578	if (!copy)
2579	return -ENOMEM;
2580	for (i = `0`; i < argc; i++) {
2581	copy[i] = kstrdup(s: argv[i], GFP_KERNEL);
2582	if (!copy[i]) {
2583	while (i--)
2584	kfree(objp: copy[i]);
2585	kfree(objp: copy);
2586	return -ENOMEM;
2587	}
2588	}
2589
2590	cache->nr_ctr_args = argc;
2591	cache->ctr_args = copy;
2592
2593	return `0`;
2594	}
2595
2596	static int cache_ctr(struct dm_target ti, unsigned* int argc, char **argv)
2597	{
2598	int r = -EINVAL;
2599	struct cache_args *ca;
2600	struct cache *cache = NULL;
2601
2602	ca = kzalloc(size: sizeof(*ca), GFP_KERNEL);
2603	if (!ca) {
2604	ti->error = "Error allocating memory for cache";
2605	return -ENOMEM;
2606	}
2607	ca->ti = ti;
2608
2609	r = parse_cache_args(ca, argc, argv, error: &ti->error);
2610	if (r)
2611	goto out;
2612
2613	r = cache_create(ca, result: &cache);
2614	if (r)
2615	goto out;
2616
2617	r = copy_ctr_args(cache, argc: argc - `3`, argv: (const char **)argv + `3`);
2618	if (r) {
2619	destroy(cache);
2620	goto out;
2621	}
2622
2623	ti->private = cache;
2624	out:
2625	destroy_cache_args(ca);
2626	return r;
2627	}
2628
/*----------------------------------------------------------------*/
2630
2631	static int cache_map(struct dm_target ti, struct* bio *bio)
2632	{
2633	struct cache *cache = ti->private;
2634
2635	int r;
2636	bool commit_needed;
2637	dm_oblock_t block = get_bio_block(cache, bio);
2638
2639	init_per_bio_data(bio);
2640	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2641	/*
2642	* This can only occur if the io goes to a partial block at
2643	* the end of the origin device. We don't cache these.
2644	* Just remap to the origin and carry on.
2645	*/
2646	remap_to_origin(cache, bio);
2647	accounted_begin(cache, bio);
2648	return DM_MAPIO_REMAPPED;
2649	}
2650
2651	if (discard_or_flush(bio)) {
2652	defer_bio(cache, bio);
2653	return DM_MAPIO_SUBMITTED;
2654	}
2655
2656	r = map_bio(cache, bio, block, commit_needed: &commit_needed);
2657	if (commit_needed)
2658	schedule_commit(b: &cache->committer);
2659
2660	return r;
2661	}
2662
2663	static int cache_end_io(struct dm_target ti, struct* bio bio, blk_status_t error)
2664	{
2665	struct cache *cache = ti->private;
2666	unsigned long flags;
2667	struct per_bio_data *pb = get_per_bio_data(bio);
2668
2669	if (pb->tick) {
2670	policy_tick(p: cache->policy, can_block: false);
2671
2672	spin_lock_irqsave(&cache->lock, flags);
2673	cache->need_tick_bio = true;
2674	spin_unlock_irqrestore(lock: &cache->lock, flags);
2675	}
2676
2677	bio_drop_shared_lock(cache, bio);
2678	accounted_complete(cache, bio);
2679
2680	return DM_ENDIO_DONE;
2681	}
2682
2683	static int write_dirty_bitset(struct cache *cache)
2684	{
2685	int r;
2686
2687	if (get_cache_mode(cache) >= CM_READ_ONLY)
2688	return -EINVAL;
2689
2690	r = dm_cache_set_dirty_bits(cmd: cache->cmd, nr_bits: from_cblock(b: cache->cache_size), bits: cache->dirty_bitset);
2691	if (r)
2692	metadata_operation_failed(cache, op: "dm_cache_set_dirty_bits", r);
2693
2694	return r;
2695	}
2696
2697	static int write_discard_bitset(struct cache *cache)
2698	{
2699	unsigned int i, r;
2700
2701	if (get_cache_mode(cache) >= CM_READ_ONLY)
2702	return -EINVAL;
2703
2704	r = dm_cache_discard_bitset_resize(cmd: cache->cmd, discard_block_size: cache->discard_block_size,
2705	new_nr_entries: cache->discard_nr_blocks);
2706	if (r) {
2707	DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2708	metadata_operation_failed(cache, op: "dm_cache_discard_bitset_resize", r);
2709	return r;
2710	}
2711
2712	for (i = `0`; i < from_dblock(b: cache->discard_nr_blocks); i++) {
2713	r = dm_cache_set_discard(cmd: cache->cmd, dblock: to_dblock(b: i),
2714	discard: is_discarded(cache, b: to_dblock(b: i)));
2715	if (r) {
2716	metadata_operation_failed(cache, op: "dm_cache_set_discard", r);
2717	return r;
2718	}
2719	}
2720
2721	return `0`;
2722	}
2723
2724	static int write_hints(struct cache *cache)
2725	{
2726	int r;
2727
2728	if (get_cache_mode(cache) >= CM_READ_ONLY)
2729	return -EINVAL;
2730
2731	r = dm_cache_write_hints(cmd: cache->cmd, p: cache->policy);
2732	if (r) {
2733	metadata_operation_failed(cache, op: "dm_cache_write_hints", r);
2734	return r;
2735	}
2736
2737	return `0`;
2738	}
2739
2740	/*
2741	* returns true on success
2742	*/
2743	static bool sync_metadata(struct cache *cache)
2744	{
2745	int r1, r2, r3, r4;
2746
2747	r1 = write_dirty_bitset(cache);
2748	if (r1)
2749	DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2750
2751	r2 = write_discard_bitset(cache);
2752	if (r2)
2753	DMERR("%s: could not write discard bitset", cache_device_name(cache));
2754
2755	save_stats(cache);
2756
2757	r3 = write_hints(cache);
2758	if (r3)
2759	DMERR("%s: could not write hints", cache_device_name(cache));
2760
2761	/*
2762	* If writing the above metadata failed, we still commit, but don't
2763	* set the clean shutdown flag. This will effectively force every
2764	* dirty bit to be set on reload.
2765	*/
2766	r4 = commit(cache, clean_shutdown: !r1 && !r2 && !r3);
2767	if (r4)
2768	DMERR("%s: could not write cache metadata", cache_device_name(cache));
2769
2770	return !r1 && !r2 && !r3 && !r4;
2771	}
2772
2773	static void cache_postsuspend(struct dm_target *ti)
2774	{
2775	struct cache *cache = ti->private;
2776
2777	prevent_background_work(cache);
2778	BUG_ON(atomic_read(&cache->nr_io_migrations));
2779
2780	cancel_delayed_work_sync(dwork: &cache->waker);
2781	drain_workqueue(wq: cache->wq);
2782	WARN_ON(cache->tracker.in_flight);
2783
2784	/*
2785	* If it's a flush suspend there won't be any deferred bios, so this
2786	* call is harmless.
2787	*/
2788	requeue_deferred_bios(cache);
2789
2790	if (get_cache_mode(cache) == CM_WRITE)
2791	(void) sync_metadata(cache);
2792	}
2793
2794	static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2795	bool dirty, uint32_t hint, bool hint_valid)
2796	{
2797	struct cache *cache = context;
2798
2799	if (dirty) {
2800	set_bit(nr: from_cblock(b: cblock), addr: cache->dirty_bitset);
2801	atomic_inc(v: &cache->nr_dirty);
2802	} else
2803	clear_bit(nr: from_cblock(b: cblock), addr: cache->dirty_bitset);
2804
2805	return policy_load_mapping(p: cache->policy, oblock, cblock, dirty, hint, hint_valid);
2806	}
2807
2808	/*
2809	* The discard block size in the on disk metadata is not
2810	* necessarily the same as we're currently using. So we have to
2811	* be careful to only set the discarded attribute if we know it
2812	* covers a complete block of the new size.
2813	*/
2814	struct discard_load_info {
2815	struct cache *cache;
2816
2817	/*
2818	* These blocks are sized using the on disk dblock size, rather
2819	* than the current one.
2820	*/
2821	dm_block_t block_size;
2822	dm_block_t discard_begin, discard_end;
2823	};
2824
2825	static void discard_load_info_init(struct cache *cache,
2826	struct discard_load_info *li)
2827	{
2828	li->cache = cache;
2829	li->discard_begin = li->discard_end = `0`;
2830	}
2831
2832	static void set_discard_range(struct discard_load_info *li)
2833	{
2834	sector_t b, e;
2835
2836	if (li->discard_begin == li->discard_end)
2837	return;
2838
2839	/*
2840	* Convert to sectors.
2841	*/
2842	b = li->discard_begin * li->block_size;
2843	e = li->discard_end * li->block_size;
2844
2845	/*
2846	* Then convert back to the current dblock size.
2847	*/
2848	b = dm_sector_div_up(b, li->cache->discard_block_size);
2849	sector_div(e, li->cache->discard_block_size);
2850
2851	/*
2852	* The origin may have shrunk, so we need to check we're still in
2853	* bounds.
2854	*/
2855	if (e > from_dblock(b: li->cache->discard_nr_blocks))
2856	e = from_dblock(b: li->cache->discard_nr_blocks);
2857
2858	for (; b < e; b++)
2859	set_discard(cache: li->cache, b: to_dblock(b));
2860	}
2861
2862	static int load_discard(void *context, sector_t discard_block_size,
2863	dm_dblock_t dblock, bool discard)
2864	{
2865	struct discard_load_info *li = context;
2866
2867	li->block_size = discard_block_size;
2868
2869	if (discard) {
2870	if (from_dblock(b: dblock) == li->discard_end)
2871	/*
2872	* We're already in a discard range, just extend it.
2873	*/
2874	li->discard_end = li->discard_end + `1ULL`;
2875
2876	else {
2877	/*
2878	* Emit the old range and start a new one.
2879	*/
2880	set_discard_range(li);
2881	li->discard_begin = from_dblock(b: dblock);
2882	li->discard_end = li->discard_begin + `1ULL`;
2883	}
2884	} else {
2885	set_discard_range(li);
2886	li->discard_begin = li->discard_end = `0`;
2887	}
2888
2889	return `0`;
2890	}
2891
2892	static dm_cblock_t get_cache_dev_size(struct cache *cache)
2893	{
2894	sector_t size = get_dev_size(dev: cache->cache_dev);
2895	(void) sector_div(size, cache->sectors_per_block);
2896	return to_cblock(b: size);
2897	}
2898
2899	static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2900	{
2901	if (from_cblock(b: new_size) > from_cblock(b: cache->cache_size)) {
2902	if (cache->sized) {
2903	DMERR("%s: unable to extend cache due to missing cache table reload",
2904	cache_device_name(cache));
2905	return false;
2906	}
2907	}
2908
2909	/*
2910	* We can't drop a dirty block when shrinking the cache.
2911	*/
2912	while (from_cblock(b: new_size) < from_cblock(b: cache->cache_size)) {
2913	new_size = to_cblock(b: from_cblock(b: new_size) + `1`);
2914	if (is_dirty(cache, b: new_size)) {
2915	DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2916	cache_device_name(cache),
2917	(unsigned long long) from_cblock(new_size));
2918	return false;
2919	}
2920	}
2921
2922	return true;
2923	}
2924
2925	static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2926	{
2927	int r;
2928
2929	r = dm_cache_resize(cmd: cache->cmd, new_cache_size: new_size);
2930	if (r) {
2931	DMERR("%s: could not resize cache metadata", cache_device_name(cache));
2932	metadata_operation_failed(cache, op: "dm_cache_resize", r);
2933	return r;
2934	}
2935
2936	set_cache_size(cache, size: new_size);
2937
2938	return `0`;
2939	}
2940
2941	static int cache_preresume(struct dm_target *ti)
2942	{
2943	int r = `0`;
2944	struct cache *cache = ti->private;
2945	dm_cblock_t csize = get_cache_dev_size(cache);
2946
2947	/*
2948	* Check to see if the cache has resized.
2949	*/
2950	if (!cache->sized) {
2951	r = resize_cache_dev(cache, new_size: csize);
2952	if (r)
2953	return r;
2954
2955	cache->sized = true;
2956
2957	} else if (csize != cache->cache_size) {
2958	if (!can_resize(cache, new_size: csize))
2959	return -EINVAL;
2960
2961	r = resize_cache_dev(cache, new_size: csize);
2962	if (r)
2963	return r;
2964	}
2965
2966	if (!cache->loaded_mappings) {
2967	r = dm_cache_load_mappings(cmd: cache->cmd, policy: cache->policy,
2968	fn: load_mapping, context: cache);
2969	if (r) {
2970	DMERR("%s: could not load cache mappings", cache_device_name(cache));
2971	metadata_operation_failed(cache, op: "dm_cache_load_mappings", r);
2972	return r;
2973	}
2974
2975	cache->loaded_mappings = true;
2976	}
2977
2978	if (!cache->loaded_discards) {
2979	struct discard_load_info li;
2980
2981	/*
2982	* The discard bitset could have been resized, or the
2983	* discard block size changed. To be safe we start by
2984	* setting every dblock to not discarded.
2985	*/
2986	clear_bitset(bitset: cache->discard_bitset, nr_entries: from_dblock(b: cache->discard_nr_blocks));
2987
2988	discard_load_info_init(cache, li: &li);
2989	r = dm_cache_load_discards(cmd: cache->cmd, fn: load_discard, context: &li);
2990	if (r) {
2991	DMERR("%s: could not load origin discards", cache_device_name(cache));
2992	metadata_operation_failed(cache, op: "dm_cache_load_discards", r);
2993	return r;
2994	}
2995	set_discard_range(&li);
2996
2997	cache->loaded_discards = true;
2998	}
2999
3000	return r;
3001	}
3002
3003	static void cache_resume(struct dm_target *ti)
3004	{
3005	struct cache *cache = ti->private;
3006
3007	cache->need_tick_bio = true;
3008	allow_background_work(cache);
3009	do_waker(ws: &cache->waker.work);
3010	}
3011
3012	static void emit_flags(struct cache cache, char* *result,
3013	unsigned int maxlen, ssize_t *sz_ptr)
3014	{
3015	ssize_t sz = *sz_ptr;
3016	struct cache_features *cf = &cache->features;
3017	unsigned int count = (cf->metadata_version == `2`) + !cf->discard_passdown + `1`;
3018
3019	DMEMIT("%u ", count);
3020
3021	if (cf->metadata_version == `2`)
3022	DMEMIT("metadata2 ");
3023
3024	if (writethrough_mode(cache))
3025	DMEMIT("writethrough ");
3026
3027	else if (passthrough_mode(cache))
3028	DMEMIT("passthrough ");
3029
3030	else if (writeback_mode(cache))
3031	DMEMIT("writeback ");
3032
3033	else {
3034	DMEMIT("unknown ");
3035	DMERR("%s: internal error: unknown io mode: %d",
3036	cache_device_name(cache), (int) cf->io_mode);
3037	}
3038
3039	if (!cf->discard_passdown)
3040	DMEMIT("no_discard_passdown ");
3041
3042	*sz_ptr = sz;
3043	}
3044
3045	/*
3046	* Status format:
3047	*
3048	* <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3049	* <cache block size> <#used cache blocks>/<#total cache blocks>
3050	* <#read hits> <#read misses> <#write hits> <#write misses>
3051	* <#demotions> <#promotions> <#dirty>
3052	* <#features> <features>*
3053	* <#core args> <core args>
3054	* <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3055	*/
3056	static void cache_status(struct dm_target *ti, status_type_t type,
3057	unsigned int status_flags, char result, unsigned* int maxlen)
3058	{
3059	int r = `0`;
3060	unsigned int i;
3061	ssize_t sz = `0`;
3062	dm_block_t nr_free_blocks_metadata = `0`;
3063	dm_block_t nr_blocks_metadata = `0`;
3064	char buf[BDEVNAME_SIZE];
3065	struct cache *cache = ti->private;
3066	dm_cblock_t residency;
3067	bool needs_check;
3068
3069	switch (type) {
3070	case STATUSTYPE_INFO:
3071	if (get_cache_mode(cache) == CM_FAIL) {
3072	DMEMIT("Fail");
3073	break;
3074	}
3075
3076	/ Commit to ensure statistics aren't out-of-date /
3077	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3078	(void) commit(cache, clean_shutdown: false);
3079
3080	r = dm_cache_get_free_metadata_block_count(cmd: cache->cmd, result: &nr_free_blocks_metadata);
3081	if (r) {
3082	DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3083	cache_device_name(cache), r);
3084	goto err;
3085	}
3086
3087	r = dm_cache_get_metadata_dev_size(cmd: cache->cmd, result: &nr_blocks_metadata);
3088	if (r) {
3089	DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3090	cache_device_name(cache), r);
3091	goto err;
3092	}
3093
3094	residency = policy_residency(p: cache->policy);
3095
3096	DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3097	(unsigned int)DM_CACHE_METADATA_BLOCK_SIZE,
3098	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3099	(unsigned long long)nr_blocks_metadata,
3100	(unsigned long long)cache->sectors_per_block,
3101	(unsigned long long) from_cblock(residency),
3102	(unsigned long long) from_cblock(cache->cache_size),
3103	(unsigned int) atomic_read(&cache->stats.read_hit),
3104	(unsigned int) atomic_read(&cache->stats.read_miss),
3105	(unsigned int) atomic_read(&cache->stats.write_hit),
3106	(unsigned int) atomic_read(&cache->stats.write_miss),
3107	(unsigned int) atomic_read(&cache->stats.demotion),
3108	(unsigned int) atomic_read(&cache->stats.promotion),
3109	(unsigned long) atomic_read(&cache->nr_dirty));
3110
3111	emit_flags(cache, result, maxlen, sz_ptr: &sz);
3112
3113	DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3114
3115	DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3116	if (sz < maxlen) {
3117	r = policy_emit_config_values(p: cache->policy, result, maxlen, sz_ptr: &sz);
3118	if (r)
3119	DMERR("%s: policy_emit_config_values returned %d",
3120	cache_device_name(cache), r);
3121	}
3122
3123	if (get_cache_mode(cache) == CM_READ_ONLY)
3124	DMEMIT("ro ");
3125	else
3126	DMEMIT("rw ");
3127
3128	r = dm_cache_metadata_needs_check(cmd: cache->cmd, result: &needs_check);
3129
3130	if (r \|\| needs_check)
3131	DMEMIT("needs_check ");
3132	else
3133	DMEMIT("- ");
3134
3135	break;
3136
3137	case STATUSTYPE_TABLE:
3138	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3139	DMEMIT("%s ", buf);
3140	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3141	DMEMIT("%s ", buf);
3142	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3143	DMEMIT("%s", buf);
3144
3145	for (i = `0`; i < cache->nr_ctr_args - `1`; i++)
3146	DMEMIT(" %s", cache->ctr_args[i]);
3147	if (cache->nr_ctr_args)
3148	DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - `1`]);
3149	break;
3150
3151	case STATUSTYPE_IMA:
3152	DMEMIT_TARGET_NAME_VERSION(ti->type);
3153	if (get_cache_mode(cache) == CM_FAIL)
3154	DMEMIT(",metadata_mode=fail");
3155	else if (get_cache_mode(cache) == CM_READ_ONLY)
3156	DMEMIT(",metadata_mode=ro");
3157	else
3158	DMEMIT(",metadata_mode=rw");
3159
3160	format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3161	DMEMIT(",cache_metadata_device=%s", buf);
3162	format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3163	DMEMIT(",cache_device=%s", buf);
3164	format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3165	DMEMIT(",cache_origin_device=%s", buf);
3166	DMEMIT(",writethrough=%c", writethrough_mode(cache) ? `'y'` : `'n'`);
3167	DMEMIT(",writeback=%c", writeback_mode(cache) ? `'y'` : `'n'`);
3168	DMEMIT(",passthrough=%c", passthrough_mode(cache) ? `'y'` : `'n'`);
3169	DMEMIT(",metadata2=%c", cache->features.metadata_version == `2` ? `'y'` : `'n'`);
3170	DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? `'n'` : `'y'`);
3171	DMEMIT(";");
3172	break;
3173	}
3174
3175	return;
3176
3177	err:
3178	DMEMIT("Error");
3179	}
3180
3181	/*
3182	* Defines a range of cblocks, begin to (end - 1) are in the range. end is
3183	* the one-past-the-end value.
3184	*/
3185	struct cblock_range {
3186	dm_cblock_t begin;
3187	dm_cblock_t end;
3188	};
3189
3190	/*
3191	* A cache block range can take two forms:
3192	*
3193	* i) A single cblock, eg. '3456'
3194	* ii) A begin and end cblock with a dash between, eg. 123-234
3195	*/
3196	static int parse_cblock_range(struct cache cache, const* char *str,
3197	struct cblock_range *result)
3198	{
3199	char dummy;
3200	uint64_t b, e;
3201	int r;
3202
3203	/*
3204	* Try and parse form (ii) first.
3205	*/
3206	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3207	if (r < `0`)
3208	return r;
3209
3210	if (r == `2`) {
3211	result->begin = to_cblock(b);
3212	result->end = to_cblock(b: e);
3213	return `0`;
3214	}
3215
3216	/*
3217	* That didn't work, try form (i).
3218	*/
3219	r = sscanf(str, "%llu%c", &b, &dummy);
3220	if (r < `0`)
3221	return r;
3222
3223	if (r == `1`) {
3224	result->begin = to_cblock(b);
3225	result->end = to_cblock(b: from_cblock(b: result->begin) + `1u`);
3226	return `0`;
3227	}
3228
3229	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3230	return -EINVAL;
3231	}
3232
3233	static int validate_cblock_range(struct cache cache, struct* cblock_range *range)
3234	{
3235	uint64_t b = from_cblock(b: range->begin);
3236	uint64_t e = from_cblock(b: range->end);
3237	uint64_t n = from_cblock(b: cache->cache_size);
3238
3239	if (b >= n) {
3240	DMERR("%s: begin cblock out of range: %llu >= %llu",
3241	cache_device_name(cache), b, n);
3242	return -EINVAL;
3243	}
3244
3245	if (e > n) {
3246	DMERR("%s: end cblock out of range: %llu > %llu",
3247	cache_device_name(cache), e, n);
3248	return -EINVAL;
3249	}
3250
3251	if (b >= e) {
3252	DMERR("%s: invalid cblock range: %llu >= %llu",
3253	cache_device_name(cache), b, e);
3254	return -EINVAL;
3255	}
3256
3257	return `0`;
3258	}
3259
3260	static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3261	{
3262	return to_cblock(b: from_cblock(b) + `1`);
3263	}
3264
3265	static int request_invalidation(struct cache cache, struct* cblock_range *range)
3266	{
3267	int r = `0`;
3268
3269	/*
3270	* We don't need to do any locking here because we know we're in
3271	* passthrough mode. There's is potential for a race between an
3272	* invalidation triggered by an io and an invalidation message. This
3273	* is harmless, we must not worry if the policy call fails.
3274	*/
3275	while (range->begin != range->end) {
3276	r = invalidate_cblock(cache, cblock: range->begin);
3277	if (r)
3278	return r;
3279
3280	range->begin = cblock_succ(b: range->begin);
3281	}
3282
3283	cache->commit_requested = true;
3284	return r;
3285	}
3286
3287	static int process_invalidate_cblocks_message(struct cache cache, unsigned* int count,
3288	const char **cblock_ranges)
3289	{
3290	int r = `0`;
3291	unsigned int i;
3292	struct cblock_range range;
3293
3294	if (!passthrough_mode(cache)) {
3295	DMERR("%s: cache has to be in passthrough mode for invalidation",
3296	cache_device_name(cache));
3297	return -EPERM;
3298	}
3299
3300	for (i = `0`; i < count; i++) {
3301	r = parse_cblock_range(cache, str: cblock_ranges[i], result: &range);
3302	if (r)
3303	break;
3304
3305	r = validate_cblock_range(cache, range: &range);
3306	if (r)
3307	break;
3308
3309	/*
3310	* Pass begin and end origin blocks to the worker and wake it.
3311	*/
3312	r = request_invalidation(cache, range: &range);
3313	if (r)
3314	break;
3315	}
3316
3317	return r;
3318	}
3319
3320	/*
3321	* Supports
3322	* "<key> <value>"
3323	* and
3324	* "invalidate_cblocks [(<begin>)\|(<begin>-<end>)]*
3325	*
3326	* The key migration_threshold is supported by the cache target core.
3327	*/
3328	static int cache_message(struct dm_target ti, unsigned* int argc, char **argv,
3329	char result, unsigned* int maxlen)
3330	{
3331	struct cache *cache = ti->private;
3332
3333	if (!argc)
3334	return -EINVAL;
3335
3336	if (get_cache_mode(cache) >= CM_READ_ONLY) {
3337	DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3338	cache_device_name(cache));
3339	return -EOPNOTSUPP;
3340	}
3341
3342	if (!strcasecmp(s1: argv[`0`], s2: "invalidate_cblocks"))
3343	return process_invalidate_cblocks_message(cache, count: argc - `1`, cblock_ranges: (const char **) argv + `1`);
3344
3345	if (argc != `2`)
3346	return -EINVAL;
3347
3348	return set_config_value(cache, key: argv[`0`], value: argv[`1`]);
3349	}
3350
3351	static int cache_iterate_devices(struct dm_target *ti,
3352	iterate_devices_callout_fn fn, void *data)
3353	{
3354	int r = `0`;
3355	struct cache *cache = ti->private;
3356
3357	r = fn(ti, cache->cache_dev, `0`, get_dev_size(dev: cache->cache_dev), data);
3358	if (!r)
3359	r = fn(ti, cache->origin_dev, `0`, ti->len, data);
3360
3361	return r;
3362	}
3363
3364	/*
3365	* If discard_passdown was enabled verify that the origin device
3366	* supports discards. Disable discard_passdown if not.
3367	*/
3368	static void disable_passdown_if_not_supported(struct cache *cache)
3369	{
3370	struct block_device *origin_bdev = cache->origin_dev->bdev;
3371	struct queue_limits *origin_limits = &bdev_get_queue(bdev: origin_bdev)->limits;
3372	const char *reason = NULL;
3373
3374	if (!cache->features.discard_passdown)
3375	return;
3376
3377	if (!bdev_max_discard_sectors(bdev: origin_bdev))
3378	reason = "discard unsupported";
3379
3380	else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3381	reason = "max discard sectors smaller than a block";
3382
3383	if (reason) {
3384	DMWARN("Origin device (%pg) %s: Disabling discard passdown.",
3385	origin_bdev, reason);
3386	cache->features.discard_passdown = false;
3387	}
3388	}
3389
3390	static void set_discard_limits(struct cache cache, struct* queue_limits *limits)
3391	{
3392	struct block_device *origin_bdev = cache->origin_dev->bdev;
3393	struct queue_limits *origin_limits = &bdev_get_queue(bdev: origin_bdev)->limits;
3394
3395	if (!cache->features.discard_passdown) {
3396	/ No passdown is done so setting own virtual limits /
3397	limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * `1024`,
3398	cache->origin_sectors);
3399	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3400	return;
3401	}
3402
3403	/*
3404	* cache_iterate_devices() is stacking both origin and fast device limits
3405	* but discards aren't passed to fast device, so inherit origin's limits.
3406	*/
3407	limits->max_discard_sectors = origin_limits->max_discard_sectors;
3408	limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3409	limits->discard_granularity = origin_limits->discard_granularity;
3410	limits->discard_alignment = origin_limits->discard_alignment;
3411	limits->discard_misaligned = origin_limits->discard_misaligned;
3412	}
3413
3414	static void cache_io_hints(struct dm_target ti, struct* queue_limits *limits)
3415	{
3416	struct cache *cache = ti->private;
3417	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3418
3419	/*
3420	* If the system-determined stacked limits are compatible with the
3421	* cache's blocksize (io_opt is a factor) do not override them.
3422	*/
3423	if (io_opt_sectors < cache->sectors_per_block \|\|
3424	do_div(io_opt_sectors, cache->sectors_per_block)) {
3425	blk_limits_io_min(limits, min: cache->sectors_per_block << SECTOR_SHIFT);
3426	blk_limits_io_opt(limits, opt: cache->sectors_per_block << SECTOR_SHIFT);
3427	}
3428
3429	disable_passdown_if_not_supported(cache);
3430	set_discard_limits(cache, limits);
3431	}
3432
/*----------------------------------------------------------------*/
3434
3435	static struct target_type cache_target = {
3436	.name = "cache",
3437	.version = {`2`, `2`, `0`},
3438	.module = THIS_MODULE,
3439	.ctr = cache_ctr,
3440	.dtr = cache_dtr,
3441	.map = cache_map,
3442	.end_io = cache_end_io,
3443	.postsuspend = cache_postsuspend,
3444	.preresume = cache_preresume,
3445	.resume = cache_resume,
3446	.status = cache_status,
3447	.message = cache_message,
3448	.iterate_devices = cache_iterate_devices,
3449	.io_hints = cache_io_hints,
3450	};
3451
3452	static int __init dm_cache_init(void)
3453	{
3454	int r;
3455
3456	migration_cache = KMEM_CACHE(dm_cache_migration, `0`);
3457	if (!migration_cache)
3458	return -ENOMEM;
3459
3460	r = dm_register_target(t: &cache_target);
3461	if (r) {
3462	kmem_cache_destroy(s: migration_cache);
3463	return r;
3464	}
3465
3466	return `0`;
3467	}
3468
3469	static void __exit dm_cache_exit(void)
3470	{
3471	dm_unregister_target(t: &cache_target);
3472	kmem_cache_destroy(s: migration_cache);
3473	}
3474
/* Module entry/exit hooks and module metadata. */
module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");
3481

source code of linux/drivers/md/dm-cache-target.c