1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2011-2012 Red Hat UK.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm-thin-metadata.h"
9#include "dm-bio-prison-v1.h"
10#include "dm.h"
11
12#include <linux/device-mapper.h>
13#include <linux/dm-io.h>
14#include <linux/dm-kcopyd.h>
15#include <linux/jiffies.h>
16#include <linux/log2.h>
17#include <linux/list.h>
18#include <linux/rculist.h>
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/vmalloc.h>
23#include <linux/sort.h>
24#include <linux/rbtree.h>
25
26#define DM_MSG_PREFIX "thin"
27
28/*
29 * Tunable constants
30 */
31#define ENDIO_HOOK_POOL_SIZE 1024
32#define MAPPING_POOL_SIZE 1024
33#define COMMIT_PERIOD HZ
34#define NO_SPACE_TIMEOUT_SECS 60
35
36static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
37
38DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
39 "A percentage of time allocated for copy on write");
40
41/*
42 * The block size of the device holding pool data must be
43 * between 64KB and 1GB.
44 */
45#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
46#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
47
48/*
49 * Device id is restricted to 24 bits.
50 */
51#define MAX_DEV_ID ((1 << 24) - 1)
52
53/*
54 * How do we handle breaking sharing of data blocks?
55 * =================================================
56 *
57 * We use a standard copy-on-write btree to store the mappings for the
58 * devices (note I'm talking about copy-on-write of the metadata here, not
59 * the data). When you take an internal snapshot you clone the root node
60 * of the origin btree. After this there is no concept of an origin or a
61 * snapshot. They are just two device trees that happen to point to the
62 * same data blocks.
63 *
64 * When we get a write in we decide if it's to a shared data block using
65 * some timestamp magic. If it is, we have to break sharing.
66 *
67 * Let's say we write to a shared block in what was the origin. The
68 * steps are:
69 *
70 * i) plug io further to this physical block. (see bio_prison code).
71 *
72 * ii) quiesce any read io to that shared data block. Obviously
73 * including all devices that share this block. (see dm_deferred_set code)
74 *
75 * iii) copy the data block to a newly allocate block. This step can be
76 * missed out if the io covers the block. (schedule_copy).
77 *
78 * iv) insert the new mapping into the origin's btree
79 * (process_prepared_mapping). This act of inserting breaks some
80 * sharing of btree nodes between the two devices. Breaking sharing only
81 * effects the btree of that specific device. Btrees for the other
82 * devices that share the block never change. The btree for the origin
83 * device as it was after the last commit is untouched, ie. we're using
84 * persistent data structures in the functional programming sense.
85 *
86 * v) unplug io to this physical block, including the io that triggered
87 * the breaking of sharing.
88 *
89 * Steps (ii) and (iii) occur in parallel.
90 *
91 * The metadata _doesn't_ need to be committed before the io continues. We
92 * get away with this because the io is always written to a _new_ block.
93 * If there's a crash, then:
94 *
95 * - The origin mapping will point to the old origin block (the shared
96 * one). This will contain the data as it was before the io that triggered
97 * the breaking of sharing came in.
98 *
99 * - The snap mapping still points to the old block. As it would after
100 * the commit.
101 *
102 * The downside of this scheme is the timestamp magic isn't perfect, and
103 * will continue to think that data block in the snapshot device is shared
104 * even after the write to the origin has broken sharing. I suspect data
105 * blocks will typically be shared by many different devices, so we're
106 * breaking sharing n + 1 times, rather than n, where n is the number of
107 * devices that reference this data block. At the moment I think the
108 * benefits far, far outweigh the disadvantages.
109 */
110
111/*----------------------------------------------------------------*/
112
113/*
114 * Key building.
115 */
116enum lock_space {
117 VIRTUAL,
118 PHYSICAL
119};
120
121static bool build_key(struct dm_thin_device *td, enum lock_space ls,
122 dm_block_t b, dm_block_t e, struct dm_cell_key *key)
123{
124 key->virtual = (ls == VIRTUAL);
125 key->dev = dm_thin_dev_id(td);
126 key->block_begin = b;
127 key->block_end = e;
128
129 return dm_cell_key_has_valid_range(key);
130}
131
132static void build_data_key(struct dm_thin_device *td, dm_block_t b,
133 struct dm_cell_key *key)
134{
135 (void) build_key(td, ls: PHYSICAL, b, e: b + 1llu, key);
136}
137
138static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
139 struct dm_cell_key *key)
140{
141 (void) build_key(td, ls: VIRTUAL, b, e: b + 1llu, key);
142}
143
144/*----------------------------------------------------------------*/
145
146#define THROTTLE_THRESHOLD (1 * HZ)
147
148struct throttle {
149 struct rw_semaphore lock;
150 unsigned long threshold;
151 bool throttle_applied;
152};
153
154static void throttle_init(struct throttle *t)
155{
156 init_rwsem(&t->lock);
157 t->throttle_applied = false;
158}
159
160static void throttle_work_start(struct throttle *t)
161{
162 t->threshold = jiffies + THROTTLE_THRESHOLD;
163}
164
165static void throttle_work_update(struct throttle *t)
166{
167 if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
168 down_write(sem: &t->lock);
169 t->throttle_applied = true;
170 }
171}
172
173static void throttle_work_complete(struct throttle *t)
174{
175 if (t->throttle_applied) {
176 t->throttle_applied = false;
177 up_write(sem: &t->lock);
178 }
179}
180
181static void throttle_lock(struct throttle *t)
182{
183 down_read(sem: &t->lock);
184}
185
186static void throttle_unlock(struct throttle *t)
187{
188 up_read(sem: &t->lock);
189}
190
191/*----------------------------------------------------------------*/
192
193/*
194 * A pool device ties together a metadata device and a data device. It
195 * also provides the interface for creating and destroying internal
196 * devices.
197 */
198struct dm_thin_new_mapping;
199
200/*
201 * The pool runs in various modes. Ordered in degraded order for comparisons.
202 */
203enum pool_mode {
204 PM_WRITE, /* metadata may be changed */
205 PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
206
207 /*
208 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
209 */
210 PM_OUT_OF_METADATA_SPACE,
211 PM_READ_ONLY, /* metadata may not be changed */
212
213 PM_FAIL, /* all I/O fails */
214};
215
216struct pool_features {
217 enum pool_mode mode;
218
219 bool zero_new_blocks:1;
220 bool discard_enabled:1;
221 bool discard_passdown:1;
222 bool error_if_no_space:1;
223};
224
225struct thin_c;
226typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
227typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
228typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
229
230#define CELL_SORT_ARRAY_SIZE 8192
231
232struct pool {
233 struct list_head list;
234 struct dm_target *ti; /* Only set if a pool target is bound */
235
236 struct mapped_device *pool_md;
237 struct block_device *data_dev;
238 struct block_device *md_dev;
239 struct dm_pool_metadata *pmd;
240
241 dm_block_t low_water_blocks;
242 uint32_t sectors_per_block;
243 int sectors_per_block_shift;
244
245 struct pool_features pf;
246 bool low_water_triggered:1; /* A dm event has been sent */
247 bool suspended:1;
248 bool out_of_data_space:1;
249
250 struct dm_bio_prison *prison;
251 struct dm_kcopyd_client *copier;
252
253 struct work_struct worker;
254 struct workqueue_struct *wq;
255 struct throttle throttle;
256 struct delayed_work waker;
257 struct delayed_work no_space_timeout;
258
259 unsigned long last_commit_jiffies;
260 unsigned int ref_count;
261
262 spinlock_t lock;
263 struct bio_list deferred_flush_bios;
264 struct bio_list deferred_flush_completions;
265 struct list_head prepared_mappings;
266 struct list_head prepared_discards;
267 struct list_head prepared_discards_pt2;
268 struct list_head active_thins;
269
270 struct dm_deferred_set *shared_read_ds;
271 struct dm_deferred_set *all_io_ds;
272
273 struct dm_thin_new_mapping *next_mapping;
274
275 process_bio_fn process_bio;
276 process_bio_fn process_discard;
277
278 process_cell_fn process_cell;
279 process_cell_fn process_discard_cell;
280
281 process_mapping_fn process_prepared_mapping;
282 process_mapping_fn process_prepared_discard;
283 process_mapping_fn process_prepared_discard_pt2;
284
285 struct dm_bio_prison_cell **cell_sort_array;
286
287 mempool_t mapping_pool;
288};
289
290static void metadata_operation_failed(struct pool *pool, const char *op, int r);
291
292static enum pool_mode get_pool_mode(struct pool *pool)
293{
294 return pool->pf.mode;
295}
296
297static void notify_of_pool_mode_change(struct pool *pool)
298{
299 static const char *descs[] = {
300 "write",
301 "out-of-data-space",
302 "read-only",
303 "read-only",
304 "fail"
305 };
306 const char *extra_desc = NULL;
307 enum pool_mode mode = get_pool_mode(pool);
308
309 if (mode == PM_OUT_OF_DATA_SPACE) {
310 if (!pool->pf.error_if_no_space)
311 extra_desc = " (queue IO)";
312 else
313 extra_desc = " (error IO)";
314 }
315
316 dm_table_event(t: pool->ti->table);
317 DMINFO("%s: switching pool to %s%s mode",
318 dm_device_name(pool->pool_md),
319 descs[(int)mode], extra_desc ? : "");
320}
321
322/*
323 * Target context for a pool.
324 */
325struct pool_c {
326 struct dm_target *ti;
327 struct pool *pool;
328 struct dm_dev *data_dev;
329 struct dm_dev *metadata_dev;
330
331 dm_block_t low_water_blocks;
332 struct pool_features requested_pf; /* Features requested during table load */
333 struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
334};
335
336/*
337 * Target context for a thin.
338 */
339struct thin_c {
340 struct list_head list;
341 struct dm_dev *pool_dev;
342 struct dm_dev *origin_dev;
343 sector_t origin_size;
344 dm_thin_id dev_id;
345
346 struct pool *pool;
347 struct dm_thin_device *td;
348 struct mapped_device *thin_md;
349
350 bool requeue_mode:1;
351 spinlock_t lock;
352 struct list_head deferred_cells;
353 struct bio_list deferred_bio_list;
354 struct bio_list retry_on_resume_list;
355 struct rb_root sort_bio_list; /* sorted list of deferred bios */
356
357 /*
358 * Ensures the thin is not destroyed until the worker has finished
359 * iterating the active_thins list.
360 */
361 refcount_t refcount;
362 struct completion can_destroy;
363};
364
365/*----------------------------------------------------------------*/
366
367static bool block_size_is_power_of_two(struct pool *pool)
368{
369 return pool->sectors_per_block_shift >= 0;
370}
371
372static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
373{
374 return block_size_is_power_of_two(pool) ?
375 (b << pool->sectors_per_block_shift) :
376 (b * pool->sectors_per_block);
377}
378
379/*----------------------------------------------------------------*/
380
381struct discard_op {
382 struct thin_c *tc;
383 struct blk_plug plug;
384 struct bio *parent_bio;
385 struct bio *bio;
386};
387
388static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
389{
390 BUG_ON(!parent);
391
392 op->tc = tc;
393 blk_start_plug(&op->plug);
394 op->parent_bio = parent;
395 op->bio = NULL;
396}
397
398static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
399{
400 struct thin_c *tc = op->tc;
401 sector_t s = block_to_sectors(pool: tc->pool, b: data_b);
402 sector_t len = block_to_sectors(pool: tc->pool, b: data_e - data_b);
403
404 return __blkdev_issue_discard(bdev: tc->pool_dev->bdev, sector: s, nr_sects: len, GFP_NOIO, biop: &op->bio);
405}
406
407static void end_discard(struct discard_op *op, int r)
408{
409 if (op->bio) {
410 /*
411 * Even if one of the calls to issue_discard failed, we
412 * need to wait for the chain to complete.
413 */
414 bio_chain(op->bio, op->parent_bio);
415 op->bio->bi_opf = REQ_OP_DISCARD;
416 submit_bio(bio: op->bio);
417 }
418
419 blk_finish_plug(&op->plug);
420
421 /*
422 * Even if r is set, there could be sub discards in flight that we
423 * need to wait for.
424 */
425 if (r && !op->parent_bio->bi_status)
426 op->parent_bio->bi_status = errno_to_blk_status(errno: r);
427 bio_endio(op->parent_bio);
428}
429
430/*----------------------------------------------------------------*/
431
432/*
433 * wake_worker() is used when new work is queued and when pool_resume is
434 * ready to continue deferred IO processing.
435 */
436static void wake_worker(struct pool *pool)
437{
438 queue_work(wq: pool->wq, work: &pool->worker);
439}
440
441/*----------------------------------------------------------------*/
442
443static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
444 struct dm_bio_prison_cell **cell_result)
445{
446 int r;
447 struct dm_bio_prison_cell *cell_prealloc;
448
449 /*
450 * Allocate a cell from the prison's mempool.
451 * This might block but it can't fail.
452 */
453 cell_prealloc = dm_bio_prison_alloc_cell(prison: pool->prison, GFP_NOIO);
454
455 r = dm_bio_detain(prison: pool->prison, key, inmate: bio, cell_prealloc, cell_result);
456 if (r) {
457 /*
458 * We reused an old cell; we can get rid of
459 * the new one.
460 */
461 dm_bio_prison_free_cell(prison: pool->prison, cell: cell_prealloc);
462 }
463
464 return r;
465}
466
467static void cell_release(struct pool *pool,
468 struct dm_bio_prison_cell *cell,
469 struct bio_list *bios)
470{
471 dm_cell_release(prison: pool->prison, cell, bios);
472 dm_bio_prison_free_cell(prison: pool->prison, cell);
473}
474
475static void cell_visit_release(struct pool *pool,
476 void (*fn)(void *, struct dm_bio_prison_cell *),
477 void *context,
478 struct dm_bio_prison_cell *cell)
479{
480 dm_cell_visit_release(prison: pool->prison, visit_fn: fn, context, cell);
481 dm_bio_prison_free_cell(prison: pool->prison, cell);
482}
483
484static void cell_release_no_holder(struct pool *pool,
485 struct dm_bio_prison_cell *cell,
486 struct bio_list *bios)
487{
488 dm_cell_release_no_holder(prison: pool->prison, cell, inmates: bios);
489 dm_bio_prison_free_cell(prison: pool->prison, cell);
490}
491
492static void cell_error_with_code(struct pool *pool,
493 struct dm_bio_prison_cell *cell, blk_status_t error_code)
494{
495 dm_cell_error(prison: pool->prison, cell, error: error_code);
496 dm_bio_prison_free_cell(prison: pool->prison, cell);
497}
498
499static blk_status_t get_pool_io_error_code(struct pool *pool)
500{
501 return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
502}
503
504static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
505{
506 cell_error_with_code(pool, cell, error_code: get_pool_io_error_code(pool));
507}
508
509static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
510{
511 cell_error_with_code(pool, cell, error_code: 0);
512}
513
514static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
515{
516 cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
517}
518
519/*----------------------------------------------------------------*/
520
521/*
522 * A global list of pools that uses a struct mapped_device as a key.
523 */
524static struct dm_thin_pool_table {
525 struct mutex mutex;
526 struct list_head pools;
527} dm_thin_pool_table;
528
529static void pool_table_init(void)
530{
531 mutex_init(&dm_thin_pool_table.mutex);
532 INIT_LIST_HEAD(list: &dm_thin_pool_table.pools);
533}
534
535static void pool_table_exit(void)
536{
537 mutex_destroy(lock: &dm_thin_pool_table.mutex);
538}
539
540static void __pool_table_insert(struct pool *pool)
541{
542 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
543 list_add(new: &pool->list, head: &dm_thin_pool_table.pools);
544}
545
546static void __pool_table_remove(struct pool *pool)
547{
548 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
549 list_del(entry: &pool->list);
550}
551
552static struct pool *__pool_table_lookup(struct mapped_device *md)
553{
554 struct pool *pool = NULL, *tmp;
555
556 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
557
558 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
559 if (tmp->pool_md == md) {
560 pool = tmp;
561 break;
562 }
563 }
564
565 return pool;
566}
567
568static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
569{
570 struct pool *pool = NULL, *tmp;
571
572 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
573
574 list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
575 if (tmp->md_dev == md_dev) {
576 pool = tmp;
577 break;
578 }
579 }
580
581 return pool;
582}
583
584/*----------------------------------------------------------------*/
585
586struct dm_thin_endio_hook {
587 struct thin_c *tc;
588 struct dm_deferred_entry *shared_read_entry;
589 struct dm_deferred_entry *all_io_entry;
590 struct dm_thin_new_mapping *overwrite_mapping;
591 struct rb_node rb_node;
592 struct dm_bio_prison_cell *cell;
593};
594
595static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
596{
597 bio_list_merge(bl: bios, bl2: master);
598 bio_list_init(bl: master);
599}
600
601static void error_bio_list(struct bio_list *bios, blk_status_t error)
602{
603 struct bio *bio;
604
605 while ((bio = bio_list_pop(bl: bios))) {
606 bio->bi_status = error;
607 bio_endio(bio);
608 }
609}
610
611static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
612 blk_status_t error)
613{
614 struct bio_list bios;
615
616 bio_list_init(bl: &bios);
617
618 spin_lock_irq(lock: &tc->lock);
619 __merge_bio_list(bios: &bios, master);
620 spin_unlock_irq(lock: &tc->lock);
621
622 error_bio_list(bios: &bios, error);
623}
624
625static void requeue_deferred_cells(struct thin_c *tc)
626{
627 struct pool *pool = tc->pool;
628 struct list_head cells;
629 struct dm_bio_prison_cell *cell, *tmp;
630
631 INIT_LIST_HEAD(list: &cells);
632
633 spin_lock_irq(lock: &tc->lock);
634 list_splice_init(list: &tc->deferred_cells, head: &cells);
635 spin_unlock_irq(lock: &tc->lock);
636
637 list_for_each_entry_safe(cell, tmp, &cells, user_list)
638 cell_requeue(pool, cell);
639}
640
641static void requeue_io(struct thin_c *tc)
642{
643 struct bio_list bios;
644
645 bio_list_init(bl: &bios);
646
647 spin_lock_irq(lock: &tc->lock);
648 __merge_bio_list(bios: &bios, master: &tc->deferred_bio_list);
649 __merge_bio_list(bios: &bios, master: &tc->retry_on_resume_list);
650 spin_unlock_irq(lock: &tc->lock);
651
652 error_bio_list(bios: &bios, BLK_STS_DM_REQUEUE);
653 requeue_deferred_cells(tc);
654}
655
656static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
657{
658 struct thin_c *tc;
659
660 rcu_read_lock();
661 list_for_each_entry_rcu(tc, &pool->active_thins, list)
662 error_thin_bio_list(tc, master: &tc->retry_on_resume_list, error);
663 rcu_read_unlock();
664}
665
666static void error_retry_list(struct pool *pool)
667{
668 error_retry_list_with_code(pool, error: get_pool_io_error_code(pool));
669}
670
671/*
672 * This section of code contains the logic for processing a thin device's IO.
673 * Much of the code depends on pool object resources (lists, workqueues, etc)
674 * but most is exclusively called from the thin target rather than the thin-pool
675 * target.
676 */
677
678static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
679{
680 struct pool *pool = tc->pool;
681 sector_t block_nr = bio->bi_iter.bi_sector;
682
683 if (block_size_is_power_of_two(pool))
684 block_nr >>= pool->sectors_per_block_shift;
685 else
686 (void) sector_div(block_nr, pool->sectors_per_block);
687
688 return block_nr;
689}
690
691/*
692 * Returns the _complete_ blocks that this bio covers.
693 */
694static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
695 dm_block_t *begin, dm_block_t *end)
696{
697 struct pool *pool = tc->pool;
698 sector_t b = bio->bi_iter.bi_sector;
699 sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
700
701 b += pool->sectors_per_block - 1ull; /* so we round up */
702
703 if (block_size_is_power_of_two(pool)) {
704 b >>= pool->sectors_per_block_shift;
705 e >>= pool->sectors_per_block_shift;
706 } else {
707 (void) sector_div(b, pool->sectors_per_block);
708 (void) sector_div(e, pool->sectors_per_block);
709 }
710
711 if (e < b) {
712 /* Can happen if the bio is within a single block. */
713 e = b;
714 }
715
716 *begin = b;
717 *end = e;
718}
719
720static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
721{
722 struct pool *pool = tc->pool;
723 sector_t bi_sector = bio->bi_iter.bi_sector;
724
725 bio_set_dev(bio, bdev: tc->pool_dev->bdev);
726 if (block_size_is_power_of_two(pool)) {
727 bio->bi_iter.bi_sector =
728 (block << pool->sectors_per_block_shift) |
729 (bi_sector & (pool->sectors_per_block - 1));
730 } else {
731 bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
732 sector_div(bi_sector, pool->sectors_per_block);
733 }
734}
735
736static void remap_to_origin(struct thin_c *tc, struct bio *bio)
737{
738 bio_set_dev(bio, bdev: tc->origin_dev->bdev);
739}
740
741static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
742{
743 return op_is_flush(op: bio->bi_opf) &&
744 dm_thin_changed_this_transaction(td: tc->td);
745}
746
747static void inc_all_io_entry(struct pool *pool, struct bio *bio)
748{
749 struct dm_thin_endio_hook *h;
750
751 if (bio_op(bio) == REQ_OP_DISCARD)
752 return;
753
754 h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
755 h->all_io_entry = dm_deferred_entry_inc(ds: pool->all_io_ds);
756}
757
758static void issue(struct thin_c *tc, struct bio *bio)
759{
760 struct pool *pool = tc->pool;
761
762 if (!bio_triggers_commit(tc, bio)) {
763 dm_submit_bio_remap(clone: bio, NULL);
764 return;
765 }
766
767 /*
768 * Complete bio with an error if earlier I/O caused changes to
769 * the metadata that can't be committed e.g, due to I/O errors
770 * on the metadata device.
771 */
772 if (dm_thin_aborted_changes(td: tc->td)) {
773 bio_io_error(bio);
774 return;
775 }
776
777 /*
778 * Batch together any bios that trigger commits and then issue a
779 * single commit for them in process_deferred_bios().
780 */
781 spin_lock_irq(lock: &pool->lock);
782 bio_list_add(bl: &pool->deferred_flush_bios, bio);
783 spin_unlock_irq(lock: &pool->lock);
784}
785
786static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
787{
788 remap_to_origin(tc, bio);
789 issue(tc, bio);
790}
791
792static void remap_and_issue(struct thin_c *tc, struct bio *bio,
793 dm_block_t block)
794{
795 remap(tc, bio, block);
796 issue(tc, bio);
797}
798
799/*----------------------------------------------------------------*/
800
801/*
802 * Bio endio functions.
803 */
804struct dm_thin_new_mapping {
805 struct list_head list;
806
807 bool pass_discard:1;
808 bool maybe_shared:1;
809
810 /*
811 * Track quiescing, copying and zeroing preparation actions. When this
812 * counter hits zero the block is prepared and can be inserted into the
813 * btree.
814 */
815 atomic_t prepare_actions;
816
817 blk_status_t status;
818 struct thin_c *tc;
819 dm_block_t virt_begin, virt_end;
820 dm_block_t data_block;
821 struct dm_bio_prison_cell *cell;
822
823 /*
824 * If the bio covers the whole area of a block then we can avoid
825 * zeroing or copying. Instead this bio is hooked. The bio will
826 * still be in the cell, so care has to be taken to avoid issuing
827 * the bio twice.
828 */
829 struct bio *bio;
830 bio_end_io_t *saved_bi_end_io;
831};
832
833static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
834{
835 struct pool *pool = m->tc->pool;
836
837 if (atomic_dec_and_test(v: &m->prepare_actions)) {
838 list_add_tail(new: &m->list, head: &pool->prepared_mappings);
839 wake_worker(pool);
840 }
841}
842
843static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
844{
845 unsigned long flags;
846 struct pool *pool = m->tc->pool;
847
848 spin_lock_irqsave(&pool->lock, flags);
849 __complete_mapping_preparation(m);
850 spin_unlock_irqrestore(lock: &pool->lock, flags);
851}
852
853static void copy_complete(int read_err, unsigned long write_err, void *context)
854{
855 struct dm_thin_new_mapping *m = context;
856
857 m->status = read_err || write_err ? BLK_STS_IOERR : 0;
858 complete_mapping_preparation(m);
859}
860
861static void overwrite_endio(struct bio *bio)
862{
863 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
864 struct dm_thin_new_mapping *m = h->overwrite_mapping;
865
866 bio->bi_end_io = m->saved_bi_end_io;
867
868 m->status = bio->bi_status;
869 complete_mapping_preparation(m);
870}
871
872/*----------------------------------------------------------------*/
873
874/*
875 * Workqueue.
876 */
877
878/*
879 * Prepared mapping jobs.
880 */
881
882/*
883 * This sends the bios in the cell, except the original holder, back
884 * to the deferred_bios list.
885 */
886static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
887{
888 struct pool *pool = tc->pool;
889 unsigned long flags;
890 struct bio_list bios;
891
892 bio_list_init(bl: &bios);
893 cell_release_no_holder(pool, cell, bios: &bios);
894
895 if (!bio_list_empty(bl: &bios)) {
896 spin_lock_irqsave(&tc->lock, flags);
897 bio_list_merge(bl: &tc->deferred_bio_list, bl2: &bios);
898 spin_unlock_irqrestore(lock: &tc->lock, flags);
899 wake_worker(pool);
900 }
901}
902
903static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
904
905struct remap_info {
906 struct thin_c *tc;
907 struct bio_list defer_bios;
908 struct bio_list issue_bios;
909};
910
911static void __inc_remap_and_issue_cell(void *context,
912 struct dm_bio_prison_cell *cell)
913{
914 struct remap_info *info = context;
915 struct bio *bio;
916
917 while ((bio = bio_list_pop(bl: &cell->bios))) {
918 if (op_is_flush(op: bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
919 bio_list_add(bl: &info->defer_bios, bio);
920 else {
921 inc_all_io_entry(pool: info->tc->pool, bio);
922
923 /*
924 * We can't issue the bios with the bio prison lock
925 * held, so we add them to a list to issue on
926 * return from this function.
927 */
928 bio_list_add(bl: &info->issue_bios, bio);
929 }
930 }
931}
932
933static void inc_remap_and_issue_cell(struct thin_c *tc,
934 struct dm_bio_prison_cell *cell,
935 dm_block_t block)
936{
937 struct bio *bio;
938 struct remap_info info;
939
940 info.tc = tc;
941 bio_list_init(bl: &info.defer_bios);
942 bio_list_init(bl: &info.issue_bios);
943
944 /*
945 * We have to be careful to inc any bios we're about to issue
946 * before the cell is released, and avoid a race with new bios
947 * being added to the cell.
948 */
949 cell_visit_release(pool: tc->pool, fn: __inc_remap_and_issue_cell,
950 context: &info, cell);
951
952 while ((bio = bio_list_pop(bl: &info.defer_bios)))
953 thin_defer_bio(tc, bio);
954
955 while ((bio = bio_list_pop(bl: &info.issue_bios)))
956 remap_and_issue(tc: info.tc, bio, block);
957}
958
959static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
960{
961 cell_error(pool: m->tc->pool, cell: m->cell);
962 list_del(entry: &m->list);
963 mempool_free(element: m, pool: &m->tc->pool->mapping_pool);
964}
965
966static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
967{
968 struct pool *pool = tc->pool;
969
970 /*
971 * If the bio has the REQ_FUA flag set we must commit the metadata
972 * before signaling its completion.
973 */
974 if (!bio_triggers_commit(tc, bio)) {
975 bio_endio(bio);
976 return;
977 }
978
979 /*
980 * Complete bio with an error if earlier I/O caused changes to the
981 * metadata that can't be committed, e.g, due to I/O errors on the
982 * metadata device.
983 */
984 if (dm_thin_aborted_changes(td: tc->td)) {
985 bio_io_error(bio);
986 return;
987 }
988
989 /*
990 * Batch together any bios that trigger commits and then issue a
991 * single commit for them in process_deferred_bios().
992 */
993 spin_lock_irq(lock: &pool->lock);
994 bio_list_add(bl: &pool->deferred_flush_completions, bio);
995 spin_unlock_irq(lock: &pool->lock);
996}
997
998static void process_prepared_mapping(struct dm_thin_new_mapping *m)
999{
1000 struct thin_c *tc = m->tc;
1001 struct pool *pool = tc->pool;
1002 struct bio *bio = m->bio;
1003 int r;
1004
1005 if (m->status) {
1006 cell_error(pool, cell: m->cell);
1007 goto out;
1008 }
1009
1010 /*
1011 * Commit the prepared block into the mapping btree.
1012 * Any I/O for this block arriving after this point will get
1013 * remapped to it directly.
1014 */
1015 r = dm_thin_insert_block(td: tc->td, block: m->virt_begin, data_block: m->data_block);
1016 if (r) {
1017 metadata_operation_failed(pool, op: "dm_thin_insert_block", r);
1018 cell_error(pool, cell: m->cell);
1019 goto out;
1020 }
1021
1022 /*
1023 * Release any bios held while the block was being provisioned.
1024 * If we are processing a write bio that completely covers the block,
1025 * we already processed it so can ignore it now when processing
1026 * the bios in the cell.
1027 */
1028 if (bio) {
1029 inc_remap_and_issue_cell(tc, cell: m->cell, block: m->data_block);
1030 complete_overwrite_bio(tc, bio);
1031 } else {
1032 inc_all_io_entry(pool: tc->pool, bio: m->cell->holder);
1033 remap_and_issue(tc, bio: m->cell->holder, block: m->data_block);
1034 inc_remap_and_issue_cell(tc, cell: m->cell, block: m->data_block);
1035 }
1036
1037out:
1038 list_del(entry: &m->list);
1039 mempool_free(element: m, pool: &pool->mapping_pool);
1040}
1041
1042/*----------------------------------------------------------------*/
1043
1044static void free_discard_mapping(struct dm_thin_new_mapping *m)
1045{
1046 struct thin_c *tc = m->tc;
1047
1048 if (m->cell)
1049 cell_defer_no_holder(tc, cell: m->cell);
1050 mempool_free(element: m, pool: &tc->pool->mapping_pool);
1051}
1052
1053static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1054{
1055 bio_io_error(bio: m->bio);
1056 free_discard_mapping(m);
1057}
1058
1059static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1060{
1061 bio_endio(m->bio);
1062 free_discard_mapping(m);
1063}
1064
1065static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1066{
1067 int r;
1068 struct thin_c *tc = m->tc;
1069
1070 r = dm_thin_remove_range(td: tc->td, begin: m->cell->key.block_begin, end: m->cell->key.block_end);
1071 if (r) {
1072 metadata_operation_failed(pool: tc->pool, op: "dm_thin_remove_range", r);
1073 bio_io_error(bio: m->bio);
1074 } else
1075 bio_endio(m->bio);
1076
1077 cell_defer_no_holder(tc, cell: m->cell);
1078 mempool_free(element: m, pool: &tc->pool->mapping_pool);
1079}
1080
1081/*----------------------------------------------------------------*/
1082
1083static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1084 struct bio *discard_parent)
1085{
1086 /*
1087 * We've already unmapped this range of blocks, but before we
1088 * passdown we have to check that these blocks are now unused.
1089 */
1090 int r = 0;
1091 bool shared = true;
1092 struct thin_c *tc = m->tc;
1093 struct pool *pool = tc->pool;
1094 dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1095 struct discard_op op;
1096
1097 begin_discard(op: &op, tc, parent: discard_parent);
1098 while (b != end) {
1099 /* find start of unmapped run */
1100 for (; b < end; b++) {
1101 r = dm_pool_block_is_shared(pmd: pool->pmd, b, result: &shared);
1102 if (r)
1103 goto out;
1104
1105 if (!shared)
1106 break;
1107 }
1108
1109 if (b == end)
1110 break;
1111
1112 /* find end of run */
1113 for (e = b + 1; e != end; e++) {
1114 r = dm_pool_block_is_shared(pmd: pool->pmd, b: e, result: &shared);
1115 if (r)
1116 goto out;
1117
1118 if (shared)
1119 break;
1120 }
1121
1122 r = issue_discard(op: &op, data_b: b, data_e: e);
1123 if (r)
1124 goto out;
1125
1126 b = e;
1127 }
1128out:
1129 end_discard(op: &op, r);
1130}
1131
1132static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1133{
1134 unsigned long flags;
1135 struct pool *pool = m->tc->pool;
1136
1137 spin_lock_irqsave(&pool->lock, flags);
1138 list_add_tail(new: &m->list, head: &pool->prepared_discards_pt2);
1139 spin_unlock_irqrestore(lock: &pool->lock, flags);
1140 wake_worker(pool);
1141}
1142
1143static void passdown_endio(struct bio *bio)
1144{
1145 /*
1146 * It doesn't matter if the passdown discard failed, we still want
1147 * to unmap (we ignore err).
1148 */
1149 queue_passdown_pt2(m: bio->bi_private);
1150 bio_put(bio);
1151}
1152
1153static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1154{
1155 int r;
1156 struct thin_c *tc = m->tc;
1157 struct pool *pool = tc->pool;
1158 struct bio *discard_parent;
1159 dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1160
1161 /*
1162 * Only this thread allocates blocks, so we can be sure that the
1163 * newly unmapped blocks will not be allocated before the end of
1164 * the function.
1165 */
1166 r = dm_thin_remove_range(td: tc->td, begin: m->virt_begin, end: m->virt_end);
1167 if (r) {
1168 metadata_operation_failed(pool, op: "dm_thin_remove_range", r);
1169 bio_io_error(bio: m->bio);
1170 cell_defer_no_holder(tc, cell: m->cell);
1171 mempool_free(element: m, pool: &pool->mapping_pool);
1172 return;
1173 }
1174
1175 /*
1176 * Increment the unmapped blocks. This prevents a race between the
1177 * passdown io and reallocation of freed blocks.
1178 */
1179 r = dm_pool_inc_data_range(pmd: pool->pmd, b: m->data_block, e: data_end);
1180 if (r) {
1181 metadata_operation_failed(pool, op: "dm_pool_inc_data_range", r);
1182 bio_io_error(bio: m->bio);
1183 cell_defer_no_holder(tc, cell: m->cell);
1184 mempool_free(element: m, pool: &pool->mapping_pool);
1185 return;
1186 }
1187
1188 discard_parent = bio_alloc(NULL, nr_vecs: 1, opf: 0, GFP_NOIO);
1189 discard_parent->bi_end_io = passdown_endio;
1190 discard_parent->bi_private = m;
1191 if (m->maybe_shared)
1192 passdown_double_checking_shared_status(m, discard_parent);
1193 else {
1194 struct discard_op op;
1195
1196 begin_discard(op: &op, tc, parent: discard_parent);
1197 r = issue_discard(op: &op, data_b: m->data_block, data_e: data_end);
1198 end_discard(op: &op, r);
1199 }
1200}
1201
1202static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1203{
1204 int r;
1205 struct thin_c *tc = m->tc;
1206 struct pool *pool = tc->pool;
1207
1208 /*
1209 * The passdown has completed, so now we can decrement all those
1210 * unmapped blocks.
1211 */
1212 r = dm_pool_dec_data_range(pmd: pool->pmd, b: m->data_block,
1213 e: m->data_block + (m->virt_end - m->virt_begin));
1214 if (r) {
1215 metadata_operation_failed(pool, op: "dm_pool_dec_data_range", r);
1216 bio_io_error(bio: m->bio);
1217 } else
1218 bio_endio(m->bio);
1219
1220 cell_defer_no_holder(tc, cell: m->cell);
1221 mempool_free(element: m, pool: &pool->mapping_pool);
1222}
1223
1224static void process_prepared(struct pool *pool, struct list_head *head,
1225 process_mapping_fn *fn)
1226{
1227 struct list_head maps;
1228 struct dm_thin_new_mapping *m, *tmp;
1229
1230 INIT_LIST_HEAD(list: &maps);
1231 spin_lock_irq(lock: &pool->lock);
1232 list_splice_init(list: head, head: &maps);
1233 spin_unlock_irq(lock: &pool->lock);
1234
1235 list_for_each_entry_safe(m, tmp, &maps, list)
1236 (*fn)(m);
1237}
1238
1239/*
1240 * Deferred bio jobs.
1241 */
1242static int io_overlaps_block(struct pool *pool, struct bio *bio)
1243{
1244 return bio->bi_iter.bi_size ==
1245 (pool->sectors_per_block << SECTOR_SHIFT);
1246}
1247
1248static int io_overwrites_block(struct pool *pool, struct bio *bio)
1249{
1250 return (bio_data_dir(bio) == WRITE) &&
1251 io_overlaps_block(pool, bio);
1252}
1253
1254static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1255 bio_end_io_t *fn)
1256{
1257 *save = bio->bi_end_io;
1258 bio->bi_end_io = fn;
1259}
1260
1261static int ensure_next_mapping(struct pool *pool)
1262{
1263 if (pool->next_mapping)
1264 return 0;
1265
1266 pool->next_mapping = mempool_alloc(pool: &pool->mapping_pool, GFP_ATOMIC);
1267
1268 return pool->next_mapping ? 0 : -ENOMEM;
1269}
1270
1271static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1272{
1273 struct dm_thin_new_mapping *m = pool->next_mapping;
1274
1275 BUG_ON(!pool->next_mapping);
1276
1277 memset(m, 0, sizeof(struct dm_thin_new_mapping));
1278 INIT_LIST_HEAD(list: &m->list);
1279 m->bio = NULL;
1280
1281 pool->next_mapping = NULL;
1282
1283 return m;
1284}
1285
1286static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
1287 sector_t begin, sector_t end)
1288{
1289 struct dm_io_region to;
1290
1291 to.bdev = tc->pool_dev->bdev;
1292 to.sector = begin;
1293 to.count = end - begin;
1294
1295 dm_kcopyd_zero(kc: tc->pool->copier, num_dests: 1, dests: &to, flags: 0, fn: copy_complete, context: m);
1296}
1297
1298static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
1299 dm_block_t data_begin,
1300 struct dm_thin_new_mapping *m)
1301{
1302 struct pool *pool = tc->pool;
1303 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
1304
1305 h->overwrite_mapping = m;
1306 m->bio = bio;
1307 save_and_set_endio(bio, save: &m->saved_bi_end_io, fn: overwrite_endio);
1308 inc_all_io_entry(pool, bio);
1309 remap_and_issue(tc, bio, block: data_begin);
1310}
1311
1312/*
1313 * A partial copy also needs to zero the uncopied region.
1314 */
1315static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1316 struct dm_dev *origin, dm_block_t data_origin,
1317 dm_block_t data_dest,
1318 struct dm_bio_prison_cell *cell, struct bio *bio,
1319 sector_t len)
1320{
1321 struct pool *pool = tc->pool;
1322 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1323
1324 m->tc = tc;
1325 m->virt_begin = virt_block;
1326 m->virt_end = virt_block + 1u;
1327 m->data_block = data_dest;
1328 m->cell = cell;
1329
1330 /*
1331 * quiesce action + copy action + an extra reference held for the
1332 * duration of this function (we may need to inc later for a
1333 * partial zero).
1334 */
1335 atomic_set(v: &m->prepare_actions, i: 3);
1336
1337 if (!dm_deferred_set_add_work(ds: pool->shared_read_ds, work: &m->list))
1338 complete_mapping_preparation(m); /* already quiesced */
1339
1340 /*
1341 * IO to pool_dev remaps to the pool target's data_dev.
1342 *
1343 * If the whole block of data is being overwritten, we can issue the
1344 * bio immediately. Otherwise we use kcopyd to clone the data first.
1345 */
1346 if (io_overwrites_block(pool, bio))
1347 remap_and_issue_overwrite(tc, bio, data_begin: data_dest, m);
1348 else {
1349 struct dm_io_region from, to;
1350
1351 from.bdev = origin->bdev;
1352 from.sector = data_origin * pool->sectors_per_block;
1353 from.count = len;
1354
1355 to.bdev = tc->pool_dev->bdev;
1356 to.sector = data_dest * pool->sectors_per_block;
1357 to.count = len;
1358
1359 dm_kcopyd_copy(kc: pool->copier, from: &from, num_dests: 1, dests: &to,
1360 flags: 0, fn: copy_complete, context: m);
1361
1362 /*
1363 * Do we need to zero a tail region?
1364 */
1365 if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1366 atomic_inc(v: &m->prepare_actions);
1367 ll_zero(tc, m,
1368 begin: data_dest * pool->sectors_per_block + len,
1369 end: (data_dest + 1) * pool->sectors_per_block);
1370 }
1371 }
1372
1373 complete_mapping_preparation(m); /* drop our ref */
1374}
1375
1376static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1377 dm_block_t data_origin, dm_block_t data_dest,
1378 struct dm_bio_prison_cell *cell, struct bio *bio)
1379{
1380 schedule_copy(tc, virt_block, origin: tc->pool_dev,
1381 data_origin, data_dest, cell, bio,
1382 len: tc->pool->sectors_per_block);
1383}
1384
1385static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1386 dm_block_t data_block, struct dm_bio_prison_cell *cell,
1387 struct bio *bio)
1388{
1389 struct pool *pool = tc->pool;
1390 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1391
1392 atomic_set(v: &m->prepare_actions, i: 1); /* no need to quiesce */
1393 m->tc = tc;
1394 m->virt_begin = virt_block;
1395 m->virt_end = virt_block + 1u;
1396 m->data_block = data_block;
1397 m->cell = cell;
1398
1399 /*
1400 * If the whole block of data is being overwritten or we are not
1401 * zeroing pre-existing data, we can issue the bio immediately.
1402 * Otherwise we use kcopyd to zero the data first.
1403 */
1404 if (pool->pf.zero_new_blocks) {
1405 if (io_overwrites_block(pool, bio))
1406 remap_and_issue_overwrite(tc, bio, data_begin: data_block, m);
1407 else {
1408 ll_zero(tc, m, begin: data_block * pool->sectors_per_block,
1409 end: (data_block + 1) * pool->sectors_per_block);
1410 }
1411 } else
1412 process_prepared_mapping(m);
1413}
1414
1415static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1416 dm_block_t data_dest,
1417 struct dm_bio_prison_cell *cell, struct bio *bio)
1418{
1419 struct pool *pool = tc->pool;
1420 sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1421 sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1422
1423 if (virt_block_end <= tc->origin_size) {
1424 schedule_copy(tc, virt_block, origin: tc->origin_dev,
1425 data_origin: virt_block, data_dest, cell, bio,
1426 len: pool->sectors_per_block);
1427
1428 } else if (virt_block_begin < tc->origin_size) {
1429 schedule_copy(tc, virt_block, origin: tc->origin_dev,
1430 data_origin: virt_block, data_dest, cell, bio,
1431 len: tc->origin_size - virt_block_begin);
1432
1433 } else
1434 schedule_zero(tc, virt_block, data_block: data_dest, cell, bio);
1435}
1436
1437static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1438
1439static void requeue_bios(struct pool *pool);
1440
1441static bool is_read_only_pool_mode(enum pool_mode mode)
1442{
1443 return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1444}
1445
1446static bool is_read_only(struct pool *pool)
1447{
1448 return is_read_only_pool_mode(mode: get_pool_mode(pool));
1449}
1450
1451static void check_for_metadata_space(struct pool *pool)
1452{
1453 int r;
1454 const char *ooms_reason = NULL;
1455 dm_block_t nr_free;
1456
1457 r = dm_pool_get_free_metadata_block_count(pmd: pool->pmd, result: &nr_free);
1458 if (r)
1459 ooms_reason = "Could not get free metadata blocks";
1460 else if (!nr_free)
1461 ooms_reason = "No free metadata blocks";
1462
1463 if (ooms_reason && !is_read_only(pool)) {
1464 DMERR("%s", ooms_reason);
1465 set_pool_mode(pool, new_mode: PM_OUT_OF_METADATA_SPACE);
1466 }
1467}
1468
1469static void check_for_data_space(struct pool *pool)
1470{
1471 int r;
1472 dm_block_t nr_free;
1473
1474 if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1475 return;
1476
1477 r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &nr_free);
1478 if (r)
1479 return;
1480
1481 if (nr_free) {
1482 set_pool_mode(pool, new_mode: PM_WRITE);
1483 requeue_bios(pool);
1484 }
1485}
1486
1487/*
1488 * A non-zero return indicates read_only or fail_io mode.
1489 * Many callers don't care about the return value.
1490 */
1491static int commit(struct pool *pool)
1492{
1493 int r;
1494
1495 if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1496 return -EINVAL;
1497
1498 r = dm_pool_commit_metadata(pmd: pool->pmd);
1499 if (r)
1500 metadata_operation_failed(pool, op: "dm_pool_commit_metadata", r);
1501 else {
1502 check_for_metadata_space(pool);
1503 check_for_data_space(pool);
1504 }
1505
1506 return r;
1507}
1508
1509static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1510{
1511 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1512 DMWARN("%s: reached low water mark for data device: sending event.",
1513 dm_device_name(pool->pool_md));
1514 spin_lock_irq(lock: &pool->lock);
1515 pool->low_water_triggered = true;
1516 spin_unlock_irq(lock: &pool->lock);
1517 dm_table_event(t: pool->ti->table);
1518 }
1519}
1520
1521static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1522{
1523 int r;
1524 dm_block_t free_blocks;
1525 struct pool *pool = tc->pool;
1526
1527 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1528 return -EINVAL;
1529
1530 r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &free_blocks);
1531 if (r) {
1532 metadata_operation_failed(pool, op: "dm_pool_get_free_block_count", r);
1533 return r;
1534 }
1535
1536 check_low_water_mark(pool, free_blocks);
1537
1538 if (!free_blocks) {
1539 /*
1540 * Try to commit to see if that will free up some
1541 * more space.
1542 */
1543 r = commit(pool);
1544 if (r)
1545 return r;
1546
1547 r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &free_blocks);
1548 if (r) {
1549 metadata_operation_failed(pool, op: "dm_pool_get_free_block_count", r);
1550 return r;
1551 }
1552
1553 if (!free_blocks) {
1554 set_pool_mode(pool, new_mode: PM_OUT_OF_DATA_SPACE);
1555 return -ENOSPC;
1556 }
1557 }
1558
1559 r = dm_pool_alloc_data_block(pmd: pool->pmd, result);
1560 if (r) {
1561 if (r == -ENOSPC)
1562 set_pool_mode(pool, new_mode: PM_OUT_OF_DATA_SPACE);
1563 else
1564 metadata_operation_failed(pool, op: "dm_pool_alloc_data_block", r);
1565 return r;
1566 }
1567
1568 r = dm_pool_get_free_metadata_block_count(pmd: pool->pmd, result: &free_blocks);
1569 if (r) {
1570 metadata_operation_failed(pool, op: "dm_pool_get_free_metadata_block_count", r);
1571 return r;
1572 }
1573
1574 if (!free_blocks) {
1575 /* Let's commit before we use up the metadata reserve. */
1576 r = commit(pool);
1577 if (r)
1578 return r;
1579 }
1580
1581 return 0;
1582}
1583
1584/*
1585 * If we have run out of space, queue bios until the device is
1586 * resumed, presumably after having been reloaded with more space.
1587 */
1588static void retry_on_resume(struct bio *bio)
1589{
1590 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
1591 struct thin_c *tc = h->tc;
1592
1593 spin_lock_irq(lock: &tc->lock);
1594 bio_list_add(bl: &tc->retry_on_resume_list, bio);
1595 spin_unlock_irq(lock: &tc->lock);
1596}
1597
1598static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1599{
1600 enum pool_mode m = get_pool_mode(pool);
1601
1602 switch (m) {
1603 case PM_WRITE:
1604 /* Shouldn't get here */
1605 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1606 return BLK_STS_IOERR;
1607
1608 case PM_OUT_OF_DATA_SPACE:
1609 return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1610
1611 case PM_OUT_OF_METADATA_SPACE:
1612 case PM_READ_ONLY:
1613 case PM_FAIL:
1614 return BLK_STS_IOERR;
1615 default:
1616 /* Shouldn't get here */
1617 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1618 return BLK_STS_IOERR;
1619 }
1620}
1621
1622static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1623{
1624 blk_status_t error = should_error_unserviceable_bio(pool);
1625
1626 if (error) {
1627 bio->bi_status = error;
1628 bio_endio(bio);
1629 } else
1630 retry_on_resume(bio);
1631}
1632
1633static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1634{
1635 struct bio *bio;
1636 struct bio_list bios;
1637 blk_status_t error;
1638
1639 error = should_error_unserviceable_bio(pool);
1640 if (error) {
1641 cell_error_with_code(pool, cell, error_code: error);
1642 return;
1643 }
1644
1645 bio_list_init(bl: &bios);
1646 cell_release(pool, cell, bios: &bios);
1647
1648 while ((bio = bio_list_pop(bl: &bios)))
1649 retry_on_resume(bio);
1650}
1651
1652static void process_discard_cell_no_passdown(struct thin_c *tc,
1653 struct dm_bio_prison_cell *virt_cell)
1654{
1655 struct pool *pool = tc->pool;
1656 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1657
1658 /*
1659 * We don't need to lock the data blocks, since there's no
1660 * passdown. We only lock data blocks for allocation and breaking sharing.
1661 */
1662 m->tc = tc;
1663 m->virt_begin = virt_cell->key.block_begin;
1664 m->virt_end = virt_cell->key.block_end;
1665 m->cell = virt_cell;
1666 m->bio = virt_cell->holder;
1667
1668 if (!dm_deferred_set_add_work(ds: pool->all_io_ds, work: &m->list))
1669 pool->process_prepared_discard(m);
1670}
1671
1672static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1673 struct bio *bio)
1674{
1675 struct pool *pool = tc->pool;
1676
1677 int r;
1678 bool maybe_shared;
1679 struct dm_cell_key data_key;
1680 struct dm_bio_prison_cell *data_cell;
1681 struct dm_thin_new_mapping *m;
1682 dm_block_t virt_begin, virt_end, data_begin, data_end;
1683 dm_block_t len, next_boundary;
1684
1685 while (begin != end) {
1686 r = dm_thin_find_mapped_range(td: tc->td, begin, end, thin_begin: &virt_begin, thin_end: &virt_end,
1687 pool_begin: &data_begin, maybe_shared: &maybe_shared);
1688 if (r) {
1689 /*
1690 * Silently fail, letting any mappings we've
1691 * created complete.
1692 */
1693 break;
1694 }
1695
1696 data_end = data_begin + (virt_end - virt_begin);
1697
1698 /*
1699 * Make sure the data region obeys the bio prison restrictions.
1700 */
1701 while (data_begin < data_end) {
1702 r = ensure_next_mapping(pool);
1703 if (r)
1704 return; /* we did our best */
1705
1706 next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
1707 << BIO_PRISON_MAX_RANGE_SHIFT;
1708 len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
1709
1710 /* This key is certainly within range given the above splitting */
1711 (void) build_key(td: tc->td, ls: PHYSICAL, b: data_begin, e: data_begin + len, key: &data_key);
1712 if (bio_detain(pool: tc->pool, key: &data_key, NULL, cell_result: &data_cell)) {
1713 /* contention, we'll give up with this range */
1714 data_begin += len;
1715 continue;
1716 }
1717
1718 /*
1719 * IO may still be going to the destination block. We must
1720 * quiesce before we can do the removal.
1721 */
1722 m = get_next_mapping(pool);
1723 m->tc = tc;
1724 m->maybe_shared = maybe_shared;
1725 m->virt_begin = virt_begin;
1726 m->virt_end = virt_begin + len;
1727 m->data_block = data_begin;
1728 m->cell = data_cell;
1729 m->bio = bio;
1730
1731 /*
1732 * The parent bio must not complete before sub discard bios are
1733 * chained to it (see end_discard's bio_chain)!
1734 *
1735 * This per-mapping bi_remaining increment is paired with
1736 * the implicit decrement that occurs via bio_endio() in
1737 * end_discard().
1738 */
1739 bio_inc_remaining(bio);
1740 if (!dm_deferred_set_add_work(ds: pool->all_io_ds, work: &m->list))
1741 pool->process_prepared_discard(m);
1742
1743 virt_begin += len;
1744 data_begin += len;
1745 }
1746
1747 begin = virt_end;
1748 }
1749}
1750
1751static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1752{
1753 struct bio *bio = virt_cell->holder;
1754 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
1755
1756 /*
1757 * The virt_cell will only get freed once the origin bio completes.
1758 * This means it will remain locked while all the individual
1759 * passdown bios are in flight.
1760 */
1761 h->cell = virt_cell;
1762 break_up_discard_bio(tc, begin: virt_cell->key.block_begin, end: virt_cell->key.block_end, bio);
1763
1764 /*
1765 * We complete the bio now, knowing that the bi_remaining field
1766 * will prevent completion until the sub range discards have
1767 * completed.
1768 */
1769 bio_endio(bio);
1770}
1771
1772static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1773{
1774 dm_block_t begin, end;
1775 struct dm_cell_key virt_key;
1776 struct dm_bio_prison_cell *virt_cell;
1777
1778 get_bio_block_range(tc, bio, begin: &begin, end: &end);
1779 if (begin == end) {
1780 /*
1781 * The discard covers less than a block.
1782 */
1783 bio_endio(bio);
1784 return;
1785 }
1786
1787 if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
1788 DMERR_LIMIT("Discard doesn't respect bio prison limits");
1789 bio_endio(bio);
1790 return;
1791 }
1792
1793 if (bio_detain(pool: tc->pool, key: &virt_key, bio, cell_result: &virt_cell)) {
1794 /*
1795 * Potential starvation issue: We're relying on the
1796 * fs/application being well behaved, and not trying to
1797 * send IO to a region at the same time as discarding it.
1798 * If they do this persistently then it's possible this
1799 * cell will never be granted.
1800 */
1801 return;
1802 }
1803
1804 tc->pool->process_discard_cell(tc, virt_cell);
1805}
1806
1807static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1808 struct dm_cell_key *key,
1809 struct dm_thin_lookup_result *lookup_result,
1810 struct dm_bio_prison_cell *cell)
1811{
1812 int r;
1813 dm_block_t data_block;
1814 struct pool *pool = tc->pool;
1815
1816 r = alloc_data_block(tc, result: &data_block);
1817 switch (r) {
1818 case 0:
1819 schedule_internal_copy(tc, virt_block: block, data_origin: lookup_result->block,
1820 data_dest: data_block, cell, bio);
1821 break;
1822
1823 case -ENOSPC:
1824 retry_bios_on_resume(pool, cell);
1825 break;
1826
1827 default:
1828 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1829 __func__, r);
1830 cell_error(pool, cell);
1831 break;
1832 }
1833}
1834
1835static void __remap_and_issue_shared_cell(void *context,
1836 struct dm_bio_prison_cell *cell)
1837{
1838 struct remap_info *info = context;
1839 struct bio *bio;
1840
1841 while ((bio = bio_list_pop(bl: &cell->bios))) {
1842 if (bio_data_dir(bio) == WRITE || op_is_flush(op: bio->bi_opf) ||
1843 bio_op(bio) == REQ_OP_DISCARD)
1844 bio_list_add(bl: &info->defer_bios, bio);
1845 else {
1846 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
1847
1848 h->shared_read_entry = dm_deferred_entry_inc(ds: info->tc->pool->shared_read_ds);
1849 inc_all_io_entry(pool: info->tc->pool, bio);
1850 bio_list_add(bl: &info->issue_bios, bio);
1851 }
1852 }
1853}
1854
1855static void remap_and_issue_shared_cell(struct thin_c *tc,
1856 struct dm_bio_prison_cell *cell,
1857 dm_block_t block)
1858{
1859 struct bio *bio;
1860 struct remap_info info;
1861
1862 info.tc = tc;
1863 bio_list_init(bl: &info.defer_bios);
1864 bio_list_init(bl: &info.issue_bios);
1865
1866 cell_visit_release(pool: tc->pool, fn: __remap_and_issue_shared_cell,
1867 context: &info, cell);
1868
1869 while ((bio = bio_list_pop(bl: &info.defer_bios)))
1870 thin_defer_bio(tc, bio);
1871
1872 while ((bio = bio_list_pop(bl: &info.issue_bios)))
1873 remap_and_issue(tc, bio, block);
1874}
1875
1876static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1877 dm_block_t block,
1878 struct dm_thin_lookup_result *lookup_result,
1879 struct dm_bio_prison_cell *virt_cell)
1880{
1881 struct dm_bio_prison_cell *data_cell;
1882 struct pool *pool = tc->pool;
1883 struct dm_cell_key key;
1884
1885 /*
1886 * If cell is already occupied, then sharing is already in the process
1887 * of being broken so we have nothing further to do here.
1888 */
1889 build_data_key(td: tc->td, b: lookup_result->block, key: &key);
1890 if (bio_detain(pool, key: &key, bio, cell_result: &data_cell)) {
1891 cell_defer_no_holder(tc, cell: virt_cell);
1892 return;
1893 }
1894
1895 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1896 break_sharing(tc, bio, block, key: &key, lookup_result, cell: data_cell);
1897 cell_defer_no_holder(tc, cell: virt_cell);
1898 } else {
1899 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
1900
1901 h->shared_read_entry = dm_deferred_entry_inc(ds: pool->shared_read_ds);
1902 inc_all_io_entry(pool, bio);
1903 remap_and_issue(tc, bio, block: lookup_result->block);
1904
1905 remap_and_issue_shared_cell(tc, cell: data_cell, block: lookup_result->block);
1906 remap_and_issue_shared_cell(tc, cell: virt_cell, block: lookup_result->block);
1907 }
1908}
1909
1910static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1911 struct dm_bio_prison_cell *cell)
1912{
1913 int r;
1914 dm_block_t data_block;
1915 struct pool *pool = tc->pool;
1916
1917 /*
1918 * Remap empty bios (flushes) immediately, without provisioning.
1919 */
1920 if (!bio->bi_iter.bi_size) {
1921 inc_all_io_entry(pool, bio);
1922 cell_defer_no_holder(tc, cell);
1923
1924 remap_and_issue(tc, bio, block: 0);
1925 return;
1926 }
1927
1928 /*
1929 * Fill read bios with zeroes and complete them immediately.
1930 */
1931 if (bio_data_dir(bio) == READ) {
1932 zero_fill_bio(bio);
1933 cell_defer_no_holder(tc, cell);
1934 bio_endio(bio);
1935 return;
1936 }
1937
1938 r = alloc_data_block(tc, result: &data_block);
1939 switch (r) {
1940 case 0:
1941 if (tc->origin_dev)
1942 schedule_external_copy(tc, virt_block: block, data_dest: data_block, cell, bio);
1943 else
1944 schedule_zero(tc, virt_block: block, data_block, cell, bio);
1945 break;
1946
1947 case -ENOSPC:
1948 retry_bios_on_resume(pool, cell);
1949 break;
1950
1951 default:
1952 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1953 __func__, r);
1954 cell_error(pool, cell);
1955 break;
1956 }
1957}
1958
1959static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1960{
1961 int r;
1962 struct pool *pool = tc->pool;
1963 struct bio *bio = cell->holder;
1964 dm_block_t block = get_bio_block(tc, bio);
1965 struct dm_thin_lookup_result lookup_result;
1966
1967 if (tc->requeue_mode) {
1968 cell_requeue(pool, cell);
1969 return;
1970 }
1971
1972 r = dm_thin_find_block(td: tc->td, block, can_issue_io: 1, result: &lookup_result);
1973 switch (r) {
1974 case 0:
1975 if (lookup_result.shared)
1976 process_shared_bio(tc, bio, block, lookup_result: &lookup_result, virt_cell: cell);
1977 else {
1978 inc_all_io_entry(pool, bio);
1979 remap_and_issue(tc, bio, block: lookup_result.block);
1980 inc_remap_and_issue_cell(tc, cell, block: lookup_result.block);
1981 }
1982 break;
1983
1984 case -ENODATA:
1985 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1986 inc_all_io_entry(pool, bio);
1987 cell_defer_no_holder(tc, cell);
1988
1989 if (bio_end_sector(bio) <= tc->origin_size)
1990 remap_to_origin_and_issue(tc, bio);
1991
1992 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1993 zero_fill_bio(bio);
1994 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1995 remap_to_origin_and_issue(tc, bio);
1996
1997 } else {
1998 zero_fill_bio(bio);
1999 bio_endio(bio);
2000 }
2001 } else
2002 provision_block(tc, bio, block, cell);
2003 break;
2004
2005 default:
2006 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2007 __func__, r);
2008 cell_defer_no_holder(tc, cell);
2009 bio_io_error(bio);
2010 break;
2011 }
2012}
2013
2014static void process_bio(struct thin_c *tc, struct bio *bio)
2015{
2016 struct pool *pool = tc->pool;
2017 dm_block_t block = get_bio_block(tc, bio);
2018 struct dm_bio_prison_cell *cell;
2019 struct dm_cell_key key;
2020
2021 /*
2022 * If cell is already occupied, then the block is already
2023 * being provisioned so we have nothing further to do here.
2024 */
2025 build_virtual_key(td: tc->td, b: block, key: &key);
2026 if (bio_detain(pool, key: &key, bio, cell_result: &cell))
2027 return;
2028
2029 process_cell(tc, cell);
2030}
2031
2032static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
2033 struct dm_bio_prison_cell *cell)
2034{
2035 int r;
2036 int rw = bio_data_dir(bio);
2037 dm_block_t block = get_bio_block(tc, bio);
2038 struct dm_thin_lookup_result lookup_result;
2039
2040 r = dm_thin_find_block(td: tc->td, block, can_issue_io: 1, result: &lookup_result);
2041 switch (r) {
2042 case 0:
2043 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
2044 handle_unserviceable_bio(pool: tc->pool, bio);
2045 if (cell)
2046 cell_defer_no_holder(tc, cell);
2047 } else {
2048 inc_all_io_entry(pool: tc->pool, bio);
2049 remap_and_issue(tc, bio, block: lookup_result.block);
2050 if (cell)
2051 inc_remap_and_issue_cell(tc, cell, block: lookup_result.block);
2052 }
2053 break;
2054
2055 case -ENODATA:
2056 if (cell)
2057 cell_defer_no_holder(tc, cell);
2058 if (rw != READ) {
2059 handle_unserviceable_bio(pool: tc->pool, bio);
2060 break;
2061 }
2062
2063 if (tc->origin_dev) {
2064 inc_all_io_entry(pool: tc->pool, bio);
2065 remap_to_origin_and_issue(tc, bio);
2066 break;
2067 }
2068
2069 zero_fill_bio(bio);
2070 bio_endio(bio);
2071 break;
2072
2073 default:
2074 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2075 __func__, r);
2076 if (cell)
2077 cell_defer_no_holder(tc, cell);
2078 bio_io_error(bio);
2079 break;
2080 }
2081}
2082
2083static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2084{
2085 __process_bio_read_only(tc, bio, NULL);
2086}
2087
2088static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2089{
2090 __process_bio_read_only(tc, bio: cell->holder, cell);
2091}
2092
2093static void process_bio_success(struct thin_c *tc, struct bio *bio)
2094{
2095 bio_endio(bio);
2096}
2097
2098static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2099{
2100 bio_io_error(bio);
2101}
2102
2103static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2104{
2105 cell_success(pool: tc->pool, cell);
2106}
2107
2108static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2109{
2110 cell_error(pool: tc->pool, cell);
2111}
2112
2113/*
2114 * FIXME: should we also commit due to size of transaction, measured in
2115 * metadata blocks?
2116 */
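/*
 * Returns non-zero once jiffies has left the window
 * [last_commit_jiffies, last_commit_jiffies + COMMIT_PERIOD], i.e.
 * roughly once a commit period has passed since the last commit
 * (time_in_range() uses the wrap-safe time_*() macros).
 */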
2117static int need_commit_due_to_time(struct pool *pool)
2118{
2119 return !time_in_range(jiffies, pool->last_commit_jiffies,
2120 pool->last_commit_jiffies + COMMIT_PERIOD);
2121}
2122
2123#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2124#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2125
2126static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2127{
2128 struct rb_node **rbp, *parent;
2129 struct dm_thin_endio_hook *pbd;
2130 sector_t bi_sector = bio->bi_iter.bi_sector;
2131
2132 rbp = &tc->sort_bio_list.rb_node;
2133 parent = NULL;
2134 while (*rbp) {
2135 parent = *rbp;
2136 pbd = thin_pbd(parent);
2137
2138 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2139 rbp = &(*rbp)->rb_left;
2140 else
2141 rbp = &(*rbp)->rb_right;
2142 }
2143
2144 pbd = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
2145 rb_link_node(node: &pbd->rb_node, parent, rb_link: rbp);
2146 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2147}
2148
2149static void __extract_sorted_bios(struct thin_c *tc)
2150{
2151 struct rb_node *node;
2152 struct dm_thin_endio_hook *pbd;
2153 struct bio *bio;
2154
2155 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2156 pbd = thin_pbd(node);
2157 bio = thin_bio(pbd);
2158
2159 bio_list_add(bl: &tc->deferred_bio_list, bio);
2160 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2161 }
2162
2163 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2164}
2165
2166static void __sort_thin_deferred_bios(struct thin_c *tc)
2167{
2168 struct bio *bio;
2169 struct bio_list bios;
2170
2171 bio_list_init(bl: &bios);
2172 bio_list_merge(bl: &bios, bl2: &tc->deferred_bio_list);
2173 bio_list_init(bl: &tc->deferred_bio_list);
2174
2175 /* Sort deferred_bio_list using rb-tree */
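	/*
	 * Issuing in ascending sector order gives the data device a
	 * better chance of merging adjacent IO.
	 */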
2176 while ((bio = bio_list_pop(bl: &bios)))
2177 __thin_bio_rb_add(tc, bio);
2178
2179 /*
2180 * Transfer the sorted bios in sort_bio_list back to
2181 * deferred_bio_list to allow lockless submission of
2182 * all bios.
2183 */
2184 __extract_sorted_bios(tc);
2185}
2186
2187static void process_thin_deferred_bios(struct thin_c *tc)
2188{
2189 struct pool *pool = tc->pool;
2190 struct bio *bio;
2191 struct bio_list bios;
2192 struct blk_plug plug;
2193 unsigned int count = 0;
2194
2195 if (tc->requeue_mode) {
2196 error_thin_bio_list(tc, master: &tc->deferred_bio_list,
2197 BLK_STS_DM_REQUEUE);
2198 return;
2199 }
2200
2201 bio_list_init(bl: &bios);
2202
2203 spin_lock_irq(lock: &tc->lock);
2204
2205 if (bio_list_empty(bl: &tc->deferred_bio_list)) {
2206 spin_unlock_irq(lock: &tc->lock);
2207 return;
2208 }
2209
2210 __sort_thin_deferred_bios(tc);
2211
2212 bio_list_merge(bl: &bios, bl2: &tc->deferred_bio_list);
2213 bio_list_init(bl: &tc->deferred_bio_list);
2214
2215 spin_unlock_irq(lock: &tc->lock);
2216
2217 blk_start_plug(&plug);
2218 while ((bio = bio_list_pop(bl: &bios))) {
2219 /*
2220 * If we've got no free new_mapping structs, and processing
2221 * this bio might require one, we pause until there are some
2222 * prepared mappings to process.
2223 */
2224 if (ensure_next_mapping(pool)) {
2225 spin_lock_irq(lock: &tc->lock);
2226 bio_list_add(bl: &tc->deferred_bio_list, bio);
2227 bio_list_merge(bl: &tc->deferred_bio_list, bl2: &bios);
2228 spin_unlock_irq(lock: &tc->lock);
2229 break;
2230 }
2231
2232 if (bio_op(bio) == REQ_OP_DISCARD)
2233 pool->process_discard(tc, bio);
2234 else
2235 pool->process_bio(tc, bio);
2236
2237 if ((count++ & 127) == 0) {
2238 throttle_work_update(t: &pool->throttle);
2239 dm_pool_issue_prefetches(pmd: pool->pmd);
2240 }
2241 cond_resched();
2242 }
2243 blk_finish_plug(&plug);
2244}
2245
2246static int cmp_cells(const void *lhs, const void *rhs)
2247{
2248 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2249 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2250
2251 BUG_ON(!lhs_cell->holder);
2252 BUG_ON(!rhs_cell->holder);
2253
2254 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2255 return -1;
2256
2257 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2258 return 1;
2259
2260 return 0;
2261}
2262
2263static unsigned int sort_cells(struct pool *pool, struct list_head *cells)
2264{
2265 unsigned int count = 0;
2266 struct dm_bio_prison_cell *cell, *tmp;
2267
2268 list_for_each_entry_safe(cell, tmp, cells, user_list) {
2269 if (count >= CELL_SORT_ARRAY_SIZE)
2270 break;
2271
2272 pool->cell_sort_array[count++] = cell;
2273 list_del(entry: &cell->user_list);
2274 }
2275
2276 sort(base: pool->cell_sort_array, num: count, size: sizeof(cell), cmp_func: cmp_cells, NULL);
2277
2278 return count;
2279}
2280
2281static void process_thin_deferred_cells(struct thin_c *tc)
2282{
2283 struct pool *pool = tc->pool;
2284 struct list_head cells;
2285 struct dm_bio_prison_cell *cell;
2286 unsigned int i, j, count;
2287
2288 INIT_LIST_HEAD(list: &cells);
2289
2290 spin_lock_irq(lock: &tc->lock);
2291 list_splice_init(list: &tc->deferred_cells, head: &cells);
2292 spin_unlock_irq(lock: &tc->lock);
2293
2294 if (list_empty(head: &cells))
2295 return;
2296
2297 do {
2298 count = sort_cells(pool: tc->pool, cells: &cells);
2299
2300 for (i = 0; i < count; i++) {
2301 cell = pool->cell_sort_array[i];
2302 BUG_ON(!cell->holder);
2303
2304 /*
2305 * If we've got no free new_mapping structs, and processing
2306 * this bio might require one, we pause until there are some
2307 * prepared mappings to process.
2308 */
2309 if (ensure_next_mapping(pool)) {
2310 for (j = i; j < count; j++)
2311 list_add(new: &pool->cell_sort_array[j]->user_list, head: &cells);
2312
2313 spin_lock_irq(lock: &tc->lock);
2314 list_splice(list: &cells, head: &tc->deferred_cells);
2315 spin_unlock_irq(lock: &tc->lock);
2316 return;
2317 }
2318
2319 if (bio_op(bio: cell->holder) == REQ_OP_DISCARD)
2320 pool->process_discard_cell(tc, cell);
2321 else
2322 pool->process_cell(tc, cell);
2323 }
2324 cond_resched();
2325 } while (!list_empty(head: &cells));
2326}
2327
2328static void thin_get(struct thin_c *tc);
2329static void thin_put(struct thin_c *tc);
2330
2331/*
2332 * We can't hold rcu_read_lock() around code that can block. So we
2333 * find a thin with the rcu lock held; bump a refcount; then drop
2334 * the lock.
2335 */
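/*
 * Typical usage (see process_deferred_bios() below):
 *
 *	tc = get_first_thin(pool);
 *	while (tc) {
 *		... blocking work on tc ...
 *		tc = get_next_thin(pool, tc);
 *	}
 *
 * get_next_thin() drops the reference taken on the previous thin, so a
 * caller only needs an explicit thin_put() if it stops iterating early.
 */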
2336static struct thin_c *get_first_thin(struct pool *pool)
2337{
2338 struct thin_c *tc = NULL;
2339
2340 rcu_read_lock();
2341 if (!list_empty(head: &pool->active_thins)) {
2342 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2343 thin_get(tc);
2344 }
2345 rcu_read_unlock();
2346
2347 return tc;
2348}
2349
2350static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2351{
2352 struct thin_c *old_tc = tc;
2353
2354 rcu_read_lock();
2355 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2356 thin_get(tc);
2357 thin_put(tc: old_tc);
2358 rcu_read_unlock();
2359 return tc;
2360 }
2361 thin_put(tc: old_tc);
2362 rcu_read_unlock();
2363
2364 return NULL;
2365}
2366
2367static void process_deferred_bios(struct pool *pool)
2368{
2369 struct bio *bio;
2370 struct bio_list bios, bio_completions;
2371 struct thin_c *tc;
2372
2373 tc = get_first_thin(pool);
2374 while (tc) {
2375 process_thin_deferred_cells(tc);
2376 process_thin_deferred_bios(tc);
2377 tc = get_next_thin(pool, tc);
2378 }
2379
2380 /*
2381 * If there are any deferred flush bios, we must commit the metadata
2382 * before issuing them or signaling their completion.
2383 */
2384 bio_list_init(bl: &bios);
2385 bio_list_init(bl: &bio_completions);
2386
2387 spin_lock_irq(lock: &pool->lock);
2388 bio_list_merge(bl: &bios, bl2: &pool->deferred_flush_bios);
2389 bio_list_init(bl: &pool->deferred_flush_bios);
2390
2391 bio_list_merge(bl: &bio_completions, bl2: &pool->deferred_flush_completions);
2392 bio_list_init(bl: &pool->deferred_flush_completions);
2393 spin_unlock_irq(lock: &pool->lock);
2394
2395 if (bio_list_empty(bl: &bios) && bio_list_empty(bl: &bio_completions) &&
2396 !(dm_pool_changed_this_transaction(pmd: pool->pmd) && need_commit_due_to_time(pool)))
2397 return;
2398
2399 if (commit(pool)) {
2400 bio_list_merge(bl: &bios, bl2: &bio_completions);
2401
2402 while ((bio = bio_list_pop(bl: &bios)))
2403 bio_io_error(bio);
2404 return;
2405 }
2406 pool->last_commit_jiffies = jiffies;
2407
2408 while ((bio = bio_list_pop(bl: &bio_completions)))
2409 bio_endio(bio);
2410
2411 while ((bio = bio_list_pop(bl: &bios))) {
2412 /*
2413 * The data device was flushed as part of metadata commit,
2414 * so complete redundant flushes immediately.
2415 */
2416 if (bio->bi_opf & REQ_PREFLUSH)
2417 bio_endio(bio);
2418 else
2419 dm_submit_bio_remap(clone: bio, NULL);
2420 }
2421}
2422
2423static void do_worker(struct work_struct *ws)
2424{
2425 struct pool *pool = container_of(ws, struct pool, worker);
2426
2427 throttle_work_start(t: &pool->throttle);
2428 dm_pool_issue_prefetches(pmd: pool->pmd);
2429 throttle_work_update(t: &pool->throttle);
2430 process_prepared(pool, head: &pool->prepared_mappings, fn: &pool->process_prepared_mapping);
2431 throttle_work_update(t: &pool->throttle);
2432 process_prepared(pool, head: &pool->prepared_discards, fn: &pool->process_prepared_discard);
2433 throttle_work_update(t: &pool->throttle);
2434 process_prepared(pool, head: &pool->prepared_discards_pt2, fn: &pool->process_prepared_discard_pt2);
2435 throttle_work_update(t: &pool->throttle);
2436 process_deferred_bios(pool);
2437 throttle_work_complete(t: &pool->throttle);
2438}
2439
2440/*
2441 * We want to commit periodically so that not too much
2442 * unwritten data builds up.
2443 */
2444static void do_waker(struct work_struct *ws)
2445{
2446 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2447
2448 wake_worker(pool);
2449 queue_delayed_work(wq: pool->wq, dwork: &pool->waker, COMMIT_PERIOD);
2450}
2451
2452/*
2453 * We're holding onto IO to allow userland time to react. After the
2454 * timeout either the pool will have been resized (and thus back in
2455 * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
2456 */
2457static void do_no_space_timeout(struct work_struct *ws)
2458{
2459 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2460 no_space_timeout);
2461
2462 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2463 pool->pf.error_if_no_space = true;
2464 notify_of_pool_mode_change(pool);
2465 error_retry_list_with_code(pool, BLK_STS_NOSPC);
2466 }
2467}
2468
2469/*----------------------------------------------------------------*/
2470
2471struct pool_work {
2472 struct work_struct worker;
2473 struct completion complete;
2474};
2475
2476static struct pool_work *to_pool_work(struct work_struct *ws)
2477{
2478 return container_of(ws, struct pool_work, worker);
2479}
2480
2481static void pool_work_complete(struct pool_work *pw)
2482{
2483 complete(&pw->complete);
2484}
2485
2486static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2487 void (*fn)(struct work_struct *))
2488{
2489 INIT_WORK_ONSTACK(&pw->worker, fn);
2490 init_completion(x: &pw->complete);
2491 queue_work(wq: pool->wq, work: &pw->worker);
2492 wait_for_completion(&pw->complete);
2493}
2494
2495/*----------------------------------------------------------------*/
2496
2497struct noflush_work {
2498 struct pool_work pw;
2499 struct thin_c *tc;
2500};
2501
2502static struct noflush_work *to_noflush(struct work_struct *ws)
2503{
2504 return container_of(to_pool_work(ws), struct noflush_work, pw);
2505}
2506
2507static void do_noflush_start(struct work_struct *ws)
2508{
2509 struct noflush_work *w = to_noflush(ws);
2510
2511 w->tc->requeue_mode = true;
2512 requeue_io(tc: w->tc);
2513 pool_work_complete(pw: &w->pw);
2514}
2515
2516static void do_noflush_stop(struct work_struct *ws)
2517{
2518 struct noflush_work *w = to_noflush(ws);
2519
2520 w->tc->requeue_mode = false;
2521 pool_work_complete(pw: &w->pw);
2522}
2523
2524static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2525{
2526 struct noflush_work w;
2527
2528 w.tc = tc;
2529 pool_work_wait(pw: &w.pw, pool: tc->pool, fn);
2530}
2531
2532/*----------------------------------------------------------------*/
2533
2534static void set_discard_callbacks(struct pool *pool)
2535{
2536 struct pool_c *pt = pool->ti->private;
2537
2538 if (pt->adjusted_pf.discard_passdown) {
2539 pool->process_discard_cell = process_discard_cell_passdown;
2540 pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2541 pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2542 } else {
2543 pool->process_discard_cell = process_discard_cell_no_passdown;
2544 pool->process_prepared_discard = process_prepared_discard_no_passdown;
2545 }
2546}
2547
2548static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2549{
2550 struct pool_c *pt = pool->ti->private;
2551 bool needs_check = dm_pool_metadata_needs_check(pmd: pool->pmd);
2552 enum pool_mode old_mode = get_pool_mode(pool);
2553 unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2554
2555 /*
2556 * Never allow the pool to transition to PM_WRITE mode if user
2557 * intervention is required to verify metadata and data consistency.
2558 */
2559 if (new_mode == PM_WRITE && needs_check) {
2560 DMERR("%s: unable to switch pool to write mode until repaired.",
2561 dm_device_name(pool->pool_md));
2562 if (old_mode != new_mode)
2563 new_mode = old_mode;
2564 else
2565 new_mode = PM_READ_ONLY;
2566 }
2567 /*
2568 * If we were in PM_FAIL mode, rollback of metadata failed. We're
2569 * not going to recover without a thin_repair. So we never let the
2570 * pool move out of the old mode.
2571 */
2572 if (old_mode == PM_FAIL)
2573 new_mode = old_mode;
2574
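	/*
	 * Summary of the bio and discard handlers installed by the switch
	 * below (process_bio / process_discard):
	 *
	 *   PM_FAIL:                  process_bio_fail / process_bio_fail
	 *   PM_OUT_OF_METADATA_SPACE,
	 *   PM_READ_ONLY:             process_bio_read_only / process_bio_success
	 *   PM_OUT_OF_DATA_SPACE:     process_bio_read_only / process_discard_bio
	 *   PM_WRITE:                 process_bio / process_discard_bio
	 */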
2575 switch (new_mode) {
2576 case PM_FAIL:
2577 dm_pool_metadata_read_only(pmd: pool->pmd);
2578 pool->process_bio = process_bio_fail;
2579 pool->process_discard = process_bio_fail;
2580 pool->process_cell = process_cell_fail;
2581 pool->process_discard_cell = process_cell_fail;
2582 pool->process_prepared_mapping = process_prepared_mapping_fail;
2583 pool->process_prepared_discard = process_prepared_discard_fail;
2584
2585 error_retry_list(pool);
2586 break;
2587
2588 case PM_OUT_OF_METADATA_SPACE:
2589 case PM_READ_ONLY:
2590 dm_pool_metadata_read_only(pmd: pool->pmd);
2591 pool->process_bio = process_bio_read_only;
2592 pool->process_discard = process_bio_success;
2593 pool->process_cell = process_cell_read_only;
2594 pool->process_discard_cell = process_cell_success;
2595 pool->process_prepared_mapping = process_prepared_mapping_fail;
2596 pool->process_prepared_discard = process_prepared_discard_success;
2597
2598 error_retry_list(pool);
2599 break;
2600
2601 case PM_OUT_OF_DATA_SPACE:
2602 /*
2603 * Ideally we'd never hit this state; the low water mark
2604 * would trigger userland to extend the pool before we
2605 * completely run out of data space. However, many small
2606 * IOs to unprovisioned space can consume data space at an
2607 * alarming rate. Adjust your low water mark if you're
2608 * frequently seeing this mode.
2609 */
2610 pool->out_of_data_space = true;
2611 pool->process_bio = process_bio_read_only;
2612 pool->process_discard = process_discard_bio;
2613 pool->process_cell = process_cell_read_only;
2614 pool->process_prepared_mapping = process_prepared_mapping;
2615 set_discard_callbacks(pool);
2616
2617 if (!pool->pf.error_if_no_space && no_space_timeout)
2618 queue_delayed_work(wq: pool->wq, dwork: &pool->no_space_timeout, delay: no_space_timeout);
2619 break;
2620
2621 case PM_WRITE:
2622 if (old_mode == PM_OUT_OF_DATA_SPACE)
2623 cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
2624 pool->out_of_data_space = false;
2625 pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2626 dm_pool_metadata_read_write(pmd: pool->pmd);
2627 pool->process_bio = process_bio;
2628 pool->process_discard = process_discard_bio;
2629 pool->process_cell = process_cell;
2630 pool->process_prepared_mapping = process_prepared_mapping;
2631 set_discard_callbacks(pool);
2632 break;
2633 }
2634
2635 pool->pf.mode = new_mode;
2636 /*
2637 * The pool mode may have changed, sync it so bind_control_target()
2638 * doesn't cause an unexpected mode transition on resume.
2639 */
2640 pt->adjusted_pf.mode = new_mode;
2641
2642 if (old_mode != new_mode)
2643 notify_of_pool_mode_change(pool);
2644}
2645
2646static void abort_transaction(struct pool *pool)
2647{
2648 const char *dev_name = dm_device_name(md: pool->pool_md);
2649
2650 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2651 if (dm_pool_abort_metadata(pmd: pool->pmd)) {
2652 DMERR("%s: failed to abort metadata transaction", dev_name);
2653 set_pool_mode(pool, new_mode: PM_FAIL);
2654 }
2655
2656 if (dm_pool_metadata_set_needs_check(pmd: pool->pmd)) {
2657 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2658 set_pool_mode(pool, new_mode: PM_FAIL);
2659 }
2660}
2661
2662static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2663{
2664 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2665 dm_device_name(pool->pool_md), op, r);
2666
2667 abort_transaction(pool);
2668 set_pool_mode(pool, new_mode: PM_READ_ONLY);
2669}
2670
2671/*----------------------------------------------------------------*/
2672
2673/*
2674 * Mapping functions.
2675 */
2676
2677/*
2678 * Called only while mapping a thin bio to hand it over to the workqueue.
2679 */
2680static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2681{
2682 struct pool *pool = tc->pool;
2683
2684 spin_lock_irq(lock: &tc->lock);
2685 bio_list_add(bl: &tc->deferred_bio_list, bio);
2686 spin_unlock_irq(lock: &tc->lock);
2687
2688 wake_worker(pool);
2689}
2690
2691static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2692{
2693 struct pool *pool = tc->pool;
2694
2695 throttle_lock(t: &pool->throttle);
2696 thin_defer_bio(tc, bio);
2697 throttle_unlock(t: &pool->throttle);
2698}
2699
2700static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2701{
2702 struct pool *pool = tc->pool;
2703
2704 throttle_lock(t: &pool->throttle);
2705 spin_lock_irq(lock: &tc->lock);
2706 list_add_tail(new: &cell->user_list, head: &tc->deferred_cells);
2707 spin_unlock_irq(lock: &tc->lock);
2708 throttle_unlock(t: &pool->throttle);
2709
2710 wake_worker(pool);
2711}
2712
2713static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2714{
2715 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
2716
2717 h->tc = tc;
2718 h->shared_read_entry = NULL;
2719 h->all_io_entry = NULL;
2720 h->overwrite_mapping = NULL;
2721 h->cell = NULL;
2722}
2723
2724/*
2725 * Non-blocking function called from the thin target's map function.
2726 */
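/*
 * Returns DM_MAPIO_REMAPPED when the bio can be remapped straight away,
 * and DM_MAPIO_SUBMITTED when it has been deferred to the worker thread
 * or completed/errored here.
 */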
2727static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2728{
2729 int r;
2730 struct thin_c *tc = ti->private;
2731 dm_block_t block = get_bio_block(tc, bio);
2732 struct dm_thin_device *td = tc->td;
2733 struct dm_thin_lookup_result result;
2734 struct dm_bio_prison_cell *virt_cell, *data_cell;
2735 struct dm_cell_key key;
2736
2737 thin_hook_bio(tc, bio);
2738
2739 if (tc->requeue_mode) {
2740 bio->bi_status = BLK_STS_DM_REQUEUE;
2741 bio_endio(bio);
2742 return DM_MAPIO_SUBMITTED;
2743 }
2744
2745 if (get_pool_mode(pool: tc->pool) == PM_FAIL) {
2746 bio_io_error(bio);
2747 return DM_MAPIO_SUBMITTED;
2748 }
2749
2750 if (op_is_flush(op: bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
2751 thin_defer_bio_with_throttle(tc, bio);
2752 return DM_MAPIO_SUBMITTED;
2753 }
2754
2755 /*
2756 * We must hold the virtual cell before doing the lookup, otherwise
2757 * there's a race with discard.
2758 */
2759 build_virtual_key(td: tc->td, b: block, key: &key);
2760 if (bio_detain(pool: tc->pool, key: &key, bio, cell_result: &virt_cell))
2761 return DM_MAPIO_SUBMITTED;
2762
2763 r = dm_thin_find_block(td, block, can_issue_io: 0, result: &result);
2764
2765 /*
2766 * Note that we defer readahead too.
2767 */
2768 switch (r) {
2769 case 0:
2770 if (unlikely(result.shared)) {
2771 /*
2772 * We have a race condition here between the
2773 * result.shared value returned by the lookup and
2774 * snapshot creation, which may cause new
2775 * sharing.
2776 *
2777 * To avoid this always quiesce the origin before
2778 * taking the snap. You want to do this anyway to
2779 * ensure a consistent application view
2780 * (i.e. lockfs).
2781 *
2782 * More distant ancestors are irrelevant. The
2783 * shared flag will be set in their case.
2784 */
2785 thin_defer_cell(tc, cell: virt_cell);
2786 return DM_MAPIO_SUBMITTED;
2787 }
2788
2789 build_data_key(td: tc->td, b: result.block, key: &key);
2790 if (bio_detain(pool: tc->pool, key: &key, bio, cell_result: &data_cell)) {
2791 cell_defer_no_holder(tc, cell: virt_cell);
2792 return DM_MAPIO_SUBMITTED;
2793 }
2794
2795 inc_all_io_entry(pool: tc->pool, bio);
2796 cell_defer_no_holder(tc, cell: data_cell);
2797 cell_defer_no_holder(tc, cell: virt_cell);
2798
2799 remap(tc, bio, block: result.block);
2800 return DM_MAPIO_REMAPPED;
2801
2802 case -ENODATA:
2803 case -EWOULDBLOCK:
2804 thin_defer_cell(tc, cell: virt_cell);
2805 return DM_MAPIO_SUBMITTED;
2806
2807 default:
2808 /*
2809 * Must always call bio_io_error on failure.
2810 * dm_thin_find_block can fail with -EINVAL if the
2811 * pool is switched to fail-io mode.
2812 */
2813 bio_io_error(bio);
2814 cell_defer_no_holder(tc, cell: virt_cell);
2815 return DM_MAPIO_SUBMITTED;
2816 }
2817}
2818
2819static void requeue_bios(struct pool *pool)
2820{
2821 struct thin_c *tc;
2822
2823 rcu_read_lock();
2824 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2825 spin_lock_irq(lock: &tc->lock);
2826 bio_list_merge(bl: &tc->deferred_bio_list, bl2: &tc->retry_on_resume_list);
2827 bio_list_init(bl: &tc->retry_on_resume_list);
2828 spin_unlock_irq(lock: &tc->lock);
2829 }
2830 rcu_read_unlock();
2831}
2832
2833/*
2834 *--------------------------------------------------------------
2835 * Binding of control targets to a pool object
2836 *--------------------------------------------------------------
2837 */
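/*
 * is_factor() returns true when n divides block_size exactly
 * (sector_div() yields the remainder).
 */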
2838static bool is_factor(sector_t block_size, uint32_t n)
2839{
2840 return !sector_div(block_size, n);
2841}
2842
2843/*
2844 * If discard_passdown was enabled verify that the data device
2845 * supports discards. Disable discard_passdown if not.
2846 */
2847static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
2848{
2849 struct pool *pool = pt->pool;
2850 struct block_device *data_bdev = pt->data_dev->bdev;
2851 struct queue_limits *data_limits = &bdev_get_queue(bdev: data_bdev)->limits;
2852 const char *reason = NULL;
2853
2854 if (!pt->adjusted_pf.discard_passdown)
2855 return;
2856
2857 if (!bdev_max_discard_sectors(bdev: pt->data_dev->bdev))
2858 reason = "discard unsupported";
2859
2860 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2861 reason = "max discard sectors smaller than a block";
2862
2863 if (reason) {
2864 DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
2865 pt->adjusted_pf.discard_passdown = false;
2866 }
2867}
2868
2869static int bind_control_target(struct pool *pool, struct dm_target *ti)
2870{
2871 struct pool_c *pt = ti->private;
2872
2873 /*
2874 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2875 */
2876 enum pool_mode old_mode = get_pool_mode(pool);
2877 enum pool_mode new_mode = pt->adjusted_pf.mode;
2878
2879 /*
2880 * Don't change the pool's mode until set_pool_mode() below.
2881 * Otherwise the pool's process_* function pointers may
2882 * not match the desired pool mode.
2883 */
2884 pt->adjusted_pf.mode = old_mode;
2885
2886 pool->ti = ti;
2887 pool->pf = pt->adjusted_pf;
2888 pool->low_water_blocks = pt->low_water_blocks;
2889
2890 set_pool_mode(pool, new_mode);
2891
2892 return 0;
2893}
2894
2895static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2896{
2897 if (pool->ti == ti)
2898 pool->ti = NULL;
2899}
2900
2901/*
2902 *--------------------------------------------------------------
2903 * Pool creation
2904 *--------------------------------------------------------------
2905 */
2906/* Initialize pool features. */
2907static void pool_features_init(struct pool_features *pf)
2908{
2909 pf->mode = PM_WRITE;
2910 pf->zero_new_blocks = true;
2911 pf->discard_enabled = true;
2912 pf->discard_passdown = true;
2913 pf->error_if_no_space = false;
2914}
2915
2916static void __pool_destroy(struct pool *pool)
2917{
2918 __pool_table_remove(pool);
2919
2920 vfree(addr: pool->cell_sort_array);
2921 if (dm_pool_metadata_close(pmd: pool->pmd) < 0)
2922 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2923
2924 dm_bio_prison_destroy(prison: pool->prison);
2925 dm_kcopyd_client_destroy(kc: pool->copier);
2926
2927 cancel_delayed_work_sync(dwork: &pool->waker);
2928 cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
2929 if (pool->wq)
2930 destroy_workqueue(wq: pool->wq);
2931
2932 if (pool->next_mapping)
2933 mempool_free(element: pool->next_mapping, pool: &pool->mapping_pool);
2934 mempool_exit(pool: &pool->mapping_pool);
2935 dm_deferred_set_destroy(ds: pool->shared_read_ds);
2936 dm_deferred_set_destroy(ds: pool->all_io_ds);
2937 kfree(objp: pool);
2938}
2939
2940static struct kmem_cache *_new_mapping_cache;
2941
2942static struct pool *pool_create(struct mapped_device *pool_md,
2943 struct block_device *metadata_dev,
2944 struct block_device *data_dev,
2945 unsigned long block_size,
2946 int read_only, char **error)
2947{
2948 int r;
2949 void *err_p;
2950 struct pool *pool;
2951 struct dm_pool_metadata *pmd;
2952 bool format_device = read_only ? false : true;
2953
2954 pmd = dm_pool_metadata_open(bdev: metadata_dev, data_block_size: block_size, format_device);
2955 if (IS_ERR(ptr: pmd)) {
2956 *error = "Error creating metadata object";
2957 return (struct pool *)pmd;
2958 }
2959
2960 pool = kzalloc(size: sizeof(*pool), GFP_KERNEL);
2961 if (!pool) {
2962 *error = "Error allocating memory for pool";
2963 err_p = ERR_PTR(error: -ENOMEM);
2964 goto bad_pool;
2965 }
2966
2967 pool->pmd = pmd;
2968 pool->sectors_per_block = block_size;
2969 if (block_size & (block_size - 1))
2970 pool->sectors_per_block_shift = -1;
2971 else
2972 pool->sectors_per_block_shift = __ffs(block_size);
2973 pool->low_water_blocks = 0;
2974 pool_features_init(pf: &pool->pf);
2975 pool->prison = dm_bio_prison_create();
2976 if (!pool->prison) {
2977 *error = "Error creating pool's bio prison";
2978 err_p = ERR_PTR(error: -ENOMEM);
2979 goto bad_prison;
2980 }
2981
2982 pool->copier = dm_kcopyd_client_create(throttle: &dm_kcopyd_throttle);
2983 if (IS_ERR(ptr: pool->copier)) {
2984 r = PTR_ERR(ptr: pool->copier);
2985 *error = "Error creating pool's kcopyd client";
2986 err_p = ERR_PTR(error: r);
2987 goto bad_kcopyd_client;
2988 }
2989
	/*
	 * Create a single-threaded (ordered) workqueue that will service
	 * all devices that use this metadata.
	 */
2994 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2995 if (!pool->wq) {
2996 *error = "Error creating pool's workqueue";
2997 err_p = ERR_PTR(error: -ENOMEM);
2998 goto bad_wq;
2999 }
3000
3001 throttle_init(t: &pool->throttle);
3002 INIT_WORK(&pool->worker, do_worker);
3003 INIT_DELAYED_WORK(&pool->waker, do_waker);
3004 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
3005 spin_lock_init(&pool->lock);
3006 bio_list_init(bl: &pool->deferred_flush_bios);
3007 bio_list_init(bl: &pool->deferred_flush_completions);
3008 INIT_LIST_HEAD(list: &pool->prepared_mappings);
3009 INIT_LIST_HEAD(list: &pool->prepared_discards);
3010 INIT_LIST_HEAD(list: &pool->prepared_discards_pt2);
3011 INIT_LIST_HEAD(list: &pool->active_thins);
3012 pool->low_water_triggered = false;
3013 pool->suspended = true;
3014 pool->out_of_data_space = false;
3015
3016 pool->shared_read_ds = dm_deferred_set_create();
3017 if (!pool->shared_read_ds) {
3018 *error = "Error creating pool's shared read deferred set";
3019 err_p = ERR_PTR(error: -ENOMEM);
3020 goto bad_shared_read_ds;
3021 }
3022
3023 pool->all_io_ds = dm_deferred_set_create();
3024 if (!pool->all_io_ds) {
3025 *error = "Error creating pool's all io deferred set";
3026 err_p = ERR_PTR(error: -ENOMEM);
3027 goto bad_all_io_ds;
3028 }
3029
3030 pool->next_mapping = NULL;
3031 r = mempool_init_slab_pool(pool: &pool->mapping_pool, MAPPING_POOL_SIZE,
3032 kc: _new_mapping_cache);
3033 if (r) {
3034 *error = "Error creating pool's mapping mempool";
3035 err_p = ERR_PTR(error: r);
3036 goto bad_mapping_pool;
3037 }
3038
3039 pool->cell_sort_array =
3040 vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3041 sizeof(*pool->cell_sort_array)));
3042 if (!pool->cell_sort_array) {
3043 *error = "Error allocating cell sort array";
3044 err_p = ERR_PTR(error: -ENOMEM);
3045 goto bad_sort_array;
3046 }
3047
3048 pool->ref_count = 1;
3049 pool->last_commit_jiffies = jiffies;
3050 pool->pool_md = pool_md;
3051 pool->md_dev = metadata_dev;
3052 pool->data_dev = data_dev;
3053 __pool_table_insert(pool);
3054
3055 return pool;
3056
3057bad_sort_array:
3058 mempool_exit(pool: &pool->mapping_pool);
3059bad_mapping_pool:
3060 dm_deferred_set_destroy(ds: pool->all_io_ds);
3061bad_all_io_ds:
3062 dm_deferred_set_destroy(ds: pool->shared_read_ds);
3063bad_shared_read_ds:
3064 destroy_workqueue(wq: pool->wq);
3065bad_wq:
3066 dm_kcopyd_client_destroy(kc: pool->copier);
3067bad_kcopyd_client:
3068 dm_bio_prison_destroy(prison: pool->prison);
3069bad_prison:
3070 kfree(objp: pool);
3071bad_pool:
3072 if (dm_pool_metadata_close(pmd))
3073 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3074
3075 return err_p;
3076}
3077
3078static void __pool_inc(struct pool *pool)
3079{
3080 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3081 pool->ref_count++;
3082}
3083
3084static void __pool_dec(struct pool *pool)
3085{
3086 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3087 BUG_ON(!pool->ref_count);
3088 if (!--pool->ref_count)
3089 __pool_destroy(pool);
3090}
3091
3092static struct pool *__pool_find(struct mapped_device *pool_md,
3093 struct block_device *metadata_dev,
3094 struct block_device *data_dev,
3095 unsigned long block_size, int read_only,
3096 char **error, int *created)
3097{
3098 struct pool *pool = __pool_table_lookup_metadata_dev(md_dev: metadata_dev);
3099
3100 if (pool) {
3101 if (pool->pool_md != pool_md) {
3102 *error = "metadata device already in use by a pool";
3103 return ERR_PTR(error: -EBUSY);
3104 }
3105 if (pool->data_dev != data_dev) {
3106 *error = "data device already in use by a pool";
3107 return ERR_PTR(error: -EBUSY);
3108 }
3109 __pool_inc(pool);
3110
3111 } else {
3112 pool = __pool_table_lookup(md: pool_md);
3113 if (pool) {
3114 if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
3115 *error = "different pool cannot replace a pool";
3116 return ERR_PTR(error: -EINVAL);
3117 }
3118 __pool_inc(pool);
3119
3120 } else {
3121 pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
3122 *created = 1;
3123 }
3124 }
3125
3126 return pool;
3127}
3128
3129/*
3130 *--------------------------------------------------------------
3131 * Pool target methods
3132 *--------------------------------------------------------------
3133 */
3134static void pool_dtr(struct dm_target *ti)
3135{
3136 struct pool_c *pt = ti->private;
3137
3138 mutex_lock(&dm_thin_pool_table.mutex);
3139
3140 unbind_control_target(pool: pt->pool, ti);
3141 __pool_dec(pool: pt->pool);
3142 dm_put_device(ti, d: pt->metadata_dev);
3143 dm_put_device(ti, d: pt->data_dev);
3144 kfree(objp: pt);
3145
3146 mutex_unlock(lock: &dm_thin_pool_table.mutex);
3147}
3148
3149static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3150 struct dm_target *ti)
3151{
3152 int r;
3153 unsigned int argc;
3154 const char *arg_name;
3155
3156 static const struct dm_arg _args[] = {
3157 {0, 4, "Invalid number of pool feature arguments"},
3158 };
3159
3160 /*
3161 * No feature arguments supplied.
3162 */
3163 if (!as->argc)
3164 return 0;
3165
3166 r = dm_read_arg_group(arg: _args, arg_set: as, num_args: &argc, error: &ti->error);
3167 if (r)
3168 return -EINVAL;
3169
3170 while (argc && !r) {
3171 arg_name = dm_shift_arg(as);
3172 argc--;
3173
3174 if (!strcasecmp(s1: arg_name, s2: "skip_block_zeroing"))
3175 pf->zero_new_blocks = false;
3176
3177 else if (!strcasecmp(s1: arg_name, s2: "ignore_discard"))
3178 pf->discard_enabled = false;
3179
3180 else if (!strcasecmp(s1: arg_name, s2: "no_discard_passdown"))
3181 pf->discard_passdown = false;
3182
3183 else if (!strcasecmp(s1: arg_name, s2: "read_only"))
3184 pf->mode = PM_READ_ONLY;
3185
3186 else if (!strcasecmp(s1: arg_name, s2: "error_if_no_space"))
3187 pf->error_if_no_space = true;
3188
3189 else {
3190 ti->error = "Unrecognised pool feature requested";
3191 r = -EINVAL;
3192 break;
3193 }
3194 }
3195
3196 return r;
3197}
3198
3199static void metadata_low_callback(void *context)
3200{
3201 struct pool *pool = context;
3202
3203 DMWARN("%s: reached low water mark for metadata device: sending event.",
3204 dm_device_name(pool->pool_md));
3205
3206 dm_table_event(t: pool->ti->table);
3207}
3208
3209/*
3210 * We need to flush the data device **before** committing the metadata.
3211 *
3212 * This ensures that the data blocks of any newly inserted mappings are
3213 * properly written to non-volatile storage and won't be lost in case of a
3214 * crash.
3215 *
3216 * Failure to do so can result in data corruption in the case of internal or
3217 * external snapshots and in the case of newly provisioned blocks, when block
3218 * zeroing is enabled.
3219 */
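/*
 * This callback is registered on the pool metadata in pool_ctr() via
 * dm_pool_register_pre_commit_callback().
 */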
3220static int metadata_pre_commit_callback(void *context)
3221{
3222 struct pool *pool = context;
3223
3224 return blkdev_issue_flush(bdev: pool->data_dev);
3225}
3226
3227static sector_t get_dev_size(struct block_device *bdev)
3228{
3229 return bdev_nr_sectors(bdev);
3230}
3231
3232static void warn_if_metadata_device_too_big(struct block_device *bdev)
3233{
3234 sector_t metadata_dev_size = get_dev_size(bdev);
3235
3236 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3237 DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
3238 bdev, THIN_METADATA_MAX_SECTORS);
3239}
3240
3241static sector_t get_metadata_dev_size(struct block_device *bdev)
3242{
3243 sector_t metadata_dev_size = get_dev_size(bdev);
3244
3245 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3246 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3247
3248 return metadata_dev_size;
3249}
3250
3251static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3252{
3253 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3254
3255 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3256
3257 return metadata_dev_size;
3258}
3259
3260/*
3261 * When a metadata threshold is crossed a dm event is triggered, and
3262 * userland should respond by growing the metadata device. We could let
3263 * userland set the threshold, like we do with the data threshold, but I'm
3264 * not sure they know enough to do this well.
3265 */
3266static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3267{
	/*
	 * 4M is ample for all ops, with the possible exception of thin
	 * device deletion, which is harmless if it fails (just retry the
	 * delete after you've grown the device).
	 */
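	/*
	 * Example: with 4KiB metadata blocks, a 1GiB metadata device is
	 * 262144 blocks, so quarter is 65536 and the threshold is capped
	 * at 1024 blocks (4M); any metadata device of 16M or more ends up
	 * with the 4M threshold.
	 */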
3273 dm_block_t quarter = get_metadata_dev_size_in_blocks(bdev: pt->metadata_dev->bdev) / 4;
3274
3275 return min((dm_block_t)1024ULL /* 4M */, quarter);
3276}
3277
3278/*
3279 * thin-pool <metadata dev> <data dev>
3280 * <data block size (sectors)>
3281 * <low water mark (blocks)>
3282 * [<#feature args> [<arg>]*]
3283 *
3284 * Optional feature arguments are:
3285 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
3286 * ignore_discard: disable discard
3287 * no_discard_passdown: don't pass discards down to the data device
3288 * read_only: Don't allow any changes to be made to the pool metadata.
3289 * error_if_no_space: error IOs, instead of queueing, if no space.
3290 */
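/*
 * For example (device paths purely illustrative), a pool with 64KiB
 * (128 sector) data blocks, a low water mark of 32768 blocks and block
 * zeroing disabled would be specified as:
 *
 *   thin-pool /dev/sdb /dev/sdc 128 32768 1 skip_block_zeroing
 */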
3291static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3292{
3293 int r, pool_created = 0;
3294 struct pool_c *pt;
3295 struct pool *pool;
3296 struct pool_features pf;
3297 struct dm_arg_set as;
3298 struct dm_dev *data_dev;
3299 unsigned long block_size;
3300 dm_block_t low_water_blocks;
3301 struct dm_dev *metadata_dev;
3302 blk_mode_t metadata_mode;
3303
3304 /*
3305 * FIXME Remove validation from scope of lock.
3306 */
3307 mutex_lock(&dm_thin_pool_table.mutex);
3308
3309 if (argc < 4) {
3310 ti->error = "Invalid argument count";
3311 r = -EINVAL;
3312 goto out_unlock;
3313 }
3314
3315 as.argc = argc;
3316 as.argv = argv;
3317
3318 /* make sure metadata and data are different devices */
3319 if (!strcmp(argv[0], argv[1])) {
3320 ti->error = "Error setting metadata or data device";
3321 r = -EINVAL;
3322 goto out_unlock;
3323 }
3324
3325 /*
3326 * Set default pool features.
3327 */
3328 pool_features_init(pf: &pf);
3329
3330 dm_consume_args(as: &as, num_args: 4);
3331 r = parse_pool_features(as: &as, pf: &pf, ti);
3332 if (r)
3333 goto out_unlock;
3334
3335 metadata_mode = BLK_OPEN_READ |
3336 ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE);
3337 r = dm_get_device(ti, path: argv[0], mode: metadata_mode, result: &metadata_dev);
3338 if (r) {
3339 ti->error = "Error opening metadata block device";
3340 goto out_unlock;
3341 }
3342 warn_if_metadata_device_too_big(bdev: metadata_dev->bdev);
3343
3344 r = dm_get_device(ti, path: argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, result: &data_dev);
3345 if (r) {
3346 ti->error = "Error getting data device";
3347 goto out_metadata;
3348 }
3349
3350 if (kstrtoul(s: argv[2], base: 10, res: &block_size) || !block_size ||
3351 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3352 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
3353 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
3354 ti->error = "Invalid block size";
3355 r = -EINVAL;
3356 goto out;
3357 }
3358
3359 if (kstrtoull(s: argv[3], base: 10, res: (unsigned long long *)&low_water_blocks)) {
3360 ti->error = "Invalid low water mark";
3361 r = -EINVAL;
3362 goto out;
3363 }
3364
3365 pt = kzalloc(size: sizeof(*pt), GFP_KERNEL);
3366 if (!pt) {
3367 r = -ENOMEM;
3368 goto out;
3369 }
3370
3371 pool = __pool_find(pool_md: dm_table_get_md(t: ti->table), metadata_dev: metadata_dev->bdev, data_dev: data_dev->bdev,
3372 block_size, read_only: pf.mode == PM_READ_ONLY, error: &ti->error, created: &pool_created);
3373 if (IS_ERR(ptr: pool)) {
3374 r = PTR_ERR(ptr: pool);
3375 goto out_free_pt;
3376 }
3377
3378 /*
3379 * 'pool_created' reflects whether this is the first table load.
3380 * Top level discard support is not allowed to be changed after
3381 * initial load. This would require a pool reload to trigger thin
3382 * device changes.
3383 */
3384 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3385 ti->error = "Discard support cannot be disabled once enabled";
3386 r = -EINVAL;
3387 goto out_flags_changed;
3388 }
3389
3390 pt->pool = pool;
3391 pt->ti = ti;
3392 pt->metadata_dev = metadata_dev;
3393 pt->data_dev = data_dev;
3394 pt->low_water_blocks = low_water_blocks;
3395 pt->adjusted_pf = pt->requested_pf = pf;
3396 ti->num_flush_bios = 1;
3397 ti->limit_swap_bios = true;
3398
3399 /*
3400 * Only need to enable discards if the pool should pass
3401 * them down to the data device. The thin device's discard
3402 * processing will cause mappings to be removed from the btree.
3403 */
3404 if (pf.discard_enabled && pf.discard_passdown) {
3405 ti->num_discard_bios = 1;
3406 /*
3407 * Setting 'discards_supported' circumvents the normal
3408 * stacking of discard limits (this keeps the pool and
3409 * thin devices' discard limits consistent).
3410 */
3411 ti->discards_supported = true;
3412 ti->max_discard_granularity = true;
3413 }
3414 ti->private = pt;
3415
3416 r = dm_pool_register_metadata_threshold(pmd: pt->pool->pmd,
3417 threshold: calc_metadata_threshold(pt),
3418 fn: metadata_low_callback,
3419 context: pool);
3420 if (r) {
3421 ti->error = "Error registering metadata threshold";
3422 goto out_flags_changed;
3423 }
3424
3425 dm_pool_register_pre_commit_callback(pmd: pool->pmd,
3426 fn: metadata_pre_commit_callback, context: pool);
3427
3428 mutex_unlock(lock: &dm_thin_pool_table.mutex);
3429
3430 return 0;
3431
3432out_flags_changed:
3433 __pool_dec(pool);
3434out_free_pt:
3435 kfree(objp: pt);
3436out:
3437 dm_put_device(ti, d: data_dev);
3438out_metadata:
3439 dm_put_device(ti, d: metadata_dev);
3440out_unlock:
3441 mutex_unlock(lock: &dm_thin_pool_table.mutex);
3442
3443 return r;
3444}
3445
3446static int pool_map(struct dm_target *ti, struct bio *bio)
3447{
3448 struct pool_c *pt = ti->private;
3449 struct pool *pool = pt->pool;
3450
3451 /*
3452 * As this is a singleton target, ti->begin is always zero.
3453 */
3454 spin_lock_irq(lock: &pool->lock);
3455 bio_set_dev(bio, bdev: pt->data_dev->bdev);
3456 spin_unlock_irq(lock: &pool->lock);
3457
3458 return DM_MAPIO_REMAPPED;
3459}
3460
3461static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3462{
3463 int r;
3464 struct pool_c *pt = ti->private;
3465 struct pool *pool = pt->pool;
3466 sector_t data_size = ti->len;
3467 dm_block_t sb_data_size;
3468
3469 *need_commit = false;
3470
3471 (void) sector_div(data_size, pool->sectors_per_block);
3472
3473 r = dm_pool_get_data_dev_size(pmd: pool->pmd, result: &sb_data_size);
3474 if (r) {
3475 DMERR("%s: failed to retrieve data device size",
3476 dm_device_name(pool->pool_md));
3477 return r;
3478 }
3479
3480 if (data_size < sb_data_size) {
3481 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3482 dm_device_name(pool->pool_md),
3483 (unsigned long long)data_size, sb_data_size);
3484 return -EINVAL;
3485
3486 } else if (data_size > sb_data_size) {
3487 if (dm_pool_metadata_needs_check(pmd: pool->pmd)) {
3488 DMERR("%s: unable to grow the data device until repaired.",
3489 dm_device_name(pool->pool_md));
3490 return 0;
3491 }
3492
3493 if (sb_data_size)
3494 DMINFO("%s: growing the data device from %llu to %llu blocks",
3495 dm_device_name(pool->pool_md),
3496 sb_data_size, (unsigned long long)data_size);
3497 r = dm_pool_resize_data_dev(pmd: pool->pmd, new_size: data_size);
3498 if (r) {
3499 metadata_operation_failed(pool, op: "dm_pool_resize_data_dev", r);
3500 return r;
3501 }
3502
3503 *need_commit = true;
3504 }
3505
3506 return 0;
3507}
3508
3509static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3510{
3511 int r;
3512 struct pool_c *pt = ti->private;
3513 struct pool *pool = pt->pool;
3514 dm_block_t metadata_dev_size, sb_metadata_dev_size;
3515
3516 *need_commit = false;
3517
3518 metadata_dev_size = get_metadata_dev_size_in_blocks(bdev: pool->md_dev);
3519
3520 r = dm_pool_get_metadata_dev_size(pmd: pool->pmd, result: &sb_metadata_dev_size);
3521 if (r) {
3522 DMERR("%s: failed to retrieve metadata device size",
3523 dm_device_name(pool->pool_md));
3524 return r;
3525 }
3526
3527 if (metadata_dev_size < sb_metadata_dev_size) {
3528 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3529 dm_device_name(pool->pool_md),
3530 metadata_dev_size, sb_metadata_dev_size);
3531 return -EINVAL;
3532
3533 } else if (metadata_dev_size > sb_metadata_dev_size) {
3534 if (dm_pool_metadata_needs_check(pmd: pool->pmd)) {
3535 DMERR("%s: unable to grow the metadata device until repaired.",
3536 dm_device_name(pool->pool_md));
3537 return 0;
3538 }
3539
3540 warn_if_metadata_device_too_big(bdev: pool->md_dev);
3541 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3542 dm_device_name(pool->pool_md),
3543 sb_metadata_dev_size, metadata_dev_size);
3544
3545 if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3546 set_pool_mode(pool, new_mode: PM_WRITE);
3547
3548 r = dm_pool_resize_metadata_dev(pmd: pool->pmd, new_size: metadata_dev_size);
3549 if (r) {
3550 metadata_operation_failed(pool, op: "dm_pool_resize_metadata_dev", r);
3551 return r;
3552 }
3553
3554 *need_commit = true;
3555 }
3556
3557 return 0;
3558}
3559
3560/*
3561 * Retrieves the number of blocks of the data device from
3562 * the superblock and compares it to the actual device size,
3563 * thus resizing the data device in case it has grown.
3564 *
3565 * This both copes with opening preallocated data devices in the ctr
3566 * being followed by a resume
3567 * -and-
3568 * calling the resume method individually after userspace has
3569 * grown the data device in reaction to a table event.
3570 */
3571static int pool_preresume(struct dm_target *ti)
3572{
3573 int r;
3574 bool need_commit1, need_commit2;
3575 struct pool_c *pt = ti->private;
3576 struct pool *pool = pt->pool;
3577
3578 /*
3579 * Take control of the pool object.
3580 */
3581 r = bind_control_target(pool, ti);
3582 if (r)
3583 goto out;
3584
3585 r = maybe_resize_data_dev(ti, need_commit: &need_commit1);
3586 if (r)
3587 goto out;
3588
3589 r = maybe_resize_metadata_dev(ti, need_commit: &need_commit2);
3590 if (r)
3591 goto out;
3592
3593 if (need_commit1 || need_commit2)
3594 (void) commit(pool);
3595out:
	/*
	 * When a thin-pool is in PM_FAIL mode it cannot be rebuilt if a
	 * bio is still sitting on the deferred list. Therefore we need to
	 * return 0 to allow pool_resume() to flush IO.
	 */
3601 if (r && get_pool_mode(pool) == PM_FAIL)
3602 r = 0;
3603
3604 return r;
3605}
3606
3607static void pool_suspend_active_thins(struct pool *pool)
3608{
3609 struct thin_c *tc;
3610
3611 /* Suspend all active thin devices */
3612 tc = get_first_thin(pool);
3613 while (tc) {
3614 dm_internal_suspend_noflush(md: tc->thin_md);
3615 tc = get_next_thin(pool, tc);
3616 }
3617}
3618
3619static void pool_resume_active_thins(struct pool *pool)
3620{
3621 struct thin_c *tc;
3622
3623 /* Resume all active thin devices */
3624 tc = get_first_thin(pool);
3625 while (tc) {
3626 dm_internal_resume(md: tc->thin_md);
3627 tc = get_next_thin(pool, tc);
3628 }
3629}
3630
3631static void pool_resume(struct dm_target *ti)
3632{
3633 struct pool_c *pt = ti->private;
3634 struct pool *pool = pt->pool;
3635
3636 /*
3637 * Must requeue active_thins' bios and then resume
3638 * active_thins _before_ clearing 'suspend' flag.
3639 */
3640 requeue_bios(pool);
3641 pool_resume_active_thins(pool);
3642
3643 spin_lock_irq(lock: &pool->lock);
3644 pool->low_water_triggered = false;
3645 pool->suspended = false;
3646 spin_unlock_irq(lock: &pool->lock);
3647
3648 do_waker(ws: &pool->waker.work);
3649}
3650
3651static void pool_presuspend(struct dm_target *ti)
3652{
3653 struct pool_c *pt = ti->private;
3654 struct pool *pool = pt->pool;
3655
3656 spin_lock_irq(lock: &pool->lock);
3657 pool->suspended = true;
3658 spin_unlock_irq(lock: &pool->lock);
3659
3660 pool_suspend_active_thins(pool);
3661}
3662
3663static void pool_presuspend_undo(struct dm_target *ti)
3664{
3665 struct pool_c *pt = ti->private;
3666 struct pool *pool = pt->pool;
3667
3668 pool_resume_active_thins(pool);
3669
3670 spin_lock_irq(lock: &pool->lock);
3671 pool->suspended = false;
3672 spin_unlock_irq(lock: &pool->lock);
3673}
3674
3675static void pool_postsuspend(struct dm_target *ti)
3676{
3677 struct pool_c *pt = ti->private;
3678 struct pool *pool = pt->pool;
3679
3680 cancel_delayed_work_sync(dwork: &pool->waker);
3681 cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
3682 flush_workqueue(pool->wq);
3683 (void) commit(pool);
3684}
3685
3686static int check_arg_count(unsigned int argc, unsigned int args_required)
3687{
3688 if (argc != args_required) {
3689 DMWARN("Message received with %u arguments instead of %u.",
3690 argc, args_required);
3691 return -EINVAL;
3692 }
3693
3694 return 0;
3695}
3696
3697static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3698{
3699 if (!kstrtoull(s: arg, base: 10, res: (unsigned long long *)dev_id) &&
3700 *dev_id <= MAX_DEV_ID)
3701 return 0;
3702
3703 if (warning)
3704 DMWARN("Message received with invalid device id: %s", arg);
3705
3706 return -EINVAL;
3707}
3708
3709static int process_create_thin_mesg(unsigned int argc, char **argv, struct pool *pool)
3710{
3711 dm_thin_id dev_id;
3712 int r;
3713
3714 r = check_arg_count(argc, args_required: 2);
3715 if (r)
3716 return r;
3717
3718 r = read_dev_id(arg: argv[1], dev_id: &dev_id, warning: 1);
3719 if (r)
3720 return r;
3721
3722 r = dm_pool_create_thin(pmd: pool->pmd, dev: dev_id);
3723 if (r) {
3724 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3725 argv[1]);
3726 return r;
3727 }
3728
3729 return 0;
3730}
3731
3732static int process_create_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3733{
3734 dm_thin_id dev_id;
3735 dm_thin_id origin_dev_id;
3736 int r;
3737
3738 r = check_arg_count(argc, args_required: 3);
3739 if (r)
3740 return r;
3741
3742 r = read_dev_id(arg: argv[1], dev_id: &dev_id, warning: 1);
3743 if (r)
3744 return r;
3745
3746 r = read_dev_id(arg: argv[2], dev_id: &origin_dev_id, warning: 1);
3747 if (r)
3748 return r;
3749
3750 r = dm_pool_create_snap(pmd: pool->pmd, dev: dev_id, origin: origin_dev_id);
3751 if (r) {
3752 DMWARN("Creation of new snapshot %s of device %s failed.",
3753 argv[1], argv[2]);
3754 return r;
3755 }
3756
3757 return 0;
3758}
3759
3760static int process_delete_mesg(unsigned int argc, char **argv, struct pool *pool)
3761{
3762 dm_thin_id dev_id;
3763 int r;
3764
3765 r = check_arg_count(argc, args_required: 2);
3766 if (r)
3767 return r;
3768
3769 r = read_dev_id(arg: argv[1], dev_id: &dev_id, warning: 1);
3770 if (r)
3771 return r;
3772
3773 r = dm_pool_delete_thin_device(pmd: pool->pmd, dev: dev_id);
3774 if (r)
3775 DMWARN("Deletion of thin device %s failed.", argv[1]);
3776
3777 return r;
3778}
3779
3780static int process_set_transaction_id_mesg(unsigned int argc, char **argv, struct pool *pool)
3781{
3782 dm_thin_id old_id, new_id;
3783 int r;
3784
3785 r = check_arg_count(argc, args_required: 3);
3786 if (r)
3787 return r;
3788
3789 if (kstrtoull(s: argv[1], base: 10, res: (unsigned long long *)&old_id)) {
3790 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3791 return -EINVAL;
3792 }
3793
3794 if (kstrtoull(s: argv[2], base: 10, res: (unsigned long long *)&new_id)) {
3795 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3796 return -EINVAL;
3797 }
3798
3799 r = dm_pool_set_metadata_transaction_id(pmd: pool->pmd, current_id: old_id, new_id);
3800 if (r) {
3801 DMWARN("Failed to change transaction id from %s to %s.",
3802 argv[1], argv[2]);
3803 return r;
3804 }
3805
3806 return 0;
3807}
3808
3809static int process_reserve_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3810{
3811 int r;
3812
3813 r = check_arg_count(argc, args_required: 1);
3814 if (r)
3815 return r;
3816
3817 (void) commit(pool);
3818
3819 r = dm_pool_reserve_metadata_snap(pmd: pool->pmd);
3820 if (r)
3821 DMWARN("reserve_metadata_snap message failed.");
3822
3823 return r;
3824}
3825
3826static int process_release_metadata_snap_mesg(unsigned int argc, char **argv, struct pool *pool)
3827{
3828 int r;
3829
3830 r = check_arg_count(argc, args_required: 1);
3831 if (r)
3832 return r;
3833
3834 r = dm_pool_release_metadata_snap(pmd: pool->pmd);
3835 if (r)
3836 DMWARN("release_metadata_snap message failed.");
3837
3838 return r;
3839}
3840
3841/*
3842 * Messages supported:
3843 * create_thin <dev_id>
3844 * create_snap <dev_id> <origin_id>
3845 * delete <dev_id>
3846 * set_transaction_id <current_trans_id> <new_trans_id>
3847 * reserve_metadata_snap
3848 * release_metadata_snap
3849 */
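/*
 * These are typically issued from userspace with, for example:
 *
 *   dmsetup message <pool device> 0 create_thin 0
 */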
static int pool_message(struct dm_target *ti, unsigned int argc, char **argv,
			char *result, unsigned int maxlen)
{
	int r = -EINVAL;
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
		      dm_device_name(pool->pool_md));
		return -EOPNOTSUPP;
	}

	if (!strcasecmp(argv[0], "create_thin"))
		r = process_create_thin_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "create_snap"))
		r = process_create_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "delete"))
		r = process_delete_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "set_transaction_id"))
		r = process_set_transaction_id_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
		r = process_reserve_metadata_snap_mesg(argc, argv, pool);

	else if (!strcasecmp(argv[0], "release_metadata_snap"))
		r = process_release_metadata_snap_mesg(argc, argv, pool);

	else
		DMWARN("Unrecognised thin pool target message received: %s", argv[0]);

	if (!r)
		(void) commit(pool);

	return r;
}

static void emit_flags(struct pool_features *pf, char *result,
		       unsigned int sz, unsigned int maxlen)
{
	unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled +
		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
		pf->error_if_no_space;
	DMEMIT("%u ", count);

	if (!pf->zero_new_blocks)
		DMEMIT("skip_block_zeroing ");

	if (!pf->discard_enabled)
		DMEMIT("ignore_discard ");

	if (!pf->discard_passdown)
		DMEMIT("no_discard_passdown ");

	if (pf->mode == PM_READ_ONLY)
		DMEMIT("read_only ");

	if (pf->error_if_no_space)
		DMEMIT("error_if_no_space ");
}

/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 *    <pool mode> <discard config> <no space config> <needs_check>
 *    <metadata low watermark>
 */
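/*
 * Illustrative example (all values hypothetical): a healthy pool with
 * transaction id 1, 406 of 4096 metadata blocks used, 1024 of 1048576 data
 * blocks used, no held metadata snapshot and default behaviour would
 * report something like:
 *
 *   1 406/4096 1024/1048576 - rw discard_passdown queue_if_no_space - 1024
 */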
static void pool_status(struct dm_target *ti, status_type_t type,
			unsigned int status_flags, char *result, unsigned int maxlen)
{
	int r;
	unsigned int sz = 0;
	uint64_t transaction_id;
	dm_block_t nr_free_blocks_data;
	dm_block_t nr_free_blocks_metadata;
	dm_block_t nr_blocks_data;
	dm_block_t nr_blocks_metadata;
	dm_block_t held_root;
	enum pool_mode mode;
	char buf[BDEVNAME_SIZE];
	char buf2[BDEVNAME_SIZE];
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_pool_mode(pool) == PM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit(pool);

		r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
		if (r) {
			DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
		if (r) {
			DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
		if (r) {
			DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
		if (r) {
			DMERR("%s: dm_pool_get_free_block_count returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
		if (r) {
			DMERR("%s: dm_pool_get_data_dev_size returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
		if (r) {
			DMERR("%s: dm_pool_get_metadata_snap returned %d",
			      dm_device_name(pool->pool_md), r);
			goto err;
		}

		DMEMIT("%llu %llu/%llu %llu/%llu ",
		       (unsigned long long)transaction_id,
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
		       (unsigned long long)nr_blocks_data);

		if (held_root)
			DMEMIT("%llu ", held_root);
		else
			DMEMIT("- ");

		mode = get_pool_mode(pool);
		if (mode == PM_OUT_OF_DATA_SPACE)
			DMEMIT("out_of_data_space ");
		else if (is_read_only_pool_mode(mode))
			DMEMIT("ro ");
		else
			DMEMIT("rw ");

		if (!pool->pf.discard_enabled)
			DMEMIT("ignore_discard ");
		else if (pool->pf.discard_passdown)
			DMEMIT("discard_passdown ");
		else
			DMEMIT("no_discard_passdown ");

		if (pool->pf.error_if_no_space)
			DMEMIT("error_if_no_space ");
		else
			DMEMIT("queue_if_no_space ");

		if (dm_pool_metadata_needs_check(pool->pmd))
			DMEMIT("needs_check ");
		else
			DMEMIT("- ");

		DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));

		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s %lu %llu ",
		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
		       (unsigned long)pool->sectors_per_block,
		       (unsigned long long)pt->low_water_blocks);
		emit_flags(&pt->requested_pf, result, sz, maxlen);
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}
	return;

err:
	DMEMIT("Error");
}

static int pool_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	struct pool_c *pt = ti->private;

	return fn(ti, pt->data_dev, 0, ti->len, data);
}

static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;
	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If max_sectors is smaller than pool->sectors_per_block adjust it
	 * to the highest possible power-of-2 factor of pool->sectors_per_block.
	 * This is especially beneficial when the pool's data device is a RAID
	 * device that has a full stripe width matching pool->sectors_per_block:
	 * even though partial RAID-stripe-sized IOs will be issued to a single
	 * RAID stripe, when aggregated they will end on a full RAID stripe
	 * boundary, which avoids cascading additional partial RAID stripe writes.
	 */
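	/*
	 * Hypothetical worked example (values chosen purely for illustration):
	 * with sectors_per_block = 384 (a 192KB block) and max_sectors = 256,
	 * 256 does not divide 384; it is already a power of two, so it is
	 * decremented to 255 and rounded down to 128, which does divide 384.
	 * The loop below therefore leaves max_sectors at 128, the largest
	 * power-of-2 factor not exceeding the original limit.
	 */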
	if (limits->max_sectors < pool->sectors_per_block) {
		while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
			if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
				limits->max_sectors--;
			limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
		}
	}

	/*
	 * If the system-determined stacked limits are compatible with the
	 * pool's blocksize (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < pool->sectors_per_block ||
	    !is_factor(io_opt_sectors, pool->sectors_per_block)) {
		if (is_factor(pool->sectors_per_block, limits->max_sectors))
			blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
		else
			blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
		blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
	}

	/*
	 * pt->adjusted_pf is a staging area for the actual features to use.
	 * They get transferred to the live pool in bind_control_target()
	 * called from pool_preresume().
	 */

	if (pt->adjusted_pf.discard_enabled) {
		disable_discard_passdown_if_not_supported(pt);
		if (!pt->adjusted_pf.discard_passdown)
			limits->max_discard_sectors = 0;
		/*
		 * The pool uses the same discard limits as the underlying data
		 * device.  DM core has already set this up.
		 */
	} else {
		/*
		 * Must explicitly disallow stacking discard limits otherwise the
		 * block layer will stack them if pool's data device has support.
		 */
		limits->discard_granularity = 0;
	}
}

static struct target_type pool_target = {
	.name = "thin-pool",
	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
	.version = {1, 23, 0},
	.module = THIS_MODULE,
	.ctr = pool_ctr,
	.dtr = pool_dtr,
	.map = pool_map,
	.presuspend = pool_presuspend,
	.presuspend_undo = pool_presuspend_undo,
	.postsuspend = pool_postsuspend,
	.preresume = pool_preresume,
	.resume = pool_resume,
	.message = pool_message,
	.status = pool_status,
	.iterate_devices = pool_iterate_devices,
	.io_hints = pool_io_hints,
};

/*
 *--------------------------------------------------------------
 * Thin target methods
 *--------------------------------------------------------------
 */
static void thin_get(struct thin_c *tc)
{
	refcount_inc(&tc->refcount);
}

static void thin_put(struct thin_c *tc)
{
	if (refcount_dec_and_test(&tc->refcount))
		complete(&tc->can_destroy);
}

static void thin_dtr(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	spin_lock_irq(&tc->pool->lock);
	list_del_rcu(&tc->list);
	spin_unlock_irq(&tc->pool->lock);
	synchronize_rcu();

	thin_put(tc);
	wait_for_completion(&tc->can_destroy);

	mutex_lock(&dm_thin_pool_table.mutex);

	__pool_dec(tc->pool);
	dm_pool_close_thin_device(tc->td);
	dm_put_device(ti, tc->pool_dev);
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
	kfree(tc);

	mutex_unlock(&dm_thin_pool_table.mutex);
}

/*
 * Thin target parameters:
 *
 * <pool_dev> <dev_id> [origin_dev]
 *
 * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
 * dev_id: the internal device identifier
 * origin_dev: a device external to the pool that should act as the origin
 *
 * If the pool device has discards disabled, they get disabled for the thin
 * device as well.
 */
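/*
 * Illustrative example (device names and sizes are hypothetical): a 1GiB
 * (2097152-sector) thin volume using device id 0 of the pool at
 * /dev/mapper/pool could be activated with:
 *
 *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
 */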
static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct thin_c *tc;
	struct dm_dev *pool_dev, *origin_dev;
	struct mapped_device *pool_md;

	mutex_lock(&dm_thin_pool_table.mutex);

	if (argc != 2 && argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto out_unlock;
	}

	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
	if (!tc) {
		ti->error = "Out of memory";
		r = -ENOMEM;
		goto out_unlock;
	}
	tc->thin_md = dm_table_get_md(ti->table);
	spin_lock_init(&tc->lock);
	INIT_LIST_HEAD(&tc->deferred_cells);
	bio_list_init(&tc->deferred_bio_list);
	bio_list_init(&tc->retry_on_resume_list);
	tc->sort_bio_list = RB_ROOT;

	if (argc == 3) {
		if (!strcmp(argv[0], argv[2])) {
			ti->error = "Error setting origin device";
			r = -EINVAL;
			goto bad_origin_dev;
		}

		r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev);
		if (r) {
			ti->error = "Error opening origin device";
			goto bad_origin_dev;
		}
		tc->origin_dev = origin_dev;
	}

	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
	if (r) {
		ti->error = "Error opening pool device";
		goto bad_pool_dev;
	}
	tc->pool_dev = pool_dev;

	if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
		ti->error = "Invalid device id";
		r = -EINVAL;
		goto bad_common;
	}

	pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
	if (!pool_md) {
		ti->error = "Couldn't get pool mapped device";
		r = -EINVAL;
		goto bad_common;
	}

	tc->pool = __pool_table_lookup(pool_md);
	if (!tc->pool) {
		ti->error = "Couldn't find pool object";
		r = -EINVAL;
		goto bad_pool_lookup;
	}
	__pool_inc(tc->pool);

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		ti->error = "Couldn't open thin device, Pool is in fail mode";
		r = -EINVAL;
		goto bad_pool;
	}

	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
	if (r) {
		ti->error = "Couldn't open thin internal device";
		goto bad_pool;
	}

	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
	if (r)
		goto bad;

	ti->num_flush_bios = 1;
	ti->limit_swap_bios = true;
	ti->flush_supported = true;
	ti->accounts_remapped_io = true;
	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);

	/* In case the pool supports discards, pass them on. */
	if (tc->pool->pf.discard_enabled) {
		ti->discards_supported = true;
		ti->num_discard_bios = 1;
		ti->max_discard_granularity = true;
	}

	mutex_unlock(&dm_thin_pool_table.mutex);

	spin_lock_irq(&tc->pool->lock);
	if (tc->pool->suspended) {
		spin_unlock_irq(&tc->pool->lock);
		mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
		ti->error = "Unable to activate thin device while pool is suspended";
		r = -EINVAL;
		goto bad;
	}
	refcount_set(&tc->refcount, 1);
	init_completion(&tc->can_destroy);
	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
	spin_unlock_irq(&tc->pool->lock);
	/*
	 * This synchronize_rcu() call is needed here otherwise we risk a
	 * wake_worker() call finding no bios to process (because the newly
	 * added tc isn't yet visible).  So this reduces latency since we
	 * aren't then dependent on the periodic commit to wake_worker().
	 */
	synchronize_rcu();

	dm_put(pool_md);

	return 0;

bad:
	dm_pool_close_thin_device(tc->td);
bad_pool:
	__pool_dec(tc->pool);
bad_pool_lookup:
	dm_put(pool_md);
bad_common:
	dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
	if (tc->origin_dev)
		dm_put_device(ti, tc->origin_dev);
bad_origin_dev:
	kfree(tc);
out_unlock:
	mutex_unlock(&dm_thin_pool_table.mutex);

	return r;
}

static int thin_map(struct dm_target *ti, struct bio *bio)
{
	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

	return thin_bio_map(ti, bio);
}

static int thin_endio(struct dm_target *ti, struct bio *bio,
		      blk_status_t *err)
{
	unsigned long flags;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct list_head work;
	struct dm_thin_new_mapping *m, *tmp;
	struct pool *pool = h->tc->pool;

	if (h->shared_read_entry) {
		INIT_LIST_HEAD(&work);
		dm_deferred_entry_dec(h->shared_read_entry, &work);

		spin_lock_irqsave(&pool->lock, flags);
		list_for_each_entry_safe(m, tmp, &work, list) {
			list_del(&m->list);
			__complete_mapping_preparation(m);
		}
		spin_unlock_irqrestore(&pool->lock, flags);
	}

	if (h->all_io_entry) {
		INIT_LIST_HEAD(&work);
		dm_deferred_entry_dec(h->all_io_entry, &work);
		if (!list_empty(&work)) {
			spin_lock_irqsave(&pool->lock, flags);
			list_for_each_entry_safe(m, tmp, &work, list)
				list_add_tail(&m->list, &pool->prepared_discards);
			spin_unlock_irqrestore(&pool->lock, flags);
			wake_worker(pool);
		}
	}

	if (h->cell)
		cell_defer_no_holder(h->tc, h->cell);

	return DM_ENDIO_DONE;
}

static void thin_presuspend(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	if (dm_noflush_suspending(ti))
		noflush_work(tc, do_noflush_start);
}

static void thin_postsuspend(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	/*
	 * The dm_noflush_suspending flag has been cleared by now, so
	 * unfortunately we must always run this.
	 */
	noflush_work(tc, do_noflush_stop);
}

static int thin_preresume(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	if (tc->origin_dev)
		tc->origin_size = get_dev_size(tc->origin_dev->bdev);

	return 0;
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
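/*
 * Illustrative example (values are hypothetical): a thin device with 2048
 * sectors mapped whose highest mapped sector is 4095 would report
 * "2048 4095" for STATUSTYPE_INFO, while STATUSTYPE_TABLE reports the
 * construction parameters, e.g. "253:0 1" for dev_id 1 on pool 253:0.
 */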
static void thin_status(struct dm_target *ti, status_type_t type,
			unsigned int status_flags, char *result, unsigned int maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		DMEMIT("Fail");
		return;
	}

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r) {
				DMERR("dm_thin_get_mapped_count returned %d", r);
				goto err;
			}

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0) {
				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
				goto err;
			}

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;

		case STATUSTYPE_IMA:
			*result = '\0';
			break;
		}
	}

	return;

err:
	DMEMIT("Error");
}

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	sector_t blocks;
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks.  So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!pool->ti)
		return 0;	/* nothing is bound */

	blocks = pool->ti->len;
	(void) sector_div(blocks, pool->sectors_per_block);
	if (blocks)
		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);

	return 0;
}

static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	if (pool->pf.discard_enabled) {
		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
		limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
	}
}

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 23, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.preresume = thin_preresume,
	.presuspend = thin_presuspend,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.iterate_devices = thin_iterate_devices,
	.io_hints = thin_io_hints,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r = -ENOMEM;

	pool_table_init();

	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	if (!_new_mapping_cache)
		return r;

	r = dm_register_target(&thin_target);
	if (r)
		goto bad_new_mapping_cache;

	r = dm_register_target(&pool_target);
	if (r)
		goto bad_thin_target;

	return 0;

bad_thin_target:
	dm_unregister_target(&thin_target);
bad_new_mapping_cache:
	kmem_cache_destroy(_new_mapping_cache);

	return r;
}

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

	kmem_cache_destroy(_new_mapping_cache);

	pool_table_exit();
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

module_param_named(no_space_timeout, no_space_timeout_secs, uint, 0644);
MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
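/*
 * Illustrative usage (not part of the driver): assuming the module is loaded
 * as dm_thin_pool, the timeout can typically be adjusted at runtime with
 * something like:
 *
 *   echo 120 > /sys/module/dm_thin_pool/parameters/no_space_timeout
 */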

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@lists.linux.dev>");
MODULE_LICENSE("GPL");