dm-thin.c source code [linux/drivers/md/dm-thin.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (C) 2011-2012 Red Hat UK.
4	*
5	* This file is released under the GPL.
6	*/
7
8	#include "dm-thin-metadata.h"
9	#include "dm-bio-prison-v1.h"
10	#include "dm.h"
11
12	#include <linux/device-mapper.h>
13	#include <linux/dm-io.h>
14	#include <linux/dm-kcopyd.h>
15	#include <linux/jiffies.h>
16	#include <linux/log2.h>
17	#include <linux/list.h>
18	#include <linux/rculist.h>
19	#include <linux/init.h>
20	#include <linux/module.h>
21	#include <linux/slab.h>
22	#include <linux/vmalloc.h>
23	#include <linux/sort.h>
24	#include <linux/rbtree.h>
25
26	#define DM_MSG_PREFIX "thin"
27
28	/*
29	* Tunable constants
30	*/
31	#define ENDIO_HOOK_POOL_SIZE 1024
32	#define MAPPING_POOL_SIZE 1024
33	#define COMMIT_PERIOD HZ
34	#define NO_SPACE_TIMEOUT_SECS 60
35
36	static unsigned int no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
37
38	DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
39	"A percentage of time allocated for copy on write");
40
41	/*
42	* The block size of the device holding pool data must be
43	* between 64KB and 1GB.
44	*/
45	#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
46	#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
47
48	/*
49	* Device id is restricted to 24 bits.
50	*/
51	#define MAX_DEV_ID ((1 << 24) - 1)
52
/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data).  When you take an internal snapshot you clone the root node
 * of the origin btree.  After this there is no concept of an origin or a
 * snapshot.  They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic.  If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin.  The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block.  Obviously
 * including all devices that share this block.  (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocated block.  This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping).  This act of inserting breaks some
 * sharing of btree nodes between the two devices.  Breaking sharing only
 * affects the btree of that specific device.  Btrees for the other
 * devices that share the block never change.  The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues.  We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one).  This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block.  As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing.  I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block.  At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */
110
/*----------------------------------------------------------------*/
112
113	/*
114	* Key building.
115	*/
/*
 * Selects which address space a cell key refers to; build_key() sets
 * key->virtual only for VIRTUAL.
 */
enum lock_space {
	VIRTUAL,
	PHYSICAL
};
120
121	static bool build_key(struct dm_thin_device td, enum* lock_space ls,
122	dm_block_t b, dm_block_t e, struct dm_cell_key *key)
123	{
124	key->virtual = (ls == VIRTUAL);
125	key->dev = dm_thin_dev_id(td);
126	key->block_begin = b;
127	key->block_end = e;
128
129	return dm_cell_key_has_valid_range(key);
130	}
131
132	static void build_data_key(struct dm_thin_device *td, dm_block_t b,
133	struct dm_cell_key *key)
134	{
135	(void) build_key(td, ls: PHYSICAL, b, e: b + `1llu`, key);
136	}
137
138	static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
139	struct dm_cell_key *key)
140	{
141	(void) build_key(td, ls: VIRTUAL, b, e: b + `1llu`, key);
142	}
143
/*----------------------------------------------------------------*/
145
146	#define THROTTLE_THRESHOLD (1 * HZ)
147
148	struct throttle {
149	struct rw_semaphore lock;
150	unsigned long threshold;
151	bool throttle_applied;
152	};
153
154	static void throttle_init(struct throttle *t)
155	{
156	init_rwsem(&t->lock);
157	t->throttle_applied = false;
158	}
159
160	static void throttle_work_start(struct throttle *t)
161	{
162	t->threshold = jiffies + THROTTLE_THRESHOLD;
163	}
164
165	static void throttle_work_update(struct throttle *t)
166	{
167	if (!t->throttle_applied && time_is_before_jiffies(t->threshold)) {
168	down_write(sem: &t->lock);
169	t->throttle_applied = true;
170	}
171	}
172
173	static void throttle_work_complete(struct throttle *t)
174	{
175	if (t->throttle_applied) {
176	t->throttle_applied = false;
177	up_write(sem: &t->lock);
178	}
179	}
180
181	static void throttle_lock(struct throttle *t)
182	{
183	down_read(sem: &t->lock);
184	}
185
186	static void throttle_unlock(struct throttle *t)
187	{
188	up_read(sem: &t->lock);
189	}
190
/*----------------------------------------------------------------*/
192
/*
 * A pool device ties together a metadata device and a data device.  It
 * also provides the interface for creating and destroying internal
 * devices.
 */
/* Forward declaration; the full definition appears later in this file. */
struct dm_thin_new_mapping;
199
/*
 * The pool runs in various modes.  Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */

	/*
	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
	 */
	PM_OUT_OF_METADATA_SPACE,
	PM_READ_ONLY,		/* metadata may not be changed */

	PM_FAIL,		/* all I/O fails */
};
215
216	struct pool_features {
217	enum pool_mode mode;
218
219	bool zero_new_blocks:`1`;
220	bool discard_enabled:`1`;
221	bool discard_passdown:`1`;
222	bool error_if_no_space:`1`;
223	};
224
struct thin_c;

/*
 * Function-pointer types for the handlers stored in struct pool.  They
 * must be pointer types because struct pool holds them as members.
 */
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
229
230	#define CELL_SORT_ARRAY_SIZE 8192
231
232	struct pool {
233	struct list_head list;
234	struct dm_target ti; /* Only set if a pool target is bound /
235
236	struct mapped_device *pool_md;
237	struct block_device *data_dev;
238	struct block_device *md_dev;
239	struct dm_pool_metadata *pmd;
240
241	dm_block_t low_water_blocks;
242	uint32_t sectors_per_block;
243	int sectors_per_block_shift;
244
245	struct pool_features pf;
246	bool low_water_triggered:`1`; / A dm event has been sent /
247	bool suspended:`1`;
248	bool out_of_data_space:`1`;
249
250	struct dm_bio_prison *prison;
251	struct dm_kcopyd_client *copier;
252
253	struct work_struct worker;
254	struct workqueue_struct *wq;
255	struct throttle throttle;
256	struct delayed_work waker;
257	struct delayed_work no_space_timeout;
258
259	unsigned long last_commit_jiffies;
260	unsigned int ref_count;
261
262	spinlock_t lock;
263	struct bio_list deferred_flush_bios;
264	struct bio_list deferred_flush_completions;
265	struct list_head prepared_mappings;
266	struct list_head prepared_discards;
267	struct list_head prepared_discards_pt2;
268	struct list_head active_thins;
269
270	struct dm_deferred_set *shared_read_ds;
271	struct dm_deferred_set *all_io_ds;
272
273	struct dm_thin_new_mapping *next_mapping;
274
275	process_bio_fn process_bio;
276	process_bio_fn process_discard;
277
278	process_cell_fn process_cell;
279	process_cell_fn process_discard_cell;
280
281	process_mapping_fn process_prepared_mapping;
282	process_mapping_fn process_prepared_discard;
283	process_mapping_fn process_prepared_discard_pt2;
284
285	struct dm_bio_prison_cell **cell_sort_array;
286
287	mempool_t mapping_pool;
288	};
289
/* Forward declaration: callers pass the name of the failed operation and its error code. */
static void metadata_operation_failed(struct pool *pool, const char *op, int r);
291
292	static enum pool_mode get_pool_mode(struct pool *pool)
293	{
294	return pool->pf.mode;
295	}
296
297	static void notify_of_pool_mode_change(struct pool *pool)
298	{
299	static const char *descs[] = {
300	"write",
301	"out-of-data-space",
302	"read-only",
303	"read-only",
304	"fail"
305	};
306	const char *extra_desc = NULL;
307	enum pool_mode mode = get_pool_mode(pool);
308
309	if (mode == PM_OUT_OF_DATA_SPACE) {
310	if (!pool->pf.error_if_no_space)
311	extra_desc = " (queue IO)";
312	else
313	extra_desc = " (error IO)";
314	}
315
316	dm_table_event(t: pool->ti->table);
317	DMINFO("%s: switching pool to %s%s mode",
318	dm_device_name(pool->pool_md),
319	descs[(int)mode], extra_desc ? : "");
320	}
321
322	/*
323	* Target context for a pool.
324	*/
325	struct pool_c {
326	struct dm_target *ti;
327	struct pool *pool;
328	struct dm_dev *data_dev;
329	struct dm_dev *metadata_dev;
330
331	dm_block_t low_water_blocks;
332	struct pool_features requested_pf; / Features requested during table load /
333	struct pool_features adjusted_pf; / Features used after adjusting for constituent devices /
334	};
335
336	/*
337	* Target context for a thin.
338	*/
339	struct thin_c {
340	struct list_head list;
341	struct dm_dev *pool_dev;
342	struct dm_dev *origin_dev;
343	sector_t origin_size;
344	dm_thin_id dev_id;
345
346	struct pool *pool;
347	struct dm_thin_device *td;
348	struct mapped_device *thin_md;
349
350	bool requeue_mode:`1`;
351	spinlock_t lock;
352	struct list_head deferred_cells;
353	struct bio_list deferred_bio_list;
354	struct bio_list retry_on_resume_list;
355	struct rb_root sort_bio_list; / sorted list of deferred bios /
356
357	/*
358	* Ensures the thin is not destroyed until the worker has finished
359	* iterating the active_thins list.
360	*/
361	refcount_t refcount;
362	struct completion can_destroy;
363	};
364
/*----------------------------------------------------------------*/
366
367	static bool block_size_is_power_of_two(struct pool *pool)
368	{
369	return pool->sectors_per_block_shift >= `0`;
370	}
371
372	static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
373	{
374	return block_size_is_power_of_two(pool) ?
375	(b << pool->sectors_per_block_shift) :
376	(b * pool->sectors_per_block);
377	}
378
/*----------------------------------------------------------------*/
380
381	struct discard_op {
382	struct thin_c *tc;
383	struct blk_plug plug;
384	struct bio *parent_bio;
385	struct bio *bio;
386	};
387
388	static void begin_discard(struct discard_op op, struct* thin_c tc, struct* bio *parent)
389	{
390	BUG_ON(!parent);
391
392	op->tc = tc;
393	blk_start_plug(&op->plug);
394	op->parent_bio = parent;
395	op->bio = NULL;
396	}
397
398	static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
399	{
400	struct thin_c *tc = op->tc;
401	sector_t s = block_to_sectors(pool: tc->pool, b: data_b);
402	sector_t len = block_to_sectors(pool: tc->pool, b: data_e - data_b);
403
404	return __blkdev_issue_discard(bdev: tc->pool_dev->bdev, sector: s, nr_sects: len, GFP_NOIO, biop: &op->bio);
405	}
406
407	static void end_discard(struct discard_op op, int* r)
408	{
409	if (op->bio) {
410	/*
411	* Even if one of the calls to issue_discard failed, we
412	* need to wait for the chain to complete.
413	*/
414	bio_chain(op->bio, op->parent_bio);
415	op->bio->bi_opf = REQ_OP_DISCARD;
416	submit_bio(bio: op->bio);
417	}
418
419	blk_finish_plug(&op->plug);
420
421	/*
422	* Even if r is set, there could be sub discards in flight that we
423	* need to wait for.
424	*/
425	if (r && !op->parent_bio->bi_status)
426	op->parent_bio->bi_status = errno_to_blk_status(errno: r);
427	bio_endio(op->parent_bio);
428	}
429
/*----------------------------------------------------------------*/
431
432	/*
433	* wake_worker() is used when new work is queued and when pool_resume is
434	* ready to continue deferred IO processing.
435	*/
436	static void wake_worker(struct pool *pool)
437	{
438	queue_work(wq: pool->wq, work: &pool->worker);
439	}
440
/*----------------------------------------------------------------*/
442
443	static int bio_detain(struct pool pool, struct* dm_cell_key key, struct* bio *bio,
444	struct dm_bio_prison_cell **cell_result)
445	{
446	int r;
447	struct dm_bio_prison_cell *cell_prealloc;
448
449	/*
450	* Allocate a cell from the prison's mempool.
451	* This might block but it can't fail.
452	*/
453	cell_prealloc = dm_bio_prison_alloc_cell(prison: pool->prison, GFP_NOIO);
454
455	r = dm_bio_detain(prison: pool->prison, key, inmate: bio, cell_prealloc, cell_result);
456	if (r)
457	/*
458	* We reused an old cell; we can get rid of
459	* the new one.
460	*/
461	dm_bio_prison_free_cell(prison: pool->prison, cell: cell_prealloc);
462
463	return r;
464	}
465
466	static void cell_release(struct pool *pool,
467	struct dm_bio_prison_cell *cell,
468	struct bio_list *bios)
469	{
470	dm_cell_release(prison: pool->prison, cell, bios);
471	dm_bio_prison_free_cell(prison: pool->prison, cell);
472	}
473
474	static void cell_visit_release(struct pool *pool,
475	void (fn)(void* , struct* dm_bio_prison_cell *),
476	void *context,
477	struct dm_bio_prison_cell *cell)
478	{
479	dm_cell_visit_release(prison: pool->prison, visit_fn: fn, context, cell);
480	dm_bio_prison_free_cell(prison: pool->prison, cell);
481	}
482
483	static void cell_release_no_holder(struct pool *pool,
484	struct dm_bio_prison_cell *cell,
485	struct bio_list *bios)
486	{
487	dm_cell_release_no_holder(prison: pool->prison, cell, inmates: bios);
488	dm_bio_prison_free_cell(prison: pool->prison, cell);
489	}
490
491	static void cell_error_with_code(struct pool *pool,
492	struct dm_bio_prison_cell *cell, blk_status_t error_code)
493	{
494	dm_cell_error(prison: pool->prison, cell, error: error_code);
495	dm_bio_prison_free_cell(prison: pool->prison, cell);
496	}
497
498	static blk_status_t get_pool_io_error_code(struct pool *pool)
499	{
500	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
501	}
502
/* Error @cell with the pool's current IO error code. */
static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
}
507
/* Finish @cell through the error path with a status of 0. */
static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}
512
513	static void cell_requeue(struct pool pool, struct* dm_bio_prison_cell *cell)
514	{
515	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
516	}
517
/*----------------------------------------------------------------*/
519
520	/*
521	* A global list of pools that uses a struct mapped_device as a key.
522	*/
523	static struct dm_thin_pool_table {
524	struct mutex mutex;
525	struct list_head pools;
526	} dm_thin_pool_table;
527
528	static void pool_table_init(void)
529	{
530	mutex_init(&dm_thin_pool_table.mutex);
531	INIT_LIST_HEAD(list: &dm_thin_pool_table.pools);
532	}
533
534	static void pool_table_exit(void)
535	{
536	mutex_destroy(lock: &dm_thin_pool_table.mutex);
537	}
538
539	static void __pool_table_insert(struct pool *pool)
540	{
541	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
542	list_add(new: &pool->list, head: &dm_thin_pool_table.pools);
543	}
544
545	static void __pool_table_remove(struct pool *pool)
546	{
547	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
548	list_del(entry: &pool->list);
549	}
550
551	static struct pool __pool_table_lookup(struct* mapped_device *md)
552	{
553	struct pool pool = NULL, tmp;
554
555	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
556
557	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
558	if (tmp->pool_md == md) {
559	pool = tmp;
560	break;
561	}
562	}
563
564	return pool;
565	}
566
567	static struct pool __pool_table_lookup_metadata_dev(struct* block_device *md_dev)
568	{
569	struct pool pool = NULL, tmp;
570
571	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
572
573	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
574	if (tmp->md_dev == md_dev) {
575	pool = tmp;
576	break;
577	}
578	}
579
580	return pool;
581	}
582
/*----------------------------------------------------------------*/
584
585	struct dm_thin_endio_hook {
586	struct thin_c *tc;
587	struct dm_deferred_entry *shared_read_entry;
588	struct dm_deferred_entry *all_io_entry;
589	struct dm_thin_new_mapping *overwrite_mapping;
590	struct rb_node rb_node;
591	struct dm_bio_prison_cell *cell;
592	};
593
/*
 * Move every bio from @master onto @bios, leaving @master empty.
 * Callers in this file hold the lock protecting @master.
 */
static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
{
	bio_list_merge(bios, master);
	bio_list_init(master);
}
599
600	static void error_bio_list(struct bio_list *bios, blk_status_t error)
601	{
602	struct bio *bio;
603
604	while ((bio = bio_list_pop(bl: bios))) {
605	bio->bi_status = error;
606	bio_endio(bio);
607	}
608	}
609
610	static void error_thin_bio_list(struct thin_c tc, struct* bio_list *master,
611	blk_status_t error)
612	{
613	struct bio_list bios;
614
615	bio_list_init(bl: &bios);
616
617	spin_lock_irq(lock: &tc->lock);
618	__merge_bio_list(bios: &bios, master);
619	spin_unlock_irq(lock: &tc->lock);
620
621	error_bio_list(bios: &bios, error);
622	}
623
624	static void requeue_deferred_cells(struct thin_c *tc)
625	{
626	struct pool *pool = tc->pool;
627	struct list_head cells;
628	struct dm_bio_prison_cell cell, tmp;
629
630	INIT_LIST_HEAD(list: &cells);
631
632	spin_lock_irq(lock: &tc->lock);
633	list_splice_init(list: &tc->deferred_cells, head: &cells);
634	spin_unlock_irq(lock: &tc->lock);
635
636	list_for_each_entry_safe(cell, tmp, &cells, user_list)
637	cell_requeue(pool, cell);
638	}
639
640	static void requeue_io(struct thin_c *tc)
641	{
642	struct bio_list bios;
643
644	bio_list_init(bl: &bios);
645
646	spin_lock_irq(lock: &tc->lock);
647	__merge_bio_list(bios: &bios, master: &tc->deferred_bio_list);
648	__merge_bio_list(bios: &bios, master: &tc->retry_on_resume_list);
649	spin_unlock_irq(lock: &tc->lock);
650
651	error_bio_list(bios: &bios, BLK_STS_DM_REQUEUE);
652	requeue_deferred_cells(tc);
653	}
654
655	static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
656	{
657	struct thin_c *tc;
658
659	rcu_read_lock();
660	list_for_each_entry_rcu(tc, &pool->active_thins, list)
661	error_thin_bio_list(tc, master: &tc->retry_on_resume_list, error);
662	rcu_read_unlock();
663	}
664
/* Error the retry lists using the pool's current IO error code. */
static void error_retry_list(struct pool *pool)
{
	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
}
669
670	/*
671	* This section of code contains the logic for processing a thin device's IO.
672	* Much of the code depends on pool object resources (lists, workqueues, etc)
673	* but most is exclusively called from the thin target rather than the thin-pool
674	* target.
675	*/
676
677	static dm_block_t get_bio_block(struct thin_c tc, struct* bio *bio)
678	{
679	struct pool *pool = tc->pool;
680	sector_t block_nr = bio->bi_iter.bi_sector;
681
682	if (block_size_is_power_of_two(pool))
683	block_nr >>= pool->sectors_per_block_shift;
684	else
685	(void) sector_div(block_nr, pool->sectors_per_block);
686
687	return block_nr;
688	}
689
690	/*
691	* Returns the _complete_ blocks that this bio covers.
692	*/
693	static void get_bio_block_range(struct thin_c tc, struct* bio *bio,
694	dm_block_t begin, dm_block_t end)
695	{
696	struct pool *pool = tc->pool;
697	sector_t b = bio->bi_iter.bi_sector;
698	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
699
700	b += pool->sectors_per_block - `1ull`; / so we round up /
701
702	if (block_size_is_power_of_two(pool)) {
703	b >>= pool->sectors_per_block_shift;
704	e >>= pool->sectors_per_block_shift;
705	} else {
706	(void) sector_div(b, pool->sectors_per_block);
707	(void) sector_div(e, pool->sectors_per_block);
708	}
709
710	if (e < b)
711	/ Can happen if the bio is within a single block. /
712	e = b;
713
714	*begin = b;
715	*end = e;
716	}
717
718	static void remap(struct thin_c tc, struct* bio *bio, dm_block_t block)
719	{
720	struct pool *pool = tc->pool;
721	sector_t bi_sector = bio->bi_iter.bi_sector;
722
723	bio_set_dev(bio, bdev: tc->pool_dev->bdev);
724	if (block_size_is_power_of_two(pool))
725	bio->bi_iter.bi_sector =
726	(block << pool->sectors_per_block_shift) \|
727	(bi_sector & (pool->sectors_per_block - `1`));
728	else
729	bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
730	sector_div(bi_sector, pool->sectors_per_block);
731	}
732
733	static void remap_to_origin(struct thin_c tc, struct* bio *bio)
734	{
735	bio_set_dev(bio, bdev: tc->origin_dev->bdev);
736	}
737
738	static int bio_triggers_commit(struct thin_c tc, struct* bio *bio)
739	{
740	return op_is_flush(op: bio->bi_opf) &&
741	dm_thin_changed_this_transaction(td: tc->td);
742	}
743
744	static void inc_all_io_entry(struct pool pool, struct* bio *bio)
745	{
746	struct dm_thin_endio_hook *h;
747
748	if (bio_op(bio) == REQ_OP_DISCARD)
749	return;
750
751	h = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
752	h->all_io_entry = dm_deferred_entry_inc(ds: pool->all_io_ds);
753	}
754
755	static void issue(struct thin_c tc, struct* bio *bio)
756	{
757	struct pool *pool = tc->pool;
758
759	if (!bio_triggers_commit(tc, bio)) {
760	dm_submit_bio_remap(clone: bio, NULL);
761	return;
762	}
763
764	/*
765	* Complete bio with an error if earlier I/O caused changes to
766	* the metadata that can't be committed e.g, due to I/O errors
767	* on the metadata device.
768	*/
769	if (dm_thin_aborted_changes(td: tc->td)) {
770	bio_io_error(bio);
771	return;
772	}
773
774	/*
775	* Batch together any bios that trigger commits and then issue a
776	* single commit for them in process_deferred_bios().
777	*/
778	spin_lock_irq(lock: &pool->lock);
779	bio_list_add(bl: &pool->deferred_flush_bios, bio);
780	spin_unlock_irq(lock: &pool->lock);
781	}
782
/* Point @bio at the origin device and issue it. */
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}
788
789	static void remap_and_issue(struct thin_c tc, struct* bio *bio,
790	dm_block_t block)
791	{
792	remap(tc, bio, block);
793	issue(tc, bio);
794	}
795
/*----------------------------------------------------------------*/
797
798	/*
799	* Bio endio functions.
800	*/
801	struct dm_thin_new_mapping {
802	struct list_head list;
803
804	bool pass_discard:`1`;
805	bool maybe_shared:`1`;
806
807	/*
808	* Track quiescing, copying and zeroing preparation actions. When this
809	* counter hits zero the block is prepared and can be inserted into the
810	* btree.
811	*/
812	atomic_t prepare_actions;
813
814	blk_status_t status;
815	struct thin_c *tc;
816	dm_block_t virt_begin, virt_end;
817	dm_block_t data_block;
818	struct dm_bio_prison_cell *cell;
819
820	/*
821	* If the bio covers the whole area of a block then we can avoid
822	* zeroing or copying. Instead this bio is hooked. The bio will
823	* still be in the cell, so care has to be taken to avoid issuing
824	* the bio twice.
825	*/
826	struct bio *bio;
827	bio_end_io_t *saved_bi_end_io;
828	};
829
830	static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
831	{
832	struct pool *pool = m->tc->pool;
833
834	if (atomic_dec_and_test(v: &m->prepare_actions)) {
835	list_add_tail(new: &m->list, head: &pool->prepared_mappings);
836	wake_worker(pool);
837	}
838	}
839
840	static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
841	{
842	unsigned long flags;
843	struct pool *pool = m->tc->pool;
844
845	spin_lock_irqsave(&pool->lock, flags);
846	__complete_mapping_preparation(m);
847	spin_unlock_irqrestore(lock: &pool->lock, flags);
848	}
849
850	static void copy_complete(int read_err, unsigned long write_err, void *context)
851	{
852	struct dm_thin_new_mapping *m = context;
853
854	m->status = read_err \|\| write_err ? BLK_STS_IOERR : `0`;
855	complete_mapping_preparation(m);
856	}
857
858	static void overwrite_endio(struct bio *bio)
859	{
860	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
861	struct dm_thin_new_mapping *m = h->overwrite_mapping;
862
863	bio->bi_end_io = m->saved_bi_end_io;
864
865	m->status = bio->bi_status;
866	complete_mapping_preparation(m);
867	}
868
/*----------------------------------------------------------------*/
870
871	/*
872	* Workqueue.
873	*/
874
875	/*
876	* Prepared mapping jobs.
877	*/
878
879	/*
880	* This sends the bios in the cell, except the original holder, back
881	* to the deferred_bios list.
882	*/
883	static void cell_defer_no_holder(struct thin_c tc, struct* dm_bio_prison_cell *cell)
884	{
885	struct pool *pool = tc->pool;
886	unsigned long flags;
887	struct bio_list bios;
888
889	bio_list_init(bl: &bios);
890	cell_release_no_holder(pool, cell, bios: &bios);
891
892	if (!bio_list_empty(bl: &bios)) {
893	spin_lock_irqsave(&tc->lock, flags);
894	bio_list_merge(bl: &tc->deferred_bio_list, bl2: &bios);
895	spin_unlock_irqrestore(lock: &tc->lock, flags);
896	wake_worker(pool);
897	}
898	}
899
/* Forward declaration; used by inc_remap_and_issue_cell() below. */
static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
901
902	struct remap_info {
903	struct thin_c *tc;
904	struct bio_list defer_bios;
905	struct bio_list issue_bios;
906	};
907
908	static void __inc_remap_and_issue_cell(void *context,
909	struct dm_bio_prison_cell *cell)
910	{
911	struct remap_info *info = context;
912	struct bio *bio;
913
914	while ((bio = bio_list_pop(bl: &cell->bios))) {
915	if (op_is_flush(op: bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD)
916	bio_list_add(bl: &info->defer_bios, bio);
917	else {
918	inc_all_io_entry(pool: info->tc->pool, bio);
919
920	/*
921	* We can't issue the bios with the bio prison lock
922	* held, so we add them to a list to issue on
923	* return from this function.
924	*/
925	bio_list_add(bl: &info->issue_bios, bio);
926	}
927	}
928	}
929
930	static void inc_remap_and_issue_cell(struct thin_c *tc,
931	struct dm_bio_prison_cell *cell,
932	dm_block_t block)
933	{
934	struct bio *bio;
935	struct remap_info info;
936
937	info.tc = tc;
938	bio_list_init(bl: &info.defer_bios);
939	bio_list_init(bl: &info.issue_bios);
940
941	/*
942	* We have to be careful to inc any bios we're about to issue
943	* before the cell is released, and avoid a race with new bios
944	* being added to the cell.
945	*/
946	cell_visit_release(pool: tc->pool, fn: __inc_remap_and_issue_cell,
947	context: &info, cell);
948
949	while ((bio = bio_list_pop(bl: &info.defer_bios)))
950	thin_defer_bio(tc, bio);
951
952	while ((bio = bio_list_pop(bl: &info.issue_bios)))
953	remap_and_issue(tc: info.tc, bio, block);
954	}
955
956	static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
957	{
958	cell_error(pool: m->tc->pool, cell: m->cell);
959	list_del(entry: &m->list);
960	mempool_free(element: m, pool: &m->tc->pool->mapping_pool);
961	}
962
963	static void complete_overwrite_bio(struct thin_c tc, struct* bio *bio)
964	{
965	struct pool *pool = tc->pool;
966
967	/*
968	* If the bio has the REQ_FUA flag set we must commit the metadata
969	* before signaling its completion.
970	*/
971	if (!bio_triggers_commit(tc, bio)) {
972	bio_endio(bio);
973	return;
974	}
975
976	/*
977	* Complete bio with an error if earlier I/O caused changes to the
978	* metadata that can't be committed, e.g, due to I/O errors on the
979	* metadata device.
980	*/
981	if (dm_thin_aborted_changes(td: tc->td)) {
982	bio_io_error(bio);
983	return;
984	}
985
986	/*
987	* Batch together any bios that trigger commits and then issue a
988	* single commit for them in process_deferred_bios().
989	*/
990	spin_lock_irq(lock: &pool->lock);
991	bio_list_add(bl: &pool->deferred_flush_completions, bio);
992	spin_unlock_irq(lock: &pool->lock);
993	}
994
995	static void process_prepared_mapping(struct dm_thin_new_mapping *m)
996	{
997	struct thin_c *tc = m->tc;
998	struct pool *pool = tc->pool;
999	struct bio *bio = m->bio;
1000	int r;
1001
1002	if (m->status) {
1003	cell_error(pool, cell: m->cell);
1004	goto out;
1005	}
1006
1007	/*
1008	* Commit the prepared block into the mapping btree.
1009	* Any I/O for this block arriving after this point will get
1010	* remapped to it directly.
1011	*/
1012	r = dm_thin_insert_block(td: tc->td, block: m->virt_begin, data_block: m->data_block);
1013	if (r) {
1014	metadata_operation_failed(pool, op: "dm_thin_insert_block", r);
1015	cell_error(pool, cell: m->cell);
1016	goto out;
1017	}
1018
1019	/*
1020	* Release any bios held while the block was being provisioned.
1021	* If we are processing a write bio that completely covers the block,
1022	* we already processed it so can ignore it now when processing
1023	* the bios in the cell.
1024	*/
1025	if (bio) {
1026	inc_remap_and_issue_cell(tc, cell: m->cell, block: m->data_block);
1027	complete_overwrite_bio(tc, bio);
1028	} else {
1029	inc_all_io_entry(pool: tc->pool, bio: m->cell->holder);
1030	remap_and_issue(tc, bio: m->cell->holder, block: m->data_block);
1031	inc_remap_and_issue_cell(tc, cell: m->cell, block: m->data_block);
1032	}
1033
1034	out:
1035	list_del(entry: &m->list);
1036	mempool_free(element: m, pool: &pool->mapping_pool);
1037	}
1038
/*----------------------------------------------------------------*/
1040
1041	static void free_discard_mapping(struct dm_thin_new_mapping *m)
1042	{
1043	struct thin_c *tc = m->tc;
1044
1045	if (m->cell)
1046	cell_defer_no_holder(tc, cell: m->cell);
1047	mempool_free(element: m, pool: &tc->pool->mapping_pool);
1048	}
1049
1050	static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1051	{
1052	bio_io_error(bio: m->bio);
1053	free_discard_mapping(m);
1054	}
1055
1056	static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1057	{
1058	bio_endio(m->bio);
1059	free_discard_mapping(m);
1060	}
1061
1062	static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1063	{
1064	int r;
1065	struct thin_c *tc = m->tc;
1066
1067	r = dm_thin_remove_range(td: tc->td, begin: m->cell->key.block_begin, end: m->cell->key.block_end);
1068	if (r) {
1069	metadata_operation_failed(pool: tc->pool, op: "dm_thin_remove_range", r);
1070	bio_io_error(bio: m->bio);
1071	} else
1072	bio_endio(m->bio);
1073
1074	cell_defer_no_holder(tc, cell: m->cell);
1075	mempool_free(element: m, pool: &tc->pool->mapping_pool);
1076	}
1077
/*----------------------------------------------------------------*/
1079
1080	static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1081	struct bio *discard_parent)
1082	{
1083	/*
1084	* We've already unmapped this range of blocks, but before we
1085	* passdown we have to check that these blocks are now unused.
1086	*/
1087	int r = `0`;
1088	bool shared = true;
1089	struct thin_c *tc = m->tc;
1090	struct pool *pool = tc->pool;
1091	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1092	struct discard_op op;
1093
1094	begin_discard(op: &op, tc, parent: discard_parent);
1095	while (b != end) {
1096	/ find start of unmapped run /
1097	for (; b < end; b++) {
1098	r = dm_pool_block_is_shared(pmd: pool->pmd, b, result: &shared);
1099	if (r)
1100	goto out;
1101
1102	if (!shared)
1103	break;
1104	}
1105
1106	if (b == end)
1107	break;
1108
1109	/ find end of run /
1110	for (e = b + `1`; e != end; e++) {
1111	r = dm_pool_block_is_shared(pmd: pool->pmd, b: e, result: &shared);
1112	if (r)
1113	goto out;
1114
1115	if (shared)
1116	break;
1117	}
1118
1119	r = issue_discard(op: &op, data_b: b, data_e: e);
1120	if (r)
1121	goto out;
1122
1123	b = e;
1124	}
1125	out:
1126	end_discard(op: &op, r);
1127	}
1128
1129	static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1130	{
1131	unsigned long flags;
1132	struct pool *pool = m->tc->pool;
1133
1134	spin_lock_irqsave(&pool->lock, flags);
1135	list_add_tail(new: &m->list, head: &pool->prepared_discards_pt2);
1136	spin_unlock_irqrestore(lock: &pool->lock, flags);
1137	wake_worker(pool);
1138	}
1139
1140	static void passdown_endio(struct bio *bio)
1141	{
1142	/*
1143	* It doesn't matter if the passdown discard failed, we still want
1144	* to unmap (we ignore err).
1145	*/
1146	queue_passdown_pt2(m: bio->bi_private);
1147	bio_put(bio);
1148	}
1149
1150	static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1151	{
1152	int r;
1153	struct thin_c *tc = m->tc;
1154	struct pool *pool = tc->pool;
1155	struct bio *discard_parent;
1156	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1157
1158	/*
1159	* Only this thread allocates blocks, so we can be sure that the
1160	* newly unmapped blocks will not be allocated before the end of
1161	* the function.
1162	*/
1163	r = dm_thin_remove_range(td: tc->td, begin: m->virt_begin, end: m->virt_end);
1164	if (r) {
1165	metadata_operation_failed(pool, op: "dm_thin_remove_range", r);
1166	bio_io_error(bio: m->bio);
1167	cell_defer_no_holder(tc, cell: m->cell);
1168	mempool_free(element: m, pool: &pool->mapping_pool);
1169	return;
1170	}
1171
1172	/*
1173	* Increment the unmapped blocks. This prevents a race between the
1174	* passdown io and reallocation of freed blocks.
1175	*/
1176	r = dm_pool_inc_data_range(pmd: pool->pmd, b: m->data_block, e: data_end);
1177	if (r) {
1178	metadata_operation_failed(pool, op: "dm_pool_inc_data_range", r);
1179	bio_io_error(bio: m->bio);
1180	cell_defer_no_holder(tc, cell: m->cell);
1181	mempool_free(element: m, pool: &pool->mapping_pool);
1182	return;
1183	}
1184
1185	discard_parent = bio_alloc(NULL, nr_vecs: `1`, opf: `0`, GFP_NOIO);
1186	discard_parent->bi_end_io = passdown_endio;
1187	discard_parent->bi_private = m;
1188	if (m->maybe_shared)
1189	passdown_double_checking_shared_status(m, discard_parent);
1190	else {
1191	struct discard_op op;
1192
1193	begin_discard(op: &op, tc, parent: discard_parent);
1194	r = issue_discard(op: &op, data_b: m->data_block, data_e: data_end);
1195	end_discard(op: &op, r);
1196	}
1197	}
1198
1199	static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1200	{
1201	int r;
1202	struct thin_c *tc = m->tc;
1203	struct pool *pool = tc->pool;
1204
1205	/*
1206	* The passdown has completed, so now we can decrement all those
1207	* unmapped blocks.
1208	*/
1209	r = dm_pool_dec_data_range(pmd: pool->pmd, b: m->data_block,
1210	e: m->data_block + (m->virt_end - m->virt_begin));
1211	if (r) {
1212	metadata_operation_failed(pool, op: "dm_pool_dec_data_range", r);
1213	bio_io_error(bio: m->bio);
1214	} else
1215	bio_endio(m->bio);
1216
1217	cell_defer_no_holder(tc, cell: m->cell);
1218	mempool_free(element: m, pool: &pool->mapping_pool);
1219	}
1220
1221	static void process_prepared(struct pool pool, struct* list_head *head,
1222	process_mapping_fn *fn)
1223	{
1224	struct list_head maps;
1225	struct dm_thin_new_mapping m, tmp;
1226
1227	INIT_LIST_HEAD(list: &maps);
1228	spin_lock_irq(lock: &pool->lock);
1229	list_splice_init(list: head, head: &maps);
1230	spin_unlock_irq(lock: &pool->lock);
1231
1232	list_for_each_entry_safe(m, tmp, &maps, list)
1233	(*fn)(m);
1234	}
1235
/*
 * Deferred bio jobs.
 */
1239	static int io_overlaps_block(struct pool pool, struct* bio *bio)
1240	{
1241	return bio->bi_iter.bi_size ==
1242	(pool->sectors_per_block << SECTOR_SHIFT);
1243	}
1244
1245	static int io_overwrites_block(struct pool pool, struct* bio *bio)
1246	{
1247	return (bio_data_dir(bio) == WRITE) &&
1248	io_overlaps_block(pool, bio);
1249	}
1250
1251	static void save_and_set_endio(struct bio bio, bio_end_io_t *save,
1252	bio_end_io_t *fn)
1253	{
1254	*save = bio->bi_end_io;
1255	bio->bi_end_io = fn;
1256	}
1257
1258	static int ensure_next_mapping(struct pool *pool)
1259	{
1260	if (pool->next_mapping)
1261	return `0`;
1262
1263	pool->next_mapping = mempool_alloc(pool: &pool->mapping_pool, GFP_ATOMIC);
1264
1265	return pool->next_mapping ? `0` : -ENOMEM;
1266	}
1267
1268	static struct dm_thin_new_mapping get_next_mapping(struct* pool *pool)
1269	{
1270	struct dm_thin_new_mapping *m = pool->next_mapping;
1271
1272	BUG_ON(!pool->next_mapping);
1273
1274	memset(m, `0`, sizeof(struct dm_thin_new_mapping));
1275	INIT_LIST_HEAD(list: &m->list);
1276	m->bio = NULL;
1277
1278	pool->next_mapping = NULL;
1279
1280	return m;
1281	}
1282
1283	static void ll_zero(struct thin_c tc, struct* dm_thin_new_mapping *m,
1284	sector_t begin, sector_t end)
1285	{
1286	struct dm_io_region to;
1287
1288	to.bdev = tc->pool_dev->bdev;
1289	to.sector = begin;
1290	to.count = end - begin;
1291
1292	dm_kcopyd_zero(kc: tc->pool->copier, num_dests: `1`, dests: &to, flags: `0`, fn: copy_complete, context: m);
1293	}
1294
1295	static void remap_and_issue_overwrite(struct thin_c tc, struct* bio *bio,
1296	dm_block_t data_begin,
1297	struct dm_thin_new_mapping *m)
1298	{
1299	struct pool *pool = tc->pool;
1300	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
1301
1302	h->overwrite_mapping = m;
1303	m->bio = bio;
1304	save_and_set_endio(bio, save: &m->saved_bi_end_io, fn: overwrite_endio);
1305	inc_all_io_entry(pool, bio);
1306	remap_and_issue(tc, bio, block: data_begin);
1307	}
1308
1309	/*
1310	* A partial copy also needs to zero the uncopied region.
1311	*/
1312	static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1313	struct dm_dev *origin, dm_block_t data_origin,
1314	dm_block_t data_dest,
1315	struct dm_bio_prison_cell cell, struct* bio *bio,
1316	sector_t len)
1317	{
1318	struct pool *pool = tc->pool;
1319	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1320
1321	m->tc = tc;
1322	m->virt_begin = virt_block;
1323	m->virt_end = virt_block + `1u`;
1324	m->data_block = data_dest;
1325	m->cell = cell;
1326
1327	/*
1328	* quiesce action + copy action + an extra reference held for the
1329	* duration of this function (we may need to inc later for a
1330	* partial zero).
1331	*/
1332	atomic_set(v: &m->prepare_actions, i: `3`);
1333
1334	if (!dm_deferred_set_add_work(ds: pool->shared_read_ds, work: &m->list))
1335	complete_mapping_preparation(m); / already quiesced /
1336
1337	/*
1338	* IO to pool_dev remaps to the pool target's data_dev.
1339	*
1340	* If the whole block of data is being overwritten, we can issue the
1341	* bio immediately. Otherwise we use kcopyd to clone the data first.
1342	*/
1343	if (io_overwrites_block(pool, bio))
1344	remap_and_issue_overwrite(tc, bio, data_begin: data_dest, m);
1345	else {
1346	struct dm_io_region from, to;
1347
1348	from.bdev = origin->bdev;
1349	from.sector = data_origin * pool->sectors_per_block;
1350	from.count = len;
1351
1352	to.bdev = tc->pool_dev->bdev;
1353	to.sector = data_dest * pool->sectors_per_block;
1354	to.count = len;
1355
1356	dm_kcopyd_copy(kc: pool->copier, from: &from, num_dests: `1`, dests: &to,
1357	flags: `0`, fn: copy_complete, context: m);
1358
1359	/*
1360	* Do we need to zero a tail region?
1361	*/
1362	if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1363	atomic_inc(v: &m->prepare_actions);
1364	ll_zero(tc, m,
1365	begin: data_dest * pool->sectors_per_block + len,
1366	end: (data_dest + `1`) * pool->sectors_per_block);
1367	}
1368	}
1369
1370	complete_mapping_preparation(m); / drop our ref /
1371	}
1372
1373	static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1374	dm_block_t data_origin, dm_block_t data_dest,
1375	struct dm_bio_prison_cell cell, struct* bio *bio)
1376	{
1377	schedule_copy(tc, virt_block, origin: tc->pool_dev,
1378	data_origin, data_dest, cell, bio,
1379	len: tc->pool->sectors_per_block);
1380	}
1381
1382	static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1383	dm_block_t data_block, struct dm_bio_prison_cell *cell,
1384	struct bio *bio)
1385	{
1386	struct pool *pool = tc->pool;
1387	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1388
1389	atomic_set(v: &m->prepare_actions, i: `1`); / no need to quiesce /
1390	m->tc = tc;
1391	m->virt_begin = virt_block;
1392	m->virt_end = virt_block + `1u`;
1393	m->data_block = data_block;
1394	m->cell = cell;
1395
1396	/*
1397	* If the whole block of data is being overwritten or we are not
1398	* zeroing pre-existing data, we can issue the bio immediately.
1399	* Otherwise we use kcopyd to zero the data first.
1400	*/
1401	if (pool->pf.zero_new_blocks) {
1402	if (io_overwrites_block(pool, bio))
1403	remap_and_issue_overwrite(tc, bio, data_begin: data_block, m);
1404	else
1405	ll_zero(tc, m, begin: data_block * pool->sectors_per_block,
1406	end: (data_block + `1`) * pool->sectors_per_block);
1407	} else
1408	process_prepared_mapping(m);
1409	}
1410
1411	static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1412	dm_block_t data_dest,
1413	struct dm_bio_prison_cell cell, struct* bio *bio)
1414	{
1415	struct pool *pool = tc->pool;
1416	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1417	sector_t virt_block_end = (virt_block + `1`) * pool->sectors_per_block;
1418
1419	if (virt_block_end <= tc->origin_size)
1420	schedule_copy(tc, virt_block, origin: tc->origin_dev,
1421	data_origin: virt_block, data_dest, cell, bio,
1422	len: pool->sectors_per_block);
1423
1424	else if (virt_block_begin < tc->origin_size)
1425	schedule_copy(tc, virt_block, origin: tc->origin_dev,
1426	data_origin: virt_block, data_dest, cell, bio,
1427	len: tc->origin_size - virt_block_begin);
1428
1429	else
1430	schedule_zero(tc, virt_block, data_block: data_dest, cell, bio);
1431	}
1432
/* Forward declarations: both are used below before they are defined. */
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);

static void requeue_bios(struct pool *pool);
1436
1437	static bool is_read_only_pool_mode(enum pool_mode mode)
1438	{
1439	return (mode == PM_OUT_OF_METADATA_SPACE \|\| mode == PM_READ_ONLY);
1440	}
1441
1442	static bool is_read_only(struct pool *pool)
1443	{
1444	return is_read_only_pool_mode(mode: get_pool_mode(pool));
1445	}
1446
1447	static void check_for_metadata_space(struct pool *pool)
1448	{
1449	int r;
1450	const char *ooms_reason = NULL;
1451	dm_block_t nr_free;
1452
1453	r = dm_pool_get_free_metadata_block_count(pmd: pool->pmd, result: &nr_free);
1454	if (r)
1455	ooms_reason = "Could not get free metadata blocks";
1456	else if (!nr_free)
1457	ooms_reason = "No free metadata blocks";
1458
1459	if (ooms_reason && !is_read_only(pool)) {
1460	DMERR("%s", ooms_reason);
1461	set_pool_mode(pool, new_mode: PM_OUT_OF_METADATA_SPACE);
1462	}
1463	}
1464
1465	static void check_for_data_space(struct pool *pool)
1466	{
1467	int r;
1468	dm_block_t nr_free;
1469
1470	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1471	return;
1472
1473	r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &nr_free);
1474	if (r)
1475	return;
1476
1477	if (nr_free) {
1478	set_pool_mode(pool, new_mode: PM_WRITE);
1479	requeue_bios(pool);
1480	}
1481	}
1482
1483	/*
1484	* A non-zero return indicates read_only or fail_io mode.
1485	* Many callers don't care about the return value.
1486	*/
1487	static int commit(struct pool *pool)
1488	{
1489	int r;
1490
1491	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1492	return -EINVAL;
1493
1494	r = dm_pool_commit_metadata(pmd: pool->pmd);
1495	if (r)
1496	metadata_operation_failed(pool, op: "dm_pool_commit_metadata", r);
1497	else {
1498	check_for_metadata_space(pool);
1499	check_for_data_space(pool);
1500	}
1501
1502	return r;
1503	}
1504
1505	static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1506	{
1507	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1508	DMWARN("%s: reached low water mark for data device: sending event.",
1509	dm_device_name(pool->pool_md));
1510	spin_lock_irq(lock: &pool->lock);
1511	pool->low_water_triggered = true;
1512	spin_unlock_irq(lock: &pool->lock);
1513	dm_table_event(t: pool->ti->table);
1514	}
1515	}
1516
1517	static int alloc_data_block(struct thin_c tc, dm_block_t result)
1518	{
1519	int r;
1520	dm_block_t free_blocks;
1521	struct pool *pool = tc->pool;
1522
1523	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1524	return -EINVAL;
1525
1526	r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &free_blocks);
1527	if (r) {
1528	metadata_operation_failed(pool, op: "dm_pool_get_free_block_count", r);
1529	return r;
1530	}
1531
1532	check_low_water_mark(pool, free_blocks);
1533
1534	if (!free_blocks) {
1535	/*
1536	* Try to commit to see if that will free up some
1537	* more space.
1538	*/
1539	r = commit(pool);
1540	if (r)
1541	return r;
1542
1543	r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &free_blocks);
1544	if (r) {
1545	metadata_operation_failed(pool, op: "dm_pool_get_free_block_count", r);
1546	return r;
1547	}
1548
1549	if (!free_blocks) {
1550	set_pool_mode(pool, new_mode: PM_OUT_OF_DATA_SPACE);
1551	return -ENOSPC;
1552	}
1553	}
1554
1555	r = dm_pool_alloc_data_block(pmd: pool->pmd, result);
1556	if (r) {
1557	if (r == -ENOSPC)
1558	set_pool_mode(pool, new_mode: PM_OUT_OF_DATA_SPACE);
1559	else
1560	metadata_operation_failed(pool, op: "dm_pool_alloc_data_block", r);
1561	return r;
1562	}
1563
1564	r = dm_pool_get_free_metadata_block_count(pmd: pool->pmd, result: &free_blocks);
1565	if (r) {
1566	metadata_operation_failed(pool, op: "dm_pool_get_free_metadata_block_count", r);
1567	return r;
1568	}
1569
1570	if (!free_blocks) {
1571	/ Let's commit before we use up the metadata reserve. /
1572	r = commit(pool);
1573	if (r)
1574	return r;
1575	}
1576
1577	return `0`;
1578	}
1579
1580	/*
1581	* If we have run out of space, queue bios until the device is
1582	* resumed, presumably after having been reloaded with more space.
1583	*/
1584	static void retry_on_resume(struct bio *bio)
1585	{
1586	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
1587	struct thin_c *tc = h->tc;
1588
1589	spin_lock_irq(lock: &tc->lock);
1590	bio_list_add(bl: &tc->retry_on_resume_list, bio);
1591	spin_unlock_irq(lock: &tc->lock);
1592	}
1593
1594	static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1595	{
1596	enum pool_mode m = get_pool_mode(pool);
1597
1598	switch (m) {
1599	case PM_WRITE:
1600	/ Shouldn't get here /
1601	DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1602	return BLK_STS_IOERR;
1603
1604	case PM_OUT_OF_DATA_SPACE:
1605	return pool->pf.error_if_no_space ? BLK_STS_NOSPC : `0`;
1606
1607	case PM_OUT_OF_METADATA_SPACE:
1608	case PM_READ_ONLY:
1609	case PM_FAIL:
1610	return BLK_STS_IOERR;
1611	default:
1612	/ Shouldn't get here /
1613	DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1614	return BLK_STS_IOERR;
1615	}
1616	}
1617
1618	static void handle_unserviceable_bio(struct pool pool, struct* bio *bio)
1619	{
1620	blk_status_t error = should_error_unserviceable_bio(pool);
1621
1622	if (error) {
1623	bio->bi_status = error;
1624	bio_endio(bio);
1625	} else
1626	retry_on_resume(bio);
1627	}
1628
1629	static void retry_bios_on_resume(struct pool pool, struct* dm_bio_prison_cell *cell)
1630	{
1631	struct bio *bio;
1632	struct bio_list bios;
1633	blk_status_t error;
1634
1635	error = should_error_unserviceable_bio(pool);
1636	if (error) {
1637	cell_error_with_code(pool, cell, error_code: error);
1638	return;
1639	}
1640
1641	bio_list_init(bl: &bios);
1642	cell_release(pool, cell, bios: &bios);
1643
1644	while ((bio = bio_list_pop(bl: &bios)))
1645	retry_on_resume(bio);
1646	}
1647
1648	static void process_discard_cell_no_passdown(struct thin_c *tc,
1649	struct dm_bio_prison_cell *virt_cell)
1650	{
1651	struct pool *pool = tc->pool;
1652	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1653
1654	/*
1655	* We don't need to lock the data blocks, since there's no
1656	* passdown. We only lock data blocks for allocation and breaking sharing.
1657	*/
1658	m->tc = tc;
1659	m->virt_begin = virt_cell->key.block_begin;
1660	m->virt_end = virt_cell->key.block_end;
1661	m->cell = virt_cell;
1662	m->bio = virt_cell->holder;
1663
1664	if (!dm_deferred_set_add_work(ds: pool->all_io_ds, work: &m->list))
1665	pool->process_prepared_discard(m);
1666	}
1667
1668	static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1669	struct bio *bio)
1670	{
1671	struct pool *pool = tc->pool;
1672
1673	int r;
1674	bool maybe_shared;
1675	struct dm_cell_key data_key;
1676	struct dm_bio_prison_cell *data_cell;
1677	struct dm_thin_new_mapping *m;
1678	dm_block_t virt_begin, virt_end, data_begin, data_end;
1679	dm_block_t len, next_boundary;
1680
1681	while (begin != end) {
1682	r = dm_thin_find_mapped_range(td: tc->td, begin, end, thin_begin: &virt_begin, thin_end: &virt_end,
1683	pool_begin: &data_begin, maybe_shared: &maybe_shared);
1684	if (r) {
1685	/*
1686	* Silently fail, letting any mappings we've
1687	* created complete.
1688	*/
1689	break;
1690	}
1691
1692	data_end = data_begin + (virt_end - virt_begin);
1693
1694	/*
1695	* Make sure the data region obeys the bio prison restrictions.
1696	*/
1697	while (data_begin < data_end) {
1698	r = ensure_next_mapping(pool);
1699	if (r)
1700	return; / we did our best /
1701
1702	next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + `1`)
1703	<< BIO_PRISON_MAX_RANGE_SHIFT;
1704	len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
1705
1706	/ This key is certainly within range given the above splitting /
1707	(void) build_key(td: tc->td, ls: PHYSICAL, b: data_begin, e: data_begin + len, key: &data_key);
1708	if (bio_detain(pool: tc->pool, key: &data_key, NULL, cell_result: &data_cell)) {
1709	/ contention, we'll give up with this range /
1710	data_begin += len;
1711	continue;
1712	}
1713
1714	/*
1715	* IO may still be going to the destination block. We must
1716	* quiesce before we can do the removal.
1717	*/
1718	m = get_next_mapping(pool);
1719	m->tc = tc;
1720	m->maybe_shared = maybe_shared;
1721	m->virt_begin = virt_begin;
1722	m->virt_end = virt_begin + len;
1723	m->data_block = data_begin;
1724	m->cell = data_cell;
1725	m->bio = bio;
1726
1727	/*
1728	* The parent bio must not complete before sub discard bios are
1729	* chained to it (see end_discard's bio_chain)!
1730	*
1731	* This per-mapping bi_remaining increment is paired with
1732	* the implicit decrement that occurs via bio_endio() in
1733	* end_discard().
1734	*/
1735	bio_inc_remaining(bio);
1736	if (!dm_deferred_set_add_work(ds: pool->all_io_ds, work: &m->list))
1737	pool->process_prepared_discard(m);
1738
1739	virt_begin += len;
1740	data_begin += len;
1741	}
1742
1743	begin = virt_end;
1744	}
1745	}
1746
1747	static void process_discard_cell_passdown(struct thin_c tc, struct* dm_bio_prison_cell *virt_cell)
1748	{
1749	struct bio *bio = virt_cell->holder;
1750	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
1751
1752	/*
1753	* The virt_cell will only get freed once the origin bio completes.
1754	* This means it will remain locked while all the individual
1755	* passdown bios are in flight.
1756	*/
1757	h->cell = virt_cell;
1758	break_up_discard_bio(tc, begin: virt_cell->key.block_begin, end: virt_cell->key.block_end, bio);
1759
1760	/*
1761	* We complete the bio now, knowing that the bi_remaining field
1762	* will prevent completion until the sub range discards have
1763	* completed.
1764	*/
1765	bio_endio(bio);
1766	}
1767
1768	static void process_discard_bio(struct thin_c tc, struct* bio *bio)
1769	{
1770	dm_block_t begin, end;
1771	struct dm_cell_key virt_key;
1772	struct dm_bio_prison_cell *virt_cell;
1773
1774	get_bio_block_range(tc, bio, begin: &begin, end: &end);
1775	if (begin == end) {
1776	/*
1777	* The discard covers less than a block.
1778	*/
1779	bio_endio(bio);
1780	return;
1781	}
1782
1783	if (unlikely(!build_key(tc->td, VIRTUAL, begin, end, &virt_key))) {
1784	DMERR_LIMIT("Discard doesn't respect bio prison limits");
1785	bio_endio(bio);
1786	return;
1787	}
1788
1789	if (bio_detain(pool: tc->pool, key: &virt_key, bio, cell_result: &virt_cell)) {
1790	/*
1791	* Potential starvation issue: We're relying on the
1792	* fs/application being well behaved, and not trying to
1793	* send IO to a region at the same time as discarding it.
1794	* If they do this persistently then it's possible this
1795	* cell will never be granted.
1796	*/
1797	return;
1798	}
1799
1800	tc->pool->process_discard_cell(tc, virt_cell);
1801	}
1802
1803	static void break_sharing(struct thin_c tc, struct* bio *bio, dm_block_t block,
1804	struct dm_cell_key *key,
1805	struct dm_thin_lookup_result *lookup_result,
1806	struct dm_bio_prison_cell *cell)
1807	{
1808	int r;
1809	dm_block_t data_block;
1810	struct pool *pool = tc->pool;
1811
1812	r = alloc_data_block(tc, result: &data_block);
1813	switch (r) {
1814	case `0`:
1815	schedule_internal_copy(tc, virt_block: block, data_origin: lookup_result->block,
1816	data_dest: data_block, cell, bio);
1817	break;
1818
1819	case -ENOSPC:
1820	retry_bios_on_resume(pool, cell);
1821	break;
1822
1823	default:
1824	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1825	__func__, r);
1826	cell_error(pool, cell);
1827	break;
1828	}
1829	}
1830
1831	static void __remap_and_issue_shared_cell(void *context,
1832	struct dm_bio_prison_cell *cell)
1833	{
1834	struct remap_info *info = context;
1835	struct bio *bio;
1836
1837	while ((bio = bio_list_pop(bl: &cell->bios))) {
1838	if (bio_data_dir(bio) == WRITE \|\| op_is_flush(op: bio->bi_opf) \|\|
1839	bio_op(bio) == REQ_OP_DISCARD)
1840	bio_list_add(bl: &info->defer_bios, bio);
1841	else {
1842	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
1843
1844	h->shared_read_entry = dm_deferred_entry_inc(ds: info->tc->pool->shared_read_ds);
1845	inc_all_io_entry(pool: info->tc->pool, bio);
1846	bio_list_add(bl: &info->issue_bios, bio);
1847	}
1848	}
1849	}
1850
1851	static void remap_and_issue_shared_cell(struct thin_c *tc,
1852	struct dm_bio_prison_cell *cell,
1853	dm_block_t block)
1854	{
1855	struct bio *bio;
1856	struct remap_info info;
1857
1858	info.tc = tc;
1859	bio_list_init(bl: &info.defer_bios);
1860	bio_list_init(bl: &info.issue_bios);
1861
1862	cell_visit_release(pool: tc->pool, fn: __remap_and_issue_shared_cell,
1863	context: &info, cell);
1864
1865	while ((bio = bio_list_pop(bl: &info.defer_bios)))
1866	thin_defer_bio(tc, bio);
1867
1868	while ((bio = bio_list_pop(bl: &info.issue_bios)))
1869	remap_and_issue(tc, bio, block);
1870	}
1871
1872	static void process_shared_bio(struct thin_c tc, struct* bio *bio,
1873	dm_block_t block,
1874	struct dm_thin_lookup_result *lookup_result,
1875	struct dm_bio_prison_cell *virt_cell)
1876	{
1877	struct dm_bio_prison_cell *data_cell;
1878	struct pool *pool = tc->pool;
1879	struct dm_cell_key key;
1880
1881	/*
1882	* If cell is already occupied, then sharing is already in the process
1883	* of being broken so we have nothing further to do here.
1884	*/
1885	build_data_key(td: tc->td, b: lookup_result->block, key: &key);
1886	if (bio_detain(pool, key: &key, bio, cell_result: &data_cell)) {
1887	cell_defer_no_holder(tc, cell: virt_cell);
1888	return;
1889	}
1890
1891	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1892	break_sharing(tc, bio, block, key: &key, lookup_result, cell: data_cell);
1893	cell_defer_no_holder(tc, cell: virt_cell);
1894	} else {
1895	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
1896
1897	h->shared_read_entry = dm_deferred_entry_inc(ds: pool->shared_read_ds);
1898	inc_all_io_entry(pool, bio);
1899	remap_and_issue(tc, bio, block: lookup_result->block);
1900
1901	remap_and_issue_shared_cell(tc, cell: data_cell, block: lookup_result->block);
1902	remap_and_issue_shared_cell(tc, cell: virt_cell, block: lookup_result->block);
1903	}
1904	}
1905
1906	static void provision_block(struct thin_c tc, struct* bio *bio, dm_block_t block,
1907	struct dm_bio_prison_cell *cell)
1908	{
1909	int r;
1910	dm_block_t data_block;
1911	struct pool *pool = tc->pool;
1912
1913	/*
1914	* Remap empty bios (flushes) immediately, without provisioning.
1915	*/
1916	if (!bio->bi_iter.bi_size) {
1917	inc_all_io_entry(pool, bio);
1918	cell_defer_no_holder(tc, cell);
1919
1920	remap_and_issue(tc, bio, block: `0`);
1921	return;
1922	}
1923
1924	/*
1925	* Fill read bios with zeroes and complete them immediately.
1926	*/
1927	if (bio_data_dir(bio) == READ) {
1928	zero_fill_bio(bio);
1929	cell_defer_no_holder(tc, cell);
1930	bio_endio(bio);
1931	return;
1932	}
1933
1934	r = alloc_data_block(tc, result: &data_block);
1935	switch (r) {
1936	case `0`:
1937	if (tc->origin_dev)
1938	schedule_external_copy(tc, virt_block: block, data_dest: data_block, cell, bio);
1939	else
1940	schedule_zero(tc, virt_block: block, data_block, cell, bio);
1941	break;
1942
1943	case -ENOSPC:
1944	retry_bios_on_resume(pool, cell);
1945	break;
1946
1947	default:
1948	DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1949	__func__, r);
1950	cell_error(pool, cell);
1951	break;
1952	}
1953	}
1954
1955	static void process_cell(struct thin_c tc, struct* dm_bio_prison_cell *cell)
1956	{
1957	int r;
1958	struct pool *pool = tc->pool;
1959	struct bio *bio = cell->holder;
1960	dm_block_t block = get_bio_block(tc, bio);
1961	struct dm_thin_lookup_result lookup_result;
1962
1963	if (tc->requeue_mode) {
1964	cell_requeue(pool, cell);
1965	return;
1966	}
1967
1968	r = dm_thin_find_block(td: tc->td, block, can_issue_io: `1`, result: &lookup_result);
1969	switch (r) {
1970	case `0`:
1971	if (lookup_result.shared)
1972	process_shared_bio(tc, bio, block, lookup_result: &lookup_result, virt_cell: cell);
1973	else {
1974	inc_all_io_entry(pool, bio);
1975	remap_and_issue(tc, bio, block: lookup_result.block);
1976	inc_remap_and_issue_cell(tc, cell, block: lookup_result.block);
1977	}
1978	break;
1979
1980	case -ENODATA:
1981	if (bio_data_dir(bio) == READ && tc->origin_dev) {
1982	inc_all_io_entry(pool, bio);
1983	cell_defer_no_holder(tc, cell);
1984
1985	if (bio_end_sector(bio) <= tc->origin_size)
1986	remap_to_origin_and_issue(tc, bio);
1987
1988	else if (bio->bi_iter.bi_sector < tc->origin_size) {
1989	zero_fill_bio(bio);
1990	bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1991	remap_to_origin_and_issue(tc, bio);
1992
1993	} else {
1994	zero_fill_bio(bio);
1995	bio_endio(bio);
1996	}
1997	} else
1998	provision_block(tc, bio, block, cell);
1999	break;
2000
2001	default:
2002	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2003	__func__, r);
2004	cell_defer_no_holder(tc, cell);
2005	bio_io_error(bio);
2006	break;
2007	}
2008	}
2009
2010	static void process_bio(struct thin_c tc, struct* bio *bio)
2011	{
2012	struct pool *pool = tc->pool;
2013	dm_block_t block = get_bio_block(tc, bio);
2014	struct dm_bio_prison_cell *cell;
2015	struct dm_cell_key key;
2016
2017	/*
2018	* If cell is already occupied, then the block is already
2019	* being provisioned so we have nothing further to do here.
2020	*/
2021	build_virtual_key(td: tc->td, b: block, key: &key);
2022	if (bio_detain(pool, key: &key, bio, cell_result: &cell))
2023	return;
2024
2025	process_cell(tc, cell);
2026	}
2027
2028	static void __process_bio_read_only(struct thin_c tc, struct* bio *bio,
2029	struct dm_bio_prison_cell *cell)
2030	{
2031	int r;
2032	int rw = bio_data_dir(bio);
2033	dm_block_t block = get_bio_block(tc, bio);
2034	struct dm_thin_lookup_result lookup_result;
2035
2036	r = dm_thin_find_block(td: tc->td, block, can_issue_io: `1`, result: &lookup_result);
2037	switch (r) {
2038	case `0`:
2039	if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
2040	handle_unserviceable_bio(pool: tc->pool, bio);
2041	if (cell)
2042	cell_defer_no_holder(tc, cell);
2043	} else {
2044	inc_all_io_entry(pool: tc->pool, bio);
2045	remap_and_issue(tc, bio, block: lookup_result.block);
2046	if (cell)
2047	inc_remap_and_issue_cell(tc, cell, block: lookup_result.block);
2048	}
2049	break;
2050
2051	case -ENODATA:
2052	if (cell)
2053	cell_defer_no_holder(tc, cell);
2054	if (rw != READ) {
2055	handle_unserviceable_bio(pool: tc->pool, bio);
2056	break;
2057	}
2058
2059	if (tc->origin_dev) {
2060	inc_all_io_entry(pool: tc->pool, bio);
2061	remap_to_origin_and_issue(tc, bio);
2062	break;
2063	}
2064
2065	zero_fill_bio(bio);
2066	bio_endio(bio);
2067	break;
2068
2069	default:
2070	DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2071	__func__, r);
2072	if (cell)
2073	cell_defer_no_holder(tc, cell);
2074	bio_io_error(bio);
2075	break;
2076	}
2077	}
2078
2079	static void process_bio_read_only(struct thin_c tc, struct* bio *bio)
2080	{
2081	__process_bio_read_only(tc, bio, NULL);
2082	}
2083
2084	static void process_cell_read_only(struct thin_c tc, struct* dm_bio_prison_cell *cell)
2085	{
2086	__process_bio_read_only(tc, bio: cell->holder, cell);
2087	}
2088
/* Bio handler that completes @bio successfully; @tc is unused. */
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
	bio_endio(bio);
}
2093
/* Bio handler that completes @bio with an I/O error; @tc is unused. */
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
{
	bio_io_error(bio);
}
2098
2099	static void process_cell_success(struct thin_c tc, struct* dm_bio_prison_cell *cell)
2100	{
2101	cell_success(pool: tc->pool, cell);
2102	}
2103
2104	static void process_cell_fail(struct thin_c tc, struct* dm_bio_prison_cell *cell)
2105	{
2106	cell_error(pool: tc->pool, cell);
2107	}
2108
2109	/*
2110	* FIXME: should we also commit due to size of transaction, measured in
2111	* metadata blocks?
2112	*/
2113	static int need_commit_due_to_time(struct pool *pool)
2114	{
2115	return !time_in_range(jiffies, pool->last_commit_jiffies,
2116	pool->last_commit_jiffies + COMMIT_PERIOD);
2117	}
2118
/*
 * thin_pbd(): rb_node embedded in a dm_thin_endio_hook -> the hook.
 * thin_bio(): per-bio data (the hook) -> the bio it belongs to.
 */
#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2121
2122	static void __thin_bio_rb_add(struct thin_c tc, struct* bio *bio)
2123	{
2124	struct rb_node *rbp, parent;
2125	struct dm_thin_endio_hook *pbd;
2126	sector_t bi_sector = bio->bi_iter.bi_sector;
2127
2128	rbp = &tc->sort_bio_list.rb_node;
2129	parent = NULL;
2130	while (*rbp) {
2131	parent = *rbp;
2132	pbd = thin_pbd(parent);
2133
2134	if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2135	rbp = &(*rbp)->rb_left;
2136	else
2137	rbp = &(*rbp)->rb_right;
2138	}
2139
2140	pbd = dm_per_bio_data(bio, data_size: sizeof(struct dm_thin_endio_hook));
2141	rb_link_node(node: &pbd->rb_node, parent, rb_link: rbp);
2142	rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2143	}
2144
2145	static void __extract_sorted_bios(struct thin_c *tc)
2146	{
2147	struct rb_node *node;
2148	struct dm_thin_endio_hook *pbd;
2149	struct bio *bio;
2150
2151	for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2152	pbd = thin_pbd(node);
2153	bio = thin_bio(pbd);
2154
2155	bio_list_add(bl: &tc->deferred_bio_list, bio);
2156	rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2157	}
2158
2159	WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2160	}
2161
2162	static void __sort_thin_deferred_bios(struct thin_c *tc)
2163	{
2164	struct bio *bio;
2165	struct bio_list bios;
2166
2167	bio_list_init(bl: &bios);
2168	bio_list_merge(bl: &bios, bl2: &tc->deferred_bio_list);
2169	bio_list_init(bl: &tc->deferred_bio_list);
2170
2171	/ Sort deferred_bio_list using rb-tree /
2172	while ((bio = bio_list_pop(bl: &bios)))
2173	__thin_bio_rb_add(tc, bio);
2174
2175	/*
2176	* Transfer the sorted bios in sort_bio_list back to
2177	* deferred_bio_list to allow lockless submission of
2178	* all bios.
2179	*/
2180	__extract_sorted_bios(tc);
2181	}
2182
2183	static void process_thin_deferred_bios(struct thin_c *tc)
2184	{
2185	struct pool *pool = tc->pool;
2186	struct bio *bio;
2187	struct bio_list bios;
2188	struct blk_plug plug;
2189	unsigned int count = `0`;
2190
2191	if (tc->requeue_mode) {
2192	error_thin_bio_list(tc, master: &tc->deferred_bio_list,
2193	BLK_STS_DM_REQUEUE);
2194	return;
2195	}
2196
2197	bio_list_init(bl: &bios);
2198
2199	spin_lock_irq(lock: &tc->lock);
2200
2201	if (bio_list_empty(bl: &tc->deferred_bio_list)) {
2202	spin_unlock_irq(lock: &tc->lock);
2203	return;
2204	}
2205
2206	__sort_thin_deferred_bios(tc);
2207
2208	bio_list_merge(bl: &bios, bl2: &tc->deferred_bio_list);
2209	bio_list_init(bl: &tc->deferred_bio_list);
2210
2211	spin_unlock_irq(lock: &tc->lock);
2212
2213	blk_start_plug(&plug);
2214	while ((bio = bio_list_pop(bl: &bios))) {
2215	/*
2216	* If we've got no free new_mapping structs, and processing
2217	* this bio might require one, we pause until there are some
2218	* prepared mappings to process.
2219	*/
2220	if (ensure_next_mapping(pool)) {
2221	spin_lock_irq(lock: &tc->lock);
2222	bio_list_add(bl: &tc->deferred_bio_list, bio);
2223	bio_list_merge(bl: &tc->deferred_bio_list, bl2: &bios);
2224	spin_unlock_irq(lock: &tc->lock);
2225	break;
2226	}
2227
2228	if (bio_op(bio) == REQ_OP_DISCARD)
2229	pool->process_discard(tc, bio);
2230	else
2231	pool->process_bio(tc, bio);
2232
2233	if ((count++ & `127`) == `0`) {
2234	throttle_work_update(t: &pool->throttle);
2235	dm_pool_issue_prefetches(pmd: pool->pmd);
2236	}
2237	cond_resched();
2238	}
2239	blk_finish_plug(&plug);
2240	}
2241
2242	static int cmp_cells(const void lhs, const* void *rhs)
2243	{
2244	struct dm_bio_prison_cell lhs_cell = ((struct dm_bio_prison_cell **) lhs);
2245	struct dm_bio_prison_cell rhs_cell = ((struct dm_bio_prison_cell **) rhs);
2246
2247	BUG_ON(!lhs_cell->holder);
2248	BUG_ON(!rhs_cell->holder);
2249
2250	if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2251	return -`1`;
2252
2253	if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2254	return `1`;
2255
2256	return `0`;
2257	}
2258
2259	static unsigned int sort_cells(struct pool pool, struct* list_head *cells)
2260	{
2261	unsigned int count = `0`;
2262	struct dm_bio_prison_cell cell, tmp;
2263
2264	list_for_each_entry_safe(cell, tmp, cells, user_list) {
2265	if (count >= CELL_SORT_ARRAY_SIZE)
2266	break;
2267
2268	pool->cell_sort_array[count++] = cell;
2269	list_del(entry: &cell->user_list);
2270	}
2271
2272	sort(base: pool->cell_sort_array, num: count, size: sizeof(cell), cmp_func: cmp_cells, NULL);
2273
2274	return count;
2275	}
2276
2277	static void process_thin_deferred_cells(struct thin_c *tc)
2278	{
2279	struct pool *pool = tc->pool;
2280	struct list_head cells;
2281	struct dm_bio_prison_cell *cell;
2282	unsigned int i, j, count;
2283
2284	INIT_LIST_HEAD(list: &cells);
2285
2286	spin_lock_irq(lock: &tc->lock);
2287	list_splice_init(list: &tc->deferred_cells, head: &cells);
2288	spin_unlock_irq(lock: &tc->lock);
2289
2290	if (list_empty(head: &cells))
2291	return;
2292
2293	do {
2294	count = sort_cells(pool: tc->pool, cells: &cells);
2295
2296	for (i = `0`; i < count; i++) {
2297	cell = pool->cell_sort_array[i];
2298	BUG_ON(!cell->holder);
2299
2300	/*
2301	* If we've got no free new_mapping structs, and processing
2302	* this bio might require one, we pause until there are some
2303	* prepared mappings to process.
2304	*/
2305	if (ensure_next_mapping(pool)) {
2306	for (j = i; j < count; j++)
2307	list_add(new: &pool->cell_sort_array[j]->user_list, head: &cells);
2308
2309	spin_lock_irq(lock: &tc->lock);
2310	list_splice(list: &cells, head: &tc->deferred_cells);
2311	spin_unlock_irq(lock: &tc->lock);
2312	return;
2313	}
2314
2315	if (bio_op(bio: cell->holder) == REQ_OP_DISCARD)
2316	pool->process_discard_cell(tc, cell);
2317	else
2318	pool->process_cell(tc, cell);
2319	}
2320	cond_resched();
2321	} while (!list_empty(head: &cells));
2322	}
2323
2324	static void thin_get(struct thin_c *tc);
2325	static void thin_put(struct thin_c *tc);
2326
2327	/*
2328	* We can't hold rcu_read_lock() around code that can block. So we
2329	* find a thin with the rcu lock held; bump a refcount; then drop
2330	* the lock.
2331	*/
2332	static struct thin_c get_first_thin(struct* pool *pool)
2333	{
2334	struct thin_c *tc = NULL;
2335
2336	rcu_read_lock();
2337	if (!list_empty(head: &pool->active_thins)) {
2338	tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2339	thin_get(tc);
2340	}
2341	rcu_read_unlock();
2342
2343	return tc;
2344	}
2345
2346	static struct thin_c get_next_thin(struct* pool pool, struct* thin_c *tc)
2347	{
2348	struct thin_c *old_tc = tc;
2349
2350	rcu_read_lock();
2351	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2352	thin_get(tc);
2353	thin_put(tc: old_tc);
2354	rcu_read_unlock();
2355	return tc;
2356	}
2357	thin_put(tc: old_tc);
2358	rcu_read_unlock();
2359
2360	return NULL;
2361	}
2362
2363	static void process_deferred_bios(struct pool *pool)
2364	{
2365	struct bio *bio;
2366	struct bio_list bios, bio_completions;
2367	struct thin_c *tc;
2368
2369	tc = get_first_thin(pool);
2370	while (tc) {
2371	process_thin_deferred_cells(tc);
2372	process_thin_deferred_bios(tc);
2373	tc = get_next_thin(pool, tc);
2374	}
2375
2376	/*
2377	* If there are any deferred flush bios, we must commit the metadata
2378	* before issuing them or signaling their completion.
2379	*/
2380	bio_list_init(bl: &bios);
2381	bio_list_init(bl: &bio_completions);
2382
2383	spin_lock_irq(lock: &pool->lock);
2384	bio_list_merge(bl: &bios, bl2: &pool->deferred_flush_bios);
2385	bio_list_init(bl: &pool->deferred_flush_bios);
2386
2387	bio_list_merge(bl: &bio_completions, bl2: &pool->deferred_flush_completions);
2388	bio_list_init(bl: &pool->deferred_flush_completions);
2389	spin_unlock_irq(lock: &pool->lock);
2390
2391	if (bio_list_empty(bl: &bios) && bio_list_empty(bl: &bio_completions) &&
2392	!(dm_pool_changed_this_transaction(pmd: pool->pmd) && need_commit_due_to_time(pool)))
2393	return;
2394
2395	if (commit(pool)) {
2396	bio_list_merge(bl: &bios, bl2: &bio_completions);
2397
2398	while ((bio = bio_list_pop(bl: &bios)))
2399	bio_io_error(bio);
2400	return;
2401	}
2402	pool->last_commit_jiffies = jiffies;
2403
2404	while ((bio = bio_list_pop(bl: &bio_completions)))
2405	bio_endio(bio);
2406
2407	while ((bio = bio_list_pop(bl: &bios))) {
2408	/*
2409	* The data device was flushed as part of metadata commit,
2410	* so complete redundant flushes immediately.
2411	*/
2412	if (bio->bi_opf & REQ_PREFLUSH)
2413	bio_endio(bio);
2414	else
2415	dm_submit_bio_remap(clone: bio, NULL);
2416	}
2417	}
2418
2419	static void do_worker(struct work_struct *ws)
2420	{
2421	struct pool pool = container_of(ws, struct* pool, worker);
2422
2423	throttle_work_start(t: &pool->throttle);
2424	dm_pool_issue_prefetches(pmd: pool->pmd);
2425	throttle_work_update(t: &pool->throttle);
2426	process_prepared(pool, head: &pool->prepared_mappings, fn: &pool->process_prepared_mapping);
2427	throttle_work_update(t: &pool->throttle);
2428	process_prepared(pool, head: &pool->prepared_discards, fn: &pool->process_prepared_discard);
2429	throttle_work_update(t: &pool->throttle);
2430	process_prepared(pool, head: &pool->prepared_discards_pt2, fn: &pool->process_prepared_discard_pt2);
2431	throttle_work_update(t: &pool->throttle);
2432	process_deferred_bios(pool);
2433	throttle_work_complete(t: &pool->throttle);
2434	}
2435
2436	/*
2437	* We want to commit periodically so that not too much
2438	* unwritten data builds up.
2439	*/
2440	static void do_waker(struct work_struct *ws)
2441	{
2442	struct pool pool = container_of(to_delayed_work(ws), struct* pool, waker);
2443
2444	wake_worker(pool);
2445	queue_delayed_work(wq: pool->wq, dwork: &pool->waker, COMMIT_PERIOD);
2446	}
2447
2448	/*
2449	* We're holding onto IO to allow userland time to react. After the
2450	* timeout either the pool will have been resized (and thus back in
2451	* PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
2452	*/
2453	static void do_no_space_timeout(struct work_struct *ws)
2454	{
2455	struct pool pool = container_of(to_delayed_work(ws), struct* pool,
2456	no_space_timeout);
2457
2458	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2459	pool->pf.error_if_no_space = true;
2460	notify_of_pool_mode_change(pool);
2461	error_retry_list_with_code(pool, BLK_STS_NOSPC);
2462	}
2463	}
2464
/*----------------------------------------------------------------*/
2466
2467	struct pool_work {
2468	struct work_struct worker;
2469	struct completion complete;
2470	};
2471
2472	static struct pool_work to_pool_work(struct* work_struct *ws)
2473	{
2474	return container_of(ws, struct pool_work, worker);
2475	}
2476
2477	static void pool_work_complete(struct pool_work *pw)
2478	{
2479	complete(&pw->complete);
2480	}
2481
2482	static void pool_work_wait(struct pool_work pw, struct* pool *pool,
2483	void (fn)(struct* work_struct *))
2484	{
2485	INIT_WORK_ONSTACK(&pw->worker, fn);
2486	init_completion(x: &pw->complete);
2487	queue_work(wq: pool->wq, work: &pw->worker);
2488	wait_for_completion(&pw->complete);
2489	}
2490
/*----------------------------------------------------------------*/
2492
2493	struct noflush_work {
2494	struct pool_work pw;
2495	struct thin_c *tc;
2496	};
2497
2498	static struct noflush_work to_noflush(struct* work_struct *ws)
2499	{
2500	return container_of(to_pool_work(ws), struct noflush_work, pw);
2501	}
2502
2503	static void do_noflush_start(struct work_struct *ws)
2504	{
2505	struct noflush_work *w = to_noflush(ws);
2506
2507	w->tc->requeue_mode = true;
2508	requeue_io(tc: w->tc);
2509	pool_work_complete(pw: &w->pw);
2510	}
2511
2512	static void do_noflush_stop(struct work_struct *ws)
2513	{
2514	struct noflush_work *w = to_noflush(ws);
2515
2516	w->tc->requeue_mode = false;
2517	pool_work_complete(pw: &w->pw);
2518	}
2519
2520	static void noflush_work(struct thin_c tc, void* (fn)(struct* work_struct *))
2521	{
2522	struct noflush_work w;
2523
2524	w.tc = tc;
2525	pool_work_wait(pw: &w.pw, pool: tc->pool, fn);
2526	}
2527
/*----------------------------------------------------------------*/
2529
2530	static void set_discard_callbacks(struct pool *pool)
2531	{
2532	struct pool_c *pt = pool->ti->private;
2533
2534	if (pt->adjusted_pf.discard_passdown) {
2535	pool->process_discard_cell = process_discard_cell_passdown;
2536	pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2537	pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2538	} else {
2539	pool->process_discard_cell = process_discard_cell_no_passdown;
2540	pool->process_prepared_discard = process_prepared_discard_no_passdown;
2541	}
2542	}
2543
2544	static void set_pool_mode(struct pool pool, enum* pool_mode new_mode)
2545	{
2546	struct pool_c *pt = pool->ti->private;
2547	bool needs_check = dm_pool_metadata_needs_check(pmd: pool->pmd);
2548	enum pool_mode old_mode = get_pool_mode(pool);
2549	unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2550
2551	/*
2552	* Never allow the pool to transition to PM_WRITE mode if user
2553	* intervention is required to verify metadata and data consistency.
2554	*/
2555	if (new_mode == PM_WRITE && needs_check) {
2556	DMERR("%s: unable to switch pool to write mode until repaired.",
2557	dm_device_name(pool->pool_md));
2558	if (old_mode != new_mode)
2559	new_mode = old_mode;
2560	else
2561	new_mode = PM_READ_ONLY;
2562	}
2563	/*
2564	* If we were in PM_FAIL mode, rollback of metadata failed. We're
2565	* not going to recover without a thin_repair. So we never let the
2566	* pool move out of the old mode.
2567	*/
2568	if (old_mode == PM_FAIL)
2569	new_mode = old_mode;
2570
2571	switch (new_mode) {
2572	case PM_FAIL:
2573	dm_pool_metadata_read_only(pmd: pool->pmd);
2574	pool->process_bio = process_bio_fail;
2575	pool->process_discard = process_bio_fail;
2576	pool->process_cell = process_cell_fail;
2577	pool->process_discard_cell = process_cell_fail;
2578	pool->process_prepared_mapping = process_prepared_mapping_fail;
2579	pool->process_prepared_discard = process_prepared_discard_fail;
2580
2581	error_retry_list(pool);
2582	break;
2583
2584	case PM_OUT_OF_METADATA_SPACE:
2585	case PM_READ_ONLY:
2586	dm_pool_metadata_read_only(pmd: pool->pmd);
2587	pool->process_bio = process_bio_read_only;
2588	pool->process_discard = process_bio_success;
2589	pool->process_cell = process_cell_read_only;
2590	pool->process_discard_cell = process_cell_success;
2591	pool->process_prepared_mapping = process_prepared_mapping_fail;
2592	pool->process_prepared_discard = process_prepared_discard_success;
2593
2594	error_retry_list(pool);
2595	break;
2596
2597	case PM_OUT_OF_DATA_SPACE:
2598	/*
2599	* Ideally we'd never hit this state; the low water mark
2600	* would trigger userland to extend the pool before we
2601	* completely run out of data space. However, many small
2602	* IOs to unprovisioned space can consume data space at an
2603	* alarming rate. Adjust your low water mark if you're
2604	* frequently seeing this mode.
2605	*/
2606	pool->out_of_data_space = true;
2607	pool->process_bio = process_bio_read_only;
2608	pool->process_discard = process_discard_bio;
2609	pool->process_cell = process_cell_read_only;
2610	pool->process_prepared_mapping = process_prepared_mapping;
2611	set_discard_callbacks(pool);
2612
2613	if (!pool->pf.error_if_no_space && no_space_timeout)
2614	queue_delayed_work(wq: pool->wq, dwork: &pool->no_space_timeout, delay: no_space_timeout);
2615	break;
2616
2617	case PM_WRITE:
2618	if (old_mode == PM_OUT_OF_DATA_SPACE)
2619	cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
2620	pool->out_of_data_space = false;
2621	pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2622	dm_pool_metadata_read_write(pmd: pool->pmd);
2623	pool->process_bio = process_bio;
2624	pool->process_discard = process_discard_bio;
2625	pool->process_cell = process_cell;
2626	pool->process_prepared_mapping = process_prepared_mapping;
2627	set_discard_callbacks(pool);
2628	break;
2629	}
2630
2631	pool->pf.mode = new_mode;
2632	/*
2633	* The pool mode may have changed, sync it so bind_control_target()
2634	* doesn't cause an unexpected mode transition on resume.
2635	*/
2636	pt->adjusted_pf.mode = new_mode;
2637
2638	if (old_mode != new_mode)
2639	notify_of_pool_mode_change(pool);
2640	}
2641
2642	static void abort_transaction(struct pool *pool)
2643	{
2644	const char *dev_name = dm_device_name(md: pool->pool_md);
2645
2646	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2647	if (dm_pool_abort_metadata(pmd: pool->pmd)) {
2648	DMERR("%s: failed to abort metadata transaction", dev_name);
2649	set_pool_mode(pool, new_mode: PM_FAIL);
2650	}
2651
2652	if (dm_pool_metadata_set_needs_check(pmd: pool->pmd)) {
2653	DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2654	set_pool_mode(pool, new_mode: PM_FAIL);
2655	}
2656	}
2657
2658	static void metadata_operation_failed(struct pool pool, const* char op, int* r)
2659	{
2660	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2661	dm_device_name(pool->pool_md), op, r);
2662
2663	abort_transaction(pool);
2664	set_pool_mode(pool, new_mode: PM_READ_ONLY);
2665	}
2666
/*----------------------------------------------------------------*/
2668
2669	/*
2670	* Mapping functions.
2671	*/
2672
2673	/*
2674	* Called only while mapping a thin bio to hand it over to the workqueue.
2675	*/
2676	static void thin_defer_bio(struct thin_c tc, struct* bio *bio)
2677	{
2678	struct pool *pool = tc->pool;
2679
2680	spin_lock_irq(lock: &tc->lock);
2681	bio_list_add(bl: &tc->deferred_bio_list, bio);
2682	spin_unlock_irq(lock: &tc->lock);
2683
2684	wake_worker(pool);
2685	}
2686
2687	static void thin_defer_bio_with_throttle(struct thin_c tc, struct* bio *bio)
2688	{
2689	struct pool *pool = tc->pool;
2690
2691	throttle_lock(t: &pool->throttle);
2692	thin_defer_bio(tc, bio);
2693	throttle_unlock(t: &pool->throttle);
2694	}
2695
2696	static void thin_defer_cell(struct thin_c tc, struct* dm_bio_prison_cell *cell)
2697	{
2698	struct pool *pool = tc->pool;
2699
2700	throttle_lock(t: &pool->throttle);
2701	spin_lock_irq(lock: &tc->lock);
2702	list_add_tail(new: &cell->user_list, head: &tc->deferred_cells);
2703	spin_unlock_irq(lock: &tc->lock);
2704	throttle_unlock(t: &pool->throttle);
2705
2706	wake_worker(pool);
2707	}
2708
2709	static void thin_hook_bio(struct thin_c tc, struct* bio *bio)
2710	{
2711	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
2712
2713	h->tc = tc;
2714	h->shared_read_entry = NULL;
2715	h->all_io_entry = NULL;
2716	h->overwrite_mapping = NULL;
2717	h->cell = NULL;
2718	}
2719
2720	/*
2721	* Non-blocking function called from the thin target's map function.
2722	*/
2723	static int thin_bio_map(struct dm_target ti, struct* bio *bio)
2724	{
2725	int r;
2726	struct thin_c *tc = ti->private;
2727	dm_block_t block = get_bio_block(tc, bio);
2728	struct dm_thin_device *td = tc->td;
2729	struct dm_thin_lookup_result result;
2730	struct dm_bio_prison_cell virt_cell, data_cell;
2731	struct dm_cell_key key;
2732
2733	thin_hook_bio(tc, bio);
2734
2735	if (tc->requeue_mode) {
2736	bio->bi_status = BLK_STS_DM_REQUEUE;
2737	bio_endio(bio);
2738	return DM_MAPIO_SUBMITTED;
2739	}
2740
2741	if (get_pool_mode(pool: tc->pool) == PM_FAIL) {
2742	bio_io_error(bio);
2743	return DM_MAPIO_SUBMITTED;
2744	}
2745
2746	if (op_is_flush(op: bio->bi_opf) \|\| bio_op(bio) == REQ_OP_DISCARD) {
2747	thin_defer_bio_with_throttle(tc, bio);
2748	return DM_MAPIO_SUBMITTED;
2749	}
2750
2751	/*
2752	* We must hold the virtual cell before doing the lookup, otherwise
2753	* there's a race with discard.
2754	*/
2755	build_virtual_key(td: tc->td, b: block, key: &key);
2756	if (bio_detain(pool: tc->pool, key: &key, bio, cell_result: &virt_cell))
2757	return DM_MAPIO_SUBMITTED;
2758
2759	r = dm_thin_find_block(td, block, can_issue_io: `0`, result: &result);
2760
2761	/*
2762	* Note that we defer readahead too.
2763	*/
2764	switch (r) {
2765	case `0`:
2766	if (unlikely(result.shared)) {
2767	/*
2768	* We have a race condition here between the
2769	* result.shared value returned by the lookup and
2770	* snapshot creation, which may cause new
2771	* sharing.
2772	*
2773	* To avoid this always quiesce the origin before
2774	* taking the snap. You want to do this anyway to
2775	* ensure a consistent application view
2776	* (i.e. lockfs).
2777	*
2778	* More distant ancestors are irrelevant. The
2779	* shared flag will be set in their case.
2780	*/
2781	thin_defer_cell(tc, cell: virt_cell);
2782	return DM_MAPIO_SUBMITTED;
2783	}
2784
2785	build_data_key(td: tc->td, b: result.block, key: &key);
2786	if (bio_detain(pool: tc->pool, key: &key, bio, cell_result: &data_cell)) {
2787	cell_defer_no_holder(tc, cell: virt_cell);
2788	return DM_MAPIO_SUBMITTED;
2789	}
2790
2791	inc_all_io_entry(pool: tc->pool, bio);
2792	cell_defer_no_holder(tc, cell: data_cell);
2793	cell_defer_no_holder(tc, cell: virt_cell);
2794
2795	remap(tc, bio, block: result.block);
2796	return DM_MAPIO_REMAPPED;
2797
2798	case -ENODATA:
2799	case -EWOULDBLOCK:
2800	thin_defer_cell(tc, cell: virt_cell);
2801	return DM_MAPIO_SUBMITTED;
2802
2803	default:
2804	/*
2805	* Must always call bio_io_error on failure.
2806	* dm_thin_find_block can fail with -EINVAL if the
2807	* pool is switched to fail-io mode.
2808	*/
2809	bio_io_error(bio);
2810	cell_defer_no_holder(tc, cell: virt_cell);
2811	return DM_MAPIO_SUBMITTED;
2812	}
2813	}
2814
2815	static void requeue_bios(struct pool *pool)
2816	{
2817	struct thin_c *tc;
2818
2819	rcu_read_lock();
2820	list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2821	spin_lock_irq(lock: &tc->lock);
2822	bio_list_merge(bl: &tc->deferred_bio_list, bl2: &tc->retry_on_resume_list);
2823	bio_list_init(bl: &tc->retry_on_resume_list);
2824	spin_unlock_irq(lock: &tc->lock);
2825	}
2826	rcu_read_unlock();
2827	}
2828
2829	/*
2830	*--------------------------------------------------------------
2831	* Binding of control targets to a pool object
2832	*--------------------------------------------------------------
2833	*/
2834	static bool is_factor(sector_t block_size, uint32_t n)
2835	{
2836	return !sector_div(block_size, n);
2837	}
2838
2839	/*
2840	* If discard_passdown was enabled verify that the data device
2841	* supports discards. Disable discard_passdown if not.
2842	*/
2843	static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
2844	{
2845	struct pool *pool = pt->pool;
2846	struct block_device *data_bdev = pt->data_dev->bdev;
2847	struct queue_limits *data_limits = &bdev_get_queue(bdev: data_bdev)->limits;
2848	const char *reason = NULL;
2849
2850	if (!pt->adjusted_pf.discard_passdown)
2851	return;
2852
2853	if (!bdev_max_discard_sectors(bdev: pt->data_dev->bdev))
2854	reason = "discard unsupported";
2855
2856	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2857	reason = "max discard sectors smaller than a block";
2858
2859	if (reason) {
2860	DMWARN("Data device (%pg) %s: Disabling discard passdown.", data_bdev, reason);
2861	pt->adjusted_pf.discard_passdown = false;
2862	}
2863	}
2864
2865	static int bind_control_target(struct pool pool, struct* dm_target *ti)
2866	{
2867	struct pool_c *pt = ti->private;
2868
2869	/*
2870	* We want to make sure that a pool in PM_FAIL mode is never upgraded.
2871	*/
2872	enum pool_mode old_mode = get_pool_mode(pool);
2873	enum pool_mode new_mode = pt->adjusted_pf.mode;
2874
2875	/*
2876	* Don't change the pool's mode until set_pool_mode() below.
2877	* Otherwise the pool's process_* function pointers may
2878	* not match the desired pool mode.
2879	*/
2880	pt->adjusted_pf.mode = old_mode;
2881
2882	pool->ti = ti;
2883	pool->pf = pt->adjusted_pf;
2884	pool->low_water_blocks = pt->low_water_blocks;
2885
2886	set_pool_mode(pool, new_mode);
2887
2888	return `0`;
2889	}
2890
2891	static void unbind_control_target(struct pool pool, struct* dm_target *ti)
2892	{
2893	if (pool->ti == ti)
2894	pool->ti = NULL;
2895	}
2896
2897	/*
2898	*--------------------------------------------------------------
2899	* Pool creation
2900	*--------------------------------------------------------------
2901	*/
2902	/ Initialize pool features. /
2903	static void pool_features_init(struct pool_features *pf)
2904	{
2905	pf->mode = PM_WRITE;
2906	pf->zero_new_blocks = true;
2907	pf->discard_enabled = true;
2908	pf->discard_passdown = true;
2909	pf->error_if_no_space = false;
2910	}
2911
2912	static void __pool_destroy(struct pool *pool)
2913	{
2914	__pool_table_remove(pool);
2915
2916	vfree(addr: pool->cell_sort_array);
2917	if (dm_pool_metadata_close(pmd: pool->pmd) < `0`)
2918	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2919
2920	dm_bio_prison_destroy(prison: pool->prison);
2921	dm_kcopyd_client_destroy(kc: pool->copier);
2922
2923	cancel_delayed_work_sync(dwork: &pool->waker);
2924	cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
2925	if (pool->wq)
2926	destroy_workqueue(wq: pool->wq);
2927
2928	if (pool->next_mapping)
2929	mempool_free(element: pool->next_mapping, pool: &pool->mapping_pool);
2930	mempool_exit(pool: &pool->mapping_pool);
2931	dm_deferred_set_destroy(ds: pool->shared_read_ds);
2932	dm_deferred_set_destroy(ds: pool->all_io_ds);
2933	kfree(objp: pool);
2934	}
2935
2936	static struct kmem_cache *_new_mapping_cache;
2937
2938	static struct pool pool_create(struct* mapped_device *pool_md,
2939	struct block_device *metadata_dev,
2940	struct block_device *data_dev,
2941	unsigned long block_size,
2942	int read_only, char **error)
2943	{
2944	int r;
2945	void *err_p;
2946	struct pool *pool;
2947	struct dm_pool_metadata *pmd;
2948	bool format_device = read_only ? false : true;
2949
2950	pmd = dm_pool_metadata_open(bdev: metadata_dev, data_block_size: block_size, format_device);
2951	if (IS_ERR(ptr: pmd)) {
2952	*error = "Error creating metadata object";
2953	return (struct pool *)pmd;
2954	}
2955
2956	pool = kzalloc(size: sizeof(*pool), GFP_KERNEL);
2957	if (!pool) {
2958	*error = "Error allocating memory for pool";
2959	err_p = ERR_PTR(error: -ENOMEM);
2960	goto bad_pool;
2961	}
2962
2963	pool->pmd = pmd;
2964	pool->sectors_per_block = block_size;
2965	if (block_size & (block_size - `1`))
2966	pool->sectors_per_block_shift = -`1`;
2967	else
2968	pool->sectors_per_block_shift = __ffs(block_size);
2969	pool->low_water_blocks = `0`;
2970	pool_features_init(pf: &pool->pf);
2971	pool->prison = dm_bio_prison_create();
2972	if (!pool->prison) {
2973	*error = "Error creating pool's bio prison";
2974	err_p = ERR_PTR(error: -ENOMEM);
2975	goto bad_prison;
2976	}
2977
2978	pool->copier = dm_kcopyd_client_create(throttle: &dm_kcopyd_throttle);
2979	if (IS_ERR(ptr: pool->copier)) {
2980	r = PTR_ERR(ptr: pool->copier);
2981	*error = "Error creating pool's kcopyd client";
2982	err_p = ERR_PTR(error: r);
2983	goto bad_kcopyd_client;
2984	}
2985
2986	/*
2987	* Create singlethreaded workqueue that will service all devices
2988	* that use this metadata.
2989	*/
2990	pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2991	if (!pool->wq) {
2992	*error = "Error creating pool's workqueue";
2993	err_p = ERR_PTR(error: -ENOMEM);
2994	goto bad_wq;
2995	}
2996
2997	throttle_init(t: &pool->throttle);
2998	INIT_WORK(&pool->worker, do_worker);
2999	INIT_DELAYED_WORK(&pool->waker, do_waker);
3000	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
3001	spin_lock_init(&pool->lock);
3002	bio_list_init(bl: &pool->deferred_flush_bios);
3003	bio_list_init(bl: &pool->deferred_flush_completions);
3004	INIT_LIST_HEAD(list: &pool->prepared_mappings);
3005	INIT_LIST_HEAD(list: &pool->prepared_discards);
3006	INIT_LIST_HEAD(list: &pool->prepared_discards_pt2);
3007	INIT_LIST_HEAD(list: &pool->active_thins);
3008	pool->low_water_triggered = false;
3009	pool->suspended = true;
3010	pool->out_of_data_space = false;
3011
3012	pool->shared_read_ds = dm_deferred_set_create();
3013	if (!pool->shared_read_ds) {
3014	*error = "Error creating pool's shared read deferred set";
3015	err_p = ERR_PTR(error: -ENOMEM);
3016	goto bad_shared_read_ds;
3017	}
3018
3019	pool->all_io_ds = dm_deferred_set_create();
3020	if (!pool->all_io_ds) {
3021	*error = "Error creating pool's all io deferred set";
3022	err_p = ERR_PTR(error: -ENOMEM);
3023	goto bad_all_io_ds;
3024	}
3025
3026	pool->next_mapping = NULL;
3027	r = mempool_init_slab_pool(pool: &pool->mapping_pool, MAPPING_POOL_SIZE,
3028	kc: _new_mapping_cache);
3029	if (r) {
3030	*error = "Error creating pool's mapping mempool";
3031	err_p = ERR_PTR(error: r);
3032	goto bad_mapping_pool;
3033	}
3034
3035	pool->cell_sort_array =
3036	vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3037	sizeof(*pool->cell_sort_array)));
3038	if (!pool->cell_sort_array) {
3039	*error = "Error allocating cell sort array";
3040	err_p = ERR_PTR(error: -ENOMEM);
3041	goto bad_sort_array;
3042	}
3043
3044	pool->ref_count = `1`;
3045	pool->last_commit_jiffies = jiffies;
3046	pool->pool_md = pool_md;
3047	pool->md_dev = metadata_dev;
3048	pool->data_dev = data_dev;
3049	__pool_table_insert(pool);
3050
3051	return pool;
3052
3053	bad_sort_array:
3054	mempool_exit(pool: &pool->mapping_pool);
3055	bad_mapping_pool:
3056	dm_deferred_set_destroy(ds: pool->all_io_ds);
3057	bad_all_io_ds:
3058	dm_deferred_set_destroy(ds: pool->shared_read_ds);
3059	bad_shared_read_ds:
3060	destroy_workqueue(wq: pool->wq);
3061	bad_wq:
3062	dm_kcopyd_client_destroy(kc: pool->copier);
3063	bad_kcopyd_client:
3064	dm_bio_prison_destroy(prison: pool->prison);
3065	bad_prison:
3066	kfree(objp: pool);
3067	bad_pool:
3068	if (dm_pool_metadata_close(pmd))
3069	DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3070
3071	return err_p;
3072	}
3073
3074	static void __pool_inc(struct pool *pool)
3075	{
3076	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3077	pool->ref_count++;
3078	}
3079
3080	static void __pool_dec(struct pool *pool)
3081	{
3082	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3083	BUG_ON(!pool->ref_count);
3084	if (!--pool->ref_count)
3085	__pool_destroy(pool);
3086	}
3087
3088	static struct pool __pool_find(struct* mapped_device *pool_md,
3089	struct block_device *metadata_dev,
3090	struct block_device *data_dev,
3091	unsigned long block_size, int read_only,
3092	char *error, int* *created)
3093	{
3094	struct pool *pool = __pool_table_lookup_metadata_dev(md_dev: metadata_dev);
3095
3096	if (pool) {
3097	if (pool->pool_md != pool_md) {
3098	*error = "metadata device already in use by a pool";
3099	return ERR_PTR(error: -EBUSY);
3100	}
3101	if (pool->data_dev != data_dev) {
3102	*error = "data device already in use by a pool";
3103	return ERR_PTR(error: -EBUSY);
3104	}
3105	__pool_inc(pool);
3106
3107	} else {
3108	pool = __pool_table_lookup(md: pool_md);
3109	if (pool) {
3110	if (pool->md_dev != metadata_dev \|\| pool->data_dev != data_dev) {
3111	*error = "different pool cannot replace a pool";
3112	return ERR_PTR(error: -EINVAL);
3113	}
3114	__pool_inc(pool);
3115
3116	} else {
3117	pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
3118	*created = `1`;
3119	}
3120	}
3121
3122	return pool;
3123	}
3124
3125	/*
3126	*--------------------------------------------------------------
3127	* Pool target methods
3128	*--------------------------------------------------------------
3129	*/
3130	static void pool_dtr(struct dm_target *ti)
3131	{
3132	struct pool_c *pt = ti->private;
3133
3134	mutex_lock(&dm_thin_pool_table.mutex);
3135
3136	unbind_control_target(pool: pt->pool, ti);
3137	__pool_dec(pool: pt->pool);
3138	dm_put_device(ti, d: pt->metadata_dev);
3139	dm_put_device(ti, d: pt->data_dev);
3140	kfree(objp: pt);
3141
3142	mutex_unlock(lock: &dm_thin_pool_table.mutex);
3143	}
3144
3145	static int parse_pool_features(struct dm_arg_set as, struct* pool_features *pf,
3146	struct dm_target *ti)
3147	{
3148	int r;
3149	unsigned int argc;
3150	const char *arg_name;
3151
3152	static const struct dm_arg _args[] = {
3153	{`0`, `4`, "Invalid number of pool feature arguments"},
3154	};
3155
3156	/*
3157	* No feature arguments supplied.
3158	*/
3159	if (!as->argc)
3160	return `0`;
3161
3162	r = dm_read_arg_group(arg: _args, arg_set: as, num_args: &argc, error: &ti->error);
3163	if (r)
3164	return -EINVAL;
3165
3166	while (argc && !r) {
3167	arg_name = dm_shift_arg(as);
3168	argc--;
3169
3170	if (!strcasecmp(s1: arg_name, s2: "skip_block_zeroing"))
3171	pf->zero_new_blocks = false;
3172
3173	else if (!strcasecmp(s1: arg_name, s2: "ignore_discard"))
3174	pf->discard_enabled = false;
3175
3176	else if (!strcasecmp(s1: arg_name, s2: "no_discard_passdown"))
3177	pf->discard_passdown = false;
3178
3179	else if (!strcasecmp(s1: arg_name, s2: "read_only"))
3180	pf->mode = PM_READ_ONLY;
3181
3182	else if (!strcasecmp(s1: arg_name, s2: "error_if_no_space"))
3183	pf->error_if_no_space = true;
3184
3185	else {
3186	ti->error = "Unrecognised pool feature requested";
3187	r = -EINVAL;
3188	break;
3189	}
3190	}
3191
3192	return r;
3193	}
3194
3195	static void metadata_low_callback(void *context)
3196	{
3197	struct pool *pool = context;
3198
3199	DMWARN("%s: reached low water mark for metadata device: sending event.",
3200	dm_device_name(pool->pool_md));
3201
3202	dm_table_event(t: pool->ti->table);
3203	}
3204
3205	/*
3206	* We need to flush the data device before committing the metadata.
3207	*
3208	* This ensures that the data blocks of any newly inserted mappings are
3209	* properly written to non-volatile storage and won't be lost in case of a
3210	* crash.
3211	*
3212	* Failure to do so can result in data corruption in the case of internal or
3213	* external snapshots and in the case of newly provisioned blocks, when block
3214	* zeroing is enabled.
3215	*/
3216	static int metadata_pre_commit_callback(void *context)
3217	{
3218	struct pool *pool = context;
3219
3220	return blkdev_issue_flush(bdev: pool->data_dev);
3221	}
3222
3223	static sector_t get_dev_size(struct block_device *bdev)
3224	{
3225	return bdev_nr_sectors(bdev);
3226	}
3227
3228	static void warn_if_metadata_device_too_big(struct block_device *bdev)
3229	{
3230	sector_t metadata_dev_size = get_dev_size(bdev);
3231
3232	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3233	DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
3234	bdev, THIN_METADATA_MAX_SECTORS);
3235	}
3236
3237	static sector_t get_metadata_dev_size(struct block_device *bdev)
3238	{
3239	sector_t metadata_dev_size = get_dev_size(bdev);
3240
3241	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3242	metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3243
3244	return metadata_dev_size;
3245	}
3246
3247	static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3248	{
3249	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3250
3251	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3252
3253	return metadata_dev_size;
3254	}
3255
3256	/*
3257	* When a metadata threshold is crossed a dm event is triggered, and
3258	* userland should respond by growing the metadata device. We could let
3259	* userland set the threshold, like we do with the data threshold, but I'm
3260	* not sure they know enough to do this well.
3261	*/
3262	static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3263	{
3264	/*
3265	* 4M is ample for all ops with the possible exception of thin
3266	* device deletion which is harmless if it fails (just retry the
3267	* delete after you've grown the device).
3268	*/
3269	dm_block_t quarter = get_metadata_dev_size_in_blocks(bdev: pt->metadata_dev->bdev) / `4`;
3270
3271	return min((dm_block_t)`1024ULL` / 4M /, quarter);
3272	}
3273
3274	/*
3275	* thin-pool <metadata dev> <data dev>
3276	* <data block size (sectors)>
3277	* <low water mark (blocks)>
3278	* [<#feature args> [<arg>]*]
3279	*
3280	* Optional feature arguments are:
3281	* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
3282	* ignore_discard: disable discard
3283	* no_discard_passdown: don't pass discards down to the data device
3284	* read_only: Don't allow any changes to be made to the pool metadata.
3285	* error_if_no_space: error IOs, instead of queueing, if no space.
3286	*/
3287	static int pool_ctr(struct dm_target ti, unsigned* int argc, char **argv)
3288	{
3289	int r, pool_created = `0`;
3290	struct pool_c *pt;
3291	struct pool *pool;
3292	struct pool_features pf;
3293	struct dm_arg_set as;
3294	struct dm_dev *data_dev;
3295	unsigned long block_size;
3296	dm_block_t low_water_blocks;
3297	struct dm_dev *metadata_dev;
3298	blk_mode_t metadata_mode;
3299
3300	/*
3301	* FIXME Remove validation from scope of lock.
3302	*/
3303	mutex_lock(&dm_thin_pool_table.mutex);
3304
3305	if (argc < `4`) {
3306	ti->error = "Invalid argument count";
3307	r = -EINVAL;
3308	goto out_unlock;
3309	}
3310
3311	as.argc = argc;
3312	as.argv = argv;
3313
3314	/ make sure metadata and data are different devices /
3315	if (!strcmp(argv[`0`], argv[`1`])) {
3316	ti->error = "Error setting metadata or data device";
3317	r = -EINVAL;
3318	goto out_unlock;
3319	}
3320
3321	/*
3322	* Set default pool features.
3323	*/
3324	pool_features_init(pf: &pf);
3325
3326	dm_consume_args(as: &as, num_args: `4`);
3327	r = parse_pool_features(as: &as, pf: &pf, ti);
3328	if (r)
3329	goto out_unlock;
3330
3331	metadata_mode = BLK_OPEN_READ \|
3332	((pf.mode == PM_READ_ONLY) ? `0` : BLK_OPEN_WRITE);
3333	r = dm_get_device(ti, path: argv[`0`], mode: metadata_mode, result: &metadata_dev);
3334	if (r) {
3335	ti->error = "Error opening metadata block device";
3336	goto out_unlock;
3337	}
3338	warn_if_metadata_device_too_big(bdev: metadata_dev->bdev);
3339
3340	r = dm_get_device(ti, path: argv[`1`], BLK_OPEN_READ \| BLK_OPEN_WRITE, result: &data_dev);
3341	if (r) {
3342	ti->error = "Error getting data device";
3343	goto out_metadata;
3344	}
3345
3346	if (kstrtoul(s: argv[`2`], base: `10`, res: &block_size) \|\| !block_size \|\|
3347	block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS \|\|
3348	block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS \|\|
3349	block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - `1`)) {
3350	ti->error = "Invalid block size";
3351	r = -EINVAL;
3352	goto out;
3353	}
3354
3355	if (kstrtoull(s: argv[`3`], base: `10`, res: (unsigned long long *)&low_water_blocks)) {
3356	ti->error = "Invalid low water mark";
3357	r = -EINVAL;
3358	goto out;
3359	}
3360
3361	pt = kzalloc(size: sizeof(*pt), GFP_KERNEL);
3362	if (!pt) {
3363	r = -ENOMEM;
3364	goto out;
3365	}
3366
3367	pool = __pool_find(pool_md: dm_table_get_md(t: ti->table), metadata_dev: metadata_dev->bdev, data_dev: data_dev->bdev,
3368	block_size, read_only: pf.mode == PM_READ_ONLY, error: &ti->error, created: &pool_created);
3369	if (IS_ERR(ptr: pool)) {
3370	r = PTR_ERR(ptr: pool);
3371	goto out_free_pt;
3372	}
3373
3374	/*
3375	* 'pool_created' reflects whether this is the first table load.
3376	* Top level discard support is not allowed to be changed after
3377	* initial load. This would require a pool reload to trigger thin
3378	* device changes.
3379	*/
3380	if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3381	ti->error = "Discard support cannot be disabled once enabled";
3382	r = -EINVAL;
3383	goto out_flags_changed;
3384	}
3385
3386	pt->pool = pool;
3387	pt->ti = ti;
3388	pt->metadata_dev = metadata_dev;
3389	pt->data_dev = data_dev;
3390	pt->low_water_blocks = low_water_blocks;
3391	pt->adjusted_pf = pt->requested_pf = pf;
3392	ti->num_flush_bios = `1`;
3393	ti->limit_swap_bios = true;
3394
3395	/*
3396	* Only need to enable discards if the pool should pass
3397	* them down to the data device. The thin device's discard
3398	* processing will cause mappings to be removed from the btree.
3399	*/
3400	if (pf.discard_enabled && pf.discard_passdown) {
3401	ti->num_discard_bios = `1`;
3402	/*
3403	* Setting 'discards_supported' circumvents the normal
3404	* stacking of discard limits (this keeps the pool and
3405	* thin devices' discard limits consistent).
3406	*/
3407	ti->discards_supported = true;
3408	ti->max_discard_granularity = true;
3409	}
3410	ti->private = pt;
3411
3412	r = dm_pool_register_metadata_threshold(pmd: pt->pool->pmd,
3413	threshold: calc_metadata_threshold(pt),
3414	fn: metadata_low_callback,
3415	context: pool);
3416	if (r) {
3417	ti->error = "Error registering metadata threshold";
3418	goto out_flags_changed;
3419	}
3420
3421	dm_pool_register_pre_commit_callback(pmd: pool->pmd,
3422	fn: metadata_pre_commit_callback, context: pool);
3423
3424	mutex_unlock(lock: &dm_thin_pool_table.mutex);
3425
3426	return `0`;
3427
3428	out_flags_changed:
3429	__pool_dec(pool);
3430	out_free_pt:
3431	kfree(objp: pt);
3432	out:
3433	dm_put_device(ti, d: data_dev);
3434	out_metadata:
3435	dm_put_device(ti, d: metadata_dev);
3436	out_unlock:
3437	mutex_unlock(lock: &dm_thin_pool_table.mutex);
3438
3439	return r;
3440	}
3441
3442	static int pool_map(struct dm_target ti, struct* bio *bio)
3443	{
3444	struct pool_c *pt = ti->private;
3445	struct pool *pool = pt->pool;
3446
3447	/*
3448	* As this is a singleton target, ti->begin is always zero.
3449	*/
3450	spin_lock_irq(lock: &pool->lock);
3451	bio_set_dev(bio, bdev: pt->data_dev->bdev);
3452	spin_unlock_irq(lock: &pool->lock);
3453
3454	return DM_MAPIO_REMAPPED;
3455	}
3456
3457	static int maybe_resize_data_dev(struct dm_target ti, bool need_commit)
3458	{
3459	int r;
3460	struct pool_c *pt = ti->private;
3461	struct pool *pool = pt->pool;
3462	sector_t data_size = ti->len;
3463	dm_block_t sb_data_size;
3464
3465	*need_commit = false;
3466
3467	(void) sector_div(data_size, pool->sectors_per_block);
3468
3469	r = dm_pool_get_data_dev_size(pmd: pool->pmd, result: &sb_data_size);
3470	if (r) {
3471	DMERR("%s: failed to retrieve data device size",
3472	dm_device_name(pool->pool_md));
3473	return r;
3474	}
3475
3476	if (data_size < sb_data_size) {
3477	DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3478	dm_device_name(pool->pool_md),
3479	(unsigned long long)data_size, sb_data_size);
3480	return -EINVAL;
3481
3482	} else if (data_size > sb_data_size) {
3483	if (dm_pool_metadata_needs_check(pmd: pool->pmd)) {
3484	DMERR("%s: unable to grow the data device until repaired.",
3485	dm_device_name(pool->pool_md));
3486	return `0`;
3487	}
3488
3489	if (sb_data_size)
3490	DMINFO("%s: growing the data device from %llu to %llu blocks",
3491	dm_device_name(pool->pool_md),
3492	sb_data_size, (unsigned long long)data_size);
3493	r = dm_pool_resize_data_dev(pmd: pool->pmd, new_size: data_size);
3494	if (r) {
3495	metadata_operation_failed(pool, op: "dm_pool_resize_data_dev", r);
3496	return r;
3497	}
3498
3499	*need_commit = true;
3500	}
3501
3502	return `0`;
3503	}
3504
3505	static int maybe_resize_metadata_dev(struct dm_target ti, bool need_commit)
3506	{
3507	int r;
3508	struct pool_c *pt = ti->private;
3509	struct pool *pool = pt->pool;
3510	dm_block_t metadata_dev_size, sb_metadata_dev_size;
3511
3512	*need_commit = false;
3513
3514	metadata_dev_size = get_metadata_dev_size_in_blocks(bdev: pool->md_dev);
3515
3516	r = dm_pool_get_metadata_dev_size(pmd: pool->pmd, result: &sb_metadata_dev_size);
3517	if (r) {
3518	DMERR("%s: failed to retrieve metadata device size",
3519	dm_device_name(pool->pool_md));
3520	return r;
3521	}
3522
3523	if (metadata_dev_size < sb_metadata_dev_size) {
3524	DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3525	dm_device_name(pool->pool_md),
3526	metadata_dev_size, sb_metadata_dev_size);
3527	return -EINVAL;
3528
3529	} else if (metadata_dev_size > sb_metadata_dev_size) {
3530	if (dm_pool_metadata_needs_check(pmd: pool->pmd)) {
3531	DMERR("%s: unable to grow the metadata device until repaired.",
3532	dm_device_name(pool->pool_md));
3533	return `0`;
3534	}
3535
3536	warn_if_metadata_device_too_big(bdev: pool->md_dev);
3537	DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3538	dm_device_name(pool->pool_md),
3539	sb_metadata_dev_size, metadata_dev_size);
3540
3541	if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3542	set_pool_mode(pool, new_mode: PM_WRITE);
3543
3544	r = dm_pool_resize_metadata_dev(pmd: pool->pmd, new_size: metadata_dev_size);
3545	if (r) {
3546	metadata_operation_failed(pool, op: "dm_pool_resize_metadata_dev", r);
3547	return r;
3548	}
3549
3550	*need_commit = true;
3551	}
3552
3553	return `0`;
3554	}
3555
3556	/*
3557	* Retrieves the number of blocks of the data device from
3558	* the superblock and compares it to the actual device size,
3559	* thus resizing the data device in case it has grown.
3560	*
3561	* This both copes with opening preallocated data devices in the ctr
3562	* being followed by a resume
3563	* -and-
3564	* calling the resume method individually after userspace has
3565	* grown the data device in reaction to a table event.
3566	*/
3567	static int pool_preresume(struct dm_target *ti)
3568	{
3569	int r;
3570	bool need_commit1, need_commit2;
3571	struct pool_c *pt = ti->private;
3572	struct pool *pool = pt->pool;
3573
3574	/*
3575	* Take control of the pool object.
3576	*/
3577	r = bind_control_target(pool, ti);
3578	if (r)
3579	goto out;
3580
3581	r = maybe_resize_data_dev(ti, need_commit: &need_commit1);
3582	if (r)
3583	goto out;
3584
3585	r = maybe_resize_metadata_dev(ti, need_commit: &need_commit2);
3586	if (r)
3587	goto out;
3588
3589	if (need_commit1 \|\| need_commit2)
3590	(void) commit(pool);
3591	out:
3592	/*
3593	* When a thin-pool is PM_FAIL, it cannot be rebuilt if
3594	* bio is in deferred list. Therefore need to return 0
3595	* to allow pool_resume() to flush IO.
3596	*/
3597	if (r && get_pool_mode(pool) == PM_FAIL)
3598	r = `0`;
3599
3600	return r;
3601	}
3602
3603	static void pool_suspend_active_thins(struct pool *pool)
3604	{
3605	struct thin_c *tc;
3606
3607	/ Suspend all active thin devices /
3608	tc = get_first_thin(pool);
3609	while (tc) {
3610	dm_internal_suspend_noflush(md: tc->thin_md);
3611	tc = get_next_thin(pool, tc);
3612	}
3613	}
3614
3615	static void pool_resume_active_thins(struct pool *pool)
3616	{
3617	struct thin_c *tc;
3618
3619	/ Resume all active thin devices /
3620	tc = get_first_thin(pool);
3621	while (tc) {
3622	dm_internal_resume(md: tc->thin_md);
3623	tc = get_next_thin(pool, tc);
3624	}
3625	}
3626
3627	static void pool_resume(struct dm_target *ti)
3628	{
3629	struct pool_c *pt = ti->private;
3630	struct pool *pool = pt->pool;
3631
3632	/*
3633	* Must requeue active_thins' bios and then resume
3634	* active_thins _before_ clearing 'suspend' flag.
3635	*/
3636	requeue_bios(pool);
3637	pool_resume_active_thins(pool);
3638
3639	spin_lock_irq(lock: &pool->lock);
3640	pool->low_water_triggered = false;
3641	pool->suspended = false;
3642	spin_unlock_irq(lock: &pool->lock);
3643
3644	do_waker(ws: &pool->waker.work);
3645	}
3646
3647	static void pool_presuspend(struct dm_target *ti)
3648	{
3649	struct pool_c *pt = ti->private;
3650	struct pool *pool = pt->pool;
3651
3652	spin_lock_irq(lock: &pool->lock);
3653	pool->suspended = true;
3654	spin_unlock_irq(lock: &pool->lock);
3655
3656	pool_suspend_active_thins(pool);
3657	}
3658
3659	static void pool_presuspend_undo(struct dm_target *ti)
3660	{
3661	struct pool_c *pt = ti->private;
3662	struct pool *pool = pt->pool;
3663
3664	pool_resume_active_thins(pool);
3665
3666	spin_lock_irq(lock: &pool->lock);
3667	pool->suspended = false;
3668	spin_unlock_irq(lock: &pool->lock);
3669	}
3670
3671	static void pool_postsuspend(struct dm_target *ti)
3672	{
3673	struct pool_c *pt = ti->private;
3674	struct pool *pool = pt->pool;
3675
3676	cancel_delayed_work_sync(dwork: &pool->waker);
3677	cancel_delayed_work_sync(dwork: &pool->no_space_timeout);
3678	flush_workqueue(pool->wq);
3679	(void) commit(pool);
3680	}
3681
/*
 * Check that a message carries exactly @args_required arguments.
 * Returns 0 if so, otherwise logs a warning and returns -EINVAL.
 */
static int check_arg_count(unsigned int argc, unsigned int args_required)
{
	if (argc != args_required) {
		DMWARN("Message received with %u arguments instead of %u.",
		       argc, args_required);
		return -EINVAL;
	}

	return 0;
}
3692
3693	static int read_dev_id(char arg, dm_thin_id dev_id, int warning)
3694	{
3695	if (!kstrtoull(s: arg, base: `10`, res: (unsigned long long *)dev_id) &&
3696	*dev_id <= MAX_DEV_ID)
3697	return `0`;
3698
3699	if (warning)
3700	DMWARN("Message received with invalid device id: %s", arg);
3701
3702	return -EINVAL;
3703	}
3704
3705	static int process_create_thin_mesg(unsigned int argc, char argv, struct** pool *pool)
3706	{
3707	dm_thin_id dev_id;
3708	int r;
3709
3710	r = check_arg_count(argc, args_required: `2`);
3711	if (r)
3712	return r;
3713
3714	r = read_dev_id(arg: argv[`1`], dev_id: &dev_id, warning: `1`);
3715	if (r)
3716	return r;
3717
3718	r = dm_pool_create_thin(pmd: pool->pmd, dev: dev_id);
3719	if (r) {
3720	DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3721	argv[`1`]);
3722	return r;
3723	}
3724
3725	return `0`;
3726	}
3727
3728	static int process_create_snap_mesg(unsigned int argc, char argv, struct** pool *pool)
3729	{
3730	dm_thin_id dev_id;
3731	dm_thin_id origin_dev_id;
3732	int r;
3733
3734	r = check_arg_count(argc, args_required: `3`);
3735	if (r)
3736	return r;
3737
3738	r = read_dev_id(arg: argv[`1`], dev_id: &dev_id, warning: `1`);
3739	if (r)
3740	return r;
3741
3742	r = read_dev_id(arg: argv[`2`], dev_id: &origin_dev_id, warning: `1`);
3743	if (r)
3744	return r;
3745
3746	r = dm_pool_create_snap(pmd: pool->pmd, dev: dev_id, origin: origin_dev_id);
3747	if (r) {
3748	DMWARN("Creation of new snapshot %s of device %s failed.",
3749	argv[`1`], argv[`2`]);
3750	return r;
3751	}
3752
3753	return `0`;
3754	}
3755
3756	static int process_delete_mesg(unsigned int argc, char argv, struct** pool *pool)
3757	{
3758	dm_thin_id dev_id;
3759	int r;
3760
3761	r = check_arg_count(argc, args_required: `2`);
3762	if (r)
3763	return r;
3764
3765	r = read_dev_id(arg: argv[`1`], dev_id: &dev_id, warning: `1`);
3766	if (r)
3767	return r;
3768
3769	r = dm_pool_delete_thin_device(pmd: pool->pmd, dev: dev_id);
3770	if (r)
3771	DMWARN("Deletion of thin device %s failed.", argv[`1`]);
3772
3773	return r;
3774	}
3775
3776	static int process_set_transaction_id_mesg(unsigned int argc, char argv, struct** pool *pool)
3777	{
3778	dm_thin_id old_id, new_id;
3779	int r;
3780
3781	r = check_arg_count(argc, args_required: `3`);
3782	if (r)
3783	return r;
3784
3785	if (kstrtoull(s: argv[`1`], base: `10`, res: (unsigned long long *)&old_id)) {
3786	DMWARN("set_transaction_id message: Unrecognised id %s.", argv[`1`]);
3787	return -EINVAL;
3788	}
3789
3790	if (kstrtoull(s: argv[`2`], base: `10`, res: (unsigned long long *)&new_id)) {
3791	DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[`2`]);
3792	return -EINVAL;
3793	}
3794
3795	r = dm_pool_set_metadata_transaction_id(pmd: pool->pmd, current_id: old_id, new_id);
3796	if (r) {
3797	DMWARN("Failed to change transaction id from %s to %s.",
3798	argv[`1`], argv[`2`]);
3799	return r;
3800	}
3801
3802	return `0`;
3803	}
3804
3805	static int process_reserve_metadata_snap_mesg(unsigned int argc, char argv, struct** pool *pool)
3806	{
3807	int r;
3808
3809	r = check_arg_count(argc, args_required: `1`);
3810	if (r)
3811	return r;
3812
3813	(void) commit(pool);
3814
3815	r = dm_pool_reserve_metadata_snap(pmd: pool->pmd);
3816	if (r)
3817	DMWARN("reserve_metadata_snap message failed.");
3818
3819	return r;
3820	}
3821
3822	static int process_release_metadata_snap_mesg(unsigned int argc, char argv, struct** pool *pool)
3823	{
3824	int r;
3825
3826	r = check_arg_count(argc, args_required: `1`);
3827	if (r)
3828	return r;
3829
3830	r = dm_pool_release_metadata_snap(pmd: pool->pmd);
3831	if (r)
3832	DMWARN("release_metadata_snap message failed.");
3833
3834	return r;
3835	}
3836
3837	/*
3838	* Messages supported:
3839	* create_thin <dev_id>
3840	* create_snap <dev_id> <origin_id>
3841	* delete <dev_id>
3842	* set_transaction_id <current_trans_id> <new_trans_id>
3843	* reserve_metadata_snap
3844	* release_metadata_snap
3845	*/
3846	static int pool_message(struct dm_target ti, unsigned* int argc, char **argv,
3847	char result, unsigned* int maxlen)
3848	{
3849	int r = -EINVAL;
3850	struct pool_c *pt = ti->private;
3851	struct pool *pool = pt->pool;
3852
3853	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3854	DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3855	dm_device_name(pool->pool_md));
3856	return -EOPNOTSUPP;
3857	}
3858
3859	if (!strcasecmp(s1: argv[`0`], s2: "create_thin"))
3860	r = process_create_thin_mesg(argc, argv, pool);
3861
3862	else if (!strcasecmp(s1: argv[`0`], s2: "create_snap"))
3863	r = process_create_snap_mesg(argc, argv, pool);
3864
3865	else if (!strcasecmp(s1: argv[`0`], s2: "delete"))
3866	r = process_delete_mesg(argc, argv, pool);
3867
3868	else if (!strcasecmp(s1: argv[`0`], s2: "set_transaction_id"))
3869	r = process_set_transaction_id_mesg(argc, argv, pool);
3870
3871	else if (!strcasecmp(s1: argv[`0`], s2: "reserve_metadata_snap"))
3872	r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3873
3874	else if (!strcasecmp(s1: argv[`0`], s2: "release_metadata_snap"))
3875	r = process_release_metadata_snap_mesg(argc, argv, pool);
3876
3877	else
3878	DMWARN("Unrecognised thin pool target message received: %s", argv[`0`]);
3879
3880	if (!r)
3881	(void) commit(pool);
3882
3883	return r;
3884	}
3885
3886	static void emit_flags(struct pool_features pf, char* *result,
3887	unsigned int sz, unsigned int maxlen)
3888	{
3889	unsigned int count = !pf->zero_new_blocks + !pf->discard_enabled +
3890	!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3891	pf->error_if_no_space;
3892	DMEMIT("%u ", count);
3893
3894	if (!pf->zero_new_blocks)
3895	DMEMIT("skip_block_zeroing ");
3896
3897	if (!pf->discard_enabled)
3898	DMEMIT("ignore_discard ");
3899
3900	if (!pf->discard_passdown)
3901	DMEMIT("no_discard_passdown ");
3902
3903	if (pf->mode == PM_READ_ONLY)
3904	DMEMIT("read_only ");
3905
3906	if (pf->error_if_no_space)
3907	DMEMIT("error_if_no_space ");
3908	}
3909
3910	/*
3911	* Status line is:
3912	* <transaction id> <used metadata sectors>/<total metadata sectors>
3913	* <used data sectors>/<total data sectors> <held metadata root>
3914	* <pool mode> <discard config> <no space config> <needs_check>
3915	*/
3916	static void pool_status(struct dm_target *ti, status_type_t type,
3917	unsigned int status_flags, char result, unsigned* int maxlen)
3918	{
3919	int r;
3920	unsigned int sz = `0`;
3921	uint64_t transaction_id;
3922	dm_block_t nr_free_blocks_data;
3923	dm_block_t nr_free_blocks_metadata;
3924	dm_block_t nr_blocks_data;
3925	dm_block_t nr_blocks_metadata;
3926	dm_block_t held_root;
3927	enum pool_mode mode;
3928	char buf[BDEVNAME_SIZE];
3929	char buf2[BDEVNAME_SIZE];
3930	struct pool_c *pt = ti->private;
3931	struct pool *pool = pt->pool;
3932
3933	switch (type) {
3934	case STATUSTYPE_INFO:
3935	if (get_pool_mode(pool) == PM_FAIL) {
3936	DMEMIT("Fail");
3937	break;
3938	}
3939
3940	/ Commit to ensure statistics aren't out-of-date /
3941	if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3942	(void) commit(pool);
3943
3944	r = dm_pool_get_metadata_transaction_id(pmd: pool->pmd, result: &transaction_id);
3945	if (r) {
3946	DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3947	dm_device_name(pool->pool_md), r);
3948	goto err;
3949	}
3950
3951	r = dm_pool_get_free_metadata_block_count(pmd: pool->pmd, result: &nr_free_blocks_metadata);
3952	if (r) {
3953	DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3954	dm_device_name(pool->pool_md), r);
3955	goto err;
3956	}
3957
3958	r = dm_pool_get_metadata_dev_size(pmd: pool->pmd, result: &nr_blocks_metadata);
3959	if (r) {
3960	DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3961	dm_device_name(pool->pool_md), r);
3962	goto err;
3963	}
3964
3965	r = dm_pool_get_free_block_count(pmd: pool->pmd, result: &nr_free_blocks_data);
3966	if (r) {
3967	DMERR("%s: dm_pool_get_free_block_count returned %d",
3968	dm_device_name(pool->pool_md), r);
3969	goto err;
3970	}
3971
3972	r = dm_pool_get_data_dev_size(pmd: pool->pmd, result: &nr_blocks_data);
3973	if (r) {
3974	DMERR("%s: dm_pool_get_data_dev_size returned %d",
3975	dm_device_name(pool->pool_md), r);
3976	goto err;
3977	}
3978
3979	r = dm_pool_get_metadata_snap(pmd: pool->pmd, result: &held_root);
3980	if (r) {
3981	DMERR("%s: dm_pool_get_metadata_snap returned %d",
3982	dm_device_name(pool->pool_md), r);
3983	goto err;
3984	}
3985
3986	DMEMIT("%llu %llu/%llu %llu/%llu ",
3987	(unsigned long long)transaction_id,
3988	(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3989	(unsigned long long)nr_blocks_metadata,
3990	(unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3991	(unsigned long long)nr_blocks_data);
3992
3993	if (held_root)
3994	DMEMIT("%llu ", held_root);
3995	else
3996	DMEMIT("- ");
3997
3998	mode = get_pool_mode(pool);
3999	if (mode == PM_OUT_OF_DATA_SPACE)
4000	DMEMIT("out_of_data_space ");
4001	else if (is_read_only_pool_mode(mode))
4002	DMEMIT("ro ");
4003	else
4004	DMEMIT("rw ");
4005
4006	if (!pool->pf.discard_enabled)
4007	DMEMIT("ignore_discard ");
4008	else if (pool->pf.discard_passdown)
4009	DMEMIT("discard_passdown ");
4010	else
4011	DMEMIT("no_discard_passdown ");
4012
4013	if (pool->pf.error_if_no_space)
4014	DMEMIT("error_if_no_space ");
4015	else
4016	DMEMIT("queue_if_no_space ");
4017
4018	if (dm_pool_metadata_needs_check(pmd: pool->pmd))
4019	DMEMIT("needs_check ");
4020	else
4021	DMEMIT("- ");
4022
4023	DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
4024
4025	break;
4026
4027	case STATUSTYPE_TABLE:
4028	DMEMIT("%s %s %lu %llu ",
4029	format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
4030	format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
4031	(unsigned long)pool->sectors_per_block,
4032	(unsigned long long)pt->low_water_blocks);
4033	emit_flags(pf: &pt->requested_pf, result, sz, maxlen);
4034	break;
4035
4036	case STATUSTYPE_IMA:
4037	*result = `'\0'`;
4038	break;
4039	}
4040	return;
4041
4042	err:
4043	DMEMIT("Error");
4044	}
4045
4046	static int pool_iterate_devices(struct dm_target *ti,
4047	iterate_devices_callout_fn fn, void *data)
4048	{
4049	struct pool_c *pt = ti->private;
4050
4051	return fn(ti, pt->data_dev, `0`, ti->len, data);
4052	}
4053
4054	static void pool_io_hints(struct dm_target ti, struct* queue_limits *limits)
4055	{
4056	struct pool_c *pt = ti->private;
4057	struct pool *pool = pt->pool;
4058	sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
4059
4060	/*
4061	* If max_sectors is smaller than pool->sectors_per_block adjust it
4062	* to the highest possible power-of-2 factor of pool->sectors_per_block.
4063	* This is especially beneficial when the pool's data device is a RAID
4064	* device that has a full stripe width that matches pool->sectors_per_block
4065	* -- because even though partial RAID stripe-sized IOs will be issued to a
4066	* single RAID stripe; when aggregated they will end on a full RAID stripe
4067	* boundary.. which avoids additional partial RAID stripe writes cascading
4068	*/
4069	if (limits->max_sectors < pool->sectors_per_block) {
4070	while (!is_factor(block_size: pool->sectors_per_block, n: limits->max_sectors)) {
4071	if ((limits->max_sectors & (limits->max_sectors - `1`)) == `0`)
4072	limits->max_sectors--;
4073	limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
4074	}
4075	}
4076
4077	/*
4078	* If the system-determined stacked limits are compatible with the
4079	* pool's blocksize (io_opt is a factor) do not override them.
4080	*/
4081	if (io_opt_sectors < pool->sectors_per_block \|\|
4082	!is_factor(block_size: io_opt_sectors, n: pool->sectors_per_block)) {
4083	if (is_factor(block_size: pool->sectors_per_block, n: limits->max_sectors))
4084	blk_limits_io_min(limits, min: limits->max_sectors << SECTOR_SHIFT);
4085	else
4086	blk_limits_io_min(limits, min: pool->sectors_per_block << SECTOR_SHIFT);
4087	blk_limits_io_opt(limits, opt: pool->sectors_per_block << SECTOR_SHIFT);
4088	}
4089
4090	/*
4091	* pt->adjusted_pf is a staging area for the actual features to use.
4092	* They get transferred to the live pool in bind_control_target()
4093	* called from pool_preresume().
4094	*/
4095
4096	if (pt->adjusted_pf.discard_enabled) {
4097	disable_discard_passdown_if_not_supported(pt);
4098	if (!pt->adjusted_pf.discard_passdown)
4099	limits->max_discard_sectors = `0`;
4100	/*
4101	* The pool uses the same discard limits as the underlying data
4102	* device. DM core has already set this up.
4103	*/
4104	} else {
4105	/*
4106	* Must explicitly disallow stacking discard limits otherwise the
4107	* block layer will stack them if pool's data device has support.
4108	*/
4109	limits->discard_granularity = `0`;
4110	}
4111	}
4112
4113	static struct target_type pool_target = {
4114	.name = "thin-pool",
4115	.features = DM_TARGET_SINGLETON \| DM_TARGET_ALWAYS_WRITEABLE \|
4116	DM_TARGET_IMMUTABLE,
4117	.version = {`1`, `23`, `0`},
4118	.module = THIS_MODULE,
4119	.ctr = pool_ctr,
4120	.dtr = pool_dtr,
4121	.map = pool_map,
4122	.presuspend = pool_presuspend,
4123	.presuspend_undo = pool_presuspend_undo,
4124	.postsuspend = pool_postsuspend,
4125	.preresume = pool_preresume,
4126	.resume = pool_resume,
4127	.message = pool_message,
4128	.status = pool_status,
4129	.iterate_devices = pool_iterate_devices,
4130	.io_hints = pool_io_hints,
4131	};
4132
4133	/*
4134	*--------------------------------------------------------------
4135	* Thin target methods
4136	*--------------------------------------------------------------
4137	*/
4138	static void thin_get(struct thin_c *tc)
4139	{
4140	refcount_inc(r: &tc->refcount);
4141	}
4142
4143	static void thin_put(struct thin_c *tc)
4144	{
4145	if (refcount_dec_and_test(r: &tc->refcount))
4146	complete(&tc->can_destroy);
4147	}
4148
4149	static void thin_dtr(struct dm_target *ti)
4150	{
4151	struct thin_c *tc = ti->private;
4152
4153	spin_lock_irq(lock: &tc->pool->lock);
4154	list_del_rcu(entry: &tc->list);
4155	spin_unlock_irq(lock: &tc->pool->lock);
4156	synchronize_rcu();
4157
4158	thin_put(tc);
4159	wait_for_completion(&tc->can_destroy);
4160
4161	mutex_lock(&dm_thin_pool_table.mutex);
4162
4163	__pool_dec(pool: tc->pool);
4164	dm_pool_close_thin_device(td: tc->td);
4165	dm_put_device(ti, d: tc->pool_dev);
4166	if (tc->origin_dev)
4167	dm_put_device(ti, d: tc->origin_dev);
4168	kfree(objp: tc);
4169
4170	mutex_unlock(lock: &dm_thin_pool_table.mutex);
4171	}
4172
4173	/*
4174	* Thin target parameters:
4175	*
4176	* <pool_dev> <dev_id> [origin_dev]
4177	*
4178	* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
4179	* dev_id: the internal device identifier
4180	* origin_dev: a device external to the pool that should act as the origin
4181	*
4182	* If the pool device has discards disabled, they get disabled for the thin
4183	* device as well.
4184	*/
4185	static int thin_ctr(struct dm_target ti, unsigned* int argc, char **argv)
4186	{
4187	int r;
4188	struct thin_c *tc;
4189	struct dm_dev pool_dev, origin_dev;
4190	struct mapped_device *pool_md;
4191
4192	mutex_lock(&dm_thin_pool_table.mutex);
4193
4194	if (argc != `2` && argc != `3`) {
4195	ti->error = "Invalid argument count";
4196	r = -EINVAL;
4197	goto out_unlock;
4198	}
4199
4200	tc = ti->private = kzalloc(size: sizeof(*tc), GFP_KERNEL);
4201	if (!tc) {
4202	ti->error = "Out of memory";
4203	r = -ENOMEM;
4204	goto out_unlock;
4205	}
4206	tc->thin_md = dm_table_get_md(t: ti->table);
4207	spin_lock_init(&tc->lock);
4208	INIT_LIST_HEAD(list: &tc->deferred_cells);
4209	bio_list_init(bl: &tc->deferred_bio_list);
4210	bio_list_init(bl: &tc->retry_on_resume_list);
4211	tc->sort_bio_list = RB_ROOT;
4212
4213	if (argc == `3`) {
4214	if (!strcmp(argv[`0`], argv[`2`])) {
4215	ti->error = "Error setting origin device";
4216	r = -EINVAL;
4217	goto bad_origin_dev;
4218	}
4219
4220	r = dm_get_device(ti, path: argv[`2`], BLK_OPEN_READ, result: &origin_dev);
4221	if (r) {
4222	ti->error = "Error opening origin device";
4223	goto bad_origin_dev;
4224	}
4225	tc->origin_dev = origin_dev;
4226	}
4227
4228	r = dm_get_device(ti, path: argv[`0`], mode: dm_table_get_mode(t: ti->table), result: &pool_dev);
4229	if (r) {
4230	ti->error = "Error opening pool device";
4231	goto bad_pool_dev;
4232	}
4233	tc->pool_dev = pool_dev;
4234
4235	if (read_dev_id(arg: argv[`1`], dev_id: (unsigned long long *)&tc->dev_id, warning: `0`)) {
4236	ti->error = "Invalid device id";
4237	r = -EINVAL;
4238	goto bad_common;
4239	}
4240
4241	pool_md = dm_get_md(dev: tc->pool_dev->bdev->bd_dev);
4242	if (!pool_md) {
4243	ti->error = "Couldn't get pool mapped device";
4244	r = -EINVAL;
4245	goto bad_common;
4246	}
4247
4248	tc->pool = __pool_table_lookup(md: pool_md);
4249	if (!tc->pool) {
4250	ti->error = "Couldn't find pool object";
4251	r = -EINVAL;
4252	goto bad_pool_lookup;
4253	}
4254	__pool_inc(pool: tc->pool);
4255
4256	if (get_pool_mode(pool: tc->pool) == PM_FAIL) {
4257	ti->error = "Couldn't open thin device, Pool is in fail mode";
4258	r = -EINVAL;
4259	goto bad_pool;
4260	}
4261
4262	r = dm_pool_open_thin_device(pmd: tc->pool->pmd, dev: tc->dev_id, td: &tc->td);
4263	if (r) {
4264	ti->error = "Couldn't open thin internal device";
4265	goto bad_pool;
4266	}
4267
4268	r = dm_set_target_max_io_len(ti, len: tc->pool->sectors_per_block);
4269	if (r)
4270	goto bad;
4271
4272	ti->num_flush_bios = `1`;
4273	ti->limit_swap_bios = true;
4274	ti->flush_supported = true;
4275	ti->accounts_remapped_io = true;
4276	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
4277
4278	/ In case the pool supports discards, pass them on. /
4279	if (tc->pool->pf.discard_enabled) {
4280	ti->discards_supported = true;
4281	ti->num_discard_bios = `1`;
4282	ti->max_discard_granularity = true;
4283	}
4284
4285	mutex_unlock(lock: &dm_thin_pool_table.mutex);
4286
4287	spin_lock_irq(lock: &tc->pool->lock);
4288	if (tc->pool->suspended) {
4289	spin_unlock_irq(lock: &tc->pool->lock);
4290	mutex_lock(&dm_thin_pool_table.mutex); / reacquire for __pool_dec /
4291	ti->error = "Unable to activate thin device while pool is suspended";
4292	r = -EINVAL;
4293	goto bad;
4294	}
4295	refcount_set(r: &tc->refcount, n: `1`);
4296	init_completion(x: &tc->can_destroy);
4297	list_add_tail_rcu(new: &tc->list, head: &tc->pool->active_thins);
4298	spin_unlock_irq(lock: &tc->pool->lock);
4299	/*
4300	* This synchronize_rcu() call is needed here otherwise we risk a
4301	* wake_worker() call finding no bios to process (because the newly
4302	* added tc isn't yet visible). So this reduces latency since we
4303	* aren't then dependent on the periodic commit to wake_worker().
4304	*/
4305	synchronize_rcu();
4306
4307	dm_put(md: pool_md);
4308
4309	return `0`;
4310
4311	bad:
4312	dm_pool_close_thin_device(td: tc->td);
4313	bad_pool:
4314	__pool_dec(pool: tc->pool);
4315	bad_pool_lookup:
4316	dm_put(md: pool_md);
4317	bad_common:
4318	dm_put_device(ti, d: tc->pool_dev);
4319	bad_pool_dev:
4320	if (tc->origin_dev)
4321	dm_put_device(ti, d: tc->origin_dev);
4322	bad_origin_dev:
4323	kfree(objp: tc);
4324	out_unlock:
4325	mutex_unlock(lock: &dm_thin_pool_table.mutex);
4326
4327	return r;
4328	}
4329
4330	static int thin_map(struct dm_target ti, struct* bio *bio)
4331	{
4332	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
4333
4334	return thin_bio_map(ti, bio);
4335	}
4336
4337	static int thin_endio(struct dm_target ti, struct* bio *bio,
4338	blk_status_t *err)
4339	{
4340	unsigned long flags;
4341	struct dm_thin_endio_hook h = dm_per_bio_data(bio, data_size: sizeof(struct* dm_thin_endio_hook));
4342	struct list_head work;
4343	struct dm_thin_new_mapping m, tmp;
4344	struct pool *pool = h->tc->pool;
4345
4346	if (h->shared_read_entry) {
4347	INIT_LIST_HEAD(list: &work);
4348	dm_deferred_entry_dec(entry: h->shared_read_entry, head: &work);
4349
4350	spin_lock_irqsave(&pool->lock, flags);
4351	list_for_each_entry_safe(m, tmp, &work, list) {
4352	list_del(entry: &m->list);
4353	__complete_mapping_preparation(m);
4354	}
4355	spin_unlock_irqrestore(lock: &pool->lock, flags);
4356	}
4357
4358	if (h->all_io_entry) {
4359	INIT_LIST_HEAD(list: &work);
4360	dm_deferred_entry_dec(entry: h->all_io_entry, head: &work);
4361	if (!list_empty(head: &work)) {
4362	spin_lock_irqsave(&pool->lock, flags);
4363	list_for_each_entry_safe(m, tmp, &work, list)
4364	list_add_tail(new: &m->list, head: &pool->prepared_discards);
4365	spin_unlock_irqrestore(lock: &pool->lock, flags);
4366	wake_worker(pool);
4367	}
4368	}
4369
4370	if (h->cell)
4371	cell_defer_no_holder(tc: h->tc, cell: h->cell);
4372
4373	return DM_ENDIO_DONE;
4374	}
4375
4376	static void thin_presuspend(struct dm_target *ti)
4377	{
4378	struct thin_c *tc = ti->private;
4379
4380	if (dm_noflush_suspending(ti))
4381	noflush_work(tc, fn: do_noflush_start);
4382	}
4383
4384	static void thin_postsuspend(struct dm_target *ti)
4385	{
4386	struct thin_c *tc = ti->private;
4387
4388	/*
4389	* The dm_noflush_suspending flag has been cleared by now, so
4390	* unfortunately we must always run this.
4391	*/
4392	noflush_work(tc, fn: do_noflush_stop);
4393	}
4394
4395	static int thin_preresume(struct dm_target *ti)
4396	{
4397	struct thin_c *tc = ti->private;
4398
4399	if (tc->origin_dev)
4400	tc->origin_size = get_dev_size(bdev: tc->origin_dev->bdev);
4401
4402	return `0`;
4403	}
4404
4405	/*
4406	* <nr mapped sectors> <highest mapped sector>
4407	*/
4408	static void thin_status(struct dm_target *ti, status_type_t type,
4409	unsigned int status_flags, char result, unsigned* int maxlen)
4410	{
4411	int r;
4412	ssize_t sz = `0`;
4413	dm_block_t mapped, highest;
4414	char buf[BDEVNAME_SIZE];
4415	struct thin_c *tc = ti->private;
4416
4417	if (get_pool_mode(pool: tc->pool) == PM_FAIL) {
4418	DMEMIT("Fail");
4419	return;
4420	}
4421
4422	if (!tc->td)
4423	DMEMIT("-");
4424	else {
4425	switch (type) {
4426	case STATUSTYPE_INFO:
4427	r = dm_thin_get_mapped_count(td: tc->td, result: &mapped);
4428	if (r) {
4429	DMERR("dm_thin_get_mapped_count returned %d", r);
4430	goto err;
4431	}
4432
4433	r = dm_thin_get_highest_mapped_block(td: tc->td, highest_mapped: &highest);
4434	if (r < `0`) {
4435	DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4436	goto err;
4437	}
4438
4439	DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4440	if (r)
4441	DMEMIT("%llu", ((highest + `1`) *
4442	tc->pool->sectors_per_block) - `1`);
4443	else
4444	DMEMIT("-");
4445	break;
4446
4447	case STATUSTYPE_TABLE:
4448	DMEMIT("%s %lu",
4449	format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4450	(unsigned long) tc->dev_id);
4451	if (tc->origin_dev)
4452	DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
4453	break;
4454
4455	case STATUSTYPE_IMA:
4456	*result = `'\0'`;
4457	break;
4458	}
4459	}
4460
4461	return;
4462
4463	err:
4464	DMEMIT("Error");
4465	}
4466
4467	static int thin_iterate_devices(struct dm_target *ti,
4468	iterate_devices_callout_fn fn, void *data)
4469	{
4470	sector_t blocks;
4471	struct thin_c *tc = ti->private;
4472	struct pool *pool = tc->pool;
4473
4474	/*
4475	* We can't call dm_pool_get_data_dev_size() since that blocks. So
4476	* we follow a more convoluted path through to the pool's target.
4477	*/
4478	if (!pool->ti)
4479	return `0`; / nothing is bound /
4480
4481	blocks = pool->ti->len;
4482	(void) sector_div(blocks, pool->sectors_per_block);
4483	if (blocks)
4484	return fn(ti, tc->pool_dev, `0`, pool->sectors_per_block * blocks, data);
4485
4486	return `0`;
4487	}
4488
4489	static void thin_io_hints(struct dm_target ti, struct* queue_limits *limits)
4490	{
4491	struct thin_c *tc = ti->private;
4492	struct pool *pool = tc->pool;
4493
4494	if (pool->pf.discard_enabled) {
4495	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4496	limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
4497	}
4498	}
4499
4500	static struct target_type thin_target = {
4501	.name = "thin",
4502	.version = {`1`, `23`, `0`},
4503	.module = THIS_MODULE,
4504	.ctr = thin_ctr,
4505	.dtr = thin_dtr,
4506	.map = thin_map,
4507	.end_io = thin_endio,
4508	.preresume = thin_preresume,
4509	.presuspend = thin_presuspend,
4510	.postsuspend = thin_postsuspend,
4511	.status = thin_status,
4512	.iterate_devices = thin_iterate_devices,
4513	.io_hints = thin_io_hints,
4514	};
4515
/*----------------------------------------------------------------*/
4517
4518	static int __init dm_thin_init(void)
4519	{
4520	int r = -ENOMEM;
4521
4522	pool_table_init();
4523
4524	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, `0`);
4525	if (!_new_mapping_cache)
4526	return r;
4527
4528	r = dm_register_target(t: &thin_target);
4529	if (r)
4530	goto bad_new_mapping_cache;
4531
4532	r = dm_register_target(t: &pool_target);
4533	if (r)
4534	goto bad_thin_target;
4535
4536	return `0`;
4537
4538	bad_thin_target:
4539	dm_unregister_target(t: &thin_target);
4540	bad_new_mapping_cache:
4541	kmem_cache_destroy(s: _new_mapping_cache);
4542
4543	return r;
4544	}
4545
4546	static void dm_thin_exit(void)
4547	{
4548	dm_unregister_target(t: &thin_target);
4549	dm_unregister_target(t: &pool_target);
4550
4551	kmem_cache_destroy(s: _new_mapping_cache);
4552
4553	pool_table_exit();
4554	}
4555
4556	module_init(dm_thin_init);
4557	module_exit(dm_thin_exit);
4558
4559	module_param_named(no_space_timeout, no_space_timeout_secs, uint, `0644`);
4560	MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
4561
4562	MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4563	MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
4564	MODULE_LICENSE("GPL");
4565

source code of linux/drivers/md/dm-thin.c