request.c source code [linux/drivers/md/bcache/request.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Main bcache entry point - handle a read or a write request and decide what to
4	* do with it; the make_request functions are called by the block layer.
5	*
6	* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
7	* Copyright 2012 Google, Inc.
8	*/
9
10	#include "bcache.h"
11	#include "btree.h"
12	#include "debug.h"
13	#include "request.h"
14	#include "writeback.h"
15
16	#include <linux/module.h>
17	#include <linux/hash.h>
18	#include <linux/random.h>
19	#include <linux/backing-dev.h>
20
21	#include <trace/events/bcache.h>
22
23	#define CUTOFF_CACHE_ADD 95
24	#define CUTOFF_CACHE_READA 90
25
26	struct kmem_cache *bch_search_cache;
27
28	static void bch_data_insert_start(struct closure *cl);
29
30	static unsigned int cache_mode(struct cached_dev *dc)
31	{
32	return BDEV_CACHE_MODE(k: &dc->sb);
33	}
34
35	static bool verify(struct cached_dev *dc)
36	{
37	return dc->verify;
38	}
39
40	static void bio_csum(struct bio bio, struct* bkey *k)
41	{
42	struct bio_vec bv;
43	struct bvec_iter iter;
44	uint64_t csum = `0`;
45
46	bio_for_each_segment(bv, bio, iter) {
47	void *d = bvec_kmap_local(bvec: &bv);
48
49	csum = crc64_be(crc: csum, p: d, len: bv.bv_len);
50	kunmap_local(d);
51	}
52
53	k->ptr[KEY_PTRS(k)] = csum & (~`0ULL` >> `1`);
54	}
55
/* Insert data into cache */
57
58	static void bch_data_insert_keys(struct closure *cl)
59	{
60	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
61	atomic_t *journal_ref = NULL;
62	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
63	int ret;
64
65	if (!op->replace)
66	journal_ref = bch_journal(c: op->c, keys: &op->insert_keys,
67	parent: op->flush_journal ? cl : NULL);
68
69	ret = bch_btree_insert(c: op->c, keys: &op->insert_keys,
70	journal_ref, replace_key);
71	if (ret == -ESRCH) {
72	op->replace_collision = true;
73	} else if (ret) {
74	op->status = BLK_STS_RESOURCE;
75	op->insert_data_done = true;
76	}
77
78	if (journal_ref)
79	atomic_dec_bug(journal_ref);
80
81	if (!op->insert_data_done) {
82	continue_at(cl, bch_data_insert_start, op->wq);
83	return;
84	}
85
86	bch_keylist_free(l: &op->insert_keys);
87	closure_return(cl);
88	}
89
90	static int bch_keylist_realloc(struct keylist l, unsigned* int u64s,
91	struct cache_set *c)
92	{
93	size_t oldsize = bch_keylist_nkeys(l);
94	size_t newsize = oldsize + u64s;
95
96	/*
97	* The journalling code doesn't handle the case where the keys to insert
98	* is bigger than an empty write: If we just return -ENOMEM here,
99	* bch_data_insert_keys() will insert the keys created so far
100	* and finish the rest when the keylist is empty.
101	*/
102	if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset))
103	return -ENOMEM;
104
105	return __bch_keylist_realloc(l, u64s);
106	}
107
108	static void bch_data_invalidate(struct closure *cl)
109	{
110	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
111	struct bio *bio = op->bio;
112
113	pr_debug("invalidating %i sectors from %llu\n",
114	bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);
115
116	while (bio_sectors(bio)) {
117	unsigned int sectors = min(bio_sectors(bio),
118	`1U` << (KEY_SIZE_BITS - `1`));
119
120	if (bch_keylist_realloc(l: &op->insert_keys, u64s: `2`, c: op->c))
121	goto out;
122
123	bio->bi_iter.bi_sector += sectors;
124	bio->bi_iter.bi_size -= sectors << `9`;
125
126	bch_keylist_add(l: &op->insert_keys,
127	k: &KEY(op->inode,
128	bio->bi_iter.bi_sector,
129	sectors));
130	}
131
132	op->insert_data_done = true;
133	/ get in bch_data_insert() /
134	bio_put(bio);
135	out:
136	continue_at(cl, bch_data_insert_keys, op->wq);
137	}
138
139	static void bch_data_insert_error(struct closure *cl)
140	{
141	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
142
143	/*
144	* Our data write just errored, which means we've got a bunch of keys to
145	* insert that point to data that wasn't successfully written.
146	*
147	* We don't have to insert those keys but we still have to invalidate
148	* that region of the cache - so, if we just strip off all the pointers
149	* from the keys we'll accomplish just that.
150	*/
151
152	struct bkey src = op->insert_keys.keys, dst = op->insert_keys.keys;
153
154	while (src != op->insert_keys.top) {
155	struct bkey *n = bkey_next(k: src);
156
157	SET_KEY_PTRS(k: src, v: `0`);
158	memmove(dst, src, bkey_bytes(src));
159
160	dst = bkey_next(k: dst);
161	src = n;
162	}
163
164	op->insert_keys.top = dst;
165
166	bch_data_insert_keys(cl);
167	}
168
169	static void bch_data_insert_endio(struct bio *bio)
170	{
171	struct closure *cl = bio->bi_private;
172	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
173
174	if (bio->bi_status) {
175	/ TODO: We could try to recover from this. /
176	if (op->writeback)
177	op->status = bio->bi_status;
178	else if (!op->replace)
179	set_closure_fn(cl, fn: bch_data_insert_error, wq: op->wq);
180	else
181	set_closure_fn(cl, NULL, NULL);
182	}
183
184	bch_bbio_endio(c: op->c, bio, error: bio->bi_status, m: "writing data to cache");
185	}
186
187	static void bch_data_insert_start(struct closure *cl)
188	{
189	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
190	struct bio bio = op->bio, n;
191
192	if (op->bypass)
193	return bch_data_invalidate(cl);
194
195	if (atomic_sub_return(bio_sectors(bio), v: &op->c->sectors_to_gc) < `0`)
196	wake_up_gc(c: op->c);
197
198	/*
199	* Journal writes are marked REQ_PREFLUSH; if the original write was a
200	* flush, it'll wait on the journal write.
201	*/
202	bio->bi_opf &= ~(REQ_PREFLUSH\|REQ_FUA);
203
204	do {
205	unsigned int i;
206	struct bkey *k;
207	struct bio_set *split = &op->c->bio_split;
208
209	/ 1 for the device pointer and 1 for the chksum /
210	if (bch_keylist_realloc(l: &op->insert_keys,
211	u64s: `3` + (op->csum ? `1` : `0`),
212	c: op->c)) {
213	continue_at(cl, bch_data_insert_keys, op->wq);
214	return;
215	}
216
217	k = op->insert_keys.top;
218	bkey_init(k);
219	SET_KEY_INODE(k, v: op->inode);
220	SET_KEY_OFFSET(k, v: bio->bi_iter.bi_sector);
221
222	if (!bch_alloc_sectors(c: op->c, k, bio_sectors(bio),
223	write_point: op->write_point, write_prio: op->write_prio,
224	wait: op->writeback))
225	goto err;
226
227	n = bio_next_split(bio, sectors: KEY_SIZE(k), GFP_NOIO, bs: split);
228
229	n->bi_end_io = bch_data_insert_endio;
230	n->bi_private = cl;
231
232	if (op->writeback) {
233	SET_KEY_DIRTY(k, v: true);
234
235	for (i = `0`; i < KEY_PTRS(k); i++)
236	SET_GC_MARK(k: PTR_BUCKET(c: op->c, k, ptr: i),
237	GC_MARK_DIRTY);
238	}
239
240	SET_KEY_CSUM(k, v: op->csum);
241	if (KEY_CSUM(k))
242	bio_csum(bio: n, k);
243
244	trace_bcache_cache_insert(k);
245	bch_keylist_push(l: &op->insert_keys);
246
247	n->bi_opf = REQ_OP_WRITE;
248	bch_submit_bbio(bio: n, c: op->c, k, ptr: `0`);
249	} while (n != bio);
250
251	op->insert_data_done = true;
252	continue_at(cl, bch_data_insert_keys, op->wq);
253	return;
254	err:
255	/ bch_alloc_sectors() blocks if s->writeback = true /
256	BUG_ON(op->writeback);
257
258	/*
259	* But if it's not a writeback write we'd rather just bail out if
260	* there aren't any buckets ready to write to - it might take awhile and
261	* we might be starving btree writes for gc or something.
262	*/
263
264	if (!op->replace) {
265	/*
266	* Writethrough write: We can't complete the write until we've
267	* updated the index. But we don't want to delay the write while
268	* we wait for buckets to be freed up, so just invalidate the
269	* rest of the write.
270	*/
271	op->bypass = true;
272	return bch_data_invalidate(cl);
273	} else {
274	/*
275	* From a cache miss, we can just insert the keys for the data
276	* we have written or bail out if we didn't do anything.
277	*/
278	op->insert_data_done = true;
279	bio_put(bio);
280
281	if (!bch_keylist_empty(l: &op->insert_keys))
282	continue_at(cl, bch_data_insert_keys, op->wq);
283	else
284	closure_return(cl);
285	}
286	}
287
288	/**
289	* bch_data_insert - stick some data in the cache
290	* @cl: closure pointer.
291	*
292	* This is the starting point for any data to end up in a cache device; it could
293	* be from a normal write, or a writeback write, or a write to a flash only
294	* volume - it's also used by the moving garbage collector to compact data in
295	* mostly empty buckets.
296	*
297	* It first writes the data to the cache, creating a list of keys to be inserted
298	* (if the data had to be fragmented there will be multiple keys); after the
299	* data is written it calls bch_journal, and after the keys have been added to
300	* the next journal write they're inserted into the btree.
301	*
302	* It inserts the data in op->bio; bi_sector is used for the key offset,
303	* and op->inode is used for the key inode.
304	*
305	* If op->bypass is true, instead of inserting the data it invalidates the
306	* region of the cache represented by op->bio and op->inode.
307	*/
308	void bch_data_insert(struct closure *cl)
309	{
310	struct data_insert_op op = container_of(cl, struct* data_insert_op, cl);
311
312	trace_bcache_write(c: op->c, inode: op->inode, bio: op->bio,
313	writeback: op->writeback, bypass: op->bypass);
314
315	bch_keylist_init(l: &op->insert_keys);
316	bio_get(bio: op->bio);
317	bch_data_insert_start(cl);
318	}
319
320	/*
321	* Congested? Return 0 (not congested) or the limit (in sectors)
322	* beyond which we should bypass the cache due to congestion.
323	*/
324	unsigned int bch_get_congested(const struct cache_set *c)
325	{
326	int i;
327
328	if (!c->congested_read_threshold_us &&
329	!c->congested_write_threshold_us)
330	return `0`;
331
332	i = (local_clock_us() - c->congested_last_us) / `1024`;
333	if (i < `0`)
334	return `0`;
335
336	i += atomic_read(v: &c->congested);
337	if (i >= `0`)
338	return `0`;
339
340	i += CONGESTED_MAX;
341
342	if (i > `0`)
343	i = fract_exp_two(x: i, fract_bits: `6`);
344
345	i -= hweight32(get_random_u32());
346
347	return i > `0` ? i : `1`;
348	}
349
350	static void add_sequential(struct task_struct *t)
351	{
352	ewma_add(t->sequential_io_avg,
353	t->sequential_io, `8`, `0`);
354
355	t->sequential_io = `0`;
356	}
357
358	static struct hlist_head iohash(struct* cached_dev *dc, uint64_t k)
359	{
360	return &dc->io_hash[hash_64(val: k, RECENT_IO_BITS)];
361	}
362
363	static bool check_should_bypass(struct cached_dev dc, struct* bio *bio)
364	{
365	struct cache_set *c = dc->disk.c;
366	unsigned int mode = cache_mode(dc);
367	unsigned int sectors, congested;
368	struct task_struct *task = current;
369	struct io *i;
370
371	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) \|\|
372	c->gc_stats.in_use > CUTOFF_CACHE_ADD \|\|
373	(bio_op(bio) == REQ_OP_DISCARD))
374	goto skip;
375
376	if (mode == CACHE_MODE_NONE \|\|
377	(mode == CACHE_MODE_WRITEAROUND &&
378	op_is_write(op: bio_op(bio))))
379	goto skip;
380
381	/*
382	* If the bio is for read-ahead or background IO, bypass it or
383	* not depends on the following situations,
384	* - If the IO is for meta data, always cache it and no bypass
385	* - If the IO is not meta data, check dc->cache_reada_policy,
386	* BCH_CACHE_READA_ALL: cache it and not bypass
387	* BCH_CACHE_READA_META_ONLY: not cache it and bypass
388	* That is, read-ahead request for metadata always get cached
389	* (eg, for gfs2 or xfs).
390	*/
391	if ((bio->bi_opf & (REQ_RAHEAD\|REQ_BACKGROUND))) {
392	if (!(bio->bi_opf & (REQ_META\|REQ_PRIO)) &&
393	(dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
394	goto skip;
395	}
396
397	if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - `1`) \|\|
398	bio_sectors(bio) & (c->cache->sb.block_size - `1`)) {
399	pr_debug("skipping unaligned io\n");
400	goto skip;
401	}
402
403	if (bypass_torture_test(dc)) {
404	if (get_random_u32_below(ceil: `4`) == `3`)
405	goto skip;
406	else
407	goto rescale;
408	}
409
410	congested = bch_get_congested(c);
411	if (!congested && !dc->sequential_cutoff)
412	goto rescale;
413
414	spin_lock(lock: &dc->io_lock);
415
416	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
417	if (i->last == bio->bi_iter.bi_sector &&
418	time_before(jiffies, i->jiffies))
419	goto found;
420
421	i = list_first_entry(&dc->io_lru, struct io, lru);
422
423	add_sequential(t: task);
424	i->sequential = `0`;
425	found:
426	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
427	i->sequential += bio->bi_iter.bi_size;
428
429	i->last = bio_end_sector(bio);
430	i->jiffies = jiffies + msecs_to_jiffies(m: `5000`);
431	task->sequential_io = i->sequential;
432
433	hlist_del(n: &i->hash);
434	hlist_add_head(n: &i->hash, h: iohash(dc, k: i->last));
435	list_move_tail(list: &i->lru, head: &dc->io_lru);
436
437	spin_unlock(lock: &dc->io_lock);
438
439	sectors = max(task->sequential_io,
440	task->sequential_io_avg) >> `9`;
441
442	if (dc->sequential_cutoff &&
443	sectors >= dc->sequential_cutoff >> `9`) {
444	trace_bcache_bypass_sequential(bio);
445	goto skip;
446	}
447
448	if (congested && sectors >= congested) {
449	trace_bcache_bypass_congested(bio);
450	goto skip;
451	}
452
453	rescale:
454	bch_rescale_priorities(c, bio_sectors(bio));
455	return false;
456	skip:
457	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
458	return true;
459	}
460
/* Cache lookup */
462
463	struct search {
464	/ Stack frame for bio_complete /
465	struct closure cl;
466
467	struct bbio bio;
468	struct bio *orig_bio;
469	struct bio *cache_miss;
470	struct bcache_device *d;
471
472	unsigned int insert_bio_sectors;
473	unsigned int recoverable:`1`;
474	unsigned int write:`1`;
475	unsigned int read_dirty_data:`1`;
476	unsigned int cache_missed:`1`;
477
478	struct block_device *orig_bdev;
479	unsigned long start_time;
480
481	struct btree_op op;
482	struct data_insert_op iop;
483	};
484
485	static void bch_cache_read_endio(struct bio *bio)
486	{
487	struct bbio b = container_of(bio, struct* bbio, bio);
488	struct closure *cl = bio->bi_private;
489	struct search s = container_of(cl, struct* search, cl);
490
491	/*
492	* If the bucket was reused while our bio was in flight, we might have
493	* read the wrong data. Set s->error but not error so it doesn't get
494	* counted against the cache device, but we'll still reread the data
495	* from the backing device.
496	*/
497
498	if (bio->bi_status)
499	s->iop.status = bio->bi_status;
500	else if (!KEY_DIRTY(k: &b->key) &&
501	ptr_stale(c: s->iop.c, k: &b->key, i: `0`)) {
502	atomic_long_inc(v: &s->iop.c->cache_read_races);
503	s->iop.status = BLK_STS_IOERR;
504	}
505
506	bch_bbio_endio(c: s->iop.c, bio, error: bio->bi_status, m: "reading from cache");
507	}
508
509	/*
510	* Read from a single key, handling the initial cache miss if the key starts in
511	* the middle of the bio
512	*/
513	static int cache_lookup_fn(struct btree_op op, struct* btree b, struct* bkey *k)
514	{
515	struct search s = container_of(op, struct* search, op);
516	struct bio n, bio = &s->bio.bio;
517	struct bkey *bio_key;
518	unsigned int ptr;
519
520	if (bkey_cmp(l: k, r: &KEY(s->iop.inode, bio->bi_iter.bi_sector, `0`)) <= `0`)
521	return MAP_CONTINUE;
522
523	if (KEY_INODE(k) != s->iop.inode \|\|
524	KEY_START(k) > bio->bi_iter.bi_sector) {
525	unsigned int bio_sectors = bio_sectors(bio);
526	unsigned int sectors = KEY_INODE(k) == s->iop.inode
527	? min_t(uint64_t, INT_MAX,
528	KEY_START(k) - bio->bi_iter.bi_sector)
529	: INT_MAX;
530	int ret = s->d->cache_miss(b, s, bio, sectors);
531
532	if (ret != MAP_CONTINUE)
533	return ret;
534
535	/ if this was a complete miss we shouldn't get here /
536	BUG_ON(bio_sectors <= sectors);
537	}
538
539	if (!KEY_SIZE(k))
540	return MAP_CONTINUE;
541
542	/ XXX: figure out best pointer - for multiple cache devices /
543	ptr = `0`;
544
545	PTR_BUCKET(c: b->c, k, ptr)->prio = INITIAL_PRIO;
546
547	if (KEY_DIRTY(k))
548	s->read_dirty_data = true;
549
550	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
551	KEY_OFFSET(k) - bio->bi_iter.bi_sector),
552	GFP_NOIO, bs: &s->d->bio_split);
553
554	bio_key = &container_of(n, struct bbio, bio)->key;
555	bch_bkey_copy_single_ptr(dest: bio_key, src: k, i: ptr);
556
557	bch_cut_front(where: &KEY(s->iop.inode, n->bi_iter.bi_sector, `0`), k: bio_key);
558	bch_cut_back(where: &KEY(s->iop.inode, bio_end_sector(n), `0`), k: bio_key);
559
560	n->bi_end_io = bch_cache_read_endio;
561	n->bi_private = &s->cl;
562
563	/*
564	* The bucket we're reading from might be reused while our bio
565	* is in flight, and we could then end up reading the wrong
566	* data.
567	*
568	* We guard against this by checking (in cache_read_endio()) if
569	* the pointer is stale again; if so, we treat it as an error
570	* and reread from the backing device (but we don't pass that
571	* error up anywhere).
572	*/
573
574	__bch_submit_bbio(bio: n, c: b->c);
575	return n == bio ? MAP_DONE : MAP_CONTINUE;
576	}
577
578	static void cache_lookup(struct closure *cl)
579	{
580	struct search s = container_of(cl, struct* search, iop.cl);
581	struct bio *bio = &s->bio.bio;
582	struct cached_dev *dc;
583	int ret;
584
585	bch_btree_op_init(op: &s->op, write_lock_level: -`1`);
586
587	ret = bch_btree_map_keys(op: &s->op, c: s->iop.c,
588	from: &KEY(s->iop.inode, bio->bi_iter.bi_sector, `0`),
589	fn: cache_lookup_fn, MAP_END_KEY);
590	if (ret == -EAGAIN) {
591	continue_at(cl, cache_lookup, bcache_wq);
592	return;
593	}
594
595	/*
596	* We might meet err when searching the btree, If that happens, we will
597	* get negative ret, in this scenario we should not recover data from
598	* backing device (when cache device is dirty) because we don't know
599	* whether bkeys the read request covered are all clean.
600	*
601	* And after that happened, s->iop.status is still its initial value
602	* before we submit s->bio.bio
603	*/
604	if (ret < `0`) {
605	BUG_ON(ret == -EINTR);
606	if (s->d && s->d->c &&
607	!UUID_FLASH_ONLY(k: &s->d->c->uuids[s->d->id])) {
608	dc = container_of(s->d, struct cached_dev, disk);
609	if (dc && atomic_read(v: &dc->has_dirty))
610	s->recoverable = false;
611	}
612	if (!s->iop.status)
613	s->iop.status = BLK_STS_IOERR;
614	}
615
616	closure_return(cl);
617	}
618
/* Common code for the make_request functions */
620
621	static void request_endio(struct bio *bio)
622	{
623	struct closure *cl = bio->bi_private;
624
625	if (bio->bi_status) {
626	struct search s = container_of(cl, struct* search, cl);
627
628	s->iop.status = bio->bi_status;
629	/ Only cache read errors are recoverable /
630	s->recoverable = false;
631	}
632
633	bio_put(bio);
634	closure_put(cl);
635	}
636
637	static void backing_request_endio(struct bio *bio)
638	{
639	struct closure *cl = bio->bi_private;
640
641	if (bio->bi_status) {
642	struct search s = container_of(cl, struct* search, cl);
643	struct cached_dev *dc = container_of(s->d,
644	struct cached_dev, disk);
645	/*
646	* If a bio has REQ_PREFLUSH for writeback mode, it is
647	* speically assembled in cached_dev_write() for a non-zero
648	* write request which has REQ_PREFLUSH. we don't set
649	* s->iop.status by this failure, the status will be decided
650	* by result of bch_data_insert() operation.
651	*/
652	if (unlikely(s->iop.writeback &&
653	bio->bi_opf & REQ_PREFLUSH)) {
654	pr_err("Can't flush %pg: returned bi_status %i\n",
655	dc->bdev, bio->bi_status);
656	} else {
657	/ set to orig_bio->bi_status in bio_complete() /
658	s->iop.status = bio->bi_status;
659	}
660	s->recoverable = false;
661	/ should count I/O error for backing device here /
662	bch_count_backing_io_errors(dc, bio);
663	}
664
665	bio_put(bio);
666	closure_put(cl);
667	}
668
669	static void bio_complete(struct search *s)
670	{
671	if (s->orig_bio) {
672	/ Count on bcache device /
673	bio_end_io_acct_remapped(bio: s->orig_bio, start_time: s->start_time,
674	orig_bdev: s->orig_bdev);
675	trace_bcache_request_end(d: s->d, bio: s->orig_bio);
676	s->orig_bio->bi_status = s->iop.status;
677	bio_endio(s->orig_bio);
678	s->orig_bio = NULL;
679	}
680	}
681
682	static void do_bio_hook(struct search *s,
683	struct bio *orig_bio,
684	bio_end_io_t *end_io_fn)
685	{
686	struct bio *bio = &s->bio.bio;
687
688	bio_init_clone(bdev: orig_bio->bi_bdev, bio, bio_src: orig_bio, GFP_NOIO);
689	/*
690	* bi_end_io can be set separately somewhere else, e.g. the
691	* variants in,
692	* - cache_bio->bi_end_io from cached_dev_cache_miss()
693	* - n->bi_end_io from cache_lookup_fn()
694	*/
695	bio->bi_end_io = end_io_fn;
696	bio->bi_private = &s->cl;
697
698	bio_cnt_set(bio, count: `3`);
699	}
700
701	static void search_free(struct closure *cl)
702	{
703	struct search s = container_of(cl, struct* search, cl);
704
705	atomic_dec(v: &s->iop.c->search_inflight);
706
707	if (s->iop.bio)
708	bio_put(s->iop.bio);
709
710	bio_complete(s);
711	closure_debug_destroy(cl);
712	mempool_free(element: s, pool: &s->iop.c->search);
713	}
714
715	static inline struct search search_alloc(struct* bio *bio,
716	struct bcache_device d, struct* block_device *orig_bdev,
717	unsigned long start_time)
718	{
719	struct search *s;
720
721	s = mempool_alloc(pool: &d->c->search, GFP_NOIO);
722
723	closure_init(cl: &s->cl, NULL);
724	do_bio_hook(s, orig_bio: bio, end_io_fn: request_endio);
725	atomic_inc(v: &d->c->search_inflight);
726
727	s->orig_bio = bio;
728	s->cache_miss = NULL;
729	s->cache_missed = `0`;
730	s->d = d;
731	s->recoverable = `1`;
732	s->write = op_is_write(op: bio_op(bio));
733	s->read_dirty_data = `0`;
734	/ Count on the bcache device /
735	s->orig_bdev = orig_bdev;
736	s->start_time = start_time;
737	s->iop.c = d->c;
738	s->iop.bio = NULL;
739	s->iop.inode = d->id;
740	s->iop.write_point = hash_long((unsigned long) current, `16`);
741	s->iop.write_prio = `0`;
742	s->iop.status = `0`;
743	s->iop.flags = `0`;
744	s->iop.flush_journal = op_is_flush(op: bio->bi_opf);
745	s->iop.wq = bcache_wq;
746
747	return s;
748	}
749
/* Cached devices */
751
752	static void cached_dev_bio_complete(struct closure *cl)
753	{
754	struct search s = container_of(cl, struct* search, cl);
755	struct cached_dev dc = container_of(s->d, struct* cached_dev, disk);
756
757	cached_dev_put(dc);
758	search_free(cl);
759	}
760
/* Process reads */
762
763	static void cached_dev_read_error_done(struct closure *cl)
764	{
765	struct search s = container_of(cl, struct* search, cl);
766
767	if (s->iop.replace_collision)
768	bch_mark_cache_miss_collision(c: s->iop.c, d: s->d);
769
770	if (s->iop.bio)
771	bio_free_pages(bio: s->iop.bio);
772
773	cached_dev_bio_complete(cl);
774	}
775
776	static void cached_dev_read_error(struct closure *cl)
777	{
778	struct search s = container_of(cl, struct* search, cl);
779	struct bio *bio = &s->bio.bio;
780
781	/*
782	* If read request hit dirty data (s->read_dirty_data is true),
783	* then recovery a failed read request from cached device may
784	* get a stale data back. So read failure recovery is only
785	* permitted when read request hit clean data in cache device,
786	* or when cache read race happened.
787	*/
788	if (s->recoverable && !s->read_dirty_data) {
789	/ Retry from the backing device: /
790	trace_bcache_read_retry(bio: s->orig_bio);
791
792	s->iop.status = `0`;
793	do_bio_hook(s, orig_bio: s->orig_bio, end_io_fn: backing_request_endio);
794
795	/ XXX: invalidate cache /
796
797	/ I/O request sent to backing device /
798	closure_bio_submit(c: s->iop.c, bio, cl);
799	}
800
801	continue_at(cl, cached_dev_read_error_done, NULL);
802	}
803
804	static void cached_dev_cache_miss_done(struct closure *cl)
805	{
806	struct search s = container_of(cl, struct* search, cl);
807	struct bcache_device *d = s->d;
808
809	if (s->iop.replace_collision)
810	bch_mark_cache_miss_collision(c: s->iop.c, d: s->d);
811
812	if (s->iop.bio)
813	bio_free_pages(bio: s->iop.bio);
814
815	cached_dev_bio_complete(cl);
816	closure_put(cl: &d->cl);
817	}
818
819	static void cached_dev_read_done(struct closure *cl)
820	{
821	struct search s = container_of(cl, struct* search, cl);
822	struct cached_dev dc = container_of(s->d, struct* cached_dev, disk);
823
824	/*
825	* We had a cache miss; cache_bio now contains data ready to be inserted
826	* into the cache.
827	*
828	* First, we copy the data we just read from cache_bio's bounce buffers
829	* to the buffers the original bio pointed to:
830	*/
831
832	if (s->iop.bio) {
833	bio_reset(bio: s->iop.bio, bdev: s->cache_miss->bi_bdev, opf: REQ_OP_READ);
834	s->iop.bio->bi_iter.bi_sector =
835	s->cache_miss->bi_iter.bi_sector;
836	s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << `9`;
837	bio_clone_blkg_association(dst: s->iop.bio, src: s->cache_miss);
838	bch_bio_map(bio: s->iop.bio, NULL);
839
840	bio_copy_data(dst: s->cache_miss, src: s->iop.bio);
841
842	bio_put(s->cache_miss);
843	s->cache_miss = NULL;
844	}
845
846	if (verify(dc) && s->recoverable && !s->read_dirty_data)
847	bch_data_verify(dc, bio: s->orig_bio);
848
849	closure_get(cl: &dc->disk.cl);
850	bio_complete(s);
851
852	if (s->iop.bio &&
853	!test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
854	BUG_ON(!s->iop.replace);
855	closure_call(cl: &s->iop.cl, fn: bch_data_insert, NULL, parent: cl);
856	}
857
858	continue_at(cl, cached_dev_cache_miss_done, NULL);
859	}
860
861	static void cached_dev_read_done_bh(struct closure *cl)
862	{
863	struct search s = container_of(cl, struct* search, cl);
864	struct cached_dev dc = container_of(s->d, struct* cached_dev, disk);
865
866	bch_mark_cache_accounting(c: s->iop.c, d: s->d,
867	hit: !s->cache_missed, bypass: s->iop.bypass);
868	trace_bcache_read(bio: s->orig_bio, hit: !s->cache_missed, bypass: s->iop.bypass);
869
870	if (s->iop.status)
871	continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
872	else if (s->iop.bio \|\| verify(dc))
873	continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
874	else
875	continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
876	}
877
878	static int cached_dev_cache_miss(struct btree b, struct* search *s,
879	struct bio bio, unsigned* int sectors)
880	{
881	int ret = MAP_CONTINUE;
882	struct cached_dev dc = container_of(s->d, struct* cached_dev, disk);
883	struct bio miss, cache_bio;
884	unsigned int size_limit;
885
886	s->cache_missed = `1`;
887
888	if (s->cache_miss \|\| s->iop.bypass) {
889	miss = bio_next_split(bio, sectors, GFP_NOIO, bs: &s->d->bio_split);
890	ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
891	goto out_submit;
892	}
893
894	/ Limitation for valid replace key size and cache_bio bvecs number /
895	size_limit = min_t(unsigned int, BIO_MAX_VECS * PAGE_SECTORS,
896	(`1` << KEY_SIZE_BITS) - `1`);
897	s->insert_bio_sectors = min3(size_limit, sectors, bio_sectors(bio));
898
899	s->iop.replace_key = KEY(s->iop.inode,
900	bio->bi_iter.bi_sector + s->insert_bio_sectors,
901	s->insert_bio_sectors);
902
903	ret = bch_btree_insert_check_key(b, op: &s->op, check_key: &s->iop.replace_key);
904	if (ret)
905	return ret;
906
907	s->iop.replace = true;
908
909	miss = bio_next_split(bio, sectors: s->insert_bio_sectors, GFP_NOIO,
910	bs: &s->d->bio_split);
911
912	/ btree_search_recurse()'s btree iterator is no good anymore /
913	ret = miss == bio ? MAP_DONE : -EINTR;
914
915	cache_bio = bio_alloc_bioset(bdev: miss->bi_bdev,
916	DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
917	opf: `0`, GFP_NOWAIT, bs: &dc->disk.bio_split);
918	if (!cache_bio)
919	goto out_submit;
920
921	cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
922	cache_bio->bi_iter.bi_size = s->insert_bio_sectors << `9`;
923
924	cache_bio->bi_end_io = backing_request_endio;
925	cache_bio->bi_private = &s->cl;
926
927	bch_bio_map(bio: cache_bio, NULL);
928	if (bch_bio_alloc_pages(bio: cache_bio, __GFP_NOWARN\|GFP_NOIO))
929	goto out_put;
930
931	s->cache_miss = miss;
932	s->iop.bio = cache_bio;
933	bio_get(bio: cache_bio);
934	/ I/O request sent to backing device /
935	closure_bio_submit(c: s->iop.c, bio: cache_bio, cl: &s->cl);
936
937	return ret;
938	out_put:
939	bio_put(cache_bio);
940	out_submit:
941	miss->bi_end_io = backing_request_endio;
942	miss->bi_private = &s->cl;
943	/ I/O request sent to backing device /
944	closure_bio_submit(c: s->iop.c, bio: miss, cl: &s->cl);
945	return ret;
946	}
947
948	static void cached_dev_read(struct cached_dev dc, struct* search *s)
949	{
950	struct closure *cl = &s->cl;
951
952	closure_call(cl: &s->iop.cl, fn: cache_lookup, NULL, parent: cl);
953	continue_at(cl, cached_dev_read_done_bh, NULL);
954	}
955
/* Process writes */
957
958	static void cached_dev_write_complete(struct closure *cl)
959	{
960	struct search s = container_of(cl, struct* search, cl);
961	struct cached_dev dc = container_of(s->d, struct* cached_dev, disk);
962
963	up_read_non_owner(sem: &dc->writeback_lock);
964	cached_dev_bio_complete(cl);
965	}
966
967	static void cached_dev_write(struct cached_dev dc, struct* search *s)
968	{
969	struct closure *cl = &s->cl;
970	struct bio *bio = &s->bio.bio;
971	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, `0`);
972	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), `0`);
973
974	bch_keybuf_check_overlapping(buf: &s->iop.c->moving_gc_keys, start: &start, end: &end);
975
976	down_read_non_owner(sem: &dc->writeback_lock);
977	if (bch_keybuf_check_overlapping(buf: &dc->writeback_keys, start: &start, end: &end)) {
978	/*
979	* We overlap with some dirty data undergoing background
980	* writeback, force this write to writeback
981	*/
982	s->iop.bypass = false;
983	s->iop.writeback = true;
984	}
985
986	/*
987	* Discards aren't _required_ to do anything, so skipping if
988	* check_overlapping returned true is ok
989	*
990	* But check_overlapping drops dirty keys for which io hasn't started,
991	* so we still want to call it.
992	*/
993	if (bio_op(bio) == REQ_OP_DISCARD)
994	s->iop.bypass = true;
995
996	if (should_writeback(dc, bio: s->orig_bio,
997	cache_mode: cache_mode(dc),
998	would_skip: s->iop.bypass)) {
999	s->iop.bypass = false;
1000	s->iop.writeback = true;
1001	}
1002
1003	if (s->iop.bypass) {
1004	s->iop.bio = s->orig_bio;
1005	bio_get(bio: s->iop.bio);
1006
1007	if (bio_op(bio) == REQ_OP_DISCARD &&
1008	!bdev_max_discard_sectors(bdev: dc->bdev))
1009	goto insert_data;
1010
1011	/ I/O request sent to backing device /
1012	bio->bi_end_io = backing_request_endio;
1013	closure_bio_submit(c: s->iop.c, bio, cl);
1014
1015	} else if (s->iop.writeback) {
1016	bch_writeback_add(dc);
1017	s->iop.bio = bio;
1018
1019	if (bio->bi_opf & REQ_PREFLUSH) {
1020	/*
1021	* Also need to send a flush to the backing
1022	* device.
1023	*/
1024	struct bio *flush;
1025
1026	flush = bio_alloc_bioset(bdev: bio->bi_bdev, nr_vecs: `0`,
1027	opf: REQ_OP_WRITE \| REQ_PREFLUSH,
1028	GFP_NOIO, bs: &dc->disk.bio_split);
1029	if (!flush) {
1030	s->iop.status = BLK_STS_RESOURCE;
1031	goto insert_data;
1032	}
1033	flush->bi_end_io = backing_request_endio;
1034	flush->bi_private = cl;
1035	/ I/O request sent to backing device /
1036	closure_bio_submit(c: s->iop.c, bio: flush, cl);
1037	}
1038	} else {
1039	s->iop.bio = bio_alloc_clone(bdev: bio->bi_bdev, bio_src: bio, GFP_NOIO,
1040	bs: &dc->disk.bio_split);
1041	/ I/O request sent to backing device /
1042	bio->bi_end_io = backing_request_endio;
1043	closure_bio_submit(c: s->iop.c, bio, cl);
1044	}
1045
1046	insert_data:
1047	closure_call(cl: &s->iop.cl, fn: bch_data_insert, NULL, parent: cl);
1048	continue_at(cl, cached_dev_write_complete, NULL);
1049	}
1050
1051	static void cached_dev_nodata(struct closure *cl)
1052	{
1053	struct search s = container_of(cl, struct* search, cl);
1054	struct bio *bio = &s->bio.bio;
1055
1056	if (s->iop.flush_journal)
1057	bch_journal_meta(c: s->iop.c, cl);
1058
1059	/ If it's a flush, we send the flush to the backing device too /
1060	bio->bi_end_io = backing_request_endio;
1061	closure_bio_submit(c: s->iop.c, bio, cl);
1062
1063	continue_at(cl, cached_dev_bio_complete, NULL);
1064	}
1065
/*
 * State saved by detached_dev_do_request() while a bio is in flight and
 * restored by detached_dev_end_io().
 */
struct detached_dev_io_private {
	struct bcache_device	*d;
	/* start_time and orig_bdev are passed to bio_end_io_acct_remapped() */
	unsigned long		start_time;
	/* the bio's original completion handler and private data */
	bio_end_io_t		*bi_end_io;
	void			*bi_private;
	struct block_device	*orig_bdev;
};
1073
1074	static void detached_dev_end_io(struct bio *bio)
1075	{
1076	struct detached_dev_io_private *ddip;
1077
1078	ddip = bio->bi_private;
1079	bio->bi_end_io = ddip->bi_end_io;
1080	bio->bi_private = ddip->bi_private;
1081
1082	/ Count on the bcache device /
1083	bio_end_io_acct_remapped(bio, start_time: ddip->start_time, orig_bdev: ddip->orig_bdev);
1084
1085	if (bio->bi_status) {
1086	struct cached_dev *dc = container_of(ddip->d,
1087	struct cached_dev, disk);
1088	/ should count I/O error for backing device here /
1089	bch_count_backing_io_errors(dc, bio);
1090	}
1091
1092	kfree(objp: ddip);
1093	bio->bi_end_io(bio);
1094	}
1095
1096	static void detached_dev_do_request(struct bcache_device d, struct* bio *bio,
1097	struct block_device orig_bdev, unsigned* long start_time)
1098	{
1099	struct detached_dev_io_private *ddip;
1100	struct cached_dev dc = container_of(d, struct* cached_dev, disk);
1101
1102	/*
1103	* no need to call closure_get(&dc->disk.cl),
1104	* because upper layer had already opened bcache device,
1105	* which would call closure_get(&dc->disk.cl)
1106	*/
1107	ddip = kzalloc(size: sizeof(struct detached_dev_io_private), GFP_NOIO);
1108	if (!ddip) {
1109	bio->bi_status = BLK_STS_RESOURCE;
1110	bio->bi_end_io(bio);
1111	return;
1112	}
1113
1114	ddip->d = d;
1115	/ Count on the bcache device /
1116	ddip->orig_bdev = orig_bdev;
1117	ddip->start_time = start_time;
1118	ddip->bi_end_io = bio->bi_end_io;
1119	ddip->bi_private = bio->bi_private;
1120	bio->bi_end_io = detached_dev_end_io;
1121	bio->bi_private = ddip;
1122
1123	if ((bio_op(bio) == REQ_OP_DISCARD) &&
1124	!bdev_max_discard_sectors(bdev: dc->bdev))
1125	bio->bi_end_io(bio);
1126	else
1127	submit_bio_noacct(bio);
1128	}
1129
1130	static void quit_max_writeback_rate(struct cache_set *c,
1131	struct cached_dev *this_dc)
1132	{
1133	int i;
1134	struct bcache_device *d;
1135	struct cached_dev *dc;
1136
1137	/*
1138	* mutex bch_register_lock may compete with other parallel requesters,
1139	* or attach/detach operations on other backing device. Waiting to
1140	* the mutex lock may increase I/O request latency for seconds or more.
1141	* To avoid such situation, if mutext_trylock() failed, only writeback
1142	* rate of current cached device is set to 1, and __update_write_back()
1143	* will decide writeback rate of other cached devices (remember now
1144	* c->idle_counter is 0 already).
1145	*/
1146	if (mutex_trylock(lock: &bch_register_lock)) {
1147	for (i = `0`; i < c->devices_max_used; i++) {
1148	if (!c->devices[i])
1149	continue;
1150
1151	if (UUID_FLASH_ONLY(k: &c->uuids[i]))
1152	continue;
1153
1154	d = c->devices[i];
1155	dc = container_of(d, struct cached_dev, disk);
1156	/*
1157	* set writeback rate to default minimum value,
1158	* then let update_writeback_rate() to decide the
1159	* upcoming rate.
1160	*/
1161	atomic_long_set(v: &dc->writeback_rate.rate, i: `1`);
1162	}
1163	mutex_unlock(lock: &bch_register_lock);
1164	} else
1165	atomic_long_set(v: &this_dc->writeback_rate.rate, i: `1`);
1166	}
1167
/* Cached devices - read & write stuff */
1169
1170	void cached_dev_submit_bio(struct bio *bio)
1171	{
1172	struct search *s;
1173	struct block_device *orig_bdev = bio->bi_bdev;
1174	struct bcache_device *d = orig_bdev->bd_disk->private_data;
1175	struct cached_dev dc = container_of(d, struct* cached_dev, disk);
1176	unsigned long start_time;
1177	int rw = bio_data_dir(bio);
1178
1179	if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) \|\|
1180	dc->io_disable)) {
1181	bio->bi_status = BLK_STS_IOERR;
1182	bio_endio(bio);
1183	return;
1184	}
1185
1186	if (likely(d->c)) {
1187	if (atomic_read(v: &d->c->idle_counter))
1188	atomic_set(v: &d->c->idle_counter, i: `0`);
1189	/*
1190	* If at_max_writeback_rate of cache set is true and new I/O
1191	* comes, quit max writeback rate of all cached devices
1192	* attached to this cache set, and set at_max_writeback_rate
1193	* to false.
1194	*/
1195	if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == `1`)) {
1196	atomic_set(v: &d->c->at_max_writeback_rate, i: `0`);
1197	quit_max_writeback_rate(c: d->c, this_dc: dc);
1198	}
1199	}
1200
1201	start_time = bio_start_io_acct(bio);
1202
1203	bio_set_dev(bio, bdev: dc->bdev);
1204	bio->bi_iter.bi_sector += dc->sb.data_offset;
1205
1206	if (cached_dev_get(dc)) {
1207	s = search_alloc(bio, d, orig_bdev, start_time);
1208	trace_bcache_request_start(d: s->d, bio);
1209
1210	if (!bio->bi_iter.bi_size) {
1211	/*
1212	* can't call bch_journal_meta from under
1213	* submit_bio_noacct
1214	*/
1215	continue_at_nobarrier(&s->cl,
1216	cached_dev_nodata,
1217	bcache_wq);
1218	} else {
1219	s->iop.bypass = check_should_bypass(dc, bio);
1220
1221	if (rw)
1222	cached_dev_write(dc, s);
1223	else
1224	cached_dev_read(dc, s);
1225	}
1226	} else
1227	/ I/O request sent to backing device /
1228	detached_dev_do_request(d, bio, orig_bdev, start_time);
1229	}
1230
1231	static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
1232	unsigned int cmd, unsigned long arg)
1233	{
1234	struct cached_dev dc = container_of(d, struct* cached_dev, disk);
1235
1236	if (dc->io_disable)
1237	return -EIO;
1238	if (!dc->bdev->bd_disk->fops->ioctl)
1239	return -ENOTTY;
1240	return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg);
1241	}
1242
1243	void bch_cached_dev_request_init(struct cached_dev *dc)
1244	{
1245	dc->disk.cache_miss = cached_dev_cache_miss;
1246	dc->disk.ioctl = cached_dev_ioctl;
1247	}
1248
/* Flash backed devices */
1250
1251	static int flash_dev_cache_miss(struct btree b, struct* search *s,
1252	struct bio bio, unsigned* int sectors)
1253	{
1254	unsigned int bytes = min(sectors, bio_sectors(bio)) << `9`;
1255
1256	swap(bio->bi_iter.bi_size, bytes);
1257	zero_fill_bio(bio);
1258	swap(bio->bi_iter.bi_size, bytes);
1259
1260	bio_advance(bio, nbytes: bytes);
1261
1262	if (!bio->bi_iter.bi_size)
1263	return MAP_DONE;
1264
1265	return MAP_CONTINUE;
1266	}
1267
1268	static void flash_dev_nodata(struct closure *cl)
1269	{
1270	struct search s = container_of(cl, struct* search, cl);
1271
1272	if (s->iop.flush_journal)
1273	bch_journal_meta(c: s->iop.c, cl);
1274
1275	continue_at(cl, search_free, NULL);
1276	}
1277
1278	void flash_dev_submit_bio(struct bio *bio)
1279	{
1280	struct search *s;
1281	struct closure *cl;
1282	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
1283
1284	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
1285	bio->bi_status = BLK_STS_IOERR;
1286	bio_endio(bio);
1287	return;
1288	}
1289
1290	s = search_alloc(bio, d, orig_bdev: bio->bi_bdev, start_time: bio_start_io_acct(bio));
1291	cl = &s->cl;
1292	bio = &s->bio.bio;
1293
1294	trace_bcache_request_start(d: s->d, bio);
1295
1296	if (!bio->bi_iter.bi_size) {
1297	/*
1298	* can't call bch_journal_meta from under submit_bio_noacct
1299	*/
1300	continue_at_nobarrier(&s->cl,
1301	flash_dev_nodata,
1302	bcache_wq);
1303	return;
1304	} else if (bio_data_dir(bio)) {
1305	bch_keybuf_check_overlapping(buf: &s->iop.c->moving_gc_keys,
1306	start: &KEY(d->id, bio->bi_iter.bi_sector, `0`),
1307	end: &KEY(d->id, bio_end_sector(bio), `0`));
1308
1309	s->iop.bypass = (bio_op(bio) == REQ_OP_DISCARD) != `0`;
1310	s->iop.writeback = true;
1311	s->iop.bio = bio;
1312
1313	closure_call(cl: &s->iop.cl, fn: bch_data_insert, NULL, parent: cl);
1314	} else {
1315	closure_call(cl: &s->iop.cl, fn: cache_lookup, NULL, parent: cl);
1316	}
1317
1318	continue_at(cl, search_free, NULL);
1319	}
1320
/*
 * ioctl hook for flash backed devices: no ioctl is handled, so every call
 * returns -ENOTTY regardless of its arguments.
 */
static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}
1326
1327	void bch_flash_dev_request_init(struct bcache_device *d)
1328	{
1329	d->cache_miss = flash_dev_cache_miss;
1330	d->ioctl = flash_dev_ioctl;
1331	}
1332
1333	void bch_request_exit(void)
1334	{
1335	kmem_cache_destroy(s: bch_search_cache);
1336	}
1337
1338	int __init bch_request_init(void)
1339	{
1340	bch_search_cache = KMEM_CACHE(search, `0`);
1341	if (!bch_search_cache)
1342	return -ENOMEM;
1343
1344	return `0`;
1345	}
1346

/* source code of linux/drivers/md/bcache/request.c */