blk-core.c source code [linux/block/blk-core.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 1991, 1992 Linus Torvalds
4	* Copyright (C) 1994, Karl Keyte: Added support for disk statistics
5	* Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6	* Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
7	* kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
8	* - July2000
9	* bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10	*/
11
12	/*
13	* This handles all read/write requests to block devices
14	*/
15	#include <linux/kernel.h>
16	#include <linux/module.h>
17	#include <linux/bio.h>
18	#include <linux/blkdev.h>
19	#include <linux/blk-pm.h>
20	#include <linux/blk-integrity.h>
21	#include <linux/highmem.h>
22	#include <linux/mm.h>
23	#include <linux/pagemap.h>
24	#include <linux/kernel_stat.h>
25	#include <linux/string.h>
26	#include <linux/init.h>
27	#include <linux/completion.h>
28	#include <linux/slab.h>
29	#include <linux/swap.h>
30	#include <linux/writeback.h>
31	#include <linux/task_io_accounting_ops.h>
32	#include <linux/fault-inject.h>
33	#include <linux/list_sort.h>
34	#include <linux/delay.h>
35	#include <linux/ratelimit.h>
36	#include <linux/pm_runtime.h>
37	#include <linux/t10-pi.h>
38	#include <linux/debugfs.h>
39	#include <linux/bpf.h>
40	#include <linux/part_stat.h>
41	#include <linux/sched/sysctl.h>
42	#include <linux/blk-crypto.h>
43
44	#define CREATE_TRACE_POINTS
45	#include <trace/events/block.h>
46
47	#include "blk.h"
48	#include "blk-mq-sched.h"
49	#include "blk-pm.h"
50	#include "blk-cgroup.h"
51	#include "blk-throttle.h"
52
struct dentry *blk_debugfs_root;

/* Tracepoints made available to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

/* Allocator for request_queue ids (see blk_alloc_queue()). */
static DEFINE_IDA(blk_queue_ida);

/* Slab cache from which struct request_queue is allocated. */
static struct kmem_cache *blk_requestq_cachep;

/* The kblockd workqueue used by the kblockd_*() helpers. */
static struct workqueue_struct *kblockd_workqueue;
73
74	/**
75	* blk_queue_flag_set - atomically set a queue flag
76	* @flag: flag to be set
77	* @q: request queue
78	*/
79	void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
80	{
81	set_bit(nr: flag, addr: &q->queue_flags);
82	}
83	EXPORT_SYMBOL(blk_queue_flag_set);
84
85	/**
86	* blk_queue_flag_clear - atomically clear a queue flag
87	* @flag: flag to be cleared
88	* @q: request queue
89	*/
90	void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
91	{
92	clear_bit(nr: flag, addr: &q->queue_flags);
93	}
94	EXPORT_SYMBOL(blk_queue_flag_clear);
95
96	/**
97	* blk_queue_flag_test_and_set - atomically test and set a queue flag
98	* @flag: flag to be set
99	* @q: request queue
100	*
101	* Returns the previous value of @flag - 0 if the flag was not set and 1 if
102	* the flag was already set.
103	*/
104	bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
105	{
106	return test_and_set_bit(nr: flag, addr: &q->queue_flags);
107	}
108	EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
109
110	#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
111	static const char *const blk_op_name[] = {
112	REQ_OP_NAME(READ),
113	REQ_OP_NAME(WRITE),
114	REQ_OP_NAME(FLUSH),
115	REQ_OP_NAME(DISCARD),
116	REQ_OP_NAME(SECURE_ERASE),
117	REQ_OP_NAME(ZONE_RESET),
118	REQ_OP_NAME(ZONE_RESET_ALL),
119	REQ_OP_NAME(ZONE_OPEN),
120	REQ_OP_NAME(ZONE_CLOSE),
121	REQ_OP_NAME(ZONE_FINISH),
122	REQ_OP_NAME(ZONE_APPEND),
123	REQ_OP_NAME(WRITE_ZEROES),
124	REQ_OP_NAME(DRV_IN),
125	REQ_OP_NAME(DRV_OUT),
126	};
127	#undef REQ_OP_NAME
128
129	/**
130	* blk_op_str - Return string XXX in the REQ_OP_XXX.
131	* @op: REQ_OP_XXX.
132	*
133	* Description: Centralize block layer function to convert REQ_OP_XXX into
134	* string format. Useful in the debugging and tracing bio or request. For
135	* invalid REQ_OP_XXX it returns string "UNKNOWN".
136	*/
137	inline const char blk_op_str(enum* req_op op)
138	{
139	const char *op_str = "UNKNOWN";
140
141	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
142	op_str = blk_op_name[op];
143
144	return op_str;
145	}
146	EXPORT_SYMBOL_GPL(blk_op_str);
147
148	static const struct {
149	int errno;
150	const char *name;
151	} blk_errors[] = {
152	[BLK_STS_OK] = { `0`, "" },
153	[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
154	[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
155	[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
156	[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
157	[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
158	[BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" },
159	[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
160	[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
161	[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
162	[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
163	[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
164	[BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
165
166	/ device mapper special case, should not leak out: /
167	[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
168
169	/ zone device specific errors /
170	[BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
171	[BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
172
173	/ Command duration limit device-side timeout /
174	[BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" },
175
176	/ everything else not covered above: /
177	[BLK_STS_IOERR] = { -EIO, "I/O" },
178	};
179
180	blk_status_t errno_to_blk_status(int errno)
181	{
182	int i;
183
184	for (i = `0`; i < ARRAY_SIZE(blk_errors); i++) {
185	if (blk_errors[i].errno == errno)
186	return (__force blk_status_t)i;
187	}
188
189	return BLK_STS_IOERR;
190	}
191	EXPORT_SYMBOL_GPL(errno_to_blk_status);
192
193	int blk_status_to_errno(blk_status_t status)
194	{
195	int idx = (__force int)status;
196
197	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
198	return -EIO;
199	return blk_errors[idx].errno;
200	}
201	EXPORT_SYMBOL_GPL(blk_status_to_errno);
202
203	const char *blk_status_to_str(blk_status_t status)
204	{
205	int idx = (__force int)status;
206
207	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
208	return "<null>";
209	return blk_errors[idx].name;
210	}
211	EXPORT_SYMBOL_GPL(blk_status_to_str);
212
213	/**
214	* blk_sync_queue - cancel any pending callbacks on a queue
215	* @q: the queue
216	*
217	* Description:
218	* The block layer may perform asynchronous callback activity
219	* on a queue, such as calling the unplug function after a timeout.
220	* A block device may call blk_sync_queue to ensure that any
221	* such activity is cancelled, thus allowing it to release resources
222	* that the callbacks might use. The caller must already have made sure
223	* that its ->submit_bio will not re-add plugging prior to calling
224	* this function.
225	*
226	* This function does not cancel any asynchronous activity arising
227	* out of elevator or throttling code. That would require elevator_exit()
228	* and blkcg_exit_queue() to be called with queue lock initialized.
229	*
230	*/
231	void blk_sync_queue(struct request_queue *q)
232	{
233	del_timer_sync(timer: &q->timeout);
234	cancel_work_sync(work: &q->timeout_work);
235	}
236	EXPORT_SYMBOL(blk_sync_queue);
237
238	/**
239	* blk_set_pm_only - increment pm_only counter
240	* @q: request queue pointer
241	*/
242	void blk_set_pm_only(struct request_queue *q)
243	{
244	atomic_inc(v: &q->pm_only);
245	}
246	EXPORT_SYMBOL_GPL(blk_set_pm_only);
247
248	void blk_clear_pm_only(struct request_queue *q)
249	{
250	int pm_only;
251
252	pm_only = atomic_dec_return(v: &q->pm_only);
253	WARN_ON_ONCE(pm_only < `0`);
254	if (pm_only == `0`)
255	wake_up_all(&q->mq_freeze_wq);
256	}
257	EXPORT_SYMBOL_GPL(blk_clear_pm_only);
258
259	static void blk_free_queue_rcu(struct rcu_head *rcu_head)
260	{
261	struct request_queue *q = container_of(rcu_head,
262	struct request_queue, rcu_head);
263
264	percpu_ref_exit(ref: &q->q_usage_counter);
265	kmem_cache_free(s: blk_requestq_cachep, objp: q);
266	}
267
268	static void blk_free_queue(struct request_queue *q)
269	{
270	blk_free_queue_stats(q->stats);
271	if (queue_is_mq(q))
272	blk_mq_release(q);
273
274	ida_free(&blk_queue_ida, id: q->id);
275	call_rcu(head: &q->rcu_head, func: blk_free_queue_rcu);
276	}
277
278	/**
279	* blk_put_queue - decrement the request_queue refcount
280	* @q: the request_queue structure to decrement the refcount for
281	*
282	* Decrements the refcount of the request_queue and free it when the refcount
283	* reaches 0.
284	*/
285	void blk_put_queue(struct request_queue *q)
286	{
287	if (refcount_dec_and_test(r: &q->refs))
288	blk_free_queue(q);
289	}
290	EXPORT_SYMBOL(blk_put_queue);
291
292	void blk_queue_start_drain(struct request_queue *q)
293	{
294	/*
295	* When queue DYING flag is set, we need to block new req
296	* entering queue, so we call blk_freeze_queue_start() to
297	* prevent I/O from crossing blk_queue_enter().
298	*/
299	blk_freeze_queue_start(q);
300	if (queue_is_mq(q))
301	blk_mq_wake_waiters(q);
302	/ Make blk_queue_enter() reexamine the DYING flag. /
303	wake_up_all(&q->mq_freeze_wq);
304	}
305
306	/**
307	* blk_queue_enter() - try to increase q->q_usage_counter
308	* @q: request queue pointer
309	* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
310	*/
311	int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
312	{
313	const bool pm = flags & BLK_MQ_REQ_PM;
314
315	while (!blk_try_enter_queue(q, pm)) {
316	if (flags & BLK_MQ_REQ_NOWAIT)
317	return -EAGAIN;
318
319	/*
320	* read pair of barrier in blk_freeze_queue_start(), we need to
321	* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
322	* reading .mq_freeze_depth or queue dying flag, otherwise the
323	* following wait may never return if the two reads are
324	* reordered.
325	*/
326	smp_rmb();
327	wait_event(q->mq_freeze_wq,
328	(!q->mq_freeze_depth &&
329	blk_pm_resume_queue(pm, q)) \|\|
330	blk_queue_dying(q));
331	if (blk_queue_dying(q))
332	return -ENODEV;
333	}
334
335	return `0`;
336	}
337
338	int __bio_queue_enter(struct request_queue q, struct* bio *bio)
339	{
340	while (!blk_try_enter_queue(q, pm: false)) {
341	struct gendisk *disk = bio->bi_bdev->bd_disk;
342
343	if (bio->bi_opf & REQ_NOWAIT) {
344	if (test_bit(GD_DEAD, &disk->state))
345	goto dead;
346	bio_wouldblock_error(bio);
347	return -EAGAIN;
348	}
349
350	/*
351	* read pair of barrier in blk_freeze_queue_start(), we need to
352	* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
353	* reading .mq_freeze_depth or queue dying flag, otherwise the
354	* following wait may never return if the two reads are
355	* reordered.
356	*/
357	smp_rmb();
358	wait_event(q->mq_freeze_wq,
359	(!q->mq_freeze_depth &&
360	blk_pm_resume_queue(false, q)) \|\|
361	test_bit(GD_DEAD, &disk->state));
362	if (test_bit(GD_DEAD, &disk->state))
363	goto dead;
364	}
365
366	return `0`;
367	dead:
368	bio_io_error(bio);
369	return -ENODEV;
370	}
371
372	void blk_queue_exit(struct request_queue *q)
373	{
374	percpu_ref_put(ref: &q->q_usage_counter);
375	}
376
377	static void blk_queue_usage_counter_release(struct percpu_ref *ref)
378	{
379	struct request_queue *q =
380	container_of(ref, struct request_queue, q_usage_counter);
381
382	wake_up_all(&q->mq_freeze_wq);
383	}
384
385	static void blk_rq_timed_out_timer(struct timer_list *t)
386	{
387	struct request_queue *q = from_timer(q, t, timeout);
388
389	kblockd_schedule_work(work: &q->timeout_work);
390	}
391
/*
 * Default handler installed on q->timeout_work by blk_alloc_queue().
 * Intentionally does nothing.
 */
static void blk_timeout_work(struct work_struct *work)
{
}
395
396	struct request_queue blk_alloc_queue(int* node_id)
397	{
398	struct request_queue *q;
399
400	q = kmem_cache_alloc_node(s: blk_requestq_cachep, GFP_KERNEL \| __GFP_ZERO,
401	node: node_id);
402	if (!q)
403	return NULL;
404
405	q->last_merge = NULL;
406
407	q->id = ida_alloc(ida: &blk_queue_ida, GFP_KERNEL);
408	if (q->id < `0`)
409	goto fail_q;
410
411	q->stats = blk_alloc_queue_stats();
412	if (!q->stats)
413	goto fail_id;
414
415	q->node = node_id;
416
417	atomic_set(v: &q->nr_active_requests_shared_tags, i: `0`);
418
419	timer_setup(&q->timeout, blk_rq_timed_out_timer, `0`);
420	INIT_WORK(&q->timeout_work, blk_timeout_work);
421	INIT_LIST_HEAD(list: &q->icq_list);
422
423	refcount_set(r: &q->refs, n: `1`);
424	mutex_init(&q->debugfs_mutex);
425	mutex_init(&q->sysfs_lock);
426	mutex_init(&q->sysfs_dir_lock);
427	mutex_init(&q->rq_qos_mutex);
428	spin_lock_init(&q->queue_lock);
429
430	init_waitqueue_head(&q->mq_freeze_wq);
431	mutex_init(&q->mq_freeze_lock);
432
433	/*
434	* Init percpu_ref in atomic mode so that it's faster to shutdown.
435	* See blk_register_queue() for details.
436	*/
437	if (percpu_ref_init(ref: &q->q_usage_counter,
438	release: blk_queue_usage_counter_release,
439	flags: PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
440	goto fail_stats;
441
442	blk_set_default_limits(lim: &q->limits);
443	q->nr_requests = BLKDEV_DEFAULT_RQ;
444
445	return q;
446
447	fail_stats:
448	blk_free_queue_stats(q->stats);
449	fail_id:
450	ida_free(&blk_queue_ida, id: q->id);
451	fail_q:
452	kmem_cache_free(s: blk_requestq_cachep, objp: q);
453	return NULL;
454	}
455
456	/**
457	* blk_get_queue - increment the request_queue refcount
458	* @q: the request_queue structure to increment the refcount for
459	*
460	* Increment the refcount of the request_queue kobject.
461	*
462	* Context: Any context.
463	*/
464	bool blk_get_queue(struct request_queue *q)
465	{
466	if (unlikely(blk_queue_dying(q)))
467	return false;
468	refcount_inc(r: &q->refs);
469	return true;
470	}
471	EXPORT_SYMBOL(blk_get_queue);
472
473	#ifdef CONFIG_FAIL_MAKE_REQUEST
474
475	static DECLARE_FAULT_ATTR(fail_make_request);
476
477	static int __init setup_fail_make_request(char *str)
478	{
479	return setup_fault_attr(attr: &fail_make_request, str);
480	}
481	__setup("fail_make_request=", setup_fail_make_request);
482
483	bool should_fail_request(struct block_device part, unsigned* int bytes)
484	{
485	return part->bd_make_it_fail && should_fail(attr: &fail_make_request, size: bytes);
486	}
487
488	static int __init fail_make_request_debugfs(void)
489	{
490	struct dentry *dir = fault_create_debugfs_attr(name: "fail_make_request",
491	NULL, attr: &fail_make_request);
492
493	return PTR_ERR_OR_ZERO(ptr: dir);
494	}
495
496	late_initcall(fail_make_request_debugfs);
497	#endif /* CONFIG_FAIL_MAKE_REQUEST */
498
499	static inline void bio_check_ro(struct bio *bio)
500	{
501	if (op_is_write(op: bio_op(bio)) && bdev_read_only(bdev: bio->bi_bdev)) {
502	if (op_is_flush(op: bio->bi_opf) && !bio_sectors(bio))
503	return;
504	pr_warn("Trying to write to read-only block-device %pg\n",
505	bio->bi_bdev);
506	/ Older lvm-tools actually trigger this /
507	}
508	}
509
510	static noinline int should_fail_bio(struct bio *bio)
511	{
512	if (should_fail_request(bdev_whole(bio->bi_bdev), bytes: bio->bi_iter.bi_size))
513	return -EIO;
514	return `0`;
515	}
516	ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
517
518	/*
519	* Check whether this bio extends beyond the end of the device or partition.
520	* This may well happen - the kernel calls bread() without checking the size of
521	* the device, e.g., when mounting a file system.
522	*/
523	static inline int bio_check_eod(struct bio *bio)
524	{
525	sector_t maxsector = bdev_nr_sectors(bdev: bio->bi_bdev);
526	unsigned int nr_sectors = bio_sectors(bio);
527
528	if (nr_sectors &&
529	(nr_sectors > maxsector \|\|
530	bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
531	pr_info_ratelimited("%s: attempt to access beyond end of device\n"
532	"%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
533	current->comm, bio->bi_bdev, bio->bi_opf,
534	bio->bi_iter.bi_sector, nr_sectors, maxsector);
535	return -EIO;
536	}
537	return `0`;
538	}
539
540	/*
541	* Remap block n of partition p to block n+start(p) of the disk.
542	*/
543	static int blk_partition_remap(struct bio *bio)
544	{
545	struct block_device *p = bio->bi_bdev;
546
547	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
548	return -EIO;
549	if (bio_sectors(bio)) {
550	bio->bi_iter.bi_sector += p->bd_start_sect;
551	trace_block_bio_remap(bio, dev: p->bd_dev,
552	from: bio->bi_iter.bi_sector -
553	p->bd_start_sect);
554	}
555	bio_set_flag(bio, bit: BIO_REMAPPED);
556	return `0`;
557	}
558
559	/*
560	* Check write append to a zoned block device.
561	*/
562	static inline blk_status_t blk_check_zone_append(struct request_queue *q,
563	struct bio *bio)
564	{
565	int nr_sectors = bio_sectors(bio);
566
567	/ Only applicable to zoned block devices /
568	if (!bdev_is_zoned(bdev: bio->bi_bdev))
569	return BLK_STS_NOTSUPP;
570
571	/ The bio sector must point to the start of a sequential zone /
572	if (!bdev_is_zone_start(bdev: bio->bi_bdev, sector: bio->bi_iter.bi_sector) \|\|
573	!bio_zone_is_seq(bio))
574	return BLK_STS_IOERR;
575
576	/*
577	* Not allowed to cross zone boundaries. Otherwise, the BIO will be
578	* split and could result in non-contiguous sectors being written in
579	* different zones.
580	*/
581	if (nr_sectors > q->limits.chunk_sectors)
582	return BLK_STS_IOERR;
583
584	/ Make sure the BIO is small enough and will not get split /
585	if (nr_sectors > q->limits.max_zone_append_sectors)
586	return BLK_STS_IOERR;
587
588	bio->bi_opf \|= REQ_NOMERGE;
589
590	return BLK_STS_OK;
591	}
592
593	static void __submit_bio(struct bio *bio)
594	{
595	if (unlikely(!blk_crypto_bio_prep(&bio)))
596	return;
597
598	if (!bio->bi_bdev->bd_has_submit_bio) {
599	blk_mq_submit_bio(bio);
600	} else if (likely(bio_queue_enter(bio) == `0`)) {
601	struct gendisk *disk = bio->bi_bdev->bd_disk;
602
603	disk->fops->submit_bio(bio);
604	blk_queue_exit(q: disk->queue);
605	}
606	}
607
608	/*
609	* The loop in this function may be a bit non-obvious, and so deserves some
610	* explanation:
611	*
612	* - Before entering the loop, bio->bi_next is NULL (as all callers ensure
613	* that), so we have a list with a single bio.
614	* - We pretend that we have just taken it off a longer list, so we assign
615	* bio_list to a pointer to the bio_list_on_stack, thus initialising the
616	* bio_list of new bios to be added. ->submit_bio() may indeed add some more
617	* bios through a recursive call to submit_bio_noacct. If it did, we find a
618	* non-NULL value in bio_list and re-enter the loop from the top.
619	* - In this case we really did just take the bio of the top of the list (no
620	* pretending) and so remove it from bio_list, and call into ->submit_bio()
621	* again.
622	*
623	* bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
624	* bio_list_on_stack[1] contains bios that were submitted before the current
625	* ->submit_bio, but that haven't been processed yet.
626	*/
627	static void __submit_bio_noacct(struct bio *bio)
628	{
629	struct bio_list bio_list_on_stack[`2`];
630
631	BUG_ON(bio->bi_next);
632
633	bio_list_init(bl: &bio_list_on_stack[`0`]);
634	current->bio_list = bio_list_on_stack;
635
636	do {
637	struct request_queue *q = bdev_get_queue(bdev: bio->bi_bdev);
638	struct bio_list lower, same;
639
640	/*
641	* Create a fresh bio_list for all subordinate requests.
642	*/
643	bio_list_on_stack[`1`] = bio_list_on_stack[`0`];
644	bio_list_init(bl: &bio_list_on_stack[`0`]);
645
646	__submit_bio(bio);
647
648	/*
649	* Sort new bios into those for a lower level and those for the
650	* same level.
651	*/
652	bio_list_init(bl: &lower);
653	bio_list_init(bl: &same);
654	while ((bio = bio_list_pop(bl: &bio_list_on_stack[`0`])) != NULL)
655	if (q == bdev_get_queue(bdev: bio->bi_bdev))
656	bio_list_add(bl: &same, bio);
657	else
658	bio_list_add(bl: &lower, bio);
659
660	/*
661	* Now assemble so we handle the lowest level first.
662	*/
663	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &lower);
664	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &same);
665	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &bio_list_on_stack[`1`]);
666	} while ((bio = bio_list_pop(bl: &bio_list_on_stack[`0`])));
667
668	current->bio_list = NULL;
669	}
670
671	static void __submit_bio_noacct_mq(struct bio *bio)
672	{
673	struct bio_list bio_list[`2`] = { };
674
675	current->bio_list = bio_list;
676
677	do {
678	__submit_bio(bio);
679	} while ((bio = bio_list_pop(bl: &bio_list[`0`])));
680
681	current->bio_list = NULL;
682	}
683
684	void submit_bio_noacct_nocheck(struct bio *bio)
685	{
686	blk_cgroup_bio_start(bio);
687	blkcg_bio_issue_init(bio);
688
689	if (!bio_flagged(bio, bit: BIO_TRACE_COMPLETION)) {
690	trace_block_bio_queue(bio);
691	/*
692	* Now that enqueuing has been traced, we need to trace
693	* completion as well.
694	*/
695	bio_set_flag(bio, bit: BIO_TRACE_COMPLETION);
696	}
697
698	/*
699	* We only want one ->submit_bio to be active at a time, else stack
700	* usage with stacked devices could be a problem. Use current->bio_list
701	* to collect a list of requests submited by a ->submit_bio method while
702	* it is active, and then process them after it returned.
703	*/
704	if (current->bio_list)
705	bio_list_add(bl: &current->bio_list[`0`], bio);
706	else if (!bio->bi_bdev->bd_has_submit_bio)
707	__submit_bio_noacct_mq(bio);
708	else
709	__submit_bio_noacct(bio);
710	}
711
712	/**
713	* submit_bio_noacct - re-submit a bio to the block device layer for I/O
714	* @bio: The bio describing the location in memory and on the device.
715	*
716	* This is a version of submit_bio() that shall only be used for I/O that is
717	* resubmitted to lower level drivers by stacking block drivers. All file
718	* systems and other upper level users of the block layer should use
719	* submit_bio() instead.
720	*/
721	void submit_bio_noacct(struct bio *bio)
722	{
723	struct block_device *bdev = bio->bi_bdev;
724	struct request_queue *q = bdev_get_queue(bdev);
725	blk_status_t status = BLK_STS_IOERR;
726
727	might_sleep();
728
729	/*
730	* For a REQ_NOWAIT based request, return -EOPNOTSUPP
731	* if queue does not support NOWAIT.
732	*/
733	if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
734	goto not_supported;
735
736	if (should_fail_bio(bio))
737	goto end_io;
738	bio_check_ro(bio);
739	if (!bio_flagged(bio, bit: BIO_REMAPPED)) {
740	if (unlikely(bio_check_eod(bio)))
741	goto end_io;
742	if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
743	goto end_io;
744	}
745
746	/*
747	* Filter flush bio's early so that bio based drivers without flush
748	* support don't have to worry about them.
749	*/
750	if (op_is_flush(op: bio->bi_opf)) {
751	if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
752	bio_op(bio) != REQ_OP_ZONE_APPEND))
753	goto end_io;
754	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
755	bio->bi_opf &= ~(REQ_PREFLUSH \| REQ_FUA);
756	if (!bio_sectors(bio)) {
757	status = BLK_STS_OK;
758	goto end_io;
759	}
760	}
761	}
762
763	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
764	bio_clear_polled(bio);
765
766	switch (bio_op(bio)) {
767	case REQ_OP_DISCARD:
768	if (!bdev_max_discard_sectors(bdev))
769	goto not_supported;
770	break;
771	case REQ_OP_SECURE_ERASE:
772	if (!bdev_max_secure_erase_sectors(bdev))
773	goto not_supported;
774	break;
775	case REQ_OP_ZONE_APPEND:
776	status = blk_check_zone_append(q, bio);
777	if (status != BLK_STS_OK)
778	goto end_io;
779	break;
780	case REQ_OP_ZONE_RESET:
781	case REQ_OP_ZONE_OPEN:
782	case REQ_OP_ZONE_CLOSE:
783	case REQ_OP_ZONE_FINISH:
784	if (!bdev_is_zoned(bdev: bio->bi_bdev))
785	goto not_supported;
786	break;
787	case REQ_OP_ZONE_RESET_ALL:
788	if (!bdev_is_zoned(bdev: bio->bi_bdev) \|\| !blk_queue_zone_resetall(q))
789	goto not_supported;
790	break;
791	case REQ_OP_WRITE_ZEROES:
792	if (!q->limits.max_write_zeroes_sectors)
793	goto not_supported;
794	break;
795	default:
796	break;
797	}
798
799	if (blk_throtl_bio(bio))
800	return;
801	submit_bio_noacct_nocheck(bio);
802	return;
803
804	not_supported:
805	status = BLK_STS_NOTSUPP;
806	end_io:
807	bio->bi_status = status;
808	bio_endio(bio);
809	}
810	EXPORT_SYMBOL(submit_bio_noacct);
811
812	/**
813	* submit_bio - submit a bio to the block device layer for I/O
814	* @bio: The &struct bio which describes the I/O
815	*
816	* submit_bio() is used to submit I/O requests to block devices. It is passed a
817	* fully set up &struct bio that describes the I/O that needs to be done. The
818	* bio will be send to the device described by the bi_bdev field.
819	*
820	* The success/failure status of the request, along with notification of
821	* completion, is delivered asynchronously through the ->bi_end_io() callback
822	* in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
823	* been called.
824	*/
825	void submit_bio(struct bio *bio)
826	{
827	if (bio_op(bio) == REQ_OP_READ) {
828	task_io_account_read(bytes: bio->bi_iter.bi_size);
829	count_vm_events(item: PGPGIN, bio_sectors(bio));
830	} else if (bio_op(bio) == REQ_OP_WRITE) {
831	count_vm_events(item: PGPGOUT, bio_sectors(bio));
832	}
833
834	submit_bio_noacct(bio);
835	}
836	EXPORT_SYMBOL(submit_bio);
837
838	/**
839	* bio_poll - poll for BIO completions
840	* @bio: bio to poll for
841	* @iob: batches of IO
842	* @flags: BLK_POLL_* flags that control the behavior
843	*
844	* Poll for completions on queue associated with the bio. Returns number of
845	* completed entries found.
846	*
847	* Note: the caller must either be the context that submitted @bio, or
848	* be in a RCU critical section to prevent freeing of @bio.
849	*/
850	int bio_poll(struct bio bio, struct* io_comp_batch iob, unsigned* int flags)
851	{
852	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
853	struct block_device *bdev;
854	struct request_queue *q;
855	int ret = `0`;
856
857	bdev = READ_ONCE(bio->bi_bdev);
858	if (!bdev)
859	return `0`;
860
861	q = bdev_get_queue(bdev);
862	if (cookie == BLK_QC_T_NONE \|\|
863	!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
864	return `0`;
865
866	/*
867	* As the requests that require a zone lock are not plugged in the
868	* first place, directly accessing the plug instead of using
869	* blk_mq_plug() should not have any consequences during flushing for
870	* zoned devices.
871	*/
872	blk_flush_plug(current->plug, async: false);
873
874	/*
875	* We need to be able to enter a frozen queue, similar to how
876	* timeouts also need to do that. If that is blocked, then we can
877	* have pending IO when a queue freeze is started, and then the
878	* wait for the freeze to finish will wait for polled requests to
879	* timeout as the poller is preventer from entering the queue and
880	* completing them. As long as we prevent new IO from being queued,
881	* that should be all that matters.
882	*/
883	if (!percpu_ref_tryget(ref: &q->q_usage_counter))
884	return `0`;
885	if (queue_is_mq(q)) {
886	ret = blk_mq_poll(q, cookie, iob, flags);
887	} else {
888	struct gendisk *disk = q->disk;
889
890	if (disk && disk->fops->poll_bio)
891	ret = disk->fops->poll_bio(bio, iob, flags);
892	}
893	blk_queue_exit(q);
894	return ret;
895	}
896	EXPORT_SYMBOL_GPL(bio_poll);
897
898	/*
899	* Helper to implement file_operations.iopoll. Requires the bio to be stored
900	* in iocb->private, and cleared before freeing the bio.
901	*/
902	int iocb_bio_iopoll(struct kiocb kiocb, struct* io_comp_batch *iob,
903	unsigned int flags)
904	{
905	struct bio *bio;
906	int ret = `0`;
907
908	/*
909	* Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
910	* point to a freshly allocated bio at this point. If that happens
911	* we have a few cases to consider:
912	*
913	* 1) the bio is beeing initialized and bi_bdev is NULL. We can just
914	* simply nothing in this case
915	* 2) the bio points to a not poll enabled device. bio_poll will catch
916	* this and return 0
917	* 3) the bio points to a poll capable device, including but not
918	* limited to the one that the original bio pointed to. In this
919	* case we will call into the actual poll method and poll for I/O,
920	* even if we don't need to, but it won't cause harm either.
921	*
922	* For cases 2) and 3) above the RCU grace period ensures that bi_bdev
923	* is still allocated. Because partitions hold a reference to the whole
924	* device bdev and thus disk, the disk is also still valid. Grabbing
925	* a reference to the queue in bio_poll() ensures the hctxs and requests
926	* are still valid as well.
927	*/
928	rcu_read_lock();
929	bio = READ_ONCE(kiocb->private);
930	if (bio)
931	ret = bio_poll(bio, iob, flags);
932	rcu_read_unlock();
933
934	return ret;
935	}
936	EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
937
938	void update_io_ticks(struct block_device part, unsigned* long now, bool end)
939	{
940	unsigned long stamp;
941	again:
942	stamp = READ_ONCE(part->bd_stamp);
943	if (unlikely(time_after(now, stamp))) {
944	if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
945	__part_stat_add(part, io_ticks, end ? now - stamp : `1`);
946	}
947	if (part->bd_partno) {
948	part = bdev_whole(part);
949	goto again;
950	}
951	}
952
953	unsigned long bdev_start_io_acct(struct block_device bdev, enum* req_op op,
954	unsigned long start_time)
955	{
956	part_stat_lock();
957	update_io_ticks(part: bdev, now: start_time, end: false);
958	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
959	part_stat_unlock();
960
961	return start_time;
962	}
963	EXPORT_SYMBOL(bdev_start_io_acct);
964
965	/**
966	* bio_start_io_acct - start I/O accounting for bio based drivers
967	* @bio: bio to start account for
968	*
969	* Returns the start time that should be passed back to bio_end_io_acct().
970	*/
971	unsigned long bio_start_io_acct(struct bio *bio)
972	{
973	return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
974	}
975	EXPORT_SYMBOL_GPL(bio_start_io_acct);
976
977	void bdev_end_io_acct(struct block_device bdev, enum* req_op op,
978	unsigned int sectors, unsigned long start_time)
979	{
980	const int sgrp = op_stat_group(op);
981	unsigned long now = READ_ONCE(jiffies);
982	unsigned long duration = now - start_time;
983
984	part_stat_lock();
985	update_io_ticks(part: bdev, now, end: true);
986	part_stat_inc(bdev, ios[sgrp]);
987	part_stat_add(bdev, sectors[sgrp], sectors);
988	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
989	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
990	part_stat_unlock();
991	}
992	EXPORT_SYMBOL(bdev_end_io_acct);
993
/*
 * Finish I/O accounting for @bio against @orig_bdev rather than the bio's
 * current bi_bdev, using the bio's operation and sector count.
 */
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
			      struct block_device *orig_bdev)
{
	bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
1000
1001	/**
1002	* blk_lld_busy - Check if underlying low-level drivers of a device are busy
1003	* @q : the queue of the device being checked
1004	*
1005	* Description:
1006	* Check if underlying low-level drivers of a device are busy.
1007	* If the drivers want to export their busy state, they must set own
1008	* exporting function using blk_queue_lld_busy() first.
1009	*
1010	* Basically, this function is used only by request stacking drivers
1011	* to stop dispatching requests to underlying devices when underlying
1012	* devices are busy. This behavior helps more I/O merging on the queue
1013	* of the request stacking driver and prevents I/O throughput regression
1014	* on burst I/O load.
1015	*
1016	* Return:
1017	* 0 - Not busy (The request stacking driver should dispatch request)
1018	* 1 - Busy (The request stacking driver should stop dispatching request)
1019	*/
1020	int blk_lld_busy(struct request_queue *q)
1021	{
1022	if (queue_is_mq(q) && q->mq_ops->busy)
1023	return q->mq_ops->busy(q);
1024
1025	return `0`;
1026	}
1027	EXPORT_SYMBOL_GPL(blk_lld_busy);
1028
1029	int kblockd_schedule_work(struct work_struct *work)
1030	{
1031	return queue_work(wq: kblockd_workqueue, work);
1032	}
1033	EXPORT_SYMBOL(kblockd_schedule_work);
1034
1035	int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
1036	unsigned long delay)
1037	{
1038	return mod_delayed_work_on(cpu, wq: kblockd_workqueue, dwork, delay);
1039	}
1040	EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
1041
1042	void blk_start_plug_nr_ios(struct blk_plug plug, unsigned* short nr_ios)
1043	{
1044	struct task_struct *tsk = current;
1045
1046	/*
1047	* If this is a nested plug, don't actually assign it.
1048	*/
1049	if (tsk->plug)
1050	return;
1051
1052	plug->mq_list = NULL;
1053	plug->cached_rq = NULL;
1054	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
1055	plug->rq_count = `0`;
1056	plug->multiple_queues = false;
1057	plug->has_elevator = false;
1058	INIT_LIST_HEAD(list: &plug->cb_list);
1059
1060	/*
1061	* Store ordering should not be needed here, since a potential
1062	* preempt will imply a full memory barrier
1063	*/
1064	tsk->plug = plug;
1065	}
1066
/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:	The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
	/* Equivalent to blk_start_plug_nr_ios() with an I/O count hint of 1. */
	blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);
1095
1096	static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
1097	{
1098	LIST_HEAD(callbacks);
1099
1100	while (!list_empty(head: &plug->cb_list)) {
1101	list_splice_init(list: &plug->cb_list, head: &callbacks);
1102
1103	while (!list_empty(head: &callbacks)) {
1104	struct blk_plug_cb *cb = list_first_entry(&callbacks,
1105	struct blk_plug_cb,
1106	list);
1107	list_del(entry: &cb->list);
1108	cb->callback(cb, from_schedule);
1109	}
1110	}
1111	}
1112
1113	struct blk_plug_cb blk_check_plugged(blk_plug_cb_fn unplug, void* *data,
1114	int size)
1115	{
1116	struct blk_plug *plug = current->plug;
1117	struct blk_plug_cb *cb;
1118
1119	if (!plug)
1120	return NULL;
1121
1122	list_for_each_entry(cb, &plug->cb_list, list)
1123	if (cb->callback == unplug && cb->data == data)
1124	return cb;
1125
1126	/ Not currently on the callback list /
1127	BUG_ON(size < sizeof(*cb));
1128	cb = kzalloc(size, GFP_ATOMIC);
1129	if (cb) {
1130	cb->data = data;
1131	cb->callback = unplug;
1132	list_add(new: &cb->list, head: &plug->cb_list);
1133	}
1134	return cb;
1135	}
1136	EXPORT_SYMBOL(blk_check_plugged);
1137
1138	void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
1139	{
1140	if (!list_empty(head: &plug->cb_list))
1141	flush_plug_callbacks(plug, from_schedule);
1142	blk_mq_flush_plug_list(plug, from_schedule);
1143	/*
1144	* Unconditionally flush out cached requests, even if the unplug
1145	* event came from schedule. Since we know hold references to the
1146	* queue for cached requests, we don't want a blocked task holding
1147	* up a queue freeze/quiesce event.
1148	*/
1149	if (unlikely(!rq_list_empty(plug->cached_rq)))
1150	blk_mq_free_plug_rqs(plug);
1151	}
1152
1153	/**
1154	* blk_finish_plug - mark the end of a batch of submitted I/O
1155	* @plug: The &struct blk_plug passed to blk_start_plug()
1156	*
1157	* Description:
1158	* Indicate that a batch of I/O submissions is complete. This function
1159	* must be paired with an initial call to blk_start_plug(). The intent
1160	* is to allow the block layer to optimize I/O submission. See the
1161	* documentation for blk_start_plug() for more information.
1162	*/
1163	void blk_finish_plug(struct blk_plug *plug)
1164	{
1165	if (plug == current->plug) {
1166	__blk_flush_plug(plug, from_schedule: false);
1167	current->plug = NULL;
1168	}
1169	}
1170	EXPORT_SYMBOL(blk_finish_plug);
1171
1172	void blk_io_schedule(void)
1173	{
1174	/ Prevent hang_check timer from firing at us during very long I/O /
1175	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / `2`;
1176
1177	if (timeout)
1178	io_schedule_timeout(timeout);
1179	else
1180	io_schedule();
1181	}
1182	EXPORT_SYMBOL_GPL(blk_io_schedule);
1183
1184	int __init blk_dev_init(void)
1185	{
1186	BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (`1` << REQ_OP_BITS));
1187	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > `8` *
1188	sizeof_field(struct request, cmd_flags));
1189	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > `8` *
1190	sizeof_field(struct bio, bi_opf));
1191
1192	/ used for unplugging and affects IO latency/throughput - HIGHPRI /
1193	kblockd_workqueue = alloc_workqueue(fmt: "kblockd",
1194	flags: WQ_MEM_RECLAIM \| WQ_HIGHPRI, max_active: `0`);
1195	if (!kblockd_workqueue)
1196	panic(fmt: "Failed to create kblockd\n");
1197
1198	blk_requestq_cachep = kmem_cache_create(name: "request_queue",
1199	size: sizeof(struct request_queue), align: `0`, SLAB_PANIC, NULL);
1200
1201	blk_debugfs_root = debugfs_create_dir(name: "block", NULL);
1202
1203	return `0`;
1204	}
1205

/* source code of linux/block/blk-core.c */