blk-core.c source code [linux/block/blk-core.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 1991, 1992 Linus Torvalds
4	* Copyright (C) 1994, Karl Keyte: Added support for disk statistics
5	* Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6	* Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
7	* kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
8	* - July2000
9	* bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10	*/
11
12	/*
13	* This handles all read/write requests to block devices
14	*/
15	#include <linux/kernel.h>
16	#include <linux/module.h>
17	#include <linux/bio.h>
18	#include <linux/blkdev.h>
19	#include <linux/blk-pm.h>
20	#include <linux/blk-integrity.h>
21	#include <linux/highmem.h>
22	#include <linux/mm.h>
23	#include <linux/pagemap.h>
24	#include <linux/kernel_stat.h>
25	#include <linux/string.h>
26	#include <linux/init.h>
27	#include <linux/completion.h>
28	#include <linux/slab.h>
29	#include <linux/swap.h>
30	#include <linux/writeback.h>
31	#include <linux/task_io_accounting_ops.h>
32	#include <linux/fault-inject.h>
33	#include <linux/list_sort.h>
34	#include <linux/delay.h>
35	#include <linux/ratelimit.h>
36	#include <linux/pm_runtime.h>
37	#include <linux/t10-pi.h>
38	#include <linux/debugfs.h>
39	#include <linux/bpf.h>
40	#include <linux/part_stat.h>
41	#include <linux/sched/sysctl.h>
42	#include <linux/blk-crypto.h>
43
44	#define CREATE_TRACE_POINTS
45	#include <trace/events/block.h>
46
47	#include "blk.h"
48	#include "blk-mq-sched.h"
49	#include "blk-pm.h"
50	#include "blk-cgroup.h"
51	#include "blk-throttle.h"
52	#include "blk-ioprio.h"
53
/* Root debugfs dentry pointer for the block layer. */
struct dentry *blk_debugfs_root;

/* Tracepoints made available to GPL modules. */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

/* Allocator for request_queue ids (q->id). */
static DEFINE_IDA(blk_queue_ida);

/* Slab cache that request_queue structures are allocated from. */
static struct kmem_cache *blk_requestq_cachep;

/* Workqueue backing kblockd_schedule_work() and friends. */
static struct workqueue_struct *kblockd_workqueue;
74
75	/**
76	* blk_queue_flag_set - atomically set a queue flag
77	* @flag: flag to be set
78	* @q: request queue
79	*/
80	void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
81	{
82	set_bit(nr: flag, addr: &q->queue_flags);
83	}
84	EXPORT_SYMBOL(blk_queue_flag_set);
85
86	/**
87	* blk_queue_flag_clear - atomically clear a queue flag
88	* @flag: flag to be cleared
89	* @q: request queue
90	*/
91	void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
92	{
93	clear_bit(nr: flag, addr: &q->queue_flags);
94	}
95	EXPORT_SYMBOL(blk_queue_flag_clear);
96
97	/**
98	* blk_queue_flag_test_and_set - atomically test and set a queue flag
99	* @flag: flag to be set
100	* @q: request queue
101	*
102	* Returns the previous value of @flag - 0 if the flag was not set and 1 if
103	* the flag was already set.
104	*/
105	bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
106	{
107	return test_and_set_bit(nr: flag, addr: &q->queue_flags);
108	}
109	EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
110
111	#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
112	static const char *const blk_op_name[] = {
113	REQ_OP_NAME(READ),
114	REQ_OP_NAME(WRITE),
115	REQ_OP_NAME(FLUSH),
116	REQ_OP_NAME(DISCARD),
117	REQ_OP_NAME(SECURE_ERASE),
118	REQ_OP_NAME(ZONE_RESET),
119	REQ_OP_NAME(ZONE_RESET_ALL),
120	REQ_OP_NAME(ZONE_OPEN),
121	REQ_OP_NAME(ZONE_CLOSE),
122	REQ_OP_NAME(ZONE_FINISH),
123	REQ_OP_NAME(ZONE_APPEND),
124	REQ_OP_NAME(WRITE_ZEROES),
125	REQ_OP_NAME(DRV_IN),
126	REQ_OP_NAME(DRV_OUT),
127	};
128	#undef REQ_OP_NAME
129
130	/**
131	* blk_op_str - Return string XXX in the REQ_OP_XXX.
132	* @op: REQ_OP_XXX.
133	*
134	* Description: Centralize block layer function to convert REQ_OP_XXX into
135	* string format. Useful in the debugging and tracing bio or request. For
136	* invalid REQ_OP_XXX it returns string "UNKNOWN".
137	*/
138	inline const char blk_op_str(enum* req_op op)
139	{
140	const char *op_str = "UNKNOWN";
141
142	if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
143	op_str = blk_op_name[op];
144
145	return op_str;
146	}
147	EXPORT_SYMBOL_GPL(blk_op_str);
148
149	static const struct {
150	int errno;
151	const char *name;
152	} blk_errors[] = {
153	[BLK_STS_OK] = { .errno: `0`, .name: "" },
154	[BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
155	[BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
156	[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
157	[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
158	[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
159	[BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" },
160	[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
161	[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
162	[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
163	[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
164	[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
165	[BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
166
167	/ device mapper special case, should not leak out: /
168	[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
169
170	/ zone device specific errors /
171	[BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
172	[BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
173
174	/ Command duration limit device-side timeout /
175	[BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" },
176
177	/ everything else not covered above: /
178	[BLK_STS_IOERR] = { -EIO, "I/O" },
179	};
180
181	blk_status_t errno_to_blk_status(int errno)
182	{
183	int i;
184
185	for (i = `0`; i < ARRAY_SIZE(blk_errors); i++) {
186	if (blk_errors[i].errno == errno)
187	return (__force blk_status_t)i;
188	}
189
190	return BLK_STS_IOERR;
191	}
192	EXPORT_SYMBOL_GPL(errno_to_blk_status);
193
194	int blk_status_to_errno(blk_status_t status)
195	{
196	int idx = (__force int)status;
197
198	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
199	return -EIO;
200	return blk_errors[idx].errno;
201	}
202	EXPORT_SYMBOL_GPL(blk_status_to_errno);
203
204	const char *blk_status_to_str(blk_status_t status)
205	{
206	int idx = (__force int)status;
207
208	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
209	return "<null>";
210	return blk_errors[idx].name;
211	}
212	EXPORT_SYMBOL_GPL(blk_status_to_str);
213
214	/**
215	* blk_sync_queue - cancel any pending callbacks on a queue
216	* @q: the queue
217	*
218	* Description:
219	* The block layer may perform asynchronous callback activity
220	* on a queue, such as calling the unplug function after a timeout.
221	* A block device may call blk_sync_queue to ensure that any
222	* such activity is cancelled, thus allowing it to release resources
223	* that the callbacks might use. The caller must already have made sure
224	* that its ->submit_bio will not re-add plugging prior to calling
225	* this function.
226	*
227	* This function does not cancel any asynchronous activity arising
228	* out of elevator or throttling code. That would require elevator_exit()
229	* and blkcg_exit_queue() to be called with queue lock initialized.
230	*
231	*/
232	void blk_sync_queue(struct request_queue *q)
233	{
234	del_timer_sync(timer: &q->timeout);
235	cancel_work_sync(work: &q->timeout_work);
236	}
237	EXPORT_SYMBOL(blk_sync_queue);
238
239	/**
240	* blk_set_pm_only - increment pm_only counter
241	* @q: request queue pointer
242	*/
243	void blk_set_pm_only(struct request_queue *q)
244	{
245	atomic_inc(v: &q->pm_only);
246	}
247	EXPORT_SYMBOL_GPL(blk_set_pm_only);
248
249	void blk_clear_pm_only(struct request_queue *q)
250	{
251	int pm_only;
252
253	pm_only = atomic_dec_return(v: &q->pm_only);
254	WARN_ON_ONCE(pm_only < `0`);
255	if (pm_only == `0`)
256	wake_up_all(&q->mq_freeze_wq);
257	}
258	EXPORT_SYMBOL_GPL(blk_clear_pm_only);
259
260	static void blk_free_queue_rcu(struct rcu_head *rcu_head)
261	{
262	struct request_queue *q = container_of(rcu_head,
263	struct request_queue, rcu_head);
264
265	percpu_ref_exit(ref: &q->q_usage_counter);
266	kmem_cache_free(s: blk_requestq_cachep, objp: q);
267	}
268
269	static void blk_free_queue(struct request_queue *q)
270	{
271	blk_free_queue_stats(q->stats);
272	if (queue_is_mq(q))
273	blk_mq_release(q);
274
275	ida_free(&blk_queue_ida, id: q->id);
276	call_rcu(head: &q->rcu_head, func: blk_free_queue_rcu);
277	}
278
279	/**
280	* blk_put_queue - decrement the request_queue refcount
281	* @q: the request_queue structure to decrement the refcount for
282	*
283	* Decrements the refcount of the request_queue and free it when the refcount
284	* reaches 0.
285	*/
286	void blk_put_queue(struct request_queue *q)
287	{
288	if (refcount_dec_and_test(r: &q->refs))
289	blk_free_queue(q);
290	}
291	EXPORT_SYMBOL(blk_put_queue);
292
293	void blk_queue_start_drain(struct request_queue *q)
294	{
295	/*
296	* When queue DYING flag is set, we need to block new req
297	* entering queue, so we call blk_freeze_queue_start() to
298	* prevent I/O from crossing blk_queue_enter().
299	*/
300	blk_freeze_queue_start(q);
301	if (queue_is_mq(q))
302	blk_mq_wake_waiters(q);
303	/ Make blk_queue_enter() reexamine the DYING flag. /
304	wake_up_all(&q->mq_freeze_wq);
305	}
306
307	/**
308	* blk_queue_enter() - try to increase q->q_usage_counter
309	* @q: request queue pointer
310	* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
311	*/
312	int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
313	{
314	const bool pm = flags & BLK_MQ_REQ_PM;
315
316	while (!blk_try_enter_queue(q, pm)) {
317	if (flags & BLK_MQ_REQ_NOWAIT)
318	return -EAGAIN;
319
320	/*
321	* read pair of barrier in blk_freeze_queue_start(), we need to
322	* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
323	* reading .mq_freeze_depth or queue dying flag, otherwise the
324	* following wait may never return if the two reads are
325	* reordered.
326	*/
327	smp_rmb();
328	wait_event(q->mq_freeze_wq,
329	(!q->mq_freeze_depth &&
330	blk_pm_resume_queue(pm, q)) \|\|
331	blk_queue_dying(q));
332	if (blk_queue_dying(q))
333	return -ENODEV;
334	}
335
336	return `0`;
337	}
338
339	int __bio_queue_enter(struct request_queue q, struct* bio *bio)
340	{
341	while (!blk_try_enter_queue(q, pm: false)) {
342	struct gendisk *disk = bio->bi_bdev->bd_disk;
343
344	if (bio->bi_opf & REQ_NOWAIT) {
345	if (test_bit(GD_DEAD, &disk->state))
346	goto dead;
347	bio_wouldblock_error(bio);
348	return -EAGAIN;
349	}
350
351	/*
352	* read pair of barrier in blk_freeze_queue_start(), we need to
353	* order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
354	* reading .mq_freeze_depth or queue dying flag, otherwise the
355	* following wait may never return if the two reads are
356	* reordered.
357	*/
358	smp_rmb();
359	wait_event(q->mq_freeze_wq,
360	(!q->mq_freeze_depth &&
361	blk_pm_resume_queue(false, q)) \|\|
362	test_bit(GD_DEAD, &disk->state));
363	if (test_bit(GD_DEAD, &disk->state))
364	goto dead;
365	}
366
367	return `0`;
368	dead:
369	bio_io_error(bio);
370	return -ENODEV;
371	}
372
373	void blk_queue_exit(struct request_queue *q)
374	{
375	percpu_ref_put(ref: &q->q_usage_counter);
376	}
377
378	static void blk_queue_usage_counter_release(struct percpu_ref *ref)
379	{
380	struct request_queue *q =
381	container_of(ref, struct request_queue, q_usage_counter);
382
383	wake_up_all(&q->mq_freeze_wq);
384	}
385
386	static void blk_rq_timed_out_timer(struct timer_list *t)
387	{
388	struct request_queue *q = from_timer(q, t, timeout);
389
390	kblockd_schedule_work(work: &q->timeout_work);
391	}
392
/*
 * No-op work handler. blk_alloc_queue() installs it as the initial
 * q->timeout_work callback.
 */
static void blk_timeout_work(struct work_struct *work)
{
	/* Nothing to do. */
}
396
397	struct request_queue blk_alloc_queue(struct* queue_limits lim, int* node_id)
398	{
399	struct request_queue *q;
400	int error;
401
402	q = kmem_cache_alloc_node(s: blk_requestq_cachep, GFP_KERNEL \| __GFP_ZERO,
403	node: node_id);
404	if (!q)
405	return ERR_PTR(error: -ENOMEM);
406
407	q->last_merge = NULL;
408
409	q->id = ida_alloc(ida: &blk_queue_ida, GFP_KERNEL);
410	if (q->id < `0`) {
411	error = q->id;
412	goto fail_q;
413	}
414
415	q->stats = blk_alloc_queue_stats();
416	if (!q->stats) {
417	error = -ENOMEM;
418	goto fail_id;
419	}
420
421	error = blk_set_default_limits(lim);
422	if (error)
423	goto fail_stats;
424	q->limits = *lim;
425
426	q->node = node_id;
427
428	atomic_set(v: &q->nr_active_requests_shared_tags, i: `0`);
429
430	timer_setup(&q->timeout, blk_rq_timed_out_timer, `0`);
431	INIT_WORK(&q->timeout_work, blk_timeout_work);
432	INIT_LIST_HEAD(list: &q->icq_list);
433
434	refcount_set(r: &q->refs, n: `1`);
435	mutex_init(&q->debugfs_mutex);
436	mutex_init(&q->sysfs_lock);
437	mutex_init(&q->sysfs_dir_lock);
438	mutex_init(&q->limits_lock);
439	mutex_init(&q->rq_qos_mutex);
440	spin_lock_init(&q->queue_lock);
441
442	init_waitqueue_head(&q->mq_freeze_wq);
443	mutex_init(&q->mq_freeze_lock);
444
445	blkg_init_queue(q);
446
447	/*
448	* Init percpu_ref in atomic mode so that it's faster to shutdown.
449	* See blk_register_queue() for details.
450	*/
451	error = percpu_ref_init(ref: &q->q_usage_counter,
452	release: blk_queue_usage_counter_release,
453	flags: PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
454	if (error)
455	goto fail_stats;
456
457	q->nr_requests = BLKDEV_DEFAULT_RQ;
458
459	return q;
460
461	fail_stats:
462	blk_free_queue_stats(q->stats);
463	fail_id:
464	ida_free(&blk_queue_ida, id: q->id);
465	fail_q:
466	kmem_cache_free(s: blk_requestq_cachep, objp: q);
467	return ERR_PTR(error);
468	}
469
470	/**
471	* blk_get_queue - increment the request_queue refcount
472	* @q: the request_queue structure to increment the refcount for
473	*
474	* Increment the refcount of the request_queue kobject.
475	*
476	* Context: Any context.
477	*/
478	bool blk_get_queue(struct request_queue *q)
479	{
480	if (unlikely(blk_queue_dying(q)))
481	return false;
482	refcount_inc(r: &q->refs);
483	return true;
484	}
485	EXPORT_SYMBOL(blk_get_queue);
486
487	#ifdef CONFIG_FAIL_MAKE_REQUEST
488
489	static DECLARE_FAULT_ATTR(fail_make_request);
490
491	static int __init setup_fail_make_request(char *str)
492	{
493	return setup_fault_attr(attr: &fail_make_request, str);
494	}
495	__setup("fail_make_request=", setup_fail_make_request);
496
497	bool should_fail_request(struct block_device part, unsigned* int bytes)
498	{
499	return part->bd_make_it_fail && should_fail(attr: &fail_make_request, size: bytes);
500	}
501
502	static int __init fail_make_request_debugfs(void)
503	{
504	struct dentry *dir = fault_create_debugfs_attr(name: "fail_make_request",
505	NULL, attr: &fail_make_request);
506
507	return PTR_ERR_OR_ZERO(ptr: dir);
508	}
509
510	late_initcall(fail_make_request_debugfs);
511	#endif /* CONFIG_FAIL_MAKE_REQUEST */
512
513	static inline void bio_check_ro(struct bio *bio)
514	{
515	if (op_is_write(op: bio_op(bio)) && bdev_read_only(bdev: bio->bi_bdev)) {
516	if (op_is_flush(op: bio->bi_opf) && !bio_sectors(bio))
517	return;
518
519	if (bio->bi_bdev->bd_ro_warned)
520	return;
521
522	bio->bi_bdev->bd_ro_warned = true;
523	/*
524	* Use ioctl to set underlying disk of raid/dm to read-only
525	* will trigger this.
526	*/
527	pr_warn("Trying to write to read-only block-device %pg\n",
528	bio->bi_bdev);
529	}
530	}
531
532	static noinline int should_fail_bio(struct bio *bio)
533	{
534	if (should_fail_request(bdev_whole(bio->bi_bdev), bytes: bio->bi_iter.bi_size))
535	return -EIO;
536	return `0`;
537	}
538	ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
539
540	/*
541	* Check whether this bio extends beyond the end of the device or partition.
542	* This may well happen - the kernel calls bread() without checking the size of
543	* the device, e.g., when mounting a file system.
544	*/
545	static inline int bio_check_eod(struct bio *bio)
546	{
547	sector_t maxsector = bdev_nr_sectors(bdev: bio->bi_bdev);
548	unsigned int nr_sectors = bio_sectors(bio);
549
550	if (nr_sectors &&
551	(nr_sectors > maxsector \|\|
552	bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
553	pr_info_ratelimited("%s: attempt to access beyond end of device\n"
554	"%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
555	current->comm, bio->bi_bdev, bio->bi_opf,
556	bio->bi_iter.bi_sector, nr_sectors, maxsector);
557	return -EIO;
558	}
559	return `0`;
560	}
561
562	/*
563	* Remap block n of partition p to block n+start(p) of the disk.
564	*/
565	static int blk_partition_remap(struct bio *bio)
566	{
567	struct block_device *p = bio->bi_bdev;
568
569	if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
570	return -EIO;
571	if (bio_sectors(bio)) {
572	bio->bi_iter.bi_sector += p->bd_start_sect;
573	trace_block_bio_remap(bio, dev: p->bd_dev,
574	from: bio->bi_iter.bi_sector -
575	p->bd_start_sect);
576	}
577	bio_set_flag(bio, bit: BIO_REMAPPED);
578	return `0`;
579	}
580
581	/*
582	* Check write append to a zoned block device.
583	*/
584	static inline blk_status_t blk_check_zone_append(struct request_queue *q,
585	struct bio *bio)
586	{
587	int nr_sectors = bio_sectors(bio);
588
589	/ Only applicable to zoned block devices /
590	if (!bdev_is_zoned(bdev: bio->bi_bdev))
591	return BLK_STS_NOTSUPP;
592
593	/ The bio sector must point to the start of a sequential zone /
594	if (!bdev_is_zone_start(bdev: bio->bi_bdev, sector: bio->bi_iter.bi_sector) \|\|
595	!bio_zone_is_seq(bio))
596	return BLK_STS_IOERR;
597
598	/*
599	* Not allowed to cross zone boundaries. Otherwise, the BIO will be
600	* split and could result in non-contiguous sectors being written in
601	* different zones.
602	*/
603	if (nr_sectors > q->limits.chunk_sectors)
604	return BLK_STS_IOERR;
605
606	/ Make sure the BIO is small enough and will not get split /
607	if (nr_sectors > q->limits.max_zone_append_sectors)
608	return BLK_STS_IOERR;
609
610	bio->bi_opf \|= REQ_NOMERGE;
611
612	return BLK_STS_OK;
613	}
614
615	static void __submit_bio(struct bio *bio)
616	{
617	if (unlikely(!blk_crypto_bio_prep(&bio)))
618	return;
619
620	if (!bio->bi_bdev->bd_has_submit_bio) {
621	blk_mq_submit_bio(bio);
622	} else if (likely(bio_queue_enter(bio) == `0`)) {
623	struct gendisk *disk = bio->bi_bdev->bd_disk;
624
625	disk->fops->submit_bio(bio);
626	blk_queue_exit(q: disk->queue);
627	}
628	}
629
630	/*
631	* The loop in this function may be a bit non-obvious, and so deserves some
632	* explanation:
633	*
634	* - Before entering the loop, bio->bi_next is NULL (as all callers ensure
635	* that), so we have a list with a single bio.
636	* - We pretend that we have just taken it off a longer list, so we assign
637	* bio_list to a pointer to the bio_list_on_stack, thus initialising the
638	* bio_list of new bios to be added. ->submit_bio() may indeed add some more
639	* bios through a recursive call to submit_bio_noacct. If it did, we find a
640	* non-NULL value in bio_list and re-enter the loop from the top.
641	* - In this case we really did just take the bio of the top of the list (no
642	* pretending) and so remove it from bio_list, and call into ->submit_bio()
643	* again.
644	*
645	* bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
646	* bio_list_on_stack[1] contains bios that were submitted before the current
647	* ->submit_bio, but that haven't been processed yet.
648	*/
649	static void __submit_bio_noacct(struct bio *bio)
650	{
651	struct bio_list bio_list_on_stack[`2`];
652
653	BUG_ON(bio->bi_next);
654
655	bio_list_init(bl: &bio_list_on_stack[`0`]);
656	current->bio_list = bio_list_on_stack;
657
658	do {
659	struct request_queue *q = bdev_get_queue(bdev: bio->bi_bdev);
660	struct bio_list lower, same;
661
662	/*
663	* Create a fresh bio_list for all subordinate requests.
664	*/
665	bio_list_on_stack[`1`] = bio_list_on_stack[`0`];
666	bio_list_init(bl: &bio_list_on_stack[`0`]);
667
668	__submit_bio(bio);
669
670	/*
671	* Sort new bios into those for a lower level and those for the
672	* same level.
673	*/
674	bio_list_init(bl: &lower);
675	bio_list_init(bl: &same);
676	while ((bio = bio_list_pop(bl: &bio_list_on_stack[`0`])) != NULL)
677	if (q == bdev_get_queue(bdev: bio->bi_bdev))
678	bio_list_add(bl: &same, bio);
679	else
680	bio_list_add(bl: &lower, bio);
681
682	/*
683	* Now assemble so we handle the lowest level first.
684	*/
685	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &lower);
686	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &same);
687	bio_list_merge(bl: &bio_list_on_stack[`0`], bl2: &bio_list_on_stack[`1`]);
688	} while ((bio = bio_list_pop(bl: &bio_list_on_stack[`0`])));
689
690	current->bio_list = NULL;
691	}
692
693	static void __submit_bio_noacct_mq(struct bio *bio)
694	{
695	struct bio_list bio_list[`2`] = { };
696
697	current->bio_list = bio_list;
698
699	do {
700	__submit_bio(bio);
701	} while ((bio = bio_list_pop(bl: &bio_list[`0`])));
702
703	current->bio_list = NULL;
704	}
705
706	void submit_bio_noacct_nocheck(struct bio *bio)
707	{
708	blk_cgroup_bio_start(bio);
709	blkcg_bio_issue_init(bio);
710
711	if (!bio_flagged(bio, bit: BIO_TRACE_COMPLETION)) {
712	trace_block_bio_queue(bio);
713	/*
714	* Now that enqueuing has been traced, we need to trace
715	* completion as well.
716	*/
717	bio_set_flag(bio, bit: BIO_TRACE_COMPLETION);
718	}
719
720	/*
721	* We only want one ->submit_bio to be active at a time, else stack
722	* usage with stacked devices could be a problem. Use current->bio_list
723	* to collect a list of requests submited by a ->submit_bio method while
724	* it is active, and then process them after it returned.
725	*/
726	if (current->bio_list)
727	bio_list_add(bl: &current->bio_list[`0`], bio);
728	else if (!bio->bi_bdev->bd_has_submit_bio)
729	__submit_bio_noacct_mq(bio);
730	else
731	__submit_bio_noacct(bio);
732	}
733
734	/**
735	* submit_bio_noacct - re-submit a bio to the block device layer for I/O
736	* @bio: The bio describing the location in memory and on the device.
737	*
738	* This is a version of submit_bio() that shall only be used for I/O that is
739	* resubmitted to lower level drivers by stacking block drivers. All file
740	* systems and other upper level users of the block layer should use
741	* submit_bio() instead.
742	*/
743	void submit_bio_noacct(struct bio *bio)
744	{
745	struct block_device *bdev = bio->bi_bdev;
746	struct request_queue *q = bdev_get_queue(bdev);
747	blk_status_t status = BLK_STS_IOERR;
748
749	might_sleep();
750
751	/*
752	* For a REQ_NOWAIT based request, return -EOPNOTSUPP
753	* if queue does not support NOWAIT.
754	*/
755	if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
756	goto not_supported;
757
758	if (should_fail_bio(bio))
759	goto end_io;
760	bio_check_ro(bio);
761	if (!bio_flagged(bio, bit: BIO_REMAPPED)) {
762	if (unlikely(bio_check_eod(bio)))
763	goto end_io;
764	if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
765	goto end_io;
766	}
767
768	/*
769	* Filter flush bio's early so that bio based drivers without flush
770	* support don't have to worry about them.
771	*/
772	if (op_is_flush(op: bio->bi_opf)) {
773	if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
774	bio_op(bio) != REQ_OP_ZONE_APPEND))
775	goto end_io;
776	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
777	bio->bi_opf &= ~(REQ_PREFLUSH \| REQ_FUA);
778	if (!bio_sectors(bio)) {
779	status = BLK_STS_OK;
780	goto end_io;
781	}
782	}
783	}
784
785	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
786	bio_clear_polled(bio);
787
788	switch (bio_op(bio)) {
789	case REQ_OP_READ:
790	case REQ_OP_WRITE:
791	break;
792	case REQ_OP_FLUSH:
793	/*
794	* REQ_OP_FLUSH can't be submitted through bios, it is only
795	* synthetized in struct request by the flush state machine.
796	*/
797	goto not_supported;
798	case REQ_OP_DISCARD:
799	if (!bdev_max_discard_sectors(bdev))
800	goto not_supported;
801	break;
802	case REQ_OP_SECURE_ERASE:
803	if (!bdev_max_secure_erase_sectors(bdev))
804	goto not_supported;
805	break;
806	case REQ_OP_ZONE_APPEND:
807	status = blk_check_zone_append(q, bio);
808	if (status != BLK_STS_OK)
809	goto end_io;
810	break;
811	case REQ_OP_WRITE_ZEROES:
812	if (!q->limits.max_write_zeroes_sectors)
813	goto not_supported;
814	break;
815	case REQ_OP_ZONE_RESET:
816	case REQ_OP_ZONE_OPEN:
817	case REQ_OP_ZONE_CLOSE:
818	case REQ_OP_ZONE_FINISH:
819	if (!bdev_is_zoned(bdev: bio->bi_bdev))
820	goto not_supported;
821	break;
822	case REQ_OP_ZONE_RESET_ALL:
823	if (!bdev_is_zoned(bdev: bio->bi_bdev) \|\| !blk_queue_zone_resetall(q))
824	goto not_supported;
825	break;
826	case REQ_OP_DRV_IN:
827	case REQ_OP_DRV_OUT:
828	/*
829	* Driver private operations are only used with passthrough
830	* requests.
831	*/
832	fallthrough;
833	default:
834	goto not_supported;
835	}
836
837	if (blk_throtl_bio(bio))
838	return;
839	submit_bio_noacct_nocheck(bio);
840	return;
841
842	not_supported:
843	status = BLK_STS_NOTSUPP;
844	end_io:
845	bio->bi_status = status;
846	bio_endio(bio);
847	}
848	EXPORT_SYMBOL(submit_bio_noacct);
849
850	static void bio_set_ioprio(struct bio *bio)
851	{
852	/ Nobody set ioprio so far? Initialize it based on task's nice value /
853	if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
854	bio->bi_ioprio = get_current_ioprio();
855	blkcg_set_ioprio(bio);
856	}
857
858	/**
859	* submit_bio - submit a bio to the block device layer for I/O
860	* @bio: The &struct bio which describes the I/O
861	*
862	* submit_bio() is used to submit I/O requests to block devices. It is passed a
863	* fully set up &struct bio that describes the I/O that needs to be done. The
864	* bio will be send to the device described by the bi_bdev field.
865	*
866	* The success/failure status of the request, along with notification of
867	* completion, is delivered asynchronously through the ->bi_end_io() callback
868	* in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
869	* been called.
870	*/
871	void submit_bio(struct bio *bio)
872	{
873	if (bio_op(bio) == REQ_OP_READ) {
874	task_io_account_read(bytes: bio->bi_iter.bi_size);
875	count_vm_events(item: PGPGIN, bio_sectors(bio));
876	} else if (bio_op(bio) == REQ_OP_WRITE) {
877	count_vm_events(item: PGPGOUT, bio_sectors(bio));
878	}
879
880	bio_set_ioprio(bio);
881	submit_bio_noacct(bio);
882	}
883	EXPORT_SYMBOL(submit_bio);
884
885	/**
886	* bio_poll - poll for BIO completions
887	* @bio: bio to poll for
888	* @iob: batches of IO
889	* @flags: BLK_POLL_* flags that control the behavior
890	*
891	* Poll for completions on queue associated with the bio. Returns number of
892	* completed entries found.
893	*
894	* Note: the caller must either be the context that submitted @bio, or
895	* be in a RCU critical section to prevent freeing of @bio.
896	*/
897	int bio_poll(struct bio bio, struct* io_comp_batch iob, unsigned* int flags)
898	{
899	blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
900	struct block_device *bdev;
901	struct request_queue *q;
902	int ret = `0`;
903
904	bdev = READ_ONCE(bio->bi_bdev);
905	if (!bdev)
906	return `0`;
907
908	q = bdev_get_queue(bdev);
909	if (cookie == BLK_QC_T_NONE \|\|
910	!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
911	return `0`;
912
913	/*
914	* As the requests that require a zone lock are not plugged in the
915	* first place, directly accessing the plug instead of using
916	* blk_mq_plug() should not have any consequences during flushing for
917	* zoned devices.
918	*/
919	blk_flush_plug(current->plug, async: false);
920
921	/*
922	* We need to be able to enter a frozen queue, similar to how
923	* timeouts also need to do that. If that is blocked, then we can
924	* have pending IO when a queue freeze is started, and then the
925	* wait for the freeze to finish will wait for polled requests to
926	* timeout as the poller is preventer from entering the queue and
927	* completing them. As long as we prevent new IO from being queued,
928	* that should be all that matters.
929	*/
930	if (!percpu_ref_tryget(ref: &q->q_usage_counter))
931	return `0`;
932	if (queue_is_mq(q)) {
933	ret = blk_mq_poll(q, cookie, iob, flags);
934	} else {
935	struct gendisk *disk = q->disk;
936
937	if (disk && disk->fops->poll_bio)
938	ret = disk->fops->poll_bio(bio, iob, flags);
939	}
940	blk_queue_exit(q);
941	return ret;
942	}
943	EXPORT_SYMBOL_GPL(bio_poll);
944
945	/*
946	* Helper to implement file_operations.iopoll. Requires the bio to be stored
947	* in iocb->private, and cleared before freeing the bio.
948	*/
949	int iocb_bio_iopoll(struct kiocb kiocb, struct* io_comp_batch *iob,
950	unsigned int flags)
951	{
952	struct bio *bio;
953	int ret = `0`;
954
955	/*
956	* Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
957	* point to a freshly allocated bio at this point. If that happens
958	* we have a few cases to consider:
959	*
960	* 1) the bio is beeing initialized and bi_bdev is NULL. We can just
961	* simply nothing in this case
962	* 2) the bio points to a not poll enabled device. bio_poll will catch
963	* this and return 0
964	* 3) the bio points to a poll capable device, including but not
965	* limited to the one that the original bio pointed to. In this
966	* case we will call into the actual poll method and poll for I/O,
967	* even if we don't need to, but it won't cause harm either.
968	*
969	* For cases 2) and 3) above the RCU grace period ensures that bi_bdev
970	* is still allocated. Because partitions hold a reference to the whole
971	* device bdev and thus disk, the disk is also still valid. Grabbing
972	* a reference to the queue in bio_poll() ensures the hctxs and requests
973	* are still valid as well.
974	*/
975	rcu_read_lock();
976	bio = READ_ONCE(kiocb->private);
977	if (bio)
978	ret = bio_poll(bio, iob, flags);
979	rcu_read_unlock();
980
981	return ret;
982	}
983	EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
984
985	void update_io_ticks(struct block_device part, unsigned* long now, bool end)
986	{
987	unsigned long stamp;
988	again:
989	stamp = READ_ONCE(part->bd_stamp);
990	if (unlikely(time_after(now, stamp))) {
991	if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
992	__part_stat_add(part, io_ticks, end ? now - stamp : `1`);
993	}
994	if (part->bd_partno) {
995	part = bdev_whole(part);
996	goto again;
997	}
998	}
999
1000	unsigned long bdev_start_io_acct(struct block_device bdev, enum* req_op op,
1001	unsigned long start_time)
1002	{
1003	part_stat_lock();
1004	update_io_ticks(part: bdev, now: start_time, end: false);
1005	part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
1006	part_stat_unlock();
1007
1008	return start_time;
1009	}
1010	EXPORT_SYMBOL(bdev_start_io_acct);
1011
1012	/**
1013	* bio_start_io_acct - start I/O accounting for bio based drivers
1014	* @bio: bio to start account for
1015	*
1016	* Returns the start time that should be passed back to bio_end_io_acct().
1017	*/
1018	unsigned long bio_start_io_acct(struct bio *bio)
1019	{
1020	return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
1021	}
1022	EXPORT_SYMBOL_GPL(bio_start_io_acct);
1023
1024	void bdev_end_io_acct(struct block_device bdev, enum* req_op op,
1025	unsigned int sectors, unsigned long start_time)
1026	{
1027	const int sgrp = op_stat_group(op);
1028	unsigned long now = READ_ONCE(jiffies);
1029	unsigned long duration = now - start_time;
1030
1031	part_stat_lock();
1032	update_io_ticks(part: bdev, now, end: true);
1033	part_stat_inc(bdev, ios[sgrp]);
1034	part_stat_add(bdev, sectors[sgrp], sectors);
1035	part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
1036	part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
1037	part_stat_unlock();
1038	}
1039	EXPORT_SYMBOL(bdev_end_io_acct);
1040
/*
 * Finish I/O accounting for @bio against @orig_bdev rather than against
 * bio->bi_bdev, using the bio's operation and sector count.
 */
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
			      struct block_device *orig_bdev)
{
	bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
1047
1048	/**
1049	* blk_lld_busy - Check if underlying low-level drivers of a device are busy
1050	* @q : the queue of the device being checked
1051	*
1052	* Description:
1053	* Check if underlying low-level drivers of a device are busy.
1054	* If the drivers want to export their busy state, they must set own
1055	* exporting function using blk_queue_lld_busy() first.
1056	*
1057	* Basically, this function is used only by request stacking drivers
1058	* to stop dispatching requests to underlying devices when underlying
1059	* devices are busy. This behavior helps more I/O merging on the queue
1060	* of the request stacking driver and prevents I/O throughput regression
1061	* on burst I/O load.
1062	*
1063	* Return:
1064	* 0 - Not busy (The request stacking driver should dispatch request)
1065	* 1 - Busy (The request stacking driver should stop dispatching request)
1066	*/
1067	int blk_lld_busy(struct request_queue *q)
1068	{
1069	if (queue_is_mq(q) && q->mq_ops->busy)
1070	return q->mq_ops->busy(q);
1071
1072	return `0`;
1073	}
1074	EXPORT_SYMBOL_GPL(blk_lld_busy);
1075
1076	int kblockd_schedule_work(struct work_struct *work)
1077	{
1078	return queue_work(wq: kblockd_workqueue, work);
1079	}
1080	EXPORT_SYMBOL(kblockd_schedule_work);
1081
1082	int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
1083	unsigned long delay)
1084	{
1085	return mod_delayed_work_on(cpu, wq: kblockd_workqueue, dwork, delay);
1086	}
1087	EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
1088
1089	void blk_start_plug_nr_ios(struct blk_plug plug, unsigned* short nr_ios)
1090	{
1091	struct task_struct *tsk = current;
1092
1093	/*
1094	* If this is a nested plug, don't actually assign it.
1095	*/
1096	if (tsk->plug)
1097	return;
1098
1099	plug->cur_ktime = `0`;
1100	plug->mq_list = NULL;
1101	plug->cached_rq = NULL;
1102	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
1103	plug->rq_count = `0`;
1104	plug->multiple_queues = false;
1105	plug->has_elevator = false;
1106	INIT_LIST_HEAD(list: &plug->cb_list);
1107
1108	/*
1109	* Store ordering should not be needed here, since a potential
1110	* preempt will imply a full memory barrier
1111	*/
1112	tsk->plug = plug;
1113	}
1114
/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:	The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
	/* Equivalent to blk_start_plug_nr_ios() with an I/O count of one. */
	blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);
1143
1144	static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
1145	{
1146	LIST_HEAD(callbacks);
1147
1148	while (!list_empty(head: &plug->cb_list)) {
1149	list_splice_init(list: &plug->cb_list, head: &callbacks);
1150
1151	while (!list_empty(head: &callbacks)) {
1152	struct blk_plug_cb *cb = list_first_entry(&callbacks,
1153	struct blk_plug_cb,
1154	list);
1155	list_del(entry: &cb->list);
1156	cb->callback(cb, from_schedule);
1157	}
1158	}
1159	}
1160
1161	struct blk_plug_cb blk_check_plugged(blk_plug_cb_fn unplug, void* *data,
1162	int size)
1163	{
1164	struct blk_plug *plug = current->plug;
1165	struct blk_plug_cb *cb;
1166
1167	if (!plug)
1168	return NULL;
1169
1170	list_for_each_entry(cb, &plug->cb_list, list)
1171	if (cb->callback == unplug && cb->data == data)
1172	return cb;
1173
1174	/ Not currently on the callback list /
1175	BUG_ON(size < sizeof(*cb));
1176	cb = kzalloc(size, GFP_ATOMIC);
1177	if (cb) {
1178	cb->data = data;
1179	cb->callback = unplug;
1180	list_add(new: &cb->list, head: &plug->cb_list);
1181	}
1182	return cb;
1183	}
1184	EXPORT_SYMBOL(blk_check_plugged);
1185
1186	void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
1187	{
1188	if (!list_empty(head: &plug->cb_list))
1189	flush_plug_callbacks(plug, from_schedule);
1190	blk_mq_flush_plug_list(plug, from_schedule);
1191	/*
1192	* Unconditionally flush out cached requests, even if the unplug
1193	* event came from schedule. Since we know hold references to the
1194	* queue for cached requests, we don't want a blocked task holding
1195	* up a queue freeze/quiesce event.
1196	*/
1197	if (unlikely(!rq_list_empty(plug->cached_rq)))
1198	blk_mq_free_plug_rqs(plug);
1199
1200	plug->cur_ktime = `0`;
1201	current->flags &= ~PF_BLOCK_TS;
1202	}
1203
1204	/**
1205	* blk_finish_plug - mark the end of a batch of submitted I/O
1206	* @plug: The &struct blk_plug passed to blk_start_plug()
1207	*
1208	* Description:
1209	* Indicate that a batch of I/O submissions is complete. This function
1210	* must be paired with an initial call to blk_start_plug(). The intent
1211	* is to allow the block layer to optimize I/O submission. See the
1212	* documentation for blk_start_plug() for more information.
1213	*/
1214	void blk_finish_plug(struct blk_plug *plug)
1215	{
1216	if (plug == current->plug) {
1217	__blk_flush_plug(plug, from_schedule: false);
1218	current->plug = NULL;
1219	}
1220	}
1221	EXPORT_SYMBOL(blk_finish_plug);
1222
1223	void blk_io_schedule(void)
1224	{
1225	/ Prevent hang_check timer from firing at us during very long I/O /
1226	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / `2`;
1227
1228	if (timeout)
1229	io_schedule_timeout(timeout);
1230	else
1231	io_schedule();
1232	}
1233	EXPORT_SYMBOL_GPL(blk_io_schedule);
1234
1235	int __init blk_dev_init(void)
1236	{
1237	BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (`1` << REQ_OP_BITS));
1238	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > `8` *
1239	sizeof_field(struct request, cmd_flags));
1240	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > `8` *
1241	sizeof_field(struct bio, bi_opf));
1242
1243	/ used for unplugging and affects IO latency/throughput - HIGHPRI /
1244	kblockd_workqueue = alloc_workqueue(fmt: "kblockd",
1245	flags: WQ_MEM_RECLAIM \| WQ_HIGHPRI, max_active: `0`);
1246	if (!kblockd_workqueue)
1247	panic(fmt: "Failed to create kblockd\n");
1248
1249	blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);
1250
1251	blk_debugfs_root = debugfs_create_dir(name: "block", NULL);
1252
1253	return `0`;
1254	}
1255

source code of linux/block/blk-core.c