xen-blkfront.c source code [linux/drivers/block/xen-blkfront.c]

1	/*
2	* blkfront.c
3	*
4	* XenLinux virtual block device driver.
5	*
6	* Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7	* Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8	* Copyright (c) 2004, Christian Limpach
9	* Copyright (c) 2004, Andrew Warfield
10	* Copyright (c) 2005, Christopher Clark
11	* Copyright (c) 2005, XenSource Ltd
12	*
13	* This program is free software; you can redistribute it and/or
14	* modify it under the terms of the GNU General Public License version 2
15	* as published by the Free Software Foundation; or, when distributed
16	* separately from the Linux kernel or incorporated into other
17	* software packages, subject to the following license:
18	*
19	* Permission is hereby granted, free of charge, to any person obtaining a copy
20	* of this source file (the "Software"), to deal in the Software without
21	* restriction, including without limitation the rights to use, copy, modify,
22	* merge, publish, distribute, sublicense, and/or sell copies of the Software,
23	* and to permit persons to whom the Software is furnished to do so, subject to
24	* the following conditions:
25	*
26	* The above copyright notice and this permission notice shall be included in
27	* all copies or substantial portions of the Software.
28	*
29	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35	* IN THE SOFTWARE.
36	*/
37
38	#include <linux/interrupt.h>
39	#include <linux/blkdev.h>
40	#include <linux/blk-mq.h>
41	#include <linux/hdreg.h>
42	#include <linux/cdrom.h>
43	#include <linux/module.h>
44	#include <linux/slab.h>
45	#include <linux/major.h>
46	#include <linux/mutex.h>
47	#include <linux/scatterlist.h>
48	#include <linux/bitmap.h>
49	#include <linux/list.h>
50	#include <linux/workqueue.h>
51	#include <linux/sched/mm.h>
52
53	#include <xen/xen.h>
54	#include <xen/xenbus.h>
55	#include <xen/grant_table.h>
56	#include <xen/events.h>
57	#include <xen/page.h>
58	#include <xen/platform_pci.h>
59
60	#include <xen/interface/grant_table.h>
61	#include <xen/interface/io/blkif.h>
62	#include <xen/interface/io/protocols.h>
63
64	#include <asm/xen/hypervisor.h>
65
/*
 * The minimal size of segment supported by the block framework is PAGE_SIZE.
 * When Linux is using a different page size than Xen, it may not be possible
 * to put all the data in a single segment.
 * This can happen when the backend doesn't support indirect descriptor and
 * therefore the maximum amount of data that a request can carry is
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
 *
 * Note that we only support one extra request. So the Linux page size
 * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
 * 88KB.
 *
 * HAS_EXTRA_REQ is true when one Linux page spans more Xen pages than a
 * single (non-indirect) request has segments.
 */
#define HAS_EXTRA_REQ	(BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
79
/*
 * Connection state of a blkfront device (stored in blkfront_info.connected).
 * Requests are only queued while the state is BLKIF_STATE_CONNECTED.
 */
enum blkif_state {
	BLKIF_STATE_DISCONNECTED,
	BLKIF_STATE_CONNECTED,
	BLKIF_STATE_SUSPENDED,
	BLKIF_STATE_ERROR,
};
86
87	struct grant {
88	grant_ref_t gref;
89	struct page *page;
90	struct list_head node;
91	};
92
/*
 * Progress of a shadow entry. An entry is marked REQ_PROCESSING when it is
 * taken from the free list and REQ_WAITING once the request has been copied
 * to the ring page.
 */
enum blk_req_status {
	REQ_PROCESSING,
	REQ_WAITING,
	REQ_DONE,
	REQ_ERROR,
	REQ_EOPNOTSUPP,
};
100
101	struct blk_shadow {
102	struct blkif_request req;
103	struct request *request;
104	struct grant **grants_used;
105	struct grant **indirect_grants;
106	struct scatterlist *sg;
107	unsigned int num_sg;
108	enum blk_req_status status;
109
110	#define NO_ASSOCIATED_ID ~0UL
111	/*
112	* Id of the sibling if we ever need 2 requests when handling a
113	* block I/O request
114	*/
115	unsigned long associated_id;
116	};
117
118	struct blkif_req {
119	blk_status_t error;
120	};
121
122	static inline struct blkif_req blkif_req(struct* request *rq)
123	{
124	return blk_mq_rq_to_pdu(rq);
125	}
126
127	static DEFINE_MUTEX(blkfront_mutex);
128	static const struct block_device_operations xlvbd_block_fops;
129	static struct delayed_work blkfront_work;
130	static LIST_HEAD(info_list);
131
132	/*
133	* Maximum number of segments in indirect requests, the actual value used by
134	* the frontend driver is the minimum of this value and the value provided
135	* by the backend driver.
136	*/
137
138	static unsigned int xen_blkif_max_segments = `32`;
139	module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, `0444`);
140	MODULE_PARM_DESC(max_indirect_segments,
141	"Maximum amount of segments in indirect requests (default is 32)");
142
143	static unsigned int xen_blkif_max_queues = `4`;
144	module_param_named(max_queues, xen_blkif_max_queues, uint, `0444`);
145	MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
146
147	/*
148	* Maximum order of pages to be used for the shared ring between front and
149	* backend, 4KB page granularity is used.
150	*/
151	static unsigned int xen_blkif_max_ring_order;
152	module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, `0444`);
153	MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
154
155	static bool __read_mostly xen_blkif_trusted = true;
156	module_param_named(trusted, xen_blkif_trusted, bool, `0644`);
157	MODULE_PARM_DESC(trusted, "Is the backend trusted");
158
/* Number of request slots in a ring made of (info)->nr_ring_pages Xen pages. */
#define BLK_RING_SIZE(info)	\
	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
161
/*
 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
 * characters are enough. Define to 20 to keep consistent with backend.
 */
#define RINGREF_NAME_LEN (20)
/*
 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
 */
#define QUEUE_NAME_LEN (17)
171
172	/*
173	* Per-ring info.
174	* Every blkfront device can associate with one or more blkfront_ring_info,
175	* depending on how many hardware queues/rings to be used.
176	*/
177	struct blkfront_ring_info {
178	/ Lock to protect data in every ring buffer. /
179	spinlock_t ring_lock;
180	struct blkif_front_ring ring;
181	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
182	unsigned int evtchn, irq;
183	struct work_struct work;
184	struct gnttab_free_callback callback;
185	struct list_head indirect_pages;
186	struct list_head grants;
187	unsigned int persistent_gnts_c;
188	unsigned long shadow_free;
189	struct blkfront_info *dev_info;
190	struct blk_shadow shadow[];
191	};
192
193	/*
194	* We have one of these per vbd, whether ide, scsi or 'other'. They
195	* hang in private_data off the gendisk structure. We may end up
196	* putting all kinds of interesting stuff here :-)
197	*/
198	struct blkfront_info
199	{
200	struct mutex mutex;
201	struct xenbus_device *xbdev;
202	struct gendisk *gd;
203	u16 sector_size;
204	unsigned int physical_sector_size;
205	unsigned long vdisk_info;
206	int vdevice;
207	blkif_vdev_t handle;
208	enum blkif_state connected;
209	/ Number of pages per ring buffer. /
210	unsigned int nr_ring_pages;
211	struct request_queue *rq;
212	unsigned int feature_flush:`1`;
213	unsigned int feature_fua:`1`;
214	unsigned int feature_discard:`1`;
215	unsigned int feature_secdiscard:`1`;
216	/ Connect-time cached feature_persistent parameter /
217	unsigned int feature_persistent_parm:`1`;
218	/ Persistent grants feature negotiation result /
219	unsigned int feature_persistent:`1`;
220	unsigned int bounce:`1`;
221	unsigned int discard_granularity;
222	unsigned int discard_alignment;
223	/ Number of 4KB segments handled /
224	unsigned int max_indirect_segments;
225	int is_ready;
226	struct blk_mq_tag_set tag_set;
227	struct blkfront_ring_info *rinfo;
228	unsigned int nr_rings;
229	unsigned int rinfo_size;
230	/ Save uncomplete reqs and bios for migration. /
231	struct list_head requests;
232	struct bio_list bio_list;
233	struct list_head info_list;
234	};
235
/*
 * Bitmap of reserved minor numbers: nr_minors is its size in bits and
 * minor_lock protects both the pointer and the bits.
 */
static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);
239
#define PARTS_PER_DISK		16
#define PARTS_PER_EXT_DISK      256

/* Split a non-extended virtual device number into major (bits 8+) and minor (low 8 bits). */
#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

/* Bit 28 marks an extended virtual device number; the remaining bits are the minor. */
#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
#define EMULATED_HD_DISK_MINOR_OFFSET (0)
#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
#define EMULATED_SD_DISK_MINOR_OFFSET (0)
#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)

#define DEV_NAME	"xvd"	/* name in /dev */
256
/*
 * Grants are always the same size as a Xen page (i.e. 4KB).
 * A physical segment is always the same size as a Linux page.
 * Number of grants per physical segment
 */
#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)

/* Number of segment descriptors that fit in one indirect page. */
#define GRANTS_PER_INDIRECT_FRAME \
	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))

/* Number of indirect pages needed to describe _grants segments. */
#define INDIRECT_GREFS(_grants)		\
	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
269
/* Forward declarations; the definitions are not in this section of the file. */
static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
static void blkfront_gather_backend_features(struct blkfront_info *info);
static int negotiate_mq(struct blkfront_info *info);
273
/*
 * Iterate over every ring of @info: @ptr walks the rinfo array in steps of
 * (info)->rinfo_size bytes (entries are variably sized, so plain pointer
 * increment cannot be used) and @idx counts from 0 to nr_rings - 1.
 */
#define for_each_rinfo(info, ptr, idx)				\
	for ((ptr) = (info)->rinfo, (idx) = 0;			\
	     (idx) < (info)->nr_rings;				\
	     (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size)
278
279	static inline struct blkfront_ring_info *
280	get_rinfo(const struct blkfront_info info, unsigned* int i)
281	{
282	BUG_ON(i >= info->nr_rings);
283	return (void )info->rinfo + i info->rinfo_size;
284	}
285
286	static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
287	{
288	unsigned long free = rinfo->shadow_free;
289
290	BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
291	rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
292	rinfo->shadow[free].req.u.rw.id = `0x0fffffee`; / debug /
293	return free;
294	}
295
296	static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
297	unsigned long id)
298	{
299	if (rinfo->shadow[id].req.u.rw.id != id)
300	return -EINVAL;
301	if (rinfo->shadow[id].request == NULL)
302	return -EINVAL;
303	rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
304	rinfo->shadow[id].request = NULL;
305	rinfo->shadow_free = id;
306	return `0`;
307	}
308
309	static int fill_grant_buffer(struct blkfront_ring_info rinfo, int* num)
310	{
311	struct blkfront_info *info = rinfo->dev_info;
312	struct page *granted_page;
313	struct grant gnt_list_entry, n;
314	int i = `0`;
315
316	while (i < num) {
317	gnt_list_entry = kzalloc(size: sizeof(struct grant), GFP_NOIO);
318	if (!gnt_list_entry)
319	goto out_of_memory;
320
321	if (info->bounce) {
322	granted_page = alloc_page(GFP_NOIO \| __GFP_ZERO);
323	if (!granted_page) {
324	kfree(objp: gnt_list_entry);
325	goto out_of_memory;
326	}
327	gnt_list_entry->page = granted_page;
328	}
329
330	gnt_list_entry->gref = INVALID_GRANT_REF;
331	list_add(new: &gnt_list_entry->node, head: &rinfo->grants);
332	i++;
333	}
334
335	return `0`;
336
337	out_of_memory:
338	list_for_each_entry_safe(gnt_list_entry, n,
339	&rinfo->grants, node) {
340	list_del(entry: &gnt_list_entry->node);
341	if (info->bounce)
342	__free_page(gnt_list_entry->page);
343	kfree(objp: gnt_list_entry);
344	i--;
345	}
346	BUG_ON(i != `0`);
347	return -ENOMEM;
348	}
349
350	static struct grant get_free_grant(struct* blkfront_ring_info *rinfo)
351	{
352	struct grant *gnt_list_entry;
353
354	BUG_ON(list_empty(&rinfo->grants));
355	gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
356	node);
357	list_del(entry: &gnt_list_entry->node);
358
359	if (gnt_list_entry->gref != INVALID_GRANT_REF)
360	rinfo->persistent_gnts_c--;
361
362	return gnt_list_entry;
363	}
364
365	static inline void grant_foreign_access(const struct grant *gnt_list_entry,
366	const struct blkfront_info *info)
367	{
368	gnttab_page_grant_foreign_access_ref_one(ref: gnt_list_entry->gref,
369	domid: info->xbdev->otherend_id,
370	page: gnt_list_entry->page,
371	readonly: `0`);
372	}
373
374	static struct grant get_grant(grant_ref_t gref_head,
375	unsigned long gfn,
376	struct blkfront_ring_info *rinfo)
377	{
378	struct grant *gnt_list_entry = get_free_grant(rinfo);
379	struct blkfront_info *info = rinfo->dev_info;
380
381	if (gnt_list_entry->gref != INVALID_GRANT_REF)
382	return gnt_list_entry;
383
384	/ Assign a gref to this page /
385	gnt_list_entry->gref = gnttab_claim_grant_reference(pprivate_head: gref_head);
386	BUG_ON(gnt_list_entry->gref == -ENOSPC);
387	if (info->bounce)
388	grant_foreign_access(gnt_list_entry, info);
389	else {
390	/ Grant access to the GFN passed by the caller /
391	gnttab_grant_foreign_access_ref(ref: gnt_list_entry->gref,
392	domid: info->xbdev->otherend_id,
393	frame: gfn, readonly: `0`);
394	}
395
396	return gnt_list_entry;
397	}
398
399	static struct grant get_indirect_grant(grant_ref_t gref_head,
400	struct blkfront_ring_info *rinfo)
401	{
402	struct grant *gnt_list_entry = get_free_grant(rinfo);
403	struct blkfront_info *info = rinfo->dev_info;
404
405	if (gnt_list_entry->gref != INVALID_GRANT_REF)
406	return gnt_list_entry;
407
408	/ Assign a gref to this page /
409	gnt_list_entry->gref = gnttab_claim_grant_reference(pprivate_head: gref_head);
410	BUG_ON(gnt_list_entry->gref == -ENOSPC);
411	if (!info->bounce) {
412	struct page *indirect_page;
413
414	/ Fetch a pre-allocated page to use for indirect grefs /
415	BUG_ON(list_empty(&rinfo->indirect_pages));
416	indirect_page = list_first_entry(&rinfo->indirect_pages,
417	struct page, lru);
418	list_del(entry: &indirect_page->lru);
419	gnt_list_entry->page = indirect_page;
420	}
421	grant_foreign_access(gnt_list_entry, info);
422
423	return gnt_list_entry;
424	}
425
426	static const char op_name(int* op)
427	{
428	static const char *const names[] = {
429	[BLKIF_OP_READ] = "read",
430	[BLKIF_OP_WRITE] = "write",
431	[BLKIF_OP_WRITE_BARRIER] = "barrier",
432	[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
433	[BLKIF_OP_DISCARD] = "discard" };
434
435	if (op < `0` \|\| op >= ARRAY_SIZE(names))
436	return "unknown";
437
438	if (!names[op])
439	return "reserved";
440
441	return names[op];
442	}
443	static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
444	{
445	unsigned int end = minor + nr;
446	int rc;
447
448	if (end > nr_minors) {
449	unsigned long bitmap, old;
450
451	bitmap = kcalloc(BITS_TO_LONGS(end), size: sizeof(*bitmap),
452	GFP_KERNEL);
453	if (bitmap == NULL)
454	return -ENOMEM;
455
456	spin_lock(lock: &minor_lock);
457	if (end > nr_minors) {
458	old = minors;
459	memcpy(bitmap, minors,
460	BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
461	minors = bitmap;
462	nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
463	} else
464	old = bitmap;
465	spin_unlock(lock: &minor_lock);
466	kfree(objp: old);
467	}
468
469	spin_lock(lock: &minor_lock);
470	if (find_next_bit(addr: minors, size: end, offset: minor) >= end) {
471	bitmap_set(map: minors, start: minor, nbits: nr);
472	rc = `0`;
473	} else
474	rc = -EBUSY;
475	spin_unlock(lock: &minor_lock);
476
477	return rc;
478	}
479
480	static void xlbd_release_minors(unsigned int minor, unsigned int nr)
481	{
482	unsigned int end = minor + nr;
483
484	BUG_ON(end > nr_minors);
485	spin_lock(lock: &minor_lock);
486	bitmap_clear(map: minors, start: minor, nbits: nr);
487	spin_unlock(lock: &minor_lock);
488	}
489
490	static void blkif_restart_queue_callback(void *arg)
491	{
492	struct blkfront_ring_info rinfo = (struct* blkfront_ring_info *)arg;
493	schedule_work(work: &rinfo->work);
494	}
495
496	static int blkif_getgeo(struct block_device bd, struct* hd_geometry *hg)
497	{
498	/ We don't have real geometry info, but let's at least return*
499	values consistent with the size of the device /*
500	sector_t nsect = get_capacity(disk: bd->bd_disk);
501	sector_t cylinders = nsect;
502
503	hg->heads = `0xff`;
504	hg->sectors = `0x3f`;
505	sector_div(cylinders, hg->heads * hg->sectors);
506	hg->cylinders = cylinders;
507	if ((sector_t)(hg->cylinders + `1`) * hg->heads * hg->sectors < nsect)
508	hg->cylinders = `0xffff`;
509	return `0`;
510	}
511
512	static int blkif_ioctl(struct block_device *bdev, blk_mode_t mode,
513	unsigned command, unsigned long argument)
514	{
515	struct blkfront_info *info = bdev->bd_disk->private_data;
516	int i;
517
518	switch (command) {
519	case CDROMMULTISESSION:
520	for (i = `0`; i < sizeof(struct cdrom_multisession); i++)
521	if (put_user(`0`, (char __user *)(argument + i)))
522	return -EFAULT;
523	return `0`;
524	case CDROM_GET_CAPABILITY:
525	if (!(info->vdisk_info & VDISK_CDROM))
526	return -EINVAL;
527	return `0`;
528	default:
529	return -EINVAL;
530	}
531	}
532
533	static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
534	struct request *req,
535	struct blkif_request **ring_req)
536	{
537	unsigned long id;
538
539	*ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
540	rinfo->ring.req_prod_pvt++;
541
542	id = get_id_from_freelist(rinfo);
543	rinfo->shadow[id].request = req;
544	rinfo->shadow[id].status = REQ_PROCESSING;
545	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
546
547	rinfo->shadow[id].req.u.rw.id = id;
548
549	return id;
550	}
551
552	static int blkif_queue_discard_req(struct request req, struct* blkfront_ring_info *rinfo)
553	{
554	struct blkfront_info *info = rinfo->dev_info;
555	struct blkif_request ring_req, final_ring_req;
556	unsigned long id;
557
558	/ Fill out a communications ring structure. /
559	id = blkif_ring_get_request(rinfo, req, ring_req: &final_ring_req);
560	ring_req = &rinfo->shadow[id].req;
561
562	ring_req->operation = BLKIF_OP_DISCARD;
563	ring_req->u.discard.nr_sectors = blk_rq_sectors(rq: req);
564	ring_req->u.discard.id = id;
565	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(rq: req);
566	if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
567	ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
568	else
569	ring_req->u.discard.flag = `0`;
570
571	/ Copy the request to the ring page. /
572	final_ring_req = ring_req;
573	rinfo->shadow[id].status = REQ_WAITING;
574
575	return `0`;
576	}
577
578	struct setup_rw_req {
579	unsigned int grant_idx;
580	struct blkif_request_segment *segments;
581	struct blkfront_ring_info *rinfo;
582	struct blkif_request *ring_req;
583	grant_ref_t gref_head;
584	unsigned int id;
585	/ Only used when persistent grant is used and it's a write request /
586	bool need_copy;
587	unsigned int bvec_off;
588	char *bvec_data;
589
590	bool require_extra_req;
591	struct blkif_request *extra_ring_req;
592	};
593
594	static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
595	unsigned int len, void *data)
596	{
597	struct setup_rw_req *setup = data;
598	int n, ref;
599	struct grant *gnt_list_entry;
600	unsigned int fsect, lsect;
601	/ Convenient aliases /
602	unsigned int grant_idx = setup->grant_idx;
603	struct blkif_request *ring_req = setup->ring_req;
604	struct blkfront_ring_info *rinfo = setup->rinfo;
605	/*
606	* We always use the shadow of the first request to store the list
607	* of grant associated to the block I/O request. This made the
608	* completion more easy to handle even if the block I/O request is
609	* split.
610	*/
611	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
612
613	if (unlikely(setup->require_extra_req &&
614	grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
615	/*
616	* We are using the second request, setup grant_idx
617	* to be the index of the segment array.
618	*/
619	grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
620	ring_req = setup->extra_ring_req;
621	}
622
623	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
624	(grant_idx % GRANTS_PER_INDIRECT_FRAME == `0`)) {
625	if (setup->segments)
626	kunmap_atomic(setup->segments);
627
628	n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
629	gnt_list_entry = get_indirect_grant(gref_head: &setup->gref_head, rinfo);
630	shadow->indirect_grants[n] = gnt_list_entry;
631	setup->segments = kmap_atomic(page: gnt_list_entry->page);
632	ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
633	}
634
635	gnt_list_entry = get_grant(gref_head: &setup->gref_head, gfn, rinfo);
636	ref = gnt_list_entry->gref;
637	/*
638	* All the grants are stored in the shadow of the first
639	* request. Therefore we have to use the global index.
640	*/
641	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
642
643	if (setup->need_copy) {
644	void *shared_data;
645
646	shared_data = kmap_atomic(page: gnt_list_entry->page);
647	/*
648	* this does not wipe data stored outside the
649	* range sg->offset..sg->offset+sg->length.
650	* Therefore, blkback could see data from
651	* previous requests. This is OK as long as
652	* persistent grants are shared with just one
653	* domain. It may need refactoring if this
654	* changes
655	*/
656	memcpy(shared_data + offset,
657	setup->bvec_data + setup->bvec_off,
658	len);
659
660	kunmap_atomic(shared_data);
661	setup->bvec_off += len;
662	}
663
664	fsect = offset >> `9`;
665	lsect = fsect + (len >> `9`) - `1`;
666	if (ring_req->operation != BLKIF_OP_INDIRECT) {
667	ring_req->u.rw.seg[grant_idx] =
668	(struct blkif_request_segment) {
669	.gref = ref,
670	.first_sect = fsect,
671	.last_sect = lsect };
672	} else {
673	setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
674	(struct blkif_request_segment) {
675	.gref = ref,
676	.first_sect = fsect,
677	.last_sect = lsect };
678	}
679
680	(setup->grant_idx)++;
681	}
682
683	static void blkif_setup_extra_req(struct blkif_request *first,
684	struct blkif_request *second)
685	{
686	uint16_t nr_segments = first->u.rw.nr_segments;
687
688	/*
689	* The second request is only present when the first request uses
690	* all its segments. It's always the continuity of the first one.
691	*/
692	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
693
694	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
695	second->u.rw.sector_number = first->u.rw.sector_number +
696	(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / `512`;
697
698	second->u.rw.handle = first->u.rw.handle;
699	second->operation = first->operation;
700	}
701
702	static int blkif_queue_rw_req(struct request req, struct* blkfront_ring_info *rinfo)
703	{
704	struct blkfront_info *info = rinfo->dev_info;
705	struct blkif_request ring_req, extra_ring_req = NULL;
706	struct blkif_request final_ring_req, final_extra_ring_req = NULL;
707	unsigned long id, extra_id = NO_ASSOCIATED_ID;
708	bool require_extra_req = false;
709	int i;
710	struct setup_rw_req setup = {
711	.grant_idx = `0`,
712	.segments = NULL,
713	.rinfo = rinfo,
714	.need_copy = rq_data_dir(req) && info->bounce,
715	};
716
717	/*
718	* Used to store if we are able to queue the request by just using
719	* existing persistent grants, or if we have to get new grants,
720	* as there are not sufficiently many free.
721	*/
722	bool new_persistent_gnts = false;
723	struct scatterlist *sg;
724	int num_sg, max_grefs, num_grant;
725
726	max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
727	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
728	/*
729	* If we are using indirect segments we need to account
730	* for the indirect grefs used in the request.
731	*/
732	max_grefs += INDIRECT_GREFS(max_grefs);
733
734	/ Check if we have enough persistent grants to allocate a requests /
735	if (rinfo->persistent_gnts_c < max_grefs) {
736	new_persistent_gnts = true;
737
738	if (gnttab_alloc_grant_references(
739	count: max_grefs - rinfo->persistent_gnts_c,
740	pprivate_head: &setup.gref_head) < `0`) {
741	gnttab_request_free_callback(
742	callback: &rinfo->callback,
743	fn: blkif_restart_queue_callback,
744	arg: rinfo,
745	count: max_grefs - rinfo->persistent_gnts_c);
746	return `1`;
747	}
748	}
749
750	/ Fill out a communications ring structure. /
751	id = blkif_ring_get_request(rinfo, req, ring_req: &final_ring_req);
752	ring_req = &rinfo->shadow[id].req;
753
754	num_sg = blk_rq_map_sg(q: req->q, rq: req, sglist: rinfo->shadow[id].sg);
755	num_grant = `0`;
756	/ Calculate the number of grant used /
757	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
758	num_grant += gnttab_count_grant(start: sg->offset, len: sg->length);
759
760	require_extra_req = info->max_indirect_segments == `0` &&
761	num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
762	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
763
764	rinfo->shadow[id].num_sg = num_sg;
765	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
766	likely(!require_extra_req)) {
767	/*
768	* The indirect operation can only be a BLKIF_OP_READ or
769	* BLKIF_OP_WRITE
770	*/
771	BUG_ON(req_op(req) == REQ_OP_FLUSH \|\| req->cmd_flags & REQ_FUA);
772	ring_req->operation = BLKIF_OP_INDIRECT;
773	ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
774	BLKIF_OP_WRITE : BLKIF_OP_READ;
775	ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(rq: req);
776	ring_req->u.indirect.handle = info->handle;
777	ring_req->u.indirect.nr_segments = num_grant;
778	} else {
779	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(rq: req);
780	ring_req->u.rw.handle = info->handle;
781	ring_req->operation = rq_data_dir(req) ?
782	BLKIF_OP_WRITE : BLKIF_OP_READ;
783	if (req_op(req) == REQ_OP_FLUSH \|\|
784	(req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) {
785	/*
786	* Ideally we can do an unordered flush-to-disk.
787	* In case the backend onlysupports barriers, use that.
788	* A barrier request a superset of FUA, so we can
789	* implement it the same way. (It's also a FLUSH+FUA,
790	* since it is guaranteed ordered WRT previous writes.)
791	*/
792	if (info->feature_flush && info->feature_fua)
793	ring_req->operation =
794	BLKIF_OP_WRITE_BARRIER;
795	else if (info->feature_flush)
796	ring_req->operation =
797	BLKIF_OP_FLUSH_DISKCACHE;
798	else
799	ring_req->operation = `0`;
800	}
801	ring_req->u.rw.nr_segments = num_grant;
802	if (unlikely(require_extra_req)) {
803	extra_id = blkif_ring_get_request(rinfo, req,
804	ring_req: &final_extra_ring_req);
805	extra_ring_req = &rinfo->shadow[extra_id].req;
806
807	/*
808	* Only the first request contains the scatter-gather
809	* list.
810	*/
811	rinfo->shadow[extra_id].num_sg = `0`;
812
813	blkif_setup_extra_req(first: ring_req, second: extra_ring_req);
814
815	/ Link the 2 requests together /
816	rinfo->shadow[extra_id].associated_id = id;
817	rinfo->shadow[id].associated_id = extra_id;
818	}
819	}
820
821	setup.ring_req = ring_req;
822	setup.id = id;
823
824	setup.require_extra_req = require_extra_req;
825	if (unlikely(require_extra_req))
826	setup.extra_ring_req = extra_ring_req;
827
828	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
829	BUG_ON(sg->offset + sg->length > PAGE_SIZE);
830
831	if (setup.need_copy) {
832	setup.bvec_off = sg->offset;
833	setup.bvec_data = kmap_atomic(page: sg_page(sg));
834	}
835
836	gnttab_foreach_grant_in_range(page: sg_page(sg),
837	offset: sg->offset,
838	len: sg->length,
839	fn: blkif_setup_rw_req_grant,
840	data: &setup);
841
842	if (setup.need_copy)
843	kunmap_atomic(setup.bvec_data);
844	}
845	if (setup.segments)
846	kunmap_atomic(setup.segments);
847
848	/ Copy request(s) to the ring page. /
849	final_ring_req = ring_req;
850	rinfo->shadow[id].status = REQ_WAITING;
851	if (unlikely(require_extra_req)) {
852	final_extra_ring_req = extra_ring_req;
853	rinfo->shadow[extra_id].status = REQ_WAITING;
854	}
855
856	if (new_persistent_gnts)
857	gnttab_free_grant_references(head: setup.gref_head);
858
859	return `0`;
860	}
861
862	/*
863	* Generate a Xen blkfront IO request from a blk layer request. Reads
864	* and writes are handled as expected.
865	*
866	* @req: a request struct
867	*/
868	static int blkif_queue_request(struct request req, struct* blkfront_ring_info *rinfo)
869	{
870	if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
871	return `1`;
872
873	if (unlikely(req_op(req) == REQ_OP_DISCARD \|\|
874	req_op(req) == REQ_OP_SECURE_ERASE))
875	return blkif_queue_discard_req(req, rinfo);
876	else
877	return blkif_queue_rw_req(req, rinfo);
878	}
879
880	static inline void flush_requests(struct blkfront_ring_info *rinfo)
881	{
882	int notify;
883
884	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
885
886	if (notify)
887	notify_remote_via_irq(irq: rinfo->irq);
888	}
889
890	static inline bool blkif_request_flush_invalid(struct request *req,
891	struct blkfront_info *info)
892	{
893	return (blk_rq_is_passthrough(rq: req) \|\|
894	((req_op(req) == REQ_OP_FLUSH) &&
895	!info->feature_flush) \|\|
896	((req->cmd_flags & REQ_FUA) &&
897	!info->feature_fua));
898	}
899
900	static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
901	const struct blk_mq_queue_data *qd)
902	{
903	unsigned long flags;
904	int qid = hctx->queue_num;
905	struct blkfront_info *info = hctx->queue->queuedata;
906	struct blkfront_ring_info *rinfo = NULL;
907
908	rinfo = get_rinfo(info, i: qid);
909	blk_mq_start_request(rq: qd->rq);
910	spin_lock_irqsave(&rinfo->ring_lock, flags);
911	if (RING_FULL(&rinfo->ring))
912	goto out_busy;
913
914	if (blkif_request_flush_invalid(req: qd->rq, info: rinfo->dev_info))
915	goto out_err;
916
917	if (blkif_queue_request(req: qd->rq, rinfo))
918	goto out_busy;
919
920	flush_requests(rinfo);
921	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
922	return BLK_STS_OK;
923
924	out_err:
925	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
926	return BLK_STS_IOERR;
927
928	out_busy:
929	blk_mq_stop_hw_queue(hctx);
930	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
931	return BLK_STS_DEV_RESOURCE;
932	}
933
934	static void blkif_complete_rq(struct request *rq)
935	{
936	blk_mq_end_request(rq, error: blkif_req(rq)->error);
937	}
938
939	static const struct blk_mq_ops blkfront_mq_ops = {
940	.queue_rq = blkif_queue_rq,
941	.complete = blkif_complete_rq,
942	};
943
944	static void blkif_set_queue_limits(struct blkfront_info *info)
945	{
946	struct request_queue *rq = info->rq;
947	struct gendisk *gd = info->gd;
948	unsigned int segments = info->max_indirect_segments ? :
949	BLKIF_MAX_SEGMENTS_PER_REQUEST;
950
951	blk_queue_flag_set(QUEUE_FLAG_VIRT, q: rq);
952
953	if (info->feature_discard) {
954	blk_queue_max_discard_sectors(q: rq, max_discard_sectors: get_capacity(disk: gd));
955	rq->limits.discard_granularity = info->discard_granularity ?:
956	info->physical_sector_size;
957	rq->limits.discard_alignment = info->discard_alignment;
958	if (info->feature_secdiscard)
959	blk_queue_max_secure_erase_sectors(q: rq,
960	max_sectors: get_capacity(disk: gd));
961	}
962
963	/ Hard sector size and max sectors impersonate the equiv. hardware. /
964	blk_queue_logical_block_size(rq, info->sector_size);
965	blk_queue_physical_block_size(rq, info->physical_sector_size);
966	blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / `512`);
967
968	/ Each segment in a request is up to an aligned page in size. /
969	blk_queue_segment_boundary(rq, PAGE_SIZE - `1`);
970	blk_queue_max_segment_size(rq, PAGE_SIZE);
971
972	/ Ensure a merged request will fit in a single I/O ring slot. /
973	blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
974
975	/ Make sure buffer addresses are sector-aligned. /
976	blk_queue_dma_alignment(rq, `511`);
977	}
978
979	static const char flush_info(struct* blkfront_info *info)
980	{
981	if (info->feature_flush && info->feature_fua)
982	return "barrier: enabled;";
983	else if (info->feature_flush)
984	return "flush diskcache: enabled;";
985	else
986	return "barrier or flush: disabled;";
987	}
988
989	static void xlvbd_flush(struct blkfront_info *info)
990	{
991	blk_queue_write_cache(q: info->rq, enabled: info->feature_flush ? true : false,
992	fua: info->feature_fua ? true : false);
993	pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
994	info->gd->disk_name, flush_info(info),
995	"persistent grants:", info->feature_persistent ?
996	"enabled;" : "disabled;", "indirect descriptors:",
997	info->max_indirect_segments ? "enabled;" : "disabled;",
998	"bounce buffer:", info->bounce ? "enabled" : "disabled;");
999	}
1000
1001	static int xen_translate_vdev(int vdevice, int minor, unsigned* int *offset)
1002	{
1003	int major;
1004	major = BLKIF_MAJOR(vdevice);
1005	*minor = BLKIF_MINOR(vdevice);
1006	switch (major) {
1007	case XEN_IDE0_MAJOR:
1008	offset = (minor / `64`) + EMULATED_HD_DISK_NAME_OFFSET;
1009	minor = ((minor / `64`) * PARTS_PER_DISK) +
1010	EMULATED_HD_DISK_MINOR_OFFSET;
1011	break;
1012	case XEN_IDE1_MAJOR:
1013	offset = (minor / `64`) + `2` + EMULATED_HD_DISK_NAME_OFFSET;
1014	minor = (((minor / `64`) + `2`) * PARTS_PER_DISK) +
1015	EMULATED_HD_DISK_MINOR_OFFSET;
1016	break;
1017	case XEN_SCSI_DISK0_MAJOR:
1018	offset = (minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
1019	minor = minor + EMULATED_SD_DISK_MINOR_OFFSET;
1020	break;
1021	case XEN_SCSI_DISK1_MAJOR:
1022	case XEN_SCSI_DISK2_MAJOR:
1023	case XEN_SCSI_DISK3_MAJOR:
1024	case XEN_SCSI_DISK4_MAJOR:
1025	case XEN_SCSI_DISK5_MAJOR:
1026	case XEN_SCSI_DISK6_MAJOR:
1027	case XEN_SCSI_DISK7_MAJOR:
1028	offset = (minor / PARTS_PER_DISK) +
1029	((major - XEN_SCSI_DISK1_MAJOR + `1`) * `16`) +
1030	EMULATED_SD_DISK_NAME_OFFSET;
1031	minor = minor +
1032	((major - XEN_SCSI_DISK1_MAJOR + `1`) * `16` * PARTS_PER_DISK) +
1033	EMULATED_SD_DISK_MINOR_OFFSET;
1034	break;
1035	case XEN_SCSI_DISK8_MAJOR:
1036	case XEN_SCSI_DISK9_MAJOR:
1037	case XEN_SCSI_DISK10_MAJOR:
1038	case XEN_SCSI_DISK11_MAJOR:
1039	case XEN_SCSI_DISK12_MAJOR:
1040	case XEN_SCSI_DISK13_MAJOR:
1041	case XEN_SCSI_DISK14_MAJOR:
1042	case XEN_SCSI_DISK15_MAJOR:
1043	offset = (minor / PARTS_PER_DISK) +
1044	((major - XEN_SCSI_DISK8_MAJOR + `8`) * `16`) +
1045	EMULATED_SD_DISK_NAME_OFFSET;
1046	minor = minor +
1047	((major - XEN_SCSI_DISK8_MAJOR + `8`) * `16` * PARTS_PER_DISK) +
1048	EMULATED_SD_DISK_MINOR_OFFSET;
1049	break;
1050	case XENVBD_MAJOR:
1051	offset = minor / PARTS_PER_DISK;
1052	break;
1053	default:
1054	printk(KERN_WARNING "blkfront: your disk configuration is "
1055	"incorrect, please use an xvd device instead\n");
1056	return -ENODEV;
1057	}
1058	return `0`;
1059	}
1060
/*
 * Write the alphabetic suffix for disk index @n at @ptr and return a pointer
 * just past the last character written. No NUL terminator is added.
 *
 * The encoding is bijective base-26 over 'a'..'z':
 * 0 -> "a", 25 -> "z", 26 -> "aa", 701 -> "zz", 702 -> "aaa".
 */
static char *encode_disk_name(char *ptr, unsigned int n)
{
	/* Emit the more significant letters first. */
	if (n >= 26)
		ptr = encode_disk_name(ptr, n / 26 - 1);
	*ptr = 'a' + n % 26;
	return ptr + 1;
}
1068
1069	static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1070	struct blkfront_info *info, u16 sector_size,
1071	unsigned int physical_sector_size)
1072	{
1073	struct gendisk *gd;
1074	int nr_minors = `1`;
1075	int err;
1076	unsigned int offset;
1077	int minor;
1078	int nr_parts;
1079	char *ptr;
1080
1081	BUG_ON(info->gd != NULL);
1082	BUG_ON(info->rq != NULL);
1083
1084	if ((info->vdevice>>EXT_SHIFT) > `1`) {
1085	/ this is above the extended range; something is wrong /
1086	printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
1087	return -ENODEV;
1088	}
1089
1090	if (!VDEV_IS_EXTENDED(info->vdevice)) {
1091	err = xen_translate_vdev(vdevice: info->vdevice, minor: &minor, offset: &offset);
1092	if (err)
1093	return err;
1094	nr_parts = PARTS_PER_DISK;
1095	} else {
1096	minor = BLKIF_MINOR_EXT(info->vdevice);
1097	nr_parts = PARTS_PER_EXT_DISK;
1098	offset = minor / nr_parts;
1099	if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + `4`)
1100	printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
1101	"emulated IDE disks,\n\t choose an xvd device name"
1102	"from xvde on\n", info->vdevice);
1103	}
1104	if (minor >> MINORBITS) {
1105	pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
1106	info->vdevice, minor);
1107	return -ENODEV;
1108	}
1109
1110	if ((minor % nr_parts) == `0`)
1111	nr_minors = nr_parts;
1112
1113	err = xlbd_reserve_minors(minor, nr: nr_minors);
1114	if (err)
1115	return err;
1116
1117	memset(&info->tag_set, `0`, sizeof(info->tag_set));
1118	info->tag_set.ops = &blkfront_mq_ops;
1119	info->tag_set.nr_hw_queues = info->nr_rings;
1120	if (HAS_EXTRA_REQ && info->max_indirect_segments == `0`) {
1121	/*
1122	* When indirect descriptior is not supported, the I/O request
1123	* will be split between multiple request in the ring.
1124	* To avoid problems when sending the request, divide by
1125	* 2 the depth of the queue.
1126	*/
1127	info->tag_set.queue_depth = BLK_RING_SIZE(info) / `2`;
1128	} else
1129	info->tag_set.queue_depth = BLK_RING_SIZE(info);
1130	info->tag_set.numa_node = NUMA_NO_NODE;
1131	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
1132	info->tag_set.cmd_size = sizeof(struct blkif_req);
1133	info->tag_set.driver_data = info;
1134
1135	err = blk_mq_alloc_tag_set(set: &info->tag_set);
1136	if (err)
1137	goto out_release_minors;
1138
1139	gd = blk_mq_alloc_disk(&info->tag_set, info);
1140	if (IS_ERR(ptr: gd)) {
1141	err = PTR_ERR(ptr: gd);
1142	goto out_free_tag_set;
1143	}
1144
1145	strcpy(p: gd->disk_name, DEV_NAME);
1146	ptr = encode_disk_name(ptr: gd->disk_name + sizeof(DEV_NAME) - `1`, n: offset);
1147	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
1148	if (nr_minors > `1`)
1149	*ptr = `0`;
1150	else
1151	snprintf(buf: ptr, size: gd->disk_name + DISK_NAME_LEN - ptr,
1152	fmt: "%d", minor & (nr_parts - `1`));
1153
1154	gd->major = XENVBD_MAJOR;
1155	gd->first_minor = minor;
1156	gd->minors = nr_minors;
1157	gd->fops = &xlvbd_block_fops;
1158	gd->private_data = info;
1159	set_capacity(disk: gd, size: capacity);
1160
1161	info->rq = gd->queue;
1162	info->gd = gd;
1163	info->sector_size = sector_size;
1164	info->physical_sector_size = physical_sector_size;
1165	blkif_set_queue_limits(info);
1166
1167	xlvbd_flush(info);
1168
1169	if (info->vdisk_info & VDISK_READONLY)
1170	set_disk_ro(disk: gd, read_only: `1`);
1171	if (info->vdisk_info & VDISK_REMOVABLE)
1172	gd->flags \|= GENHD_FL_REMOVABLE;
1173
1174	return `0`;
1175
1176	out_free_tag_set:
1177	blk_mq_free_tag_set(set: &info->tag_set);
1178	out_release_minors:
1179	xlbd_release_minors(minor, nr: nr_minors);
1180	return err;
1181	}
1182
1183	/ Already hold rinfo->ring_lock. /
1184	static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1185	{
1186	if (!RING_FULL(&rinfo->ring))
1187	blk_mq_start_stopped_hw_queues(q: rinfo->dev_info->rq, async: true);
1188	}
1189
1190	static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1191	{
1192	unsigned long flags;
1193
1194	spin_lock_irqsave(&rinfo->ring_lock, flags);
1195	kick_pending_request_queues_locked(rinfo);
1196	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
1197	}
1198
1199	static void blkif_restart_queue(struct work_struct *work)
1200	{
1201	struct blkfront_ring_info rinfo = container_of(work, struct* blkfront_ring_info, work);
1202
1203	if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1204	kick_pending_request_queues(rinfo);
1205	}
1206
1207	static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1208	{
1209	struct grant persistent_gnt, n;
1210	struct blkfront_info *info = rinfo->dev_info;
1211	int i, j, segs;
1212
1213	/*
1214	* Remove indirect pages, this only happens when using indirect
1215	* descriptors but not persistent grants
1216	*/
1217	if (!list_empty(head: &rinfo->indirect_pages)) {
1218	struct page indirect_page, n;
1219
1220	BUG_ON(info->bounce);
1221	list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1222	list_del(entry: &indirect_page->lru);
1223	__free_page(indirect_page);
1224	}
1225	}
1226
1227	/ Remove all persistent grants. /
1228	if (!list_empty(head: &rinfo->grants)) {
1229	list_for_each_entry_safe(persistent_gnt, n,
1230	&rinfo->grants, node) {
1231	list_del(entry: &persistent_gnt->node);
1232	if (persistent_gnt->gref != INVALID_GRANT_REF) {
1233	gnttab_end_foreign_access(ref: persistent_gnt->gref,
1234	NULL);
1235	rinfo->persistent_gnts_c--;
1236	}
1237	if (info->bounce)
1238	__free_page(persistent_gnt->page);
1239	kfree(objp: persistent_gnt);
1240	}
1241	}
1242	BUG_ON(rinfo->persistent_gnts_c != `0`);
1243
1244	for (i = `0`; i < BLK_RING_SIZE(info); i++) {
1245	/*
1246	* Clear persistent grants present in requests already
1247	* on the shared ring
1248	*/
1249	if (!rinfo->shadow[i].request)
1250	goto free_shadow;
1251
1252	segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1253	rinfo->shadow[i].req.u.indirect.nr_segments :
1254	rinfo->shadow[i].req.u.rw.nr_segments;
1255	for (j = `0`; j < segs; j++) {
1256	persistent_gnt = rinfo->shadow[i].grants_used[j];
1257	gnttab_end_foreign_access(ref: persistent_gnt->gref, NULL);
1258	if (info->bounce)
1259	__free_page(persistent_gnt->page);
1260	kfree(objp: persistent_gnt);
1261	}
1262
1263	if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1264	/*
1265	* If this is not an indirect operation don't try to
1266	* free indirect segments
1267	*/
1268	goto free_shadow;
1269
1270	for (j = `0`; j < INDIRECT_GREFS(segs); j++) {
1271	persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1272	gnttab_end_foreign_access(ref: persistent_gnt->gref, NULL);
1273	__free_page(persistent_gnt->page);
1274	kfree(objp: persistent_gnt);
1275	}
1276
1277	free_shadow:
1278	kvfree(addr: rinfo->shadow[i].grants_used);
1279	rinfo->shadow[i].grants_used = NULL;
1280	kvfree(addr: rinfo->shadow[i].indirect_grants);
1281	rinfo->shadow[i].indirect_grants = NULL;
1282	kvfree(addr: rinfo->shadow[i].sg);
1283	rinfo->shadow[i].sg = NULL;
1284	}
1285
1286	/ No more gnttab callback work. /
1287	gnttab_cancel_free_callback(callback: &rinfo->callback);
1288
1289	/ Flush gnttab callback work. Must be done with no locks held. /
1290	flush_work(work: &rinfo->work);
1291
1292	/ Free resources associated with old device channel. /
1293	xenbus_teardown_ring(vaddr: (void **)&rinfo->ring.sring, nr_pages: info->nr_ring_pages,
1294	grefs: rinfo->ring_ref);
1295
1296	if (rinfo->irq)
1297	unbind_from_irqhandler(irq: rinfo->irq, dev_id: rinfo);
1298	rinfo->evtchn = rinfo->irq = `0`;
1299	}
1300
1301	static void blkif_free(struct blkfront_info info, int* suspend)
1302	{
1303	unsigned int i;
1304	struct blkfront_ring_info *rinfo;
1305
1306	/ Prevent new requests being issued until we fix things up. /
1307	info->connected = suspend ?
1308	BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1309	/ No more blkif_request(). /
1310	if (info->rq)
1311	blk_mq_stop_hw_queues(q: info->rq);
1312
1313	for_each_rinfo(info, rinfo, i)
1314	blkif_free_ring(rinfo);
1315
1316	kvfree(addr: info->rinfo);
1317	info->rinfo = NULL;
1318	info->nr_rings = `0`;
1319	}
1320
/* Cursor passed to blkif_copy_from_grant() while copying one sg entry. */
struct copy_from_grant {
	const struct blk_shadow *s;	/* shadow entry whose grants are read */
	unsigned int grant_idx;		/* next index into s->grants_used[] */
	unsigned int bvec_offset;	/* write offset within bvec_data */
	char *bvec_data;		/* mapped destination page */
};
1327
1328	static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1329	unsigned int len, void *data)
1330	{
1331	struct copy_from_grant *info = data;
1332	char *shared_data;
1333	/ Convenient aliases /
1334	const struct blk_shadow *s = info->s;
1335
1336	shared_data = kmap_atomic(page: s->grants_used[info->grant_idx]->page);
1337
1338	memcpy(info->bvec_data + info->bvec_offset,
1339	shared_data + offset, len);
1340
1341	info->bvec_offset += len;
1342	info->grant_idx++;
1343
1344	kunmap_atomic(shared_data);
1345	}
1346
1347	static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1348	{
1349	switch (rsp)
1350	{
1351	case BLKIF_RSP_OKAY:
1352	return REQ_DONE;
1353	case BLKIF_RSP_EOPNOTSUPP:
1354	return REQ_EOPNOTSUPP;
1355	case BLKIF_RSP_ERROR:
1356	default:
1357	return REQ_ERROR;
1358	}
1359	}
1360
1361	/*
1362	* Get the final status of the block request based on two ring response
1363	*/
1364	static int blkif_get_final_status(enum blk_req_status s1,
1365	enum blk_req_status s2)
1366	{
1367	BUG_ON(s1 < REQ_DONE);
1368	BUG_ON(s2 < REQ_DONE);
1369
1370	if (s1 == REQ_ERROR \|\| s2 == REQ_ERROR)
1371	return BLKIF_RSP_ERROR;
1372	else if (s1 == REQ_EOPNOTSUPP \|\| s2 == REQ_EOPNOTSUPP)
1373	return BLKIF_RSP_EOPNOTSUPP;
1374	return BLKIF_RSP_OKAY;
1375	}
1376
1377	/*
1378	* Return values:
1379	* 1 response processed.
1380	* 0 missing further responses.
1381	* -1 error while processing.
1382	*/
1383	static int blkif_completion(unsigned long *id,
1384	struct blkfront_ring_info *rinfo,
1385	struct blkif_response *bret)
1386	{
1387	int i = `0`;
1388	struct scatterlist *sg;
1389	int num_sg, num_grant;
1390	struct blkfront_info *info = rinfo->dev_info;
1391	struct blk_shadow s = &rinfo->shadow[id];
1392	struct copy_from_grant data = {
1393	.grant_idx = `0`,
1394	};
1395
1396	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1397	s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1398
1399	/ The I/O request may be split in two. /
1400	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1401	struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1402
1403	/ Keep the status of the current response in shadow. /
1404	s->status = blkif_rsp_to_req_status(rsp: bret->status);
1405
1406	/ Wait the second response if not yet here. /
1407	if (s2->status < REQ_DONE)
1408	return `0`;
1409
1410	bret->status = blkif_get_final_status(s1: s->status,
1411	s2: s2->status);
1412
1413	/*
1414	* All the grants is stored in the first shadow in order
1415	* to make the completion code simpler.
1416	*/
1417	num_grant += s2->req.u.rw.nr_segments;
1418
1419	/*
1420	* The two responses may not come in order. Only the
1421	* first request will store the scatter-gather list.
1422	*/
1423	if (s2->num_sg != `0`) {
1424	/ Update "id" with the ID of the first response. /
1425	*id = s->associated_id;
1426	s = s2;
1427	}
1428
1429	/*
1430	* We don't need anymore the second request, so recycling
1431	* it now.
1432	*/
1433	if (add_id_to_freelist(rinfo, id: s->associated_id))
1434	WARN(`1`, "%s: can't recycle the second part (id = %ld) of the request\n",
1435	info->gd->disk_name, s->associated_id);
1436	}
1437
1438	data.s = s;
1439	num_sg = s->num_sg;
1440
1441	if (bret->operation == BLKIF_OP_READ && info->bounce) {
1442	for_each_sg(s->sg, sg, num_sg, i) {
1443	BUG_ON(sg->offset + sg->length > PAGE_SIZE);
1444
1445	data.bvec_offset = sg->offset;
1446	data.bvec_data = kmap_atomic(page: sg_page(sg));
1447
1448	gnttab_foreach_grant_in_range(page: sg_page(sg),
1449	offset: sg->offset,
1450	len: sg->length,
1451	fn: blkif_copy_from_grant,
1452	data: &data);
1453
1454	kunmap_atomic(data.bvec_data);
1455	}
1456	}
1457	/ Add the persistent grant into the list of free grants /
1458	for (i = `0`; i < num_grant; i++) {
1459	if (!gnttab_try_end_foreign_access(ref: s->grants_used[i]->gref)) {
1460	/*
1461	* If the grant is still mapped by the backend (the
1462	* backend has chosen to make this grant persistent)
1463	* we add it at the head of the list, so it will be
1464	* reused first.
1465	*/
1466	if (!info->feature_persistent) {
1467	pr_alert("backed has not unmapped grant: %u\n",
1468	s->grants_used[i]->gref);
1469	return -`1`;
1470	}
1471	list_add(new: &s->grants_used[i]->node, head: &rinfo->grants);
1472	rinfo->persistent_gnts_c++;
1473	} else {
1474	/*
1475	* If the grant is not mapped by the backend we add it
1476	* to the tail of the list, so it will not be picked
1477	* again unless we run out of persistent grants.
1478	*/
1479	s->grants_used[i]->gref = INVALID_GRANT_REF;
1480	list_add_tail(new: &s->grants_used[i]->node, head: &rinfo->grants);
1481	}
1482	}
1483	if (s->req.operation == BLKIF_OP_INDIRECT) {
1484	for (i = `0`; i < INDIRECT_GREFS(num_grant); i++) {
1485	if (!gnttab_try_end_foreign_access(ref: s->indirect_grants[i]->gref)) {
1486	if (!info->feature_persistent) {
1487	pr_alert("backed has not unmapped grant: %u\n",
1488	s->indirect_grants[i]->gref);
1489	return -`1`;
1490	}
1491	list_add(new: &s->indirect_grants[i]->node, head: &rinfo->grants);
1492	rinfo->persistent_gnts_c++;
1493	} else {
1494	struct page *indirect_page;
1495
1496	/*
1497	* Add the used indirect page back to the list of
1498	* available pages for indirect grefs.
1499	*/
1500	if (!info->bounce) {
1501	indirect_page = s->indirect_grants[i]->page;
1502	list_add(new: &indirect_page->lru, head: &rinfo->indirect_pages);
1503	}
1504	s->indirect_grants[i]->gref = INVALID_GRANT_REF;
1505	list_add_tail(new: &s->indirect_grants[i]->node, head: &rinfo->grants);
1506	}
1507	}
1508	}
1509
1510	return `1`;
1511	}
1512
1513	static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1514	{
1515	struct request *req;
1516	struct blkif_response bret;
1517	RING_IDX i, rp;
1518	unsigned long flags;
1519	struct blkfront_ring_info rinfo = (struct* blkfront_ring_info *)dev_id;
1520	struct blkfront_info *info = rinfo->dev_info;
1521	unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS;
1522
1523	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1524	xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS);
1525	return IRQ_HANDLED;
1526	}
1527
1528	spin_lock_irqsave(&rinfo->ring_lock, flags);
1529	again:
1530	rp = READ_ONCE(rinfo->ring.sring->rsp_prod);
1531	virt_rmb(); / Ensure we see queued responses up to 'rp'. /
1532	if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) {
1533	pr_alert("%s: illegal number of responses %u\n",
1534	info->gd->disk_name, rp - rinfo->ring.rsp_cons);
1535	goto err;
1536	}
1537
1538	for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1539	unsigned long id;
1540	unsigned int op;
1541
1542	eoiflag = `0`;
1543
1544	RING_COPY_RESPONSE(&rinfo->ring, i, &bret);
1545	id = bret.id;
1546
1547	/*
1548	* The backend has messed up and given us an id that we would
1549	* never have given to it (we stamp it up to BLK_RING_SIZE -
1550	* look in get_id_from_freelist.
1551	*/
1552	if (id >= BLK_RING_SIZE(info)) {
1553	pr_alert("%s: response has incorrect id (%ld)\n",
1554	info->gd->disk_name, id);
1555	goto err;
1556	}
1557	if (rinfo->shadow[id].status != REQ_WAITING) {
1558	pr_alert("%s: response references no pending request\n",
1559	info->gd->disk_name);
1560	goto err;
1561	}
1562
1563	rinfo->shadow[id].status = REQ_PROCESSING;
1564	req = rinfo->shadow[id].request;
1565
1566	op = rinfo->shadow[id].req.operation;
1567	if (op == BLKIF_OP_INDIRECT)
1568	op = rinfo->shadow[id].req.u.indirect.indirect_op;
1569	if (bret.operation != op) {
1570	pr_alert("%s: response has wrong operation (%u instead of %u)\n",
1571	info->gd->disk_name, bret.operation, op);
1572	goto err;
1573	}
1574
1575	if (bret.operation != BLKIF_OP_DISCARD) {
1576	int ret;
1577
1578	/*
1579	* We may need to wait for an extra response if the
1580	* I/O request is split in 2
1581	*/
1582	ret = blkif_completion(id: &id, rinfo, bret: &bret);
1583	if (!ret)
1584	continue;
1585	if (unlikely(ret < `0`))
1586	goto err;
1587	}
1588
1589	if (add_id_to_freelist(rinfo, id)) {
1590	WARN(`1`, "%s: response to %s (id %ld) couldn't be recycled!\n",
1591	info->gd->disk_name, op_name(bret.operation), id);
1592	continue;
1593	}
1594
1595	if (bret.status == BLKIF_RSP_OKAY)
1596	blkif_req(rq: req)->error = BLK_STS_OK;
1597	else
1598	blkif_req(rq: req)->error = BLK_STS_IOERR;
1599
1600	switch (bret.operation) {
1601	case BLKIF_OP_DISCARD:
1602	if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1603	struct request_queue *rq = info->rq;
1604
1605	pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1606	info->gd->disk_name, op_name(bret.operation));
1607	blkif_req(rq: req)->error = BLK_STS_NOTSUPP;
1608	info->feature_discard = `0`;
1609	info->feature_secdiscard = `0`;
1610	blk_queue_max_discard_sectors(q: rq, max_discard_sectors: `0`);
1611	blk_queue_max_secure_erase_sectors(q: rq, max_sectors: `0`);
1612	}
1613	break;
1614	case BLKIF_OP_FLUSH_DISKCACHE:
1615	case BLKIF_OP_WRITE_BARRIER:
1616	if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1617	pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1618	info->gd->disk_name, op_name(bret.operation));
1619	blkif_req(rq: req)->error = BLK_STS_NOTSUPP;
1620	}
1621	if (unlikely(bret.status == BLKIF_RSP_ERROR &&
1622	rinfo->shadow[id].req.u.rw.nr_segments == `0`)) {
1623	pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
1624	info->gd->disk_name, op_name(bret.operation));
1625	blkif_req(rq: req)->error = BLK_STS_NOTSUPP;
1626	}
1627	if (unlikely(blkif_req(req)->error)) {
1628	if (blkif_req(rq: req)->error == BLK_STS_NOTSUPP)
1629	blkif_req(rq: req)->error = BLK_STS_OK;
1630	info->feature_fua = `0`;
1631	info->feature_flush = `0`;
1632	xlvbd_flush(info);
1633	}
1634	fallthrough;
1635	case BLKIF_OP_READ:
1636	case BLKIF_OP_WRITE:
1637	if (unlikely(bret.status != BLKIF_RSP_OKAY))
1638	dev_dbg_ratelimited(&info->xbdev->dev,
1639	"Bad return from blkdev data request: %#x\n",
1640	bret.status);
1641
1642	break;
1643	default:
1644	BUG();
1645	}
1646
1647	if (likely(!blk_should_fake_timeout(req->q)))
1648	blk_mq_complete_request(rq: req);
1649	}
1650
1651	rinfo->ring.rsp_cons = i;
1652
1653	if (i != rinfo->ring.req_prod_pvt) {
1654	int more_to_do;
1655	RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1656	if (more_to_do)
1657	goto again;
1658	} else
1659	rinfo->ring.sring->rsp_event = i + `1`;
1660
1661	kick_pending_request_queues_locked(rinfo);
1662
1663	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
1664
1665	xen_irq_lateeoi(irq, eoi_flags: eoiflag);
1666
1667	return IRQ_HANDLED;
1668
1669	err:
1670	info->connected = BLKIF_STATE_ERROR;
1671
1672	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
1673
1674	/ No EOI in order to avoid further interrupts. /
1675
1676	pr_alert("%s disabled for further use\n", info->gd->disk_name);
1677	return IRQ_HANDLED;
1678	}
1679
1680
1681	static int setup_blkring(struct xenbus_device *dev,
1682	struct blkfront_ring_info *rinfo)
1683	{
1684	struct blkif_sring *sring;
1685	int err;
1686	struct blkfront_info *info = rinfo->dev_info;
1687	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1688
1689	err = xenbus_setup_ring(dev, GFP_NOIO, vaddr: (void **)&sring,
1690	nr_pages: info->nr_ring_pages, grefs: rinfo->ring_ref);
1691	if (err)
1692	goto fail;
1693
1694	XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1695
1696	err = xenbus_alloc_evtchn(dev, port: &rinfo->evtchn);
1697	if (err)
1698	goto fail;
1699
1700	err = bind_evtchn_to_irqhandler_lateeoi(evtchn: rinfo->evtchn, handler: blkif_interrupt,
1701	irqflags: `0`, devname: "blkif", dev_id: rinfo);
1702	if (err <= `0`) {
1703	xenbus_dev_fatal(dev, err,
1704	fmt: "bind_evtchn_to_irqhandler failed");
1705	goto fail;
1706	}
1707	rinfo->irq = err;
1708
1709	return `0`;
1710	fail:
1711	blkif_free(info, suspend: `0`);
1712	return err;
1713	}
1714
1715	/*
1716	* Write out per-ring/queue nodes including ring-ref and event-channel, and each
1717	* ring buffer may have multi pages depending on ->nr_ring_pages.
1718	*/
1719	static int write_per_ring_nodes(struct xenbus_transaction xbt,
1720	struct blkfront_ring_info rinfo, const* char *dir)
1721	{
1722	int err;
1723	unsigned int i;
1724	const char *message = NULL;
1725	struct blkfront_info *info = rinfo->dev_info;
1726
1727	if (info->nr_ring_pages == `1`) {
1728	err = xenbus_printf(t: xbt, dir, node: "ring-ref", fmt: "%u", rinfo->ring_ref[`0`]);
1729	if (err) {
1730	message = "writing ring-ref";
1731	goto abort_transaction;
1732	}
1733	} else {
1734	for (i = `0`; i < info->nr_ring_pages; i++) {
1735	char ring_ref_name[RINGREF_NAME_LEN];
1736
1737	snprintf(buf: ring_ref_name, RINGREF_NAME_LEN, fmt: "ring-ref%u", i);
1738	err = xenbus_printf(t: xbt, dir, node: ring_ref_name,
1739	fmt: "%u", rinfo->ring_ref[i]);
1740	if (err) {
1741	message = "writing ring-ref";
1742	goto abort_transaction;
1743	}
1744	}
1745	}
1746
1747	err = xenbus_printf(t: xbt, dir, node: "event-channel", fmt: "%u", rinfo->evtchn);
1748	if (err) {
1749	message = "writing event-channel";
1750	goto abort_transaction;
1751	}
1752
1753	return `0`;
1754
1755	abort_transaction:
1756	xenbus_transaction_end(t: xbt, abort: `1`);
1757	if (message)
1758	xenbus_dev_fatal(dev: info->xbdev, err, fmt: "%s", message);
1759
1760	return err;
1761	}
1762
1763	/ Enable the persistent grants feature. /
1764	static bool feature_persistent = true;
1765	module_param(feature_persistent, bool, `0644`);
1766	MODULE_PARM_DESC(feature_persistent,
1767	"Enables the persistent grants feature");
1768
1769	/ Common code used when first setting up, and when resuming. /
1770	static int talk_to_blkback(struct xenbus_device *dev,
1771	struct blkfront_info *info)
1772	{
1773	const char *message = NULL;
1774	struct xenbus_transaction xbt;
1775	int err;
1776	unsigned int i, max_page_order;
1777	unsigned int ring_page_order;
1778	struct blkfront_ring_info *rinfo;
1779
1780	if (!info)
1781	return -ENODEV;
1782
1783	/ Check if backend is trusted. /
1784	info->bounce = !xen_blkif_trusted \|\|
1785	!xenbus_read_unsigned(dir: dev->nodename, node: "trusted", default_val: `1`);
1786
1787	max_page_order = xenbus_read_unsigned(dir: info->xbdev->otherend,
1788	node: "max-ring-page-order", default_val: `0`);
1789	ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
1790	info->nr_ring_pages = `1` << ring_page_order;
1791
1792	err = negotiate_mq(info);
1793	if (err)
1794	goto destroy_blkring;
1795
1796	for_each_rinfo(info, rinfo, i) {
1797	/ Create shared ring, alloc event channel. /
1798	err = setup_blkring(dev, rinfo);
1799	if (err)
1800	goto destroy_blkring;
1801	}
1802
1803	again:
1804	err = xenbus_transaction_start(t: &xbt);
1805	if (err) {
1806	xenbus_dev_fatal(dev, err, fmt: "starting transaction");
1807	goto destroy_blkring;
1808	}
1809
1810	if (info->nr_ring_pages > `1`) {
1811	err = xenbus_printf(t: xbt, dir: dev->nodename, node: "ring-page-order", fmt: "%u",
1812	ring_page_order);
1813	if (err) {
1814	message = "writing ring-page-order";
1815	goto abort_transaction;
1816	}
1817	}
1818
1819	/ We already got the number of queues/rings in _probe /
1820	if (info->nr_rings == `1`) {
1821	err = write_per_ring_nodes(xbt, rinfo: info->rinfo, dir: dev->nodename);
1822	if (err)
1823	goto destroy_blkring;
1824	} else {
1825	char *path;
1826	size_t pathsize;
1827
1828	err = xenbus_printf(t: xbt, dir: dev->nodename, node: "multi-queue-num-queues", fmt: "%u",
1829	info->nr_rings);
1830	if (err) {
1831	message = "writing multi-queue-num-queues";
1832	goto abort_transaction;
1833	}
1834
1835	pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1836	path = kmalloc(size: pathsize, GFP_KERNEL);
1837	if (!path) {
1838	err = -ENOMEM;
1839	message = "ENOMEM while writing ring references";
1840	goto abort_transaction;
1841	}
1842
1843	for_each_rinfo(info, rinfo, i) {
1844	memset(path, `0`, pathsize);
1845	snprintf(buf: path, size: pathsize, fmt: "%s/queue-%u", dev->nodename, i);
1846	err = write_per_ring_nodes(xbt, rinfo, dir: path);
1847	if (err) {
1848	kfree(objp: path);
1849	goto destroy_blkring;
1850	}
1851	}
1852	kfree(objp: path);
1853	}
1854	err = xenbus_printf(t: xbt, dir: dev->nodename, node: "protocol", fmt: "%s",
1855	XEN_IO_PROTO_ABI_NATIVE);
1856	if (err) {
1857	message = "writing protocol";
1858	goto abort_transaction;
1859	}
1860	info->feature_persistent_parm = feature_persistent;
1861	err = xenbus_printf(t: xbt, dir: dev->nodename, node: "feature-persistent", fmt: "%u",
1862	info->feature_persistent_parm);
1863	if (err)
1864	dev_warn(&dev->dev,
1865	"writing persistent grants feature to xenbus");
1866
1867	err = xenbus_transaction_end(t: xbt, abort: `0`);
1868	if (err) {
1869	if (err == -EAGAIN)
1870	goto again;
1871	xenbus_dev_fatal(dev, err, fmt: "completing transaction");
1872	goto destroy_blkring;
1873	}
1874
1875	for_each_rinfo(info, rinfo, i) {
1876	unsigned int j;
1877
1878	for (j = `0`; j < BLK_RING_SIZE(info); j++)
1879	rinfo->shadow[j].req.u.rw.id = j + `1`;
1880	rinfo->shadow[BLK_RING_SIZE(info)-`1`].req.u.rw.id = `0x0fffffff`;
1881	}
1882	xenbus_switch_state(dev, new_state: XenbusStateInitialised);
1883
1884	return `0`;
1885
1886	abort_transaction:
1887	xenbus_transaction_end(t: xbt, abort: `1`);
1888	if (message)
1889	xenbus_dev_fatal(dev, err, fmt: "%s", message);
1890	destroy_blkring:
1891	blkif_free(info, suspend: `0`);
1892	return err;
1893	}
1894
1895	static int negotiate_mq(struct blkfront_info *info)
1896	{
1897	unsigned int backend_max_queues;
1898	unsigned int i;
1899	struct blkfront_ring_info *rinfo;
1900
1901	BUG_ON(info->nr_rings);
1902
1903	/ Check if backend supports multiple queues. /
1904	backend_max_queues = xenbus_read_unsigned(dir: info->xbdev->otherend,
1905	node: "multi-queue-max-queues", default_val: `1`);
1906	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
1907	/ We need at least one ring. /
1908	if (!info->nr_rings)
1909	info->nr_rings = `1`;
1910
1911	info->rinfo_size = struct_size(info->rinfo, shadow,
1912	BLK_RING_SIZE(info));
1913	info->rinfo = kvcalloc(n: info->nr_rings, size: info->rinfo_size, GFP_KERNEL);
1914	if (!info->rinfo) {
1915	xenbus_dev_fatal(dev: info->xbdev, err: -ENOMEM, fmt: "allocating ring_info structure");
1916	info->nr_rings = `0`;
1917	return -ENOMEM;
1918	}
1919
1920	for_each_rinfo(info, rinfo, i) {
1921	INIT_LIST_HEAD(list: &rinfo->indirect_pages);
1922	INIT_LIST_HEAD(list: &rinfo->grants);
1923	rinfo->dev_info = info;
1924	INIT_WORK(&rinfo->work, blkif_restart_queue);
1925	spin_lock_init(&rinfo->ring_lock);
1926	}
1927	return `0`;
1928	}
1929
1930	/*
1931	* Entry point to this code when a new device is created. Allocate the basic
1932	* structures and the ring buffer for communication with the backend, and
1933	* inform the backend of the appropriate details for those. Switch to
1934	* Initialised state.
1935	*/
1936	static int blkfront_probe(struct xenbus_device *dev,
1937	const struct xenbus_device_id *id)
1938	{
1939	int err, vdevice;
1940	struct blkfront_info *info;
1941
1942	/ FIXME: Use dynamic device id if this is not set. /
1943	err = xenbus_scanf(XBT_NIL, dir: dev->nodename,
1944	node: "virtual-device", fmt: "%i", &vdevice);
1945	if (err != `1`) {
1946	/ go looking in the extended area instead /
1947	err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
1948	"%i", &vdevice);
1949	if (err != `1`) {
1950	xenbus_dev_fatal(dev, err, "reading virtual-device");
1951	return err;
1952	}
1953	}
1954
1955	if (xen_hvm_domain()) {
1956	char *type;
1957	int len;
1958	/ no unplug has been done: do not hook devices != xen vbds /
1959	if (xen_has_pv_and_legacy_disk_devices()) {
1960	int major;
1961
1962	if (!VDEV_IS_EXTENDED(vdevice))
1963	major = BLKIF_MAJOR(vdevice);
1964	else
1965	major = XENVBD_MAJOR;
1966
1967	if (major != XENVBD_MAJOR) {
1968	printk(KERN_INFO
1969	"%s: HVM does not support vbd %d as xen block device\n",
1970	__func__, vdevice);
1971	return -ENODEV;
1972	}
1973	}
1974	/ do not create a PV cdrom device if we are an HVM guest /
1975	type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1976	if (IS_ERR(type))
1977	return -ENODEV;
1978	if (strncmp(type, "cdrom", `5`) == `0`) {
1979	kfree(type);
1980	return -ENODEV;
1981	}
1982	kfree(type);
1983	}
1984	info = kzalloc(sizeof(*info), GFP_KERNEL);
1985	if (!info) {
1986	xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1987	return -ENOMEM;
1988	}
1989
1990	info->xbdev = dev;
1991
1992	mutex_init(&info->mutex);
1993	info->vdevice = vdevice;
1994	info->connected = BLKIF_STATE_DISCONNECTED;
1995
1996	/ Front end dir is a number, which is used as the id. /
1997	info->handle = simple_strtoul(strrchr(dev->nodename, `'/'`)+`1`, NULL, `0`);
1998	dev_set_drvdata(&dev->dev, info);
1999
2000	mutex_lock(&blkfront_mutex);
2001	list_add(&info->info_list, &info_list);
2002	mutex_unlock(&blkfront_mutex);
2003
2004	return `0`;
2005	}
2006
2007	static int blkif_recover(struct blkfront_info *info)
2008	{
2009	unsigned int r_index;
2010	struct request req, n;
2011	int rc;
2012	struct bio *bio;
2013	unsigned int segs;
2014	struct blkfront_ring_info *rinfo;
2015
2016	blkfront_gather_backend_features(info);
2017	/ Reset limits changed by blk_mq_update_nr_hw_queues(). /
2018	blkif_set_queue_limits(info);
2019	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
2020	blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
2021
2022	for_each_rinfo(info, rinfo, r_index) {
2023	rc = blkfront_setup_indirect(rinfo);
2024	if (rc)
2025	return rc;
2026	}
2027	xenbus_switch_state(dev: info->xbdev, new_state: XenbusStateConnected);
2028
2029	/ Now safe for us to use the shared ring /
2030	info->connected = BLKIF_STATE_CONNECTED;
2031
2032	for_each_rinfo(info, rinfo, r_index) {
2033	/ Kick any other new requests queued since we resumed /
2034	kick_pending_request_queues(rinfo);
2035	}
2036
2037	list_for_each_entry_safe(req, n, &info->requests, queuelist) {
2038	/ Requeue pending requests (flush or discard) /
2039	list_del_init(entry: &req->queuelist);
2040	BUG_ON(req->nr_phys_segments > segs);
2041	blk_mq_requeue_request(rq: req, kick_requeue_list: false);
2042	}
2043	blk_mq_start_stopped_hw_queues(q: info->rq, async: true);
2044	blk_mq_kick_requeue_list(q: info->rq);
2045
2046	while ((bio = bio_list_pop(bl: &info->bio_list)) != NULL) {
2047	/ Traverse the list of pending bios and re-queue them /
2048	submit_bio(bio);
2049	}
2050
2051	return `0`;
2052	}
2053
2054	/*
2055	* We are reconnecting to the backend, due to a suspend/resume, or a backend
2056	* driver restart. We tear down our blkif structure and recreate it, but
2057	* leave the device-layer structures intact so that this is transparent to the
2058	* rest of the kernel.
2059	*/
2060	static int blkfront_resume(struct xenbus_device *dev)
2061	{
2062	struct blkfront_info *info = dev_get_drvdata(dev: &dev->dev);
2063	int err = `0`;
2064	unsigned int i, j;
2065	struct blkfront_ring_info *rinfo;
2066
2067	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
2068
2069	bio_list_init(bl: &info->bio_list);
2070	INIT_LIST_HEAD(list: &info->requests);
2071	for_each_rinfo(info, rinfo, i) {
2072	struct bio_list merge_bio;
2073	struct blk_shadow *shadow = rinfo->shadow;
2074
2075	for (j = `0`; j < BLK_RING_SIZE(info); j++) {
2076	/ Not in use? /
2077	if (!shadow[j].request)
2078	continue;
2079
2080	/*
2081	* Get the bios in the request so we can re-queue them.
2082	*/
2083	if (req_op(req: shadow[j].request) == REQ_OP_FLUSH \|\|
2084	req_op(req: shadow[j].request) == REQ_OP_DISCARD \|\|
2085	req_op(req: shadow[j].request) == REQ_OP_SECURE_ERASE \|\|
2086	shadow[j].request->cmd_flags & REQ_FUA) {
2087	/*
2088	* Flush operations don't contain bios, so
2089	* we need to requeue the whole request
2090	*
2091	* XXX: but this doesn't make any sense for a
2092	* write with the FUA flag set..
2093	*/
2094	list_add(new: &shadow[j].request->queuelist, head: &info->requests);
2095	continue;
2096	}
2097	merge_bio.head = shadow[j].request->bio;
2098	merge_bio.tail = shadow[j].request->biotail;
2099	bio_list_merge(bl: &info->bio_list, bl2: &merge_bio);
2100	shadow[j].request->bio = NULL;
2101	blk_mq_end_request(rq: shadow[j].request, BLK_STS_OK);
2102	}
2103	}
2104
2105	blkif_free(info, suspend: info->connected == BLKIF_STATE_CONNECTED);
2106
2107	err = talk_to_blkback(dev, info);
2108	if (!err)
2109	blk_mq_update_nr_hw_queues(set: &info->tag_set, nr_hw_queues: info->nr_rings);
2110
2111	/*
2112	* We have to wait for the backend to switch to
2113	* connected state, since we want to read which
2114	* features it supports.
2115	*/
2116
2117	return err;
2118	}
2119
2120	static void blkfront_closing(struct blkfront_info *info)
2121	{
2122	struct xenbus_device *xbdev = info->xbdev;
2123	struct blkfront_ring_info *rinfo;
2124	unsigned int i;
2125
2126	if (xbdev->state == XenbusStateClosing)
2127	return;
2128
2129	/ No more blkif_request(). /
2130	if (info->rq && info->gd) {
2131	blk_mq_stop_hw_queues(q: info->rq);
2132	blk_mark_disk_dead(disk: info->gd);
2133	}
2134
2135	for_each_rinfo(info, rinfo, i) {
2136	/ No more gnttab callback work. /
2137	gnttab_cancel_free_callback(callback: &rinfo->callback);
2138
2139	/ Flush gnttab callback work. Must be done with no locks held. /
2140	flush_work(work: &rinfo->work);
2141	}
2142
2143	xenbus_frontend_closed(dev: xbdev);
2144	}
2145
2146	static void blkfront_setup_discard(struct blkfront_info *info)
2147	{
2148	info->feature_discard = `1`;
2149	info->discard_granularity = xenbus_read_unsigned(dir: info->xbdev->otherend,
2150	node: "discard-granularity",
2151	default_val: `0`);
2152	info->discard_alignment = xenbus_read_unsigned(dir: info->xbdev->otherend,
2153	node: "discard-alignment", default_val: `0`);
2154	info->feature_secdiscard =
2155	!!xenbus_read_unsigned(dir: info->xbdev->otherend, node: "discard-secure",
2156	default_val: `0`);
2157	}
2158
2159	static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
2160	{
2161	unsigned int psegs, grants, memflags;
2162	int err, i;
2163	struct blkfront_info *info = rinfo->dev_info;
2164
2165	memflags = memalloc_noio_save();
2166
2167	if (info->max_indirect_segments == `0`) {
2168	if (!HAS_EXTRA_REQ)
2169	grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2170	else {
2171	/*
2172	* When an extra req is required, the maximum
2173	* grants supported is related to the size of the
2174	* Linux block segment.
2175	*/
2176	grants = GRANTS_PER_PSEG;
2177	}
2178	}
2179	else
2180	grants = info->max_indirect_segments;
2181	psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG);
2182
2183	err = fill_grant_buffer(rinfo,
2184	num: (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
2185	if (err)
2186	goto out_of_memory;
2187
2188	if (!info->bounce && info->max_indirect_segments) {
2189	/*
2190	* We are using indirect descriptors but don't have a bounce
2191	* buffer, we need to allocate a set of pages that can be
2192	* used for mapping indirect grefs
2193	*/
2194	int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
2195
2196	BUG_ON(!list_empty(&rinfo->indirect_pages));
2197	for (i = `0`; i < num; i++) {
2198	struct page *indirect_page = alloc_page(GFP_KERNEL \|
2199	__GFP_ZERO);
2200	if (!indirect_page)
2201	goto out_of_memory;
2202	list_add(new: &indirect_page->lru, head: &rinfo->indirect_pages);
2203	}
2204	}
2205
2206	for (i = `0`; i < BLK_RING_SIZE(info); i++) {
2207	rinfo->shadow[i].grants_used =
2208	kvcalloc(n: grants,
2209	size: sizeof(rinfo->shadow[i].grants_used[`0`]),
2210	GFP_KERNEL);
2211	rinfo->shadow[i].sg = kvcalloc(n: psegs,
2212	size: sizeof(rinfo->shadow[i].sg[`0`]),
2213	GFP_KERNEL);
2214	if (info->max_indirect_segments)
2215	rinfo->shadow[i].indirect_grants =
2216	kvcalloc(INDIRECT_GREFS(grants),
2217	size: sizeof(rinfo->shadow[i].indirect_grants[`0`]),
2218	GFP_KERNEL);
2219	if ((rinfo->shadow[i].grants_used == NULL) \|\|
2220	(rinfo->shadow[i].sg == NULL) \|\|
2221	(info->max_indirect_segments &&
2222	(rinfo->shadow[i].indirect_grants == NULL)))
2223	goto out_of_memory;
2224	sg_init_table(rinfo->shadow[i].sg, psegs);
2225	}
2226
2227	memalloc_noio_restore(flags: memflags);
2228
2229	return `0`;
2230
2231	out_of_memory:
2232	for (i = `0`; i < BLK_RING_SIZE(info); i++) {
2233	kvfree(addr: rinfo->shadow[i].grants_used);
2234	rinfo->shadow[i].grants_used = NULL;
2235	kvfree(addr: rinfo->shadow[i].sg);
2236	rinfo->shadow[i].sg = NULL;
2237	kvfree(addr: rinfo->shadow[i].indirect_grants);
2238	rinfo->shadow[i].indirect_grants = NULL;
2239	}
2240	if (!list_empty(head: &rinfo->indirect_pages)) {
2241	struct page indirect_page, n;
2242	list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
2243	list_del(entry: &indirect_page->lru);
2244	__free_page(indirect_page);
2245	}
2246	}
2247
2248	memalloc_noio_restore(flags: memflags);
2249
2250	return -ENOMEM;
2251	}
2252
2253	/*
2254	* Gather all backend feature-*
2255	*/
2256	static void blkfront_gather_backend_features(struct blkfront_info *info)
2257	{
2258	unsigned int indirect_segments;
2259
2260	info->feature_flush = `0`;
2261	info->feature_fua = `0`;
2262
2263	/*
2264	* If there's no "feature-barrier" defined, then it means
2265	* we're dealing with a very old backend which writes
2266	* synchronously; nothing to do.
2267	*
2268	* If there are barriers, then we use flush.
2269	*/
2270	if (xenbus_read_unsigned(dir: info->xbdev->otherend, node: "feature-barrier", default_val: `0`)) {
2271	info->feature_flush = `1`;
2272	info->feature_fua = `1`;
2273	}
2274
2275	/*
2276	* And if there is "feature-flush-cache" use that above
2277	* barriers.
2278	*/
2279	if (xenbus_read_unsigned(dir: info->xbdev->otherend, node: "feature-flush-cache",
2280	default_val: `0`)) {
2281	info->feature_flush = `1`;
2282	info->feature_fua = `0`;
2283	}
2284
2285	if (xenbus_read_unsigned(dir: info->xbdev->otherend, node: "feature-discard", default_val: `0`))
2286	blkfront_setup_discard(info);
2287
2288	if (info->feature_persistent_parm)
2289	info->feature_persistent =
2290	!!xenbus_read_unsigned(dir: info->xbdev->otherend,
2291	node: "feature-persistent", default_val: `0`);
2292	if (info->feature_persistent)
2293	info->bounce = true;
2294
2295	indirect_segments = xenbus_read_unsigned(dir: info->xbdev->otherend,
2296	node: "feature-max-indirect-segments", default_val: `0`);
2297	if (indirect_segments > xen_blkif_max_segments)
2298	indirect_segments = xen_blkif_max_segments;
2299	if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
2300	indirect_segments = `0`;
2301	info->max_indirect_segments = indirect_segments;
2302
2303	if (info->feature_persistent) {
2304	mutex_lock(&blkfront_mutex);
2305	schedule_delayed_work(dwork: &blkfront_work, HZ * `10`);
2306	mutex_unlock(lock: &blkfront_mutex);
2307	}
2308	}
2309
2310	/*
2311	* Invoked when the backend is finally 'ready' (and has told produced
2312	* the details about the physical device - #sectors, size, etc).
2313	*/
2314	static void blkfront_connect(struct blkfront_info *info)
2315	{
2316	unsigned long long sectors;
2317	unsigned long sector_size;
2318	unsigned int physical_sector_size;
2319	int err, i;
2320	struct blkfront_ring_info *rinfo;
2321
2322	switch (info->connected) {
2323	case BLKIF_STATE_CONNECTED:
2324	/*
2325	* Potentially, the back-end may be signalling
2326	* a capacity change; update the capacity.
2327	*/
2328	err = xenbus_scanf(XBT_NIL, dir: info->xbdev->otherend,
2329	node: "sectors", fmt: "%Lu", &sectors);
2330	if (XENBUS_EXIST_ERR(err))
2331	return;
2332	printk(KERN_INFO "Setting capacity to %Lu\n",
2333	sectors);
2334	set_capacity_and_notify(info->gd, sectors);
2335
2336	return;
2337	case BLKIF_STATE_SUSPENDED:
2338	/*
2339	* If we are recovering from suspension, we need to wait
2340	* for the backend to announce it's features before
2341	* reconnecting, at least we need to know if the backend
2342	* supports indirect descriptors, and how many.
2343	*/
2344	blkif_recover(info);
2345	return;
2346
2347	default:
2348	break;
2349	}
2350
2351	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
2352	__func__, info->xbdev->otherend);
2353
2354	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
2355	"sectors", "%llu", &sectors,
2356	"info", "%u", &info->vdisk_info,
2357	"sector-size", "%lu", &sector_size,
2358	NULL);
2359	if (err) {
2360	xenbus_dev_fatal(info->xbdev, err,
2361	"reading backend fields at %s",
2362	info->xbdev->otherend);
2363	return;
2364	}
2365
2366	/*
2367	* physical-sector-size is a newer field, so old backends may not
2368	* provide this. Assume physical sector size to be the same as
2369	* sector_size in that case.
2370	*/
2371	physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend,
2372	"physical-sector-size",
2373	sector_size);
2374	blkfront_gather_backend_features(info);
2375	for_each_rinfo(info, rinfo, i) {
2376	err = blkfront_setup_indirect(rinfo);
2377	if (err) {
2378	xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2379	info->xbdev->otherend);
2380	blkif_free(info, `0`);
2381	break;
2382	}
2383	}
2384
2385	err = xlvbd_alloc_gendisk(sectors, info, sector_size,
2386	physical_sector_size);
2387	if (err) {
2388	xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
2389	info->xbdev->otherend);
2390	goto fail;
2391	}
2392
2393	xenbus_switch_state(info->xbdev, XenbusStateConnected);
2394
2395	/ Kick pending requests. /
2396	info->connected = BLKIF_STATE_CONNECTED;
2397	for_each_rinfo(info, rinfo, i)
2398	kick_pending_request_queues(rinfo);
2399
2400	err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
2401	if (err) {
2402	put_disk(info->gd);
2403	blk_mq_free_tag_set(&info->tag_set);
2404	info->rq = NULL;
2405	goto fail;
2406	}
2407
2408	info->is_ready = `1`;
2409	return;
2410
2411	fail:
2412	blkif_free(info, `0`);
2413	return;
2414	}
2415
2416	/*
2417	* Callback received when the backend's state changes.
2418	*/
2419	static void blkback_changed(struct xenbus_device *dev,
2420	enum xenbus_state backend_state)
2421	{
2422	struct blkfront_info *info = dev_get_drvdata(dev: &dev->dev);
2423
2424	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
2425
2426	switch (backend_state) {
2427	case XenbusStateInitWait:
2428	if (dev->state != XenbusStateInitialising)
2429	break;
2430	if (talk_to_blkback(dev, info))
2431	break;
2432	break;
2433	case XenbusStateInitialising:
2434	case XenbusStateInitialised:
2435	case XenbusStateReconfiguring:
2436	case XenbusStateReconfigured:
2437	case XenbusStateUnknown:
2438	break;
2439
2440	case XenbusStateConnected:
2441	/*
2442	* talk_to_blkback sets state to XenbusStateInitialised
2443	* and blkfront_connect sets it to XenbusStateConnected
2444	* (if connection went OK).
2445	*
2446	* If the backend (or toolstack) decides to poke at backend
2447	* state (and re-trigger the watch by setting the state repeatedly
2448	* to XenbusStateConnected (4)) we need to deal with this.
2449	* This is allowed as this is used to communicate to the guest
2450	* that the size of disk has changed!
2451	*/
2452	if ((dev->state != XenbusStateInitialised) &&
2453	(dev->state != XenbusStateConnected)) {
2454	if (talk_to_blkback(dev, info))
2455	break;
2456	}
2457
2458	blkfront_connect(info);
2459	break;
2460
2461	case XenbusStateClosed:
2462	if (dev->state == XenbusStateClosed)
2463	break;
2464	fallthrough;
2465	case XenbusStateClosing:
2466	blkfront_closing(info);
2467	break;
2468	}
2469	}
2470
2471	static void blkfront_remove(struct xenbus_device *xbdev)
2472	{
2473	struct blkfront_info *info = dev_get_drvdata(dev: &xbdev->dev);
2474
2475	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
2476
2477	if (info->gd)
2478	del_gendisk(gp: info->gd);
2479
2480	mutex_lock(&blkfront_mutex);
2481	list_del(entry: &info->info_list);
2482	mutex_unlock(lock: &blkfront_mutex);
2483
2484	blkif_free(info, suspend: `0`);
2485	if (info->gd) {
2486	xlbd_release_minors(minor: info->gd->first_minor, nr: info->gd->minors);
2487	put_disk(disk: info->gd);
2488	blk_mq_free_tag_set(set: &info->tag_set);
2489	}
2490
2491	kfree(objp: info);
2492	}
2493
2494	static int blkfront_is_ready(struct xenbus_device *dev)
2495	{
2496	struct blkfront_info *info = dev_get_drvdata(dev: &dev->dev);
2497
2498	return info->is_ready && info->xbdev;
2499	}
2500
2501	static const struct block_device_operations xlvbd_block_fops =
2502	{
2503	.owner = THIS_MODULE,
2504	.getgeo = blkif_getgeo,
2505	.ioctl = blkif_ioctl,
2506	.compat_ioctl = blkdev_compat_ptr_ioctl,
2507	};
2508
2509
2510	static const struct xenbus_device_id blkfront_ids[] = {
2511	{ "vbd" },
2512	{ "" }
2513	};
2514
2515	static struct xenbus_driver blkfront_driver = {
2516	.ids = blkfront_ids,
2517	.probe = blkfront_probe,
2518	.remove = blkfront_remove,
2519	.resume = blkfront_resume,
2520	.otherend_changed = blkback_changed,
2521	.is_ready = blkfront_is_ready,
2522	};
2523
2524	static void purge_persistent_grants(struct blkfront_info *info)
2525	{
2526	unsigned int i;
2527	unsigned long flags;
2528	struct blkfront_ring_info *rinfo;
2529
2530	for_each_rinfo(info, rinfo, i) {
2531	struct grant gnt_list_entry, tmp;
2532	LIST_HEAD(grants);
2533
2534	spin_lock_irqsave(&rinfo->ring_lock, flags);
2535
2536	if (rinfo->persistent_gnts_c == `0`) {
2537	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
2538	continue;
2539	}
2540
2541	list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants,
2542	node) {
2543	if (gnt_list_entry->gref == INVALID_GRANT_REF \|\|
2544	!gnttab_try_end_foreign_access(ref: gnt_list_entry->gref))
2545	continue;
2546
2547	list_del(entry: &gnt_list_entry->node);
2548	rinfo->persistent_gnts_c--;
2549	gnt_list_entry->gref = INVALID_GRANT_REF;
2550	list_add_tail(new: &gnt_list_entry->node, head: &grants);
2551	}
2552
2553	list_splice_tail(list: &grants, head: &rinfo->grants);
2554
2555	spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags);
2556	}
2557	}
2558
2559	static void blkfront_delay_work(struct work_struct *work)
2560	{
2561	struct blkfront_info *info;
2562	bool need_schedule_work = false;
2563
2564	/*
2565	* Note that when using bounce buffers but not persistent grants
2566	* there's no need to run blkfront_delay_work because grants are
2567	* revoked in blkif_completion or else an error is reported and the
2568	* connection is closed.
2569	*/
2570
2571	mutex_lock(&blkfront_mutex);
2572
2573	list_for_each_entry(info, &info_list, info_list) {
2574	if (info->feature_persistent) {
2575	need_schedule_work = true;
2576	mutex_lock(&info->mutex);
2577	purge_persistent_grants(info);
2578	mutex_unlock(lock: &info->mutex);
2579	}
2580	}
2581
2582	if (need_schedule_work)
2583	schedule_delayed_work(dwork: &blkfront_work, HZ * `10`);
2584
2585	mutex_unlock(lock: &blkfront_mutex);
2586	}
2587
2588	static int __init xlblk_init(void)
2589	{
2590	int ret;
2591	int nr_cpus = num_online_cpus();
2592
2593	if (!xen_domain())
2594	return -ENODEV;
2595
2596	if (!xen_has_pv_disk_devices())
2597	return -ENODEV;
2598
2599	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
2600	pr_warn("xen_blk: can't get major %d with name %s\n",
2601	XENVBD_MAJOR, DEV_NAME);
2602	return -ENODEV;
2603	}
2604
2605	if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
2606	xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2607
2608	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2609	pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2610	xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2611	xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2612	}
2613
2614	if (xen_blkif_max_queues > nr_cpus) {
2615	pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2616	xen_blkif_max_queues, nr_cpus);
2617	xen_blkif_max_queues = nr_cpus;
2618	}
2619
2620	INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
2621
2622	ret = xenbus_register_frontend(&blkfront_driver);
2623	if (ret) {
2624	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2625	return ret;
2626	}
2627
2628	return `0`;
2629	}
2630	module_init(xlblk_init);
2631
2632
2633	static void __exit xlblk_exit(void)
2634	{
2635	cancel_delayed_work_sync(dwork: &blkfront_work);
2636
2637	xenbus_unregister_driver(drv: &blkfront_driver);
2638	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2639	kfree(objp: minors);
2640	}
2641	module_exit(xlblk_exit);
2642
2643	MODULE_DESCRIPTION("Xen virtual block device frontend");
2644	MODULE_LICENSE("GPL");
2645	MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2646	MODULE_ALIAS("xen:vbd");
2647	MODULE_ALIAS("xenblk");
2648

source code of linux/drivers/block/xen-blkfront.c