/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#define pr_fmt(fmt) "xen-blkback: " fmt

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/bitmap.h>

#include <xen/events.h>
#include <xen/page.h>
#include <xen/xen.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "common.h"

/*
 * Maximum number of unused free pages to keep in the internal buffer.
 * Setting this too low reduces the memory used by each backend, but can
 * incur a performance penalty.
 *
 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but it
 * can be set to a lower value, which might degrade performance on some
 * intensive IO workloads.
 */

static int max_buffer_pages = 1024;
module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
MODULE_PARM_DESC(max_buffer_pages,
"Maximum number of free pages to keep in each block backend buffer");

/*
 * Maximum number of grants to map persistently in blkback. For maximum
 * performance this should be the total number of grants that can be used
 * to fill the ring, but since this might become too high, especially with
 * the use of indirect descriptors, we set it to a value that provides good
 * performance without using too much memory.
 *
 * When the list of persistent grants is full we clean it up using an LRU
 * algorithm.
 */

static int max_pgrants = 1056;
module_param_named(max_persistent_grants, max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants,
		 "Maximum number of grants to map persistently");

/*
 * How long a persistent grant is allowed to remain allocated without being in
 * use. The time is in seconds, 0 means indefinitely long.
 */

static unsigned int pgrant_timeout = 60;
module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
		   uint, 0644);
MODULE_PARM_DESC(persistent_grant_unused_seconds,
		 "Time in seconds an unused persistent grant is allowed to "
		 "remain allocated. Default is 60, 0 means unlimited.");

/*
 * Maximum number of rings/queues blkback supports; allow as many queues as
 * there are CPUs if the user has not specified a value.
 */
unsigned int xenblk_max_queues;
module_param_named(max_queues, xenblk_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
		 "Maximum number of hardware queues per virtual disk. "
		 "By default it is the number of online CPUs.");

/*
 * Maximum order of pages to be used for the shared ring between front and
 * backend, 4KB page granularity is used.
 */
unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");

/*
 * The LRU mechanism to clean the lists of persistent grants needs to
 * be executed periodically. The time interval between consecutive executions
 * of the purge mechanism is set in ms.
 */
#define LRU_INTERVAL 100

/*
 * When the persistent grants list is full we will remove unused grants
 * from the list. The percent number of grants to be removed at each LRU
 * execution.
 */
#define LRU_PERCENT_CLEAN 5
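
/*
 * Worked example with the defaults above: max_pgrants = 1056 and
 * LRU_PERCENT_CLEAN = 5 make each purge pass target (1056 / 100) * 5 = 50
 * grants, plus however many grants currently exceed max_pgrants (see
 * purge_persistent_gnt()).
 */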

/* Run-time switchable: /sys/module/xen_blkback/parameters/ */
static int log_stats;
module_param(log_stats, int, 0644);

#define BLKBACK_INVALID_HANDLE (~0)

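/*
 * Wraparound-safe: 'jiffies - last_used' is evaluated in unsigned
 * arithmetic, so the comparison stays correct when jiffies wraps.
 */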
static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
{
	return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
			HZ * pgrant_timeout);
}

#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))

static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
				struct blkif_request *req,
				struct pending_req *pending_req);
static void make_response(struct xen_blkif_ring *ring, u64 id,
			  unsigned short op, int st);

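/*
 * Iterate over a grant rb-tree while allowing the node under 'pos' to be
 * erased or freed: the successor is cached in 'n' before the loop body
 * runs, mirroring what list_for_each_entry_safe() does for lists.
 */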
#define foreach_grant_safe(pos, n, rbtree, node) \
	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
	     &(pos)->node != NULL; \
	     (pos) = container_of(n, typeof(*(pos)), node), \
	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)


/*
 * We don't need locking around the persistent grant helpers
 * because blkback uses a single thread for each backend, so we
 * can be sure that these functions will never be called recursively.
 *
 * The only exception to that is put_persistent_gnt, which can be called
 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 * bit operations to modify the flags of a persistent grant and to count
 * the number of used grants.
 */
static int add_persistent_gnt(struct xen_blkif_ring *ring,
			      struct persistent_gnt *persistent_gnt)
{
	struct rb_node **new = NULL, *parent = NULL;
	struct persistent_gnt *this;
	struct xen_blkif *blkif = ring->blkif;

	if (ring->persistent_gnt_c >= max_pgrants) {
		if (!blkif->vbd.overflow_max_grants)
			blkif->vbd.overflow_max_grants = 1;
		return -EBUSY;
	}
	/* Figure out where to put new node */
	new = &ring->persistent_gnts.rb_node;
	while (*new) {
		this = container_of(*new, struct persistent_gnt, node);

		parent = *new;
		if (persistent_gnt->gnt < this->gnt)
			new = &((*new)->rb_left);
		else if (persistent_gnt->gnt > this->gnt)
			new = &((*new)->rb_right);
		else {
			pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
			return -EINVAL;
		}
	}

	persistent_gnt->active = true;
	/* Add new node and rebalance tree. */
	rb_link_node(&(persistent_gnt->node), parent, new);
	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
	ring->persistent_gnt_c++;
	atomic_inc(&ring->persistent_gnt_in_use);
	return 0;
}

static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
						 grant_ref_t gref)
{
	struct persistent_gnt *data;
	struct rb_node *node = NULL;

	node = ring->persistent_gnts.rb_node;
	while (node) {
		data = container_of(node, struct persistent_gnt, node);

		if (gref < data->gnt)
			node = node->rb_left;
		else if (gref > data->gnt)
			node = node->rb_right;
		else {
			if (data->active) {
				pr_alert_ratelimited("requesting a grant already in use\n");
				return NULL;
			}
			data->active = true;
			atomic_inc(&ring->persistent_gnt_in_use);
			return data;
		}
	}
	return NULL;
}

static void put_persistent_gnt(struct xen_blkif_ring *ring,
			       struct persistent_gnt *persistent_gnt)
{
	if (!persistent_gnt->active)
		pr_alert_ratelimited("freeing a grant already unused\n");
	persistent_gnt->last_used = jiffies;
	persistent_gnt->active = false;
	atomic_dec(&ring->persistent_gnt_in_use);
}

static void free_persistent_gnts(struct xen_blkif_ring *ring)
{
	struct rb_root *root = &ring->persistent_gnts;
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt;
	struct rb_node *n;
	int segs_to_unmap = 0;
	struct gntab_unmap_queue_data unmap_data;

	if (RB_EMPTY_ROOT(root))
		return;

	unmap_data.pages = pages;
	unmap_data.unmap_ops = unmap;
	unmap_data.kunmap_ops = NULL;

	foreach_grant_safe(persistent_gnt, n, root, node) {
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);
		gnttab_set_unmap_op(&unmap[segs_to_unmap],
			(unsigned long) pfn_to_kaddr(page_to_pfn(
				persistent_gnt->page)),
			GNTMAP_host_map,
			persistent_gnt->handle);

		pages[segs_to_unmap] = persistent_gnt->page;

		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
		    !rb_next(&persistent_gnt->node)) {

			unmap_data.count = segs_to_unmap;
			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));

			gnttab_page_cache_put(&ring->free_pages, pages,
					      segs_to_unmap);
			segs_to_unmap = 0;
		}

		rb_erase(&persistent_gnt->node, root);
		kfree(persistent_gnt);
		ring->persistent_gnt_c--;
	}

	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
	BUG_ON(ring->persistent_gnt_c != 0);
}

void xen_blkbk_unmap_purged_grants(struct work_struct *work)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt;
	int segs_to_unmap = 0;
	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
	struct gntab_unmap_queue_data unmap_data;

	unmap_data.pages = pages;
	unmap_data.unmap_ops = unmap;
	unmap_data.kunmap_ops = NULL;

	while (!list_empty(&ring->persistent_purge_list)) {
		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
						  struct persistent_gnt,
						  remove_node);
		list_del(&persistent_gnt->remove_node);

		gnttab_set_unmap_op(&unmap[segs_to_unmap],
			vaddr(persistent_gnt->page),
			GNTMAP_host_map,
			persistent_gnt->handle);

		pages[segs_to_unmap] = persistent_gnt->page;

		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
			unmap_data.count = segs_to_unmap;
			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
			gnttab_page_cache_put(&ring->free_pages, pages,
					      segs_to_unmap);
			segs_to_unmap = 0;
		}
		kfree(persistent_gnt);
	}
	if (segs_to_unmap > 0) {
		unmap_data.count = segs_to_unmap;
		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
		gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
	}
}

static void purge_persistent_gnt(struct xen_blkif_ring *ring)
{
	struct persistent_gnt *persistent_gnt;
	struct rb_node *n;
	unsigned int num_clean, total;
	bool scan_used = false;
	struct rb_root *root;

	if (work_busy(&ring->persistent_purge_work)) {
		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
		goto out;
	}

	if (ring->persistent_gnt_c < max_pgrants ||
	    (ring->persistent_gnt_c == max_pgrants &&
	    !ring->blkif->vbd.overflow_max_grants)) {
		num_clean = 0;
	} else {
		num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
		num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
		num_clean = min(ring->persistent_gnt_c, num_clean);
		pr_debug("Going to purge at least %u persistent grants\n",
			 num_clean);
	}
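
	/*
	 * Note: even when num_clean is 0 the scan below still runs, so
	 * unused grants whose pgrant_timeout has expired are reaped
	 * regardless of how full the tree is.
	 */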

	/*
	 * At this point, we can be sure that there will be no calls
	 * to get_persistent_gnt (because we are executing this code from
	 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
	 * which means that the number of currently used grants will go down,
	 * but never up, so we will always be able to remove the requested
	 * number of grants.
	 */

	total = 0;

	BUG_ON(!list_empty(&ring->persistent_purge_list));
	root = &ring->persistent_gnts;
purge_list:
	foreach_grant_safe(persistent_gnt, n, root, node) {
		BUG_ON(persistent_gnt->handle ==
			BLKBACK_INVALID_HANDLE);

		if (persistent_gnt->active)
			continue;
		if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
			continue;
		if (scan_used && total >= num_clean)
			continue;

		rb_erase(&persistent_gnt->node, root);
		list_add(&persistent_gnt->remove_node,
			 &ring->persistent_purge_list);
		total++;
	}
	/*
	 * Check whether we also need to start cleaning
	 * grants that were used since the last purge in order to cope
	 * with the requested number.
	 */
	if (!scan_used && total < num_clean) {
		pr_debug("Still missing %u purged frames\n", num_clean - total);
		scan_used = true;
		goto purge_list;
	}

	if (total) {
		ring->persistent_gnt_c -= total;
		ring->blkif->vbd.overflow_max_grants = 0;

		/* We can defer this work */
		schedule_work(&ring->persistent_purge_work);
		pr_debug("Purged %u/%u\n", num_clean, total);
	}

out:
	return;
}

/*
 * Retrieve a free pending_req structure from the 'pending_free' list.
 */
static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
{
	struct pending_req *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&ring->pending_free_lock, flags);
	if (!list_empty(&ring->pending_free)) {
		req = list_entry(ring->pending_free.next, struct pending_req,
				 free_list);
		list_del(&req->free_list);
	}
	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
	return req;
}

/*
 * Return the 'pending_req' structure back to the free pool. We also
 * wake up the thread if it was waiting for a free pending_req.
 */
static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&ring->pending_free_lock, flags);
	was_empty = list_empty(&ring->pending_free);
	list_add(&req->free_list, &ring->pending_free);
	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
	if (was_empty)
		wake_up(&ring->pending_free_wq);
}

/*
 * Routines for managing virtual block devices (vbds).
 */
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
			     enum req_op operation)
{
	struct xen_vbd *vbd = &blkif->vbd;
	int rc = -EACCES;

	if ((operation != REQ_OP_READ) && vbd->readonly)
		goto out;

	if (likely(req->nr_sects)) {
		blkif_sector_t end = req->sector_number + req->nr_sects;

		if (unlikely(end < req->sector_number))
			goto out;
		if (unlikely(end > vbd_sz(vbd)))
			goto out;
	}

	req->dev  = vbd->pdevice;
	req->bdev = vbd->bdev_handle->bdev;
	rc = 0;

out:
	return rc;
}

static void xen_vbd_resize(struct xen_blkif *blkif)
{
	struct xen_vbd *vbd = &blkif->vbd;
	struct xenbus_transaction xbt;
	int err;
	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
	unsigned long long new_size = vbd_sz(vbd);

	pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
	pr_info("VBD Resize: new size %llu\n", new_size);
	vbd->size = new_size;
again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		pr_warn("Error starting transaction\n");
		return;
	}
	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
			    (unsigned long long)vbd_sz(vbd));
	if (err) {
		pr_warn("Error writing new size\n");
		goto abort;
	}
	/*
	 * Write the current state; we will use this to synchronize
	 * the front-end. If the current state is "connected" the
	 * front-end will get the new size information online.
	 */
	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
	if (err) {
		pr_warn("Error writing the state\n");
		goto abort;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
		pr_warn("Error ending transaction\n");
	return;
abort:
	xenbus_transaction_end(xbt, 1);
}

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(struct xen_blkif_ring *ring)
{
	ring->waiting_reqs = 1;
	wake_up(&ring->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif_ring *ring)
{
	pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
		" | ds %4llu | pg: %4u/%4d\n",
		current->comm, ring->st_oo_req,
		ring->st_rd_req, ring->st_wr_req,
		ring->st_f_req, ring->st_ds_req,
		ring->persistent_gnt_c, max_pgrants);
	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	ring->st_rd_req = 0;
	ring->st_wr_req = 0;
	ring->st_oo_req = 0;
	ring->st_ds_req = 0;
}
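/*
 * Main loop of the per-ring kthread: wait for requests (or for the LRU
 * interval to elapse), process the ring, issue a late EOI once all
 * pending work has been consumed, then do the housekeeping: purge the
 * persistent-grant tree, shrink the free-page pool and optionally log
 * statistics.
 */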
int xen_blkif_schedule(void *arg)
{
	struct xen_blkif_ring *ring = arg;
	struct xen_blkif *blkif = ring->blkif;
	struct xen_vbd *vbd = &blkif->vbd;
	unsigned long timeout;
	int ret;
	bool do_eoi;
	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;

	set_freezable();
	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;
		if (unlikely(vbd->size != vbd_sz(vbd)))
			xen_vbd_resize(blkif);

		timeout = msecs_to_jiffies(LRU_INTERVAL);

		timeout = wait_event_interruptible_timeout(
			ring->wq,
			ring->waiting_reqs || kthread_should_stop(),
			timeout);
		if (timeout == 0)
			goto purge_gnt_list;
		timeout = wait_event_interruptible_timeout(
			ring->pending_free_wq,
			!list_empty(&ring->pending_free) ||
			kthread_should_stop(),
			timeout);
		if (timeout == 0)
			goto purge_gnt_list;

		do_eoi = ring->waiting_reqs;

		ring->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		ret = do_block_io_op(ring, &eoi_flags);
		if (ret > 0)
			ring->waiting_reqs = 1;
		if (ret == -EACCES)
			wait_event_interruptible(ring->shutdown_wq,
						 kthread_should_stop());

		if (do_eoi && !ring->waiting_reqs) {
			xen_irq_lateeoi(ring->irq, eoi_flags);
			eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
		}

purge_gnt_list:
		if (blkif->vbd.feature_gnt_persistent &&
		    time_after(jiffies, ring->next_lru)) {
			purge_persistent_gnt(ring);
			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
		}

		/* Shrink the free pages pool if it is too large. */
		if (time_before(jiffies, blkif->buffer_squeeze_end))
			gnttab_page_cache_shrink(&ring->free_pages, 0);
		else
			gnttab_page_cache_shrink(&ring->free_pages,
						 max_buffer_pages);

		if (log_stats && time_after(jiffies, ring->st_print))
			print_stats(ring);
	}

	/* Drain pending purge work */
	flush_work(&ring->persistent_purge_work);

	if (log_stats)
		print_stats(ring);

	ring->xenblkd = NULL;

	return 0;
}

/*
 * Remove persistent grants and empty the pool of free pages
 */
void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
{
	/* Free all persistent grant pages */
	free_persistent_gnts(ring);

	/* Since we are shutting down remove all pages from the buffer */
	gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
}

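/*
 * Build the unmap batch for a request's pages: grants that were mapped
 * persistently are merely released via put_persistent_gnt(), pages that
 * were never mapped are skipped, and only the remainder get a real unmap
 * op. Returns the number of unmap operations prepared.
 */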
static unsigned int xen_blkbk_unmap_prepare(
	struct xen_blkif_ring *ring,
	struct grant_page **pages,
	unsigned int num,
	struct gnttab_unmap_grant_ref *unmap_ops,
	struct page **unmap_pages)
{
	unsigned int i, invcount = 0;

	for (i = 0; i < num; i++) {
		if (pages[i]->persistent_gnt != NULL) {
			put_persistent_gnt(ring, pages[i]->persistent_gnt);
			continue;
		}
		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
			continue;
		unmap_pages[invcount] = pages[i]->page;
		gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
				    GNTMAP_host_map, pages[i]->handle);
		pages[i]->handle = BLKBACK_INVALID_HANDLE;
		invcount++;
	}

	return invcount;
}

static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
{
	struct pending_req *pending_req = (struct pending_req *)(data->data);
	struct xen_blkif_ring *ring = pending_req->ring;
	struct xen_blkif *blkif = ring->blkif;

	/*
	 * BUG_ON used to reproduce existing behaviour,
	 * but is this the best way to deal with this?
	 */
	BUG_ON(result);

	gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
	make_response(ring, pending_req->id,
		      pending_req->operation, pending_req->status);
	free_req(ring, pending_req);
	/*
	 * Make sure the request is freed before releasing blkif,
	 * or there could be a race between free_req and the
	 * cleanup done in xen_blkif_free during shutdown.
	 *
	 * NB: The fact that we might try to wake up pending_free_wq
	 * before drain_complete (in case there's a drain going on)
	 * is not a problem with our current implementation
	 * because we can be sure there's no thread waiting on
	 * pending_free_wq if there's a drain going on, but it has
	 * to be taken into account if the current model is changed.
	 */
	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
		complete(&blkif->drain_complete);
	}
	xen_blkif_put(blkif);
}

static void xen_blkbk_unmap_and_respond(struct pending_req *req)
{
	struct gntab_unmap_queue_data *work = &req->gnttab_unmap_data;
	struct xen_blkif_ring *ring = req->ring;
	struct grant_page **pages = req->segments;
	unsigned int invcount;

	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
					   req->unmap, req->unmap_pages);

	work->data = req;
	work->done = xen_blkbk_unmap_and_respond_callback;
	work->unmap_ops = req->unmap;
	work->kunmap_ops = NULL;
	work->pages = req->unmap_pages;
	work->count = invcount;

	gnttab_unmap_refs_async(&req->gnttab_unmap_data);
}


/*
 * Unmap the grant references.
 *
 * This could accumulate ops up to the batch size to reduce the number
 * of hypercalls, but since this is only used in error paths there's
 * no real need.
 */
static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
			    struct grant_page *pages[],
			    int num)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int invcount = 0;
	int ret;

	while (num) {
		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);

		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
						   unmap, unmap_pages);
		if (invcount) {
			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
			BUG_ON(ret);
			gnttab_page_cache_put(&ring->free_pages, unmap_pages,
					      invcount);
		}
		pages += batch;
		num -= batch;
	}
}

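/*
 * Map the grants referenced by pages[0..num-1], reusing persistent
 * mappings where possible. Grants are mapped in batches of up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST per hypercall (the 'again' loop below),
 * and freshly mapped grants are promoted to persistent ones while there
 * is room left in the tree.
 */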
static int xen_blkbk_map(struct xen_blkif_ring *ring,
			 struct grant_page *pages[],
			 int num, bool ro)
{
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt *persistent_gnt = NULL;
	phys_addr_t addr = 0;
	int i, seg_idx, new_map_idx;
	int segs_to_map = 0;
	int ret = 0;
	int last_map = 0, map_until = 0;
	int use_persistent_gnts;
	struct xen_blkif *blkif = ring->blkif;

	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);

	/*
	 * Fill out preq.nr_sects with the proper number of sectors, and set
	 * up map[..] with the PFN of the page in our domain and the
	 * corresponding grant reference for each page.
	 */
again:
	for (i = map_until; i < num; i++) {
		uint32_t flags;

		if (use_persistent_gnts) {
			persistent_gnt = get_persistent_gnt(
				ring,
				pages[i]->gref);
		}

		if (persistent_gnt) {
			/*
			 * We are using persistent grants and
			 * the grant is already mapped
			 */
			pages[i]->page = persistent_gnt->page;
			pages[i]->persistent_gnt = persistent_gnt;
		} else {
			if (gnttab_page_cache_get(&ring->free_pages,
						  &pages[i]->page)) {
				gnttab_page_cache_put(&ring->free_pages,
						      pages_to_gnt,
						      segs_to_map);
				ret = -ENOMEM;
				goto out;
			}
			addr = vaddr(pages[i]->page);
			pages_to_gnt[segs_to_map] = pages[i]->page;
			pages[i]->persistent_gnt = NULL;
			flags = GNTMAP_host_map;
			if (!use_persistent_gnts && ro)
				flags |= GNTMAP_readonly;
			gnttab_set_map_op(&map[segs_to_map++], addr,
					  flags, pages[i]->gref,
					  blkif->domid);
		}
		map_until = i + 1;
		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
			break;
	}

	if (segs_to_map)
		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);

	/*
	 * Now swizzle the MFN in our domain with the MFN from the other domain
	 * so that when we access vaddr(pending_req,i) it has the contents of
	 * the page from the other domain.
	 */
	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
		if (!pages[seg_idx]->persistent_gnt) {
			/* This is a newly mapped grant */
			BUG_ON(new_map_idx >= segs_to_map);
			if (unlikely(map[new_map_idx].status != 0)) {
				pr_debug("invalid buffer -- could not remap it\n");
				gnttab_page_cache_put(&ring->free_pages,
						      &pages[seg_idx]->page, 1);
				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
				ret |= !ret;
				goto next;
			}
			pages[seg_idx]->handle = map[new_map_idx].handle;
		} else {
			continue;
		}
		if (use_persistent_gnts &&
		    ring->persistent_gnt_c < max_pgrants) {
			/*
			 * We are using persistent grants, the grant is
			 * not mapped but we might have room for it.
			 */
			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
						 GFP_KERNEL);
			if (!persistent_gnt) {
				/*
				 * If we don't have enough memory to
				 * allocate the persistent_gnt struct
				 * map this grant non-persistently
				 */
				goto next;
			}
			persistent_gnt->gnt = map[new_map_idx].ref;
			persistent_gnt->handle = map[new_map_idx].handle;
			persistent_gnt->page = pages[seg_idx]->page;
			if (add_persistent_gnt(ring,
					       persistent_gnt)) {
				kfree(persistent_gnt);
				persistent_gnt = NULL;
				goto next;
			}
			pages[seg_idx]->persistent_gnt = persistent_gnt;
			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
				 persistent_gnt->gnt, ring->persistent_gnt_c,
				 max_pgrants);
			goto next;
		}
		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
			blkif->vbd.overflow_max_grants = 1;
			pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
				 blkif->domid, blkif->vbd.handle);
		}
		/*
		 * We could not map this grant persistently, so use it as
		 * a non-persistent grant.
		 */
next:
		new_map_idx++;
	}
	segs_to_map = 0;
	last_map = map_until;
	if (!ret && map_until != num)
		goto again;

out:
	for (i = last_map; i < num; i++) {
		/* Don't zap current batch's valid persistent grants. */
		if (i >= map_until)
			pages[i]->persistent_gnt = NULL;
		pages[i]->handle = BLKBACK_INVALID_HANDLE;
	}

	return ret;
}

static int xen_blkbk_map_seg(struct pending_req *pending_req)
{
	int rc;

	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
			   pending_req->nr_segs,
			   (pending_req->operation != BLKIF_OP_READ));

	return rc;
}

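/*
 * An indirect request stores its segments in separate grant-mapped
 * frames rather than in the ring slot itself: each frame holds
 * SEGS_PER_INDIRECT_FRAME 'struct blkif_request_segment' entries.
 * Map those frames, copy the segment descriptors out, then unmap them.
 */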
static int xen_blkbk_parse_indirect(struct blkif_request *req,
				    struct pending_req *pending_req,
				    struct seg_buf seg[],
				    struct phys_req *preq)
{
	struct grant_page **pages = pending_req->indirect_pages;
	struct xen_blkif_ring *ring = pending_req->ring;
	int indirect_grefs, rc, n, nseg, i;
	struct blkif_request_segment *segments = NULL;

	nseg = pending_req->nr_segs;
	indirect_grefs = INDIRECT_PAGES(nseg);
	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);

	for (i = 0; i < indirect_grefs; i++)
		pages[i]->gref = req->u.indirect.indirect_grefs[i];

	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
	if (rc)
		goto unmap;

	for (n = 0; n < nseg; n++) {
		uint8_t first_sect, last_sect;

		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
			/* Map indirect segments */
			if (segments)
				kunmap_atomic(segments);
			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
		}
		i = n % SEGS_PER_INDIRECT_FRAME;

		pending_req->segments[n]->gref = segments[i].gref;

		first_sect = READ_ONCE(segments[i].first_sect);
		last_sect = READ_ONCE(segments[i].last_sect);
		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
			rc = -EINVAL;
			goto unmap;
		}

		seg[n].nsec = last_sect - first_sect + 1;
		seg[n].offset = first_sect << 9;
		preq->nr_sects += seg[n].nsec;
	}

unmap:
	if (segments)
		kunmap_atomic(segments);
	xen_blkbk_unmap(ring, pages, indirect_grefs);
	return rc;
}

static int dispatch_discard_io(struct xen_blkif_ring *ring,
			       struct blkif_request *req)
{
	int err = 0;
	int status = BLKIF_RSP_OKAY;
	struct xen_blkif *blkif = ring->blkif;
	struct block_device *bdev = blkif->vbd.bdev_handle->bdev;
	struct phys_req preq;

	xen_blkif_get(blkif);

	preq.sector_number = req->u.discard.sector_number;
	preq.nr_sects = req->u.discard.nr_sectors;

	err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
	if (err) {
		pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
			preq.sector_number,
			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
		goto fail_response;
	}
	ring->st_ds_req++;

	if (blkif->vbd.discard_secure &&
	    (req->u.discard.flag & BLKIF_DISCARD_SECURE))
		err = blkdev_issue_secure_erase(bdev,
				req->u.discard.sector_number,
				req->u.discard.nr_sectors, GFP_KERNEL);
	else
		err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
					   req->u.discard.nr_sectors, GFP_KERNEL);

fail_response:
	if (err == -EOPNOTSUPP) {
		pr_debug("discard op failed, not supported\n");
		status = BLKIF_RSP_EOPNOTSUPP;
	} else if (err)
		status = BLKIF_RSP_ERROR;

	make_response(ring, req->u.discard.id, req->operation, status);
	xen_blkif_put(blkif);
	return err;
}

static int dispatch_other_io(struct xen_blkif_ring *ring,
			     struct blkif_request *req,
			     struct pending_req *pending_req)
{
	free_req(ring, pending_req);
	make_response(ring, req->u.other.id, req->operation,
		      BLKIF_RSP_EOPNOTSUPP);
	return -EIO;
}

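/*
 * Wait for all in-flight I/O on this ring to complete. Used to implement
 * BLKIF_OP_WRITE_BARRIER: dispatch_rw_block_io() drains before issuing
 * the barrier write. The completion is signalled from
 * xen_blkbk_unmap_and_respond_callback() when 'inflight' reaches zero.
 */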
static void xen_blk_drain_io(struct xen_blkif_ring *ring)
{
	struct xen_blkif *blkif = ring->blkif;

	atomic_set(&blkif->drain, 1);
	do {
		if (atomic_read(&ring->inflight) == 0)
			break;
		wait_for_completion_interruptible_timeout(
				&blkif->drain_complete, HZ);

		if (!atomic_read(&blkif->drain))
			break;
	} while (!kthread_should_stop());
	atomic_set(&blkif->drain, 0);
}

static void __end_block_io_op(struct pending_req *pending_req,
			      blk_status_t error)
{
	/* An error fails the entire request. */
	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
	    error == BLK_STS_NOTSUPP) {
		pr_debug("flush diskcache op failed, not supported\n");
		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
		   error == BLK_STS_NOTSUPP) {
		pr_debug("write barrier op failed, not supported\n");
		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if (error) {
		pr_debug("Buffer not up-to-date at end of operation,"
			 " error=%d\n", error);
		pending_req->status = BLKIF_RSP_ERROR;
	}

	/*
	 * If all of the bio's have completed it is time to unmap
	 * the grant references associated with 'request' and provide
	 * the proper response on the ring.
	 */
	if (atomic_dec_and_test(&pending_req->pendcnt))
		xen_blkbk_unmap_and_respond(pending_req);
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio)
{
	__end_block_io_op(bio->bi_private, bio->bi_status);
	bio_put(bio);
}

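/*
 * The ring can carry one of three request layouts (native, 32-bit and
 * 64-bit x86 ABI). The two helpers below translate the foreign layouts
 * into the native one field by field; READ_ONCE() is used on the fields
 * consulted more than once (operation, nr_segments) because the shared
 * ring can be modified by the frontend while we copy.
 */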
static void blkif_get_x86_32_req(struct blkif_request *dst,
				 const struct blkif_x86_32_request *src)
{
	unsigned int i, n;

	dst->operation = READ_ONCE(src->operation);

	switch (dst->operation) {
	case BLKIF_OP_READ:
	case BLKIF_OP_WRITE:
	case BLKIF_OP_WRITE_BARRIER:
	case BLKIF_OP_FLUSH_DISKCACHE:
		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
		dst->u.rw.handle = src->u.rw.handle;
		dst->u.rw.id = src->u.rw.id;
		dst->u.rw.sector_number = src->u.rw.sector_number;
		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
			  dst->u.rw.nr_segments);
		for (i = 0; i < n; i++)
			dst->u.rw.seg[i] = src->u.rw.seg[i];
		break;

	case BLKIF_OP_DISCARD:
		dst->u.discard.flag = src->u.discard.flag;
		dst->u.discard.id = src->u.discard.id;
		dst->u.discard.sector_number = src->u.discard.sector_number;
		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
		break;

	case BLKIF_OP_INDIRECT:
		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
		dst->u.indirect.nr_segments =
			READ_ONCE(src->u.indirect.nr_segments);
		dst->u.indirect.handle = src->u.indirect.handle;
		dst->u.indirect.id = src->u.indirect.id;
		dst->u.indirect.sector_number = src->u.indirect.sector_number;
		n = min(MAX_INDIRECT_PAGES,
			INDIRECT_PAGES(dst->u.indirect.nr_segments));
		for (i = 0; i < n; i++)
			dst->u.indirect.indirect_grefs[i] =
				src->u.indirect.indirect_grefs[i];
		break;

	default:
		/*
		 * Don't know how to translate this op. Only get the
		 * ID so failure can be reported to the frontend.
		 */
		dst->u.other.id = src->u.other.id;
		break;
	}
}

static void blkif_get_x86_64_req(struct blkif_request *dst,
				 const struct blkif_x86_64_request *src)
{
	unsigned int i, n;

	dst->operation = READ_ONCE(src->operation);

	switch (dst->operation) {
	case BLKIF_OP_READ:
	case BLKIF_OP_WRITE:
	case BLKIF_OP_WRITE_BARRIER:
	case BLKIF_OP_FLUSH_DISKCACHE:
		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
		dst->u.rw.handle = src->u.rw.handle;
		dst->u.rw.id = src->u.rw.id;
		dst->u.rw.sector_number = src->u.rw.sector_number;
		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
			  dst->u.rw.nr_segments);
		for (i = 0; i < n; i++)
			dst->u.rw.seg[i] = src->u.rw.seg[i];
		break;

	case BLKIF_OP_DISCARD:
		dst->u.discard.flag = src->u.discard.flag;
		dst->u.discard.id = src->u.discard.id;
		dst->u.discard.sector_number = src->u.discard.sector_number;
		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
		break;

	case BLKIF_OP_INDIRECT:
		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
		dst->u.indirect.nr_segments =
			READ_ONCE(src->u.indirect.nr_segments);
		dst->u.indirect.handle = src->u.indirect.handle;
		dst->u.indirect.id = src->u.indirect.id;
		dst->u.indirect.sector_number = src->u.indirect.sector_number;
		n = min(MAX_INDIRECT_PAGES,
			INDIRECT_PAGES(dst->u.indirect.nr_segments));
		for (i = 0; i < n; i++)
			dst->u.indirect.indirect_grefs[i] =
				src->u.indirect.indirect_grefs[i];
		break;

	default:
		/*
		 * Don't know how to translate this op. Only get the
		 * ID so failure can be reported to the frontend.
		 */
		dst->u.other.id = src->u.other.id;
		break;
	}
}

/*
 * Copy a 'struct blkif_request' from the ring buffer (which has the
 * sectors we want, the number of them, grant references, etc.) and
 * transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
{
	union blkif_back_rings *blk_rings = &ring->blk_rings;
	struct blkif_request req;
	struct pending_req *pending_req;
	RING_IDX rc, rp;
	int more_to_do = 0;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
		rc = blk_rings->common.rsp_prod_pvt;
		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
		return -EACCES;
	}
	while (rc != rp) {

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
			break;

		/* We've seen a request, so clear spurious eoi flag. */
		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;

		if (kthread_should_stop()) {
			more_to_do = 1;
			break;
		}

		pending_req = alloc_req(ring);
		if (NULL == pending_req) {
			ring->st_oo_req++;
			more_to_do = 1;
			break;
		}

		switch (ring->blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */

		/* Apply all sanity checks to /private copy/ of request. */
		barrier();

		switch (req.operation) {
		case BLKIF_OP_READ:
		case BLKIF_OP_WRITE:
		case BLKIF_OP_WRITE_BARRIER:
		case BLKIF_OP_FLUSH_DISKCACHE:
		case BLKIF_OP_INDIRECT:
			if (dispatch_rw_block_io(ring, &req, pending_req))
				goto done;
			break;
		case BLKIF_OP_DISCARD:
			free_req(ring, pending_req);
			if (dispatch_discard_io(ring, &req))
				goto done;
			break;
		default:
			if (dispatch_other_io(ring, &req, pending_req))
				goto done;
			break;
		}

		/* Yield point for this unbounded loop. */
		cond_resched();
	}
done:
	return more_to_do;
}

static int
do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
{
	union blkif_back_rings *blk_rings = &ring->blk_rings;
	int more_to_do;

	do {
		more_to_do = __do_block_io_op(ring, eoi_flags);
		if (more_to_do)
			break;

		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
	} while (more_to_do);

	return more_to_do;
}

/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio' and
 * call 'submit_bio' to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
				struct blkif_request *req,
				struct pending_req *pending_req)
{
	struct phys_req preq;
	struct seg_buf *seg = pending_req->seg;
	unsigned int nseg;
	struct bio *bio = NULL;
	struct bio **biolist = pending_req->biolist;
	int i, nbio = 0;
	enum req_op operation;
	blk_opf_t operation_flags = 0;
	struct blk_plug plug;
	bool drain = false;
	struct grant_page **pages = pending_req->segments;
	unsigned short req_operation;

	req_operation = req->operation == BLKIF_OP_INDIRECT ?
			req->u.indirect.indirect_op : req->operation;

	if ((req->operation == BLKIF_OP_INDIRECT) &&
	    (req_operation != BLKIF_OP_READ) &&
	    (req_operation != BLKIF_OP_WRITE)) {
		pr_debug("Invalid indirect operation (%u)\n", req_operation);
		goto fail_response;
	}

	switch (req_operation) {
	case BLKIF_OP_READ:
		ring->st_rd_req++;
		operation = REQ_OP_READ;
		break;
	case BLKIF_OP_WRITE:
		ring->st_wr_req++;
		operation = REQ_OP_WRITE;
		operation_flags = REQ_SYNC | REQ_IDLE;
		break;
	case BLKIF_OP_WRITE_BARRIER:
		drain = true;
		fallthrough;
	case BLKIF_OP_FLUSH_DISKCACHE:
		ring->st_f_req++;
		operation = REQ_OP_WRITE;
		operation_flags = REQ_PREFLUSH;
		break;
	default:
		operation = 0; /* make gcc happy */
		goto fail_response;
		break;
	}

	/* Check that the number of segments is sane. */
	nseg = req->operation == BLKIF_OP_INDIRECT ?
	       req->u.indirect.nr_segments : req->u.rw.nr_segments;

	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
		     (nseg > MAX_INDIRECT_SEGMENTS))) {
		pr_debug("Bad number of segments in request (%d)\n", nseg);
		/* Haven't submitted any bio's yet. */
		goto fail_response;
	}

	preq.nr_sects = 0;

	pending_req->ring = ring;
	pending_req->id = req->u.rw.id;
	pending_req->operation = req_operation;
	pending_req->status = BLKIF_RSP_OKAY;
	pending_req->nr_segs = nseg;

	if (req->operation != BLKIF_OP_INDIRECT) {
		preq.dev = req->u.rw.handle;
		preq.sector_number = req->u.rw.sector_number;
		for (i = 0; i < nseg; i++) {
			pages[i]->gref = req->u.rw.seg[i].gref;
			seg[i].nsec = req->u.rw.seg[i].last_sect -
				req->u.rw.seg[i].first_sect + 1;
			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
			    (req->u.rw.seg[i].last_sect <
			     req->u.rw.seg[i].first_sect))
				goto fail_response;
			preq.nr_sects += seg[i].nsec;
		}
	} else {
		preq.dev = req->u.indirect.handle;
		preq.sector_number = req->u.indirect.sector_number;
		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
			goto fail_response;
	}

	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
			 operation == REQ_OP_READ ? "read" : "write",
			 preq.sector_number,
			 preq.sector_number + preq.nr_sects,
			 ring->blkif->vbd.pdevice);
		goto fail_response;
	}

	/*
	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
	 * is set there.
	 */
	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
			pr_debug("Misaligned I/O request from domain %d\n",
				 ring->blkif->domid);
			goto fail_response;
		}
	}

	/*
	 * Wait on all outstanding I/O's and once that has been completed
	 * issue the flush.
	 */
	if (drain)
		xen_blk_drain_io(pending_req->ring);

	/*
	 * If we have failed at this point, we need to undo the M2P override,
	 * set gnttab_set_unmap_op on all of the grant references and perform
	 * the hypercall to unmap the grants - that is all done in
	 * xen_blkbk_unmap.
	 */
	if (xen_blkbk_map_seg(pending_req))
		goto fail_flush;

	/*
	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
	 */
	xen_blkif_get(ring->blkif);
	atomic_inc(&ring->inflight);

	for (i = 0; i < nseg; i++) {
		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     pages[i]->page,
				     seg[i].nsec << 9,
				     seg[i].offset) == 0)) {
			bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
					operation | operation_flags,
					GFP_KERNEL);
			biolist[nbio++] = bio;
			bio->bi_private = pending_req;
			bio->bi_end_io = end_block_io_op;
			bio->bi_iter.bi_sector = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}

	/* This will be hit if the operation was a flush or discard. */
	if (!bio) {
		BUG_ON(operation_flags != REQ_PREFLUSH);

		bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
				GFP_KERNEL);
		biolist[nbio++] = bio;
		bio->bi_private = pending_req;
		bio->bi_end_io = end_block_io_op;
	}

	atomic_set(&pending_req->pendcnt, nbio);
	blk_start_plug(&plug);

	for (i = 0; i < nbio; i++)
		submit_bio(biolist[i]);

	/* Let the I/Os go.. */
	blk_finish_plug(&plug);

	if (operation == REQ_OP_READ)
		ring->st_rd_sect += preq.nr_sects;
	else if (operation == REQ_OP_WRITE)
		ring->st_wr_sect += preq.nr_sects;

	return 0;

fail_flush:
	xen_blkbk_unmap(ring, pending_req->segments,
			pending_req->nr_segs);
fail_response:
	/* Haven't submitted any bio's yet. */
	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
	free_req(ring, pending_req);
	msleep(1); /* back off a bit */
	return -EIO;
}

/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif_ring *ring, u64 id,
			  unsigned short op, int st)
{
	struct blkif_response *resp;
	unsigned long flags;
	union blkif_back_rings *blk_rings;
	int notify;

	spin_lock_irqsave(&ring->blk_ring_lock, flags);
	blk_rings = &ring->blk_rings;
	/* Place on the response ring for the relevant domain. */
	switch (ring->blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		resp = RING_GET_RESPONSE(&blk_rings->native,
					 blk_rings->native.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_32:
		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
					 blk_rings->x86_32.rsp_prod_pvt);
		break;
	case BLKIF_PROTOCOL_X86_64:
		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
					 blk_rings->x86_64.rsp_prod_pvt);
		break;
	default:
		BUG();
	}

	resp->id = id;
	resp->operation = op;
	resp->status = st;

	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
	if (notify)
		notify_remote_via_irq(ring->irq);
}

static int __init xen_blkif_init(void)
{
	int rc = 0;

	if (!xen_domain())
		return -ENODEV;

	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
	}

	if (xenblk_max_queues == 0)
		xenblk_max_queues = num_online_cpus();

	rc = xen_blkif_interface_init();
	if (rc)
		goto failed_init;

	rc = xen_blkif_xenbus_init();
	if (rc)
		goto failed_init;

failed_init:
	return rc;
}

module_init(xen_blkif_init);

static void __exit xen_blkif_fini(void)
{
	xen_blkif_xenbus_fini();
	xen_blkif_interface_fini();
}

module_exit(xen_blkif_fini);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");