1 | /* |
2 | * blkfront.c |
3 | * |
4 | * XenLinux virtual block device driver. |
5 | * |
6 | * Copyright (c) 2003-2004, Keir Fraser & Steve Hand |
7 | * Modifications by Mark A. Williamson are (c) Intel Research Cambridge |
8 | * Copyright (c) 2004, Christian Limpach |
9 | * Copyright (c) 2004, Andrew Warfield |
10 | * Copyright (c) 2005, Christopher Clark |
11 | * Copyright (c) 2005, XenSource Ltd |
12 | * |
13 | * This program is free software; you can redistribute it and/or |
14 | * modify it under the terms of the GNU General Public License version 2 |
15 | * as published by the Free Software Foundation; or, when distributed |
16 | * separately from the Linux kernel or incorporated into other |
17 | * software packages, subject to the following license: |
18 | * |
19 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
20 | * of this source file (the "Software"), to deal in the Software without |
21 | * restriction, including without limitation the rights to use, copy, modify, |
22 | * merge, publish, distribute, sublicense, and/or sell copies of the Software, |
23 | * and to permit persons to whom the Software is furnished to do so, subject to |
24 | * the following conditions: |
25 | * |
26 | * The above copyright notice and this permission notice shall be included in |
27 | * all copies or substantial portions of the Software. |
28 | * |
29 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
30 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
31 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
32 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
33 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
34 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
35 | * IN THE SOFTWARE. |
36 | */ |
37 | |
38 | #include <linux/interrupt.h> |
39 | #include <linux/blkdev.h> |
40 | #include <linux/blk-mq.h> |
41 | #include <linux/hdreg.h> |
42 | #include <linux/cdrom.h> |
43 | #include <linux/module.h> |
44 | #include <linux/slab.h> |
45 | #include <linux/major.h> |
46 | #include <linux/mutex.h> |
47 | #include <linux/scatterlist.h> |
48 | #include <linux/bitmap.h> |
49 | #include <linux/list.h> |
50 | #include <linux/workqueue.h> |
51 | #include <linux/sched/mm.h> |
52 | |
53 | #include <xen/xen.h> |
54 | #include <xen/xenbus.h> |
55 | #include <xen/grant_table.h> |
56 | #include <xen/events.h> |
57 | #include <xen/page.h> |
58 | #include <xen/platform_pci.h> |
59 | |
60 | #include <xen/interface/grant_table.h> |
61 | #include <xen/interface/io/blkif.h> |
62 | #include <xen/interface/io/protocols.h> |
63 | |
64 | #include <asm/xen/hypervisor.h> |
65 | |
66 | /* |
 * The minimal size of a segment supported by the block framework is PAGE_SIZE.
 * When Linux uses a different page size than Xen, it may not be possible
 * to put all the data in a single segment.
 * This can happen when the backend doesn't support indirect descriptors and
 * therefore the maximum amount of data that a request can carry is
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
73 | * |
74 | * Note that we only support one extra request. So the Linux page size |
75 | * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = |
76 | * 88KB. |
77 | */ |
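/*
 * HAS_EXTRA_REQ is true when a single Linux page may need more grants
 * (XEN_PFN_PER_PAGE) than one ring request can describe
 * (BLKIF_MAX_SEGMENTS_PER_REQUEST), e.g. 64KB Linux pages backed by 4KB
 * Xen grants. In that case a second, linked request is used.
 */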
#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
79 | |
80 | enum blkif_state { |
81 | BLKIF_STATE_DISCONNECTED, |
82 | BLKIF_STATE_CONNECTED, |
83 | BLKIF_STATE_SUSPENDED, |
84 | BLKIF_STATE_ERROR, |
85 | }; |
86 | |
87 | struct grant { |
88 | grant_ref_t gref; |
89 | struct page *page; |
90 | struct list_head node; |
91 | }; |
92 | |
93 | enum blk_req_status { |
94 | REQ_PROCESSING, |
95 | REQ_WAITING, |
96 | REQ_DONE, |
97 | REQ_ERROR, |
98 | REQ_EOPNOTSUPP, |
99 | }; |
100 | |
101 | struct blk_shadow { |
102 | struct blkif_request req; |
103 | struct request *request; |
104 | struct grant **grants_used; |
105 | struct grant **indirect_grants; |
106 | struct scatterlist *sg; |
107 | unsigned int num_sg; |
108 | enum blk_req_status status; |
109 | |
110 | #define NO_ASSOCIATED_ID ~0UL |
111 | /* |
112 | * Id of the sibling if we ever need 2 requests when handling a |
113 | * block I/O request |
114 | */ |
115 | unsigned long associated_id; |
116 | }; |
117 | |
118 | struct blkif_req { |
119 | blk_status_t error; |
120 | }; |
121 | |
122 | static inline struct blkif_req *blkif_req(struct request *rq) |
123 | { |
124 | return blk_mq_rq_to_pdu(rq); |
125 | } |
126 | |
127 | static DEFINE_MUTEX(blkfront_mutex); |
128 | static const struct block_device_operations xlvbd_block_fops; |
129 | static struct delayed_work blkfront_work; |
130 | static LIST_HEAD(info_list); |
131 | |
132 | /* |
 * Maximum number of segments in indirect requests; the actual value used by
 * the frontend driver is the minimum of this value and the value provided
 * by the backend driver.
136 | */ |
137 | |
138 | static unsigned int xen_blkif_max_segments = 32; |
139 | module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444); |
140 | MODULE_PARM_DESC(max_indirect_segments, |
141 | "Maximum amount of segments in indirect requests (default is 32)" ); |
142 | |
143 | static unsigned int xen_blkif_max_queues = 4; |
144 | module_param_named(max_queues, xen_blkif_max_queues, uint, 0444); |
145 | MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk" ); |
146 | |
147 | /* |
148 | * Maximum order of pages to be used for the shared ring between front and |
149 | * backend, 4KB page granularity is used. |
150 | */ |
151 | static unsigned int xen_blkif_max_ring_order; |
152 | module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444); |
153 | MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring" ); |
154 | |
155 | static bool __read_mostly xen_blkif_trusted = true; |
156 | module_param_named(trusted, xen_blkif_trusted, bool, 0644); |
157 | MODULE_PARM_DESC(trusted, "Is the backend trusted" ); |
158 | |
159 | #define BLK_RING_SIZE(info) \ |
160 | __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) |
161 | |
162 | /* |
163 | * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 |
164 | * characters are enough. Define to 20 to keep consistent with backend. |
165 | */ |
166 | #define RINGREF_NAME_LEN (20) |
167 | /* |
168 | * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. |
169 | */ |
170 | #define QUEUE_NAME_LEN (17) |
171 | |
172 | /* |
173 | * Per-ring info. |
174 | * Every blkfront device can associate with one or more blkfront_ring_info, |
175 | * depending on how many hardware queues/rings to be used. |
176 | */ |
177 | struct blkfront_ring_info { |
178 | /* Lock to protect data in every ring buffer. */ |
179 | spinlock_t ring_lock; |
180 | struct blkif_front_ring ring; |
181 | unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; |
182 | unsigned int evtchn, irq; |
183 | struct work_struct work; |
184 | struct gnttab_free_callback callback; |
185 | struct list_head indirect_pages; |
186 | struct list_head grants; |
187 | unsigned int persistent_gnts_c; |
188 | unsigned long shadow_free; |
189 | struct blkfront_info *dev_info; |
190 | struct blk_shadow shadow[]; |
191 | }; |
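
/*
 * The 'grants' list caches struct grant entries: entries whose gref is
 * still valid are persistently granted to the backend and counted in
 * persistent_gnts_c; entries with gref == INVALID_GRANT_REF are free and
 * get a fresh grant reference when reused. 'indirect_pages' holds spare
 * pages for indirect segment frames when bounce buffering is not in use.
 */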
192 | |
193 | /* |
194 | * We have one of these per vbd, whether ide, scsi or 'other'. They |
195 | * hang in private_data off the gendisk structure. We may end up |
196 | * putting all kinds of interesting stuff here :-) |
197 | */ |
198 | struct blkfront_info |
199 | { |
200 | struct mutex mutex; |
201 | struct xenbus_device *xbdev; |
202 | struct gendisk *gd; |
203 | u16 sector_size; |
204 | unsigned int physical_sector_size; |
205 | unsigned long vdisk_info; |
206 | int vdevice; |
207 | blkif_vdev_t handle; |
208 | enum blkif_state connected; |
209 | /* Number of pages per ring buffer. */ |
210 | unsigned int nr_ring_pages; |
211 | struct request_queue *rq; |
212 | unsigned int feature_flush:1; |
213 | unsigned int feature_fua:1; |
214 | unsigned int feature_discard:1; |
215 | unsigned int feature_secdiscard:1; |
216 | /* Connect-time cached feature_persistent parameter */ |
217 | unsigned int feature_persistent_parm:1; |
218 | /* Persistent grants feature negotiation result */ |
219 | unsigned int feature_persistent:1; |
220 | unsigned int bounce:1; |
221 | unsigned int discard_granularity; |
222 | unsigned int discard_alignment; |
223 | /* Number of 4KB segments handled */ |
224 | unsigned int max_indirect_segments; |
225 | int is_ready; |
226 | struct blk_mq_tag_set tag_set; |
227 | struct blkfront_ring_info *rinfo; |
228 | unsigned int nr_rings; |
229 | unsigned int rinfo_size; |
	/* Save incomplete reqs and bios for migration. */
231 | struct list_head requests; |
232 | struct bio_list bio_list; |
233 | struct list_head info_list; |
234 | }; |
235 | |
236 | static unsigned int nr_minors; |
237 | static unsigned long *minors; |
238 | static DEFINE_SPINLOCK(minor_lock); |
239 | |
240 | #define PARTS_PER_DISK 16 |
241 | #define PARTS_PER_EXT_DISK 256 |
242 | |
243 | #define BLKIF_MAJOR(dev) ((dev)>>8) |
244 | #define BLKIF_MINOR(dev) ((dev) & 0xff) |
245 | |
246 | #define EXT_SHIFT 28 |
247 | #define EXTENDED (1<<EXT_SHIFT) |
248 | #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) |
249 | #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) |
250 | #define EMULATED_HD_DISK_MINOR_OFFSET (0) |
251 | #define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256) |
252 | #define EMULATED_SD_DISK_MINOR_OFFSET (0) |
253 | #define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256) |
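
/*
 * A vdevice number either uses the legacy Xen/Linux encoding (8-bit major
 * in the high byte, 8-bit minor in the low byte) or, when bit EXT_SHIFT is
 * set, the "extended" encoding where the remaining low bits are the minor
 * of an xvd disk with 256 minors (partitions) per disk.
 */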
254 | |
255 | #define DEV_NAME "xvd" /* name in /dev */ |
256 | |
257 | /* |
 * Grants are always the same size as a Xen page (i.e. 4KB).
259 | * A physical segment is always the same size as a Linux page. |
260 | * Number of grants per physical segment |
261 | */ |
262 | #define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE) |
263 | |
264 | #define GRANTS_PER_INDIRECT_FRAME \ |
265 | (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) |
266 | |
267 | #define INDIRECT_GREFS(_grants) \ |
268 | DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) |
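
/*
 * For example, with 4KB Linux pages GRANTS_PER_PSEG is 1 and with 64KB
 * pages it is 16. A 4KB indirect frame holds XEN_PAGE_SIZE / 8 = 512
 * segment descriptors (assuming the usual 8-byte blkif_request_segment),
 * so INDIRECT_GREFS(512) == 1 and INDIRECT_GREFS(513) == 2.
 */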
269 | |
270 | static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); |
271 | static void blkfront_gather_backend_features(struct blkfront_info *info); |
272 | static int negotiate_mq(struct blkfront_info *info); |
273 | |
274 | #define for_each_rinfo(info, ptr, idx) \ |
275 | for ((ptr) = (info)->rinfo, (idx) = 0; \ |
276 | (idx) < (info)->nr_rings; \ |
277 | (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size) |
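
/*
 * struct blkfront_ring_info ends in a flexible array of blk_shadow
 * entries, so its actual size (info->rinfo_size) is only known once the
 * ring size is negotiated. for_each_rinfo() and get_rinfo() therefore
 * walk the rinfo array using byte offsets rather than pointer indexing.
 */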
278 | |
279 | static inline struct blkfront_ring_info * |
280 | get_rinfo(const struct blkfront_info *info, unsigned int i) |
281 | { |
282 | BUG_ON(i >= info->nr_rings); |
283 | return (void *)info->rinfo + i * info->rinfo_size; |
284 | } |
285 | |
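/*
 * Free shadow entries form a singly linked list threaded through
 * req.u.rw.id: shadow_free holds the index of the first free entry and
 * each free entry's id field holds the index of the next one. An entry
 * is in use while its request pointer is non-NULL.
 */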
286 | static int get_id_from_freelist(struct blkfront_ring_info *rinfo) |
287 | { |
288 | unsigned long free = rinfo->shadow_free; |
289 | |
290 | BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); |
291 | rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; |
292 | rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ |
293 | return free; |
294 | } |
295 | |
296 | static int add_id_to_freelist(struct blkfront_ring_info *rinfo, |
297 | unsigned long id) |
298 | { |
299 | if (rinfo->shadow[id].req.u.rw.id != id) |
300 | return -EINVAL; |
301 | if (rinfo->shadow[id].request == NULL) |
302 | return -EINVAL; |
303 | rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; |
304 | rinfo->shadow[id].request = NULL; |
305 | rinfo->shadow_free = id; |
306 | return 0; |
307 | } |
308 | |
309 | static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) |
310 | { |
311 | struct blkfront_info *info = rinfo->dev_info; |
312 | struct page *granted_page; |
313 | struct grant *gnt_list_entry, *n; |
314 | int i = 0; |
315 | |
316 | while (i < num) { |
317 | gnt_list_entry = kzalloc(size: sizeof(struct grant), GFP_NOIO); |
318 | if (!gnt_list_entry) |
319 | goto out_of_memory; |
320 | |
321 | if (info->bounce) { |
322 | granted_page = alloc_page(GFP_NOIO | __GFP_ZERO); |
323 | if (!granted_page) { |
324 | kfree(objp: gnt_list_entry); |
325 | goto out_of_memory; |
326 | } |
327 | gnt_list_entry->page = granted_page; |
328 | } |
329 | |
330 | gnt_list_entry->gref = INVALID_GRANT_REF; |
331 | list_add(new: &gnt_list_entry->node, head: &rinfo->grants); |
332 | i++; |
333 | } |
334 | |
335 | return 0; |
336 | |
337 | out_of_memory: |
338 | list_for_each_entry_safe(gnt_list_entry, n, |
339 | &rinfo->grants, node) { |
340 | list_del(entry: &gnt_list_entry->node); |
341 | if (info->bounce) |
342 | __free_page(gnt_list_entry->page); |
343 | kfree(objp: gnt_list_entry); |
344 | i--; |
345 | } |
346 | BUG_ON(i != 0); |
347 | return -ENOMEM; |
348 | } |
349 | |
350 | static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) |
351 | { |
352 | struct grant *gnt_list_entry; |
353 | |
354 | BUG_ON(list_empty(&rinfo->grants)); |
355 | gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, |
356 | node); |
357 | list_del(entry: &gnt_list_entry->node); |
358 | |
359 | if (gnt_list_entry->gref != INVALID_GRANT_REF) |
360 | rinfo->persistent_gnts_c--; |
361 | |
362 | return gnt_list_entry; |
363 | } |
364 | |
365 | static inline void grant_foreign_access(const struct grant *gnt_list_entry, |
366 | const struct blkfront_info *info) |
367 | { |
368 | gnttab_page_grant_foreign_access_ref_one(ref: gnt_list_entry->gref, |
369 | domid: info->xbdev->otherend_id, |
370 | page: gnt_list_entry->page, |
371 | readonly: 0); |
372 | } |
373 | |
374 | static struct grant *get_grant(grant_ref_t *gref_head, |
375 | unsigned long gfn, |
376 | struct blkfront_ring_info *rinfo) |
377 | { |
378 | struct grant *gnt_list_entry = get_free_grant(rinfo); |
379 | struct blkfront_info *info = rinfo->dev_info; |
380 | |
381 | if (gnt_list_entry->gref != INVALID_GRANT_REF) |
382 | return gnt_list_entry; |
383 | |
384 | /* Assign a gref to this page */ |
385 | gnt_list_entry->gref = gnttab_claim_grant_reference(pprivate_head: gref_head); |
386 | BUG_ON(gnt_list_entry->gref == -ENOSPC); |
387 | if (info->bounce) |
388 | grant_foreign_access(gnt_list_entry, info); |
389 | else { |
390 | /* Grant access to the GFN passed by the caller */ |
391 | gnttab_grant_foreign_access_ref(ref: gnt_list_entry->gref, |
392 | domid: info->xbdev->otherend_id, |
393 | frame: gfn, readonly: 0); |
394 | } |
395 | |
396 | return gnt_list_entry; |
397 | } |
398 | |
399 | static struct grant *get_indirect_grant(grant_ref_t *gref_head, |
400 | struct blkfront_ring_info *rinfo) |
401 | { |
402 | struct grant *gnt_list_entry = get_free_grant(rinfo); |
403 | struct blkfront_info *info = rinfo->dev_info; |
404 | |
405 | if (gnt_list_entry->gref != INVALID_GRANT_REF) |
406 | return gnt_list_entry; |
407 | |
408 | /* Assign a gref to this page */ |
409 | gnt_list_entry->gref = gnttab_claim_grant_reference(pprivate_head: gref_head); |
410 | BUG_ON(gnt_list_entry->gref == -ENOSPC); |
411 | if (!info->bounce) { |
412 | struct page *indirect_page; |
413 | |
414 | /* Fetch a pre-allocated page to use for indirect grefs */ |
415 | BUG_ON(list_empty(&rinfo->indirect_pages)); |
416 | indirect_page = list_first_entry(&rinfo->indirect_pages, |
417 | struct page, lru); |
418 | list_del(entry: &indirect_page->lru); |
419 | gnt_list_entry->page = indirect_page; |
420 | } |
421 | grant_foreign_access(gnt_list_entry, info); |
422 | |
423 | return gnt_list_entry; |
424 | } |
425 | |
426 | static const char *op_name(int op) |
427 | { |
428 | static const char *const names[] = { |
429 | [BLKIF_OP_READ] = "read" , |
430 | [BLKIF_OP_WRITE] = "write" , |
431 | [BLKIF_OP_WRITE_BARRIER] = "barrier" , |
432 | [BLKIF_OP_FLUSH_DISKCACHE] = "flush" , |
433 | [BLKIF_OP_DISCARD] = "discard" }; |
434 | |
435 | if (op < 0 || op >= ARRAY_SIZE(names)) |
436 | return "unknown" ; |
437 | |
438 | if (!names[op]) |
439 | return "reserved" ; |
440 | |
441 | return names[op]; |
442 | } |
443 | static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) |
444 | { |
445 | unsigned int end = minor + nr; |
446 | int rc; |
447 | |
448 | if (end > nr_minors) { |
449 | unsigned long *bitmap, *old; |
450 | |
451 | bitmap = kcalloc(BITS_TO_LONGS(end), size: sizeof(*bitmap), |
452 | GFP_KERNEL); |
453 | if (bitmap == NULL) |
454 | return -ENOMEM; |
455 | |
456 | spin_lock(lock: &minor_lock); |
457 | if (end > nr_minors) { |
458 | old = minors; |
459 | memcpy(bitmap, minors, |
460 | BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); |
461 | minors = bitmap; |
462 | nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; |
463 | } else |
464 | old = bitmap; |
465 | spin_unlock(lock: &minor_lock); |
466 | kfree(objp: old); |
467 | } |
468 | |
469 | spin_lock(lock: &minor_lock); |
470 | if (find_next_bit(addr: minors, size: end, offset: minor) >= end) { |
471 | bitmap_set(map: minors, start: minor, nbits: nr); |
472 | rc = 0; |
473 | } else |
474 | rc = -EBUSY; |
475 | spin_unlock(lock: &minor_lock); |
476 | |
477 | return rc; |
478 | } |
479 | |
480 | static void xlbd_release_minors(unsigned int minor, unsigned int nr) |
481 | { |
482 | unsigned int end = minor + nr; |
483 | |
484 | BUG_ON(end > nr_minors); |
485 | spin_lock(lock: &minor_lock); |
486 | bitmap_clear(map: minors, start: minor, nbits: nr); |
487 | spin_unlock(lock: &minor_lock); |
488 | } |
489 | |
490 | static void blkif_restart_queue_callback(void *arg) |
491 | { |
492 | struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; |
493 | schedule_work(work: &rinfo->work); |
494 | } |
495 | |
496 | static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) |
497 | { |
498 | /* We don't have real geometry info, but let's at least return |
499 | values consistent with the size of the device */ |
500 | sector_t nsect = get_capacity(disk: bd->bd_disk); |
501 | sector_t cylinders = nsect; |
502 | |
503 | hg->heads = 0xff; |
504 | hg->sectors = 0x3f; |
505 | sector_div(cylinders, hg->heads * hg->sectors); |
506 | hg->cylinders = cylinders; |
507 | if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) |
508 | hg->cylinders = 0xffff; |
509 | return 0; |
510 | } |
511 | |
512 | static int blkif_ioctl(struct block_device *bdev, blk_mode_t mode, |
513 | unsigned command, unsigned long argument) |
514 | { |
515 | struct blkfront_info *info = bdev->bd_disk->private_data; |
516 | int i; |
517 | |
518 | switch (command) { |
519 | case CDROMMULTISESSION: |
520 | for (i = 0; i < sizeof(struct cdrom_multisession); i++) |
521 | if (put_user(0, (char __user *)(argument + i))) |
522 | return -EFAULT; |
523 | return 0; |
524 | case CDROM_GET_CAPABILITY: |
525 | if (!(info->vdisk_info & VDISK_CDROM)) |
526 | return -EINVAL; |
527 | return 0; |
528 | default: |
529 | return -EINVAL; |
530 | } |
531 | } |
532 | |
533 | static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, |
534 | struct request *req, |
535 | struct blkif_request **ring_req) |
536 | { |
537 | unsigned long id; |
538 | |
539 | *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); |
540 | rinfo->ring.req_prod_pvt++; |
541 | |
542 | id = get_id_from_freelist(rinfo); |
543 | rinfo->shadow[id].request = req; |
544 | rinfo->shadow[id].status = REQ_PROCESSING; |
545 | rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; |
546 | |
547 | rinfo->shadow[id].req.u.rw.id = id; |
548 | |
549 | return id; |
550 | } |
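
/*
 * Requests are first assembled in the shadow copy (rinfo->shadow[id].req)
 * and only copied into the shared ring slot returned here once complete,
 * so the backend never sees a partially written request. The shadow
 * status is switched to REQ_WAITING at that final copy.
 */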
551 | |
552 | static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) |
553 | { |
554 | struct blkfront_info *info = rinfo->dev_info; |
555 | struct blkif_request *ring_req, *final_ring_req; |
556 | unsigned long id; |
557 | |
558 | /* Fill out a communications ring structure. */ |
559 | id = blkif_ring_get_request(rinfo, req, ring_req: &final_ring_req); |
560 | ring_req = &rinfo->shadow[id].req; |
561 | |
562 | ring_req->operation = BLKIF_OP_DISCARD; |
563 | ring_req->u.discard.nr_sectors = blk_rq_sectors(rq: req); |
564 | ring_req->u.discard.id = id; |
565 | ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(rq: req); |
566 | if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard) |
567 | ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; |
568 | else |
569 | ring_req->u.discard.flag = 0; |
570 | |
571 | /* Copy the request to the ring page. */ |
572 | *final_ring_req = *ring_req; |
573 | rinfo->shadow[id].status = REQ_WAITING; |
574 | |
575 | return 0; |
576 | } |
577 | |
578 | struct setup_rw_req { |
579 | unsigned int grant_idx; |
580 | struct blkif_request_segment *segments; |
581 | struct blkfront_ring_info *rinfo; |
582 | struct blkif_request *ring_req; |
583 | grant_ref_t gref_head; |
584 | unsigned int id; |
585 | /* Only used when persistent grant is used and it's a write request */ |
586 | bool need_copy; |
587 | unsigned int bvec_off; |
588 | char *bvec_data; |
589 | |
	bool require_extra_req;
	struct blkif_request *extra_ring_req;
592 | }; |
593 | |
594 | static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, |
595 | unsigned int len, void *data) |
596 | { |
597 | struct setup_rw_req *setup = data; |
598 | int n, ref; |
599 | struct grant *gnt_list_entry; |
600 | unsigned int fsect, lsect; |
601 | /* Convenient aliases */ |
602 | unsigned int grant_idx = setup->grant_idx; |
603 | struct blkif_request *ring_req = setup->ring_req; |
604 | struct blkfront_ring_info *rinfo = setup->rinfo; |
605 | /* |
	 * We always use the shadow of the first request to store the list
	 * of grants associated with the block I/O request. This makes the
	 * completion easier to handle even if the block I/O request is
	 * split.
610 | */ |
611 | struct blk_shadow *shadow = &rinfo->shadow[setup->id]; |
612 | |
613 | if (unlikely(setup->require_extra_req && |
614 | grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { |
615 | /* |
616 | * We are using the second request, setup grant_idx |
617 | * to be the index of the segment array. |
618 | */ |
619 | grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; |
620 | ring_req = setup->extra_ring_req; |
621 | } |
622 | |
623 | if ((ring_req->operation == BLKIF_OP_INDIRECT) && |
624 | (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { |
625 | if (setup->segments) |
626 | kunmap_atomic(setup->segments); |
627 | |
628 | n = grant_idx / GRANTS_PER_INDIRECT_FRAME; |
629 | gnt_list_entry = get_indirect_grant(gref_head: &setup->gref_head, rinfo); |
630 | shadow->indirect_grants[n] = gnt_list_entry; |
631 | setup->segments = kmap_atomic(page: gnt_list_entry->page); |
632 | ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; |
633 | } |
634 | |
635 | gnt_list_entry = get_grant(gref_head: &setup->gref_head, gfn, rinfo); |
636 | ref = gnt_list_entry->gref; |
637 | /* |
638 | * All the grants are stored in the shadow of the first |
639 | * request. Therefore we have to use the global index. |
640 | */ |
641 | shadow->grants_used[setup->grant_idx] = gnt_list_entry; |
642 | |
643 | if (setup->need_copy) { |
644 | void *shared_data; |
645 | |
646 | shared_data = kmap_atomic(page: gnt_list_entry->page); |
647 | /* |
648 | * this does not wipe data stored outside the |
649 | * range sg->offset..sg->offset+sg->length. |
650 | * Therefore, blkback *could* see data from |
651 | * previous requests. This is OK as long as |
652 | * persistent grants are shared with just one |
653 | * domain. It may need refactoring if this |
654 | * changes |
655 | */ |
656 | memcpy(shared_data + offset, |
657 | setup->bvec_data + setup->bvec_off, |
658 | len); |
659 | |
660 | kunmap_atomic(shared_data); |
661 | setup->bvec_off += len; |
662 | } |
663 | |
664 | fsect = offset >> 9; |
665 | lsect = fsect + (len >> 9) - 1; |
666 | if (ring_req->operation != BLKIF_OP_INDIRECT) { |
667 | ring_req->u.rw.seg[grant_idx] = |
668 | (struct blkif_request_segment) { |
669 | .gref = ref, |
670 | .first_sect = fsect, |
671 | .last_sect = lsect }; |
672 | } else { |
673 | setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] = |
674 | (struct blkif_request_segment) { |
675 | .gref = ref, |
676 | .first_sect = fsect, |
677 | .last_sect = lsect }; |
678 | } |
679 | |
680 | (setup->grant_idx)++; |
681 | } |
682 | |
static void blkif_setup_extra_req(struct blkif_request *first,
				  struct blkif_request *second)
685 | { |
686 | uint16_t nr_segments = first->u.rw.nr_segments; |
687 | |
688 | /* |
689 | * The second request is only present when the first request uses |
	 * all its segments. It is always a continuation of the first one.
691 | */ |
692 | first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; |
693 | |
694 | second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; |
695 | second->u.rw.sector_number = first->u.rw.sector_number + |
696 | (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; |
697 | |
698 | second->u.rw.handle = first->u.rw.handle; |
699 | second->operation = first->operation; |
700 | } |
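
/*
 * The first request keeps BLKIF_MAX_SEGMENTS_PER_REQUEST segments and the
 * second carries the remainder; the second request's starting sector is
 * advanced past the data covered by the first, i.e. by
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE / 512 sectors.
 */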
701 | |
702 | static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) |
703 | { |
704 | struct blkfront_info *info = rinfo->dev_info; |
	struct blkif_request *ring_req, *extra_ring_req = NULL;
	struct blkif_request *final_ring_req, *final_extra_ring_req = NULL;
	unsigned long id, extra_id = NO_ASSOCIATED_ID;
	bool require_extra_req = false;
709 | int i; |
710 | struct setup_rw_req setup = { |
711 | .grant_idx = 0, |
712 | .segments = NULL, |
713 | .rinfo = rinfo, |
714 | .need_copy = rq_data_dir(req) && info->bounce, |
715 | }; |
716 | |
717 | /* |
718 | * Used to store if we are able to queue the request by just using |
719 | * existing persistent grants, or if we have to get new grants, |
720 | * as there are not sufficiently many free. |
721 | */ |
722 | bool new_persistent_gnts = false; |
723 | struct scatterlist *sg; |
724 | int num_sg, max_grefs, num_grant; |
725 | |
726 | max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; |
727 | if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) |
728 | /* |
729 | * If we are using indirect segments we need to account |
730 | * for the indirect grefs used in the request. |
731 | */ |
732 | max_grefs += INDIRECT_GREFS(max_grefs); |
733 | |
	/* Check if we have enough persistent grants to allocate a request */
735 | if (rinfo->persistent_gnts_c < max_grefs) { |
736 | new_persistent_gnts = true; |
737 | |
738 | if (gnttab_alloc_grant_references( |
739 | count: max_grefs - rinfo->persistent_gnts_c, |
740 | pprivate_head: &setup.gref_head) < 0) { |
741 | gnttab_request_free_callback( |
742 | callback: &rinfo->callback, |
743 | fn: blkif_restart_queue_callback, |
744 | arg: rinfo, |
745 | count: max_grefs - rinfo->persistent_gnts_c); |
746 | return 1; |
747 | } |
748 | } |
749 | |
750 | /* Fill out a communications ring structure. */ |
751 | id = blkif_ring_get_request(rinfo, req, ring_req: &final_ring_req); |
752 | ring_req = &rinfo->shadow[id].req; |
753 | |
754 | num_sg = blk_rq_map_sg(q: req->q, rq: req, sglist: rinfo->shadow[id].sg); |
755 | num_grant = 0; |
	/* Calculate the number of grants used */
757 | for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) |
758 | num_grant += gnttab_count_grant(start: sg->offset, len: sg->length); |
759 | |
760 | require_extra_req = info->max_indirect_segments == 0 && |
761 | num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; |
762 | BUG_ON(!HAS_EXTRA_REQ && require_extra_req); |
763 | |
764 | rinfo->shadow[id].num_sg = num_sg; |
765 | if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && |
766 | likely(!require_extra_req)) { |
767 | /* |
768 | * The indirect operation can only be a BLKIF_OP_READ or |
769 | * BLKIF_OP_WRITE |
770 | */ |
771 | BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA); |
772 | ring_req->operation = BLKIF_OP_INDIRECT; |
773 | ring_req->u.indirect.indirect_op = rq_data_dir(req) ? |
774 | BLKIF_OP_WRITE : BLKIF_OP_READ; |
775 | ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(rq: req); |
776 | ring_req->u.indirect.handle = info->handle; |
777 | ring_req->u.indirect.nr_segments = num_grant; |
778 | } else { |
779 | ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(rq: req); |
780 | ring_req->u.rw.handle = info->handle; |
781 | ring_req->operation = rq_data_dir(req) ? |
782 | BLKIF_OP_WRITE : BLKIF_OP_READ; |
783 | if (req_op(req) == REQ_OP_FLUSH || |
784 | (req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) { |
785 | /* |
			 * Ideally we can do an unordered flush-to-disk.
			 * In case the backend only supports barriers, use that.
			 * A barrier request is a superset of FUA, so we can
			 * implement it the same way. (It's also a FLUSH+FUA,
			 * since it is guaranteed ordered WRT previous writes.)
791 | */ |
792 | if (info->feature_flush && info->feature_fua) |
793 | ring_req->operation = |
794 | BLKIF_OP_WRITE_BARRIER; |
795 | else if (info->feature_flush) |
796 | ring_req->operation = |
797 | BLKIF_OP_FLUSH_DISKCACHE; |
798 | else |
799 | ring_req->operation = 0; |
800 | } |
801 | ring_req->u.rw.nr_segments = num_grant; |
802 | if (unlikely(require_extra_req)) { |
803 | extra_id = blkif_ring_get_request(rinfo, req, |
804 | ring_req: &final_extra_ring_req); |
805 | extra_ring_req = &rinfo->shadow[extra_id].req; |
806 | |
807 | /* |
808 | * Only the first request contains the scatter-gather |
809 | * list. |
810 | */ |
811 | rinfo->shadow[extra_id].num_sg = 0; |
812 | |
813 | blkif_setup_extra_req(first: ring_req, second: extra_ring_req); |
814 | |
815 | /* Link the 2 requests together */ |
816 | rinfo->shadow[extra_id].associated_id = id; |
817 | rinfo->shadow[id].associated_id = extra_id; |
818 | } |
819 | } |
820 | |
821 | setup.ring_req = ring_req; |
822 | setup.id = id; |
823 | |
824 | setup.require_extra_req = require_extra_req; |
825 | if (unlikely(require_extra_req)) |
826 | setup.extra_ring_req = extra_ring_req; |
827 | |
828 | for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { |
829 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
830 | |
831 | if (setup.need_copy) { |
832 | setup.bvec_off = sg->offset; |
833 | setup.bvec_data = kmap_atomic(page: sg_page(sg)); |
834 | } |
835 | |
836 | gnttab_foreach_grant_in_range(page: sg_page(sg), |
837 | offset: sg->offset, |
838 | len: sg->length, |
839 | fn: blkif_setup_rw_req_grant, |
840 | data: &setup); |
841 | |
842 | if (setup.need_copy) |
843 | kunmap_atomic(setup.bvec_data); |
844 | } |
845 | if (setup.segments) |
846 | kunmap_atomic(setup.segments); |
847 | |
848 | /* Copy request(s) to the ring page. */ |
849 | *final_ring_req = *ring_req; |
850 | rinfo->shadow[id].status = REQ_WAITING; |
851 | if (unlikely(require_extra_req)) { |
852 | *final_extra_ring_req = *extra_ring_req; |
853 | rinfo->shadow[extra_id].status = REQ_WAITING; |
854 | } |
855 | |
856 | if (new_persistent_gnts) |
857 | gnttab_free_grant_references(head: setup.gref_head); |
858 | |
859 | return 0; |
860 | } |
861 | |
862 | /* |
863 | * Generate a Xen blkfront IO request from a blk layer request. Reads |
864 | * and writes are handled as expected. |
865 | * |
866 | * @req: a request struct |
867 | */ |
868 | static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) |
869 | { |
870 | if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) |
871 | return 1; |
872 | |
873 | if (unlikely(req_op(req) == REQ_OP_DISCARD || |
874 | req_op(req) == REQ_OP_SECURE_ERASE)) |
875 | return blkif_queue_discard_req(req, rinfo); |
876 | else |
877 | return blkif_queue_rw_req(req, rinfo); |
878 | } |
879 | |
880 | static inline void flush_requests(struct blkfront_ring_info *rinfo) |
881 | { |
882 | int notify; |
883 | |
884 | RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); |
885 | |
886 | if (notify) |
887 | notify_remote_via_irq(irq: rinfo->irq); |
888 | } |
889 | |
890 | static inline bool blkif_request_flush_invalid(struct request *req, |
891 | struct blkfront_info *info) |
892 | { |
893 | return (blk_rq_is_passthrough(rq: req) || |
894 | ((req_op(req) == REQ_OP_FLUSH) && |
895 | !info->feature_flush) || |
896 | ((req->cmd_flags & REQ_FUA) && |
897 | !info->feature_fua)); |
898 | } |
899 | |
900 | static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx, |
901 | const struct blk_mq_queue_data *qd) |
902 | { |
903 | unsigned long flags; |
904 | int qid = hctx->queue_num; |
905 | struct blkfront_info *info = hctx->queue->queuedata; |
906 | struct blkfront_ring_info *rinfo = NULL; |
907 | |
908 | rinfo = get_rinfo(info, i: qid); |
909 | blk_mq_start_request(rq: qd->rq); |
910 | spin_lock_irqsave(&rinfo->ring_lock, flags); |
911 | if (RING_FULL(&rinfo->ring)) |
912 | goto out_busy; |
913 | |
914 | if (blkif_request_flush_invalid(req: qd->rq, info: rinfo->dev_info)) |
915 | goto out_err; |
916 | |
917 | if (blkif_queue_request(req: qd->rq, rinfo)) |
918 | goto out_busy; |
919 | |
920 | flush_requests(rinfo); |
921 | spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags); |
922 | return BLK_STS_OK; |
923 | |
924 | out_err: |
925 | spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags); |
926 | return BLK_STS_IOERR; |
927 | |
928 | out_busy: |
929 | blk_mq_stop_hw_queue(hctx); |
930 | spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags); |
931 | return BLK_STS_DEV_RESOURCE; |
932 | } |
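
/*
 * Returning BLK_STS_DEV_RESOURCE after stopping the hardware queue makes
 * blk-mq requeue the request without rerunning the queue; it is restarted
 * from kick_pending_request_queues() once ring slots (or grants, via the
 * gnttab free callback) become available again.
 */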
933 | |
934 | static void blkif_complete_rq(struct request *rq) |
935 | { |
936 | blk_mq_end_request(rq, error: blkif_req(rq)->error); |
937 | } |
938 | |
939 | static const struct blk_mq_ops blkfront_mq_ops = { |
940 | .queue_rq = blkif_queue_rq, |
941 | .complete = blkif_complete_rq, |
942 | }; |
943 | |
944 | static void blkif_set_queue_limits(struct blkfront_info *info) |
945 | { |
946 | struct request_queue *rq = info->rq; |
947 | struct gendisk *gd = info->gd; |
948 | unsigned int segments = info->max_indirect_segments ? : |
949 | BLKIF_MAX_SEGMENTS_PER_REQUEST; |
950 | |
951 | blk_queue_flag_set(QUEUE_FLAG_VIRT, q: rq); |
952 | |
953 | if (info->feature_discard) { |
954 | blk_queue_max_discard_sectors(q: rq, max_discard_sectors: get_capacity(disk: gd)); |
955 | rq->limits.discard_granularity = info->discard_granularity ?: |
956 | info->physical_sector_size; |
957 | rq->limits.discard_alignment = info->discard_alignment; |
958 | if (info->feature_secdiscard) |
959 | blk_queue_max_secure_erase_sectors(q: rq, |
960 | max_sectors: get_capacity(disk: gd)); |
961 | } |
962 | |
963 | /* Hard sector size and max sectors impersonate the equiv. hardware. */ |
964 | blk_queue_logical_block_size(rq, info->sector_size); |
965 | blk_queue_physical_block_size(rq, info->physical_sector_size); |
966 | blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); |
967 | |
968 | /* Each segment in a request is up to an aligned page in size. */ |
969 | blk_queue_segment_boundary(rq, PAGE_SIZE - 1); |
970 | blk_queue_max_segment_size(rq, PAGE_SIZE); |
971 | |
972 | /* Ensure a merged request will fit in a single I/O ring slot. */ |
973 | blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); |
974 | |
975 | /* Make sure buffer addresses are sector-aligned. */ |
976 | blk_queue_dma_alignment(rq, 511); |
977 | } |
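
/*
 * With the default max_indirect_segments of 32 and 4KB grants this caps a
 * request at 32 * XEN_PAGE_SIZE = 128KB (256 sectors); without indirect
 * descriptors the limit is BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE
 * = 44KB.
 */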
978 | |
979 | static const char *flush_info(struct blkfront_info *info) |
980 | { |
981 | if (info->feature_flush && info->feature_fua) |
982 | return "barrier: enabled;" ; |
983 | else if (info->feature_flush) |
984 | return "flush diskcache: enabled;" ; |
985 | else |
986 | return "barrier or flush: disabled;" ; |
987 | } |
988 | |
989 | static void xlvbd_flush(struct blkfront_info *info) |
990 | { |
991 | blk_queue_write_cache(q: info->rq, enabled: info->feature_flush ? true : false, |
992 | fua: info->feature_fua ? true : false); |
993 | pr_info("blkfront: %s: %s %s %s %s %s %s %s\n" , |
994 | info->gd->disk_name, flush_info(info), |
995 | "persistent grants:" , info->feature_persistent ? |
996 | "enabled;" : "disabled;" , "indirect descriptors:" , |
997 | info->max_indirect_segments ? "enabled;" : "disabled;" , |
998 | "bounce buffer:" , info->bounce ? "enabled" : "disabled;" ); |
999 | } |
1000 | |
1001 | static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) |
1002 | { |
1003 | int major; |
1004 | major = BLKIF_MAJOR(vdevice); |
1005 | *minor = BLKIF_MINOR(vdevice); |
1006 | switch (major) { |
1007 | case XEN_IDE0_MAJOR: |
1008 | *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; |
1009 | *minor = ((*minor / 64) * PARTS_PER_DISK) + |
1010 | EMULATED_HD_DISK_MINOR_OFFSET; |
1011 | break; |
1012 | case XEN_IDE1_MAJOR: |
1013 | *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; |
1014 | *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + |
1015 | EMULATED_HD_DISK_MINOR_OFFSET; |
1016 | break; |
1017 | case XEN_SCSI_DISK0_MAJOR: |
1018 | *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; |
1019 | *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; |
1020 | break; |
1021 | case XEN_SCSI_DISK1_MAJOR: |
1022 | case XEN_SCSI_DISK2_MAJOR: |
1023 | case XEN_SCSI_DISK3_MAJOR: |
1024 | case XEN_SCSI_DISK4_MAJOR: |
1025 | case XEN_SCSI_DISK5_MAJOR: |
1026 | case XEN_SCSI_DISK6_MAJOR: |
1027 | case XEN_SCSI_DISK7_MAJOR: |
1028 | *offset = (*minor / PARTS_PER_DISK) + |
1029 | ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + |
1030 | EMULATED_SD_DISK_NAME_OFFSET; |
1031 | *minor = *minor + |
1032 | ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + |
1033 | EMULATED_SD_DISK_MINOR_OFFSET; |
1034 | break; |
1035 | case XEN_SCSI_DISK8_MAJOR: |
1036 | case XEN_SCSI_DISK9_MAJOR: |
1037 | case XEN_SCSI_DISK10_MAJOR: |
1038 | case XEN_SCSI_DISK11_MAJOR: |
1039 | case XEN_SCSI_DISK12_MAJOR: |
1040 | case XEN_SCSI_DISK13_MAJOR: |
1041 | case XEN_SCSI_DISK14_MAJOR: |
1042 | case XEN_SCSI_DISK15_MAJOR: |
1043 | *offset = (*minor / PARTS_PER_DISK) + |
1044 | ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + |
1045 | EMULATED_SD_DISK_NAME_OFFSET; |
1046 | *minor = *minor + |
1047 | ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + |
1048 | EMULATED_SD_DISK_MINOR_OFFSET; |
1049 | break; |
1050 | case XENVBD_MAJOR: |
1051 | *offset = *minor / PARTS_PER_DISK; |
1052 | break; |
1053 | default: |
1054 | printk(KERN_WARNING "blkfront: your disk configuration is " |
1055 | "incorrect, please use an xvd device instead\n" ); |
1056 | return -ENODEV; |
1057 | } |
1058 | return 0; |
1059 | } |
1060 | |
1061 | static char *encode_disk_name(char *ptr, unsigned int n) |
1062 | { |
1063 | if (n >= 26) |
1064 | ptr = encode_disk_name(ptr, n: n / 26 - 1); |
1065 | *ptr = 'a' + n % 26; |
1066 | return ptr + 1; |
1067 | } |
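
/*
 * Disk names follow the usual alphabetic scheme: offset 0 maps to "xvda",
 * 25 to "xvdz", 26 to "xvdaa" and so on, analogous to sd device naming.
 */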
1068 | |
1069 | static int xlvbd_alloc_gendisk(blkif_sector_t capacity, |
1070 | struct blkfront_info *info, u16 sector_size, |
1071 | unsigned int physical_sector_size) |
1072 | { |
1073 | struct gendisk *gd; |
1074 | int nr_minors = 1; |
1075 | int err; |
1076 | unsigned int offset; |
1077 | int minor; |
1078 | int nr_parts; |
1079 | char *ptr; |
1080 | |
1081 | BUG_ON(info->gd != NULL); |
1082 | BUG_ON(info->rq != NULL); |
1083 | |
1084 | if ((info->vdevice>>EXT_SHIFT) > 1) { |
1085 | /* this is above the extended range; something is wrong */ |
1086 | printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n" , info->vdevice); |
1087 | return -ENODEV; |
1088 | } |
1089 | |
1090 | if (!VDEV_IS_EXTENDED(info->vdevice)) { |
1091 | err = xen_translate_vdev(vdevice: info->vdevice, minor: &minor, offset: &offset); |
1092 | if (err) |
1093 | return err; |
1094 | nr_parts = PARTS_PER_DISK; |
1095 | } else { |
1096 | minor = BLKIF_MINOR_EXT(info->vdevice); |
1097 | nr_parts = PARTS_PER_EXT_DISK; |
1098 | offset = minor / nr_parts; |
1099 | if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4) |
			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
					"emulated IDE disks,\n\t choose an xvd device name "
					"from xvde on\n", info->vdevice);
1103 | } |
1104 | if (minor >> MINORBITS) { |
1105 | pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n" , |
1106 | info->vdevice, minor); |
1107 | return -ENODEV; |
1108 | } |
1109 | |
1110 | if ((minor % nr_parts) == 0) |
1111 | nr_minors = nr_parts; |
1112 | |
1113 | err = xlbd_reserve_minors(minor, nr: nr_minors); |
1114 | if (err) |
1115 | return err; |
1116 | |
1117 | memset(&info->tag_set, 0, sizeof(info->tag_set)); |
1118 | info->tag_set.ops = &blkfront_mq_ops; |
1119 | info->tag_set.nr_hw_queues = info->nr_rings; |
1120 | if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { |
1121 | /* |
		 * When indirect descriptors are not supported, the I/O request
		 * will be split between multiple requests in the ring.
		 * To avoid problems when sending the request, halve the
		 * depth of the queue.
1126 | */ |
1127 | info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; |
1128 | } else |
1129 | info->tag_set.queue_depth = BLK_RING_SIZE(info); |
1130 | info->tag_set.numa_node = NUMA_NO_NODE; |
1131 | info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; |
1132 | info->tag_set.cmd_size = sizeof(struct blkif_req); |
1133 | info->tag_set.driver_data = info; |
1134 | |
1135 | err = blk_mq_alloc_tag_set(set: &info->tag_set); |
1136 | if (err) |
1137 | goto out_release_minors; |
1138 | |
1139 | gd = blk_mq_alloc_disk(&info->tag_set, info); |
1140 | if (IS_ERR(ptr: gd)) { |
1141 | err = PTR_ERR(ptr: gd); |
1142 | goto out_free_tag_set; |
1143 | } |
1144 | |
1145 | strcpy(p: gd->disk_name, DEV_NAME); |
1146 | ptr = encode_disk_name(ptr: gd->disk_name + sizeof(DEV_NAME) - 1, n: offset); |
1147 | BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); |
1148 | if (nr_minors > 1) |
1149 | *ptr = 0; |
1150 | else |
1151 | snprintf(buf: ptr, size: gd->disk_name + DISK_NAME_LEN - ptr, |
1152 | fmt: "%d" , minor & (nr_parts - 1)); |
1153 | |
1154 | gd->major = XENVBD_MAJOR; |
1155 | gd->first_minor = minor; |
1156 | gd->minors = nr_minors; |
1157 | gd->fops = &xlvbd_block_fops; |
1158 | gd->private_data = info; |
1159 | set_capacity(disk: gd, size: capacity); |
1160 | |
1161 | info->rq = gd->queue; |
1162 | info->gd = gd; |
1163 | info->sector_size = sector_size; |
1164 | info->physical_sector_size = physical_sector_size; |
1165 | blkif_set_queue_limits(info); |
1166 | |
1167 | xlvbd_flush(info); |
1168 | |
1169 | if (info->vdisk_info & VDISK_READONLY) |
1170 | set_disk_ro(disk: gd, read_only: 1); |
1171 | if (info->vdisk_info & VDISK_REMOVABLE) |
1172 | gd->flags |= GENHD_FL_REMOVABLE; |
1173 | |
1174 | return 0; |
1175 | |
1176 | out_free_tag_set: |
1177 | blk_mq_free_tag_set(set: &info->tag_set); |
1178 | out_release_minors: |
1179 | xlbd_release_minors(minor, nr: nr_minors); |
1180 | return err; |
1181 | } |
1182 | |
1183 | /* Already hold rinfo->ring_lock. */ |
1184 | static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) |
1185 | { |
1186 | if (!RING_FULL(&rinfo->ring)) |
1187 | blk_mq_start_stopped_hw_queues(q: rinfo->dev_info->rq, async: true); |
1188 | } |
1189 | |
1190 | static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) |
1191 | { |
1192 | unsigned long flags; |
1193 | |
1194 | spin_lock_irqsave(&rinfo->ring_lock, flags); |
1195 | kick_pending_request_queues_locked(rinfo); |
1196 | spin_unlock_irqrestore(lock: &rinfo->ring_lock, flags); |
1197 | } |
1198 | |
1199 | static void blkif_restart_queue(struct work_struct *work) |
1200 | { |
1201 | struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); |
1202 | |
1203 | if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) |
1204 | kick_pending_request_queues(rinfo); |
1205 | } |
1206 | |
1207 | static void blkif_free_ring(struct blkfront_ring_info *rinfo) |
1208 | { |
1209 | struct grant *persistent_gnt, *n; |
1210 | struct blkfront_info *info = rinfo->dev_info; |
1211 | int i, j, segs; |
1212 | |
1213 | /* |
	 * Remove indirect pages; this only happens when using indirect
	 * descriptors but not persistent grants.
1216 | */ |
1217 | if (!list_empty(head: &rinfo->indirect_pages)) { |
1218 | struct page *indirect_page, *n; |
1219 | |
1220 | BUG_ON(info->bounce); |
1221 | list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { |
1222 | list_del(entry: &indirect_page->lru); |
1223 | __free_page(indirect_page); |
1224 | } |
1225 | } |
1226 | |
1227 | /* Remove all persistent grants. */ |
1228 | if (!list_empty(head: &rinfo->grants)) { |
1229 | list_for_each_entry_safe(persistent_gnt, n, |
1230 | &rinfo->grants, node) { |
1231 | list_del(entry: &persistent_gnt->node); |
1232 | if (persistent_gnt->gref != INVALID_GRANT_REF) { |
1233 | gnttab_end_foreign_access(ref: persistent_gnt->gref, |
1234 | NULL); |
1235 | rinfo->persistent_gnts_c--; |
1236 | } |
1237 | if (info->bounce) |
1238 | __free_page(persistent_gnt->page); |
1239 | kfree(objp: persistent_gnt); |
1240 | } |
1241 | } |
1242 | BUG_ON(rinfo->persistent_gnts_c != 0); |
1243 | |
1244 | for (i = 0; i < BLK_RING_SIZE(info); i++) { |
1245 | /* |
1246 | * Clear persistent grants present in requests already |
1247 | * on the shared ring |
1248 | */ |
1249 | if (!rinfo->shadow[i].request) |
1250 | goto free_shadow; |
1251 | |
1252 | segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? |
1253 | rinfo->shadow[i].req.u.indirect.nr_segments : |
1254 | rinfo->shadow[i].req.u.rw.nr_segments; |
1255 | for (j = 0; j < segs; j++) { |
1256 | persistent_gnt = rinfo->shadow[i].grants_used[j]; |
1257 | gnttab_end_foreign_access(ref: persistent_gnt->gref, NULL); |
1258 | if (info->bounce) |
1259 | __free_page(persistent_gnt->page); |
1260 | kfree(objp: persistent_gnt); |
1261 | } |
1262 | |
1263 | if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) |
1264 | /* |
1265 | * If this is not an indirect operation don't try to |
1266 | * free indirect segments |
1267 | */ |
1268 | goto free_shadow; |
1269 | |
1270 | for (j = 0; j < INDIRECT_GREFS(segs); j++) { |
1271 | persistent_gnt = rinfo->shadow[i].indirect_grants[j]; |
1272 | gnttab_end_foreign_access(ref: persistent_gnt->gref, NULL); |
1273 | __free_page(persistent_gnt->page); |
1274 | kfree(objp: persistent_gnt); |
1275 | } |
1276 | |
1277 | free_shadow: |
1278 | kvfree(addr: rinfo->shadow[i].grants_used); |
1279 | rinfo->shadow[i].grants_used = NULL; |
1280 | kvfree(addr: rinfo->shadow[i].indirect_grants); |
1281 | rinfo->shadow[i].indirect_grants = NULL; |
1282 | kvfree(addr: rinfo->shadow[i].sg); |
1283 | rinfo->shadow[i].sg = NULL; |
1284 | } |
1285 | |
1286 | /* No more gnttab callback work. */ |
1287 | gnttab_cancel_free_callback(callback: &rinfo->callback); |
1288 | |
1289 | /* Flush gnttab callback work. Must be done with no locks held. */ |
1290 | flush_work(work: &rinfo->work); |
1291 | |
1292 | /* Free resources associated with old device channel. */ |
1293 | xenbus_teardown_ring(vaddr: (void **)&rinfo->ring.sring, nr_pages: info->nr_ring_pages, |
1294 | grefs: rinfo->ring_ref); |
1295 | |
1296 | if (rinfo->irq) |
1297 | unbind_from_irqhandler(irq: rinfo->irq, dev_id: rinfo); |
1298 | rinfo->evtchn = rinfo->irq = 0; |
1299 | } |
1300 | |
1301 | static void blkif_free(struct blkfront_info *info, int suspend) |
1302 | { |
1303 | unsigned int i; |
1304 | struct blkfront_ring_info *rinfo; |
1305 | |
1306 | /* Prevent new requests being issued until we fix things up. */ |
1307 | info->connected = suspend ? |
1308 | BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; |
1309 | /* No more blkif_request(). */ |
1310 | if (info->rq) |
1311 | blk_mq_stop_hw_queues(q: info->rq); |
1312 | |
1313 | for_each_rinfo(info, rinfo, i) |
1314 | blkif_free_ring(rinfo); |
1315 | |
1316 | kvfree(addr: info->rinfo); |
1317 | info->rinfo = NULL; |
1318 | info->nr_rings = 0; |
1319 | } |
1320 | |
1321 | struct copy_from_grant { |
1322 | const struct blk_shadow *s; |
1323 | unsigned int grant_idx; |
1324 | unsigned int bvec_offset; |
1325 | char *bvec_data; |
1326 | }; |
1327 | |
1328 | static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, |
1329 | unsigned int len, void *data) |
1330 | { |
1331 | struct copy_from_grant *info = data; |
1332 | char *shared_data; |
1333 | /* Convenient aliases */ |
1334 | const struct blk_shadow *s = info->s; |
1335 | |
1336 | shared_data = kmap_atomic(page: s->grants_used[info->grant_idx]->page); |
1337 | |
1338 | memcpy(info->bvec_data + info->bvec_offset, |
1339 | shared_data + offset, len); |
1340 | |
1341 | info->bvec_offset += len; |
1342 | info->grant_idx++; |
1343 | |
1344 | kunmap_atomic(shared_data); |
1345 | } |
1346 | |
1347 | static enum blk_req_status blkif_rsp_to_req_status(int rsp) |
1348 | { |
1349 | switch (rsp) |
1350 | { |
1351 | case BLKIF_RSP_OKAY: |
1352 | return REQ_DONE; |
1353 | case BLKIF_RSP_EOPNOTSUPP: |
1354 | return REQ_EOPNOTSUPP; |
1355 | case BLKIF_RSP_ERROR: |
1356 | default: |
1357 | return REQ_ERROR; |
1358 | } |
1359 | } |
1360 | |
1361 | /* |
 * Get the final status of the block request based on the two ring responses
1363 | */ |
1364 | static int blkif_get_final_status(enum blk_req_status s1, |
1365 | enum blk_req_status s2) |
1366 | { |
1367 | BUG_ON(s1 < REQ_DONE); |
1368 | BUG_ON(s2 < REQ_DONE); |
1369 | |
1370 | if (s1 == REQ_ERROR || s2 == REQ_ERROR) |
1371 | return BLKIF_RSP_ERROR; |
1372 | else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) |
1373 | return BLKIF_RSP_EOPNOTSUPP; |
1374 | return BLKIF_RSP_OKAY; |
1375 | } |
1376 | |
1377 | /* |
1378 | * Return values: |
1379 | * 1 response processed. |
1380 | * 0 missing further responses. |
1381 | * -1 error while processing. |
1382 | */ |
1383 | static int blkif_completion(unsigned long *id, |
1384 | struct blkfront_ring_info *rinfo, |
1385 | struct blkif_response *bret) |
1386 | { |
1387 | int i = 0; |
1388 | struct scatterlist *sg; |
1389 | int num_sg, num_grant; |
1390 | struct blkfront_info *info = rinfo->dev_info; |
1391 | struct blk_shadow *s = &rinfo->shadow[*id]; |
1392 | struct copy_from_grant data = { |
1393 | .grant_idx = 0, |
1394 | }; |
1395 | |
1396 | num_grant = s->req.operation == BLKIF_OP_INDIRECT ? |
1397 | s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; |
1398 | |
1399 | /* The I/O request may be split in two. */ |
1400 | if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { |
1401 | struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; |
1402 | |
1403 | /* Keep the status of the current response in shadow. */ |
1404 | s->status = blkif_rsp_to_req_status(rsp: bret->status); |
1405 | |
		/* Wait for the second response if it is not here yet. */
1407 | if (s2->status < REQ_DONE) |
1408 | return 0; |
1409 | |
1410 | bret->status = blkif_get_final_status(s1: s->status, |
1411 | s2: s2->status); |
1412 | |
1413 | /* |
		 * All the grants are stored in the first shadow in order
1415 | * to make the completion code simpler. |
1416 | */ |
1417 | num_grant += s2->req.u.rw.nr_segments; |
1418 | |
1419 | /* |
1420 | * The two responses may not come in order. Only the |
1421 | * first request will store the scatter-gather list. |
1422 | */ |
1423 | if (s2->num_sg != 0) { |
1424 | /* Update "id" with the ID of the first response. */ |
1425 | *id = s->associated_id; |
1426 | s = s2; |
1427 | } |
1428 | |
1429 | /* |
		 * We don't need the second request anymore, so recycle
		 * it now.
1432 | */ |
1433 | if (add_id_to_freelist(rinfo, id: s->associated_id)) |
1434 | WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n" , |
1435 | info->gd->disk_name, s->associated_id); |
1436 | } |
1437 | |
1438 | data.s = s; |
1439 | num_sg = s->num_sg; |
1440 | |
1441 | if (bret->operation == BLKIF_OP_READ && info->bounce) { |
1442 | for_each_sg(s->sg, sg, num_sg, i) { |
1443 | BUG_ON(sg->offset + sg->length > PAGE_SIZE); |
1444 | |
1445 | data.bvec_offset = sg->offset; |
1446 | data.bvec_data = kmap_atomic(page: sg_page(sg)); |
1447 | |
1448 | gnttab_foreach_grant_in_range(page: sg_page(sg), |
1449 | offset: sg->offset, |
1450 | len: sg->length, |
1451 | fn: blkif_copy_from_grant, |
1452 | data: &data); |
1453 | |
1454 | kunmap_atomic(data.bvec_data); |
1455 | } |
1456 | } |
1457 | /* Add the persistent grant into the list of free grants */ |
1458 | for (i = 0; i < num_grant; i++) { |
1459 | if (!gnttab_try_end_foreign_access(ref: s->grants_used[i]->gref)) { |
1460 | /* |
1461 | * If the grant is still mapped by the backend (the |
1462 | * backend has chosen to make this grant persistent) |
1463 | * we add it at the head of the list, so it will be |
1464 | * reused first. |
1465 | */ |
1466 | if (!info->feature_persistent) { |
1467 | pr_alert("backed has not unmapped grant: %u\n" , |
1468 | s->grants_used[i]->gref); |
1469 | return -1; |
1470 | } |
1471 | list_add(new: &s->grants_used[i]->node, head: &rinfo->grants); |
1472 | rinfo->persistent_gnts_c++; |
1473 | } else { |
1474 | /* |
1475 | * If the grant is not mapped by the backend we add it |
1476 | * to the tail of the list, so it will not be picked |
1477 | * again unless we run out of persistent grants. |
1478 | */ |
1479 | s->grants_used[i]->gref = INVALID_GRANT_REF; |
1480 | list_add_tail(new: &s->grants_used[i]->node, head: &rinfo->grants); |
1481 | } |
1482 | } |
1483 | if (s->req.operation == BLKIF_OP_INDIRECT) { |
1484 | for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { |
1485 | if (!gnttab_try_end_foreign_access(ref: s->indirect_grants[i]->gref)) { |
1486 | if (!info->feature_persistent) { |
1487 | pr_alert("backed has not unmapped grant: %u\n" , |
1488 | s->indirect_grants[i]->gref); |
1489 | return -1; |
1490 | } |
1491 | list_add(new: &s->indirect_grants[i]->node, head: &rinfo->grants); |
1492 | rinfo->persistent_gnts_c++; |
1493 | } else { |
1494 | struct page *indirect_page; |
1495 | |
1496 | /* |
1497 | * Add the used indirect page back to the list of |
1498 | * available pages for indirect grefs. |
1499 | */ |
1500 | if (!info->bounce) { |
1501 | indirect_page = s->indirect_grants[i]->page; |
1502 | list_add(new: &indirect_page->lru, head: &rinfo->indirect_pages); |
1503 | } |
1504 | s->indirect_grants[i]->gref = INVALID_GRANT_REF; |
1505 | list_add_tail(new: &s->indirect_grants[i]->node, head: &rinfo->grants); |
1506 | } |
1507 | } |
1508 | } |
1509 | |
1510 | return 1; |
1511 | } |
1512 | |
1513 | static irqreturn_t blkif_interrupt(int irq, void *dev_id) |
1514 | { |
1515 | struct request *req; |
1516 | struct blkif_response bret; |
1517 | RING_IDX i, rp; |
1518 | unsigned long flags; |
1519 | struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; |
1520 | struct blkfront_info *info = rinfo->dev_info; |
1521 | unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS; |
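	/*
	 * eoiflag starts out as "spurious" and is cleared as soon as a valid
	 * response is consumed, so the lateeoi machinery only throttles this
	 * event channel if the backend keeps raising interrupts without
	 * giving us any work to do.
	 */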
1522 | |
1523 | if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { |
1524 | xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); |
1525 | return IRQ_HANDLED; |
1526 | } |
1527 | |
1528 | spin_lock_irqsave(&rinfo->ring_lock, flags); |
1529 | again: |
1530 | rp = READ_ONCE(rinfo->ring.sring->rsp_prod); |
1531 | virt_rmb(); /* Ensure we see queued responses up to 'rp'. */ |
1532 | if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) { |
1533 | pr_alert("%s: illegal number of responses %u\n" , |
1534 | info->gd->disk_name, rp - rinfo->ring.rsp_cons); |
1535 | goto err; |
1536 | } |
1537 | |
1538 | for (i = rinfo->ring.rsp_cons; i != rp; i++) { |
1539 | unsigned long id; |
1540 | unsigned int op; |
1541 | |
1542 | eoiflag = 0; |
1543 | |
1544 | RING_COPY_RESPONSE(&rinfo->ring, i, &bret); |
1545 | id = bret.id; |
1546 | |
1547 | /* |
1548 | * The backend has messed up and given us an id that we would |
1549 | * never have given to it (we stamp it up to BLK_RING_SIZE - |
		 * look in get_id_from_freelist).
1551 | */ |
1552 | if (id >= BLK_RING_SIZE(info)) { |
1553 | pr_alert("%s: response has incorrect id (%ld)\n" , |
1554 | info->gd->disk_name, id); |
1555 | goto err; |
1556 | } |
1557 | if (rinfo->shadow[id].status != REQ_WAITING) { |
1558 | pr_alert("%s: response references no pending request\n" , |
1559 | info->gd->disk_name); |
1560 | goto err; |
1561 | } |
1562 | |
1563 | rinfo->shadow[id].status = REQ_PROCESSING; |
1564 | req = rinfo->shadow[id].request; |
1565 | |
1566 | op = rinfo->shadow[id].req.operation; |
1567 | if (op == BLKIF_OP_INDIRECT) |
1568 | op = rinfo->shadow[id].req.u.indirect.indirect_op; |
1569 | if (bret.operation != op) { |
1570 | pr_alert("%s: response has wrong operation (%u instead of %u)\n" , |
1571 | info->gd->disk_name, bret.operation, op); |
1572 | goto err; |
1573 | } |
1574 | |
1575 | if (bret.operation != BLKIF_OP_DISCARD) { |
1576 | int ret; |
1577 | |
1578 | /* |
1579 | * We may need to wait for an extra response if the |
1580 | * I/O request is split in 2 |
1581 | */ |
1582 | ret = blkif_completion(id: &id, rinfo, bret: &bret); |
1583 | if (!ret) |
1584 | continue; |
1585 | if (unlikely(ret < 0)) |
1586 | goto err; |
1587 | } |
1588 | |
1589 | if (add_id_to_freelist(rinfo, id)) { |
1590 | WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n" , |
1591 | info->gd->disk_name, op_name(bret.operation), id); |
1592 | continue; |
1593 | } |
1594 | |
1595 | if (bret.status == BLKIF_RSP_OKAY) |
1596 | blkif_req(req)->error = BLK_STS_OK;
1597 | else
1598 | blkif_req(req)->error = BLK_STS_IOERR;
1599 | |
1600 | switch (bret.operation) { |
1601 | case BLKIF_OP_DISCARD: |
1602 | if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { |
1603 | struct request_queue *rq = info->rq; |
1604 | |
1605 | pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1606 | info->gd->disk_name, op_name(bret.operation));
1607 | blkif_req(req)->error = BLK_STS_NOTSUPP;
1608 | info->feature_discard = 0;
1609 | info->feature_secdiscard = 0;
1610 | blk_queue_max_discard_sectors(rq, 0);
1611 | blk_queue_max_secure_erase_sectors(rq, 0);
1612 | } |
1613 | break; |
1614 | case BLKIF_OP_FLUSH_DISKCACHE: |
1615 | case BLKIF_OP_WRITE_BARRIER: |
1616 | if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) { |
1617 | pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1618 | info->gd->disk_name, op_name(bret.operation));
1619 | blkif_req(req)->error = BLK_STS_NOTSUPP;
1620 | } |
1621 | if (unlikely(bret.status == BLKIF_RSP_ERROR && |
1622 | rinfo->shadow[id].req.u.rw.nr_segments == 0)) { |
1623 | pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
1624 | info->gd->disk_name, op_name(bret.operation));
1625 | blkif_req(req)->error = BLK_STS_NOTSUPP;
1626 | } |
1627 | if (unlikely(blkif_req(req)->error)) { |
1628 | if (blkif_req(req)->error == BLK_STS_NOTSUPP)
1629 | blkif_req(req)->error = BLK_STS_OK;
1630 | info->feature_fua = 0; |
1631 | info->feature_flush = 0; |
1632 | xlvbd_flush(info); |
1633 | } |
1634 | fallthrough; |
1635 | case BLKIF_OP_READ: |
1636 | case BLKIF_OP_WRITE: |
1637 | if (unlikely(bret.status != BLKIF_RSP_OKAY)) |
1638 | dev_dbg_ratelimited(&info->xbdev->dev, |
1639 | "Bad return from blkdev data request: %#x\n" , |
1640 | bret.status); |
1641 | |
1642 | break; |
1643 | default: |
1644 | BUG(); |
1645 | } |
1646 | |
1647 | if (likely(!blk_should_fake_timeout(req->q))) |
1648 | blk_mq_complete_request(req);
1649 | } |
1650 | |
1651 | rinfo->ring.rsp_cons = i; |
1652 | |
1653 | if (i != rinfo->ring.req_prod_pvt) { |
1654 | int more_to_do; |
1655 | RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); |
1656 | if (more_to_do) |
1657 | goto again; |
1658 | } else |
1659 | rinfo->ring.sring->rsp_event = i + 1; |
1660 | |
1661 | kick_pending_request_queues_locked(rinfo); |
1662 | |
1663 | spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1664 | |
1665 | xen_irq_lateeoi(irq, eoiflag);
1666 | |
1667 | return IRQ_HANDLED; |
1668 | |
1669 | err: |
1670 | info->connected = BLKIF_STATE_ERROR; |
1671 | |
1672 | spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1673 | |
1674 | /* No EOI in order to avoid further interrupts. */ |
1675 | |
1676 | pr_alert("%s disabled for further use\n" , info->gd->disk_name); |
1677 | return IRQ_HANDLED; |
1678 | } |
1679 | |
1680 | |
1681 | static int setup_blkring(struct xenbus_device *dev, |
1682 | struct blkfront_ring_info *rinfo) |
1683 | { |
1684 | struct blkif_sring *sring; |
1685 | int err; |
1686 | struct blkfront_info *info = rinfo->dev_info; |
1687 | unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; |
1688 | |
1689 | err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring,
1690 | info->nr_ring_pages, rinfo->ring_ref);
1691 | if (err) |
1692 | goto fail; |
1693 | |
1694 | XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size); |
1695 | |
1696 | err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1697 | if (err) |
1698 | goto fail; |
1699 | |
1700 | err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt,
1701 | 0, "blkif", rinfo);
1702 | if (err <= 0) {
1703 | xenbus_dev_fatal(dev, err,
1704 | "bind_evtchn_to_irqhandler failed");
1705 | goto fail; |
1706 | } |
1707 | rinfo->irq = err; |
1708 | |
1709 | return 0; |
1710 | fail: |
1711 | blkif_free(info, 0);
1712 | return err; |
1713 | } |
1714 | |
1715 | /* |
1716 | * Write out the per-ring/queue xenstore nodes, including ring-ref and event-channel;
1717 | * each ring buffer may span multiple pages depending on ->nr_ring_pages.
1718 | */ |
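/*
 * Illustrative xenstore layout written by this helper (node names taken from
 * the code below; the exact set depends on nr_ring_pages and, for multi-queue
 * setups, on the "queue-%u" directory chosen by the caller):
 *
 *   <dir>/ring-ref          (single ring page)
 *   <dir>/ring-ref0..refN-1 (nr_ring_pages > 1)
 *   <dir>/event-channel
 */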
1719 | static int write_per_ring_nodes(struct xenbus_transaction xbt, |
1720 | struct blkfront_ring_info *rinfo, const char *dir) |
1721 | { |
1722 | int err; |
1723 | unsigned int i; |
1724 | const char *message = NULL; |
1725 | struct blkfront_info *info = rinfo->dev_info; |
1726 | |
1727 | if (info->nr_ring_pages == 1) { |
1728 | err = xenbus_printf(t: xbt, dir, node: "ring-ref" , fmt: "%u" , rinfo->ring_ref[0]); |
1729 | if (err) { |
1730 | message = "writing ring-ref" ; |
1731 | goto abort_transaction; |
1732 | } |
1733 | } else { |
1734 | for (i = 0; i < info->nr_ring_pages; i++) { |
1735 | char ring_ref_name[RINGREF_NAME_LEN]; |
1736 | |
1737 | snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1738 | err = xenbus_printf(xbt, dir, ring_ref_name,
1739 | "%u", rinfo->ring_ref[i]);
1740 | if (err) {
1741 | message = "writing ring-ref";
1742 | goto abort_transaction; |
1743 | } |
1744 | } |
1745 | } |
1746 | |
1747 | err = xenbus_printf(t: xbt, dir, node: "event-channel" , fmt: "%u" , rinfo->evtchn); |
1748 | if (err) { |
1749 | message = "writing event-channel" ; |
1750 | goto abort_transaction; |
1751 | } |
1752 | |
1753 | return 0; |
1754 | |
1755 | abort_transaction: |
1756 | xenbus_transaction_end(xbt, 1);
1757 | if (message)
1758 | xenbus_dev_fatal(info->xbdev, err, "%s", message);
1759 | |
1760 | return err; |
1761 | } |
1762 | |
1763 | /* Enable the persistent grants feature. */ |
1764 | static bool feature_persistent = true; |
1765 | module_param(feature_persistent, bool, 0644); |
1766 | MODULE_PARM_DESC(feature_persistent, |
1767 | "Enables the persistent grants feature" ); |
1768 | |
1769 | /* Common code used when first setting up, and when resuming. */ |
1770 | static int talk_to_blkback(struct xenbus_device *dev, |
1771 | struct blkfront_info *info) |
1772 | { |
1773 | const char *message = NULL; |
1774 | struct xenbus_transaction xbt; |
1775 | int err; |
1776 | unsigned int i, max_page_order; |
1777 | unsigned int ring_page_order; |
1778 | struct blkfront_ring_info *rinfo; |
1779 | |
1780 | if (!info) |
1781 | return -ENODEV; |
1782 | |
1783 | /* Check if backend is trusted. */ |
1784 | info->bounce = !xen_blkif_trusted || |
1785 | !xenbus_read_unsigned(dev->nodename, "trusted", 1);
1786 |
1787 | max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
1788 | "max-ring-page-order", 0);
1789 | ring_page_order = min(xen_blkif_max_ring_order, max_page_order); |
1790 | info->nr_ring_pages = 1 << ring_page_order; |
1791 | |
1792 | err = negotiate_mq(info); |
1793 | if (err) |
1794 | goto destroy_blkring; |
1795 | |
1796 | for_each_rinfo(info, rinfo, i) { |
1797 | /* Create shared ring, alloc event channel. */ |
1798 | err = setup_blkring(dev, rinfo); |
1799 | if (err) |
1800 | goto destroy_blkring; |
1801 | } |
1802 | |
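/*
 * The xenstore writes below are grouped into a single xenbus transaction.
 * xenbus_transaction_end() can return -EAGAIN if the transaction raced with
 * another writer, in which case the whole block is simply retried from the
 * "again" label.
 */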
1803 | again: |
1804 | err = xenbus_transaction_start(&xbt);
1805 | if (err) {
1806 | xenbus_dev_fatal(dev, err, "starting transaction");
1807 | goto destroy_blkring;
1808 | }
1809 |
1810 | if (info->nr_ring_pages > 1) {
1811 | err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1812 | ring_page_order);
1813 | if (err) {
1814 | message = "writing ring-page-order";
1815 | goto abort_transaction; |
1816 | } |
1817 | } |
1818 | |
1819 | /* We already got the number of queues/rings in _probe */ |
1820 | if (info->nr_rings == 1) { |
1821 | err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename);
1822 | if (err) |
1823 | goto destroy_blkring; |
1824 | } else { |
1825 | char *path; |
1826 | size_t pathsize; |
1827 | |
1828 | err = xenbus_printf(t: xbt, dir: dev->nodename, node: "multi-queue-num-queues" , fmt: "%u" , |
1829 | info->nr_rings); |
1830 | if (err) { |
1831 | message = "writing multi-queue-num-queues" ; |
1832 | goto abort_transaction; |
1833 | } |
1834 | |
1835 | pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; |
1836 | path = kmalloc(size: pathsize, GFP_KERNEL); |
1837 | if (!path) { |
1838 | err = -ENOMEM; |
1839 | message = "ENOMEM while writing ring references" ; |
1840 | goto abort_transaction; |
1841 | } |
1842 | |
1843 | for_each_rinfo(info, rinfo, i) { |
1844 | memset(path, 0, pathsize); |
1845 | snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1846 | err = write_per_ring_nodes(xbt, rinfo, path);
1847 | if (err) {
1848 | kfree(path);
1849 | goto destroy_blkring;
1850 | }
1851 | }
1852 | kfree(path);
1853 | }
1854 | err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1855 | XEN_IO_PROTO_ABI_NATIVE);
1856 | if (err) {
1857 | message = "writing protocol";
1858 | goto abort_transaction; |
1859 | } |
1860 | info->feature_persistent_parm = feature_persistent; |
1861 | err = xenbus_printf(t: xbt, dir: dev->nodename, node: "feature-persistent" , fmt: "%u" , |
1862 | info->feature_persistent_parm); |
1863 | if (err) |
1864 | dev_warn(&dev->dev, |
1865 | "writing persistent grants feature to xenbus" ); |
1866 | |
1867 | err = xenbus_transaction_end(t: xbt, abort: 0); |
1868 | if (err) { |
1869 | if (err == -EAGAIN) |
1870 | goto again; |
1871 | xenbus_dev_fatal(dev, err, fmt: "completing transaction" ); |
1872 | goto destroy_blkring; |
1873 | } |
1874 | |
1875 | for_each_rinfo(info, rinfo, i) { |
1876 | unsigned int j; |
1877 | |
1878 | for (j = 0; j < BLK_RING_SIZE(info); j++) |
1879 | rinfo->shadow[j].req.u.rw.id = j + 1; |
1880 | rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; |
1881 | } |
1882 | xenbus_switch_state(dev, XenbusStateInitialised);
1883 | |
1884 | return 0; |
1885 | |
1886 | abort_transaction: |
1887 | xenbus_transaction_end(xbt, 1);
1888 | if (message)
1889 | xenbus_dev_fatal(dev, err, "%s", message);
1890 | destroy_blkring:
1891 | blkif_free(info, 0);
1892 | return err; |
1893 | } |
1894 | |
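/*
 * negotiate_mq() sizes the multi-queue setup before any ring is created: it
 * reads the backend's "multi-queue-max-queues" node, clamps it to the
 * frontend's xen_blkif_max_queues parameter (with a minimum of one ring),
 * and allocates one blkfront_ring_info per ring.  Each entry embeds a
 * flexible shadow[] array sized via struct_size() for BLK_RING_SIZE(info)
 * in-flight requests, which is why kvcalloc() is called with rinfo_size as
 * the element size.
 */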
1895 | static int negotiate_mq(struct blkfront_info *info) |
1896 | { |
1897 | unsigned int backend_max_queues; |
1898 | unsigned int i; |
1899 | struct blkfront_ring_info *rinfo; |
1900 | |
1901 | BUG_ON(info->nr_rings); |
1902 | |
1903 | /* Check if backend supports multiple queues. */ |
1904 | backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend,
1905 | "multi-queue-max-queues", 1);
1906 | info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); |
1907 | /* We need at least one ring. */ |
1908 | if (!info->nr_rings) |
1909 | info->nr_rings = 1; |
1910 | |
1911 | info->rinfo_size = struct_size(info->rinfo, shadow, |
1912 | BLK_RING_SIZE(info)); |
1913 | info->rinfo = kvcalloc(info->nr_rings, info->rinfo_size, GFP_KERNEL);
1914 | if (!info->rinfo) {
1915 | xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure");
1916 | info->nr_rings = 0; |
1917 | return -ENOMEM; |
1918 | } |
1919 | |
1920 | for_each_rinfo(info, rinfo, i) { |
1921 | INIT_LIST_HEAD(&rinfo->indirect_pages);
1922 | INIT_LIST_HEAD(&rinfo->grants);
1923 | rinfo->dev_info = info; |
1924 | INIT_WORK(&rinfo->work, blkif_restart_queue); |
1925 | spin_lock_init(&rinfo->ring_lock); |
1926 | } |
1927 | return 0; |
1928 | } |
1929 | |
1930 | /* |
1931 | * Entry point to this code when a new device is created. Allocate the basic |
1932 | * structures and the ring buffer for communication with the backend, and |
1933 | * inform the backend of the appropriate details for those. Switch to |
1934 | * Initialised state. |
1935 | */ |
1936 | static int blkfront_probe(struct xenbus_device *dev, |
1937 | const struct xenbus_device_id *id) |
1938 | { |
1939 | int err, vdevice; |
1940 | struct blkfront_info *info; |
1941 | |
1942 | /* FIXME: Use dynamic device id if this is not set. */ |
1943 | err = xenbus_scanf(XBT_NIL, dev->nodename,
1944 | "virtual-device", "%i", &vdevice);
1945 | if (err != 1) {
1946 | /* go looking in the extended area instead */
1947 | err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
1948 | "%i", &vdevice);
1949 | if (err != 1) {
1950 | xenbus_dev_fatal(dev, err, "reading virtual-device");
1951 | return err; |
1952 | } |
1953 | } |
1954 | |
1955 | if (xen_hvm_domain()) { |
1956 | char *type; |
1957 | int len; |
1958 | /* no unplug has been done: do not hook devices != xen vbds */ |
1959 | if (xen_has_pv_and_legacy_disk_devices()) { |
1960 | int major; |
1961 | |
1962 | if (!VDEV_IS_EXTENDED(vdevice)) |
1963 | major = BLKIF_MAJOR(vdevice); |
1964 | else |
1965 | major = XENVBD_MAJOR; |
1966 | |
1967 | if (major != XENVBD_MAJOR) { |
1968 | printk(KERN_INFO |
1969 | "%s: HVM does not support vbd %d as xen block device\n" , |
1970 | __func__, vdevice); |
1971 | return -ENODEV; |
1972 | } |
1973 | } |
1974 | /* do not create a PV cdrom device if we are an HVM guest */ |
1975 | type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1976 | if (IS_ERR(type))
1977 | return -ENODEV;
1978 | if (strncmp(type, "cdrom", 5) == 0) {
1979 | kfree(type); |
1980 | return -ENODEV; |
1981 | } |
1982 | kfree(type); |
1983 | } |
1984 | info = kzalloc(sizeof(*info), GFP_KERNEL); |
1985 | if (!info) { |
1986 | xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1987 | return -ENOMEM; |
1988 | } |
1989 | |
1990 | info->xbdev = dev; |
1991 | |
1992 | mutex_init(&info->mutex); |
1993 | info->vdevice = vdevice; |
1994 | info->connected = BLKIF_STATE_DISCONNECTED; |
1995 | |
1996 | /* Front end dir is a number, which is used as the id. */ |
1997 | info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); |
1998 | dev_set_drvdata(&dev->dev, info); |
1999 | |
2000 | mutex_lock(&blkfront_mutex); |
2001 | list_add(&info->info_list, &info_list); |
2002 | mutex_unlock(&blkfront_mutex); |
2003 | |
2004 | return 0; |
2005 | } |
2006 | |
2007 | static int blkif_recover(struct blkfront_info *info) |
2008 | { |
2009 | unsigned int r_index; |
2010 | struct request *req, *n; |
2011 | int rc; |
2012 | struct bio *bio; |
2013 | unsigned int segs; |
2014 | struct blkfront_ring_info *rinfo; |
2015 | |
2016 | blkfront_gather_backend_features(info); |
2017 | /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ |
2018 | blkif_set_queue_limits(info); |
2019 | segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; |
2020 | blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); |
2021 | |
2022 | for_each_rinfo(info, rinfo, r_index) { |
2023 | rc = blkfront_setup_indirect(rinfo); |
2024 | if (rc) |
2025 | return rc; |
2026 | } |
2027 | xenbus_switch_state(info->xbdev, XenbusStateConnected);
2028 | |
2029 | /* Now safe for us to use the shared ring */ |
2030 | info->connected = BLKIF_STATE_CONNECTED; |
2031 | |
2032 | for_each_rinfo(info, rinfo, r_index) { |
2033 | /* Kick any other new requests queued since we resumed */ |
2034 | kick_pending_request_queues(rinfo); |
2035 | } |
2036 | |
2037 | list_for_each_entry_safe(req, n, &info->requests, queuelist) { |
2038 | /* Requeue pending requests (flush or discard) */ |
2039 | list_del_init(&req->queuelist);
2040 | BUG_ON(req->nr_phys_segments > segs);
2041 | blk_mq_requeue_request(req, false);
2042 | }
2043 | blk_mq_start_stopped_hw_queues(info->rq, true);
2044 | blk_mq_kick_requeue_list(info->rq);
2045 |
2046 | while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
2047 | /* Traverse the list of pending bios and re-queue them */ |
2048 | submit_bio(bio); |
2049 | } |
2050 | |
2051 | return 0; |
2052 | } |
2053 | |
2054 | /* |
2055 | * We are reconnecting to the backend, due to a suspend/resume, or a backend |
2056 | * driver restart. We tear down our blkif structure and recreate it, but |
2057 | * leave the device-layer structures intact so that this is transparent to the |
2058 | * rest of the kernel. |
2059 | */ |
2060 | static int blkfront_resume(struct xenbus_device *dev) |
2061 | { |
2062 | struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2063 | int err = 0;
2064 | unsigned int i, j;
2065 | struct blkfront_ring_info *rinfo;
2066 |
2067 | dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
2068 |
2069 | bio_list_init(&info->bio_list);
2070 | INIT_LIST_HEAD(&info->requests);
2071 | for_each_rinfo(info, rinfo, i) { |
2072 | struct bio_list merge_bio; |
2073 | struct blk_shadow *shadow = rinfo->shadow; |
2074 | |
2075 | for (j = 0; j < BLK_RING_SIZE(info); j++) { |
2076 | /* Not in use? */ |
2077 | if (!shadow[j].request) |
2078 | continue; |
2079 | |
2080 | /* |
2081 | * Get the bios in the request so we can re-queue them. |
2082 | */ |
2083 | if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
2084 | req_op(shadow[j].request) == REQ_OP_DISCARD ||
2085 | req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
2086 | shadow[j].request->cmd_flags & REQ_FUA) { |
2087 | /* |
2088 | * Flush operations don't contain bios, so |
2089 | * we need to requeue the whole request |
2090 | * |
2091 | * XXX: but this doesn't make any sense for a |
2092 | * write with the FUA flag set.. |
2093 | */ |
2094 | list_add(&shadow[j].request->queuelist, &info->requests);
2095 | continue; |
2096 | } |
2097 | merge_bio.head = shadow[j].request->bio; |
2098 | merge_bio.tail = shadow[j].request->biotail; |
2099 | bio_list_merge(&info->bio_list, &merge_bio);
2100 | shadow[j].request->bio = NULL;
2101 | blk_mq_end_request(shadow[j].request, BLK_STS_OK);
2102 | } |
2103 | } |
2104 | |
2105 | blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
2106 |
2107 | err = talk_to_blkback(dev, info);
2108 | if (!err)
2109 | blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
2110 | |
2111 | /* |
2112 | * We have to wait for the backend to switch to |
2113 | * connected state, since we want to read which |
2114 | * features it supports. |
2115 | */ |
2116 | |
2117 | return err; |
2118 | } |
2119 | |
2120 | static void blkfront_closing(struct blkfront_info *info) |
2121 | { |
2122 | struct xenbus_device *xbdev = info->xbdev; |
2123 | struct blkfront_ring_info *rinfo; |
2124 | unsigned int i; |
2125 | |
2126 | if (xbdev->state == XenbusStateClosing) |
2127 | return; |
2128 | |
2129 | /* No more blkif_request(). */ |
2130 | if (info->rq && info->gd) { |
2131 | blk_mq_stop_hw_queues(info->rq);
2132 | blk_mark_disk_dead(info->gd);
2133 | } |
2134 | |
2135 | for_each_rinfo(info, rinfo, i) { |
2136 | /* No more gnttab callback work. */ |
2137 | gnttab_cancel_free_callback(&rinfo->callback);
2138 |
2139 | /* Flush gnttab callback work. Must be done with no locks held. */
2140 | flush_work(&rinfo->work);
2141 | }
2142 |
2143 | xenbus_frontend_closed(xbdev);
2144 | } |
2145 | |
2146 | static void blkfront_setup_discard(struct blkfront_info *info) |
2147 | { |
2148 | info->feature_discard = 1; |
2149 | info->discard_granularity = xenbus_read_unsigned(info->xbdev->otherend,
2150 | "discard-granularity",
2151 | 0);
2152 | info->discard_alignment = xenbus_read_unsigned(info->xbdev->otherend,
2153 | "discard-alignment", 0);
2154 | info->feature_secdiscard =
2155 | !!xenbus_read_unsigned(info->xbdev->otherend, "discard-secure",
2156 | 0);
2157 | } |
2158 | |
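/*
 * Sizing note (illustrative arithmetic, assuming GRANTS_PER_PSEG is the
 * number of Xen grants per Linux block segment as defined earlier in this
 * file): blkfront_setup_indirect() works in grant granularity and converts
 * to Linux segments with psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG).
 * For example, with 4 KiB pages on both sides (GRANTS_PER_PSEG == 1) a
 * backend advertising 32 indirect segments yields psegs == 32, while with
 * 64 KiB Linux pages (GRANTS_PER_PSEG == 16) the same 32 grants map to 2
 * segments.
 */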
2159 | static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) |
2160 | { |
2161 | unsigned int psegs, grants, memflags; |
2162 | int err, i; |
2163 | struct blkfront_info *info = rinfo->dev_info; |
2164 | |
2165 | memflags = memalloc_noio_save(); |
2166 | |
2167 | if (info->max_indirect_segments == 0) { |
2168 | if (!HAS_EXTRA_REQ) |
2169 | grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; |
2170 | else { |
2171 | /* |
2172 | * When an extra req is required, the maximum |
2173 | * grants supported is related to the size of the |
2174 | * Linux block segment. |
2175 | */ |
2176 | grants = GRANTS_PER_PSEG; |
2177 | } |
2178 | } |
2179 | else |
2180 | grants = info->max_indirect_segments; |
2181 | psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG); |
2182 | |
2183 | err = fill_grant_buffer(rinfo,
2184 | (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
2185 | if (err) |
2186 | goto out_of_memory; |
2187 | |
2188 | if (!info->bounce && info->max_indirect_segments) { |
2189 | /* |
2190 | * We are using indirect descriptors but don't have a bounce |
2191 | * buffer, we need to allocate a set of pages that can be |
2192 | * used for mapping indirect grefs |
2193 | */ |
2194 | int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); |
2195 | |
2196 | BUG_ON(!list_empty(&rinfo->indirect_pages)); |
2197 | for (i = 0; i < num; i++) { |
2198 | struct page *indirect_page = alloc_page(GFP_KERNEL | |
2199 | __GFP_ZERO); |
2200 | if (!indirect_page) |
2201 | goto out_of_memory; |
2202 | list_add(&indirect_page->lru, &rinfo->indirect_pages);
2203 | } |
2204 | } |
2205 | |
2206 | for (i = 0; i < BLK_RING_SIZE(info); i++) { |
2207 | rinfo->shadow[i].grants_used = |
2208 | kvcalloc(grants,
2209 | sizeof(rinfo->shadow[i].grants_used[0]),
2210 | GFP_KERNEL);
2211 | rinfo->shadow[i].sg = kvcalloc(psegs,
2212 | sizeof(rinfo->shadow[i].sg[0]),
2213 | GFP_KERNEL);
2214 | if (info->max_indirect_segments)
2215 | rinfo->shadow[i].indirect_grants =
2216 | kvcalloc(INDIRECT_GREFS(grants),
2217 | sizeof(rinfo->shadow[i].indirect_grants[0]),
2218 | GFP_KERNEL);
2219 | if ((rinfo->shadow[i].grants_used == NULL) || |
2220 | (rinfo->shadow[i].sg == NULL) || |
2221 | (info->max_indirect_segments && |
2222 | (rinfo->shadow[i].indirect_grants == NULL))) |
2223 | goto out_of_memory; |
2224 | sg_init_table(rinfo->shadow[i].sg, psegs); |
2225 | } |
2226 | |
2227 | memalloc_noio_restore(memflags);
2228 | |
2229 | return 0; |
2230 | |
2231 | out_of_memory: |
2232 | for (i = 0; i < BLK_RING_SIZE(info); i++) { |
2233 | kvfree(rinfo->shadow[i].grants_used);
2234 | rinfo->shadow[i].grants_used = NULL;
2235 | kvfree(rinfo->shadow[i].sg);
2236 | rinfo->shadow[i].sg = NULL;
2237 | kvfree(rinfo->shadow[i].indirect_grants);
2238 | rinfo->shadow[i].indirect_grants = NULL;
2239 | }
2240 | if (!list_empty(&rinfo->indirect_pages)) {
2241 | struct page *indirect_page, *n;
2242 | list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
2243 | list_del(&indirect_page->lru);
2244 | __free_page(indirect_page); |
2245 | } |
2246 | } |
2247 | |
2248 | memalloc_noio_restore(memflags);
2249 | |
2250 | return -ENOMEM; |
2251 | } |
2252 | |
2253 | /* |
2254 | * Gather all backend feature-* |
2255 | */ |
2256 | static void blkfront_gather_backend_features(struct blkfront_info *info) |
2257 | { |
2258 | unsigned int indirect_segments; |
2259 | |
2260 | info->feature_flush = 0; |
2261 | info->feature_fua = 0; |
2262 | |
2263 | /* |
2264 | * If there's no "feature-barrier" defined, then it means |
2265 | * we're dealing with a very old backend which writes |
2266 | * synchronously; nothing to do. |
2267 | * |
2268 | * If there are barriers, then we use flush. |
2269 | */ |
2270 | if (xenbus_read_unsigned(info->xbdev->otherend, "feature-barrier", 0)) {
2271 | info->feature_flush = 1; |
2272 | info->feature_fua = 1; |
2273 | } |
2274 | |
2275 | /* |
2276 | * If "feature-flush-cache" is advertised, prefer it over
2277 | * barriers.
2278 | */ |
2279 | if (xenbus_read_unsigned(info->xbdev->otherend, "feature-flush-cache",
2280 | 0)) {
2281 | info->feature_flush = 1; |
2282 | info->feature_fua = 0; |
2283 | } |
2284 | |
2285 | if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
2286 | blkfront_setup_discard(info); |
2287 | |
2288 | if (info->feature_persistent_parm) |
2289 | info->feature_persistent = |
2290 | !!xenbus_read_unsigned(info->xbdev->otherend,
2291 | "feature-persistent", 0);
2292 | if (info->feature_persistent) |
2293 | info->bounce = true; |
2294 | |
2295 | indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
2296 | "feature-max-indirect-segments", 0);
2297 | if (indirect_segments > xen_blkif_max_segments) |
2298 | indirect_segments = xen_blkif_max_segments; |
2299 | if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST) |
2300 | indirect_segments = 0; |
2301 | info->max_indirect_segments = indirect_segments; |
2302 | |
2303 | if (info->feature_persistent) { |
2304 | mutex_lock(&blkfront_mutex); |
2305 | schedule_delayed_work(&blkfront_work, HZ * 10);
2306 | mutex_unlock(&blkfront_mutex);
2307 | } |
2308 | } |
2309 | |
2310 | /* |
2311 | * Invoked when the backend is finally 'ready' (and has provided
2312 | * the details about the physical device - #sectors, size, etc).
2313 | */ |
2314 | static void blkfront_connect(struct blkfront_info *info) |
2315 | { |
2316 | unsigned long long sectors; |
2317 | unsigned long sector_size; |
2318 | unsigned int physical_sector_size; |
2319 | int err, i; |
2320 | struct blkfront_ring_info *rinfo; |
2321 | |
2322 | switch (info->connected) { |
2323 | case BLKIF_STATE_CONNECTED: |
2324 | /* |
2325 | * Potentially, the back-end may be signalling |
2326 | * a capacity change; update the capacity. |
2327 | */ |
2328 | err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
2329 | "sectors", "%Lu", &sectors);
2330 | if (XENBUS_EXIST_ERR(err))
2331 | return;
2332 | printk(KERN_INFO "Setting capacity to %Lu\n",
2333 | sectors);
2334 | set_capacity_and_notify(info->gd, sectors); |
2335 | |
2336 | return; |
2337 | case BLKIF_STATE_SUSPENDED: |
2338 | /* |
2339 | * If we are recovering from suspension, we need to wait |
2340 | * for the backend to announce its features before
2341 | * reconnecting, at least we need to know if the backend |
2342 | * supports indirect descriptors, and how many. |
2343 | */ |
2344 | blkif_recover(info); |
2345 | return; |
2346 | |
2347 | default: |
2348 | break; |
2349 | } |
2350 | |
2351 | dev_dbg(&info->xbdev->dev, "%s:%s.\n",
2352 | __func__, info->xbdev->otherend); |
2353 | |
2354 | err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
2355 | "sectors", "%llu", &sectors,
2356 | "info", "%u", &info->vdisk_info,
2357 | "sector-size", "%lu", &sector_size,
2358 | NULL);
2359 | if (err) {
2360 | xenbus_dev_fatal(info->xbdev, err,
2361 | "reading backend fields at %s",
2362 | info->xbdev->otherend); |
2363 | return; |
2364 | } |
2365 | |
2366 | /* |
2367 | * physical-sector-size is a newer field, so old backends may not |
2368 | * provide this. Assume physical sector size to be the same as |
2369 | * sector_size in that case. |
2370 | */ |
2371 | physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend, |
2372 | "physical-sector-size" , |
2373 | sector_size); |
2374 | blkfront_gather_backend_features(info); |
2375 | for_each_rinfo(info, rinfo, i) { |
2376 | err = blkfront_setup_indirect(rinfo); |
2377 | if (err) { |
2378 | xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s" , |
2379 | info->xbdev->otherend); |
2380 | blkif_free(info, 0); |
2381 | break; |
2382 | } |
2383 | } |
2384 | |
2385 | err = xlvbd_alloc_gendisk(sectors, info, sector_size, |
2386 | physical_sector_size); |
2387 | if (err) { |
2388 | xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s" , |
2389 | info->xbdev->otherend); |
2390 | goto fail; |
2391 | } |
2392 | |
2393 | xenbus_switch_state(info->xbdev, XenbusStateConnected); |
2394 | |
2395 | /* Kick pending requests. */ |
2396 | info->connected = BLKIF_STATE_CONNECTED; |
2397 | for_each_rinfo(info, rinfo, i) |
2398 | kick_pending_request_queues(rinfo); |
2399 | |
2400 | err = device_add_disk(&info->xbdev->dev, info->gd, NULL); |
2401 | if (err) { |
2402 | put_disk(info->gd); |
2403 | blk_mq_free_tag_set(&info->tag_set); |
2404 | info->rq = NULL; |
2405 | goto fail; |
2406 | } |
2407 | |
2408 | info->is_ready = 1; |
2409 | return; |
2410 | |
2411 | fail: |
2412 | blkif_free(info, 0); |
2413 | return; |
2414 | } |
2415 | |
2416 | /* |
2417 | * Callback received when the backend's state changes. |
2418 | */ |
2419 | static void blkback_changed(struct xenbus_device *dev, |
2420 | enum xenbus_state backend_state) |
2421 | { |
2422 | struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2423 |
2424 | dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
2425 | |
2426 | switch (backend_state) { |
2427 | case XenbusStateInitWait: |
2428 | if (dev->state != XenbusStateInitialising) |
2429 | break; |
2430 | if (talk_to_blkback(dev, info)) |
2431 | break; |
2432 | break; |
2433 | case XenbusStateInitialising: |
2434 | case XenbusStateInitialised: |
2435 | case XenbusStateReconfiguring: |
2436 | case XenbusStateReconfigured: |
2437 | case XenbusStateUnknown: |
2438 | break; |
2439 | |
2440 | case XenbusStateConnected: |
2441 | /* |
2442 | * talk_to_blkback sets state to XenbusStateInitialised |
2443 | * and blkfront_connect sets it to XenbusStateConnected |
2444 | * (if connection went OK). |
2445 | * |
2446 | * If the backend (or toolstack) decides to poke at backend |
2447 | * state (and re-trigger the watch by setting the state repeatedly |
2448 | * to XenbusStateConnected (4)) we need to deal with this. |
2449 | * This is allowed as this is used to communicate to the guest |
2450 | * that the size of disk has changed! |
2451 | */ |
2452 | if ((dev->state != XenbusStateInitialised) && |
2453 | (dev->state != XenbusStateConnected)) { |
2454 | if (talk_to_blkback(dev, info)) |
2455 | break; |
2456 | } |
2457 | |
2458 | blkfront_connect(info); |
2459 | break; |
2460 | |
2461 | case XenbusStateClosed: |
2462 | if (dev->state == XenbusStateClosed) |
2463 | break; |
2464 | fallthrough; |
2465 | case XenbusStateClosing: |
2466 | blkfront_closing(info); |
2467 | break; |
2468 | } |
2469 | } |
2470 | |
2471 | static void blkfront_remove(struct xenbus_device *xbdev) |
2472 | { |
2473 | struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
2474 |
2475 | dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
2476 |
2477 | if (info->gd)
2478 | del_gendisk(info->gd);
2479 |
2480 | mutex_lock(&blkfront_mutex);
2481 | list_del(&info->info_list);
2482 | mutex_unlock(&blkfront_mutex);
2483 |
2484 | blkif_free(info, 0);
2485 | if (info->gd) {
2486 | xlbd_release_minors(info->gd->first_minor, info->gd->minors);
2487 | put_disk(info->gd);
2488 | blk_mq_free_tag_set(&info->tag_set);
2489 | }
2490 |
2491 | kfree(info);
2492 | } |
2493 | |
2494 | static int blkfront_is_ready(struct xenbus_device *dev) |
2495 | { |
2496 | struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2497 | |
2498 | return info->is_ready && info->xbdev; |
2499 | } |
2500 | |
2501 | static const struct block_device_operations xlvbd_block_fops = |
2502 | { |
2503 | .owner = THIS_MODULE, |
2504 | .getgeo = blkif_getgeo, |
2505 | .ioctl = blkif_ioctl, |
2506 | .compat_ioctl = blkdev_compat_ptr_ioctl, |
2507 | }; |
2508 | |
2509 | |
2510 | static const struct xenbus_device_id blkfront_ids[] = { |
2511 | { "vbd" }, |
2512 | { "" } |
2513 | }; |
2514 | |
2515 | static struct xenbus_driver blkfront_driver = { |
2516 | .ids = blkfront_ids, |
2517 | .probe = blkfront_probe, |
2518 | .remove = blkfront_remove, |
2519 | .resume = blkfront_resume, |
2520 | .otherend_changed = blkback_changed, |
2521 | .is_ready = blkfront_is_ready, |
2522 | }; |
2523 | |
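/*
 * Periodic cleanup of persistent grants: for each ring, any grant whose
 * foreign access can be ended (i.e. the backend no longer has it mapped) is
 * revoked, marked INVALID_GRANT_REF and moved to the tail of the grants
 * list, so that still-mapped persistent grants keep being reused first.
 * This runs from blkfront_delay_work() while feature_persistent is set.
 */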
2524 | static void purge_persistent_grants(struct blkfront_info *info) |
2525 | { |
2526 | unsigned int i; |
2527 | unsigned long flags; |
2528 | struct blkfront_ring_info *rinfo; |
2529 | |
2530 | for_each_rinfo(info, rinfo, i) { |
2531 | struct grant *gnt_list_entry, *tmp; |
2532 | LIST_HEAD(grants); |
2533 | |
2534 | spin_lock_irqsave(&rinfo->ring_lock, flags); |
2535 | |
2536 | if (rinfo->persistent_gnts_c == 0) { |
2537 | spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2538 | continue; |
2539 | } |
2540 | |
2541 | list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants, |
2542 | node) { |
2543 | if (gnt_list_entry->gref == INVALID_GRANT_REF || |
2544 | !gnttab_try_end_foreign_access(gnt_list_entry->gref))
2545 | continue;
2546 |
2547 | list_del(&gnt_list_entry->node);
2548 | rinfo->persistent_gnts_c--;
2549 | gnt_list_entry->gref = INVALID_GRANT_REF;
2550 | list_add_tail(&gnt_list_entry->node, &grants);
2551 | }
2552 |
2553 | list_splice_tail(&grants, &rinfo->grants);
2554 |
2555 | spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2556 | } |
2557 | } |
2558 | |
2559 | static void blkfront_delay_work(struct work_struct *work) |
2560 | { |
2561 | struct blkfront_info *info; |
2562 | bool need_schedule_work = false; |
2563 | |
2564 | /* |
2565 | * Note that when using bounce buffers but not persistent grants |
2566 | * there's no need to run blkfront_delay_work because grants are |
2567 | * revoked in blkif_completion or else an error is reported and the |
2568 | * connection is closed. |
2569 | */ |
2570 | |
2571 | mutex_lock(&blkfront_mutex); |
2572 | |
2573 | list_for_each_entry(info, &info_list, info_list) { |
2574 | if (info->feature_persistent) { |
2575 | need_schedule_work = true; |
2576 | mutex_lock(&info->mutex); |
2577 | purge_persistent_grants(info); |
2578 | mutex_unlock(&info->mutex);
2579 | } |
2580 | } |
2581 | |
2582 | if (need_schedule_work) |
2583 | schedule_delayed_work(&blkfront_work, HZ * 10);
2584 |
2585 | mutex_unlock(&blkfront_mutex);
2586 | } |
2587 | |
2588 | static int __init xlblk_init(void) |
2589 | { |
2590 | int ret; |
2591 | int nr_cpus = num_online_cpus(); |
2592 | |
2593 | if (!xen_domain()) |
2594 | return -ENODEV; |
2595 | |
2596 | if (!xen_has_pv_disk_devices()) |
2597 | return -ENODEV; |
2598 | |
2599 | if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { |
2600 | pr_warn("xen_blk: can't get major %d with name %s\n" , |
2601 | XENVBD_MAJOR, DEV_NAME); |
2602 | return -ENODEV; |
2603 | } |
2604 | |
2605 | if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) |
2606 | xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; |
2607 | |
2608 | if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { |
2609 | pr_info("Invalid max_ring_order (%d), will use default max: %d.\n" , |
2610 | xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); |
2611 | xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; |
2612 | } |
2613 | |
2614 | if (xen_blkif_max_queues > nr_cpus) { |
2615 | pr_info("Invalid max_queues (%d), will use default max: %d.\n" , |
2616 | xen_blkif_max_queues, nr_cpus); |
2617 | xen_blkif_max_queues = nr_cpus; |
2618 | } |
2619 | |
2620 | INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work); |
2621 | |
2622 | ret = xenbus_register_frontend(&blkfront_driver); |
2623 | if (ret) { |
2624 | unregister_blkdev(XENVBD_MAJOR, DEV_NAME); |
2625 | return ret; |
2626 | } |
2627 | |
2628 | return 0; |
2629 | } |
2630 | module_init(xlblk_init); |
2631 | |
2632 | |
2633 | static void __exit xlblk_exit(void) |
2634 | { |
2635 | cancel_delayed_work_sync(&blkfront_work);
2636 |
2637 | xenbus_unregister_driver(&blkfront_driver);
2638 | unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2639 | kfree(minors);
2640 | } |
2641 | module_exit(xlblk_exit); |
2642 | |
2643 | MODULE_DESCRIPTION("Xen virtual block device frontend");
2644 | MODULE_LICENSE("GPL");
2645 | MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2646 | MODULE_ALIAS("xen:vbd");
2647 | MODULE_ALIAS("xenblk");
2648 | |