1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /****************************************************************************** |
3 | * privcmd.c |
4 | * |
5 | * Interface to privileged domain-0 commands. |
6 | * |
7 | * Copyright (c) 2002-2004, K A Fraser, B Dragovic |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt |
11 | |
12 | #include <linux/eventfd.h> |
13 | #include <linux/file.h> |
14 | #include <linux/kernel.h> |
15 | #include <linux/module.h> |
16 | #include <linux/mutex.h> |
17 | #include <linux/poll.h> |
18 | #include <linux/sched.h> |
19 | #include <linux/slab.h> |
20 | #include <linux/string.h> |
21 | #include <linux/workqueue.h> |
22 | #include <linux/errno.h> |
23 | #include <linux/mm.h> |
24 | #include <linux/mman.h> |
25 | #include <linux/uaccess.h> |
26 | #include <linux/swap.h> |
27 | #include <linux/highmem.h> |
28 | #include <linux/pagemap.h> |
29 | #include <linux/seq_file.h> |
30 | #include <linux/miscdevice.h> |
31 | #include <linux/moduleparam.h> |
32 | #include <linux/virtio_mmio.h> |
33 | |
34 | #include <asm/xen/hypervisor.h> |
35 | #include <asm/xen/hypercall.h> |
36 | |
37 | #include <xen/xen.h> |
38 | #include <xen/events.h> |
39 | #include <xen/privcmd.h> |
40 | #include <xen/interface/xen.h> |
41 | #include <xen/interface/memory.h> |
42 | #include <xen/interface/hvm/dm_op.h> |
43 | #include <xen/interface/hvm/ioreq.h> |
44 | #include <xen/features.h> |
45 | #include <xen/page.h> |
46 | #include <xen/xen-ops.h> |
47 | #include <xen/balloon.h> |
48 | |
49 | #include "privcmd.h" |
50 | |
51 | MODULE_LICENSE("GPL"); |
52 | |
53 | #define PRIV_VMA_LOCKED ((void *)1) |
54 | |
55 | static unsigned int privcmd_dm_op_max_num = 16; |
56 | module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); |
57 | MODULE_PARM_DESC(dm_op_max_nr_bufs, |
58 | "Maximum number of buffers per dm_op hypercall" ); |
59 | |
60 | static unsigned int privcmd_dm_op_buf_max_size = 4096; |
61 | module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, |
62 | 0644); |
63 | MODULE_PARM_DESC(dm_op_buf_max_size, |
64 | "Maximum size of a dm_op hypercall buffer" ); |
65 | |
66 | struct privcmd_data { |
67 | domid_t domid; |
68 | }; |
69 | |
70 | static int privcmd_vma_range_is_mapped( |
71 | struct vm_area_struct *vma, |
72 | unsigned long addr, |
73 | unsigned long nr_pages); |
74 | |
75 | static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) |
76 | { |
77 | struct privcmd_data *data = file->private_data; |
78 | struct privcmd_hypercall hypercall; |
79 | long ret; |
80 | |
81 | /* Disallow arbitrary hypercalls if restricted */ |
82 | if (data->domid != DOMID_INVALID) |
83 | return -EPERM; |
84 | |
85 | if (copy_from_user(&hypercall, udata, sizeof(hypercall))) |
86 | return -EFAULT; |
87 | |
88 | xen_preemptible_hcall_begin(); |
89 | ret = privcmd_call(hypercall.op, |
90 | hypercall.arg[0], hypercall.arg[1], |
91 | hypercall.arg[2], hypercall.arg[3], |
92 | hypercall.arg[4]); |
93 | xen_preemptible_hcall_end(); |
94 | |
95 | return ret; |
96 | } |
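
/*
 * Illustrative userspace sketch (not part of this driver): issuing a raw
 * hypercall through /dev/xen/privcmd. The op and arguments below are
 * placeholders; real values come from the Xen public headers.
 *
 *	int fd = open("/dev/xen/privcmd", O_RDWR);
 *	struct privcmd_hypercall call = {
 *		.op = __HYPERVISOR_xen_version,
 *		.arg = { XENVER_version, 0, 0, 0, 0 },
 *	};
 *	long ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
 *
 * The ioctl returns the hypercall's return value, or -EPERM if this file
 * handle was previously restricted with IOCTL_PRIVCMD_RESTRICT.
 */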
97 | |
98 | static void free_page_list(struct list_head *pages) |
99 | { |
100 | struct page *p, *n; |
101 | |
102 | list_for_each_entry_safe(p, n, pages, lru) |
103 | __free_page(p); |
104 | |
105 | INIT_LIST_HEAD(pages); |
106 | } |
107 | |
108 | /* |
109 | * Given an array of items in userspace, return a list of pages |
110 | * containing the data. If copying fails, either because of memory |
111 | * allocation failure or a problem reading user memory, return an |
112 | * error code; it's up to the caller to dispose of any partial list. |
113 | */ |
114 | static int gather_array(struct list_head *pagelist, |
115 | unsigned nelem, size_t size, |
116 | const void __user *data) |
117 | { |
118 | unsigned pageidx; |
119 | void *pagedata; |
120 | int ret; |
121 | |
122 | if (size > PAGE_SIZE) |
123 | return 0; |
124 | |
125 | pageidx = PAGE_SIZE; |
126 | pagedata = NULL; /* quiet, gcc */ |
127 | while (nelem--) { |
128 | if (pageidx > PAGE_SIZE-size) { |
129 | struct page *page = alloc_page(GFP_KERNEL); |
130 | |
131 | ret = -ENOMEM; |
132 | if (page == NULL) |
133 | goto fail; |
134 | |
135 | pagedata = page_address(page); |
136 | |
137 | list_add_tail(&page->lru, pagelist); |
138 | pageidx = 0; |
139 | } |
140 | |
141 | ret = -EFAULT; |
142 | if (copy_from_user(pagedata + pageidx, data, size)) |
143 | goto fail; |
144 | |
145 | data += size; |
146 | pageidx += size; |
147 | } |
148 | |
149 | ret = 0; |
150 | |
151 | fail: |
152 | return ret; |
153 | } |
154 | |
155 | /* |
156 | * Call function "fn" on each element of the array fragmented |
157 | * over a list of pages. |
158 | */ |
159 | static int traverse_pages(unsigned nelem, size_t size, |
160 | struct list_head *pos, |
161 | int (*fn)(void *data, void *state), |
162 | void *state) |
163 | { |
164 | void *pagedata; |
165 | unsigned pageidx; |
166 | int ret = 0; |
167 | |
168 | BUG_ON(size > PAGE_SIZE); |
169 | |
170 | pageidx = PAGE_SIZE; |
171 | pagedata = NULL; /* hush, gcc */ |
172 | |
173 | while (nelem--) { |
174 | if (pageidx > PAGE_SIZE-size) { |
175 | struct page *page; |
176 | pos = pos->next; |
177 | page = list_entry(pos, struct page, lru); |
178 | pagedata = page_address(page); |
179 | pageidx = 0; |
180 | } |
181 | |
182 | ret = (*fn)(pagedata + pageidx, state); |
183 | if (ret) |
184 | break; |
185 | pageidx += size; |
186 | } |
187 | |
188 | return ret; |
189 | } |
190 | |
191 | /* |
192 | * Similar to traverse_pages, but use each page as a "block" of |
193 | * data to be processed as one unit. |
194 | */ |
195 | static int traverse_pages_block(unsigned nelem, size_t size, |
196 | struct list_head *pos, |
197 | int (*fn)(void *data, int nr, void *state), |
198 | void *state) |
199 | { |
200 | void *pagedata; |
201 | int ret = 0; |
202 | |
203 | BUG_ON(size > PAGE_SIZE); |
204 | |
205 | while (nelem) { |
206 | int nr = (PAGE_SIZE/size); |
207 | struct page *page; |
208 | if (nr > nelem) |
209 | nr = nelem; |
210 | pos = pos->next; |
211 | page = list_entry(pos, struct page, lru); |
212 | pagedata = page_address(page); |
213 | ret = (*fn)(pagedata, nr, state); |
214 | if (ret) |
215 | break; |
216 | nelem -= nr; |
217 | } |
218 | |
219 | return ret; |
220 | } |
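
/*
 * Worked example for the helpers above, assuming a 4 KiB PAGE_SIZE and the
 * 24-byte struct privcmd_mmap_entry: gather_array() fits 170 entries per
 * page (4096 / 24, rounded down), so copying 512 entries allocates four
 * pages. traverse_pages() walks the same list one entry at a time, moving
 * to the next page whenever fewer than "size" bytes remain, i.e. at the
 * same boundaries gather_array() used; traverse_pages_block() instead
 * hands each page's worth of entries to "fn" in a single call.
 */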
221 | |
222 | struct mmap_gfn_state { |
223 | unsigned long va; |
224 | struct vm_area_struct *vma; |
225 | domid_t domain; |
226 | }; |
227 | |
228 | static int mmap_gfn_range(void *data, void *state) |
229 | { |
230 | struct privcmd_mmap_entry *msg = data; |
231 | struct mmap_gfn_state *st = state; |
232 | struct vm_area_struct *vma = st->vma; |
233 | int rc; |
234 | |
235 | /* Do not allow range to wrap the address space. */ |
236 | if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || |
237 | ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) |
238 | return -EINVAL; |
239 | |
240 | /* Range chunks must be contiguous in va space. */ |
241 | if ((msg->va != st->va) || |
242 | ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) |
243 | return -EINVAL; |
244 | |
245 | rc = xen_remap_domain_gfn_range(vma, |
246 | msg->va & PAGE_MASK, |
247 | msg->mfn, msg->npages, |
248 | vma->vm_page_prot, |
249 | st->domain, NULL); |
250 | if (rc < 0) |
251 | return rc; |
252 | |
253 | st->va += msg->npages << PAGE_SHIFT; |
254 | |
255 | return 0; |
256 | } |
257 | |
258 | static long privcmd_ioctl_mmap(struct file *file, void __user *udata) |
259 | { |
260 | struct privcmd_data *data = file->private_data; |
261 | struct privcmd_mmap mmapcmd; |
262 | struct mm_struct *mm = current->mm; |
263 | struct vm_area_struct *vma; |
264 | int rc; |
265 | LIST_HEAD(pagelist); |
266 | struct mmap_gfn_state state; |
267 | |
268 | /* We only support privcmd_ioctl_mmap_batch for non-auto-translated. */ |
269 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
270 | return -ENOSYS; |
271 | |
272 | if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) |
273 | return -EFAULT; |
274 | |
275 | /* If restriction is in place, check the domid matches */ |
276 | if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) |
277 | return -EPERM; |
278 | |
279 | rc = gather_array(&pagelist, |
280 | mmapcmd.num, sizeof(struct privcmd_mmap_entry), |
281 | mmapcmd.entry); |
282 | |
283 | if (rc || list_empty(&pagelist)) |
284 | goto out; |
285 | |
286 | mmap_write_lock(mm); |
287 | |
288 | { |
289 | struct page *page = list_first_entry(&pagelist, |
290 | struct page, lru); |
291 | struct privcmd_mmap_entry *msg = page_address(page); |
292 | |
293 | vma = vma_lookup(mm, msg->va); |
294 | rc = -EINVAL; |
295 | |
296 | if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) |
297 | goto out_up; |
298 | vma->vm_private_data = PRIV_VMA_LOCKED; |
299 | } |
300 | |
301 | state.va = vma->vm_start; |
302 | state.vma = vma; |
303 | state.domain = mmapcmd.dom; |
304 | |
305 | rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), |
306 | &pagelist, |
307 | mmap_gfn_range, &state); |
308 | |
309 | |
310 | out_up: |
311 | mmap_write_unlock(mm); |
312 | |
313 | out: |
314 | free_page_list(&pagelist); |
315 | |
316 | return rc; |
317 | } |
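
/*
 * Illustrative userspace sketch (not part of this driver) of the legacy
 * IOCTL_PRIVCMD_MMAP handled above; fd, domid, mfn and npages are
 * placeholders. The target VA must come from an earlier mmap() of this
 * device and the entry must start exactly at vma->vm_start.
 *
 *	void *va = mmap(NULL, npages * 4096, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	struct privcmd_mmap_entry ent = {
 *		.va = (uintptr_t)va, .mfn = mfn, .npages = npages,
 *	};
 *	struct privcmd_mmap cmd = { .num = 1, .dom = domid, .entry = &ent };
 *	ioctl(fd, IOCTL_PRIVCMD_MMAP, &cmd);
 */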
318 | |
319 | struct mmap_batch_state { |
320 | domid_t domain; |
321 | unsigned long va; |
322 | struct vm_area_struct *vma; |
323 | int index; |
324 | /* A tristate: |
325 | * 0 for no errors |
326 | * 1 if at least one error has happened (and no |
327 | * -ENOENT errors have happened) |
328 | * -ENOENT if at least 1 -ENOENT has happened. |
329 | */ |
330 | int global_error; |
331 | int version; |
332 | |
333 | /* User-space gfn array to store errors in the second pass for V1. */ |
334 | xen_pfn_t __user *user_gfn; |
335 | /* User-space int array to store errors in the second pass for V2. */ |
336 | int __user *user_err; |
337 | }; |
338 | |
339 | /* auto translated dom0 note: if domU being created is PV, then gfn is |
340 | * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP). |
341 | */ |
342 | static int mmap_batch_fn(void *data, int nr, void *state) |
343 | { |
344 | xen_pfn_t *gfnp = data; |
345 | struct mmap_batch_state *st = state; |
346 | struct vm_area_struct *vma = st->vma; |
347 | struct page **pages = vma->vm_private_data; |
348 | struct page **cur_pages = NULL; |
349 | int ret; |
350 | |
351 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
352 | cur_pages = &pages[st->index]; |
353 | |
354 | BUG_ON(nr < 0); |
355 | ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr, |
356 | (int *)gfnp, st->vma->vm_page_prot, |
357 | st->domain, cur_pages); |
358 | |
359 | /* Adjust the global_error? */ |
360 | if (ret != nr) { |
361 | if (ret == -ENOENT) |
362 | st->global_error = -ENOENT; |
363 | else { |
364 | /* Record that at least one error has happened. */ |
365 | if (st->global_error == 0) |
366 | st->global_error = 1; |
367 | } |
368 | } |
369 | st->va += XEN_PAGE_SIZE * nr; |
370 | st->index += nr / XEN_PFN_PER_PAGE; |
371 | |
372 | return 0; |
373 | } |
374 | |
375 | static int mmap_return_error(int err, struct mmap_batch_state *st) |
376 | { |
377 | int ret; |
378 | |
379 | if (st->version == 1) { |
380 | if (err) { |
381 | xen_pfn_t gfn; |
382 | |
383 | ret = get_user(gfn, st->user_gfn); |
384 | if (ret < 0) |
385 | return ret; |
386 | /* |
387 | * V1 encodes the error codes in the 32bit top |
388 | * nibble of the gfn (with its known |
389 | * limitations vis-a-vis 64 bit callers). |
390 | */ |
391 | gfn |= (err == -ENOENT) ? |
392 | PRIVCMD_MMAPBATCH_PAGED_ERROR : |
393 | PRIVCMD_MMAPBATCH_MFN_ERROR; |
394 | return __put_user(gfn, st->user_gfn++); |
395 | } else |
396 | st->user_gfn++; |
397 | } else { /* st->version == 2 */ |
398 | if (err) |
399 | return __put_user(err, st->user_err++); |
400 | else |
401 | st->user_err++; |
402 | } |
403 | |
404 | return 0; |
405 | } |
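
/*
 * Example of the V1 encoding above, assuming the uapi constants
 * PRIVCMD_MMAPBATCH_PAGED_ERROR == 0x80000000 and
 * PRIVCMD_MMAPBATCH_MFN_ERROR == 0xf0000000: a gfn of 0x12345 that failed
 * with -ENOENT is written back as 0x80012345, and any other failure as
 * 0xf0012345, so gfns using bit 28 or above cannot be reported exactly.
 */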
406 | |
407 | static int mmap_return_errors(void *data, int nr, void *state) |
408 | { |
409 | struct mmap_batch_state *st = state; |
410 | int *errs = data; |
411 | int i; |
412 | int ret; |
413 | |
414 | for (i = 0; i < nr; i++) { |
415 | ret = mmap_return_error(errs[i], st); |
416 | if (ret < 0) |
417 | return ret; |
418 | } |
419 | return 0; |
420 | } |
421 | |
422 | /* Allocate pfns that are then mapped with gfns from foreign domid. Update |
423 | * the vma with the page info to use later. |
424 | * Returns: 0 if success, otherwise -errno |
425 | */ |
426 | static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) |
427 | { |
428 | int rc; |
429 | struct page **pages; |
430 | |
431 | pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); |
432 | if (pages == NULL) |
433 | return -ENOMEM; |
434 | |
435 | rc = xen_alloc_unpopulated_pages(numpgs, pages); |
436 | if (rc != 0) { |
437 | pr_warn("%s Could not alloc %d pfns rc:%d\n" , __func__, |
438 | numpgs, rc); |
439 | kvfree(pages); |
440 | return -ENOMEM; |
441 | } |
442 | BUG_ON(vma->vm_private_data != NULL); |
443 | vma->vm_private_data = pages; |
444 | |
445 | return 0; |
446 | } |
447 | |
448 | static const struct vm_operations_struct privcmd_vm_ops; |
449 | |
450 | static long privcmd_ioctl_mmap_batch( |
451 | struct file *file, void __user *udata, int version) |
452 | { |
453 | struct privcmd_data *data = file->private_data; |
454 | int ret; |
455 | struct privcmd_mmapbatch_v2 m; |
456 | struct mm_struct *mm = current->mm; |
457 | struct vm_area_struct *vma; |
458 | unsigned long nr_pages; |
459 | LIST_HEAD(pagelist); |
460 | struct mmap_batch_state state; |
461 | |
462 | switch (version) { |
463 | case 1: |
464 | if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) |
465 | return -EFAULT; |
466 | /* Returns per-frame error in m.arr. */ |
467 | m.err = NULL; |
468 | if (!access_ok(m.arr, m.num * sizeof(*m.arr))) |
469 | return -EFAULT; |
470 | break; |
471 | case 2: |
472 | if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) |
473 | return -EFAULT; |
474 | /* Returns per-frame error code in m.err. */ |
475 | if (!access_ok(m.err, m.num * (sizeof(*m.err)))) |
476 | return -EFAULT; |
477 | break; |
478 | default: |
479 | return -EINVAL; |
480 | } |
481 | |
482 | /* If restriction is in place, check the domid matches */ |
483 | if (data->domid != DOMID_INVALID && data->domid != m.dom) |
484 | return -EPERM; |
485 | |
486 | nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); |
487 | if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) |
488 | return -EINVAL; |
489 | |
490 | ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); |
491 | |
492 | if (ret) |
493 | goto out; |
494 | if (list_empty(&pagelist)) { |
495 | ret = -EINVAL; |
496 | goto out; |
497 | } |
498 | |
499 | if (version == 2) { |
500 | /* Zero error array now to only copy back actual errors. */ |
501 | if (clear_user(m.err, sizeof(int) * m.num)) { |
502 | ret = -EFAULT; |
503 | goto out; |
504 | } |
505 | } |
506 | |
507 | mmap_write_lock(mm); |
508 | |
509 | vma = find_vma(mm, m.addr); |
510 | if (!vma || |
511 | vma->vm_ops != &privcmd_vm_ops) { |
512 | ret = -EINVAL; |
513 | goto out_unlock; |
514 | } |
515 | |
516 | /* |
517 | * Caller must either: |
518 | * |
519 | * Map the whole VMA range, which will also allocate all the |
520 | * pages required for the auto_translated_physmap case. |
521 | * |
522 | * Or |
523 | * |
524 | * Map unmapped holes left from a previous map attempt (e.g., |
525 | * because those foreign frames were previously paged out). |
526 | */ |
527 | if (vma->vm_private_data == NULL) { |
528 | if (m.addr != vma->vm_start || |
529 | m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { |
530 | ret = -EINVAL; |
531 | goto out_unlock; |
532 | } |
533 | if (xen_feature(XENFEAT_auto_translated_physmap)) { |
534 | ret = alloc_empty_pages(vma, nr_pages); |
535 | if (ret < 0) |
536 | goto out_unlock; |
537 | } else |
538 | vma->vm_private_data = PRIV_VMA_LOCKED; |
539 | } else { |
540 | if (m.addr < vma->vm_start || |
541 | m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { |
542 | ret = -EINVAL; |
543 | goto out_unlock; |
544 | } |
545 | if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { |
546 | ret = -EINVAL; |
547 | goto out_unlock; |
548 | } |
549 | } |
550 | |
551 | state.domain = m.dom; |
552 | state.vma = vma; |
553 | state.va = m.addr; |
554 | state.index = 0; |
555 | state.global_error = 0; |
556 | state.version = version; |
557 | |
558 | BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0); |
559 | /* mmap_batch_fn guarantees ret == 0 */ |
560 | BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), |
561 | &pagelist, mmap_batch_fn, &state)); |
562 | |
563 | mmap_write_unlock(mm); |
564 | |
565 | if (state.global_error) { |
566 | /* Write back errors in second pass. */ |
567 | state.user_gfn = (xen_pfn_t *)m.arr; |
568 | state.user_err = m.err; |
569 | ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), |
570 | &pagelist, mmap_return_errors, &state); |
571 | } else |
572 | ret = 0; |
573 | |
574 | /* If we have not had any EFAULT-like global errors then set the global |
575 | * error to -ENOENT if necessary. */ |
576 | if ((ret == 0) && (state.global_error == -ENOENT)) |
577 | ret = -ENOENT; |
578 | |
579 | out: |
580 | free_page_list(&pagelist); |
581 | return ret; |
582 | |
583 | out_unlock: |
584 | mmap_write_unlock(mm); |
585 | goto out; |
586 | } |
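
/*
 * Illustrative userspace sketch (not part of this driver) of the V2 batch
 * interface handled above; fd, domid and the gfn array are placeholders.
 *
 *	xen_pfn_t gfns[N];		// frames to map
 *	int errs[N];			// per-frame errors, written on failure
 *	void *va = mmap(NULL, N * 4096, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	struct privcmd_mmapbatch_v2 cmd = {
 *		.num = N, .dom = domid, .addr = (uintptr_t)va,
 *		.arr = gfns, .err = errs,
 *	};
 *	int rc = ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &cmd);
 *
 * rc == -ENOENT means at least one frame was paged out; repeating the
 * ioctl over the same range fills in only the still-unmapped holes.
 */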
587 | |
588 | static int lock_pages( |
589 | struct privcmd_dm_op_buf kbufs[], unsigned int num, |
590 | struct page *pages[], unsigned int nr_pages, unsigned int *pinned) |
591 | { |
592 | unsigned int i, off = 0; |
593 | |
594 | for (i = 0; i < num; ) { |
595 | unsigned int requested; |
596 | int page_count; |
597 | |
598 | requested = DIV_ROUND_UP( |
599 | offset_in_page(kbufs[i].uptr) + kbufs[i].size, |
600 | PAGE_SIZE) - off; |
601 | if (requested > nr_pages) |
602 | return -ENOSPC; |
603 | |
604 | page_count = pin_user_pages_fast( |
605 | (unsigned long)kbufs[i].uptr + off * PAGE_SIZE, |
606 | requested, FOLL_WRITE, pages); |
607 | if (page_count <= 0) |
608 | return page_count ? : -EFAULT; |
609 | |
610 | *pinned += page_count; |
611 | nr_pages -= page_count; |
612 | pages += page_count; |
613 | |
614 | off = (requested == page_count) ? 0 : off + page_count; |
615 | i += !off; |
616 | } |
617 | |
618 | return 0; |
619 | } |
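
/*
 * Worked example for the pinning loop above (4 KiB pages assumed): for a
 * buffer spanning three pages where pin_user_pages_fast() pins only two on
 * the first pass, off becomes 2 and i stays on the same kbufs[] entry, so
 * the next iteration asks for the one remaining page starting two pages
 * into the buffer. Once requested == page_count, off resets to 0 and i
 * advances to the next buffer.
 */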
620 | |
621 | static void unlock_pages(struct page *pages[], unsigned int nr_pages) |
622 | { |
623 | unpin_user_pages_dirty_lock(pages, nr_pages, true); |
624 | } |
625 | |
626 | static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) |
627 | { |
628 | struct privcmd_data *data = file->private_data; |
629 | struct privcmd_dm_op kdata; |
630 | struct privcmd_dm_op_buf *kbufs; |
631 | unsigned int nr_pages = 0; |
632 | struct page **pages = NULL; |
633 | struct xen_dm_op_buf *xbufs = NULL; |
634 | unsigned int i; |
635 | long rc; |
636 | unsigned int pinned = 0; |
637 | |
638 | if (copy_from_user(&kdata, udata, sizeof(kdata))) |
639 | return -EFAULT; |
640 | |
641 | /* If restriction is in place, check the domid matches */ |
642 | if (data->domid != DOMID_INVALID && data->domid != kdata.dom) |
643 | return -EPERM; |
644 | |
645 | if (kdata.num == 0) |
646 | return 0; |
647 | |
648 | if (kdata.num > privcmd_dm_op_max_num) |
649 | return -E2BIG; |
650 | |
651 | kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL); |
652 | if (!kbufs) |
653 | return -ENOMEM; |
654 | |
655 | if (copy_from_user(kbufs, kdata.ubufs, |
656 | sizeof(*kbufs) * kdata.num)) { |
657 | rc = -EFAULT; |
658 | goto out; |
659 | } |
660 | |
661 | for (i = 0; i < kdata.num; i++) { |
662 | if (kbufs[i].size > privcmd_dm_op_buf_max_size) { |
663 | rc = -E2BIG; |
664 | goto out; |
665 | } |
666 | |
667 | if (!access_ok(kbufs[i].uptr, |
668 | kbufs[i].size)) { |
669 | rc = -EFAULT; |
670 | goto out; |
671 | } |
672 | |
673 | nr_pages += DIV_ROUND_UP( |
674 | offset_in_page(kbufs[i].uptr) + kbufs[i].size, |
675 | PAGE_SIZE); |
676 | } |
677 | |
678 | pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); |
679 | if (!pages) { |
680 | rc = -ENOMEM; |
681 | goto out; |
682 | } |
683 | |
684 | xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL); |
685 | if (!xbufs) { |
686 | rc = -ENOMEM; |
687 | goto out; |
688 | } |
689 | |
690 | rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned); |
691 | if (rc < 0) |
692 | goto out; |
693 | |
694 | for (i = 0; i < kdata.num; i++) { |
695 | set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); |
696 | xbufs[i].size = kbufs[i].size; |
697 | } |
698 | |
699 | xen_preemptible_hcall_begin(); |
700 | rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); |
701 | xen_preemptible_hcall_end(); |
702 | |
703 | out: |
704 | unlock_pages(pages, pinned); |
705 | kfree(xbufs); |
706 | kfree(pages); |
707 | kfree(kbufs); |
708 | |
709 | return rc; |
710 | } |
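
/*
 * Illustrative userspace sketch (not part of this driver): a device model
 * issuing a dm_op through the ioctl above. The payload shown is only a
 * placeholder; real dm_op structures come from xen/interface/hvm/dm_op.h.
 *
 *	struct xen_dm_op op = { .op = XEN_DMOP_remote_shutdown, ... };
 *	struct privcmd_dm_op_buf buf = { .uptr = &op, .size = sizeof(op) };
 *	struct privcmd_dm_op cmd = { .dom = domid, .num = 1, .ubufs = &buf };
 *	int rc = ioctl(fd, IOCTL_PRIVCMD_DM_OP, &cmd);
 *
 * Requests with more than dm_op_max_nr_bufs buffers, or any buffer larger
 * than dm_op_buf_max_size, are rejected with -E2BIG before the hypercall.
 */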
711 | |
712 | static long privcmd_ioctl_restrict(struct file *file, void __user *udata) |
713 | { |
714 | struct privcmd_data *data = file->private_data; |
715 | domid_t dom; |
716 | |
717 | if (copy_from_user(&dom, udata, sizeof(dom))) |
718 | return -EFAULT; |
719 | |
720 | /* Set restriction to the specified domain, or check it matches */ |
721 | if (data->domid == DOMID_INVALID) |
722 | data->domid = dom; |
723 | else if (data->domid != dom) |
724 | return -EINVAL; |
725 | |
726 | return 0; |
727 | } |
728 | |
729 | static long privcmd_ioctl_mmap_resource(struct file *file, |
730 | struct privcmd_mmap_resource __user *udata) |
731 | { |
732 | struct privcmd_data *data = file->private_data; |
733 | struct mm_struct *mm = current->mm; |
734 | struct vm_area_struct *vma; |
735 | struct privcmd_mmap_resource kdata; |
736 | xen_pfn_t *pfns = NULL; |
737 | struct xen_mem_acquire_resource xdata = { }; |
738 | int rc; |
739 | |
740 | if (copy_from_user(&kdata, udata, sizeof(kdata))) |
741 | return -EFAULT; |
742 | |
743 | /* If restriction is in place, check the domid matches */ |
744 | if (data->domid != DOMID_INVALID && data->domid != kdata.dom) |
745 | return -EPERM; |
746 | |
747 | /* Both fields must be set or unset */ |
748 | if (!!kdata.addr != !!kdata.num) |
749 | return -EINVAL; |
750 | |
751 | xdata.domid = kdata.dom; |
752 | xdata.type = kdata.type; |
753 | xdata.id = kdata.id; |
754 | |
755 | if (!kdata.addr && !kdata.num) { |
756 | /* Query the size of the resource. */ |
757 | rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); |
758 | if (rc) |
759 | return rc; |
760 | return __put_user(xdata.nr_frames, &udata->num); |
761 | } |
762 | |
763 | mmap_write_lock(mm); |
764 | |
765 | vma = find_vma(mm, kdata.addr); |
766 | if (!vma || vma->vm_ops != &privcmd_vm_ops) { |
767 | rc = -EINVAL; |
768 | goto out; |
769 | } |
770 | |
771 | pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN); |
772 | if (!pfns) { |
773 | rc = -ENOMEM; |
774 | goto out; |
775 | } |
776 | |
777 | if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && |
778 | xen_feature(XENFEAT_auto_translated_physmap)) { |
779 | unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE); |
780 | struct page **pages; |
781 | unsigned int i; |
782 | |
783 | rc = alloc_empty_pages(vma, nr); |
784 | if (rc < 0) |
785 | goto out; |
786 | |
787 | pages = vma->vm_private_data; |
788 | |
789 | for (i = 0; i < kdata.num; i++) { |
790 | xen_pfn_t pfn = |
791 | page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]); |
792 | |
793 | pfns[i] = pfn + (i % XEN_PFN_PER_PAGE); |
794 | } |
795 | } else |
796 | vma->vm_private_data = PRIV_VMA_LOCKED; |
797 | |
798 | xdata.frame = kdata.idx; |
799 | xdata.nr_frames = kdata.num; |
800 | set_xen_guest_handle(xdata.frame_list, pfns); |
801 | |
802 | xen_preemptible_hcall_begin(); |
803 | rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); |
804 | xen_preemptible_hcall_end(); |
805 | |
806 | if (rc) |
807 | goto out; |
808 | |
809 | if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) && |
810 | xen_feature(XENFEAT_auto_translated_physmap)) { |
811 | rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT); |
812 | } else { |
813 | unsigned int domid = |
814 | (xdata.flags & XENMEM_rsrc_acq_caller_owned) ? |
815 | DOMID_SELF : kdata.dom; |
816 | int num, *errs = (int *)pfns; |
817 | |
818 | BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns)); |
819 | num = xen_remap_domain_mfn_array(vma, |
820 | kdata.addr & PAGE_MASK, |
821 | pfns, kdata.num, errs, |
822 | vma->vm_page_prot, |
823 | domid); |
824 | if (num < 0) |
825 | rc = num; |
826 | else if (num != kdata.num) { |
827 | unsigned int i; |
828 | |
829 | for (i = 0; i < num; i++) { |
830 | rc = errs[i]; |
831 | if (rc < 0) |
832 | break; |
833 | } |
834 | } else |
835 | rc = 0; |
836 | } |
837 | |
838 | out: |
839 | mmap_write_unlock(mm); |
840 | kfree(pfns); |
841 | |
842 | return rc; |
843 | } |
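
/*
 * Illustrative userspace sketch (not part of this driver) of the two-step
 * use of IOCTL_PRIVCMD_MMAP_RESOURCE handled above; fd, domid and id are
 * placeholders and XENMEM_resource_ioreq_server is just one possible type.
 *
 *	struct privcmd_mmap_resource cmd = {
 *		.dom = domid, .type = XENMEM_resource_ioreq_server, .id = id,
 *	};
 *	ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &cmd);	// query: fills cmd.num
 *	void *va = mmap(NULL, cmd.num * 4096, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	cmd.addr = (uintptr_t)va;
 *	cmd.idx = 0;
 *	ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &cmd);	// map the frames
 */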
844 | |
845 | #ifdef CONFIG_XEN_PRIVCMD_EVENTFD |
846 | /* Irqfd support */ |
847 | static struct workqueue_struct *irqfd_cleanup_wq; |
848 | static DEFINE_MUTEX(irqfds_lock); |
849 | static LIST_HEAD(irqfds_list); |
850 | |
851 | struct privcmd_kernel_irqfd { |
852 | struct xen_dm_op_buf xbufs; |
853 | domid_t dom; |
854 | bool error; |
855 | struct eventfd_ctx *eventfd; |
856 | struct work_struct shutdown; |
857 | wait_queue_entry_t wait; |
858 | struct list_head list; |
859 | poll_table pt; |
860 | }; |
861 | |
862 | static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd) |
863 | { |
864 | lockdep_assert_held(&irqfds_lock); |
865 | |
866 | list_del_init(&kirqfd->list); |
867 | queue_work(irqfd_cleanup_wq, &kirqfd->shutdown); |
868 | } |
869 | |
870 | static void irqfd_shutdown(struct work_struct *work) |
871 | { |
872 | struct privcmd_kernel_irqfd *kirqfd = |
873 | container_of(work, struct privcmd_kernel_irqfd, shutdown); |
874 | u64 cnt; |
875 | |
876 | eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt); |
877 | eventfd_ctx_put(kirqfd->eventfd); |
878 | kfree(kirqfd); |
879 | } |
880 | |
881 | static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd) |
882 | { |
883 | u64 cnt; |
884 | long rc; |
885 | |
886 | eventfd_ctx_do_read(kirqfd->eventfd, &cnt); |
887 | |
888 | xen_preemptible_hcall_begin(); |
889 | rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs); |
890 | xen_preemptible_hcall_end(); |
891 | |
892 | /* Don't repeat the error message for consecutive failures */ |
893 | if (rc && !kirqfd->error) { |
894 | pr_err("Failed to configure irq for guest domain: %d\n" , |
895 | kirqfd->dom); |
896 | } |
897 | |
898 | kirqfd->error = rc; |
899 | } |
900 | |
901 | static int |
902 | irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) |
903 | { |
904 | struct privcmd_kernel_irqfd *kirqfd = |
905 | container_of(wait, struct privcmd_kernel_irqfd, wait); |
906 | __poll_t flags = key_to_poll(key); |
907 | |
908 | if (flags & EPOLLIN) |
909 | irqfd_inject(kirqfd); |
910 | |
911 | if (flags & EPOLLHUP) { |
912 | mutex_lock(&irqfds_lock); |
913 | irqfd_deactivate(kirqfd); |
914 | mutex_unlock(&irqfds_lock); |
915 | } |
916 | |
917 | return 0; |
918 | } |
919 | |
920 | static void |
921 | irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) |
922 | { |
923 | struct privcmd_kernel_irqfd *kirqfd = |
924 | container_of(pt, struct privcmd_kernel_irqfd, pt); |
925 | |
926 | add_wait_queue_priority(wqh, &kirqfd->wait); |
927 | } |
928 | |
929 | static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd) |
930 | { |
931 | struct privcmd_kernel_irqfd *kirqfd, *tmp; |
932 | __poll_t events; |
933 | struct fd f; |
934 | void *dm_op; |
935 | int ret; |
936 | |
937 | kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL); |
938 | if (!kirqfd) |
939 | return -ENOMEM; |
940 | dm_op = kirqfd + 1; |
941 | |
942 | if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) { |
943 | ret = -EFAULT; |
944 | goto error_kfree; |
945 | } |
946 | |
947 | kirqfd->xbufs.size = irqfd->size; |
948 | set_xen_guest_handle(kirqfd->xbufs.h, dm_op); |
949 | kirqfd->dom = irqfd->dom; |
950 | INIT_WORK(&kirqfd->shutdown, irqfd_shutdown); |
951 | |
952 | f = fdget(irqfd->fd); |
953 | if (!f.file) { |
954 | ret = -EBADF; |
955 | goto error_kfree; |
956 | } |
957 | |
958 | kirqfd->eventfd = eventfd_ctx_fileget(f.file); |
959 | if (IS_ERR(kirqfd->eventfd)) { |
960 | ret = PTR_ERR(kirqfd->eventfd); |
961 | goto error_fd_put; |
962 | } |
963 | |
964 | /* |
965 | * Install our own custom wake-up handling so we are notified via a |
966 | * callback whenever someone signals the underlying eventfd. |
967 | */ |
968 | init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup); |
969 | init_poll_funcptr(&kirqfd->pt, irqfd_poll_func); |
970 | |
971 | mutex_lock(&irqfds_lock); |
972 | |
973 | list_for_each_entry(tmp, &irqfds_list, list) { |
974 | if (kirqfd->eventfd == tmp->eventfd) { |
975 | ret = -EBUSY; |
976 | mutex_unlock(&irqfds_lock); |
977 | goto error_eventfd; |
978 | } |
979 | } |
980 | |
981 | list_add_tail(&kirqfd->list, &irqfds_list); |
982 | mutex_unlock(&irqfds_lock); |
983 | |
984 | /* |
985 | * Check if there was an event already pending on the eventfd before we |
986 | * registered, and trigger it as if we didn't miss it. |
987 | */ |
988 | events = vfs_poll(f.file, &kirqfd->pt); |
989 | if (events & EPOLLIN) |
990 | irqfd_inject(kirqfd); |
991 | |
992 | /* |
993 | * Do not drop the file until the kirqfd is fully initialized, otherwise |
994 | * we might race against the EPOLLHUP. |
995 | */ |
996 | fdput(f); |
997 | return 0; |
998 | |
999 | error_eventfd: |
1000 | eventfd_ctx_put(kirqfd->eventfd); |
1001 | |
1002 | error_fd_put: |
1003 | fdput(f); |
1004 | |
1005 | error_kfree: |
1006 | kfree(kirqfd); |
1007 | return ret; |
1008 | } |
1009 | |
1010 | static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd) |
1011 | { |
1012 | struct privcmd_kernel_irqfd *kirqfd; |
1013 | struct eventfd_ctx *eventfd; |
1014 | |
1015 | eventfd = eventfd_ctx_fdget(irqfd->fd); |
1016 | if (IS_ERR(eventfd)) |
1017 | return PTR_ERR(eventfd); |
1018 | |
1019 | mutex_lock(&irqfds_lock); |
1020 | |
1021 | list_for_each_entry(kirqfd, &irqfds_list, list) { |
1022 | if (kirqfd->eventfd == eventfd) { |
1023 | irqfd_deactivate(kirqfd); |
1024 | break; |
1025 | } |
1026 | } |
1027 | |
1028 | mutex_unlock(&irqfds_lock); |
1029 | |
1030 | eventfd_ctx_put(eventfd); |
1031 | |
1032 | /* |
1033 | * Block until we know all outstanding shutdown jobs have completed so |
1034 | * that we guarantee there will not be any more interrupts once this |
1035 | * deassign function returns. |
1036 | */ |
1037 | flush_workqueue(irqfd_cleanup_wq); |
1038 | |
1039 | return 0; |
1040 | } |
1041 | |
1042 | static long privcmd_ioctl_irqfd(struct file *file, void __user *udata) |
1043 | { |
1044 | struct privcmd_data *data = file->private_data; |
1045 | struct privcmd_irqfd irqfd; |
1046 | |
1047 | if (copy_from_user(&irqfd, udata, sizeof(irqfd))) |
1048 | return -EFAULT; |
1049 | |
1050 | /* No other flags should be set */ |
1051 | if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN) |
1052 | return -EINVAL; |
1053 | |
1054 | /* If restriction is in place, check the domid matches */ |
1055 | if (data->domid != DOMID_INVALID && data->domid != irqfd.dom) |
1056 | return -EPERM; |
1057 | |
1058 | if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN) |
1059 | return privcmd_irqfd_deassign(&irqfd); |
1060 | |
1061 | return privcmd_irqfd_assign(&irqfd); |
1062 | } |
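
/*
 * Illustrative userspace sketch (not part of this driver): wiring an
 * eventfd so that signalling it makes the kernel issue the stored dm_op
 * (typically an XEN_DMOP_set_irq_level request) on the VMM's behalf. The
 * dm_op payload "op" is a placeholder built by the caller.
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	struct privcmd_irqfd cmd = {
 *		.dm_op = (uintptr_t)&op, .size = sizeof(op),
 *		.fd = efd, .dom = domid,
 *	};
 *	ioctl(fd, IOCTL_PRIVCMD_IRQFD, &cmd);		// assign
 *	write(efd, &(uint64_t){ 1 }, 8);		// inject
 *	cmd.flags = PRIVCMD_IRQFD_FLAG_DEASSIGN;
 *	ioctl(fd, IOCTL_PRIVCMD_IRQFD, &cmd);		// tear down
 */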
1063 | |
1064 | static int privcmd_irqfd_init(void) |
1065 | { |
1066 | irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0); |
1067 | if (!irqfd_cleanup_wq) |
1068 | return -ENOMEM; |
1069 | |
1070 | return 0; |
1071 | } |
1072 | |
1073 | static void privcmd_irqfd_exit(void) |
1074 | { |
1075 | struct privcmd_kernel_irqfd *kirqfd, *tmp; |
1076 | |
1077 | mutex_lock(&irqfds_lock); |
1078 | |
1079 | list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list) |
1080 | irqfd_deactivate(kirqfd); |
1081 | |
1082 | mutex_unlock(&irqfds_lock); |
1083 | |
1084 | destroy_workqueue(irqfd_cleanup_wq); |
1085 | } |
1086 | |
1087 | /* Ioeventfd Support */ |
1088 | #define QUEUE_NOTIFY_VQ_MASK 0xFFFF |
1089 | |
1090 | static DEFINE_MUTEX(ioreq_lock); |
1091 | static LIST_HEAD(ioreq_list); |
1092 | |
1093 | /* per-eventfd structure */ |
1094 | struct privcmd_kernel_ioeventfd { |
1095 | struct eventfd_ctx *eventfd; |
1096 | struct list_head list; |
1097 | u64 addr; |
1098 | unsigned int addr_len; |
1099 | unsigned int vq; |
1100 | }; |
1101 | |
1102 | /* per-guest CPU / port structure */ |
1103 | struct ioreq_port { |
1104 | int vcpu; |
1105 | unsigned int port; |
1106 | struct privcmd_kernel_ioreq *kioreq; |
1107 | }; |
1108 | |
1109 | /* per-guest structure */ |
1110 | struct privcmd_kernel_ioreq { |
1111 | domid_t dom; |
1112 | unsigned int vcpus; |
1113 | u64 uioreq; |
1114 | struct ioreq *ioreq; |
1115 | spinlock_t lock; /* Protects ioeventfds list */ |
1116 | struct list_head ioeventfds; |
1117 | struct list_head list; |
1118 | struct ioreq_port ports[]; |
1119 | }; |
1120 | |
1121 | static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) |
1122 | { |
1123 | struct ioreq_port *port = dev_id; |
1124 | struct privcmd_kernel_ioreq *kioreq = port->kioreq; |
1125 | struct ioreq *ioreq = &kioreq->ioreq[port->vcpu]; |
1126 | struct privcmd_kernel_ioeventfd *kioeventfd; |
1127 | unsigned int state = STATE_IOREQ_READY; |
1128 | |
1129 | if (ioreq->state != STATE_IOREQ_READY || |
1130 | ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE) |
1131 | return IRQ_NONE; |
1132 | |
1133 | /* |
1134 | * We need a barrier, smp_mb(), here to ensure reads are finished before |
1135 | * `state` is updated. Since the lock implementation ensures that |
1136 | * appropriate barrier will be added anyway, we can avoid adding |
1137 | * explicit barrier here. |
1138 | * |
1139 | * Ideally we don't need to update `state` within the locks, but we do |
1140 | * that here to avoid adding explicit barrier. |
1141 | */ |
1142 | |
1143 | spin_lock(&kioreq->lock); |
1144 | ioreq->state = STATE_IOREQ_INPROCESS; |
1145 | |
1146 | list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { |
1147 | if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && |
1148 | ioreq->size == kioeventfd->addr_len && |
1149 | (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { |
1150 | eventfd_signal(kioeventfd->eventfd, 1); |
1151 | state = STATE_IORESP_READY; |
1152 | break; |
1153 | } |
1154 | } |
1155 | spin_unlock(&kioreq->lock); |
1156 | |
1157 | /* |
1158 | * We need a barrier, smp_mb(), here to ensure writes are finished |
1159 | * before `state` is updated. Since the lock implementation ensures that |
1160 | * appropriate barrier will be added anyway, we can avoid adding |
1161 | * explicit barrier here. |
1162 | */ |
1163 | |
1164 | ioreq->state = state; |
1165 | |
1166 | if (state == STATE_IORESP_READY) { |
1167 | notify_remote_via_evtchn(port->port); |
1168 | return IRQ_HANDLED; |
1169 | } |
1170 | |
1171 | return IRQ_NONE; |
1172 | } |
1173 | |
1174 | static void ioreq_free(struct privcmd_kernel_ioreq *kioreq) |
1175 | { |
1176 | struct ioreq_port *ports = kioreq->ports; |
1177 | int i; |
1178 | |
1179 | lockdep_assert_held(&ioreq_lock); |
1180 | |
1181 | list_del(&kioreq->list); |
1182 | |
1183 | for (i = kioreq->vcpus - 1; i >= 0; i--) |
1184 | unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]); |
1185 | |
1186 | kfree(kioreq); |
1187 | } |
1188 | |
1189 | static |
1190 | struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd) |
1191 | { |
1192 | struct privcmd_kernel_ioreq *kioreq; |
1193 | struct mm_struct *mm = current->mm; |
1194 | struct vm_area_struct *vma; |
1195 | struct page **pages; |
1196 | unsigned int *ports; |
1197 | int ret, size, i; |
1198 | |
1199 | lockdep_assert_held(&ioreq_lock); |
1200 | |
1201 | size = struct_size(kioreq, ports, ioeventfd->vcpus); |
1202 | kioreq = kzalloc(size, GFP_KERNEL); |
1203 | if (!kioreq) |
1204 | return ERR_PTR(-ENOMEM); |
1205 | |
1206 | kioreq->dom = ioeventfd->dom; |
1207 | kioreq->vcpus = ioeventfd->vcpus; |
1208 | kioreq->uioreq = ioeventfd->ioreq; |
1209 | spin_lock_init(&kioreq->lock); |
1210 | INIT_LIST_HEAD(&kioreq->ioeventfds); |
1211 | |
1212 | /* The memory for ioreq server must have been mapped earlier */ |
1213 | mmap_write_lock(mm); |
1214 | vma = find_vma(mm, (unsigned long)ioeventfd->ioreq); |
1215 | if (!vma) { |
1216 | pr_err("Failed to find vma for ioreq page!\n" ); |
1217 | mmap_write_unlock(mm); |
1218 | ret = -EFAULT; |
1219 | goto error_kfree; |
1220 | } |
1221 | |
1222 | pages = vma->vm_private_data; |
1223 | kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0])); |
1224 | mmap_write_unlock(mm); |
1225 | |
1226 | size = sizeof(*ports) * kioreq->vcpus; |
1227 | ports = kzalloc(size, GFP_KERNEL); |
1228 | if (!ports) { |
1229 | ret = -ENOMEM; |
1230 | goto error_kfree; |
1231 | } |
1232 | |
1233 | if (copy_from_user(ports, u64_to_user_ptr(ioeventfd->ports), size)) { |
1234 | ret = -EFAULT; |
1235 | goto error_kfree_ports; |
1236 | } |
1237 | |
1238 | for (i = 0; i < kioreq->vcpus; i++) { |
1239 | kioreq->ports[i].vcpu = i; |
1240 | kioreq->ports[i].port = ports[i]; |
1241 | kioreq->ports[i].kioreq = kioreq; |
1242 | |
1243 | ret = bind_evtchn_to_irqhandler_lateeoi(ports[i], |
1244 | ioeventfd_interrupt, IRQF_SHARED, "ioeventfd", |
1245 | &kioreq->ports[i]); |
1246 | if (ret < 0) |
1247 | goto error_unbind; |
1248 | } |
1249 | |
1250 | kfree(ports); |
1251 | |
1252 | list_add_tail(&kioreq->list, &ioreq_list); |
1253 | |
1254 | return kioreq; |
1255 | |
1256 | error_unbind: |
1257 | while (--i >= 0) |
1258 | unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]); |
1259 | error_kfree_ports: |
1260 | kfree(ports); |
1261 | error_kfree: |
1262 | kfree(kioreq); |
1263 | return ERR_PTR(ret); |
1264 | } |
1265 | |
1266 | static struct privcmd_kernel_ioreq * |
1267 | get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd) |
1268 | { |
1269 | struct privcmd_kernel_ioreq *kioreq; |
1270 | unsigned long flags; |
1271 | |
1272 | list_for_each_entry(kioreq, &ioreq_list, list) { |
1273 | struct privcmd_kernel_ioeventfd *kioeventfd; |
1274 | |
1275 | /* |
1276 | * kioreq fields can be accessed here without a lock as they are |
1277 | * never updated after being added to the ioreq_list. |
1278 | */ |
1279 | if (kioreq->uioreq != ioeventfd->ioreq) { |
1280 | continue; |
1281 | } else if (kioreq->dom != ioeventfd->dom || |
1282 | kioreq->vcpus != ioeventfd->vcpus) { |
1283 | pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n" , |
1284 | kioreq->dom, ioeventfd->dom, kioreq->vcpus, |
1285 | ioeventfd->vcpus); |
1286 | return ERR_PTR(-EINVAL); |
1287 | } |
1288 | |
1289 | /* Look for a duplicate eventfd for the same guest */ |
1290 | spin_lock_irqsave(&kioreq->lock, flags); |
1291 | list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) { |
1292 | if (eventfd == kioeventfd->eventfd) { |
1293 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1294 | return ERR_PTR(-EBUSY); |
1295 | } |
1296 | } |
1297 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1298 | |
1299 | return kioreq; |
1300 | } |
1301 | |
1302 | /* Matching kioreq isn't found, allocate a new one */ |
1303 | return alloc_ioreq(ioeventfd); |
1304 | } |
1305 | |
1306 | static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd) |
1307 | { |
1308 | list_del(&kioeventfd->list); |
1309 | eventfd_ctx_put(kioeventfd->eventfd); |
1310 | kfree(kioeventfd); |
1311 | } |
1312 | |
1313 | static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd) |
1314 | { |
1315 | struct privcmd_kernel_ioeventfd *kioeventfd; |
1316 | struct privcmd_kernel_ioreq *kioreq; |
1317 | unsigned long flags; |
1318 | struct fd f; |
1319 | int ret; |
1320 | |
1321 | /* Check for range overflow */ |
1322 | if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr) |
1323 | return -EINVAL; |
1324 | |
1325 | /* Vhost requires us to support length 1, 2, 4, and 8 */ |
1326 | if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 || |
1327 | ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8)) |
1328 | return -EINVAL; |
1329 | |
1330 | /* 4096 vcpus limit enough ? */ |
1331 | if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096) |
1332 | return -EINVAL; |
1333 | |
1334 | kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL); |
1335 | if (!kioeventfd) |
1336 | return -ENOMEM; |
1337 | |
1338 | f = fdget(ioeventfd->event_fd); |
1339 | if (!f.file) { |
1340 | ret = -EBADF; |
1341 | goto error_kfree; |
1342 | } |
1343 | |
1344 | kioeventfd->eventfd = eventfd_ctx_fileget(f.file); |
1345 | fdput(f); |
1346 | |
1347 | if (IS_ERR(kioeventfd->eventfd)) { |
1348 | ret = PTR_ERR(kioeventfd->eventfd); |
1349 | goto error_kfree; |
1350 | } |
1351 | |
1352 | kioeventfd->addr = ioeventfd->addr; |
1353 | kioeventfd->addr_len = ioeventfd->addr_len; |
1354 | kioeventfd->vq = ioeventfd->vq; |
1355 | |
1356 | mutex_lock(&ioreq_lock); |
1357 | kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd); |
1358 | if (IS_ERR(kioreq)) { |
1359 | mutex_unlock(&ioreq_lock); |
1360 | ret = PTR_ERR(kioreq); |
1361 | goto error_eventfd; |
1362 | } |
1363 | |
1364 | spin_lock_irqsave(&kioreq->lock, flags); |
1365 | list_add_tail(&kioeventfd->list, &kioreq->ioeventfds); |
1366 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1367 | |
1368 | mutex_unlock(&ioreq_lock); |
1369 | |
1370 | return 0; |
1371 | |
1372 | error_eventfd: |
1373 | eventfd_ctx_put(kioeventfd->eventfd); |
1374 | |
1375 | error_kfree: |
1376 | kfree(kioeventfd); |
1377 | return ret; |
1378 | } |
1379 | |
1380 | static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd) |
1381 | { |
1382 | struct privcmd_kernel_ioreq *kioreq, *tkioreq; |
1383 | struct eventfd_ctx *eventfd; |
1384 | unsigned long flags; |
1385 | int ret = 0; |
1386 | |
1387 | eventfd = eventfd_ctx_fdget(ioeventfd->event_fd); |
1388 | if (IS_ERR(eventfd)) |
1389 | return PTR_ERR(eventfd); |
1390 | |
1391 | mutex_lock(&ioreq_lock); |
1392 | list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) { |
1393 | struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; |
1394 | /* |
1395 | * kioreq fields can be accessed here without a lock as they are |
1396 | * never updated after being added to the ioreq_list. |
1397 | */ |
1398 | if (kioreq->dom != ioeventfd->dom || |
1399 | kioreq->uioreq != ioeventfd->ioreq || |
1400 | kioreq->vcpus != ioeventfd->vcpus) |
1401 | continue; |
1402 | |
1403 | spin_lock_irqsave(&kioreq->lock, flags); |
1404 | list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) { |
1405 | if (eventfd == kioeventfd->eventfd) { |
1406 | ioeventfd_free(kioeventfd); |
1407 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1408 | |
1409 | if (list_empty(&kioreq->ioeventfds)) |
1410 | ioreq_free(kioreq); |
1411 | goto unlock; |
1412 | } |
1413 | } |
1414 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1415 | break; |
1416 | } |
1417 | |
1418 | pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n" , |
1419 | ioeventfd->dom, ioeventfd->addr); |
1420 | ret = -ENODEV; |
1421 | |
1422 | unlock: |
1423 | mutex_unlock(&ioreq_lock); |
1424 | eventfd_ctx_put(eventfd); |
1425 | |
1426 | return ret; |
1427 | } |
1428 | |
1429 | static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) |
1430 | { |
1431 | struct privcmd_data *data = file->private_data; |
1432 | struct privcmd_ioeventfd ioeventfd; |
1433 | |
1434 | if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd))) |
1435 | return -EFAULT; |
1436 | |
1437 | /* No other flags should be set */ |
1438 | if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) |
1439 | return -EINVAL; |
1440 | |
1441 | /* If restriction is in place, check the domid matches */ |
1442 | if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom) |
1443 | return -EPERM; |
1444 | |
1445 | if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN) |
1446 | return privcmd_ioeventfd_deassign(&ioeventfd); |
1447 | |
1448 | return privcmd_ioeventfd_assign(&ioeventfd); |
1449 | } |
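
/*
 * Illustrative userspace sketch (not part of this driver): registering an
 * ioeventfd for a virtio-MMIO queue-notify write. The ioreq server pages
 * must already have been mapped through this device (that mapping is what
 * alloc_ioreq() looks up); all values below are placeholders.
 *
 *	struct privcmd_ioeventfd cmd = {
 *		.ioreq = (uintptr_t)ioreq_map,
 *		.ports = (uintptr_t)evtchn_ports,	// one port per vcpu
 *		.addr = mmio_base + VIRTIO_MMIO_QUEUE_NOTIFY,
 *		.addr_len = 4, .event_fd = efd,
 *		.vcpus = nr_vcpus, .vq = queue_index, .dom = domid,
 *	};
 *	ioctl(fd, IOCTL_PRIVCMD_IOEVENTFD, &cmd);
 */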
1450 | |
1451 | static void privcmd_ioeventfd_exit(void) |
1452 | { |
1453 | struct privcmd_kernel_ioreq *kioreq, *tmp; |
1454 | unsigned long flags; |
1455 | |
1456 | mutex_lock(&ioreq_lock); |
1457 | list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) { |
1458 | struct privcmd_kernel_ioeventfd *kioeventfd, *tmp; |
1459 | |
1460 | spin_lock_irqsave(&kioreq->lock, flags); |
1461 | list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) |
1462 | ioeventfd_free(kioeventfd); |
1463 | spin_unlock_irqrestore(&kioreq->lock, flags); |
1464 | |
1465 | ioreq_free(kioreq); |
1466 | } |
1467 | mutex_unlock(&ioreq_lock); |
1468 | } |
1469 | #else |
1470 | static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata) |
1471 | { |
1472 | return -EOPNOTSUPP; |
1473 | } |
1474 | |
1475 | static inline int privcmd_irqfd_init(void) |
1476 | { |
1477 | return 0; |
1478 | } |
1479 | |
1480 | static inline void privcmd_irqfd_exit(void) |
1481 | { |
1482 | } |
1483 | |
1484 | static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata) |
1485 | { |
1486 | return -EOPNOTSUPP; |
1487 | } |
1488 | |
1489 | static inline void privcmd_ioeventfd_exit(void) |
1490 | { |
1491 | } |
1492 | #endif /* CONFIG_XEN_PRIVCMD_EVENTFD */ |
1493 | |
1494 | static long privcmd_ioctl(struct file *file, |
1495 | unsigned int cmd, unsigned long data) |
1496 | { |
1497 | int ret = -ENOTTY; |
1498 | void __user *udata = (void __user *) data; |
1499 | |
1500 | switch (cmd) { |
1501 | case IOCTL_PRIVCMD_HYPERCALL: |
1502 | ret = privcmd_ioctl_hypercall(file, udata); |
1503 | break; |
1504 | |
1505 | case IOCTL_PRIVCMD_MMAP: |
1506 | ret = privcmd_ioctl_mmap(file, udata); |
1507 | break; |
1508 | |
1509 | case IOCTL_PRIVCMD_MMAPBATCH: |
1510 | ret = privcmd_ioctl_mmap_batch(file, udata, 1); |
1511 | break; |
1512 | |
1513 | case IOCTL_PRIVCMD_MMAPBATCH_V2: |
1514 | ret = privcmd_ioctl_mmap_batch(file, udata, 2); |
1515 | break; |
1516 | |
1517 | case IOCTL_PRIVCMD_DM_OP: |
1518 | ret = privcmd_ioctl_dm_op(file, udata); |
1519 | break; |
1520 | |
1521 | case IOCTL_PRIVCMD_RESTRICT: |
1522 | ret = privcmd_ioctl_restrict(file, udata); |
1523 | break; |
1524 | |
1525 | case IOCTL_PRIVCMD_MMAP_RESOURCE: |
1526 | ret = privcmd_ioctl_mmap_resource(file, udata); |
1527 | break; |
1528 | |
1529 | case IOCTL_PRIVCMD_IRQFD: |
1530 | ret = privcmd_ioctl_irqfd(file, udata); |
1531 | break; |
1532 | |
1533 | case IOCTL_PRIVCMD_IOEVENTFD: |
1534 | ret = privcmd_ioctl_ioeventfd(file, udata); |
1535 | break; |
1536 | |
1537 | default: |
1538 | break; |
1539 | } |
1540 | |
1541 | return ret; |
1542 | } |
1543 | |
1544 | static int privcmd_open(struct inode *ino, struct file *file) |
1545 | { |
1546 | struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL); |
1547 | |
1548 | if (!data) |
1549 | return -ENOMEM; |
1550 | |
1551 | /* DOMID_INVALID implies no restriction */ |
1552 | data->domid = DOMID_INVALID; |
1553 | |
1554 | file->private_data = data; |
1555 | return 0; |
1556 | } |
1557 | |
1558 | static int privcmd_release(struct inode *ino, struct file *file) |
1559 | { |
1560 | struct privcmd_data *data = file->private_data; |
1561 | |
1562 | kfree(data); |
1563 | return 0; |
1564 | } |
1565 | |
1566 | static void privcmd_close(struct vm_area_struct *vma) |
1567 | { |
1568 | struct page **pages = vma->vm_private_data; |
1569 | int numpgs = vma_pages(vma); |
1570 | int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT; |
1571 | int rc; |
1572 | |
1573 | if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) |
1574 | return; |
1575 | |
1576 | rc = xen_unmap_domain_gfn_range(vma, numgfns, pages); |
1577 | if (rc == 0) |
1578 | xen_free_unpopulated_pages(numpgs, pages); |
1579 | else |
1580 | pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n" , |
1581 | numpgs, rc); |
1582 | kvfree(pages); |
1583 | } |
1584 | |
1585 | static vm_fault_t privcmd_fault(struct vm_fault *vmf) |
1586 | { |
1587 | printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", |
1588 | vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, |
1589 | vmf->pgoff, (void *)vmf->address); |
1590 | |
1591 | return VM_FAULT_SIGBUS; |
1592 | } |
1593 | |
1594 | static const struct vm_operations_struct privcmd_vm_ops = { |
1595 | .close = privcmd_close, |
1596 | .fault = privcmd_fault |
1597 | }; |
1598 | |
1599 | static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) |
1600 | { |
1601 | /* DONTCOPY is essential for Xen because copy_page_range doesn't know |
1602 | * how to recreate these mappings */ |
1603 | vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | |
1604 | VM_DONTEXPAND | VM_DONTDUMP); |
1605 | vma->vm_ops = &privcmd_vm_ops; |
1606 | vma->vm_private_data = NULL; |
1607 | |
1608 | return 0; |
1609 | } |
1610 | |
1611 | /* |
1612 | * For MMAPBATCH*. This allows asserting the singleshot mapping |
1613 | * on a per pfn/pte basis. Mapping calls that fail with ENOENT |
1614 | * can be then retried until success. |
1615 | */ |
1616 | static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) |
1617 | { |
1618 | return pte_none(ptep_get(pte)) ? 0 : -EBUSY; |
1619 | } |
1620 | |
1621 | static int privcmd_vma_range_is_mapped( |
1622 | struct vm_area_struct *vma, |
1623 | unsigned long addr, |
1624 | unsigned long nr_pages) |
1625 | { |
1626 | return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, |
1627 | is_mapped_fn, NULL) != 0; |
1628 | } |
1629 | |
1630 | const struct file_operations xen_privcmd_fops = { |
1631 | .owner = THIS_MODULE, |
1632 | .unlocked_ioctl = privcmd_ioctl, |
1633 | .open = privcmd_open, |
1634 | .release = privcmd_release, |
1635 | .mmap = privcmd_mmap, |
1636 | }; |
1637 | EXPORT_SYMBOL_GPL(xen_privcmd_fops); |
1638 | |
1639 | static struct miscdevice privcmd_dev = { |
1640 | .minor = MISC_DYNAMIC_MINOR, |
1641 | .name = "xen/privcmd" , |
1642 | .fops = &xen_privcmd_fops, |
1643 | }; |
1644 | |
1645 | static int __init privcmd_init(void) |
1646 | { |
1647 | int err; |
1648 | |
1649 | if (!xen_domain()) |
1650 | return -ENODEV; |
1651 | |
1652 | err = misc_register(&privcmd_dev); |
1653 | if (err != 0) { |
1654 | pr_err("Could not register Xen privcmd device\n" ); |
1655 | return err; |
1656 | } |
1657 | |
1658 | err = misc_register(&xen_privcmdbuf_dev); |
1659 | if (err != 0) { |
1660 | pr_err("Could not register Xen hypercall-buf device\n" ); |
1661 | goto err_privcmdbuf; |
1662 | } |
1663 | |
1664 | err = privcmd_irqfd_init(); |
1665 | if (err != 0) { |
1666 | pr_err("irqfd init failed\n" ); |
1667 | goto err_irqfd; |
1668 | } |
1669 | |
1670 | return 0; |
1671 | |
1672 | err_irqfd: |
1673 | misc_deregister(&xen_privcmdbuf_dev); |
1674 | err_privcmdbuf: |
1675 | misc_deregister(&privcmd_dev); |
1676 | return err; |
1677 | } |
1678 | |
1679 | static void __exit privcmd_exit(void) |
1680 | { |
1681 | privcmd_ioeventfd_exit(); |
1682 | privcmd_irqfd_exit(); |
1683 | misc_deregister(&privcmd_dev); |
1684 | misc_deregister(&xen_privcmdbuf_dev); |
1685 | } |
1686 | |
1687 | module_init(privcmd_init); |
1688 | module_exit(privcmd_exit); |
1689 | |