1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Device driver to expose SGX enclave memory to KVM guests. |
4 | * |
5 | * Copyright(c) 2021 Intel Corporation. |
6 | */ |
7 | |
8 | #include <linux/miscdevice.h> |
9 | #include <linux/mm.h> |
10 | #include <linux/mman.h> |
11 | #include <linux/sched/mm.h> |
12 | #include <linux/sched/signal.h> |
13 | #include <linux/slab.h> |
14 | #include <linux/xarray.h> |
15 | #include <asm/sgx.h> |
16 | #include <uapi/asm/sgx.h> |
17 | |
18 | #include "encls.h" |
19 | #include "sgx.h" |
20 | |
/* Per-open-file state for one virtual EPC instance. */
struct sgx_vepc {
	struct xarray page_array;	/* page index -> struct sgx_epc_page */
	struct mutex lock;		/* serializes faults populating page_array */
};
25 | |
26 | /* |
27 | * Temporary SECS pages that cannot be EREMOVE'd due to having child in other |
28 | * virtual EPC instances, and the lock to protect it. |
29 | */ |
30 | static struct mutex zombie_secs_pages_lock; |
31 | static struct list_head zombie_secs_pages; |
32 | |
33 | static int __sgx_vepc_fault(struct sgx_vepc *vepc, |
34 | struct vm_area_struct *vma, unsigned long addr) |
35 | { |
36 | struct sgx_epc_page *epc_page; |
37 | unsigned long index, pfn; |
38 | int ret; |
39 | |
40 | WARN_ON(!mutex_is_locked(&vepc->lock)); |
41 | |
42 | /* Calculate index of EPC page in virtual EPC's page_array */ |
43 | index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); |
44 | |
45 | epc_page = xa_load(&vepc->page_array, index); |
46 | if (epc_page) |
47 | return 0; |
48 | |
49 | epc_page = sgx_alloc_epc_page(owner: vepc, reclaim: false); |
50 | if (IS_ERR(ptr: epc_page)) |
51 | return PTR_ERR(ptr: epc_page); |
52 | |
53 | ret = xa_err(entry: xa_store(&vepc->page_array, index, entry: epc_page, GFP_KERNEL)); |
54 | if (ret) |
55 | goto err_free; |
56 | |
57 | pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); |
58 | |
59 | ret = vmf_insert_pfn(vma, addr, pfn); |
60 | if (ret != VM_FAULT_NOPAGE) { |
61 | ret = -EFAULT; |
62 | goto err_delete; |
63 | } |
64 | |
65 | return 0; |
66 | |
67 | err_delete: |
68 | xa_erase(&vepc->page_array, index); |
69 | err_free: |
70 | sgx_free_epc_page(page: epc_page); |
71 | return ret; |
72 | } |
73 | |
74 | static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) |
75 | { |
76 | struct vm_area_struct *vma = vmf->vma; |
77 | struct sgx_vepc *vepc = vma->vm_private_data; |
78 | int ret; |
79 | |
80 | mutex_lock(&vepc->lock); |
81 | ret = __sgx_vepc_fault(vepc, vma, addr: vmf->address); |
82 | mutex_unlock(lock: &vepc->lock); |
83 | |
84 | if (!ret) |
85 | return VM_FAULT_NOPAGE; |
86 | |
87 | if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { |
88 | mmap_read_unlock(mm: vma->vm_mm); |
89 | return VM_FAULT_RETRY; |
90 | } |
91 | |
92 | return VM_FAULT_SIGBUS; |
93 | } |
94 | |
/* Pages are populated lazily; only a fault handler is needed. */
static const struct vm_operations_struct sgx_vepc_vm_ops = {
	.fault = sgx_vepc_fault,
};
98 | |
99 | static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) |
100 | { |
101 | struct sgx_vepc *vepc = file->private_data; |
102 | |
103 | if (!(vma->vm_flags & VM_SHARED)) |
104 | return -EINVAL; |
105 | |
106 | vma->vm_ops = &sgx_vepc_vm_ops; |
107 | /* Don't copy VMA in fork() */ |
108 | vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); |
109 | vma->vm_private_data = vepc; |
110 | |
111 | return 0; |
112 | } |
113 | |
/*
 * EREMOVE a previously guest-owned EPC page.  Returns 0 on success or the
 * raw ENCLS error/fault encoding from __eremove() on failure.
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	/*
	 * Take a previously guest-owned EPC page and return it to the
	 * general EPC page pool.
	 *
	 * Guests can not be trusted to have left this page in a good
	 * state, so run EREMOVE on the page unconditionally.  In the
	 * case that a guest properly EREMOVE'd this page, a superfluous
	 * EREMOVE is harmless.
	 */
	return __eremove(sgx_get_epc_virt_addr(epc_page));
}
127 | |
128 | static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) |
129 | { |
130 | int ret = sgx_vepc_remove_page(epc_page); |
131 | if (ret) { |
132 | /* |
133 | * Only SGX_CHILD_PRESENT is expected, which is because of |
134 | * EREMOVE'ing an SECS still with child, in which case it can |
135 | * be handled by EREMOVE'ing the SECS again after all pages in |
136 | * virtual EPC have been EREMOVE'd. See comments in below in |
137 | * sgx_vepc_release(). |
138 | * |
139 | * The user of virtual EPC (KVM) needs to guarantee there's no |
140 | * logical processor is still running in the enclave in guest, |
141 | * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be |
142 | * handled here. |
143 | */ |
144 | WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, |
145 | ret, ret); |
146 | return ret; |
147 | } |
148 | |
149 | sgx_free_epc_page(page: epc_page); |
150 | return 0; |
151 | } |
152 | |
153 | static long sgx_vepc_remove_all(struct sgx_vepc *vepc) |
154 | { |
155 | struct sgx_epc_page *entry; |
156 | unsigned long index; |
157 | long failures = 0; |
158 | |
159 | xa_for_each(&vepc->page_array, index, entry) { |
160 | int ret = sgx_vepc_remove_page(epc_page: entry); |
161 | if (ret) { |
162 | if (ret == SGX_CHILD_PRESENT) { |
163 | /* The page is a SECS, userspace will retry. */ |
164 | failures++; |
165 | } else { |
166 | /* |
167 | * Report errors due to #GP or SGX_ENCLAVE_ACT; do not |
168 | * WARN, as userspace can induce said failures by |
169 | * calling the ioctl concurrently on multiple vEPCs or |
170 | * while one or more CPUs is running the enclave. Only |
171 | * a #PF on EREMOVE indicates a kernel/hardware issue. |
172 | */ |
173 | WARN_ON_ONCE(encls_faulted(ret) && |
174 | ENCLS_TRAPNR(ret) != X86_TRAP_GP); |
175 | return -EBUSY; |
176 | } |
177 | } |
178 | cond_resched(); |
179 | } |
180 | |
181 | /* |
182 | * Return the number of SECS pages that failed to be removed, so |
183 | * userspace knows that it has to retry. |
184 | */ |
185 | return failures; |
186 | } |
187 | |
188 | static int sgx_vepc_release(struct inode *inode, struct file *file) |
189 | { |
190 | struct sgx_vepc *vepc = file->private_data; |
191 | struct sgx_epc_page *epc_page, *tmp, *entry; |
192 | unsigned long index; |
193 | |
194 | LIST_HEAD(secs_pages); |
195 | |
196 | xa_for_each(&vepc->page_array, index, entry) { |
197 | /* |
198 | * Remove all normal, child pages. sgx_vepc_free_page() |
199 | * will fail if EREMOVE fails, but this is OK and expected on |
200 | * SECS pages. Those can only be EREMOVE'd *after* all their |
201 | * child pages. Retries below will clean them up. |
202 | */ |
203 | if (sgx_vepc_free_page(epc_page: entry)) |
204 | continue; |
205 | |
206 | xa_erase(&vepc->page_array, index); |
207 | cond_resched(); |
208 | } |
209 | |
210 | /* |
211 | * Retry EREMOVE'ing pages. This will clean up any SECS pages that |
212 | * only had children in this 'epc' area. |
213 | */ |
214 | xa_for_each(&vepc->page_array, index, entry) { |
215 | epc_page = entry; |
216 | /* |
217 | * An EREMOVE failure here means that the SECS page still |
218 | * has children. But, since all children in this 'sgx_vepc' |
219 | * have been removed, the SECS page must have a child on |
220 | * another instance. |
221 | */ |
222 | if (sgx_vepc_free_page(epc_page)) |
223 | list_add_tail(new: &epc_page->list, head: &secs_pages); |
224 | |
225 | xa_erase(&vepc->page_array, index); |
226 | cond_resched(); |
227 | } |
228 | |
229 | /* |
230 | * SECS pages are "pinned" by child pages, and "unpinned" once all |
231 | * children have been EREMOVE'd. A child page in this instance |
232 | * may have pinned an SECS page encountered in an earlier release(), |
233 | * creating a zombie. Since some children were EREMOVE'd above, |
234 | * try to EREMOVE all zombies in the hopes that one was unpinned. |
235 | */ |
236 | mutex_lock(&zombie_secs_pages_lock); |
237 | list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { |
238 | /* |
239 | * Speculatively remove the page from the list of zombies, |
240 | * if the page is successfully EREMOVE'd it will be added to |
241 | * the list of free pages. If EREMOVE fails, throw the page |
242 | * on the local list, which will be spliced on at the end. |
243 | */ |
244 | list_del(entry: &epc_page->list); |
245 | |
246 | if (sgx_vepc_free_page(epc_page)) |
247 | list_add_tail(new: &epc_page->list, head: &secs_pages); |
248 | cond_resched(); |
249 | } |
250 | |
251 | if (!list_empty(head: &secs_pages)) |
252 | list_splice_tail(list: &secs_pages, head: &zombie_secs_pages); |
253 | mutex_unlock(lock: &zombie_secs_pages_lock); |
254 | |
255 | xa_destroy(&vepc->page_array); |
256 | kfree(objp: vepc); |
257 | |
258 | return 0; |
259 | } |
260 | |
261 | static int sgx_vepc_open(struct inode *inode, struct file *file) |
262 | { |
263 | struct sgx_vepc *vepc; |
264 | |
265 | vepc = kzalloc(size: sizeof(struct sgx_vepc), GFP_KERNEL); |
266 | if (!vepc) |
267 | return -ENOMEM; |
268 | mutex_init(&vepc->lock); |
269 | xa_init(xa: &vepc->page_array); |
270 | |
271 | file->private_data = vepc; |
272 | |
273 | return 0; |
274 | } |
275 | |
276 | static long sgx_vepc_ioctl(struct file *file, |
277 | unsigned int cmd, unsigned long arg) |
278 | { |
279 | struct sgx_vepc *vepc = file->private_data; |
280 | |
281 | switch (cmd) { |
282 | case SGX_IOC_VEPC_REMOVE_ALL: |
283 | if (arg) |
284 | return -EINVAL; |
285 | return sgx_vepc_remove_all(vepc); |
286 | |
287 | default: |
288 | return -ENOTTY; |
289 | } |
290 | } |
291 | |
/* File operations for /dev/sgx_vepc; the ioctl ABI is arg-free, so the
 * compat path reuses the native handler. */
static const struct file_operations sgx_vepc_fops = {
	.owner		= THIS_MODULE,
	.open		= sgx_vepc_open,
	.unlocked_ioctl	= sgx_vepc_ioctl,
	.compat_ioctl	= sgx_vepc_ioctl,
	.release	= sgx_vepc_release,
	.mmap		= sgx_vepc_mmap,
};
300 | |
301 | static struct miscdevice sgx_vepc_dev = { |
302 | .minor = MISC_DYNAMIC_MINOR, |
303 | .name = "sgx_vepc" , |
304 | .nodename = "sgx_vepc" , |
305 | .fops = &sgx_vepc_fops, |
306 | }; |
307 | |
308 | int __init sgx_vepc_init(void) |
309 | { |
310 | /* SGX virtualization requires KVM to work */ |
311 | if (!cpu_feature_enabled(X86_FEATURE_VMX)) |
312 | return -ENODEV; |
313 | |
314 | INIT_LIST_HEAD(list: &zombie_secs_pages); |
315 | mutex_init(&zombie_secs_pages_lock); |
316 | |
317 | return misc_register(misc: &sgx_vepc_dev); |
318 | } |
319 | |
320 | /** |
321 | * sgx_virt_ecreate() - Run ECREATE on behalf of guest |
322 | * @pageinfo: Pointer to PAGEINFO structure |
323 | * @secs: Userspace pointer to SECS page |
324 | * @trapnr: trap number injected to guest in case of ECREATE error |
325 | * |
326 | * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose |
327 | * of enforcing policies of guest's enclaves, and return the trap number |
328 | * which should be injected to guest in case of any ECREATE error. |
329 | * |
330 | * Return: |
331 | * - 0: ECREATE was successful. |
332 | * - <0: on error. |
333 | */ |
334 | int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, |
335 | int *trapnr) |
336 | { |
337 | int ret; |
338 | |
339 | /* |
340 | * @secs is an untrusted, userspace-provided address. It comes from |
341 | * KVM and is assumed to be a valid pointer which points somewhere in |
342 | * userspace. This can fault and call SGX or other fault handlers when |
343 | * userspace mapping @secs doesn't exist. |
344 | * |
345 | * Add a WARN() to make sure @secs is already valid userspace pointer |
346 | * from caller (KVM), who should already have handled invalid pointer |
347 | * case (for instance, made by malicious guest). All other checks, |
348 | * such as alignment of @secs, are deferred to ENCLS itself. |
349 | */ |
350 | if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) |
351 | return -EINVAL; |
352 | |
353 | __uaccess_begin(); |
354 | ret = __ecreate(pginfo: pageinfo, secs: (void *)secs); |
355 | __uaccess_end(); |
356 | |
357 | if (encls_faulted(ret)) { |
358 | *trapnr = ENCLS_TRAPNR(ret); |
359 | return -EFAULT; |
360 | } |
361 | |
362 | /* ECREATE doesn't return an error code, it faults or succeeds. */ |
363 | WARN_ON_ONCE(ret); |
364 | return 0; |
365 | } |
366 | EXPORT_SYMBOL_GPL(sgx_virt_ecreate); |
367 | |
368 | static int __sgx_virt_einit(void __user *sigstruct, void __user *token, |
369 | void __user *secs) |
370 | { |
371 | int ret; |
372 | |
373 | /* |
374 | * Make sure all userspace pointers from caller (KVM) are valid. |
375 | * All other checks deferred to ENCLS itself. Also see comment |
376 | * for @secs in sgx_virt_ecreate(). |
377 | */ |
378 | #define SGX_EINITTOKEN_SIZE 304 |
379 | if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || |
380 | !access_ok(token, SGX_EINITTOKEN_SIZE) || |
381 | !access_ok(secs, PAGE_SIZE))) |
382 | return -EINVAL; |
383 | |
384 | __uaccess_begin(); |
385 | ret = __einit(sigstruct: (void *)sigstruct, token: (void *)token, secs: (void *)secs); |
386 | __uaccess_end(); |
387 | |
388 | return ret; |
389 | } |
390 | |
391 | /** |
392 | * sgx_virt_einit() - Run EINIT on behalf of guest |
393 | * @sigstruct: Userspace pointer to SIGSTRUCT structure |
394 | * @token: Userspace pointer to EINITTOKEN structure |
395 | * @secs: Userspace pointer to SECS page |
396 | * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values |
397 | * @trapnr: trap number injected to guest in case of EINIT error |
398 | * |
399 | * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available |
400 | * in host, SGX driver may rewrite the hardware values at wish, therefore KVM |
401 | * needs to update hardware values to guest's virtual MSR values in order to |
402 | * ensure EINIT is executed with expected hardware values. |
403 | * |
404 | * Return: |
405 | * - 0: EINIT was successful. |
406 | * - <0: on error. |
407 | */ |
408 | int sgx_virt_einit(void __user *sigstruct, void __user *token, |
409 | void __user *secs, u64 *lepubkeyhash, int *trapnr) |
410 | { |
411 | int ret; |
412 | |
413 | if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { |
414 | ret = __sgx_virt_einit(sigstruct, token, secs); |
415 | } else { |
416 | preempt_disable(); |
417 | |
418 | sgx_update_lepubkeyhash(lepubkeyhash); |
419 | |
420 | ret = __sgx_virt_einit(sigstruct, token, secs); |
421 | preempt_enable(); |
422 | } |
423 | |
424 | /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ |
425 | if (ret == -EINVAL) |
426 | return ret; |
427 | |
428 | if (encls_faulted(ret)) { |
429 | *trapnr = ENCLS_TRAPNR(ret); |
430 | return -EFAULT; |
431 | } |
432 | |
433 | return ret; |
434 | } |
435 | EXPORT_SYMBOL_GPL(sgx_virt_einit); |
436 | |