1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Device driver to expose SGX enclave memory to KVM guests.
4 *
5 * Copyright(c) 2021 Intel Corporation.
6 */
7
8#include <linux/miscdevice.h>
9#include <linux/mm.h>
10#include <linux/mman.h>
11#include <linux/sched/mm.h>
12#include <linux/sched/signal.h>
13#include <linux/slab.h>
14#include <linux/xarray.h>
15#include <asm/sgx.h>
16#include <uapi/asm/sgx.h>
17
18#include "encls.h"
19#include "sgx.h"
20
21struct sgx_vepc {
22 struct xarray page_array;
23 struct mutex lock;
24};
25
26/*
27 * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
28 * virtual EPC instances, and the lock to protect it.
29 */
30static struct mutex zombie_secs_pages_lock;
31static struct list_head zombie_secs_pages;
32
33static int __sgx_vepc_fault(struct sgx_vepc *vepc,
34 struct vm_area_struct *vma, unsigned long addr)
35{
36 struct sgx_epc_page *epc_page;
37 unsigned long index, pfn;
38 int ret;
39
40 WARN_ON(!mutex_is_locked(&vepc->lock));
41
42 /* Calculate index of EPC page in virtual EPC's page_array */
43 index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
44
45 epc_page = xa_load(&vepc->page_array, index);
46 if (epc_page)
47 return 0;
48
49 epc_page = sgx_alloc_epc_page(owner: vepc, reclaim: false);
50 if (IS_ERR(ptr: epc_page))
51 return PTR_ERR(ptr: epc_page);
52
53 ret = xa_err(entry: xa_store(&vepc->page_array, index, entry: epc_page, GFP_KERNEL));
54 if (ret)
55 goto err_free;
56
57 pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
58
59 ret = vmf_insert_pfn(vma, addr, pfn);
60 if (ret != VM_FAULT_NOPAGE) {
61 ret = -EFAULT;
62 goto err_delete;
63 }
64
65 return 0;
66
67err_delete:
68 xa_erase(&vepc->page_array, index);
69err_free:
70 sgx_free_epc_page(page: epc_page);
71 return ret;
72}
73
74static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
75{
76 struct vm_area_struct *vma = vmf->vma;
77 struct sgx_vepc *vepc = vma->vm_private_data;
78 int ret;
79
80 mutex_lock(&vepc->lock);
81 ret = __sgx_vepc_fault(vepc, vma, addr: vmf->address);
82 mutex_unlock(lock: &vepc->lock);
83
84 if (!ret)
85 return VM_FAULT_NOPAGE;
86
87 if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
88 mmap_read_unlock(mm: vma->vm_mm);
89 return VM_FAULT_RETRY;
90 }
91
92 return VM_FAULT_SIGBUS;
93}
94
95static const struct vm_operations_struct sgx_vepc_vm_ops = {
96 .fault = sgx_vepc_fault,
97};
98
99static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
100{
101 struct sgx_vepc *vepc = file->private_data;
102
103 if (!(vma->vm_flags & VM_SHARED))
104 return -EINVAL;
105
106 vma->vm_ops = &sgx_vepc_vm_ops;
107 /* Don't copy VMA in fork() */
108 vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
109 vma->vm_private_data = vepc;
110
111 return 0;
112}
113
/*
 * EREMOVE a previously guest-owned EPC page so that it can go back to the
 * general EPC page pool.
 *
 * A guest cannot be trusted to have left the page in a good state, so
 * EREMOVE is run unconditionally. If the guest already EREMOVE'd the page
 * properly, the extra EREMOVE is harmless.
 *
 * Return: the result of __eremove().
 */
static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
{
	void *epc_addr = sgx_get_epc_virt_addr(epc_page);

	return __eremove(epc_addr);
}
127
128static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
129{
130 int ret = sgx_vepc_remove_page(epc_page);
131 if (ret) {
132 /*
133 * Only SGX_CHILD_PRESENT is expected, which is because of
134 * EREMOVE'ing an SECS still with child, in which case it can
135 * be handled by EREMOVE'ing the SECS again after all pages in
136 * virtual EPC have been EREMOVE'd. See comments in below in
137 * sgx_vepc_release().
138 *
139 * The user of virtual EPC (KVM) needs to guarantee there's no
140 * logical processor is still running in the enclave in guest,
141 * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
142 * handled here.
143 */
144 WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
145 ret, ret);
146 return ret;
147 }
148
149 sgx_free_epc_page(page: epc_page);
150 return 0;
151}
152
153static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
154{
155 struct sgx_epc_page *entry;
156 unsigned long index;
157 long failures = 0;
158
159 xa_for_each(&vepc->page_array, index, entry) {
160 int ret = sgx_vepc_remove_page(epc_page: entry);
161 if (ret) {
162 if (ret == SGX_CHILD_PRESENT) {
163 /* The page is a SECS, userspace will retry. */
164 failures++;
165 } else {
166 /*
167 * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
168 * WARN, as userspace can induce said failures by
169 * calling the ioctl concurrently on multiple vEPCs or
170 * while one or more CPUs is running the enclave. Only
171 * a #PF on EREMOVE indicates a kernel/hardware issue.
172 */
173 WARN_ON_ONCE(encls_faulted(ret) &&
174 ENCLS_TRAPNR(ret) != X86_TRAP_GP);
175 return -EBUSY;
176 }
177 }
178 cond_resched();
179 }
180
181 /*
182 * Return the number of SECS pages that failed to be removed, so
183 * userspace knows that it has to retry.
184 */
185 return failures;
186}
187
188static int sgx_vepc_release(struct inode *inode, struct file *file)
189{
190 struct sgx_vepc *vepc = file->private_data;
191 struct sgx_epc_page *epc_page, *tmp, *entry;
192 unsigned long index;
193
194 LIST_HEAD(secs_pages);
195
196 xa_for_each(&vepc->page_array, index, entry) {
197 /*
198 * Remove all normal, child pages. sgx_vepc_free_page()
199 * will fail if EREMOVE fails, but this is OK and expected on
200 * SECS pages. Those can only be EREMOVE'd *after* all their
201 * child pages. Retries below will clean them up.
202 */
203 if (sgx_vepc_free_page(epc_page: entry))
204 continue;
205
206 xa_erase(&vepc->page_array, index);
207 cond_resched();
208 }
209
210 /*
211 * Retry EREMOVE'ing pages. This will clean up any SECS pages that
212 * only had children in this 'epc' area.
213 */
214 xa_for_each(&vepc->page_array, index, entry) {
215 epc_page = entry;
216 /*
217 * An EREMOVE failure here means that the SECS page still
218 * has children. But, since all children in this 'sgx_vepc'
219 * have been removed, the SECS page must have a child on
220 * another instance.
221 */
222 if (sgx_vepc_free_page(epc_page))
223 list_add_tail(new: &epc_page->list, head: &secs_pages);
224
225 xa_erase(&vepc->page_array, index);
226 cond_resched();
227 }
228
229 /*
230 * SECS pages are "pinned" by child pages, and "unpinned" once all
231 * children have been EREMOVE'd. A child page in this instance
232 * may have pinned an SECS page encountered in an earlier release(),
233 * creating a zombie. Since some children were EREMOVE'd above,
234 * try to EREMOVE all zombies in the hopes that one was unpinned.
235 */
236 mutex_lock(&zombie_secs_pages_lock);
237 list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
238 /*
239 * Speculatively remove the page from the list of zombies,
240 * if the page is successfully EREMOVE'd it will be added to
241 * the list of free pages. If EREMOVE fails, throw the page
242 * on the local list, which will be spliced on at the end.
243 */
244 list_del(entry: &epc_page->list);
245
246 if (sgx_vepc_free_page(epc_page))
247 list_add_tail(new: &epc_page->list, head: &secs_pages);
248 cond_resched();
249 }
250
251 if (!list_empty(head: &secs_pages))
252 list_splice_tail(list: &secs_pages, head: &zombie_secs_pages);
253 mutex_unlock(lock: &zombie_secs_pages_lock);
254
255 xa_destroy(&vepc->page_array);
256 kfree(objp: vepc);
257
258 return 0;
259}
260
261static int sgx_vepc_open(struct inode *inode, struct file *file)
262{
263 struct sgx_vepc *vepc;
264
265 vepc = kzalloc(size: sizeof(struct sgx_vepc), GFP_KERNEL);
266 if (!vepc)
267 return -ENOMEM;
268 mutex_init(&vepc->lock);
269 xa_init(xa: &vepc->page_array);
270
271 file->private_data = vepc;
272
273 return 0;
274}
275
276static long sgx_vepc_ioctl(struct file *file,
277 unsigned int cmd, unsigned long arg)
278{
279 struct sgx_vepc *vepc = file->private_data;
280
281 switch (cmd) {
282 case SGX_IOC_VEPC_REMOVE_ALL:
283 if (arg)
284 return -EINVAL;
285 return sgx_vepc_remove_all(vepc);
286
287 default:
288 return -ENOTTY;
289 }
290}
291
292static const struct file_operations sgx_vepc_fops = {
293 .owner = THIS_MODULE,
294 .open = sgx_vepc_open,
295 .unlocked_ioctl = sgx_vepc_ioctl,
296 .compat_ioctl = sgx_vepc_ioctl,
297 .release = sgx_vepc_release,
298 .mmap = sgx_vepc_mmap,
299};
300
301static struct miscdevice sgx_vepc_dev = {
302 .minor = MISC_DYNAMIC_MINOR,
303 .name = "sgx_vepc",
304 .nodename = "sgx_vepc",
305 .fops = &sgx_vepc_fops,
306};
307
308int __init sgx_vepc_init(void)
309{
310 /* SGX virtualization requires KVM to work */
311 if (!cpu_feature_enabled(X86_FEATURE_VMX))
312 return -ENODEV;
313
314 INIT_LIST_HEAD(list: &zombie_secs_pages);
315 mutex_init(&zombie_secs_pages_lock);
316
317 return misc_register(misc: &sgx_vepc_dev);
318}
319
320/**
321 * sgx_virt_ecreate() - Run ECREATE on behalf of guest
322 * @pageinfo: Pointer to PAGEINFO structure
323 * @secs: Userspace pointer to SECS page
324 * @trapnr: trap number injected to guest in case of ECREATE error
325 *
326 * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
327 * of enforcing policies of guest's enclaves, and return the trap number
328 * which should be injected to guest in case of any ECREATE error.
329 *
330 * Return:
331 * - 0: ECREATE was successful.
332 * - <0: on error.
333 */
334int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
335 int *trapnr)
336{
337 int ret;
338
339 /*
340 * @secs is an untrusted, userspace-provided address. It comes from
341 * KVM and is assumed to be a valid pointer which points somewhere in
342 * userspace. This can fault and call SGX or other fault handlers when
343 * userspace mapping @secs doesn't exist.
344 *
345 * Add a WARN() to make sure @secs is already valid userspace pointer
346 * from caller (KVM), who should already have handled invalid pointer
347 * case (for instance, made by malicious guest). All other checks,
348 * such as alignment of @secs, are deferred to ENCLS itself.
349 */
350 if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
351 return -EINVAL;
352
353 __uaccess_begin();
354 ret = __ecreate(pginfo: pageinfo, secs: (void *)secs);
355 __uaccess_end();
356
357 if (encls_faulted(ret)) {
358 *trapnr = ENCLS_TRAPNR(ret);
359 return -EFAULT;
360 }
361
362 /* ECREATE doesn't return an error code, it faults or succeeds. */
363 WARN_ON_ONCE(ret);
364 return 0;
365}
366EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
367
368static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
369 void __user *secs)
370{
371 int ret;
372
373 /*
374 * Make sure all userspace pointers from caller (KVM) are valid.
375 * All other checks deferred to ENCLS itself. Also see comment
376 * for @secs in sgx_virt_ecreate().
377 */
378#define SGX_EINITTOKEN_SIZE 304
379 if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
380 !access_ok(token, SGX_EINITTOKEN_SIZE) ||
381 !access_ok(secs, PAGE_SIZE)))
382 return -EINVAL;
383
384 __uaccess_begin();
385 ret = __einit(sigstruct: (void *)sigstruct, token: (void *)token, secs: (void *)secs);
386 __uaccess_end();
387
388 return ret;
389}
390
391/**
392 * sgx_virt_einit() - Run EINIT on behalf of guest
393 * @sigstruct: Userspace pointer to SIGSTRUCT structure
394 * @token: Userspace pointer to EINITTOKEN structure
395 * @secs: Userspace pointer to SECS page
396 * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
397 * @trapnr: trap number injected to guest in case of EINIT error
398 *
399 * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
400 * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
401 * needs to update hardware values to guest's virtual MSR values in order to
402 * ensure EINIT is executed with expected hardware values.
403 *
404 * Return:
405 * - 0: EINIT was successful.
406 * - <0: on error.
407 */
408int sgx_virt_einit(void __user *sigstruct, void __user *token,
409 void __user *secs, u64 *lepubkeyhash, int *trapnr)
410{
411 int ret;
412
413 if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
414 ret = __sgx_virt_einit(sigstruct, token, secs);
415 } else {
416 preempt_disable();
417
418 sgx_update_lepubkeyhash(lepubkeyhash);
419
420 ret = __sgx_virt_einit(sigstruct, token, secs);
421 preempt_enable();
422 }
423
424 /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
425 if (ret == -EINVAL)
426 return ret;
427
428 if (encls_faulted(ret)) {
429 *trapnr = ENCLS_TRAPNR(ret);
430 return -EFAULT;
431 }
432
433 return ret;
434}
435EXPORT_SYMBOL_GPL(sgx_virt_einit);
436

source code of linux/arch/x86/kernel/cpu/sgx/virt.c