// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
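/* Physical EPC address ranges mapped to their owning section, used by the memory-failure path. */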
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

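/* The number of free EPC pages across all NUMA nodes. */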
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

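/* Pages left in an unknown state after kexec; they must be sanitized (EREMOVE'd) before use. */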
static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list, and made available for the page allocator. SECS pages
 * prepending their children in the input list are left intact.
 *
 * Return 0 when sanitization was successful or kthread was stopped, and the
 * number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}

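/*
 * Check whether any mm mapping the enclave has accessed the page since the
 * last scan. Return true when the page has not been accessed recently and is
 * therefore a candidate for reclaim.
 */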
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
	pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	kunmap_local((void *)(unsigned long)(pginfo.metadata -
					     backing->pcmd_offset));
	kunmap_local((void *)(unsigned long)pginfo.contents);

	return ret;
}

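/*
 * The IPI itself forces any CPU running inside the enclave to exit it; the
 * callback does not need to do anything.
 */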
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap a page to regular memory after it has been transformed to the blocked
 * state with EBLOCK, which means that it can no longer be referenced (no new
 * TLB entries).
 *
 * The first attempt just tries to write the page back, assuming that some
 * other thread has reset the count of threads inside the enclave with ETRACK
 * and the previous thread count has been zeroed out. The second attempt calls
 * ETRACK before EWB. If that fails, we kick all the HW threads out and then
 * do EWB, which is then guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip the pages that have
 * been accessed since the last scan, and move them to the tail of the active
 * page pool so that the pages get scanned in an LRU-like fashion.
 *
 * Batch process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() does spread
 * the cost a bit among the HW threads with its three-stage EWB pipeline (EWB,
 * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
 * time would also be problematic as it would increase the lock contention too
 * much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

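/* Take a free EPC page from NUMA node @nid's free list, or return NULL if the node has none. */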
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
 * from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	A borrowed EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through EPC sections and borrow a free EPC page to the caller. When a
 * page is no longer needed it must be released with sgx_free_epc_page(). If
 * @reclaim is set to true, directly reclaim pages when we are out of pages. No
 * mm's can be locked when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning back to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}

/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back to the list of free pages. It's the caller's
 * responsibility to make sure that the page is in uninitialized state. In other
 * words, do EREMOVE, EWB or whatever operation is necessary before calling
 * this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}

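/*
 * Map an EPC section's physical range and add all of its pages to the dirty
 * list so that ksgxd can sanitize them before they are handed out.
 */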
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/**
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}

#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver needs to update them to the hash of the enclave's
 * signer before EINIT. KVM needs to update them to the guest's virtual MSR
 * values before doing EINIT from the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

const struct file_operations sgx_provision_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for specific attribute
 *
 * Append enclave attribute indicated by file descriptor to allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
 * /dev/sgx_provision is supported.
 *
 * Return:
 * -0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * -EINVAL:	Invalid, or not supported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	struct fd f = fdget(attribute_fd);

	if (!f.file)
		return -EINVAL;

	if (f.file->f_op != &sgx_provision_fops) {
		fdput(f);
		return -EINVAL;
	}

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	fdput(f);
	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);