// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"

struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
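/* Physical EPC address ranges mapped to their owning section, used by the memory-failure path. */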
static DEFINE_XARRAY(sgx_epc_address_space);

/*
 * These variables are part of the state of the reclaimer, and must be accessed
 * with sgx_reclaimer_lock acquired.
 */
static LIST_HEAD(sgx_active_page_list);
static DEFINE_SPINLOCK(sgx_reclaimer_lock);

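/* The number of free EPC pages across all NUMA nodes. */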
static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);

/* Nodes with one or more EPC sections. */
static nodemask_t sgx_numa_mask;

/*
 * Array with one list_head for each possible NUMA node. Each
 * list contains all the sgx_epc_section's which are on that
 * node.
 */
static struct sgx_numa_node *sgx_numa_nodes;

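/* Pages left in an unknown state after kexec; they must be sanitized (EREMOVE'd) before use. */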
static LIST_HEAD(sgx_dirty_page_list);

/*
 * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
 * from the input list, and made available for the page allocator. SECS pages
 * prepending their children in the input list are left intact.
 *
 * Return 0 when sanitization was successful or kthread was stopped, and the
 * number of unsanitized pages otherwise.
 */
static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
	unsigned long left_dirty = 0;
	struct sgx_epc_page *page;
	LIST_HEAD(dirty);
	int ret;

	/* dirty_page_list is thread-local, no need for a lock: */
	while (!list_empty(dirty_page_list)) {
		if (kthread_should_stop())
			return 0;

		page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);

		/*
		 * Checking page->poison without holding the node->lock
		 * is racy, but losing the race (i.e. poison is set just
		 * after the check) just means __eremove() will be uselessly
		 * called for a page that sgx_free_epc_page() will put onto
		 * the node->sgx_poison_page_list later.
		 */
		if (page->poison) {
			struct sgx_epc_section *section = &sgx_epc_sections[page->section];
			struct sgx_numa_node *node = section->node;

			spin_lock(&node->lock);
			list_move(&page->list, &node->sgx_poison_page_list);
			spin_unlock(&node->lock);

			continue;
		}

		ret = __eremove(sgx_get_epc_virt_addr(page));
		if (!ret) {
			/*
			 * page is now sanitized. Make it available via the SGX
			 * page allocator:
			 */
			list_del(&page->list);
			sgx_free_epc_page(page);
		} else {
			/* The page is not yet clean - move to the dirty list. */
			list_move_tail(&page->list, &dirty);
			left_dirty++;
		}

		cond_resched();
	}

	list_splice(&dirty, dirty_page_list);
	return left_dirty;
}

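/*
 * Check whether any mm mapping the enclave has accessed the page since the
 * last scan. Return true when the page has not been accessed recently and is
 * therefore a candidate for reclaim.
 */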
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	struct sgx_encl *encl = page->encl;
	struct sgx_encl_mm *encl_mm;
	bool ret = true;
	int idx;

	idx = srcu_read_lock(&encl->srcu);

	list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
		if (!mmget_not_zero(encl_mm->mm))
			continue;

		mmap_read_lock(encl_mm->mm);
		ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
		mmap_read_unlock(encl_mm->mm);

		mmput_async(encl_mm->mm);

		if (!ret)
			break;
	}

	srcu_read_unlock(&encl->srcu, idx);

	if (!ret)
		return false;

	return true;
}

static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
{
	struct sgx_encl_page *page = epc_page->owner;
	unsigned long addr = page->desc & PAGE_MASK;
	struct sgx_encl *encl = page->encl;
	int ret;

	sgx_zap_enclave_ptes(encl, addr);

	mutex_lock(&encl->lock);

	ret = __eblock(sgx_get_epc_virt_addr(epc_page));
	if (encls_failed(ret))
		ENCLS_WARN(ret, "EBLOCK");

	mutex_unlock(&encl->lock);
}

static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
			  struct sgx_backing *backing)
{
	struct sgx_pageinfo pginfo;
	int ret;

	pginfo.addr = 0;
	pginfo.secs = 0;

	pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
	pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
			  backing->pcmd_offset;

	ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
	set_page_dirty(backing->pcmd);
	set_page_dirty(backing->contents);

	kunmap_local((void *)(unsigned long)(pginfo.metadata -
					     backing->pcmd_offset));
	kunmap_local((void *)(unsigned long)pginfo.contents);

	return ret;
}

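/*
 * The IPI itself forces any CPU running inside the enclave to exit it; the
 * callback does not need to do anything.
 */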
void sgx_ipi_cb(void *info)
{
}

/*
 * Swap a page to regular memory after it has been transformed to the blocked
 * state with EBLOCK, which means that it can no longer be referenced (no new
 * TLB entries).
 *
 * The first attempt just tries to write the page back, assuming that some
 * other thread has reset the count of threads inside the enclave with ETRACK
 * and the previous thread count has been zeroed out. The second attempt calls
 * ETRACK before EWB. If that fails, we kick all the HW threads out and then
 * do EWB, which is then guaranteed to succeed.
 */
static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
			 struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_va_page *va_page;
	unsigned int va_offset;
	void *va_slot;
	int ret;

	encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;

	va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
				   list);
	va_offset = sgx_alloc_va_slot(va_page);
	va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
	if (sgx_va_page_full(va_page))
		list_move_tail(&va_page->list, &encl->va_pages);

	ret = __sgx_encl_ewb(epc_page, va_slot, backing);
	if (ret == SGX_NOT_TRACKED) {
		ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
		if (ret) {
			if (encls_failed(ret))
				ENCLS_WARN(ret, "ETRACK");
		}

		ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		if (ret == SGX_NOT_TRACKED) {
			/*
			 * Slow path, send IPIs to kick cpus out of the
			 * enclave. Note, it's imperative that the cpu
			 * mask is generated *after* ETRACK, else we'll
			 * miss cpus that entered the enclave between
			 * generating the mask and incrementing epoch.
			 */
			on_each_cpu_mask(sgx_encl_cpumask(encl),
					 sgx_ipi_cb, NULL, 1);
			ret = __sgx_encl_ewb(epc_page, va_slot, backing);
		}
	}

	if (ret) {
		if (encls_failed(ret))
			ENCLS_WARN(ret, "EWB");

		sgx_free_va_slot(va_page, va_offset);
	} else {
		encl_page->desc |= va_offset;
		encl_page->va_page = va_page;
	}
}

static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
				struct sgx_backing *backing)
{
	struct sgx_encl_page *encl_page = epc_page->owner;
	struct sgx_encl *encl = encl_page->encl;
	struct sgx_backing secs_backing;
	int ret;

	mutex_lock(&encl->lock);

	sgx_encl_ewb(epc_page, backing);
	encl_page->epc_page = NULL;
	encl->secs_child_cnt--;
	sgx_encl_put_backing(backing);

	if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
		ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
					     &secs_backing);
		if (ret)
			goto out;

		sgx_encl_ewb(encl->secs.epc_page, &secs_backing);

		sgx_encl_free_epc_page(encl->secs.epc_page);
		encl->secs.epc_page = NULL;

		sgx_encl_put_backing(&secs_backing);
	}

out:
	mutex_unlock(&encl->lock);
}

/*
 * Take a fixed number of pages from the head of the active page pool and
 * reclaim them to the enclave's private shmem files. Skip the pages that have
 * been accessed since the last scan, and move them to the tail of the active
 * page pool so that the pages get scanned in an LRU-like fashion.
 *
 * Batch process a chunk of pages (at the moment 16) in order to reduce the
 * number of IPIs and ETRACKs potentially required. sgx_encl_ewb() does spread
 * the cost a bit among the HW threads with its three-stage EWB pipeline (EWB,
 * ETRACK + EWB and IPI + EWB), but not sufficiently. Reclaiming one page at a
 * time would also be problematic as it would increase the lock contention too
 * much, which would halt forward progress.
 */
static void sgx_reclaim_pages(void)
{
	struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
	struct sgx_backing backing[SGX_NR_TO_SCAN];
	struct sgx_encl_page *encl_page;
	struct sgx_epc_page *epc_page;
	pgoff_t page_index;
	int cnt = 0;
	int ret;
	int i;

	spin_lock(&sgx_reclaimer_lock);
	for (i = 0; i < SGX_NR_TO_SCAN; i++) {
		if (list_empty(&sgx_active_page_list))
			break;

		epc_page = list_first_entry(&sgx_active_page_list,
					    struct sgx_epc_page, list);
		list_del_init(&epc_page->list);
		encl_page = epc_page->owner;

		if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
			chunk[cnt++] = epc_page;
		else
			/* The owner is freeing the page. No need to add the
			 * page back to the list of reclaimable pages.
			 */
			epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		encl_page = epc_page->owner;

		if (!sgx_reclaimer_age(epc_page))
			goto skip;

		page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);

		mutex_lock(&encl_page->encl->lock);
		ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
		if (ret) {
			mutex_unlock(&encl_page->encl->lock);
			goto skip;
		}

		encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
		mutex_unlock(&encl_page->encl->lock);
		continue;

skip:
		spin_lock(&sgx_reclaimer_lock);
		list_add_tail(&epc_page->list, &sgx_active_page_list);
		spin_unlock(&sgx_reclaimer_lock);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);

		chunk[i] = NULL;
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (epc_page)
			sgx_reclaimer_block(epc_page);
	}

	for (i = 0; i < cnt; i++) {
		epc_page = chunk[i];
		if (!epc_page)
			continue;

		encl_page = epc_page->owner;
		sgx_reclaimer_write(epc_page, &backing[i]);

		kref_put(&encl_page->encl->refcount, sgx_encl_release);
		epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;

		sgx_free_epc_page(epc_page);
	}
}

static bool sgx_should_reclaim(unsigned long watermark)
{
	return atomic_long_read(&sgx_nr_free_pages) < watermark &&
	       !list_empty(&sgx_active_page_list);
}

/*
 * sgx_reclaim_direct() should be called (without enclave's mutex held)
 * in locations where SGX memory resources might be low and might be
 * needed in order to make forward progress.
 */
void sgx_reclaim_direct(void)
{
	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		sgx_reclaim_pages();
}

static int ksgxd(void *p)
{
	set_freezable();

	/*
	 * Sanitize pages in order to recover from kexec(). The 2nd pass is
	 * required for SECS pages, whose child pages blocked EREMOVE.
	 */
	__sgx_sanitize_pages(&sgx_dirty_page_list);
	WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;

		wait_event_freezable(ksgxd_waitq,
				     kthread_should_stop() ||
				     sgx_should_reclaim(SGX_NR_HIGH_PAGES));

		if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
			sgx_reclaim_pages();

		cond_resched();
	}

	return 0;
}

static bool __init sgx_page_reclaimer_init(void)
{
	struct task_struct *tsk;

	tsk = kthread_run(ksgxd, NULL, "ksgxd");
	if (IS_ERR(tsk))
		return false;

	ksgxd_tsk = tsk;

	return true;
}

bool current_is_ksgxd(void)
{
	return current == ksgxd_tsk;
}

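/* Take a free EPC page from NUMA node @nid's free list, or return NULL if the node has none. */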
static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
	struct sgx_numa_node *node = &sgx_numa_nodes[nid];
	struct sgx_epc_page *page = NULL;

	spin_lock(&node->lock);

	if (list_empty(&node->free_page_list)) {
		spin_unlock(&node->lock);
		return NULL;
	}

	page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
	list_del_init(&page->list);
	page->flags = 0;

	spin_unlock(&node->lock);
	atomic_long_dec(&sgx_nr_free_pages);

	return page;
}

/**
 * __sgx_alloc_epc_page() - Allocate an EPC page
 *
 * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
 * from the NUMA node where the caller is executing.
 *
 * Return:
 * - an EPC page:	A borrowed EPC page was available.
 * - ERR_PTR(-ENOMEM):	Out of EPC pages.
 */
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
	struct sgx_epc_page *page;
	int nid_of_current = numa_node_id();
	int nid = nid_of_current;

	if (node_isset(nid_of_current, sgx_numa_mask)) {
		page = __sgx_alloc_epc_page_from_node(nid_of_current);
		if (page)
			return page;
	}

	/* Fall back to the non-local NUMA nodes: */
	while (true) {
		nid = next_node_in(nid, sgx_numa_mask);
		if (nid == nid_of_current)
			break;

		page = __sgx_alloc_epc_page_from_node(nid);
		if (page)
			return page;
	}

	return ERR_PTR(-ENOMEM);
}

/**
 * sgx_mark_page_reclaimable() - Mark a page as reclaimable
 * @page:	EPC page
 *
 * Mark a page as reclaimable and add it to the active page list. Pages
 * are automatically removed from the active list when freed.
 */
void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
	list_add_tail(&page->list, &sgx_active_page_list);
	spin_unlock(&sgx_reclaimer_lock);
}

/**
 * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
 * @page:	EPC page
 *
 * Clear the reclaimable flag and remove the page from the active page list.
 *
 * Return:
 *   0 on success,
 *   -EBUSY if the page is in the process of being reclaimed
 */
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
{
	spin_lock(&sgx_reclaimer_lock);
	if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
		/* The page is being reclaimed. */
		if (list_empty(&page->list)) {
			spin_unlock(&sgx_reclaimer_lock);
			return -EBUSY;
		}

		list_del(&page->list);
		page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
	}
	spin_unlock(&sgx_reclaimer_lock);

	return 0;
}

/**
 * sgx_alloc_epc_page() - Allocate an EPC page
 * @owner:	the owner of the EPC page
 * @reclaim:	reclaim pages if necessary
 *
 * Iterate through EPC sections and borrow a free EPC page to the caller. When a
 * page is no longer needed it must be released with sgx_free_epc_page(). If
 * @reclaim is set to true, directly reclaim pages when we are out of pages. No
 * mm's can be locked when @reclaim is set to true.
 *
 * Finally, wake up ksgxd when the number of pages goes below the watermark
 * before returning back to the caller.
 *
 * Return:
 *   an EPC page,
 *   -errno on error
 */
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
{
	struct sgx_epc_page *page;

	for ( ; ; ) {
		page = __sgx_alloc_epc_page();
		if (!IS_ERR(page)) {
			page->owner = owner;
			break;
		}

		if (list_empty(&sgx_active_page_list))
			return ERR_PTR(-ENOMEM);

		if (!reclaim) {
			page = ERR_PTR(-EBUSY);
			break;
		}

		if (signal_pending(current)) {
			page = ERR_PTR(-ERESTARTSYS);
			break;
		}

		sgx_reclaim_pages();
		cond_resched();
	}

	if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
		wake_up(&ksgxd_waitq);

	return page;
}

/**
 * sgx_free_epc_page() - Free an EPC page
 * @page:	an EPC page
 *
 * Put the EPC page back to the list of free pages. It's the caller's
 * responsibility to make sure that the page is in uninitialized state. In other
 * words, do EREMOVE, EWB or whatever operation is necessary before calling
 * this function.
 */
void sgx_free_epc_page(struct sgx_epc_page *page)
{
	struct sgx_epc_section *section = &sgx_epc_sections[page->section];
	struct sgx_numa_node *node = section->node;

	spin_lock(&node->lock);

	page->owner = NULL;
	if (page->poison)
		list_add(&page->list, &node->sgx_poison_page_list);
	else
		list_add_tail(&page->list, &node->free_page_list);
	page->flags = SGX_EPC_PAGE_IS_FREE;

	spin_unlock(&node->lock);
	atomic_long_inc(&sgx_nr_free_pages);
}

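/*
 * Map an EPC section's physical range and add all of its pages to the dirty
 * list so that ksgxd can sanitize them before they are handed out.
 */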
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
					 unsigned long index,
					 struct sgx_epc_section *section)
{
	unsigned long nr_pages = size >> PAGE_SHIFT;
	unsigned long i;

	section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
	if (!section->virt_addr)
		return false;

	section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
	if (!section->pages) {
		memunmap(section->virt_addr);
		return false;
	}

	section->phys_addr = phys_addr;
	xa_store_range(&sgx_epc_address_space, section->phys_addr,
		       phys_addr + size - 1, section, GFP_KERNEL);

	for (i = 0; i < nr_pages; i++) {
		section->pages[i].section = index;
		section->pages[i].flags = 0;
		section->pages[i].owner = NULL;
		section->pages[i].poison = 0;
		list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
	}

	return true;
}

bool arch_is_platform_page(u64 paddr)
{
	return !!xa_load(&sgx_epc_address_space, paddr);
}
EXPORT_SYMBOL_GPL(arch_is_platform_page);

static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
{
	struct sgx_epc_section *section;

	section = xa_load(&sgx_epc_address_space, paddr);
	if (!section)
		return NULL;

	return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
}

/*
 * Called in process context to handle a hardware reported
 * error in an SGX EPC page.
 * If the MF_ACTION_REQUIRED bit is set in flags, then the
 * context is the task that consumed the poison data. Otherwise
 * this is called from a kernel thread unrelated to the page.
 */
int arch_memory_failure(unsigned long pfn, int flags)
{
	struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
	struct sgx_epc_section *section;
	struct sgx_numa_node *node;

	/*
	 * mm/memory-failure.c calls this routine for all errors
	 * where there isn't a "struct page" for the address. But that
	 * includes other address ranges besides SGX.
	 */
	if (!page)
		return -ENXIO;

	/*
	 * If poison was consumed synchronously, send a SIGBUS to
	 * the task. Hardware has already exited the SGX enclave and
	 * will not allow re-entry to an enclave that has a memory
	 * error. The signal may help the task understand why the
	 * enclave is broken.
	 */
	if (flags & MF_ACTION_REQUIRED)
		force_sig(SIGBUS);

	section = &sgx_epc_sections[page->section];
	node = section->node;

	spin_lock(&node->lock);

	/* Already poisoned? Nothing more to do */
	if (page->poison)
		goto out;

	page->poison = 1;

	/*
	 * If the page is on a free list, move it to the per-node
	 * poison page list.
	 */
	if (page->flags & SGX_EPC_PAGE_IS_FREE) {
		list_move(&page->list, &node->sgx_poison_page_list);
		goto out;
	}

	/*
	 * TBD: Add additional plumbing to enable pre-emptive
	 * action for asynchronous poison notification. Until
	 * then just hope that the poison:
	 * a) is not accessed - sgx_free_epc_page() will deal with it
	 *    when the user gives it back
	 * b) results in a recoverable machine check rather than
	 *    a fatal one
	 */
out:
	spin_unlock(&node->lock);
	return 0;
}

/**
 * A section metric is concatenated in a way that @low bits 12-31 define the
 * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
 * metric.
 */
static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}

#ifdef CONFIG_NUMA
static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
}
static DEVICE_ATTR_RO(sgx_total_bytes);

static umode_t arch_node_attr_is_visible(struct kobject *kobj,
					 struct attribute *attr, int idx)
{
	/* Make all x86/ attributes invisible when SGX is not initialized: */
	if (nodes_empty(sgx_numa_mask))
		return 0;

	return attr->mode;
}

static struct attribute *arch_node_dev_attrs[] = {
	&dev_attr_sgx_total_bytes.attr,
	NULL,
};

const struct attribute_group arch_node_dev_group = {
	.name = "x86",
	.attrs = arch_node_dev_attrs,
	.is_visible = arch_node_attr_is_visible,
};

static void __init arch_update_sysfs_visibility(int nid)
{
	struct node *node = node_devices[nid];
	int ret;

	ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);

	if (ret)
		pr_err("sysfs update failed (%d), files may be invisible", ret);
}
#else /* !CONFIG_NUMA */
static void __init arch_update_sysfs_visibility(int nid) {}
#endif

static bool __init sgx_page_cache_init(void)
{
	u32 eax, ebx, ecx, edx, type;
	u64 pa, size;
	int nid;
	int i;

	sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
	if (!sgx_numa_nodes)
		return false;

	for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
		cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);

		type = eax & SGX_CPUID_EPC_MASK;
		if (type == SGX_CPUID_EPC_INVALID)
			break;

		if (type != SGX_CPUID_EPC_SECTION) {
			pr_err_once("Unknown EPC section type: %u\n", type);
			break;
		}

		pa = sgx_calc_section_metric(eax, ebx);
		size = sgx_calc_section_metric(ecx, edx);

		pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);

		if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
			pr_err("No free memory for an EPC section\n");
			break;
		}

		nid = numa_map_to_online_node(phys_to_target_node(pa));
		if (nid == NUMA_NO_NODE) {
			/* The physical address is already printed above. */
			pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
			nid = 0;
		}

		if (!node_isset(nid, sgx_numa_mask)) {
			spin_lock_init(&sgx_numa_nodes[nid].lock);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
			INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
			node_set(nid, sgx_numa_mask);
			sgx_numa_nodes[nid].size = 0;

			/* Make SGX-specific node sysfs files visible: */
			arch_update_sysfs_visibility(nid);
		}

		sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
		sgx_numa_nodes[nid].size += size;

		sgx_nr_epc_sections++;
	}

	if (!sgx_nr_epc_sections) {
		pr_err("There are zero EPC sections.\n");
		return false;
	}

	return true;
}

/*
 * Update the SGX_LEPUBKEYHASH MSRs to the values specified by the caller.
 * The bare-metal driver needs to update them to the hash of the enclave's
 * signer before EINIT. KVM needs to update them to the guest's virtual MSR
 * values before doing EINIT from the guest.
 */
void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
{
	int i;

	WARN_ON_ONCE(preemptible());

	for (i = 0; i < 4; i++)
		wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}

const struct file_operations sgx_provision_fops = {
	.owner = THIS_MODULE,
};

static struct miscdevice sgx_dev_provision = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "sgx_provision",
	.nodename = "sgx_provision",
	.fops = &sgx_provision_fops,
};

/**
 * sgx_set_attribute() - Update allowed attributes given file descriptor
 * @allowed_attributes:	Pointer to allowed enclave attributes
 * @attribute_fd:	File descriptor for specific attribute
 *
 * Append enclave attribute indicated by file descriptor to allowed
 * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
 * /dev/sgx_provision is supported.
 *
 * Return:
 * -0:		SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
 * -EINVAL:	Invalid, or not supported file descriptor
 */
int sgx_set_attribute(unsigned long *allowed_attributes,
		      unsigned int attribute_fd)
{
	struct fd f = fdget(attribute_fd);

	if (!f.file)
		return -EINVAL;

	if (f.file->f_op != &sgx_provision_fops) {
		fdput(f);
		return -EINVAL;
	}

	*allowed_attributes |= SGX_ATTR_PROVISIONKEY;

	fdput(f);
	return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);

static int __init sgx_init(void)
{
	int ret;
	int i;

	if (!cpu_feature_enabled(X86_FEATURE_SGX))
		return -ENODEV;

	if (!sgx_page_cache_init())
		return -ENOMEM;

	if (!sgx_page_reclaimer_init()) {
		ret = -ENOMEM;
		goto err_page_cache;
	}

	ret = misc_register(&sgx_dev_provision);
	if (ret)
		goto err_kthread;

	/*
	 * Always try to initialize the native *and* KVM drivers.
	 * The KVM driver is less picky than the native one and
	 * can function if the native one is not supported on the
	 * current system or fails to initialize.
	 *
	 * Error out only if both fail to initialize.
	 */
	ret = sgx_drv_init();

	if (sgx_vepc_init() && ret)
		goto err_provision;

	return 0;

err_provision:
	misc_deregister(&sgx_dev_provision);

err_kthread:
	kthread_stop(ksgxd_tsk);

err_page_cache:
	for (i = 0; i < sgx_nr_epc_sections; i++) {
		vfree(sgx_epc_sections[i].pages);
		memunmap(sgx_epc_sections[i].virt_addr);
	}

	return ret;
}

device_initcall(sgx_init);