sev.c source code [linux/arch/x86/kernel/sev.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* AMD Memory Encryption Support
4	*
5	* Copyright (C) 2019 SUSE
6	*
7	* Author: Joerg Roedel <jroedel@suse.de>
8	*/
9
10	#define pr_fmt(fmt) "SEV: " fmt
11
12	#include <linux/sched/debug.h> /* For show_regs() */
13	#include <linux/percpu-defs.h>
14	#include <linux/cc_platform.h>
15	#include <linux/printk.h>
16	#include <linux/mm_types.h>
17	#include <linux/set_memory.h>
18	#include <linux/memblock.h>
19	#include <linux/kernel.h>
20	#include <linux/mm.h>
21	#include <linux/cpumask.h>
22	#include <linux/efi.h>
23	#include <linux/platform_device.h>
24	#include <linux/io.h>
25	#include <linux/psp-sev.h>
26	#include <linux/dmi.h>
27	#include <uapi/linux/sev-guest.h>
28
29	#include <asm/init.h>
30	#include <asm/cpu_entry_area.h>
31	#include <asm/stacktrace.h>
32	#include <asm/sev.h>
33	#include <asm/insn-eval.h>
34	#include <asm/fpu/xcr.h>
35	#include <asm/processor.h>
36	#include <asm/realmode.h>
37	#include <asm/setup.h>
38	#include <asm/traps.h>
39	#include <asm/svm.h>
40	#include <asm/smp.h>
41	#include <asm/cpu.h>
42	#include <asm/apic.h>
43	#include <asm/cpuid.h>
44	#include <asm/cmdline.h>
45
/* Value stored in the DR7 field of a newly created AP VMSA */
#define DR7_RESET_VALUE        0x400

/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
#define AP_INIT_CS_LIMIT		0xffff
#define AP_INIT_DS_LIMIT		0xffff
#define AP_INIT_LDTR_LIMIT		0xffff
#define AP_INIT_GDTR_LIMIT		0xffff
#define AP_INIT_IDTR_LIMIT		0xffff
#define AP_INIT_TR_LIMIT		0xffff
#define AP_INIT_RFLAGS_DEFAULT		0x2
#define AP_INIT_DR6_DEFAULT		0xffff0ff0
#define AP_INIT_GPAT_DEFAULT		0x0007040600070406ULL
#define AP_INIT_XCR0_DEFAULT		0x1
#define AP_INIT_X87_FTW_DEFAULT		0x5555
#define AP_INIT_X87_FCW_DEFAULT		0x0040
#define AP_INIT_CR0_DEFAULT		0x60000010
#define AP_INIT_MXCSR_DEFAULT		0x1f80
63
64	static const char * const sev_status_feat_names[] = {
65	[MSR_AMD64_SEV_ENABLED_BIT] = "SEV",
66	[MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES",
67	[MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP",
68	[MSR_AMD64_SNP_VTOM_BIT] = "vTom",
69	[MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC",
70	[MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI",
71	[MSR_AMD64_SNP_ALT_INJ_BIT] = "AI",
72	[MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap",
73	[MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS",
74	[MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol",
75	[MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS",
76	[MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC",
77	[MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam",
78	[MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt",
79	[MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt",
80	[MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
81	};
82
83	/ For early boot hypervisor communication in SEV-ES enabled guests /
84	static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
85
86	/*
87	* Needs to be in the .data section because we need it NULL before bss is
88	* cleared
89	*/
90	static struct ghcb *boot_ghcb __section(".data");
91
92	/ Bitmap of SEV features supported by the hypervisor /
93	static u64 sev_hv_features __ro_after_init;
94
95	/ #VC handler runtime per-CPU data /
96	struct sev_es_runtime_data {
97	struct ghcb ghcb_page;
98
99	/*
100	* Reserve one page per CPU as backup storage for the unencrypted GHCB.
101	* It is needed when an NMI happens while the #VC handler uses the real
102	* GHCB, and the NMI handler itself is causing another #VC exception. In
103	* that case the GHCB content of the first handler needs to be backed up
104	* and restored.
105	*/
106	struct ghcb backup_ghcb;
107
108	/*
109	* Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
110	* There is no need for it to be atomic, because nothing is written to
111	* the GHCB between the read and the write of ghcb_active. So it is safe
112	* to use it when a nested #VC exception happens before the write.
113	*
114	* This is necessary for example in the #VC->NMI->#VC case when the NMI
115	* happens while the first #VC handler uses the GHCB. When the NMI code
116	* raises a second #VC handler it might overwrite the contents of the
117	* GHCB written by the first handler. To avoid this the content of the
118	* GHCB is saved and restored when the GHCB is detected to be in use
119	* already.
120	*/
121	bool ghcb_active;
122	bool backup_ghcb_active;
123
124	/*
125	* Cached DR7 value - write it on DR7 writes and return it on reads.
126	* That value will never make it to the real hardware DR7 as debugging
127	* is currently unsupported in SEV-ES guests.
128	*/
129	unsigned long dr7;
130	};
131
132	struct ghcb_state {
133	struct ghcb *ghcb;
134	};
135
136	static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
137	static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
138
139	struct sev_config {
140	__u64 debug : `1`,
141
142	/*
143	* A flag used by __set_pages_state() that indicates when the
144	* per-CPU GHCB has been created and registered and thus can be
145	* used by the BSP instead of the early boot GHCB.
146	*
147	* For APs, the per-CPU GHCB is created before they are started
148	* and registered upon startup, so this flag can be used globally
149	* for the BSP and APs.
150	*/
151	ghcbs_initialized : `1`,
152
153	__reserved : `62`;
154	};
155
156	static struct sev_config sev_cfg __read_mostly;
157
158	static __always_inline bool on_vc_stack(struct pt_regs *regs)
159	{
160	unsigned long sp = regs->sp;
161
162	/ User-mode RSP is not trusted /
163	if (user_mode(regs))
164	return false;
165
166	/ SYSCALL gap still has user-mode RSP /
167	if (ip_within_syscall_gap(regs))
168	return false;
169
170	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
171	}
172
173	/*
174	* This function handles the case when an NMI is raised in the #VC
175	* exception handler entry code, before the #VC handler has switched off
176	* its IST stack. In this case, the IST entry for #VC must be adjusted,
177	* so that any nested #VC exception will not overwrite the stack
178	* contents of the interrupted #VC handler.
179	*
180	* The IST entry is adjusted unconditionally so that it can be also be
181	* unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
182	* nested sev_es_ist_exit() call may adjust back the IST entry too
183	* early.
184	*
185	* The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
186	* on the NMI IST stack, as they are only called from NMI handling code
187	* right now.
188	*/
189	void noinstr __sev_es_ist_enter(struct pt_regs *regs)
190	{
191	unsigned long old_ist, new_ist;
192
193	/ Read old IST entry /
194	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
195
196	/*
197	* If NMI happened while on the #VC IST stack, set the new IST
198	* value below regs->sp, so that the interrupted stack frame is
199	* not overwritten by subsequent #VC exceptions.
200	*/
201	if (on_vc_stack(regs))
202	new_ist = regs->sp;
203
204	/*
205	* Reserve additional 8 bytes and store old IST value so this
206	* adjustment can be unrolled in __sev_es_ist_exit().
207	*/
208	new_ist -= sizeof(old_ist);
209	(unsigned* long *)new_ist = old_ist;
210
211	/ Set new IST entry /
212	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
213	}
214
215	void noinstr __sev_es_ist_exit(void)
216	{
217	unsigned long ist;
218
219	/ Read IST entry /
220	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
221
222	if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
223	return;
224
225	/ Read back old IST entry and write it to the TSS /
226	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], (unsigned* long *)ist);
227	}
228
229	/*
230	* Nothing shall interrupt this code path while holding the per-CPU
231	* GHCB. The backup GHCB is only for NMIs interrupting this path.
232	*
233	* Callers must disable local interrupts around it.
234	*/
235	static noinstr struct ghcb __sev_get_ghcb(struct* ghcb_state *state)
236	{
237	struct sev_es_runtime_data *data;
238	struct ghcb *ghcb;
239
240	WARN_ON(!irqs_disabled());
241
242	data = this_cpu_read(runtime_data);
243	ghcb = &data->ghcb_page;
244
245	if (unlikely(data->ghcb_active)) {
246	/ GHCB is already in use - save its contents /
247
248	if (unlikely(data->backup_ghcb_active)) {
249	/*
250	* Backup-GHCB is also already in use. There is no way
251	* to continue here so just kill the machine. To make
252	* panic() work, mark GHCBs inactive so that messages
253	* can be printed out.
254	*/
255	data->ghcb_active = false;
256	data->backup_ghcb_active = false;
257
258	instrumentation_begin();
259	panic(fmt: "Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
260	instrumentation_end();
261	}
262
263	/ Mark backup_ghcb active before writing to it /
264	data->backup_ghcb_active = true;
265
266	state->ghcb = &data->backup_ghcb;
267
268	/ Backup GHCB content /
269	state->ghcb = ghcb;
270	} else {
271	state->ghcb = NULL;
272	data->ghcb_active = true;
273	}
274
275	return ghcb;
276	}
277
278	static inline u64 sev_es_rd_ghcb_msr(void)
279	{
280	return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
281	}
282
283	static __always_inline void sev_es_wr_ghcb_msr(u64 val)
284	{
285	u32 low, high;
286
287	low = (u32)(val);
288	high = (u32)(val >> `32`);
289
290	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
291	}
292
293	static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
294	unsigned char *buffer)
295	{
296	return copy_from_kernel_nofault(dst: buffer, src: (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
297	}
298
299	static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
300	{
301	char buffer[MAX_INSN_SIZE];
302	int insn_bytes;
303
304	insn_bytes = insn_fetch_from_user_inatomic(regs: ctxt->regs, buf: buffer);
305	if (insn_bytes == `0`) {
306	/ Nothing could be copied /
307	ctxt->fi.vector = X86_TRAP_PF;
308	ctxt->fi.error_code = X86_PF_INSTR \| X86_PF_USER;
309	ctxt->fi.cr2 = ctxt->regs->ip;
310	return ES_EXCEPTION;
311	} else if (insn_bytes == -EINVAL) {
312	/ Effective RIP could not be calculated /
313	ctxt->fi.vector = X86_TRAP_GP;
314	ctxt->fi.error_code = `0`;
315	ctxt->fi.cr2 = `0`;
316	return ES_EXCEPTION;
317	}
318
319	if (!insn_decode_from_regs(insn: &ctxt->insn, regs: ctxt->regs, buf: buffer, buf_size: insn_bytes))
320	return ES_DECODE_FAILED;
321
322	if (ctxt->insn.immediate.got)
323	return ES_OK;
324	else
325	return ES_DECODE_FAILED;
326	}
327
328	static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
329	{
330	char buffer[MAX_INSN_SIZE];
331	int res, ret;
332
333	res = vc_fetch_insn_kernel(ctxt, buffer);
334	if (res) {
335	ctxt->fi.vector = X86_TRAP_PF;
336	ctxt->fi.error_code = X86_PF_INSTR;
337	ctxt->fi.cr2 = ctxt->regs->ip;
338	return ES_EXCEPTION;
339	}
340
341	ret = insn_decode(insn: &ctxt->insn, kaddr: buffer, MAX_INSN_SIZE, m: INSN_MODE_64);
342	if (ret < `0`)
343	return ES_DECODE_FAILED;
344	else
345	return ES_OK;
346	}
347
348	static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
349	{
350	if (user_mode(regs: ctxt->regs))
351	return __vc_decode_user_insn(ctxt);
352	else
353	return __vc_decode_kern_insn(ctxt);
354	}
355
356	static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
357	char dst, char* *buf, size_t size)
358	{
359	unsigned long error_code = X86_PF_PROT \| X86_PF_WRITE;
360
361	/*
362	* This function uses __put_user() independent of whether kernel or user
363	* memory is accessed. This works fine because __put_user() does no
364	* sanity checks of the pointer being accessed. All that it does is
365	* to report when the access failed.
366	*
367	* Also, this function runs in atomic context, so __put_user() is not
368	* allowed to sleep. The page-fault handler detects that it is running
369	* in atomic context and will not try to take mmap_sem and handle the
370	* fault, so additional pagefault_enable()/disable() calls are not
371	* needed.
372	*
373	* The access can't be done via copy_to_user() here because
374	* vc_write_mem() must not use string instructions to access unsafe
375	* memory. The reason is that MOVS is emulated by the #VC handler by
376	* splitting the move up into a read and a write and taking a nested #VC
377	* exception on whatever of them is the MMIO access. Using string
378	* instructions here would cause infinite nesting.
379	*/
380	switch (size) {
381	case `1`: {
382	u8 d1;
383	u8 __user target = (u8 __user )dst;
384
385	memcpy(&d1, buf, `1`);
386	if (__put_user(d1, target))
387	goto fault;
388	break;
389	}
390	case `2`: {
391	u16 d2;
392	u16 __user target = (u16 __user )dst;
393
394	memcpy(&d2, buf, `2`);
395	if (__put_user(d2, target))
396	goto fault;
397	break;
398	}
399	case `4`: {
400	u32 d4;
401	u32 __user target = (u32 __user )dst;
402
403	memcpy(&d4, buf, `4`);
404	if (__put_user(d4, target))
405	goto fault;
406	break;
407	}
408	case `8`: {
409	u64 d8;
410	u64 __user target = (u64 __user )dst;
411
412	memcpy(&d8, buf, `8`);
413	if (__put_user(d8, target))
414	goto fault;
415	break;
416	}
417	default:
418	WARN_ONCE(`1`, "%s: Invalid size: %zu\n", __func__, size);
419	return ES_UNSUPPORTED;
420	}
421
422	return ES_OK;
423
424	fault:
425	if (user_mode(regs: ctxt->regs))
426	error_code \|= X86_PF_USER;
427
428	ctxt->fi.vector = X86_TRAP_PF;
429	ctxt->fi.error_code = error_code;
430	ctxt->fi.cr2 = (unsigned long)dst;
431
432	return ES_EXCEPTION;
433	}
434
435	static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
436	char src, char* *buf, size_t size)
437	{
438	unsigned long error_code = X86_PF_PROT;
439
440	/*
441	* This function uses __get_user() independent of whether kernel or user
442	* memory is accessed. This works fine because __get_user() does no
443	* sanity checks of the pointer being accessed. All that it does is
444	* to report when the access failed.
445	*
446	* Also, this function runs in atomic context, so __get_user() is not
447	* allowed to sleep. The page-fault handler detects that it is running
448	* in atomic context and will not try to take mmap_sem and handle the
449	* fault, so additional pagefault_enable()/disable() calls are not
450	* needed.
451	*
452	* The access can't be done via copy_from_user() here because
453	* vc_read_mem() must not use string instructions to access unsafe
454	* memory. The reason is that MOVS is emulated by the #VC handler by
455	* splitting the move up into a read and a write and taking a nested #VC
456	* exception on whatever of them is the MMIO access. Using string
457	* instructions here would cause infinite nesting.
458	*/
459	switch (size) {
460	case `1`: {
461	u8 d1;
462	u8 __user s = (u8 __user )src;
463
464	if (__get_user(d1, s))
465	goto fault;
466	memcpy(buf, &d1, `1`);
467	break;
468	}
469	case `2`: {
470	u16 d2;
471	u16 __user s = (u16 __user )src;
472
473	if (__get_user(d2, s))
474	goto fault;
475	memcpy(buf, &d2, `2`);
476	break;
477	}
478	case `4`: {
479	u32 d4;
480	u32 __user s = (u32 __user )src;
481
482	if (__get_user(d4, s))
483	goto fault;
484	memcpy(buf, &d4, `4`);
485	break;
486	}
487	case `8`: {
488	u64 d8;
489	u64 __user s = (u64 __user )src;
490	if (__get_user(d8, s))
491	goto fault;
492	memcpy(buf, &d8, `8`);
493	break;
494	}
495	default:
496	WARN_ONCE(`1`, "%s: Invalid size: %zu\n", __func__, size);
497	return ES_UNSUPPORTED;
498	}
499
500	return ES_OK;
501
502	fault:
503	if (user_mode(regs: ctxt->regs))
504	error_code \|= X86_PF_USER;
505
506	ctxt->fi.vector = X86_TRAP_PF;
507	ctxt->fi.error_code = error_code;
508	ctxt->fi.cr2 = (unsigned long)src;
509
510	return ES_EXCEPTION;
511	}
512
513	static enum es_result vc_slow_virt_to_phys(struct ghcb ghcb, struct* es_em_ctxt *ctxt,
514	unsigned long vaddr, phys_addr_t *paddr)
515	{
516	unsigned long va = (unsigned long)vaddr;
517	unsigned int level;
518	phys_addr_t pa;
519	pgd_t *pgd;
520	pte_t *pte;
521
522	pgd = __va(read_cr3_pa());
523	pgd = &pgd[pgd_index(va)];
524	pte = lookup_address_in_pgd(pgd, address: va, level: &level);
525	if (!pte) {
526	ctxt->fi.vector = X86_TRAP_PF;
527	ctxt->fi.cr2 = vaddr;
528	ctxt->fi.error_code = `0`;
529
530	if (user_mode(regs: ctxt->regs))
531	ctxt->fi.error_code \|= X86_PF_USER;
532
533	return ES_EXCEPTION;
534	}
535
536	if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
537	/ Emulated MMIO to/from encrypted memory not supported /
538	return ES_UNSUPPORTED;
539
540	pa = (phys_addr_t)pte_pfn(pte: *pte) << PAGE_SHIFT;
541	pa \|= va & ~page_level_mask(level);
542
543	*paddr = pa;
544
545	return ES_OK;
546	}
547
548	static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
549	{
550	BUG_ON(size > `4`);
551
552	if (user_mode(regs: ctxt->regs)) {
553	struct thread_struct *t = &current->thread;
554	struct io_bitmap *iobm = t->io_bitmap;
555	size_t idx;
556
557	if (!iobm)
558	goto fault;
559
560	for (idx = port; idx < port + size; ++idx) {
561	if (test_bit(idx, iobm->bitmap))
562	goto fault;
563	}
564	}
565
566	return ES_OK;
567
568	fault:
569	ctxt->fi.vector = X86_TRAP_GP;
570	ctxt->fi.error_code = `0`;
571
572	return ES_EXCEPTION;
573	}
574
/* Include code shared with pre-decompression boot stage */
576	#include "sev-shared.c"
577
578	static noinstr void __sev_put_ghcb(struct ghcb_state *state)
579	{
580	struct sev_es_runtime_data *data;
581	struct ghcb *ghcb;
582
583	WARN_ON(!irqs_disabled());
584
585	data = this_cpu_read(runtime_data);
586	ghcb = &data->ghcb_page;
587
588	if (state->ghcb) {
589	/ Restore GHCB from Backup /
590	ghcb = state->ghcb;
591	data->backup_ghcb_active = false;
592	state->ghcb = NULL;
593	} else {
594	/*
595	* Invalidate the GHCB so a VMGEXIT instruction issued
596	* from userspace won't appear to be valid.
597	*/
598	vc_ghcb_invalidate(ghcb);
599	data->ghcb_active = false;
600	}
601	}
602
603	void noinstr __sev_es_nmi_complete(void)
604	{
605	struct ghcb_state state;
606	struct ghcb *ghcb;
607
608	ghcb = __sev_get_ghcb(state: &state);
609
610	vc_ghcb_invalidate(ghcb);
611	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
612	ghcb_set_sw_exit_info_1(ghcb, value: `0`);
613	ghcb_set_sw_exit_info_2(ghcb, value: `0`);
614
615	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
616	VMGEXIT();
617
618	__sev_put_ghcb(state: &state);
619	}
620
621	static u64 __init get_secrets_page(void)
622	{
623	u64 pa_data = boot_params.cc_blob_address;
624	struct cc_blob_sev_info info;
625	void *map;
626
627	/*
628	* The CC blob contains the address of the secrets page, check if the
629	* blob is present.
630	*/
631	if (!pa_data)
632	return `0`;
633
634	map = early_memremap(phys_addr: pa_data, size: sizeof(info));
635	if (!map) {
636	pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
637	return `0`;
638	}
639	memcpy(&info, map, sizeof(info));
640	early_memunmap(addr: map, size: sizeof(info));
641
642	/ smoke-test the secrets page passed /
643	if (!info.secrets_phys \|\| info.secrets_len != PAGE_SIZE)
644	return `0`;
645
646	return info.secrets_phys;
647	}
648
649	static u64 __init get_snp_jump_table_addr(void)
650	{
651	struct snp_secrets_page_layout *layout;
652	void __iomem *mem;
653	u64 pa, addr;
654
655	pa = get_secrets_page();
656	if (!pa)
657	return `0`;
658
659	mem = ioremap_encrypted(phys_addr: pa, PAGE_SIZE);
660	if (!mem) {
661	pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
662	return `0`;
663	}
664
665	layout = (__force struct snp_secrets_page_layout *)mem;
666
667	addr = layout->os_area.ap_jump_table_pa;
668	iounmap(addr: mem);
669
670	return addr;
671	}
672
673	static u64 __init get_jump_table_addr(void)
674	{
675	struct ghcb_state state;
676	unsigned long flags;
677	struct ghcb *ghcb;
678	u64 ret = `0`;
679
680	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
681	return get_snp_jump_table_addr();
682
683	local_irq_save(flags);
684
685	ghcb = __sev_get_ghcb(state: &state);
686
687	vc_ghcb_invalidate(ghcb);
688	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
689	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
690	ghcb_set_sw_exit_info_2(ghcb, value: `0`);
691
692	sev_es_wr_ghcb_msr(__pa(ghcb));
693	VMGEXIT();
694
695	if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
696	ghcb_sw_exit_info_2_is_valid(ghcb))
697	ret = ghcb->save.sw_exit_info_2;
698
699	__sev_put_ghcb(state: &state);
700
701	local_irq_restore(flags);
702
703	return ret;
704	}
705
706	static void __head
707	early_set_pages_state(unsigned long vaddr, unsigned long paddr,
708	unsigned long npages, enum psc_op op)
709	{
710	unsigned long paddr_end;
711	u64 val;
712	int ret;
713
714	vaddr = vaddr & PAGE_MASK;
715
716	paddr = paddr & PAGE_MASK;
717	paddr_end = paddr + (npages << PAGE_SHIFT);
718
719	while (paddr < paddr_end) {
720	if (op == SNP_PAGE_STATE_SHARED) {
721	/ Page validation must be rescinded before changing to shared /
722	ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate: false);
723	if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
724	goto e_term;
725	}
726
727	/*
728	* Use the MSR protocol because this function can be called before
729	* the GHCB is established.
730	*/
731	sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
732	VMGEXIT();
733
734	val = sev_es_rd_ghcb_msr();
735
736	if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
737	"Wrong PSC response code: 0x%x\n",
738	(unsigned int)GHCB_RESP_CODE(val)))
739	goto e_term;
740
741	if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
742	"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
743	op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
744	paddr, GHCB_MSR_PSC_RESP_VAL(val)))
745	goto e_term;
746
747	if (op == SNP_PAGE_STATE_PRIVATE) {
748	/ Page validation must be performed after changing to private /
749	ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate: true);
750	if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
751	goto e_term;
752	}
753
754	vaddr += PAGE_SIZE;
755	paddr += PAGE_SIZE;
756	}
757
758	return;
759
760	e_term:
761	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
762	}
763
764	void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
765	unsigned long npages)
766	{
767	/*
768	* This can be invoked in early boot while running identity mapped, so
769	* use an open coded check for SNP instead of using cc_platform_has().
770	* This eliminates worries about jump tables or checking boot_cpu_data
771	* in the cc_platform_has() function.
772	*/
773	if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
774	return;
775
776	/*
777	* Ask the hypervisor to mark the memory pages as private in the RMP
778	* table.
779	*/
780	early_set_pages_state(vaddr, paddr, npages, op: SNP_PAGE_STATE_PRIVATE);
781	}
782
783	void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
784	unsigned long npages)
785	{
786	/*
787	* This can be invoked in early boot while running identity mapped, so
788	* use an open coded check for SNP instead of using cc_platform_has().
789	* This eliminates worries about jump tables or checking boot_cpu_data
790	* in the cc_platform_has() function.
791	*/
792	if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
793	return;
794
795	/ Ask hypervisor to mark the memory pages shared in the RMP table. /
796	early_set_pages_state(vaddr, paddr, npages, op: SNP_PAGE_STATE_SHARED);
797	}
798
799	static unsigned long __set_pages_state(struct snp_psc_desc data, unsigned* long vaddr,
800	unsigned long vaddr_end, int op)
801	{
802	struct ghcb_state state;
803	bool use_large_entry;
804	struct psc_hdr *hdr;
805	struct psc_entry *e;
806	unsigned long flags;
807	unsigned long pfn;
808	struct ghcb *ghcb;
809	int i;
810
811	hdr = &data->hdr;
812	e = data->entries;
813
814	memset(data, `0`, sizeof(*data));
815	i = `0`;
816
817	while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
818	hdr->end_entry = i;
819
820	if (is_vmalloc_addr(x: (void *)vaddr)) {
821	pfn = vmalloc_to_pfn(addr: (void *)vaddr);
822	use_large_entry = false;
823	} else {
824	pfn = __pa(vaddr) >> PAGE_SHIFT;
825	use_large_entry = true;
826	}
827
828	e->gfn = pfn;
829	e->operation = op;
830
831	if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
832	(vaddr_end - vaddr) >= PMD_SIZE) {
833	e->pagesize = RMP_PG_SIZE_2M;
834	vaddr += PMD_SIZE;
835	} else {
836	e->pagesize = RMP_PG_SIZE_4K;
837	vaddr += PAGE_SIZE;
838	}
839
840	e++;
841	i++;
842	}
843
844	/ Page validation must be rescinded before changing to shared /
845	if (op == SNP_PAGE_STATE_SHARED)
846	pvalidate_pages(desc: data);
847
848	local_irq_save(flags);
849
850	if (sev_cfg.ghcbs_initialized)
851	ghcb = __sev_get_ghcb(state: &state);
852	else
853	ghcb = boot_ghcb;
854
855	/ Invoke the hypervisor to perform the page state changes /
856	if (!ghcb \|\| vmgexit_psc(ghcb, desc: data))
857	sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
858
859	if (sev_cfg.ghcbs_initialized)
860	__sev_put_ghcb(state: &state);
861
862	local_irq_restore(flags);
863
864	/ Page validation must be performed after changing to private /
865	if (op == SNP_PAGE_STATE_PRIVATE)
866	pvalidate_pages(desc: data);
867
868	return vaddr;
869	}
870
871	static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
872	{
873	struct snp_psc_desc desc;
874	unsigned long vaddr_end;
875
876	/ Use the MSR protocol when a GHCB is not available. /
877	if (!boot_ghcb)
878	return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
879
880	vaddr = vaddr & PAGE_MASK;
881	vaddr_end = vaddr + (npages << PAGE_SHIFT);
882
883	while (vaddr < vaddr_end)
884	vaddr = __set_pages_state(data: &desc, vaddr, vaddr_end, op);
885	}
886
887	void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
888	{
889	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
890	return;
891
892	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_SHARED);
893	}
894
895	void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
896	{
897	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
898	return;
899
900	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_PRIVATE);
901	}
902
903	void snp_accept_memory(phys_addr_t start, phys_addr_t end)
904	{
905	unsigned long vaddr, npages;
906
907	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
908	return;
909
910	vaddr = (unsigned long)__va(start);
911	npages = (end - start) >> PAGE_SHIFT;
912
913	set_pages_state(vaddr, npages, op: SNP_PAGE_STATE_PRIVATE);
914	}
915
916	static int snp_set_vmsa(void *va, bool vmsa)
917	{
918	u64 attrs;
919
920	/*
921	* Running at VMPL0 allows the kernel to change the VMSA bit for a page
922	* using the RMPADJUST instruction. However, for the instruction to
923	* succeed it must target the permissions of a lesser privileged
924	* (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
925	* instruction in the AMD64 APM Volume 3).
926	*/
927	attrs = `1`;
928	if (vmsa)
929	attrs \|= RMPADJUST_VMSA_PAGE_BIT;
930
931	return rmpadjust(vaddr: (unsigned long)va, RMP_PG_SIZE_4K, attrs);
932	}
933
/* Segment attribute values used when initializing a new AP VMSA */
#define __ATTR_BASE		(SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
#define INIT_CS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
#define INIT_DS_ATTRIBS		(__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)

#define INIT_LDTR_ATTRIBS	(SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS		(SVM_SELECTOR_P_MASK | 3)
940
941	static void snp_alloc_vmsa_page(void*)
942	{
943	struct page *p;
944
945	/*
946	* Allocate VMSA page to work around the SNP erratum where the CPU will
947	* incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
948	* collides with the RMP entry of VMSA page. The recommended workaround
949	* is to not use a large page.
950	*
951	* Allocate an 8k page which is also 8k-aligned.
952	*/
953	p = alloc_pages(GFP_KERNEL_ACCOUNT \| __GFP_ZERO, order: `1`);
954	if (!p)
955	return NULL;
956
957	split_page(page: p, order: `1`);
958
959	/ Free the first 4k. This page may be 2M/1G aligned and cannot be used. /
960	__free_page(p);
961
962	return page_address(p + `1`);
963	}
964
965	static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
966	{
967	int err;
968
969	err = snp_set_vmsa(va: vmsa, vmsa: false);
970	if (err)
971	pr_err("clear VMSA page failed (%u), leaking page\n", err);
972	else
973	free_page((unsigned long)vmsa);
974	}
975
976	static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
977	{
978	struct sev_es_save_area cur_vmsa, vmsa;
979	struct ghcb_state state;
980	unsigned long flags;
981	struct ghcb *ghcb;
982	u8 sipi_vector;
983	int cpu, ret;
984	u64 cr4;
985
986	/*
987	* The hypervisor SNP feature support check has happened earlier, just check
988	* the AP_CREATION one here.
989	*/
990	if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
991	return -EOPNOTSUPP;
992
993	/*
994	* Verify the desired start IP against the known trampoline start IP
995	* to catch any future new trampolines that may be introduced that
996	* would require a new protected guest entry point.
997	*/
998	if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
999	"Unsupported SNP start_ip: %lx\n", start_ip))
1000	return -EINVAL;
1001
1002	/ Override start_ip with known protected guest start IP /
1003	start_ip = real_mode_header->sev_es_trampoline_start;
1004
1005	/ Find the logical CPU for the APIC ID /
1006	for_each_present_cpu(cpu) {
1007	if (arch_match_cpu_phys_id(cpu, phys_id: apic_id))
1008	break;
1009	}
1010	if (cpu >= nr_cpu_ids)
1011	return -EINVAL;
1012
1013	cur_vmsa = per_cpu(sev_vmsa, cpu);
1014
1015	/*
1016	* A new VMSA is created each time because there is no guarantee that
1017	* the current VMSA is the kernels or that the vCPU is not running. If
1018	* an attempt was done to use the current VMSA with a running vCPU, a
1019	* #VMEXIT of that vCPU would wipe out all of the settings being done
1020	* here.
1021	*/
1022	vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
1023	if (!vmsa)
1024	return -ENOMEM;
1025
1026	/ CR4 should maintain the MCE value /
1027	cr4 = native_read_cr4() & X86_CR4_MCE;
1028
1029	/ Set the CS value based on the start_ip converted to a SIPI vector /
1030	sipi_vector = (start_ip >> `12`);
1031	vmsa->cs.base = sipi_vector << `12`;
1032	vmsa->cs.limit = AP_INIT_CS_LIMIT;
1033	vmsa->cs.attrib = INIT_CS_ATTRIBS;
1034	vmsa->cs.selector = sipi_vector << `8`;
1035
1036	/ Set the RIP value based on start_ip /
1037	vmsa->rip = start_ip & `0xfff`;
1038
1039	/ Set AP INIT defaults as documented in the APM /
1040	vmsa->ds.limit = AP_INIT_DS_LIMIT;
1041	vmsa->ds.attrib = INIT_DS_ATTRIBS;
1042	vmsa->es = vmsa->ds;
1043	vmsa->fs = vmsa->ds;
1044	vmsa->gs = vmsa->ds;
1045	vmsa->ss = vmsa->ds;
1046
1047	vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
1048	vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
1049	vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
1050	vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
1051	vmsa->tr.limit = AP_INIT_TR_LIMIT;
1052	vmsa->tr.attrib = INIT_TR_ATTRIBS;
1053
1054	vmsa->cr4 = cr4;
1055	vmsa->cr0 = AP_INIT_CR0_DEFAULT;
1056	vmsa->dr7 = DR7_RESET_VALUE;
1057	vmsa->dr6 = AP_INIT_DR6_DEFAULT;
1058	vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
1059	vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
1060	vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
1061	vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
1062	vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
1063	vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
1064
1065	/ SVME must be set. /
1066	vmsa->efer = EFER_SVME;
1067
1068	/*
1069	* Set the SNP-specific fields for this VMSA:
1070	* VMPL level
1071	* SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
1072	*/
1073	vmsa->vmpl = `0`;
1074	vmsa->sev_features = sev_status >> `2`;
1075
1076	/ Switch the page over to a VMSA page now that it is initialized /
1077	ret = snp_set_vmsa(va: vmsa, vmsa: true);
1078	if (ret) {
1079	pr_err("set VMSA page failed (%u)\n", ret);
1080	free_page((unsigned long)vmsa);
1081
1082	return -EINVAL;
1083	}
1084
1085	/ Issue VMGEXIT AP Creation NAE event /
1086	local_irq_save(flags);
1087
1088	ghcb = __sev_get_ghcb(state: &state);
1089
1090	vc_ghcb_invalidate(ghcb);
1091	ghcb_set_rax(ghcb, value: vmsa->sev_features);
1092	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
1093	ghcb_set_sw_exit_info_1(ghcb, value: ((u64)apic_id << `32`) \| SVM_VMGEXIT_AP_CREATE);
1094	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
1095
1096	sev_es_wr_ghcb_msr(__pa(ghcb));
1097	VMGEXIT();
1098
1099	if (!ghcb_sw_exit_info_1_is_valid(ghcb) \|\|
1100	lower_32_bits(ghcb->save.sw_exit_info_1)) {
1101	pr_err("SNP AP Creation error\n");
1102	ret = -EINVAL;
1103	}
1104
1105	__sev_put_ghcb(state: &state);
1106
1107	local_irq_restore(flags);
1108
1109	/ Perform cleanup if there was an error /
1110	if (ret) {
1111	snp_cleanup_vmsa(vmsa);
1112	vmsa = NULL;
1113	}
1114
1115	/ Free up any previous VMSA page /
1116	if (cur_vmsa)
1117	snp_cleanup_vmsa(vmsa: cur_vmsa);
1118
1119	/ Record the current VMSA page /
1120	per_cpu(sev_vmsa, cpu) = vmsa;
1121
1122	return ret;
1123	}
1124
1125	void __init snp_set_wakeup_secondary_cpu(void)
1126	{
1127	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1128	return;
1129
1130	/*
1131	* Always set this override if SNP is enabled. This makes it the
1132	* required method to start APs under SNP. If the hypervisor does
1133	* not support AP creation, then no APs will be started.
1134	*/
1135	apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
1136	}
1137
1138	int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
1139	{
1140	u16 startup_cs, startup_ip;
1141	phys_addr_t jump_table_pa;
1142	u64 jump_table_addr;
1143	u16 __iomem *jump_table;
1144
1145	jump_table_addr = get_jump_table_addr();
1146
1147	/ On UP guests there is no jump table so this is not a failure /
1148	if (!jump_table_addr)
1149	return `0`;
1150
1151	/ Check if AP Jump Table is page-aligned /
1152	if (jump_table_addr & ~PAGE_MASK)
1153	return -EINVAL;
1154
1155	jump_table_pa = jump_table_addr & PAGE_MASK;
1156
1157	startup_cs = (u16)(rmh->trampoline_start >> `4`);
1158	startup_ip = (u16)(rmh->sev_es_trampoline_start -
1159	rmh->trampoline_start);
1160
1161	jump_table = ioremap_encrypted(phys_addr: jump_table_pa, PAGE_SIZE);
1162	if (!jump_table)
1163	return -EIO;
1164
1165	writew(val: startup_ip, addr: &jump_table[`0`]);
1166	writew(val: startup_cs, addr: &jump_table[`1`]);
1167
1168	iounmap(addr: jump_table);
1169
1170	return `0`;
1171	}
1172
1173	/*
1174	* This is needed by the OVMF UEFI firmware which will use whatever it finds in
1175	* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
1176	* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
1177	*/
1178	int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
1179	{
1180	struct sev_es_runtime_data *data;
1181	unsigned long address, pflags;
1182	int cpu;
1183	u64 pfn;
1184
1185	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1186	return `0`;
1187
1188	pflags = _PAGE_NX \| _PAGE_RW;
1189
1190	for_each_possible_cpu(cpu) {
1191	data = per_cpu(runtime_data, cpu);
1192
1193	address = __pa(&data->ghcb_page);
1194	pfn = address >> PAGE_SHIFT;
1195
1196	if (kernel_map_pages_in_pgd(pgd, pfn, address, numpages: `1`, page_flags: pflags))
1197	return `1`;
1198	}
1199
1200	return `0`;
1201	}
1202
1203	static enum es_result vc_handle_msr(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1204	{
1205	struct pt_regs *regs = ctxt->regs;
1206	enum es_result ret;
1207	u64 exit_info_1;
1208
1209	/ Is it a WRMSR? /
1210	exit_info_1 = (ctxt->insn.opcode.bytes[`1`] == `0x30`) ? `1` : `0`;
1211
1212	ghcb_set_rcx(ghcb, value: regs->cx);
1213	if (exit_info_1) {
1214	ghcb_set_rax(ghcb, value: regs->ax);
1215	ghcb_set_rdx(ghcb, value: regs->dx);
1216	}
1217
1218	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, exit_info_2: `0`);
1219
1220	if ((ret == ES_OK) && (!exit_info_1)) {
1221	regs->ax = ghcb->save.rax;
1222	regs->dx = ghcb->save.rdx;
1223	}
1224
1225	return ret;
1226	}
1227
1228	static void snp_register_per_cpu_ghcb(void)
1229	{
1230	struct sev_es_runtime_data *data;
1231	struct ghcb *ghcb;
1232
1233	data = this_cpu_read(runtime_data);
1234	ghcb = &data->ghcb_page;
1235
1236	snp_register_ghcb_early(__pa(ghcb));
1237	}
1238
1239	void setup_ghcb(void)
1240	{
1241	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1242	return;
1243
1244	/*
1245	* Check whether the runtime #VC exception handler is active. It uses
1246	* the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
1247	*
1248	* If SNP is active, register the per-CPU GHCB page so that the runtime
1249	* exception handler can use it.
1250	*/
1251	if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
1252	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1253	snp_register_per_cpu_ghcb();
1254
1255	sev_cfg.ghcbs_initialized = true;
1256
1257	return;
1258	}
1259
1260	/*
1261	* Make sure the hypervisor talks a supported protocol.
1262	* This gets called only in the BSP boot phase.
1263	*/
1264	if (!sev_es_negotiate_protocol())
1265	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1266
1267	/*
1268	* Clear the boot_ghcb. The first exception comes in before the bss
1269	* section is cleared.
1270	*/
1271	memset(&boot_ghcb_page, `0`, PAGE_SIZE);
1272
1273	/ Alright - Make the boot-ghcb public /
1274	boot_ghcb = &boot_ghcb_page;
1275
1276	/ SNP guest requires that GHCB GPA must be registered. /
1277	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
1278	snp_register_ghcb_early(__pa(&boot_ghcb_page));
1279	}
1280
1281	#ifdef CONFIG_HOTPLUG_CPU
1282	static void sev_es_ap_hlt_loop(void)
1283	{
1284	struct ghcb_state state;
1285	struct ghcb *ghcb;
1286
1287	ghcb = __sev_get_ghcb(state: &state);
1288
1289	while (true) {
1290	vc_ghcb_invalidate(ghcb);
1291	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
1292	ghcb_set_sw_exit_info_1(ghcb, value: `0`);
1293	ghcb_set_sw_exit_info_2(ghcb, value: `0`);
1294
1295	sev_es_wr_ghcb_msr(__pa(ghcb));
1296	VMGEXIT();
1297
1298	/ Wakeup signal? /
1299	if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
1300	ghcb->save.sw_exit_info_2)
1301	break;
1302	}
1303
1304	__sev_put_ghcb(state: &state);
1305	}
1306
/*
 * Play_dead handler when running under SEV-ES. This is needed because
 * the hypervisor can't deliver an SIPI request to restart the AP.
 * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
 * hypervisor wakes it up again.
 */
static void sev_es_play_dead(void)
{
	play_dead_common();

	/* IRQs now disabled */

	sev_es_ap_hlt_loop();

	/*
	 * If we get here, the VCPU was woken up again. Jump to CPU
	 * startup code to get it back online.
	 */
	soft_restart_cpu();
}
1327	#else /* CONFIG_HOTPLUG_CPU */
1328	#define sev_es_play_dead native_play_dead
1329	#endif /* CONFIG_HOTPLUG_CPU */
1330
/*
 * Point smp_ops.play_dead at the SEV-ES handler. Without CONFIG_SMP this
 * is an empty stub.
 */
#ifndef CONFIG_SMP
static inline void sev_es_setup_play_dead(void) { }
#else
static void __init sev_es_setup_play_dead(void)
{
	smp_ops.play_dead = sev_es_play_dead;
}
#endif
1339
1340	static void __init alloc_runtime_data(int cpu)
1341	{
1342	struct sev_es_runtime_data *data;
1343
1344	data = memblock_alloc(size: sizeof(*data), PAGE_SIZE);
1345	if (!data)
1346	panic(fmt: "Can't allocate SEV-ES runtime data");
1347
1348	per_cpu(runtime_data, cpu) = data;
1349	}
1350
1351	static void __init init_ghcb(int cpu)
1352	{
1353	struct sev_es_runtime_data *data;
1354	int err;
1355
1356	data = per_cpu(runtime_data, cpu);
1357
1358	err = early_set_memory_decrypted(vaddr: (unsigned long)&data->ghcb_page,
1359	size: sizeof(data->ghcb_page));
1360	if (err)
1361	panic(fmt: "Can't map GHCBs unencrypted");
1362
1363	memset(&data->ghcb_page, `0`, sizeof(data->ghcb_page));
1364
1365	data->ghcb_active = false;
1366	data->backup_ghcb_active = false;
1367	}
1368
1369	void __init sev_es_init_vc_handling(void)
1370	{
1371	int cpu;
1372
1373	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
1374
1375	if (!cc_platform_has(attr: CC_ATTR_GUEST_STATE_ENCRYPT))
1376	return;
1377
1378	if (!sev_es_check_cpu_features())
1379	panic(fmt: "SEV-ES CPU Features missing");
1380
1381	/*
1382	* SNP is supported in v2 of the GHCB spec which mandates support for HV
1383	* features.
1384	*/
1385	if (cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP)) {
1386	sev_hv_features = get_hv_features();
1387
1388	if (!(sev_hv_features & GHCB_HV_FT_SNP))
1389	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
1390	}
1391
1392	/ Initialize per-cpu GHCB pages /
1393	for_each_possible_cpu(cpu) {
1394	alloc_runtime_data(cpu);
1395	init_ghcb(cpu);
1396	}
1397
1398	sev_es_setup_play_dead();
1399
1400	/ Secondary CPUs use the runtime #VC handler /
1401	initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
1402	}
1403
1404	static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
1405	{
1406	int trapnr = ctxt->fi.vector;
1407
1408	if (trapnr == X86_TRAP_PF)
1409	native_write_cr2(val: ctxt->fi.cr2);
1410
1411	ctxt->regs->orig_ax = ctxt->fi.error_code;
1412	do_early_exception(regs: ctxt->regs, trapnr);
1413	}
1414
1415	static long vc_insn_get_rm(struct* es_em_ctxt *ctxt)
1416	{
1417	long *reg_array;
1418	int offset;
1419
1420	reg_array = (long *)ctxt->regs;
1421	offset = insn_get_modrm_rm_off(insn: &ctxt->insn, regs: ctxt->regs);
1422
1423	if (offset < `0`)
1424	return NULL;
1425
1426	offset /= sizeof(long);
1427
1428	return reg_array + offset;
1429	}
1430	static enum es_result vc_do_mmio(struct ghcb ghcb, struct* es_em_ctxt *ctxt,
1431	unsigned int bytes, bool read)
1432	{
1433	u64 exit_code, exit_info_1, exit_info_2;
1434	unsigned long ghcb_pa = __pa(ghcb);
1435	enum es_result res;
1436	phys_addr_t paddr;
1437	void __user *ref;
1438
1439	ref = insn_get_addr_ref(insn: &ctxt->insn, regs: ctxt->regs);
1440	if (ref == (void __user *)-`1L`)
1441	return ES_UNSUPPORTED;
1442
1443	exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
1444
1445	res = vc_slow_virt_to_phys(ghcb, ctxt, vaddr: (unsigned long)ref, paddr: &paddr);
1446	if (res != ES_OK) {
1447	if (res == ES_EXCEPTION && !read)
1448	ctxt->fi.error_code \|= X86_PF_WRITE;
1449
1450	return res;
1451	}
1452
1453	exit_info_1 = paddr;
1454	/ Can never be greater than 8 /
1455	exit_info_2 = bytes;
1456
1457	ghcb_set_sw_scratch(ghcb, value: ghcb_pa + offsetof(struct ghcb, shared_buffer));
1458
1459	return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
1460	}
1461
1462	/*
1463	* The MOVS instruction has two memory operands, which raises the
1464	* problem that it is not known whether the access to the source or the
1465	* destination caused the #VC exception (and hence whether an MMIO read
1466	* or write operation needs to be emulated).
1467	*
1468	* Instead of playing games with walking page-tables and trying to guess
1469	* whether the source or destination is an MMIO range, split the move
1470	* into two operations, a read and a write with only one memory operand.
1471	* This will cause a nested #VC exception on the MMIO address which can
1472	* then be handled.
1473	*
1474	* This implementation has the benefit that it also supports MOVS where
1475	* source _and_ destination are MMIO regions.
1476	*
1477	* It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
1478	* rare operation. If it turns out to be a performance problem the split
1479	* operations can be moved to memcpy_fromio() and memcpy_toio().
1480	*/
1481	static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
1482	unsigned int bytes)
1483	{
1484	unsigned long ds_base, es_base;
1485	unsigned char src, dst;
1486	unsigned char buffer[`8`];
1487	enum es_result ret;
1488	bool rep;
1489	int off;
1490
1491	ds_base = insn_get_seg_base(regs: ctxt->regs, INAT_SEG_REG_DS);
1492	es_base = insn_get_seg_base(regs: ctxt->regs, INAT_SEG_REG_ES);
1493
1494	if (ds_base == -`1L` \|\| es_base == -`1L`) {
1495	ctxt->fi.vector = X86_TRAP_GP;
1496	ctxt->fi.error_code = `0`;
1497	return ES_EXCEPTION;
1498	}
1499
1500	src = ds_base + (unsigned char *)ctxt->regs->si;
1501	dst = es_base + (unsigned char *)ctxt->regs->di;
1502
1503	ret = vc_read_mem(ctxt, src, buf: buffer, size: bytes);
1504	if (ret != ES_OK)
1505	return ret;
1506
1507	ret = vc_write_mem(ctxt, dst, buf: buffer, size: bytes);
1508	if (ret != ES_OK)
1509	return ret;
1510
1511	if (ctxt->regs->flags & X86_EFLAGS_DF)
1512	off = -bytes;
1513	else
1514	off = bytes;
1515
1516	ctxt->regs->si += off;
1517	ctxt->regs->di += off;
1518
1519	rep = insn_has_rep_prefix(insn: &ctxt->insn);
1520	if (rep)
1521	ctxt->regs->cx -= `1`;
1522
1523	if (!rep \|\| ctxt->regs->cx == `0`)
1524	return ES_OK;
1525	else
1526	return ES_RETRY;
1527	}
1528
1529	static enum es_result vc_handle_mmio(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1530	{
1531	struct insn *insn = &ctxt->insn;
1532	enum insn_mmio_type mmio;
1533	unsigned int bytes = `0`;
1534	enum es_result ret;
1535	u8 sign_byte;
1536	long *reg_data;
1537
1538	mmio = insn_decode_mmio(insn, bytes: &bytes);
1539	if (mmio == INSN_MMIO_DECODE_FAILED)
1540	return ES_DECODE_FAILED;
1541
1542	if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
1543	reg_data = insn_get_modrm_reg_ptr(insn, regs: ctxt->regs);
1544	if (!reg_data)
1545	return ES_DECODE_FAILED;
1546	}
1547
1548	if (user_mode(regs: ctxt->regs))
1549	return ES_UNSUPPORTED;
1550
1551	switch (mmio) {
1552	case INSN_MMIO_WRITE:
1553	memcpy(ghcb->shared_buffer, reg_data, bytes);
1554	ret = vc_do_mmio(ghcb, ctxt, bytes, read: false);
1555	break;
1556	case INSN_MMIO_WRITE_IMM:
1557	memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
1558	ret = vc_do_mmio(ghcb, ctxt, bytes, read: false);
1559	break;
1560	case INSN_MMIO_READ:
1561	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1562	if (ret)
1563	break;
1564
1565	/ Zero-extend for 32-bit operation /
1566	if (bytes == `4`)
1567	*reg_data = `0`;
1568
1569	memcpy(reg_data, ghcb->shared_buffer, bytes);
1570	break;
1571	case INSN_MMIO_READ_ZERO_EXTEND:
1572	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1573	if (ret)
1574	break;
1575
1576	/ Zero extend based on operand size /
1577	memset(reg_data, `0`, insn->opnd_bytes);
1578	memcpy(reg_data, ghcb->shared_buffer, bytes);
1579	break;
1580	case INSN_MMIO_READ_SIGN_EXTEND:
1581	ret = vc_do_mmio(ghcb, ctxt, bytes, read: true);
1582	if (ret)
1583	break;
1584
1585	if (bytes == `1`) {
1586	u8 val = (u8 )ghcb->shared_buffer;
1587
1588	sign_byte = (*val & `0x80`) ? `0xff` : `0x00`;
1589	} else {
1590	u16 val = (u16 )ghcb->shared_buffer;
1591
1592	sign_byte = (*val & `0x8000`) ? `0xff` : `0x00`;
1593	}
1594
1595	/ Sign extend based on operand size /
1596	memset(reg_data, sign_byte, insn->opnd_bytes);
1597	memcpy(reg_data, ghcb->shared_buffer, bytes);
1598	break;
1599	case INSN_MMIO_MOVS:
1600	ret = vc_handle_mmio_movs(ctxt, bytes);
1601	break;
1602	default:
1603	ret = ES_UNSUPPORTED;
1604	break;
1605	}
1606
1607	return ret;
1608	}
1609
1610	static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
1611	struct es_em_ctxt *ctxt)
1612	{
1613	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1614	long val, *reg = vc_insn_get_rm(ctxt);
1615	enum es_result ret;
1616
1617	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1618	return ES_VMM_ERROR;
1619
1620	if (!reg)
1621	return ES_DECODE_FAILED;
1622
1623	val = *reg;
1624
1625	/ Upper 32 bits must be written as zeroes /
1626	if (val >> `32`) {
1627	ctxt->fi.vector = X86_TRAP_GP;
1628	ctxt->fi.error_code = `0`;
1629	return ES_EXCEPTION;
1630	}
1631
1632	/ Clear out other reserved bits and set bit 10 /
1633	val = (val & `0xffff23ffL`) \| BIT(`10`);
1634
1635	/ Early non-zero writes to DR7 are not supported /
1636	if (!data && (val & ~DR7_RESET_VALUE))
1637	return ES_UNSUPPORTED;
1638
1639	/ Using a value of 0 for ExitInfo1 means RAX holds the value /
1640	ghcb_set_rax(ghcb, value: val);
1641	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, exit_info_1: `0`, exit_info_2: `0`);
1642	if (ret != ES_OK)
1643	return ret;
1644
1645	if (data)
1646	data->dr7 = val;
1647
1648	return ES_OK;
1649	}
1650
1651	static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
1652	struct es_em_ctxt *ctxt)
1653	{
1654	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
1655	long *reg = vc_insn_get_rm(ctxt);
1656
1657	if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
1658	return ES_VMM_ERROR;
1659
1660	if (!reg)
1661	return ES_DECODE_FAILED;
1662
1663	if (data)
1664	*reg = data->dr7;
1665	else
1666	*reg = DR7_RESET_VALUE;
1667
1668	return ES_OK;
1669	}
1670
1671	static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
1672	struct es_em_ctxt *ctxt)
1673	{
1674	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, exit_info_1: `0`, exit_info_2: `0`);
1675	}
1676
1677	static enum es_result vc_handle_rdpmc(struct ghcb ghcb, struct* es_em_ctxt *ctxt)
1678	{
1679	enum es_result ret;
1680
1681	ghcb_set_rcx(ghcb, value: ctxt->regs->cx);
1682
1683	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, exit_info_1: `0`, exit_info_2: `0`);
1684	if (ret != ES_OK)
1685	return ret;
1686
1687	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
1688	return ES_VMM_ERROR;
1689
1690	ctxt->regs->ax = ghcb->save.rax;
1691	ctxt->regs->dx = ghcb->save.rdx;
1692
1693	return ES_OK;
1694	}
1695
1696	static enum es_result vc_handle_monitor(struct ghcb *ghcb,
1697	struct es_em_ctxt *ctxt)
1698	{
1699	/*
1700	* Treat it as a NOP and do not leak a physical address to the
1701	* hypervisor.
1702	*/
1703	return ES_OK;
1704	}
1705
1706	static enum es_result vc_handle_mwait(struct ghcb *ghcb,
1707	struct es_em_ctxt *ctxt)
1708	{
1709	/ Treat the same as MONITOR/MONITORX /
1710	return ES_OK;
1711	}
1712
1713	static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
1714	struct es_em_ctxt *ctxt)
1715	{
1716	enum es_result ret;
1717
1718	ghcb_set_rax(ghcb, value: ctxt->regs->ax);
1719	ghcb_set_cpl(ghcb, value: user_mode(regs: ctxt->regs) ? `3` : `0`);
1720
1721	if (x86_platform.hyper.sev_es_hcall_prepare)
1722	x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
1723
1724	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, exit_info_1: `0`, exit_info_2: `0`);
1725	if (ret != ES_OK)
1726	return ret;
1727
1728	if (!ghcb_rax_is_valid(ghcb))
1729	return ES_VMM_ERROR;
1730
1731	ctxt->regs->ax = ghcb->save.rax;
1732
1733	/*
1734	* Call sev_es_hcall_finish() after regs->ax is already set.
1735	* This allows the hypervisor handler to overwrite it again if
1736	* necessary.
1737	*/
1738	if (x86_platform.hyper.sev_es_hcall_finish &&
1739	!x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
1740	return ES_VMM_ERROR;
1741
1742	return ES_OK;
1743	}
1744
1745	static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
1746	struct es_em_ctxt *ctxt)
1747	{
1748	/*
1749	* Calling ecx_alignment_check() directly does not work, because it
1750	* enables IRQs and the GHCB is active. Forward the exception and call
1751	* it later from vc_forward_exception().
1752	*/
1753	ctxt->fi.vector = X86_TRAP_AC;
1754	ctxt->fi.error_code = `0`;
1755	return ES_EXCEPTION;
1756	}
1757
1758	static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
1759	struct ghcb *ghcb,
1760	unsigned long exit_code)
1761	{
1762	enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
1763
1764	if (result != ES_OK)
1765	return result;
1766
1767	switch (exit_code) {
1768	case SVM_EXIT_READ_DR7:
1769	result = vc_handle_dr7_read(ghcb, ctxt);
1770	break;
1771	case SVM_EXIT_WRITE_DR7:
1772	result = vc_handle_dr7_write(ghcb, ctxt);
1773	break;
1774	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
1775	result = vc_handle_trap_ac(ghcb, ctxt);
1776	break;
1777	case SVM_EXIT_RDTSC:
1778	case SVM_EXIT_RDTSCP:
1779	result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
1780	break;
1781	case SVM_EXIT_RDPMC:
1782	result = vc_handle_rdpmc(ghcb, ctxt);
1783	break;
1784	case SVM_EXIT_INVD:
1785	pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
1786	result = ES_UNSUPPORTED;
1787	break;
1788	case SVM_EXIT_CPUID:
1789	result = vc_handle_cpuid(ghcb, ctxt);
1790	break;
1791	case SVM_EXIT_IOIO:
1792	result = vc_handle_ioio(ghcb, ctxt);
1793	break;
1794	case SVM_EXIT_MSR:
1795	result = vc_handle_msr(ghcb, ctxt);
1796	break;
1797	case SVM_EXIT_VMMCALL:
1798	result = vc_handle_vmmcall(ghcb, ctxt);
1799	break;
1800	case SVM_EXIT_WBINVD:
1801	result = vc_handle_wbinvd(ghcb, ctxt);
1802	break;
1803	case SVM_EXIT_MONITOR:
1804	result = vc_handle_monitor(ghcb, ctxt);
1805	break;
1806	case SVM_EXIT_MWAIT:
1807	result = vc_handle_mwait(ghcb, ctxt);
1808	break;
1809	case SVM_EXIT_NPF:
1810	result = vc_handle_mmio(ghcb, ctxt);
1811	break;
1812	default:
1813	/*
1814	* Unexpected #VC exception
1815	*/
1816	result = ES_UNSUPPORTED;
1817	}
1818
1819	return result;
1820	}
1821
1822	static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
1823	{
1824	long error_code = ctxt->fi.error_code;
1825	int trapnr = ctxt->fi.vector;
1826
1827	ctxt->regs->orig_ax = ctxt->fi.error_code;
1828
1829	switch (trapnr) {
1830	case X86_TRAP_GP:
1831	exc_general_protection(regs: ctxt->regs, error_code);
1832	break;
1833	case X86_TRAP_UD:
1834	exc_invalid_op(regs: ctxt->regs);
1835	break;
1836	case X86_TRAP_PF:
1837	write_cr2(x: ctxt->fi.cr2);
1838	exc_page_fault(regs: ctxt->regs, error_code);
1839	break;
1840	case X86_TRAP_AC:
1841	exc_alignment_check(regs: ctxt->regs, error_code);
1842	break;
1843	default:
1844	pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
1845	BUG();
1846	}
1847	}
1848
1849	static __always_inline bool is_vc2_stack(unsigned long sp)
1850	{
1851	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
1852	}
1853
1854	static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
1855	{
1856	unsigned long sp, prev_sp;
1857
1858	sp = (unsigned long)regs;
1859	prev_sp = regs->sp;
1860
1861	/*
1862	* If the code was already executing on the VC2 stack when the #VC
1863	* happened, let it proceed to the normal handling routine. This way the
1864	* code executing on the VC2 stack can cause #VC exceptions to get handled.
1865	*/
1866	return is_vc2_stack(sp) && !is_vc2_stack(sp: prev_sp);
1867	}
1868
1869	static bool vc_raw_handle_exception(struct pt_regs regs, unsigned* long error_code)
1870	{
1871	struct ghcb_state state;
1872	struct es_em_ctxt ctxt;
1873	enum es_result result;
1874	struct ghcb *ghcb;
1875	bool ret = true;
1876
1877	ghcb = __sev_get_ghcb(state: &state);
1878
1879	vc_ghcb_invalidate(ghcb);
1880	result = vc_init_em_ctxt(ctxt: &ctxt, regs, exit_code: error_code);
1881
1882	if (result == ES_OK)
1883	result = vc_handle_exitcode(ctxt: &ctxt, ghcb, exit_code: error_code);
1884
1885	__sev_put_ghcb(state: &state);
1886
1887	/ Done - now check the result /
1888	switch (result) {
1889	case ES_OK:
1890	vc_finish_insn(ctxt: &ctxt);
1891	break;
1892	case ES_UNSUPPORTED:
1893	pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
1894	error_code, regs->ip);
1895	ret = false;
1896	break;
1897	case ES_VMM_ERROR:
1898	pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
1899	error_code, regs->ip);
1900	ret = false;
1901	break;
1902	case ES_DECODE_FAILED:
1903	pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
1904	error_code, regs->ip);
1905	ret = false;
1906	break;
1907	case ES_EXCEPTION:
1908	vc_forward_exception(ctxt: &ctxt);
1909	break;
1910	case ES_RETRY:
1911	/ Nothing to do /
1912	break;
1913	default:
1914	pr_emerg("Unknown result in %s():%d\n", __func__, result);
1915	/*
1916	* Emulating the instruction which caused the #VC exception
1917	* failed - can't continue so print debug information
1918	*/
1919	BUG();
1920	}
1921
1922	return ret;
1923	}
1924
1925	static __always_inline bool vc_is_db(unsigned long error_code)
1926	{
1927	return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
1928	}
1929
1930	/*
1931	* Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
1932	* and will panic when an error happens.
1933	*/
1934	DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
1935	{
1936	irqentry_state_t irq_state;
1937
1938	/*
1939	* With the current implementation it is always possible to switch to a
1940	* safe stack because #VC exceptions only happen at known places, like
1941	* intercepted instructions or accesses to MMIO areas/IO ports. They can
1942	* also happen with code instrumentation when the hypervisor intercepts
1943	* #DB, but the critical paths are forbidden to be instrumented, so #DB
1944	* exceptions currently also only happen in safe places.
1945	*
1946	* But keep this here in case the noinstr annotations are violated due
1947	* to bug elsewhere.
1948	*/
1949	if (unlikely(vc_from_invalid_context(regs))) {
1950	instrumentation_begin();
1951	panic(fmt: "Can't handle #VC exception from unsupported context\n");
1952	instrumentation_end();
1953	}
1954
1955	/*
1956	* Handle #DB before calling into !noinstr code to avoid recursive #DB.
1957	*/
1958	if (vc_is_db(error_code)) {
1959	exc_debug(regs);
1960	return;
1961	}
1962
1963	irq_state = irqentry_nmi_enter(regs);
1964
1965	instrumentation_begin();
1966
1967	if (!vc_raw_handle_exception(regs, error_code)) {
1968	/ Show some debug info /
1969	show_regs(regs);
1970
1971	/ Ask hypervisor to sev_es_terminate /
1972	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
1973
1974	/ If that fails and we get here - just panic /
1975	panic(fmt: "Returned from Terminate-Request to Hypervisor\n");
1976	}
1977
1978	instrumentation_end();
1979	irqentry_nmi_exit(regs, irq_state);
1980	}
1981
1982	/*
1983	* Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
1984	* and will kill the current task with SIGBUS when an error happens.
1985	*/
1986	DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
1987	{
1988	/*
1989	* Handle #DB before calling into !noinstr code to avoid recursive #DB.
1990	*/
1991	if (vc_is_db(error_code)) {
1992	noist_exc_debug(regs);
1993	return;
1994	}
1995
1996	irqentry_enter_from_user_mode(regs);
1997	instrumentation_begin();
1998
1999	if (!vc_raw_handle_exception(regs, error_code)) {
2000	/*
2001	* Do not kill the machine if user-space triggered the
2002	* exception. Send SIGBUS instead and let user-space deal with
2003	* it.
2004	*/
2005	force_sig_fault(SIGBUS, BUS_OBJERR, addr: (void __user *)`0`);
2006	}
2007
2008	instrumentation_end();
2009	irqentry_exit_to_user_mode(regs);
2010	}
2011
2012	bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
2013	{
2014	unsigned long exit_code = regs->orig_ax;
2015	struct es_em_ctxt ctxt;
2016	enum es_result result;
2017
2018	vc_ghcb_invalidate(ghcb: boot_ghcb);
2019
2020	result = vc_init_em_ctxt(ctxt: &ctxt, regs, exit_code);
2021	if (result == ES_OK)
2022	result = vc_handle_exitcode(ctxt: &ctxt, ghcb: boot_ghcb, exit_code);
2023
2024	/ Done - now check the result /
2025	switch (result) {
2026	case ES_OK:
2027	vc_finish_insn(ctxt: &ctxt);
2028	break;
2029	case ES_UNSUPPORTED:
2030	early_printk(fmt: "PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
2031	exit_code, regs->ip);
2032	goto fail;
2033	case ES_VMM_ERROR:
2034	early_printk(fmt: "PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
2035	exit_code, regs->ip);
2036	goto fail;
2037	case ES_DECODE_FAILED:
2038	early_printk(fmt: "PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
2039	exit_code, regs->ip);
2040	goto fail;
2041	case ES_EXCEPTION:
2042	vc_early_forward_exception(ctxt: &ctxt);
2043	break;
2044	case ES_RETRY:
2045	/ Nothing to do /
2046	break;
2047	default:
2048	BUG();
2049	}
2050
2051	return true;
2052
2053	fail:
2054	show_regs(regs);
2055
2056	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
2057	}
2058
2059	/*
2060	* Initial set up of SNP relies on information provided by the
2061	* Confidential Computing blob, which can be passed to the kernel
2062	* in the following ways, depending on how it is booted:
2063	*
2064	* - when booted via the boot/decompress kernel:
2065	* - via boot_params
2066	*
2067	* - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
2068	* - via a setup_data entry, as defined by the Linux Boot Protocol
2069	*
2070	* Scan for the blob in that order.
2071	*/
2072	static __head struct cc_blob_sev_info find_cc_blob(struct* boot_params *bp)
2073	{
2074	struct cc_blob_sev_info *cc_info;
2075
2076	/ Boot kernel would have passed the CC blob via boot_params. /
2077	if (bp->cc_blob_address) {
2078	cc_info = (struct cc_blob_sev_info )(unsigned* long)bp->cc_blob_address;
2079	goto found_cc_info;
2080	}
2081
2082	/*
2083	* If kernel was booted directly, without the use of the
2084	* boot/decompression kernel, the CC blob may have been passed via
2085	* setup_data instead.
2086	*/
2087	cc_info = find_cc_blob_setup_data(bp);
2088	if (!cc_info)
2089	return NULL;
2090
2091	found_cc_info:
2092	if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
2093	snp_abort();
2094
2095	return cc_info;
2096	}
2097
2098	bool __head snp_init(struct boot_params *bp)
2099	{
2100	struct cc_blob_sev_info *cc_info;
2101
2102	if (!bp)
2103	return false;
2104
2105	cc_info = find_cc_blob(bp);
2106	if (!cc_info)
2107	return false;
2108
2109	setup_cpuid_table(cc_info);
2110
2111	/*
2112	* The CC blob will be used later to access the secrets page. Cache
2113	* it here like the boot kernel does.
2114	*/
2115	bp->cc_blob_address = (u32)(unsigned long)cc_info;
2116
2117	return true;
2118	}
2119
2120	void __head __noreturn snp_abort(void)
2121	{
2122	sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
2123	}
2124
2125	/*
2126	* SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
2127	* enabled, as the alternative (fallback) logic for DMI probing in the legacy
2128	* ROM region can cause a crash since this region is not pre-validated.
2129	*/
2130	void __init snp_dmi_setup(void)
2131	{
2132	if (efi_enabled(EFI_CONFIG_TABLES))
2133	dmi_setup();
2134	}
2135
2136	static void dump_cpuid_table(void)
2137	{
2138	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2139	int i = `0`;
2140
2141	pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
2142	cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
2143
2144	for (i = `0`; i < SNP_CPUID_COUNT_MAX; i++) {
2145	const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
2146
2147	pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
2148	i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
2149	fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
2150	}
2151	}
2152
2153	/*
2154	* It is useful from an auditing/testing perspective to provide an easy way
2155	* for the guest owner to know that the CPUID table has been initialized as
2156	* expected, but that initialization happens too early in boot to print any
2157	* sort of indicator, and there's not really any other good place to do it,
2158	* so do it here.
2159	*/
2160	static int __init report_cpuid_table(void)
2161	{
2162	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
2163
2164	if (!cpuid_table->count)
2165	return `0`;
2166
2167	pr_info("Using SNP CPUID table, %d entries present.\n",
2168	cpuid_table->count);
2169
2170	if (sev_cfg.debug)
2171	dump_cpuid_table();
2172
2173	return `0`;
2174	}
2175	arch_initcall(report_cpuid_table);
2176
2177	static int __init init_sev_config(char *str)
2178	{
2179	char *s;
2180
2181	while ((s = strsep(&str, ","))) {
2182	if (!strcmp(s, "debug")) {
2183	sev_cfg.debug = true;
2184	continue;
2185	}
2186
2187	pr_info("SEV command-line option '%s' was not recognized\n", s);
2188	}
2189
2190	return `1`;
2191	}
2192	__setup("sev=", init_sev_config);
2193
2194	int snp_issue_guest_request(u64 exit_code, struct snp_req_data input, struct* snp_guest_request_ioctl *rio)
2195	{
2196	struct ghcb_state state;
2197	struct es_em_ctxt ctxt;
2198	unsigned long flags;
2199	struct ghcb *ghcb;
2200	int ret;
2201
2202	rio->exitinfo2 = SEV_RET_NO_FW_CALL;
2203
2204	/*
2205	* __sev_get_ghcb() needs to run with IRQs disabled because it is using
2206	* a per-CPU GHCB.
2207	*/
2208	local_irq_save(flags);
2209
2210	ghcb = __sev_get_ghcb(state: &state);
2211	if (!ghcb) {
2212	ret = -EIO;
2213	goto e_restore_irq;
2214	}
2215
2216	vc_ghcb_invalidate(ghcb);
2217
2218	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2219	ghcb_set_rax(ghcb, value: input->data_gpa);
2220	ghcb_set_rbx(ghcb, value: input->data_npages);
2221	}
2222
2223	ret = sev_es_ghcb_hv_call(ghcb, ctxt: &ctxt, exit_code, exit_info_1: input->req_gpa, exit_info_2: input->resp_gpa);
2224	if (ret)
2225	goto e_put;
2226
2227	rio->exitinfo2 = ghcb->save.sw_exit_info_2;
2228	switch (rio->exitinfo2) {
2229	case `0`:
2230	break;
2231
2232	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
2233	ret = -EAGAIN;
2234	break;
2235
2236	case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
2237	/ Number of expected pages are returned in RBX /
2238	if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
2239	input->data_npages = ghcb_get_rbx(ghcb);
2240	ret = -ENOSPC;
2241	break;
2242	}
2243	fallthrough;
2244	default:
2245	ret = -EIO;
2246	break;
2247	}
2248
2249	e_put:
2250	__sev_put_ghcb(state: &state);
2251	e_restore_irq:
2252	local_irq_restore(flags);
2253
2254	return ret;
2255	}
2256	EXPORT_SYMBOL_GPL(snp_issue_guest_request);
2257
2258	static struct platform_device sev_guest_device = {
2259	.name = "sev-guest",
2260	.id = -`1`,
2261	};
2262
2263	static int __init snp_init_platform_device(void)
2264	{
2265	struct sev_guest_platform_data data;
2266	u64 gpa;
2267
2268	if (!cc_platform_has(attr: CC_ATTR_GUEST_SEV_SNP))
2269	return -ENODEV;
2270
2271	gpa = get_secrets_page();
2272	if (!gpa)
2273	return -ENODEV;
2274
2275	data.secrets_gpa = gpa;
2276	if (platform_device_add_data(pdev: &sev_guest_device, data: &data, size: sizeof(data)))
2277	return -ENODEV;
2278
2279	if (platform_device_register(&sev_guest_device))
2280	return -ENODEV;
2281
2282	pr_info("SNP guest platform device initialized.\n");
2283	return `0`;
2284	}
2285	device_initcall(snp_init_platform_device);
2286
2287	void sev_show_status(void)
2288	{
2289	int i;
2290
2291	pr_info("Status: ");
2292	for (i = `0`; i < MSR_AMD64_SNP_RESV_BIT; i++) {
2293	if (sev_status & BIT_ULL(i)) {
2294	if (!sev_status_feat_names[i])
2295	continue;
2296
2297	pr_cont("%s ", sev_status_feat_names[i]);
2298	}
2299	}
2300	pr_cont("\n");
2301	}
2302

source code of linux/arch/x86/kernel/sev.c