1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * X86 specific Hyper-V initialization code. |
4 | * |
5 | * Copyright (C) 2016, Microsoft, Inc. |
6 | * |
7 | * Author : K. Y. Srinivasan <kys@microsoft.com> |
8 | */ |
9 | |
10 | #define pr_fmt(fmt) "Hyper-V: " fmt |
11 | |
12 | #include <linux/efi.h> |
13 | #include <linux/types.h> |
14 | #include <linux/bitfield.h> |
15 | #include <linux/io.h> |
16 | #include <asm/apic.h> |
17 | #include <asm/desc.h> |
18 | #include <asm/e820/api.h> |
19 | #include <asm/sev.h> |
20 | #include <asm/ibt.h> |
21 | #include <asm/hypervisor.h> |
22 | #include <asm/hyperv-tlfs.h> |
23 | #include <asm/mshyperv.h> |
24 | #include <asm/idtentry.h> |
25 | #include <asm/set_memory.h> |
26 | #include <linux/kexec.h> |
27 | #include <linux/version.h> |
28 | #include <linux/vmalloc.h> |
29 | #include <linux/mm.h> |
30 | #include <linux/hyperv.h> |
31 | #include <linux/slab.h> |
32 | #include <linux/kernel.h> |
33 | #include <linux/cpuhotplug.h> |
34 | #include <linux/syscore_ops.h> |
35 | #include <clocksource/hyperv_timer.h> |
36 | #include <linux/highmem.h> |
37 | |
38 | int hyperv_init_cpuhp; |
39 | u64 hv_current_partition_id = ~0ull; |
40 | EXPORT_SYMBOL_GPL(hv_current_partition_id); |
41 | |
42 | void *hv_hypercall_pg; |
43 | EXPORT_SYMBOL_GPL(hv_hypercall_pg); |
44 | |
45 | union hv_ghcb * __percpu *hv_ghcb_pg; |
46 | |
47 | /* Storage to save the hypercall page temporarily for hibernation */ |
48 | static void *hv_hypercall_pg_saved; |
49 | |
50 | struct hv_vp_assist_page **hv_vp_assist_page; |
51 | EXPORT_SYMBOL_GPL(hv_vp_assist_page); |
52 | |
53 | static int hyperv_init_ghcb(void) |
54 | { |
55 | u64 ghcb_gpa; |
56 | void *ghcb_va; |
57 | void **ghcb_base; |
58 | |
59 | if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) |
60 | return 0; |
61 | |
62 | if (!hv_ghcb_pg) |
63 | return -EINVAL; |
64 | |
65 | /* |
66 | * GHCB page is allocated by paravisor. The address |
67 | * returned by MSR_AMD64_SEV_ES_GHCB is above shared |
68 | * memory boundary and map it here. |
69 | */ |
70 | rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); |
71 | |
72 | /* Mask out vTOM bit. ioremap_cache() maps decrypted */ |
73 | ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; |
74 | ghcb_va = (void *)ioremap_cache(offset: ghcb_gpa, HV_HYP_PAGE_SIZE); |
75 | if (!ghcb_va) |
76 | return -ENOMEM; |
77 | |
78 | ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); |
79 | *ghcb_base = ghcb_va; |
80 | |
81 | return 0; |
82 | } |
83 | |
84 | static int hv_cpu_init(unsigned int cpu) |
85 | { |
86 | union hv_vp_assist_msr_contents msr = { 0 }; |
87 | struct hv_vp_assist_page **hvp; |
88 | int ret; |
89 | |
90 | ret = hv_common_cpu_init(cpu); |
91 | if (ret) |
92 | return ret; |
93 | |
94 | if (!hv_vp_assist_page) |
95 | return 0; |
96 | |
97 | hvp = &hv_vp_assist_page[cpu]; |
98 | if (hv_root_partition) { |
99 | /* |
100 | * For root partition we get the hypervisor provided VP assist |
101 | * page, instead of allocating a new page. |
102 | */ |
103 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); |
104 | *hvp = memremap(offset: msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, |
105 | PAGE_SIZE, flags: MEMREMAP_WB); |
106 | } else { |
107 | /* |
108 | * The VP assist page is an "overlay" page (see Hyper-V TLFS's |
109 | * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed |
110 | * out to make sure we always write the EOI MSR in |
111 | * hv_apic_eoi_write() *after* the EOI optimization is disabled |
112 | * in hv_cpu_die(), otherwise a CPU may not be stopped in the |
113 | * case of CPU offlining and the VM will hang. |
114 | */ |
115 | if (!*hvp) { |
116 | *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); |
117 | |
118 | /* |
119 | * Hyper-V should never specify a VM that is a Confidential |
120 | * VM and also running in the root partition. Root partition |
121 | * is blocked to run in Confidential VM. So only decrypt assist |
122 | * page in non-root partition here. |
123 | */ |
124 | if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
125 | WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); |
126 | memset(*hvp, 0, PAGE_SIZE); |
127 | } |
128 | } |
129 | |
130 | if (*hvp) |
131 | msr.pfn = vmalloc_to_pfn(addr: *hvp); |
132 | |
133 | } |
134 | if (!WARN_ON(!(*hvp))) { |
135 | msr.enable = 1; |
136 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val: msr.as_uint64); |
137 | } |
138 | |
139 | return hyperv_init_ghcb(); |
140 | } |
141 | |
142 | static void (*hv_reenlightenment_cb)(void); |
143 | |
144 | static void hv_reenlightenment_notify(struct work_struct *dummy) |
145 | { |
146 | struct hv_tsc_emulation_status emu_status; |
147 | |
148 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); |
149 | |
150 | /* Don't issue the callback if TSC accesses are not emulated */ |
151 | if (hv_reenlightenment_cb && emu_status.inprogress) |
152 | hv_reenlightenment_cb(); |
153 | } |
154 | static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); |
155 | |
156 | void hyperv_stop_tsc_emulation(void) |
157 | { |
158 | u64 freq; |
159 | struct hv_tsc_emulation_status emu_status; |
160 | |
161 | rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); |
162 | emu_status.inprogress = 0; |
163 | wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, val: *(u64 *)&emu_status); |
164 | |
165 | rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); |
166 | tsc_khz = div64_u64(dividend: freq, divisor: 1000); |
167 | } |
168 | EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); |
169 | |
170 | static inline bool hv_reenlightenment_available(void) |
171 | { |
172 | /* |
173 | * Check for required features and privileges to make TSC frequency |
174 | * change notifications work. |
175 | */ |
176 | return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && |
177 | ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && |
178 | ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; |
179 | } |
180 | |
181 | DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) |
182 | { |
183 | apic_eoi(); |
184 | inc_irq_stat(irq_hv_reenlightenment_count); |
185 | schedule_delayed_work(dwork: &hv_reenlightenment_work, HZ/10); |
186 | } |
187 | |
188 | void set_hv_tscchange_cb(void (*cb)(void)) |
189 | { |
190 | struct hv_reenlightenment_control re_ctrl = { |
191 | .vector = HYPERV_REENLIGHTENMENT_VECTOR, |
192 | .enabled = 1, |
193 | }; |
194 | struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; |
195 | |
196 | if (!hv_reenlightenment_available()) { |
197 | pr_warn("reenlightenment support is unavailable\n" ); |
198 | return; |
199 | } |
200 | |
201 | if (!hv_vp_index) |
202 | return; |
203 | |
204 | hv_reenlightenment_cb = cb; |
205 | |
206 | /* Make sure callback is registered before we write to MSRs */ |
207 | wmb(); |
208 | |
209 | re_ctrl.target_vp = hv_vp_index[get_cpu()]; |
210 | |
211 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *((u64 *)&re_ctrl)); |
212 | wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, val: *((u64 *)&emu_ctrl)); |
213 | |
214 | put_cpu(); |
215 | } |
216 | EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); |
217 | |
218 | void clear_hv_tscchange_cb(void) |
219 | { |
220 | struct hv_reenlightenment_control re_ctrl; |
221 | |
222 | if (!hv_reenlightenment_available()) |
223 | return; |
224 | |
225 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); |
226 | re_ctrl.enabled = 0; |
227 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *(u64 *)&re_ctrl); |
228 | |
229 | hv_reenlightenment_cb = NULL; |
230 | } |
231 | EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); |
232 | |
233 | static int hv_cpu_die(unsigned int cpu) |
234 | { |
235 | struct hv_reenlightenment_control re_ctrl; |
236 | unsigned int new_cpu; |
237 | void **ghcb_va; |
238 | |
239 | if (hv_ghcb_pg) { |
240 | ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); |
241 | if (*ghcb_va) |
242 | iounmap(addr: *ghcb_va); |
243 | *ghcb_va = NULL; |
244 | } |
245 | |
246 | hv_common_cpu_die(cpu); |
247 | |
248 | if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { |
249 | union hv_vp_assist_msr_contents msr = { 0 }; |
250 | if (hv_root_partition) { |
251 | /* |
252 | * For root partition the VP assist page is mapped to |
253 | * hypervisor provided page, and thus we unmap the |
254 | * page here and nullify it, so that in future we have |
255 | * correct page address mapped in hv_cpu_init. |
256 | */ |
257 | memunmap(addr: hv_vp_assist_page[cpu]); |
258 | hv_vp_assist_page[cpu] = NULL; |
259 | rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); |
260 | msr.enable = 0; |
261 | } |
262 | wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val: msr.as_uint64); |
263 | } |
264 | |
265 | if (hv_reenlightenment_cb == NULL) |
266 | return 0; |
267 | |
268 | rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); |
269 | if (re_ctrl.target_vp == hv_vp_index[cpu]) { |
270 | /* |
271 | * Reassign reenlightenment notifications to some other online |
272 | * CPU or just disable the feature if there are no online CPUs |
273 | * left (happens on hibernation). |
274 | */ |
275 | new_cpu = cpumask_any_but(cpu_online_mask, cpu); |
276 | |
277 | if (new_cpu < nr_cpu_ids) |
278 | re_ctrl.target_vp = hv_vp_index[new_cpu]; |
279 | else |
280 | re_ctrl.enabled = 0; |
281 | |
282 | wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, val: *((u64 *)&re_ctrl)); |
283 | } |
284 | |
285 | return 0; |
286 | } |
287 | |
288 | static int __init hv_pci_init(void) |
289 | { |
290 | bool gen2vm = efi_enabled(EFI_BOOT); |
291 | |
292 | /* |
293 | * A Generation-2 VM doesn't support legacy PCI/PCIe, so both |
294 | * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> |
295 | * pcibios_init() doesn't call pcibios_resource_survey() -> |
296 | * e820__reserve_resources_late(); as a result, any emulated persistent |
297 | * memory of E820_TYPE_PRAM (12) via the kernel parameter |
298 | * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be |
299 | * detected by register_e820_pmem(). Fix this by directly calling |
300 | * e820__reserve_resources_late() here: e820__reserve_resources_late() |
301 | * depends on e820__reserve_resources(), which has been called earlier |
302 | * from setup_arch(). Note: e820__reserve_resources_late() also adds |
303 | * any memory of E820_TYPE_PMEM (7) into iomem_resource, and |
304 | * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> |
305 | * region_intersects() returns REGION_INTERSECTS, so the memory of |
306 | * E820_TYPE_PMEM won't get added twice. |
307 | * |
308 | * We return 0 here so that pci_arch_init() won't print the warning: |
309 | * "PCI: Fatal: No config space access function found" |
310 | */ |
311 | if (gen2vm) { |
312 | e820__reserve_resources_late(); |
313 | return 0; |
314 | } |
315 | |
316 | /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ |
317 | return 1; |
318 | } |
319 | |
320 | static int hv_suspend(void) |
321 | { |
322 | union hv_x64_msr_hypercall_contents hypercall_msr; |
323 | int ret; |
324 | |
325 | if (hv_root_partition) |
326 | return -EPERM; |
327 | |
328 | /* |
329 | * Reset the hypercall page as it is going to be invalidated |
330 | * across hibernation. Setting hv_hypercall_pg to NULL ensures |
331 | * that any subsequent hypercall operation fails safely instead of |
332 | * crashing due to an access of an invalid page. The hypercall page |
333 | * pointer is restored on resume. |
334 | */ |
335 | hv_hypercall_pg_saved = hv_hypercall_pg; |
336 | hv_hypercall_pg = NULL; |
337 | |
338 | /* Disable the hypercall page in the hypervisor */ |
339 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
340 | hypercall_msr.enable = 0; |
341 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
342 | |
343 | ret = hv_cpu_die(cpu: 0); |
344 | return ret; |
345 | } |
346 | |
347 | static void hv_resume(void) |
348 | { |
349 | union hv_x64_msr_hypercall_contents hypercall_msr; |
350 | int ret; |
351 | |
352 | ret = hv_cpu_init(cpu: 0); |
353 | WARN_ON(ret); |
354 | |
355 | /* Re-enable the hypercall page */ |
356 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
357 | hypercall_msr.enable = 1; |
358 | hypercall_msr.guest_physical_address = |
359 | vmalloc_to_pfn(addr: hv_hypercall_pg_saved); |
360 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
361 | |
362 | hv_hypercall_pg = hv_hypercall_pg_saved; |
363 | hv_hypercall_pg_saved = NULL; |
364 | |
365 | /* |
366 | * Reenlightenment notifications are disabled by hv_cpu_die(0), |
367 | * reenable them here if hv_reenlightenment_cb was previously set. |
368 | */ |
369 | if (hv_reenlightenment_cb) |
370 | set_hv_tscchange_cb(hv_reenlightenment_cb); |
371 | } |
372 | |
373 | /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ |
374 | static struct syscore_ops hv_syscore_ops = { |
375 | .suspend = hv_suspend, |
376 | .resume = hv_resume, |
377 | }; |
378 | |
379 | static void (* __initdata old_setup_percpu_clockev)(void); |
380 | |
381 | static void __init hv_stimer_setup_percpu_clockev(void) |
382 | { |
383 | /* |
384 | * Ignore any errors in setting up stimer clockevents |
385 | * as we can run with the LAPIC timer as a fallback. |
386 | */ |
387 | (void)hv_stimer_alloc(have_percpu_irqs: false); |
388 | |
389 | /* |
390 | * Still register the LAPIC timer, because the direct-mode STIMER is |
391 | * not supported by old versions of Hyper-V. This also allows users |
392 | * to switch to LAPIC timer via /sys, if they want to. |
393 | */ |
394 | if (old_setup_percpu_clockev) |
395 | old_setup_percpu_clockev(); |
396 | } |
397 | |
398 | static void __init hv_get_partition_id(void) |
399 | { |
400 | struct hv_get_partition_id *output_page; |
401 | u64 status; |
402 | unsigned long flags; |
403 | |
404 | local_irq_save(flags); |
405 | output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); |
406 | status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, outputaddr: output_page); |
407 | if (!hv_result_success(status)) { |
408 | /* No point in proceeding if this failed */ |
409 | pr_err("Failed to get partition ID: %lld\n" , status); |
410 | BUG(); |
411 | } |
412 | hv_current_partition_id = output_page->partition_id; |
413 | local_irq_restore(flags); |
414 | } |
415 | |
416 | #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) |
417 | static u8 __init get_vtl(void) |
418 | { |
419 | u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; |
420 | struct hv_get_vp_registers_input *input; |
421 | struct hv_get_vp_registers_output *output; |
422 | unsigned long flags; |
423 | u64 ret; |
424 | |
425 | local_irq_save(flags); |
426 | input = *this_cpu_ptr(hyperv_pcpu_input_arg); |
427 | output = (struct hv_get_vp_registers_output *)input; |
428 | |
429 | memset(input, 0, struct_size(input, element, 1)); |
430 | input->header.partitionid = HV_PARTITION_ID_SELF; |
431 | input->header.vpindex = HV_VP_INDEX_SELF; |
432 | input->header.inputvtl = 0; |
433 | input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; |
434 | |
435 | ret = hv_do_hypercall(control, inputaddr: input, outputaddr: output); |
436 | if (hv_result_success(status: ret)) { |
437 | ret = output->as64.low & HV_X64_VTL_MASK; |
438 | } else { |
439 | pr_err("Failed to get VTL(error: %lld) exiting...\n" , ret); |
440 | BUG(); |
441 | } |
442 | |
443 | local_irq_restore(flags); |
444 | return ret; |
445 | } |
446 | #else |
447 | static inline u8 get_vtl(void) { return 0; } |
448 | #endif |
449 | |
450 | /* |
451 | * This function is to be invoked early in the boot sequence after the |
452 | * hypervisor has been detected. |
453 | * |
454 | * 1. Setup the hypercall page. |
455 | * 2. Register Hyper-V specific clocksource. |
456 | * 3. Setup Hyper-V specific APIC entry points. |
457 | */ |
458 | void __init hyperv_init(void) |
459 | { |
460 | u64 guest_id; |
461 | union hv_x64_msr_hypercall_contents hypercall_msr; |
462 | int cpuhp; |
463 | |
464 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) |
465 | return; |
466 | |
467 | if (hv_common_init()) |
468 | return; |
469 | |
470 | /* |
471 | * The VP assist page is useless to a TDX guest: the only use we |
472 | * would have for it is lazy EOI, which can not be used with TDX. |
473 | */ |
474 | if (hv_isolation_type_tdx()) |
475 | hv_vp_assist_page = NULL; |
476 | else |
477 | hv_vp_assist_page = kcalloc(num_possible_cpus(), |
478 | size: sizeof(*hv_vp_assist_page), |
479 | GFP_KERNEL); |
480 | if (!hv_vp_assist_page) { |
481 | ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; |
482 | |
483 | if (!hv_isolation_type_tdx()) |
484 | goto common_free; |
485 | } |
486 | |
487 | if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { |
488 | /* Negotiate GHCB Version. */ |
489 | if (!hv_ghcb_negotiate_protocol()) |
490 | hv_ghcb_terminate(SEV_TERM_SET_GEN, |
491 | GHCB_SEV_ES_PROT_UNSUPPORTED); |
492 | |
493 | hv_ghcb_pg = alloc_percpu(union hv_ghcb *); |
494 | if (!hv_ghcb_pg) |
495 | goto free_vp_assist_page; |
496 | } |
497 | |
498 | cpuhp = cpuhp_setup_state(state: CPUHP_AP_HYPERV_ONLINE, name: "x86/hyperv_init:online" , |
499 | startup: hv_cpu_init, teardown: hv_cpu_die); |
500 | if (cpuhp < 0) |
501 | goto free_ghcb_page; |
502 | |
503 | /* |
504 | * Setup the hypercall page and enable hypercalls. |
505 | * 1. Register the guest ID |
506 | * 2. Enable the hypercall and register the hypercall page |
507 | * |
508 | * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: |
509 | * when the hypercall input is a page, such a VM must pass a decrypted |
510 | * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page |
511 | * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. |
512 | * |
513 | * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, |
514 | * which are handled by the paravisor and the VM must use an encrypted |
515 | * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and |
516 | * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and |
517 | * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: |
518 | * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). |
519 | * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. |
520 | * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; |
521 | * instead, hv_post_message() uses the post_msg_page, which is decrypted |
522 | * in such a VM and is only used in such a VM. |
523 | */ |
524 | guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); |
525 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: guest_id); |
526 | |
527 | /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ |
528 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: guest_id); |
529 | |
530 | /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ |
531 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) |
532 | goto skip_hypercall_pg_init; |
533 | |
534 | hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, align: 1, VMALLOC_START, |
535 | VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, |
536 | VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, |
537 | caller: __builtin_return_address(0)); |
538 | if (hv_hypercall_pg == NULL) |
539 | goto clean_guest_os_id; |
540 | |
541 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
542 | hypercall_msr.enable = 1; |
543 | |
544 | if (hv_root_partition) { |
545 | struct page *pg; |
546 | void *src; |
547 | |
548 | /* |
549 | * For the root partition, the hypervisor will set up its |
550 | * hypercall page. The hypervisor guarantees it will not show |
551 | * up in the root's address space. The root can't change the |
552 | * location of the hypercall page. |
553 | * |
554 | * Order is important here. We must enable the hypercall page |
555 | * so it is populated with code, then copy the code to an |
556 | * executable page. |
557 | */ |
558 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
559 | |
560 | pg = vmalloc_to_page(addr: hv_hypercall_pg); |
561 | src = memremap(offset: hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, |
562 | flags: MEMREMAP_WB); |
563 | BUG_ON(!src); |
564 | memcpy_to_page(page: pg, offset: 0, from: src, HV_HYP_PAGE_SIZE); |
565 | memunmap(addr: src); |
566 | |
567 | hv_remap_tsc_clocksource(); |
568 | } else { |
569 | hypercall_msr.guest_physical_address = vmalloc_to_pfn(addr: hv_hypercall_pg); |
570 | wrmsrl(HV_X64_MSR_HYPERCALL, val: hypercall_msr.as_uint64); |
571 | } |
572 | |
573 | skip_hypercall_pg_init: |
574 | /* |
575 | * Some versions of Hyper-V that provide IBT in guest VMs have a bug |
576 | * in that there's no ENDBR64 instruction at the entry to the |
577 | * hypercall page. Because hypercalls are invoked via an indirect call |
578 | * to the hypercall page, all hypercall attempts fail when IBT is |
579 | * enabled, and Linux panics. For such buggy versions, disable IBT. |
580 | * |
581 | * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall |
582 | * page, so if future Linux kernel versions enable IBT for 32-bit |
583 | * builds, additional hypercall page hackery will be required here |
584 | * to provide an ENDBR32. |
585 | */ |
586 | #ifdef CONFIG_X86_KERNEL_IBT |
587 | if (cpu_feature_enabled(X86_FEATURE_IBT) && |
588 | *(u32 *)hv_hypercall_pg != gen_endbr()) { |
589 | setup_clear_cpu_cap(X86_FEATURE_IBT); |
590 | pr_warn("Disabling IBT because of Hyper-V bug\n" ); |
591 | } |
592 | #endif |
593 | |
594 | /* |
595 | * hyperv_init() is called before LAPIC is initialized: see |
596 | * apic_intr_mode_init() -> x86_platform.apic_post_init() and |
597 | * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER |
598 | * depends on LAPIC, so hv_stimer_alloc() should be called from |
599 | * x86_init.timers.setup_percpu_clockev. |
600 | */ |
601 | old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; |
602 | x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; |
603 | |
604 | hv_apic_init(); |
605 | |
606 | x86_init.pci.arch_init = hv_pci_init; |
607 | |
608 | register_syscore_ops(ops: &hv_syscore_ops); |
609 | |
610 | hyperv_init_cpuhp = cpuhp; |
611 | |
612 | if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) |
613 | hv_get_partition_id(); |
614 | |
615 | BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); |
616 | |
617 | #ifdef CONFIG_PCI_MSI |
618 | /* |
619 | * If we're running as root, we want to create our own PCI MSI domain. |
620 | * We can't set this in hv_pci_init because that would be too late. |
621 | */ |
622 | if (hv_root_partition) |
623 | x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; |
624 | #endif |
625 | |
626 | /* Query the VMs extended capability once, so that it can be cached. */ |
627 | hv_query_ext_cap(cap_query: 0); |
628 | |
629 | /* Find the VTL */ |
630 | ms_hyperv.vtl = get_vtl(); |
631 | |
632 | if (ms_hyperv.vtl > 0) /* non default VTL */ |
633 | hv_vtl_early_init(); |
634 | |
635 | return; |
636 | |
637 | clean_guest_os_id: |
638 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: 0); |
639 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: 0); |
640 | cpuhp_remove_state(state: cpuhp); |
641 | free_ghcb_page: |
642 | free_percpu(pdata: hv_ghcb_pg); |
643 | free_vp_assist_page: |
644 | kfree(objp: hv_vp_assist_page); |
645 | hv_vp_assist_page = NULL; |
646 | common_free: |
647 | hv_common_free(); |
648 | } |
649 | |
650 | /* |
651 | * This routine is called before kexec/kdump, it does the required cleanup. |
652 | */ |
653 | void hyperv_cleanup(void) |
654 | { |
655 | union hv_x64_msr_hypercall_contents hypercall_msr; |
656 | union hv_reference_tsc_msr tsc_msr; |
657 | |
658 | /* Reset our OS id */ |
659 | wrmsrl(HV_X64_MSR_GUEST_OS_ID, val: 0); |
660 | hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, value: 0); |
661 | |
662 | /* |
663 | * Reset hypercall page reference before reset the page, |
664 | * let hypercall operations fail safely rather than |
665 | * panic the kernel for using invalid hypercall page |
666 | */ |
667 | hv_hypercall_pg = NULL; |
668 | |
669 | /* Reset the hypercall page */ |
670 | hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL); |
671 | hypercall_msr.enable = 0; |
672 | hv_set_msr(HV_X64_MSR_HYPERCALL, value: hypercall_msr.as_uint64); |
673 | |
674 | /* Reset the TSC page */ |
675 | tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC); |
676 | tsc_msr.enable = 0; |
677 | hv_set_msr(HV_X64_MSR_REFERENCE_TSC, value: tsc_msr.as_uint64); |
678 | } |
679 | |
680 | void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) |
681 | { |
682 | static bool panic_reported; |
683 | u64 guest_id; |
684 | |
685 | if (in_die && !panic_on_oops) |
686 | return; |
687 | |
688 | /* |
689 | * We prefer to report panic on 'die' chain as we have proper |
690 | * registers to report, but if we miss it (e.g. on BUG()) we need |
691 | * to report it on 'panic'. |
692 | */ |
693 | if (panic_reported) |
694 | return; |
695 | panic_reported = true; |
696 | |
697 | rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); |
698 | |
699 | wrmsrl(HV_X64_MSR_CRASH_P0, val: err); |
700 | wrmsrl(HV_X64_MSR_CRASH_P1, val: guest_id); |
701 | wrmsrl(HV_X64_MSR_CRASH_P2, val: regs->ip); |
702 | wrmsrl(HV_X64_MSR_CRASH_P3, val: regs->ax); |
703 | wrmsrl(HV_X64_MSR_CRASH_P4, val: regs->sp); |
704 | |
705 | /* |
706 | * Let Hyper-V know there is crash data available |
707 | */ |
708 | wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); |
709 | } |
710 | EXPORT_SYMBOL_GPL(hyperv_report_panic); |
711 | |
712 | bool hv_is_hyperv_initialized(void) |
713 | { |
714 | union hv_x64_msr_hypercall_contents hypercall_msr; |
715 | |
716 | /* |
717 | * Ensure that we're really on Hyper-V, and not a KVM or Xen |
718 | * emulation of Hyper-V |
719 | */ |
720 | if (x86_hyper_type != X86_HYPER_MS_HYPERV) |
721 | return false; |
722 | |
723 | /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ |
724 | if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) |
725 | return true; |
726 | /* |
727 | * Verify that earlier initialization succeeded by checking |
728 | * that the hypercall page is setup |
729 | */ |
730 | hypercall_msr.as_uint64 = 0; |
731 | rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); |
732 | |
733 | return hypercall_msr.enable; |
734 | } |
735 | EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); |
736 | |