// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/random.h>
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
#include "hyperv_vmbus.h"

/* The one and only */
struct hv_context hv_context;

/*
 * hv_init - Main initialization routine.
 *
 * This routine must be called before any other routines in here are called
 */
int hv_init(void)
{
	hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
	if (!hv_context.cpu_context)
		return -ENOMEM;
	return 0;
}

/*
 * hv_post_message - Post a message using the hypervisor message IPC.
 *
 * This involves a hypercall.
 */
int hv_post_message(union hv_connection_id connection_id,
		    enum hv_message_type message_type,
		    void *payload, size_t payload_size)
{
	struct hv_input_post_message *aligned_msg;
	unsigned long flags;
	u64 status;

	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
		return -EMSGSIZE;

	local_irq_save(flags);

	/*
	 * A TDX VM with the paravisor must use the decrypted post_msg_page: see
	 * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor
	 * can use the encrypted hyperv_pcpu_input_arg because it copies the
	 * input into the GHCB page, which has been decrypted by the paravisor.
	 */
	if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present)
		aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page;
	else
		aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);

	aligned_msg->connectionid = connection_id;
	aligned_msg->reserved = 0;
	aligned_msg->message_type = message_type;
	aligned_msg->payload_size = payload_size;
	memcpy((void *)aligned_msg->payload, payload, payload_size);

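	/*
	 * With a paravisor present, the regular hv_do_hypercall() path is
	 * not usable: a TDX VM issues the hypercall through
	 * hv_tdx_hypercall() and an SNP VM through the GHCB protocol via
	 * hv_ghcb_hypercall(). Any other paravisor configuration is
	 * unexpected here.
	 */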
	if (ms_hyperv.paravisor_present) {
		if (hv_isolation_type_tdx())
			status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
						  virt_to_phys(aligned_msg), 0);
		else if (hv_isolation_type_snp())
			status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
						   aligned_msg, NULL,
						   sizeof(*aligned_msg));
		else
			status = HV_STATUS_INVALID_PARAMETER;
	} else {
		status = hv_do_hypercall(HVCALL_POST_MESSAGE,
					 aligned_msg, NULL);
	}

	local_irq_restore(flags);

	return hv_result(status);
}

int hv_synic_alloc(void)
{
	int cpu, ret = -ENOMEM;
	struct hv_per_cpu_context *hv_cpu;

	/*
	 * First, zero all per-cpu memory areas so hv_synic_free() can
	 * detect what memory has been allocated and cleanup properly
	 * after any failures.
	 */
	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
		memset(hv_cpu, 0, sizeof(*hv_cpu));
	}

	hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
					 GFP_KERNEL);
	if (hv_context.hv_numa_map == NULL) {
		pr_err("Unable to allocate NUMA map\n");
		goto err;
	}

	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

		tasklet_init(&hv_cpu->msg_dpc,
			     vmbus_on_msg_dpc, (unsigned long)hv_cpu);

		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
			if (hv_cpu->post_msg_page == NULL) {
				pr_err("Unable to allocate post msg page\n");
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
			if (ret) {
				pr_err("Failed to decrypt post msg page: %d\n", ret);
				/* Just leak the page, as it's unsafe to free the page. */
				hv_cpu->post_msg_page = NULL;
				goto err;
			}

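			/*
			 * Changing the page's encryption attribute leaves its
			 * contents in an undefined state, so zero the page
			 * again before use.
			 */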
			memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
		}

		/*
		 * The SynIC message and event pages are allocated by the
		 * paravisor, so skip allocating them here.
		 */
		if (!ms_hyperv.paravisor_present && !hv_root_partition) {
			hv_cpu->synic_message_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (hv_cpu->synic_message_page == NULL) {
				pr_err("Unable to allocate SYNIC message page\n");
				goto err;
			}

			hv_cpu->synic_event_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (hv_cpu->synic_event_page == NULL) {
				pr_err("Unable to allocate SYNIC event page\n");

				free_page((unsigned long)hv_cpu->synic_message_page);
				hv_cpu->synic_message_page = NULL;
				goto err;
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_message_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
				hv_cpu->synic_message_page = NULL;

				/*
				 * Free the event page here so that hv_synic_free()
				 * won't later try to re-encrypt it.
				 */
				free_page((unsigned long)hv_cpu->synic_event_page);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_event_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

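			/*
			 * As with the post_msg_page above, re-zero the pages
			 * after flipping their encryption attribute.
			 */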
			memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
			memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
		}
	}

	return 0;

err:
	/*
	 * Any memory allocations that succeeded will be freed when
	 * the caller cleans up by calling hv_synic_free().
	 */
	return ret;
}

void hv_synic_free(void)
{
	int cpu, ret;

	for_each_present_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu
			= per_cpu_ptr(hv_context.cpu_context, cpu);

		/* It's better to leak the page if the encryption fails. */
		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			if (hv_cpu->post_msg_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->post_msg_page, 1);
				if (ret) {
					pr_err("Failed to encrypt post msg page: %d\n", ret);
					hv_cpu->post_msg_page = NULL;
				}
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			if (hv_cpu->synic_message_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_message_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
					hv_cpu->synic_message_page = NULL;
				}
			}

			if (hv_cpu->synic_event_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_event_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
					hv_cpu->synic_event_page = NULL;
				}
			}
		}

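		/*
		 * The pointers above are set to NULL when re-encryption
		 * fails, and free_page() is a no-op for a NULL (zero)
		 * address, so those pages are intentionally leaked here.
		 */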
		free_page((unsigned long)hv_cpu->post_msg_page);
		free_page((unsigned long)hv_cpu->synic_event_page);
		free_page((unsigned long)hv_cpu->synic_message_page);
	}

	kfree(hv_context.hv_numa_map);
}

/*
 * hv_synic_init - Initialize the Synthetic Interrupt Controller.
 *
 * If it is already initialized by another entity (i.e., the x2v shim), we
 * need to retrieve the initialized message and event pages. Otherwise, we
 * create and initialize the message and event pages.
 */
void hv_synic_enable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu
		= per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_sint shared_sint;
	union hv_synic_scontrol sctrl;

	/* Setup the Synic's message page */
	simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
	simp.simp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_message_page
			= (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_message_page)
			pr_err("Failed to map SynIC message page.\n");
	} else {
		simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);

	/* Setup the Synic's event page */
	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
	siefp.siefp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_event_page
			= (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_event_page)
			pr_err("Failed to map SynIC event page.\n");
	} else {
		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);

	/* Setup the shared SINT. */
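	/*
	 * vmbus_irq is -1 on architectures (e.g., x86) where the VMBus
	 * interrupt arrives on a dedicated vector rather than through a
	 * per-cpu Linux IRQ, so there is nothing to enable in that case.
	 */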
	if (vmbus_irq != -1)
		enable_percpu_irq(vmbus_irq, 0);
	shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
					VMBUS_MESSAGE_SINT);

	shared_sint.vector = vmbus_interrupt;
	shared_sint.masked = false;

	/*
	 * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
	 * it doesn't provide a recommendation flag and AEOI must be disabled.
	 */
#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
	shared_sint.auto_eoi =
			!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
#else
	shared_sint.auto_eoi = 0;
#endif
	hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
			shared_sint.as_uint64);

	/* Enable the global synic bit */
	sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
	sctrl.enable = 1;

	hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);
}

int hv_synic_init(unsigned int cpu)
{
	hv_synic_enable_regs(cpu);

	hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);

	return 0;
}

/*
 * hv_synic_cleanup - Cleanup routine for hv_synic_init().
 */
void hv_synic_disable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu
		= per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_sint shared_sint;
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_scontrol sctrl;

	shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 +
					VMBUS_MESSAGE_SINT);

	shared_sint.masked = 1;

	/* Need to correctly cleanup in the case of SMP!!! */
	/* Disable the interrupt */
	hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT,
			shared_sint.as_uint64);

	simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
	/*
	 * In an isolation VM, the SIMP and SIEFP pages are allocated by the
	 * paravisor, and they will also be used by the kdump kernel. So just
	 * reset the enable bit here and keep the page addresses.
	 */
	simp.simp_enabled = 0;
	if (ms_hyperv.paravisor_present || hv_root_partition) {
		iounmap(hv_cpu->synic_message_page);
		hv_cpu->synic_message_page = NULL;
	} else {
		simp.base_simp_gpa = 0;
	}

	hv_set_register(HV_REGISTER_SIMP, simp.as_uint64);

	siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
	siefp.siefp_enabled = 0;

	if (ms_hyperv.paravisor_present || hv_root_partition) {
		iounmap(hv_cpu->synic_event_page);
		hv_cpu->synic_event_page = NULL;
	} else {
		siefp.base_siefp_gpa = 0;
	}

	hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64);

	/* Disable the global synic bit */
	sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL);
	sctrl.enable = 0;
	hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64);

	if (vmbus_irq != -1)
		disable_percpu_irq(vmbus_irq);
}

#define HV_MAX_TRIES 3
/*
 * Scan the event flags page of 'this' CPU looking for any bit that is set. If
 * we find one set bit, wait for a few milliseconds and scan again. Repeat up
 * to HV_MAX_TRIES times. Return 'true' if any bit is still set after these
 * retries; 'false' otherwise.
 *
 * If a bit is set, that means there is a pending channel interrupt. The
 * expectation is that the normal interrupt handling mechanism will find and
 * process the channel interrupt "very soon", and in the process clear the bit.
 */
static bool hv_synic_event_pending(void)
{
	struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
	union hv_synic_event_flags *event =
		(union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
	unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
	bool pending;
	u32 relid;
	int tries = 0;

retry:
	pending = false;
	for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
		/* Special case - VMBus channel protocol messages */
		if (relid == 0)
			continue;
		pending = true;
		break;
	}
	if (pending && tries++ < HV_MAX_TRIES) {
		usleep_range(10000, 20000);
		goto retry;
	}
	return pending;
}

int hv_synic_cleanup(unsigned int cpu)
{
	struct vmbus_channel *channel, *sc;
	bool channel_found = false;

	if (vmbus_connection.conn_state != CONNECTED)
		goto always_cleanup;

	/*
	 * Hyper-V does not provide a way to change the connect CPU once
	 * it is set; we must prevent the connect CPU from going offline
	 * while the VM is running normally. But in the panic or kexec()
	 * path where the vmbus is already disconnected, the CPU must be
	 * allowed to shut down.
	 */
	if (cpu == VMBUS_CONNECT_CPU)
		return -EBUSY;

	/*
	 * Search for channels which are bound to the CPU we're about to
	 * cleanup. In case we find one and vmbus is still connected, we
	 * fail; this will effectively prevent CPU offlining.
	 *
	 * TODO: Re-bind the channels to different CPUs.
	 */
	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (channel->target_cpu == cpu) {
			channel_found = true;
			break;
		}
		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			if (sc->target_cpu == cpu) {
				channel_found = true;
				break;
			}
		}
		if (channel_found)
			break;
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	if (channel_found)
		return -EBUSY;

	/*
	 * channel_found == false means that any channels that were previously
	 * assigned to the CPU have been reassigned elsewhere with a call of
	 * vmbus_send_modifychannel(). Scan the event flags page looking for
	 * bits that are set and waiting with a timeout for vmbus_chan_sched()
	 * to process such bits. If bits are still set after this operation
	 * and VMBus is connected, fail the CPU offlining operation.
	 */
	if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
		return -EBUSY;

always_cleanup:
	hv_stimer_legacy_cleanup(cpu);

	hv_synic_disable_regs(cpu);

	return 0;
}