hv_balloon.c source code [linux/drivers/hv/hv_balloon.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (c) 2012, Microsoft Corporation.
4	*
5	* Author:
6	* K. Y. Srinivasan <kys@microsoft.com>
7	*/
8
9	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11	#include <linux/cleanup.h>
12	#include <linux/kernel.h>
13	#include <linux/jiffies.h>
14	#include <linux/mman.h>
15	#include <linux/debugfs.h>
16	#include <linux/delay.h>
17	#include <linux/init.h>
18	#include <linux/module.h>
19	#include <linux/slab.h>
20	#include <linux/kthread.h>
21	#include <linux/completion.h>
22	#include <linux/count_zeros.h>
23	#include <linux/memory_hotplug.h>
24	#include <linux/memory.h>
25	#include <linux/notifier.h>
26	#include <linux/percpu_counter.h>
27	#include <linux/page_reporting.h>
28
29	#include <linux/hyperv.h>
30	#include <asm/hyperv-tlfs.h>
31
32	#include <asm/mshyperv.h>
33
34	#define CREATE_TRACE_POINTS
35	#include "hv_trace_balloon.h"
36
37	/*
38	* We begin with definitions supporting the Dynamic Memory protocol
39	* with the host.
40	*
41	* Begin protocol definitions.
42	*/
43
44
45
/*
 * Protocol versions. The low word is the minor version, the high word the
 * major version.
 *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changes to 0.2 on 2009/05/14
 * Changes to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
 */

/* Pack a major/minor pair into the 32-bit wire representation. */
#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
/* Extract the major version (high word). */
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
/*
 * Extract the minor version. The minor version occupies the whole low
 * word (see union dm_version), so mask all 16 bits.
 */
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xffff)
61
62	enum {
63	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(`0`, `3`),
64	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(`1`, `0`),
65	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(`2`, `0`),
66
67	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
68	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
69	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,
70
71	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
72	};
73
74
75
/*
 * Message Types
 *
 * Values placed in dm_header.type. The *_MAX entries alias the highest
 * message type defined by the corresponding protocol version.
 */

enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR			= 0,
	DM_VERSION_REQUEST		= 1,
	DM_VERSION_RESPONSE		= 2,
	DM_CAPABILITIES_REPORT		= 3,
	DM_CAPABILITIES_RESPONSE	= 4,
	DM_STATUS_REPORT		= 5,
	DM_BALLOON_REQUEST		= 6,
	DM_BALLOON_RESPONSE		= 7,
	DM_UNBALLOON_REQUEST		= 8,
	DM_UNBALLOON_RESPONSE		= 9,
	DM_MEM_HOT_ADD_REQUEST		= 10,
	DM_MEM_HOT_ADD_RESPONSE		= 11,
	DM_VERSION_03_MAX		= 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE			= 12,
	DM_VERSION_1_MAX		= 12
};
103
104
105	/*
106	* Structures defining the dynamic memory management
107	* protocol.
108	*/
109
110	union dm_version {
111	struct {
112	__u16 minor_version;
113	__u16 major_version;
114	};
115	__u32 version;
116	} __packed;
117
118
119	union dm_caps {
120	struct {
121	__u64 balloon:`1`;
122	__u64 hot_add:`1`;
123	/*
124	* To support guests that may have alignment
125	* limitations on hot-add, the guest can specify
126	* its alignment requirements; a value of n
127	* represents an alignment of 2^n in mega bytes.
128	*/
129	__u64 hot_add_alignment:`4`;
130	__u64 reservedz:`58`;
131	} cap_bits;
132	__u64 caps;
133	} __packed;
134
135	union dm_mem_page_range {
136	struct {
137	/*
138	* The PFN number of the first page in the range.
139	* 40 bits is the architectural limit of a PFN
140	* number for AMD64.
141	*/
142	__u64 start_page:`40`;
143	/*
144	* The number of pages in the range.
145	*/
146	__u64 page_cnt:`24`;
147	} finfo;
148	__u64 page_range;
149	} __packed;
150
151
152
153	/*
154	* The header for all dynamic memory messages:
155	*
156	* type: Type of the message.
157	* size: Size of the message in bytes; including the header.
158	* trans_id: The guest is responsible for manufacturing this ID.
159	*/
160
161	struct dm_header {
162	__u16 type;
163	__u16 size;
164	__u32 trans_id;
165	} __packed;
166
167	/*
168	* A generic message format for dynamic memory.
169	* Specific message formats are defined later in the file.
170	*/
171
172	struct dm_message {
173	struct dm_header hdr;
174	__u8 data[]; / enclosed message /
175	} __packed;
176
177
178	/*
179	* Specific message types supporting the dynamic memory protocol.
180	*/
181
182	/*
183	* Version negotiation message. Sent from the guest to the host.
184	* The guest is free to try different versions until the host
185	* accepts the version.
186	*
187	* dm_version: The protocol version requested.
188	* is_last_attempt: If TRUE, this is the last version guest will request.
189	* reservedz: Reserved field, set to zero.
190	*/
191
192	struct dm_version_request {
193	struct dm_header hdr;
194	union dm_version version;
195	__u32 is_last_attempt:`1`;
196	__u32 reservedz:`31`;
197	} __packed;
198
199	/*
200	* Version response message; Host to Guest and indicates
201	* if the host has accepted the version sent by the guest.
202	*
203	* is_accepted: If TRUE, host has accepted the version and the guest
204	* should proceed to the next stage of the protocol. FALSE indicates that
205	* guest should re-try with a different version.
206	*
207	* reservedz: Reserved field, set to zero.
208	*/
209
210	struct dm_version_response {
211	struct dm_header hdr;
212	__u64 is_accepted:`1`;
213	__u64 reservedz:`63`;
214	} __packed;
215
216	/*
217	* Message reporting capabilities. This is sent from the guest to the
218	* host.
219	*/
220
221	struct dm_capabilities {
222	struct dm_header hdr;
223	union dm_caps caps;
224	__u64 min_page_cnt;
225	__u64 max_page_number;
226	} __packed;
227
228	/*
229	* Response to the capabilities message. This is sent from the host to the
230	* guest. This message notifies if the host has accepted the guest's
231	* capabilities. If the host has not accepted, the guest must shutdown
232	* the service.
233	*
234	* is_accepted: Indicates if the host has accepted guest's capabilities.
235	* reservedz: Must be 0.
236	*/
237
238	struct dm_capabilities_resp_msg {
239	struct dm_header hdr;
240	__u64 is_accepted:`1`;
241	__u64 reservedz:`63`;
242	} __packed;
243
244	/*
245	* This message is used to report memory pressure from the guest.
246	* This message is not part of any transaction and there is no
247	* response to this message.
248	*
249	* num_avail: Available memory in pages.
250	* num_committed: Committed memory in pages.
251	* page_file_size: The accumulated size of all page files
252	* in the system in pages.
253	* zero_free: The number of zero and free pages.
254	* page_file_writes: The writes to the page file in pages.
255	* io_diff: An indicator of file cache efficiency or page file activity,
256	* calculated as File Cache Page Fault Count - Page Read Count.
257	* This value is in pages.
258	*
259	* Some of these metrics are Windows specific and fortunately
260	* the algorithm on the host side that computes the guest memory
261	* pressure only uses num_committed value.
262	*/
263
264	struct dm_status {
265	struct dm_header hdr;
266	__u64 num_avail;
267	__u64 num_committed;
268	__u64 page_file_size;
269	__u64 zero_free;
270	__u32 page_file_writes;
271	__u32 io_diff;
272	} __packed;
273
274
275	/*
276	* Message to ask the guest to allocate memory - balloon up message.
277	* This message is sent from the host to the guest. The guest may not be
278	* able to allocate as much memory as requested.
279	*
280	* num_pages: number of pages to allocate.
281	*/
282
283	struct dm_balloon {
284	struct dm_header hdr;
285	__u32 num_pages;
286	__u32 reservedz;
287	} __packed;
288
289
290	/*
291	* Balloon response message; this message is sent from the guest
292	* to the host in response to the balloon message.
293	*
294	* reservedz: Reserved; must be set to zero.
295	* more_pages: If FALSE, this is the last message of the transaction.
296	* if TRUE there will atleast one more message from the guest.
297	*
298	* range_count: The number of ranges in the range array.
299	*
300	* range_array: An array of page ranges returned to the host.
301	*
302	*/
303
304	struct dm_balloon_response {
305	struct dm_header hdr;
306	__u32 reservedz;
307	__u32 more_pages:`1`;
308	__u32 range_count:`31`;
309	union dm_mem_page_range range_array[];
310	} __packed;
311
312	/*
313	* Un-balloon message; this message is sent from the host
314	* to the guest to give guest more memory.
315	*
316	* more_pages: If FALSE, this is the last message of the transaction.
317	* if TRUE there will atleast one more message from the guest.
318	*
319	* reservedz: Reserved; must be set to zero.
320	*
321	* range_count: The number of ranges in the range array.
322	*
323	* range_array: An array of page ranges returned to the host.
324	*
325	*/
326
327	struct dm_unballoon_request {
328	struct dm_header hdr;
329	__u32 more_pages:`1`;
330	__u32 reservedz:`31`;
331	__u32 range_count;
332	union dm_mem_page_range range_array[];
333	} __packed;
334
335	/*
336	* Un-balloon response message; this message is sent from the guest
337	* to the host in response to an unballoon request.
338	*
339	*/
340
341	struct dm_unballoon_response {
342	struct dm_header hdr;
343	} __packed;
344
345
346	/*
347	* Hot add request message. Message sent from the host to the guest.
348	*
349	* mem_range: Memory range to hot add.
350	*
351	*/
352
353	struct dm_hot_add {
354	struct dm_header hdr;
355	union dm_mem_page_range range;
356	} __packed;
357
358	/*
359	* Hot add response message.
360	* This message is sent by the guest to report the status of a hot add request.
361	* If page_count is less than the requested page count, then the host should
362	* assume all further hot add requests will fail, since this indicates that
363	* the guest has hit an upper physical memory barrier.
364	*
365	* Hot adds may also fail due to low resources; in this case, the guest must
366	* not complete this message until the hot add can succeed, and the host must
367	* not send a new hot add request until the response is sent.
368	* If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
369	* times it fails the request.
370	*
371	*
372	* page_count: number of pages that were successfully hot added.
373	*
374	* result: result of the operation 1: success, 0: failure.
375	*
376	*/
377
378	struct dm_hot_add_response {
379	struct dm_header hdr;
380	__u32 page_count;
381	__u32 result;
382	} __packed;
383
/*
 * Types of information sent from host to the guest.
 * MAX_INFO_TYPE is one past the last defined type.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};
392
393
394	/*
395	* Header for the information message.
396	*/
397
398	struct dm_info_header {
399	enum dm_info_type type;
400	__u32 data_size;
401	} __packed;
402
403	/*
404	* This message is sent from the host to the guest to pass
405	* some relevant information (win8 addition).
406	*
407	* reserved: no used.
408	* info_size: size of the information blob.
409	* info: information blob.
410	*/
411
412	struct dm_info_msg {
413	struct dm_header hdr;
414	__u32 reserved;
415	__u32 info_size;
416	__u8 info[];
417	};
418
419	/*
420	* End protocol definitions.
421	*/
422
423	/*
424	* State to manage hot adding memory into the guest.
425	* The range start_pfn : end_pfn specifies the range
426	* that the host has asked us to hot add. The range
427	* start_pfn : ha_end_pfn specifies the range that we have
428	* currently hot added. We hot add in multiples of 128M
429	* chunks; it is possible that we may not be able to bring
430	* online all the pages in the region. The range
431	* covered_start_pfn:covered_end_pfn defines the pages that can
432	* be brough online.
433	*/
434
435	struct hv_hotadd_state {
436	struct list_head list;
437	unsigned long start_pfn;
438	unsigned long covered_start_pfn;
439	unsigned long covered_end_pfn;
440	unsigned long ha_end_pfn;
441	unsigned long end_pfn;
442	/*
443	* A list of gaps.
444	*/
445	struct list_head gap_list;
446	};
447
448	struct hv_hotadd_gap {
449	struct list_head list;
450	unsigned long start_pfn;
451	unsigned long end_pfn;
452	};
453
454	struct balloon_state {
455	__u32 num_pages;
456	struct work_struct wrk;
457	};
458
459	struct hot_add_wrk {
460	union dm_mem_page_range ha_page_range;
461	union dm_mem_page_range ha_region_range;
462	struct work_struct wrk;
463	};
464
465	static bool allow_hibernation;
466	static bool hot_add = true;
467	static bool do_hot_add;
468	/*
469	* Delay reporting memory pressure by
470	* the specified number of seconds.
471	*/
472	static uint pressure_report_delay = `45`;
473	extern unsigned int page_reporting_order;
474	#define HV_MAX_FAILURES 2
475
476	/*
477	* The last time we posted a pressure report to host.
478	*/
479	static unsigned long last_post_time;
480
481	static int hv_hypercall_multi_failure;
482
483	module_param(hot_add, bool, (S_IRUGO \| S_IWUSR));
484	MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
485
486	module_param(pressure_report_delay, uint, (S_IRUGO \| S_IWUSR));
487	MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
488	static atomic_t trans_id = ATOMIC_INIT(`0`);
489
490	static int dm_ring_size = VMBUS_RING_SIZE(`16` * `1024`);
491
/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};
504
505
506	static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
507	static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
508	#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
509	#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
510
511	struct hv_dynmem_device {
512	struct hv_device *dev;
513	enum hv_dm_state state;
514	struct completion host_event;
515	struct completion config_event;
516
517	/*
518	* Number of pages we have currently ballooned out.
519	*/
520	unsigned int num_pages_ballooned;
521	unsigned int num_pages_onlined;
522	unsigned int num_pages_added;
523
524	/*
525	* State to manage the ballooning (up) operation.
526	*/
527	struct balloon_state balloon_wrk;
528
529	/*
530	* State to execute the "hot-add" operation.
531	*/
532	struct hot_add_wrk ha_wrk;
533
534	/*
535	* This state tracks if the host has specified a hot-add
536	* region.
537	*/
538	bool host_specified_ha_region;
539
540	/*
541	* State to synchronize hot-add.
542	*/
543	struct completion ol_waitevent;
544	/*
545	* This thread handles hot-add
546	* requests from the host as well as notifying
547	* the host with regards to memory pressure in
548	* the guest.
549	*/
550	struct task_struct *thread;
551
552	/*
553	* Protects ha_region_list, num_pages_onlined counter and individual
554	* regions from ha_region_list.
555	*/
556	spinlock_t ha_lock;
557
558	/*
559	* A list of hot-add regions.
560	*/
561	struct list_head ha_region_list;
562
563	/*
564	* We start with the highest version we can support
565	* and downgrade based on the host; we save here the
566	* next version to try.
567	*/
568	__u32 next_version;
569
570	/*
571	* The negotiated version agreed by host.
572	*/
573	__u32 version;
574
575	struct page_reporting_dev_info pr_dev_info;
576
577	/*
578	* Maximum number of pages that can be hot_add-ed
579	*/
580	__u64 max_dynamic_page_count;
581	};
582
583	static struct hv_dynmem_device dm_device;
584
585	static void post_status(struct hv_dynmem_device *dm);
586
587	static void enable_page_reporting(void);
588
589	static void disable_page_reporting(void);
590
591	#ifdef CONFIG_MEMORY_HOTPLUG
592	static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
593	unsigned long pfn)
594	{
595	struct hv_hotadd_gap *gap;
596
597	/ The page is not backed. /
598	if ((pfn < has->covered_start_pfn) \|\| (pfn >= has->covered_end_pfn))
599	return false;
600
601	/ Check for gaps. /
602	list_for_each_entry(gap, &has->gap_list, list) {
603	if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
604	return false;
605	}
606
607	return true;
608	}
609
610	static unsigned long hv_page_offline_check(unsigned long start_pfn,
611	unsigned long nr_pages)
612	{
613	unsigned long pfn = start_pfn, count = `0`;
614	struct hv_hotadd_state *has;
615	bool found;
616
617	while (pfn < start_pfn + nr_pages) {
618	/*
619	* Search for HAS which covers the pfn and when we find one
620	* count how many consequitive PFNs are covered.
621	*/
622	found = false;
623	list_for_each_entry(has, &dm_device.ha_region_list, list) {
624	while ((pfn >= has->start_pfn) &&
625	(pfn < has->end_pfn) &&
626	(pfn < start_pfn + nr_pages)) {
627	found = true;
628	if (has_pfn_is_backed(has, pfn))
629	count++;
630	pfn++;
631	}
632	}
633
634	/*
635	* This PFN is not in any HAS (e.g. we're offlining a region
636	* which was present at boot), no need to account for it. Go
637	* to the next one.
638	*/
639	if (!found)
640	pfn++;
641	}
642
643	return count;
644	}
645
646	static int hv_memory_notifier(struct notifier_block nb, unsigned* long val,
647	void *v)
648	{
649	struct memory_notify mem = (struct* memory_notify *)v;
650	unsigned long pfn_count;
651
652	switch (val) {
653	case MEM_ONLINE:
654	case MEM_CANCEL_ONLINE:
655	complete(&dm_device.ol_waitevent);
656	break;
657
658	case MEM_OFFLINE:
659	scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
660	pfn_count = hv_page_offline_check(start_pfn: mem->start_pfn,
661	nr_pages: mem->nr_pages);
662	if (pfn_count <= dm_device.num_pages_onlined) {
663	dm_device.num_pages_onlined -= pfn_count;
664	} else {
665	/*
666	* We're offlining more pages than we
667	* managed to online. This is
668	* unexpected. In any case don't let
669	* num_pages_onlined wrap around zero.
670	*/
671	WARN_ON_ONCE(`1`);
672	dm_device.num_pages_onlined = `0`;
673	}
674	}
675	break;
676	case MEM_GOING_ONLINE:
677	case MEM_GOING_OFFLINE:
678	case MEM_CANCEL_OFFLINE:
679	break;
680	}
681	return NOTIFY_OK;
682	}
683
684	static struct notifier_block hv_memory_nb = {
685	.notifier_call = hv_memory_notifier,
686	.priority = `0`
687	};
688
689	/ Check if the particular page is backed and can be onlined and online it. /
690	static void hv_page_online_one(struct hv_hotadd_state has, struct* page *pg)
691	{
692	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
693	if (!PageOffline(page: pg))
694	__SetPageOffline(page: pg);
695	return;
696	}
697	if (PageOffline(page: pg))
698	__ClearPageOffline(page: pg);
699
700	/ This frame is currently backed; online the page. /
701	generic_online_page(page: pg, order: `0`);
702
703	lockdep_assert_held(&dm_device.ha_lock);
704	dm_device.num_pages_onlined++;
705	}
706
/*
 * Run hv_page_online_one() on each of the @size pages starting at
 * @start_pfn, all evaluated against hot-add region @has.
 */
static void hv_bring_pgs_online(struct hv_hotadd_state *has,
				unsigned long start_pfn, unsigned long size)
{
	/* Same type as @size: avoids a signed/unsigned comparison. */
	unsigned long i;

	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
	for (i = 0; i < size; i++)
		hv_page_online_one(has, pfn_to_page(start_pfn + i));
}
716
717	static void hv_mem_hot_add(unsigned long start, unsigned long size,
718	unsigned long pfn_count,
719	struct hv_hotadd_state *has)
720	{
721	int ret = `0`;
722	int i, nid;
723	unsigned long start_pfn;
724	unsigned long processed_pfn;
725	unsigned long total_pfn = pfn_count;
726
727	for (i = `0`; i < (size/HA_CHUNK); i++) {
728	start_pfn = start + (i * HA_CHUNK);
729
730	scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
731	has->ha_end_pfn += HA_CHUNK;
732
733	if (total_pfn > HA_CHUNK) {
734	processed_pfn = HA_CHUNK;
735	total_pfn -= HA_CHUNK;
736	} else {
737	processed_pfn = total_pfn;
738	total_pfn = `0`;
739	}
740
741	has->covered_end_pfn += processed_pfn;
742	}
743
744	reinit_completion(x: &dm_device.ol_waitevent);
745
746	nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
747	ret = add_memory(nid, PFN_PHYS((start_pfn)),
748	size: (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
749
750	if (ret) {
751	pr_err("hot_add memory failed error is %d\n", ret);
752	if (ret == -EEXIST) {
753	/*
754	* This error indicates that the error
755	* is not a transient failure. This is the
756	* case where the guest's physical address map
757	* precludes hot adding memory. Stop all further
758	* memory hot-add.
759	*/
760	do_hot_add = false;
761	}
762	scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
763	has->ha_end_pfn -= HA_CHUNK;
764	has->covered_end_pfn -= processed_pfn;
765	}
766	break;
767	}
768
769	/*
770	* Wait for memory to get onlined. If the kernel onlined the
771	* memory when adding it, this will return directly. Otherwise,
772	* it will wait for user space to online the memory. This helps
773	* to avoid adding memory faster than it is getting onlined. As
774	* adding succeeded, it is ok to proceed even if the memory was
775	* not onlined in time.
776	*/
777	wait_for_completion_timeout(x: &dm_device.ol_waitevent, timeout: `5` * HZ);
778	post_status(dm: &dm_device);
779	}
780	}
781
782	static void hv_online_page(struct page pg, unsigned* int order)
783	{
784	struct hv_hotadd_state *has;
785	unsigned long pfn = page_to_pfn(pg);
786
787	guard(spinlock_irqsave)(l: &dm_device.ha_lock);
788	list_for_each_entry(has, &dm_device.ha_region_list, list) {
789	/ The page belongs to a different HAS. /
790	if ((pfn < has->start_pfn) \|\|
791	(pfn + (`1UL` << order) > has->end_pfn))
792	continue;
793
794	hv_bring_pgs_online(has, start_pfn: pfn, size: `1UL` << order);
795	break;
796	}
797	}
798
799	static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
800	{
801	struct hv_hotadd_state *has;
802	struct hv_hotadd_gap *gap;
803	unsigned long residual, new_inc;
804	int ret = `0`;
805
806	guard(spinlock_irqsave)(l: &dm_device.ha_lock);
807	list_for_each_entry(has, &dm_device.ha_region_list, list) {
808	/*
809	* If the pfn range we are dealing with is not in the current
810	* "hot add block", move on.
811	*/
812	if (start_pfn < has->start_pfn \|\| start_pfn >= has->end_pfn)
813	continue;
814
815	/*
816	* If the current start pfn is not where the covered_end
817	* is, create a gap and update covered_end_pfn.
818	*/
819	if (has->covered_end_pfn != start_pfn) {
820	gap = kzalloc(size: sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
821	if (!gap) {
822	ret = -ENOMEM;
823	break;
824	}
825
826	INIT_LIST_HEAD(list: &gap->list);
827	gap->start_pfn = has->covered_end_pfn;
828	gap->end_pfn = start_pfn;
829	list_add_tail(new: &gap->list, head: &has->gap_list);
830
831	has->covered_end_pfn = start_pfn;
832	}
833
834	/*
835	* If the current hot add-request extends beyond
836	* our current limit; extend it.
837	*/
838	if ((start_pfn + pfn_cnt) > has->end_pfn) {
839	residual = (start_pfn + pfn_cnt - has->end_pfn);
840	/*
841	* Extend the region by multiples of HA_CHUNK.
842	*/
843	new_inc = (residual / HA_CHUNK) * HA_CHUNK;
844	if (residual % HA_CHUNK)
845	new_inc += HA_CHUNK;
846
847	has->end_pfn += new_inc;
848	}
849
850	ret = `1`;
851	break;
852	}
853
854	return ret;
855	}
856
857	static unsigned long handle_pg_range(unsigned long pg_start,
858	unsigned long pg_count)
859	{
860	unsigned long start_pfn = pg_start;
861	unsigned long pfn_cnt = pg_count;
862	unsigned long size;
863	struct hv_hotadd_state *has;
864	unsigned long pgs_ol = `0`;
865	unsigned long old_covered_state;
866	unsigned long res = `0`, flags;
867
868	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
869	pg_start);
870
871	spin_lock_irqsave(&dm_device.ha_lock, flags);
872	list_for_each_entry(has, &dm_device.ha_region_list, list) {
873	/*
874	* If the pfn range we are dealing with is not in the current
875	* "hot add block", move on.
876	*/
877	if (start_pfn < has->start_pfn \|\| start_pfn >= has->end_pfn)
878	continue;
879
880	old_covered_state = has->covered_end_pfn;
881
882	if (start_pfn < has->ha_end_pfn) {
883	/*
884	* This is the case where we are backing pages
885	* in an already hot added region. Bring
886	* these pages online first.
887	*/
888	pgs_ol = has->ha_end_pfn - start_pfn;
889	if (pgs_ol > pfn_cnt)
890	pgs_ol = pfn_cnt;
891
892	has->covered_end_pfn += pgs_ol;
893	pfn_cnt -= pgs_ol;
894	/*
895	* Check if the corresponding memory block is already
896	* online. It is possible to observe struct pages still
897	* being uninitialized here so check section instead.
898	* In case the section is online we need to bring the
899	* rest of pfns (which were not backed previously)
900	* online too.
901	*/
902	if (start_pfn > has->start_pfn &&
903	online_section_nr(nr: pfn_to_section_nr(pfn: start_pfn)))
904	hv_bring_pgs_online(has, start_pfn, size: pgs_ol);
905
906	}
907
908	if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > `0`)) {
909	/*
910	* We have some residual hot add range
911	* that needs to be hot added; hot add
912	* it now. Hot add a multiple of
913	* HA_CHUNK that fully covers the pages
914	* we have.
915	*/
916	size = (has->end_pfn - has->ha_end_pfn);
917	if (pfn_cnt <= size) {
918	size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
919	if (pfn_cnt % HA_CHUNK)
920	size += HA_CHUNK;
921	} else {
922	pfn_cnt = size;
923	}
924	spin_unlock_irqrestore(lock: &dm_device.ha_lock, flags);
925	hv_mem_hot_add(start: has->ha_end_pfn, size, pfn_count: pfn_cnt, has);
926	spin_lock_irqsave(&dm_device.ha_lock, flags);
927	}
928	/*
929	* If we managed to online any pages that were given to us,
930	* we declare success.
931	*/
932	res = has->covered_end_pfn - old_covered_state;
933	break;
934	}
935	spin_unlock_irqrestore(lock: &dm_device.ha_lock, flags);
936
937	return res;
938	}
939
940	static unsigned long process_hot_add(unsigned long pg_start,
941	unsigned long pfn_cnt,
942	unsigned long rg_start,
943	unsigned long rg_size)
944	{
945	struct hv_hotadd_state *ha_region = NULL;
946	int covered;
947
948	if (pfn_cnt == `0`)
949	return `0`;
950
951	if (!dm_device.host_specified_ha_region) {
952	covered = pfn_covered(start_pfn: pg_start, pfn_cnt);
953	if (covered < `0`)
954	return `0`;
955
956	if (covered)
957	goto do_pg_range;
958	}
959
960	/*
961	* If the host has specified a hot-add range; deal with it first.
962	*/
963
964	if (rg_size != `0`) {
965	ha_region = kzalloc(size: sizeof(struct hv_hotadd_state), GFP_KERNEL);
966	if (!ha_region)
967	return `0`;
968
969	INIT_LIST_HEAD(list: &ha_region->list);
970	INIT_LIST_HEAD(list: &ha_region->gap_list);
971
972	ha_region->start_pfn = rg_start;
973	ha_region->ha_end_pfn = rg_start;
974	ha_region->covered_start_pfn = pg_start;
975	ha_region->covered_end_pfn = pg_start;
976	ha_region->end_pfn = rg_start + rg_size;
977
978	scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
979	list_add_tail(new: &ha_region->list, head: &dm_device.ha_region_list);
980	}
981	}
982
983	do_pg_range:
984	/*
985	* Process the page range specified; bringing them
986	* online if possible.
987	*/
988	return handle_pg_range(pg_start, pg_count: pfn_cnt);
989	}
990
991	#endif
992
993	static void hot_add_req(struct work_struct *dummy)
994	{
995	struct dm_hot_add_response resp;
996	#ifdef CONFIG_MEMORY_HOTPLUG
997	unsigned long pg_start, pfn_cnt;
998	unsigned long rg_start, rg_sz;
999	#endif
1000	struct hv_dynmem_device *dm = &dm_device;
1001
1002	memset(&resp, `0`, sizeof(struct dm_hot_add_response));
1003	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
1004	resp.hdr.size = sizeof(struct dm_hot_add_response);
1005
1006	#ifdef CONFIG_MEMORY_HOTPLUG
1007	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
1008	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
1009
1010	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
1011	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
1012
1013	if ((rg_start == `0`) && (!dm->host_specified_ha_region)) {
1014	unsigned long region_size;
1015	unsigned long region_start;
1016
1017	/*
1018	* The host has not specified the hot-add region.
1019	* Based on the hot-add page range being specified,
1020	* compute a hot-add region that can cover the pages
1021	* that need to be hot-added while ensuring the alignment
1022	* and size requirements of Linux as it relates to hot-add.
1023	*/
1024	region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
1025	if (pfn_cnt % HA_CHUNK)
1026	region_size += HA_CHUNK;
1027
1028	region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
1029
1030	rg_start = region_start;
1031	rg_sz = region_size;
1032	}
1033
1034	if (do_hot_add)
1035	resp.page_count = process_hot_add(pg_start, pfn_cnt,
1036	rg_start, rg_size: rg_sz);
1037
1038	dm->num_pages_added += resp.page_count;
1039	#endif
1040	/*
1041	* The result field of the response structure has the
1042	* following semantics:
1043	*
1044	* 1. If all or some pages hot-added: Guest should return success.
1045	*
1046	* 2. If no pages could be hot-added:
1047	*
1048	* If the guest returns success, then the host
1049	* will not attempt any further hot-add operations. This
1050	* signifies a permanent failure.
1051	*
1052	* If the guest returns failure, then this failure will be
1053	* treated as a transient failure and the host may retry the
1054	* hot-add operation after some delay.
1055	*/
1056	if (resp.page_count > `0`)
1057	resp.result = `1`;
1058	else if (!do_hot_add)
1059	resp.result = `1`;
1060	else
1061	resp.result = `0`;
1062
1063	if (!do_hot_add \|\| resp.page_count == `0`) {
1064	if (!allow_hibernation)
1065	pr_err("Memory hot add failed\n");
1066	else
1067	pr_info("Ignore hot-add request!\n");
1068	}
1069
1070	dm->state = DM_INITIALIZED;
1071	resp.hdr.trans_id = atomic_inc_return(v: &trans_id);
1072	vmbus_sendpacket(channel: dm->dev->channel, buffer: &resp,
1073	bufferLen: sizeof(struct dm_hot_add_response),
1074	requestid: (unsigned long)NULL,
1075	type: VM_PKT_DATA_INBAND, flags: `0`);
1076	}
1077
1078	static void process_info(struct hv_dynmem_device dm, struct* dm_info_msg *msg)
1079	{
1080	struct dm_info_header *info_hdr;
1081
1082	info_hdr = (struct dm_info_header *)msg->info;
1083
1084	switch (info_hdr->type) {
1085	case INFO_TYPE_MAX_PAGE_CNT:
1086	if (info_hdr->data_size == sizeof(__u64)) {
1087	__u64 max_page_count = (__u64 )&info_hdr[`1`];
1088
1089	pr_info("Max. dynamic memory size: %llu MB\n",
1090	(*max_page_count) >> (`20` - HV_HYP_PAGE_SHIFT));
1091	dm->max_dynamic_page_count = *max_page_count;
1092	}
1093
1094	break;
1095	default:
1096	pr_warn("Received Unknown type: %d\n", info_hdr->type);
1097	}
1098	}
1099
1100	static unsigned long compute_balloon_floor(void)
1101	{
1102	unsigned long min_pages;
1103	unsigned long nr_pages = totalram_pages();
1104	#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
1105	/ Simple continuous piecewiese linear function:*
1106	* max MiB -> min MiB gradient
1107	* 0 0
1108	* 16 16
1109	* 32 24
1110	* 128 72 (1/2)
1111	* 512 168 (1/4)
1112	* 2048 360 (1/8)
1113	* 8192 744 (1/16)
1114	* 32768 1512 (1/32)
1115	*/
1116	if (nr_pages < MB2PAGES(`128`))
1117	min_pages = MB2PAGES(`8`) + (nr_pages >> `1`);
1118	else if (nr_pages < MB2PAGES(`512`))
1119	min_pages = MB2PAGES(`40`) + (nr_pages >> `2`);
1120	else if (nr_pages < MB2PAGES(`2048`))
1121	min_pages = MB2PAGES(`104`) + (nr_pages >> `3`);
1122	else if (nr_pages < MB2PAGES(`8192`))
1123	min_pages = MB2PAGES(`232`) + (nr_pages >> `4`);
1124	else
1125	min_pages = MB2PAGES(`488`) + (nr_pages >> `5`);
1126	#undef MB2PAGES
1127	return min_pages;
1128	}
1129
1130	/*
1131	* Compute total committed memory pages
1132	*/
1133
1134	static unsigned long get_pages_committed(struct hv_dynmem_device *dm)
1135	{
1136	return vm_memory_committed() +
1137	dm->num_pages_ballooned +
1138	(dm->num_pages_added > dm->num_pages_onlined ?
1139	dm->num_pages_added - dm->num_pages_onlined : `0`) +
1140	compute_balloon_floor();
1141	}
1142
1143	/*
1144	* Post our status as it relates memory pressure to the
1145	* host. Host expects the guests to post this status
1146	* periodically at 1 second intervals.
1147	*
1148	* The metrics specified in this protocol are very Windows
1149	* specific and so we cook up numbers here to convey our memory
1150	* pressure.
1151	*/
1152
1153	static void post_status(struct hv_dynmem_device *dm)
1154	{
1155	struct dm_status status;
1156	unsigned long now = jiffies;
1157	unsigned long last_post = last_post_time;
1158	unsigned long num_pages_avail, num_pages_committed;
1159
1160	if (pressure_report_delay > `0`) {
1161	--pressure_report_delay;
1162	return;
1163	}
1164
1165	if (!time_after(now, (last_post_time + HZ)))
1166	return;
1167
1168	memset(&status, `0`, sizeof(struct dm_status));
1169	status.hdr.type = DM_STATUS_REPORT;
1170	status.hdr.size = sizeof(struct dm_status);
1171	status.hdr.trans_id = atomic_inc_return(v: &trans_id);
1172
1173	/*
1174	* The host expects the guest to report free and committed memory.
1175	* Furthermore, the host expects the pressure information to include
1176	* the ballooned out pages. For a given amount of memory that we are
1177	* managing we need to compute a floor below which we should not
1178	* balloon. Compute this and add it to the pressure report.
1179	* We also need to report all offline pages (num_pages_added -
1180	* num_pages_onlined) as committed to the host, otherwise it can try
1181	* asking us to balloon them out.
1182	*/
1183	num_pages_avail = si_mem_available();
1184	num_pages_committed = get_pages_committed(dm);
1185
1186	trace_balloon_status(available: num_pages_avail, committed: num_pages_committed,
1187	vm_memory_committed: vm_memory_committed(), pages_ballooned: dm->num_pages_ballooned,
1188	pages_added: dm->num_pages_added, pages_onlined: dm->num_pages_onlined);
1189
1190	/ Convert numbers of pages into numbers of HV_HYP_PAGEs. /
1191	status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE;
1192	status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE;
1193
1194	/*
1195	* If our transaction ID is no longer current, just don't
1196	* send the status. This can happen if we were interrupted
1197	* after we picked our transaction ID.
1198	*/
1199	if (status.hdr.trans_id != atomic_read(v: &trans_id))
1200	return;
1201
1202	/*
1203	* If the last post time that we sampled has changed,
1204	* we have raced, don't post the status.
1205	*/
1206	if (last_post != last_post_time)
1207	return;
1208
1209	last_post_time = jiffies;
1210	vmbus_sendpacket(channel: dm->dev->channel, buffer: &status,
1211	bufferLen: sizeof(struct dm_status),
1212	requestid: (unsigned long)NULL,
1213	type: VM_PKT_DATA_INBAND, flags: `0`);
1214
1215	}
1216
1217	static void free_balloon_pages(struct hv_dynmem_device *dm,
1218	union dm_mem_page_range *range_array)
1219	{
1220	int num_pages = range_array->finfo.page_cnt;
1221	__u64 start_frame = range_array->finfo.start_page;
1222	struct page *pg;
1223	int i;
1224
1225	for (i = `0`; i < num_pages; i++) {
1226	pg = pfn_to_page(i + start_frame);
1227	__ClearPageOffline(page: pg);
1228	__free_page(pg);
1229	dm->num_pages_ballooned--;
1230	adjust_managed_page_count(page: pg, count: `1`);
1231	}
1232	}
1233
1234
1235
1236	static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
1237	unsigned int num_pages,
1238	struct dm_balloon_response *bl_resp,
1239	int alloc_unit)
1240	{
1241	unsigned int i, j;
1242	struct page *pg;
1243
1244	for (i = `0`; i < num_pages / alloc_unit; i++) {
1245	if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
1246	HV_HYP_PAGE_SIZE)
1247	return i * alloc_unit;
1248
1249	/*
1250	* We execute this code in a thread context. Furthermore,
1251	* we don't want the kernel to try too hard.
1252	*/
1253	pg = alloc_pages(GFP_HIGHUSER \| __GFP_NORETRY \|
1254	__GFP_NOMEMALLOC \| __GFP_NOWARN,
1255	order: get_order(size: alloc_unit << PAGE_SHIFT));
1256
1257	if (!pg)
1258	return i * alloc_unit;
1259
1260	dm->num_pages_ballooned += alloc_unit;
1261
1262	/*
1263	* If we allocatted 2M pages; split them so we
1264	* can free them in any order we get.
1265	*/
1266
1267	if (alloc_unit != `1`)
1268	split_page(page: pg, order: get_order(size: alloc_unit << PAGE_SHIFT));
1269
1270	/ mark all pages offline /
1271	for (j = `0`; j < alloc_unit; j++) {
1272	__SetPageOffline(page: pg + j);
1273	adjust_managed_page_count(page: pg + j, count: -`1`);
1274	}
1275
1276	bl_resp->range_count++;
1277	bl_resp->range_array[i].finfo.start_page =
1278	page_to_pfn(pg);
1279	bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
1280	bl_resp->hdr.size += sizeof(union dm_mem_page_range);
1281
1282	}
1283
1284	return i * alloc_unit;
1285	}
1286
1287	static void balloon_up(struct work_struct *dummy)
1288	{
1289	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
1290	unsigned int num_ballooned = `0`;
1291	struct dm_balloon_response *bl_resp;
1292	int alloc_unit;
1293	int ret;
1294	bool done = false;
1295	int i;
1296	long avail_pages;
1297	unsigned long floor;
1298
1299	/*
1300	* We will attempt 2M allocations. However, if we fail to
1301	* allocate 2M chunks, we will go back to PAGE_SIZE allocations.
1302	*/
1303	alloc_unit = PAGES_IN_2M;
1304
1305	avail_pages = si_mem_available();
1306	floor = compute_balloon_floor();
1307
1308	/ Refuse to balloon below the floor. /
1309	if (avail_pages < num_pages \|\| avail_pages - num_pages < floor) {
1310	pr_info("Balloon request will be partially fulfilled. %s\n",
1311	avail_pages < num_pages ? "Not enough memory." :
1312	"Balloon floor reached.");
1313
1314	num_pages = avail_pages > floor ? (avail_pages - floor) : `0`;
1315	}
1316
1317	while (!done) {
1318	memset(balloon_up_send_buffer, `0`, HV_HYP_PAGE_SIZE);
1319	bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
1320	bl_resp->hdr.type = DM_BALLOON_RESPONSE;
1321	bl_resp->hdr.size = sizeof(struct dm_balloon_response);
1322	bl_resp->more_pages = `1`;
1323
1324	num_pages -= num_ballooned;
1325	num_ballooned = alloc_balloon_pages(dm: &dm_device, num_pages,
1326	bl_resp, alloc_unit);
1327
1328	if (alloc_unit != `1` && num_ballooned == `0`) {
1329	alloc_unit = `1`;
1330	continue;
1331	}
1332
1333	if (num_ballooned == `0` \|\| num_ballooned == num_pages) {
1334	pr_debug("Ballooned %u out of %u requested pages.\n",
1335	num_pages, dm_device.balloon_wrk.num_pages);
1336
1337	bl_resp->more_pages = `0`;
1338	done = true;
1339	dm_device.state = DM_INITIALIZED;
1340	}
1341
1342	/*
1343	* We are pushing a lot of data through the channel;
1344	* deal with transient failures caused because of the
1345	* lack of space in the ring buffer.
1346	*/
1347
1348	do {
1349	bl_resp->hdr.trans_id = atomic_inc_return(v: &trans_id);
1350	ret = vmbus_sendpacket(channel: dm_device.dev->channel,
1351	buffer: bl_resp,
1352	bufferLen: bl_resp->hdr.size,
1353	requestid: (unsigned long)NULL,
1354	type: VM_PKT_DATA_INBAND, flags: `0`);
1355
1356	if (ret == -EAGAIN)
1357	msleep(msecs: `20`);
1358	post_status(dm: &dm_device);
1359	} while (ret == -EAGAIN);
1360
1361	if (ret) {
1362	/*
1363	* Free up the memory we allocatted.
1364	*/
1365	pr_err("Balloon response failed\n");
1366
1367	for (i = `0`; i < bl_resp->range_count; i++)
1368	free_balloon_pages(dm: &dm_device,
1369	range_array: &bl_resp->range_array[i]);
1370
1371	done = true;
1372	}
1373	}
1374
1375	}
1376
1377	static void balloon_down(struct hv_dynmem_device *dm,
1378	struct dm_unballoon_request *req)
1379	{
1380	union dm_mem_page_range *range_array = req->range_array;
1381	int range_count = req->range_count;
1382	struct dm_unballoon_response resp;
1383	int i;
1384	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
1385
1386	for (i = `0`; i < range_count; i++) {
1387	free_balloon_pages(dm, range_array: &range_array[i]);
1388	complete(&dm_device.config_event);
1389	}
1390
1391	pr_debug("Freed %u ballooned pages.\n",
1392	prev_pages_ballooned - dm->num_pages_ballooned);
1393
1394	if (req->more_pages == `1`)
1395	return;
1396
1397	memset(&resp, `0`, sizeof(struct dm_unballoon_response));
1398	resp.hdr.type = DM_UNBALLOON_RESPONSE;
1399	resp.hdr.trans_id = atomic_inc_return(v: &trans_id);
1400	resp.hdr.size = sizeof(struct dm_unballoon_response);
1401
1402	vmbus_sendpacket(channel: dm_device.dev->channel, buffer: &resp,
1403	bufferLen: sizeof(struct dm_unballoon_response),
1404	requestid: (unsigned long)NULL,
1405	type: VM_PKT_DATA_INBAND, flags: `0`);
1406
1407	dm->state = DM_INITIALIZED;
1408	}
1409
1410	static void balloon_onchannelcallback(void *context);
1411
1412	static int dm_thread_func(void *dm_dev)
1413	{
1414	struct hv_dynmem_device *dm = dm_dev;
1415
1416	while (!kthread_should_stop()) {
1417	wait_for_completion_interruptible_timeout(
1418	x: &dm_device.config_event, timeout: `1`*HZ);
1419	/*
1420	* The host expects us to post information on the memory
1421	* pressure every second.
1422	*/
1423	reinit_completion(x: &dm_device.config_event);
1424	post_status(dm);
1425	/*
1426	* disable free page reporting if multiple hypercall
1427	* failure flag set. It is not done in the page_reporting
1428	* callback context as that causes a deadlock between
1429	* page_reporting_process() and page_reporting_unregister()
1430	*/
1431	if (hv_hypercall_multi_failure >= HV_MAX_FAILURES) {
1432	pr_err("Multiple failures in cold memory discard hypercall, disabling page reporting\n");
1433	disable_page_reporting();
1434	/ Reset the flag after disabling reporting /
1435	hv_hypercall_multi_failure = `0`;
1436	}
1437	}
1438
1439	return `0`;
1440	}
1441
1442
1443	static void version_resp(struct hv_dynmem_device *dm,
1444	struct dm_version_response *vresp)
1445	{
1446	struct dm_version_request version_req;
1447	int ret;
1448
1449	if (vresp->is_accepted) {
1450	/*
1451	* We are done; wakeup the
1452	* context waiting for version
1453	* negotiation.
1454	*/
1455	complete(&dm->host_event);
1456	return;
1457	}
1458	/*
1459	* If there are more versions to try, continue
1460	* with negotiations; if not
1461	* shutdown the service since we are not able
1462	* to negotiate a suitable version number
1463	* with the host.
1464	*/
1465	if (dm->next_version == `0`)
1466	goto version_error;
1467
1468	memset(&version_req, `0`, sizeof(struct dm_version_request));
1469	version_req.hdr.type = DM_VERSION_REQUEST;
1470	version_req.hdr.size = sizeof(struct dm_version_request);
1471	version_req.hdr.trans_id = atomic_inc_return(v: &trans_id);
1472	version_req.version.version = dm->next_version;
1473	dm->version = version_req.version.version;
1474
1475	/*
1476	* Set the next version to try in case current version fails.
1477	* Win7 protocol ought to be the last one to try.
1478	*/
1479	switch (version_req.version.version) {
1480	case DYNMEM_PROTOCOL_VERSION_WIN8:
1481	dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
1482	version_req.is_last_attempt = `0`;
1483	break;
1484	default:
1485	dm->next_version = `0`;
1486	version_req.is_last_attempt = `1`;
1487	}
1488
1489	ret = vmbus_sendpacket(channel: dm->dev->channel, buffer: &version_req,
1490	bufferLen: sizeof(struct dm_version_request),
1491	requestid: (unsigned long)NULL,
1492	type: VM_PKT_DATA_INBAND, flags: `0`);
1493
1494	if (ret)
1495	goto version_error;
1496
1497	return;
1498
1499	version_error:
1500	dm->state = DM_INIT_ERROR;
1501	complete(&dm->host_event);
1502	}
1503
1504	static void cap_resp(struct hv_dynmem_device *dm,
1505	struct dm_capabilities_resp_msg *cap_resp)
1506	{
1507	if (!cap_resp->is_accepted) {
1508	pr_err("Capabilities not accepted by host\n");
1509	dm->state = DM_INIT_ERROR;
1510	}
1511	complete(&dm->host_event);
1512	}
1513
1514	static void balloon_onchannelcallback(void *context)
1515	{
1516	struct hv_device *dev = context;
1517	u32 recvlen;
1518	u64 requestid;
1519	struct dm_message *dm_msg;
1520	struct dm_header *dm_hdr;
1521	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1522	struct dm_balloon *bal_msg;
1523	struct dm_hot_add *ha_msg;
1524	union dm_mem_page_range *ha_pg_range;
1525	union dm_mem_page_range *ha_region;
1526
1527	memset(recv_buffer, `0`, sizeof(recv_buffer));
1528	vmbus_recvpacket(channel: dev->channel, buffer: recv_buffer,
1529	HV_HYP_PAGE_SIZE, buffer_actual_len: &recvlen, requestid: &requestid);
1530
1531	if (recvlen > `0`) {
1532	dm_msg = (struct dm_message *)recv_buffer;
1533	dm_hdr = &dm_msg->hdr;
1534
1535	switch (dm_hdr->type) {
1536	case DM_VERSION_RESPONSE:
1537	version_resp(dm,
1538	vresp: (struct dm_version_response *)dm_msg);
1539	break;
1540
1541	case DM_CAPABILITIES_RESPONSE:
1542	cap_resp(dm,
1543	cap_resp: (struct dm_capabilities_resp_msg *)dm_msg);
1544	break;
1545
1546	case DM_BALLOON_REQUEST:
1547	if (allow_hibernation) {
1548	pr_info("Ignore balloon-up request!\n");
1549	break;
1550	}
1551
1552	if (dm->state == DM_BALLOON_UP)
1553	pr_warn("Currently ballooning\n");
1554	bal_msg = (struct dm_balloon *)recv_buffer;
1555	dm->state = DM_BALLOON_UP;
1556	dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
1557	schedule_work(work: &dm_device.balloon_wrk.wrk);
1558	break;
1559
1560	case DM_UNBALLOON_REQUEST:
1561	if (allow_hibernation) {
1562	pr_info("Ignore balloon-down request!\n");
1563	break;
1564	}
1565
1566	dm->state = DM_BALLOON_DOWN;
1567	balloon_down(dm,
1568	req: (struct dm_unballoon_request *)recv_buffer);
1569	break;
1570
1571	case DM_MEM_HOT_ADD_REQUEST:
1572	if (dm->state == DM_HOT_ADD)
1573	pr_warn("Currently hot-adding\n");
1574	dm->state = DM_HOT_ADD;
1575	ha_msg = (struct dm_hot_add *)recv_buffer;
1576	if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
1577	/*
1578	* This is a normal hot-add request specifying
1579	* hot-add memory.
1580	*/
1581	dm->host_specified_ha_region = false;
1582	ha_pg_range = &ha_msg->range;
1583	dm->ha_wrk.ha_page_range = *ha_pg_range;
1584	dm->ha_wrk.ha_region_range.page_range = `0`;
1585	} else {
1586	/*
1587	* Host is specifying that we first hot-add
1588	* a region and then partially populate this
1589	* region.
1590	*/
1591	dm->host_specified_ha_region = true;
1592	ha_pg_range = &ha_msg->range;
1593	ha_region = &ha_pg_range[`1`];
1594	dm->ha_wrk.ha_page_range = *ha_pg_range;
1595	dm->ha_wrk.ha_region_range = *ha_region;
1596	}
1597	schedule_work(work: &dm_device.ha_wrk.wrk);
1598	break;
1599
1600	case DM_INFO_MESSAGE:
1601	process_info(dm, msg: (struct dm_info_msg *)dm_msg);
1602	break;
1603
1604	default:
1605	pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
1606
1607	}
1608	}
1609
1610	}
1611
1612	#define HV_LARGE_REPORTING_ORDER 9
1613	#define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \
1614	HV_LARGE_REPORTING_ORDER)
1615	static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
1616	struct scatterlist sgl, unsigned* int nents)
1617	{
1618	unsigned long flags;
1619	struct hv_memory_hint *hint;
1620	int i, order;
1621	u64 status;
1622	struct scatterlist *sg;
1623
1624	WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
1625	WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order));
1626	local_irq_save(flags);
1627	hint = *this_cpu_ptr(hyperv_pcpu_input_arg);
1628	if (!hint) {
1629	local_irq_restore(flags);
1630	return -ENOSPC;
1631	}
1632
1633	hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
1634	hint->reserved = `0`;
1635	for_each_sg(sgl, sg, nents, i) {
1636	union hv_gpa_page_range *range;
1637
1638	range = &hint->ranges[i];
1639	range->address_space = `0`;
1640	order = get_order(size: sg->length);
1641	/*
1642	* Hyper-V expects the additional_pages field in the units
1643	* of one of these 3 sizes, 4Kbytes, 2Mbytes or 1Gbytes.
1644	* This is dictated by the values of the fields page.largesize
1645	* and page_size.
1646	* This code however, only uses 4Kbytes and 2Mbytes units
1647	* and not 1Gbytes unit.
1648	*/
1649
1650	/ page reporting for pages 2MB or higher /
1651	if (order >= HV_LARGE_REPORTING_ORDER ) {
1652	range->page.largepage = `1`;
1653	range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
1654	range->base_large_pfn = page_to_hvpfn(
1655	sg_page(sg)) >> HV_LARGE_REPORTING_ORDER;
1656	range->page.additional_pages =
1657	(sg->length / HV_LARGE_REPORTING_LEN) - `1`;
1658	} else {
1659	/ Page reporting for pages below 2MB /
1660	range->page.basepfn = page_to_hvpfn(sg_page(sg));
1661	range->page.largepage = false;
1662	range->page.additional_pages =
1663	(sg->length / HV_HYP_PAGE_SIZE) - `1`;
1664	}
1665
1666	}
1667
1668	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, rep_count: nents, varhead_size: `0`,
1669	input: hint, NULL);
1670	local_irq_restore(flags);
1671	if (!hv_result_success(status)) {
1672
1673	pr_err("Cold memory discard hypercall failed with status %llx\n",
1674	status);
1675	if (hv_hypercall_multi_failure > `0`)
1676	hv_hypercall_multi_failure++;
1677
1678	if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) {
1679	pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n");
1680	pr_err("Defaulting to page_reporting_order %d\n",
1681	pageblock_order);
1682	page_reporting_order = pageblock_order;
1683	hv_hypercall_multi_failure++;
1684	return -EINVAL;
1685	}
1686
1687	return -EINVAL;
1688	}
1689
1690	return `0`;
1691	}
1692
1693	static void enable_page_reporting(void)
1694	{
1695	int ret;
1696
1697	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
1698	pr_debug("Cold memory discard hint not supported by Hyper-V\n");
1699	return;
1700	}
1701
1702	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
1703	dm_device.pr_dev_info.report = hv_free_page_report;
1704	/*
1705	* We let the page_reporting_order parameter decide the order
1706	* in the page_reporting code
1707	*/
1708	dm_device.pr_dev_info.order = `0`;
1709	ret = page_reporting_register(prdev: &dm_device.pr_dev_info);
1710	if (ret < `0`) {
1711	dm_device.pr_dev_info.report = NULL;
1712	pr_err("Failed to enable cold memory discard: %d\n", ret);
1713	} else {
1714	pr_info("Cold memory discard hint enabled with order %d\n",
1715	page_reporting_order);
1716	}
1717	}
1718
1719	static void disable_page_reporting(void)
1720	{
1721	if (dm_device.pr_dev_info.report) {
1722	page_reporting_unregister(prdev: &dm_device.pr_dev_info);
1723	dm_device.pr_dev_info.report = NULL;
1724	}
1725	}
1726
1727	static int ballooning_enabled(void)
1728	{
1729	/*
1730	* Disable ballooning if the page size is not 4k (HV_HYP_PAGE_SIZE),
1731	* since currently it's unclear to us whether an unballoon request can
1732	* make sure all page ranges are guest page size aligned.
1733	*/
1734	if (PAGE_SIZE != HV_HYP_PAGE_SIZE) {
1735	pr_info("Ballooning disabled because page size is not 4096 bytes\n");
1736	return `0`;
1737	}
1738
1739	return `1`;
1740	}
1741
1742	static int hot_add_enabled(void)
1743	{
1744	/*
1745	* Disable hot add on ARM64, because we currently rely on
1746	* memory_add_physaddr_to_nid() to get a node id of a hot add range,
1747	* however ARM64's memory_add_physaddr_to_nid() always return 0 and
1748	* DM_MEM_HOT_ADD_REQUEST doesn't have the NUMA node information for
1749	* add_memory().
1750	*/
1751	if (IS_ENABLED(CONFIG_ARM64)) {
1752	pr_info("Memory hot add disabled on ARM64\n");
1753	return `0`;
1754	}
1755
1756	return `1`;
1757	}
1758
1759	static int balloon_connect_vsp(struct hv_device *dev)
1760	{
1761	struct dm_version_request version_req;
1762	struct dm_capabilities cap_msg;
1763	unsigned long t;
1764	int ret;
1765
1766	/*
1767	* max_pkt_size should be large enough for one vmbus packet header plus
1768	* our receive buffer size. Hyper-V sends messages up to
1769	* HV_HYP_PAGE_SIZE bytes long on balloon channel.
1770	*/
1771	dev->channel->max_pkt_size = HV_HYP_PAGE_SIZE * `2`;
1772
1773	ret = vmbus_open(channel: dev->channel, send_ringbuffersize: dm_ring_size, recv_ringbuffersize: dm_ring_size, NULL, userdatalen: `0`,
1774	onchannel_callback: balloon_onchannelcallback, context: dev);
1775	if (ret)
1776	return ret;
1777
1778	/*
1779	* Initiate the hand shake with the host and negotiate
1780	* a version that the host can support. We start with the
1781	* highest version number and go down if the host cannot
1782	* support it.
1783	*/
1784	memset(&version_req, `0`, sizeof(struct dm_version_request));
1785	version_req.hdr.type = DM_VERSION_REQUEST;
1786	version_req.hdr.size = sizeof(struct dm_version_request);
1787	version_req.hdr.trans_id = atomic_inc_return(v: &trans_id);
1788	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
1789	version_req.is_last_attempt = `0`;
1790	dm_device.version = version_req.version.version;
1791
1792	ret = vmbus_sendpacket(channel: dev->channel, buffer: &version_req,
1793	bufferLen: sizeof(struct dm_version_request),
1794	requestid: (unsigned long)NULL, type: VM_PKT_DATA_INBAND, flags: `0`);
1795	if (ret)
1796	goto out;
1797
1798	t = wait_for_completion_timeout(x: &dm_device.host_event, timeout: `5`*HZ);
1799	if (t == `0`) {
1800	ret = -ETIMEDOUT;
1801	goto out;
1802	}
1803
1804	/*
1805	* If we could not negotiate a compatible version with the host
1806	* fail the probe function.
1807	*/
1808	if (dm_device.state == DM_INIT_ERROR) {
1809	ret = -EPROTO;
1810	goto out;
1811	}
1812
1813	pr_info("Using Dynamic Memory protocol version %u.%u\n",
1814	DYNMEM_MAJOR_VERSION(dm_device.version),
1815	DYNMEM_MINOR_VERSION(dm_device.version));
1816
1817	/*
1818	* Now submit our capabilities to the host.
1819	*/
1820	memset(&cap_msg, `0`, sizeof(struct dm_capabilities));
1821	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
1822	cap_msg.hdr.size = sizeof(struct dm_capabilities);
1823	cap_msg.hdr.trans_id = atomic_inc_return(v: &trans_id);
1824
1825	/*
1826	* When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
1827	* currently still requires the bits to be set, so we have to add code
1828	* to fail the host's hot-add and balloon up/down requests, if any.
1829	*/
1830	cap_msg.caps.cap_bits.balloon = ballooning_enabled();
1831	cap_msg.caps.cap_bits.hot_add = hot_add_enabled();
1832
1833	/*
1834	* Specify our alignment requirements as it relates
1835	* memory hot-add. Specify 128MB alignment.
1836	*/
1837	cap_msg.caps.cap_bits.hot_add_alignment = `7`;
1838
1839	/*
1840	* Currently the host does not use these
1841	* values and we set them to what is done in the
1842	* Windows driver.
1843	*/
1844	cap_msg.min_page_cnt = `0`;
1845	cap_msg.max_page_number = -`1`;
1846
1847	ret = vmbus_sendpacket(channel: dev->channel, buffer: &cap_msg,
1848	bufferLen: sizeof(struct dm_capabilities),
1849	requestid: (unsigned long)NULL, type: VM_PKT_DATA_INBAND, flags: `0`);
1850	if (ret)
1851	goto out;
1852
1853	t = wait_for_completion_timeout(x: &dm_device.host_event, timeout: `5`*HZ);
1854	if (t == `0`) {
1855	ret = -ETIMEDOUT;
1856	goto out;
1857	}
1858
1859	/*
1860	* If the host does not like our capabilities,
1861	* fail the probe function.
1862	*/
1863	if (dm_device.state == DM_INIT_ERROR) {
1864	ret = -EPROTO;
1865	goto out;
1866	}
1867
1868	return `0`;
1869	out:
1870	vmbus_close(channel: dev->channel);
1871	return ret;
1872	}
1873
1874	/*
1875	* DEBUGFS Interface
1876	*/
1877	#ifdef CONFIG_DEBUG_FS
1878
1879	/**
1880	* hv_balloon_debug_show - shows statistics of balloon operations.
1881	* @f: pointer to the &struct seq_file.
1882	* @offset: ignored.
1883	*
1884	* Provides the statistics that can be accessed in hv-balloon in the debugfs.
1885	*
1886	* Return: zero on success or an error code.
1887	*/
1888	static int hv_balloon_debug_show(struct seq_file f, void* *offset)
1889	{
1890	struct hv_dynmem_device *dm = f->private;
1891	char *sname;
1892
1893	seq_printf(m: f, fmt: "%-22s: %u.%u\n", "host_version",
1894	DYNMEM_MAJOR_VERSION(dm->version),
1895	DYNMEM_MINOR_VERSION(dm->version));
1896
1897	seq_printf(m: f, fmt: "%-22s:", "capabilities");
1898	if (ballooning_enabled())
1899	seq_puts(m: f, s: " enabled");
1900
1901	if (hot_add_enabled())
1902	seq_puts(m: f, s: " hot_add");
1903
1904	seq_puts(m: f, s: "\n");
1905
1906	seq_printf(m: f, fmt: "%-22s: %u", "state", dm->state);
1907	switch (dm->state) {
1908	case DM_INITIALIZING:
1909	sname = "Initializing";
1910	break;
1911	case DM_INITIALIZED:
1912	sname = "Initialized";
1913	break;
1914	case DM_BALLOON_UP:
1915	sname = "Balloon Up";
1916	break;
1917	case DM_BALLOON_DOWN:
1918	sname = "Balloon Down";
1919	break;
1920	case DM_HOT_ADD:
1921	sname = "Hot Add";
1922	break;
1923	case DM_INIT_ERROR:
1924	sname = "Error";
1925	break;
1926	default:
1927	sname = "Unknown";
1928	}
1929	seq_printf(m: f, fmt: " (%s)\n", sname);
1930
1931	/ HV Page Size /
1932	seq_printf(m: f, fmt: "%-22s: %ld\n", "page_size", HV_HYP_PAGE_SIZE);
1933
1934	/ Pages added with hot_add /
1935	seq_printf(m: f, fmt: "%-22s: %u\n", "pages_added", dm->num_pages_added);
1936
1937	/ pages that are "onlined"/used from pages_added /
1938	seq_printf(m: f, fmt: "%-22s: %u\n", "pages_onlined", dm->num_pages_onlined);
1939
1940	/ pages we have given back to host /
1941	seq_printf(m: f, fmt: "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned);
1942
1943	seq_printf(m: f, fmt: "%-22s: %lu\n", "total_pages_committed",
1944	get_pages_committed(dm));
1945
1946	seq_printf(m: f, fmt: "%-22s: %llu\n", "max_dynamic_page_count",
1947	dm->max_dynamic_page_count);
1948
1949	return `0`;
1950	}
1951
1952	DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug);
1953
1954	static void hv_balloon_debugfs_init(struct hv_dynmem_device *b)
1955	{
1956	debugfs_create_file(name: "hv-balloon", mode: `0444`, NULL, data: b,
1957	fops: &hv_balloon_debug_fops);
1958	}
1959
1960	static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b)
1961	{
1962	debugfs_lookup_and_remove(name: "hv-balloon", NULL);
1963	}
1964
1965	#else
1966
/* Without CONFIG_DEBUG_FS there is no debugfs file to create. */
static inline void hv_balloon_debugfs_init(struct hv_dynmem_device *b)
{
}
1970
/* Without CONFIG_DEBUG_FS there is no debugfs file to remove. */
static inline void hv_balloon_debugfs_exit(struct hv_dynmem_device *b)
{
}
1974
1975	#endif /* CONFIG_DEBUG_FS */
1976
1977	static int balloon_probe(struct hv_device *dev,
1978	const struct hv_vmbus_device_id *dev_id)
1979	{
1980	int ret;
1981
1982	allow_hibernation = hv_is_hibernation_supported();
1983	if (allow_hibernation)
1984	hot_add = false;
1985
1986	#ifdef CONFIG_MEMORY_HOTPLUG
1987	do_hot_add = hot_add;
1988	#else
1989	do_hot_add = false;
1990	#endif
1991	dm_device.dev = dev;
1992	dm_device.state = DM_INITIALIZING;
1993	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
1994	init_completion(x: &dm_device.host_event);
1995	init_completion(x: &dm_device.config_event);
1996	INIT_LIST_HEAD(list: &dm_device.ha_region_list);
1997	spin_lock_init(&dm_device.ha_lock);
1998	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
1999	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
2000	dm_device.host_specified_ha_region = false;
2001
2002	#ifdef CONFIG_MEMORY_HOTPLUG
2003	set_online_page_callback(&hv_online_page);
2004	init_completion(x: &dm_device.ol_waitevent);
2005	register_memory_notifier(nb: &hv_memory_nb);
2006	#endif
2007
2008	hv_set_drvdata(dev, data: &dm_device);
2009
2010	ret = balloon_connect_vsp(dev);
2011	if (ret != `0`)
2012	goto connect_error;
2013
2014	enable_page_reporting();
2015	dm_device.state = DM_INITIALIZED;
2016
2017	dm_device.thread =
2018	kthread_run(dm_thread_func, &dm_device, "hv_balloon");
2019	if (IS_ERR(ptr: dm_device.thread)) {
2020	ret = PTR_ERR(ptr: dm_device.thread);
2021	goto probe_error;
2022	}
2023
2024	hv_balloon_debugfs_init(b: &dm_device);
2025
2026	return `0`;
2027
2028	probe_error:
2029	dm_device.state = DM_INIT_ERROR;
2030	dm_device.thread = NULL;
2031	disable_page_reporting();
2032	vmbus_close(channel: dev->channel);
2033	connect_error:
2034	#ifdef CONFIG_MEMORY_HOTPLUG
2035	unregister_memory_notifier(nb: &hv_memory_nb);
2036	restore_online_page_callback(callback: &hv_online_page);
2037	#endif
2038	return ret;
2039	}
2040
2041	static void balloon_remove(struct hv_device *dev)
2042	{
2043	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
2044	struct hv_hotadd_state has, tmp;
2045	struct hv_hotadd_gap gap, tmp_gap;
2046
2047	if (dm->num_pages_ballooned != `0`)
2048	pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
2049
2050	hv_balloon_debugfs_exit(b: dm);
2051
2052	cancel_work_sync(work: &dm->balloon_wrk.wrk);
2053	cancel_work_sync(work: &dm->ha_wrk.wrk);
2054
2055	kthread_stop(k: dm->thread);
2056
2057	/*
2058	* This is to handle the case when balloon_resume()
2059	* call has failed and some cleanup has been done as
2060	* a part of the error handling.
2061	*/
2062	if (dm_device.state != DM_INIT_ERROR) {
2063	disable_page_reporting();
2064	vmbus_close(channel: dev->channel);
2065	#ifdef CONFIG_MEMORY_HOTPLUG
2066	unregister_memory_notifier(nb: &hv_memory_nb);
2067	restore_online_page_callback(callback: &hv_online_page);
2068	#endif
2069	}
2070
2071	guard(spinlock_irqsave)(l: &dm_device.ha_lock);
2072	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
2073	list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
2074	list_del(entry: &gap->list);
2075	kfree(objp: gap);
2076	}
2077	list_del(entry: &has->list);
2078	kfree(objp: has);
2079	}
2080	}
2081
2082	static int balloon_suspend(struct hv_device *hv_dev)
2083	{
2084	struct hv_dynmem_device *dm = hv_get_drvdata(dev: hv_dev);
2085
2086	tasklet_disable(t: &hv_dev->channel->callback_event);
2087
2088	cancel_work_sync(work: &dm->balloon_wrk.wrk);
2089	cancel_work_sync(work: &dm->ha_wrk.wrk);
2090
2091	if (dm->thread) {
2092	kthread_stop(k: dm->thread);
2093	dm->thread = NULL;
2094	vmbus_close(channel: hv_dev->channel);
2095	}
2096
2097	tasklet_enable(t: &hv_dev->channel->callback_event);
2098
2099	return `0`;
2100
2101	}
2102
2103	static int balloon_resume(struct hv_device *dev)
2104	{
2105	int ret;
2106
2107	dm_device.state = DM_INITIALIZING;
2108
2109	ret = balloon_connect_vsp(dev);
2110
2111	if (ret != `0`)
2112	goto out;
2113
2114	dm_device.thread =
2115	kthread_run(dm_thread_func, &dm_device, "hv_balloon");
2116	if (IS_ERR(ptr: dm_device.thread)) {
2117	ret = PTR_ERR(ptr: dm_device.thread);
2118	dm_device.thread = NULL;
2119	goto close_channel;
2120	}
2121
2122	dm_device.state = DM_INITIALIZED;
2123	return `0`;
2124	close_channel:
2125	vmbus_close(channel: dev->channel);
2126	out:
2127	dm_device.state = DM_INIT_ERROR;
2128	disable_page_reporting();
2129	#ifdef CONFIG_MEMORY_HOTPLUG
2130	unregister_memory_notifier(nb: &hv_memory_nb);
2131	restore_online_page_callback(callback: &hv_online_page);
2132	#endif
2133	return ret;
2134	}
2135
2136	static const struct hv_vmbus_device_id id_table[] = {
2137	/ Dynamic Memory Class ID /
2138	/ 525074DC-8985-46e2-8057-A307DC18A502 /
2139	{ HV_DM_GUID, },
2140	{ },
2141	};
2142
2143	MODULE_DEVICE_TABLE(vmbus, id_table);
2144
2145	static struct hv_driver balloon_drv = {
2146	.name = "hv_balloon",
2147	.id_table = id_table,
2148	.probe = balloon_probe,
2149	.remove = balloon_remove,
2150	.suspend = balloon_suspend,
2151	.resume = balloon_resume,
2152	.driver = {
2153	.probe_type = PROBE_PREFER_ASYNCHRONOUS,
2154	},
2155	};
2156
2157	static int __init init_balloon_drv(void)
2158	{
2159
2160	return vmbus_driver_register(&balloon_drv);
2161	}
2162
2163	module_init(init_balloon_drv);
2164
2165	MODULE_DESCRIPTION("Hyper-V Balloon");
2166	MODULE_LICENSE("GPL");
2167

source code of linux/drivers/hv/hv_balloon.c