// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
#include <linux/filter.h>

#include <asm/sync_bitops.h>
#include <asm/mshyperv.h>

#include "hyperv_net.h"
#include "netvsc_trace.h"

/*
 * Switch the data path from the synthetic interface to the VF
 * interface.
 */
int netvsc_switch_datapath(struct net_device *ndev, bool vf)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct hv_device *dev = net_device_ctx->device_ctx;
	struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
	struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
	int ret, retry = 0;

	/* Block sending traffic to VF if it's about to be gone */
	if (!vf)
		net_device_ctx->data_path_is_vf = vf;

	memset(init_pkt, 0, sizeof(struct nvsp_message));
	init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
	if (vf)
		init_pkt->msg.v4_msg.active_dp.active_datapath =
			NVSP_DATAPATH_VF;
	else
		init_pkt->msg.v4_msg.active_dp.active_datapath =
			NVSP_DATAPATH_SYNTHETIC;

again:
	trace_nvsp_send(ndev, init_pkt);

	ret = vmbus_sendpacket(dev->channel, init_pkt,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_pkt, VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);

	/* If we failed to switch to/from the VF, let data_path_is_vf stay
	 * false, so we use the synthetic path to send data.
	 */
	if (ret) {
		if (ret != -EAGAIN) {
			netdev_err(ndev,
				   "Unable to send sw datapath msg, err: %d\n",
				   ret);
			return ret;
		}

		if (retry++ < RETRY_MAX) {
			usleep_range(RETRY_US_LO, RETRY_US_HI);
			goto again;
		} else {
			netdev_err(ndev,
				   "Retry failed to send sw datapath msg, err: %d\n",
				   ret);
			return ret;
		}
	}

	wait_for_completion(&nv_dev->channel_init_wait);
	net_device_ctx->data_path_is_vf = vf;

	return 0;
}

/* Worker to set up sub channels on initial setup.
 * The initial hotplug event occurs in softirq context
 * and can't wait for channels.
 */
static void netvsc_subchan_work(struct work_struct *w)
{
	struct netvsc_device *nvdev =
		container_of(w, struct netvsc_device, subchan_work);
	struct rndis_device *rdev;
	int i, ret;

	/* Avoid deadlock with device removal already under RTNL */
	if (!rtnl_trylock()) {
		schedule_work(w);
		return;
	}

	rdev = nvdev->extension;
	if (rdev) {
		ret = rndis_set_subchannel(rdev->ndev, nvdev, NULL);
		if (ret == 0) {
			netif_device_attach(rdev->ndev);
		} else {
			/* fallback to only primary channel */
			for (i = 1; i < nvdev->num_chn; i++)
				netif_napi_del(&nvdev->chan_table[i].napi);

			nvdev->max_chn = 1;
			nvdev->num_chn = 1;
		}
	}

	rtnl_unlock();
}

static struct netvsc_device *alloc_net_device(void)
{
	struct netvsc_device *net_device;

	net_device = kzalloc(sizeof(struct netvsc_device), GFP_KERNEL);
	if (!net_device)
		return NULL;

	init_waitqueue_head(&net_device->wait_drain);
	net_device->destroy = false;
	net_device->tx_disable = true;

	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;

	init_completion(&net_device->channel_init_wait);
	init_waitqueue_head(&net_device->subchan_open);
	INIT_WORK(&net_device->subchan_work, netvsc_subchan_work);

	return net_device;
}

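/* free_netvsc_device() is the RCU callback that does the actual freeing.
 * Readers access the netvsc_device under rcu_read_lock() (see
 * netvsc_send()), so free_netvsc_device_rcu() defers the free via
 * call_rcu() until all in-flight readers have finished.
 */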
static void free_netvsc_device(struct rcu_head *head)
{
	struct netvsc_device *nvdev
		= container_of(head, struct netvsc_device, rcu);
	int i;

	kfree(nvdev->extension);
	vfree(nvdev->recv_buf);
	vfree(nvdev->send_buf);
	bitmap_free(nvdev->send_section_map);

	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		xdp_rxq_info_unreg(&nvdev->chan_table[i].xdp_rxq);
		kfree(nvdev->chan_table[i].recv_buf);
		vfree(nvdev->chan_table[i].mrc.slots);
	}

	kfree(nvdev);
}

static void free_netvsc_device_rcu(struct netvsc_device *nvdev)
{
	call_rcu(&nvdev->rcu, free_netvsc_device);
}

static void netvsc_revoke_recv_buf(struct hv_device *device,
				   struct netvsc_device *net_device,
				   struct net_device *ndev)
{
	struct nvsp_message *revoke_packet;
	int ret;

	/*
	 * If we got a section count, it means we received a
	 * SendReceiveBufferComplete msg (i.e. we sent a
	 * NvspMessage1TypeSendReceiveBuffer msg); therefore, we need
	 * to send a revoke msg here
	 */
	if (net_device->recv_section_cnt) {
		/* Send the revoke receive buffer */
		revoke_packet = &net_device->revoke_packet;
		memset(revoke_packet, 0, sizeof(struct nvsp_message));

		revoke_packet->hdr.msg_type =
			NVSP_MSG1_TYPE_REVOKE_RECV_BUF;
		revoke_packet->msg.v1_msg.
			revoke_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;

		trace_nvsp_send(ndev, revoke_packet);

		ret = vmbus_sendpacket(device->channel,
				       revoke_packet,
				       sizeof(struct nvsp_message),
				       VMBUS_RQST_ID_NO_RESPONSE,
				       VM_PKT_DATA_INBAND, 0);
		/* If the failure is because the channel is rescinded,
		 * ignore the failure since we cannot send on a rescinded
		 * channel. This allows us to clean up properly
		 * even when the channel is rescinded.
		 */
		if (device->channel->rescind)
			ret = 0;
		/*
		 * If we failed here, we might as well return and
		 * have a leak rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev, "unable to send "
				   "revoke receive buffer to netvsp\n");
			return;
		}
		net_device->recv_section_cnt = 0;
	}
}

static void netvsc_revoke_send_buf(struct hv_device *device,
				   struct netvsc_device *net_device,
				   struct net_device *ndev)
{
	struct nvsp_message *revoke_packet;
	int ret;

	/* Deal with the send buffer we may have set up.
	 * If we got a send section size, it means we received a
	 * NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (i.e. we sent a
	 * NVSP_MSG1_TYPE_SEND_SEND_BUF msg); therefore, we need
	 * to send a revoke msg here
	 */
	if (net_device->send_section_cnt) {
		/* Send the revoke send buffer */
		revoke_packet = &net_device->revoke_packet;
		memset(revoke_packet, 0, sizeof(struct nvsp_message));

		revoke_packet->hdr.msg_type =
			NVSP_MSG1_TYPE_REVOKE_SEND_BUF;
		revoke_packet->msg.v1_msg.revoke_send_buf.id =
			NETVSC_SEND_BUFFER_ID;

		trace_nvsp_send(ndev, revoke_packet);

		ret = vmbus_sendpacket(device->channel,
				       revoke_packet,
				       sizeof(struct nvsp_message),
				       VMBUS_RQST_ID_NO_RESPONSE,
				       VM_PKT_DATA_INBAND, 0);

		/* If the failure is because the channel is rescinded,
		 * ignore the failure since we cannot send on a rescinded
		 * channel. This allows us to clean up properly
		 * even when the channel is rescinded.
		 */
		if (device->channel->rescind)
			ret = 0;

		/* If we failed here, we might as well return and
		 * have a leak rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev, "unable to send "
				   "revoke send buffer to netvsp\n");
			return;
		}
		net_device->send_section_cnt = 0;
	}
}

static void netvsc_teardown_recv_gpadl(struct hv_device *device,
				       struct netvsc_device *net_device,
				       struct net_device *ndev)
{
	int ret;

	if (net_device->recv_buf_gpadl_handle.gpadl_handle) {
		ret = vmbus_teardown_gpadl(device->channel,
					   &net_device->recv_buf_gpadl_handle);

		/* If we failed here, we might as well return and have a leak
		 * rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev,
				   "unable to teardown receive buffer's gpadl\n");
			return;
		}
	}
}

static void netvsc_teardown_send_gpadl(struct hv_device *device,
				       struct netvsc_device *net_device,
				       struct net_device *ndev)
{
	int ret;

	if (net_device->send_buf_gpadl_handle.gpadl_handle) {
		ret = vmbus_teardown_gpadl(device->channel,
					   &net_device->send_buf_gpadl_handle);

		/* If we failed here, we might as well return and have a leak
		 * rather than continue and a bugchk
		 */
		if (ret != 0) {
			netdev_err(ndev,
				   "unable to teardown send buffer's gpadl\n");
			return;
		}
	}
}

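/* Allocate the per-channel ring of pending receive completions.  It is
 * sized from recv_completion_cnt (receive section count plus one spare
 * slot, see netvsc_init_buf()); allocation is attempted NUMA-local to
 * the channel's target CPU first, falling back to any node.
 */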
int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx)
{
	struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
	int node = cpu_to_node(nvchan->channel->target_cpu);
	size_t size;

	size = net_device->recv_completion_cnt * sizeof(struct recv_comp_data);
	nvchan->mrc.slots = vzalloc_node(size, node);
	if (!nvchan->mrc.slots)
		nvchan->mrc.slots = vzalloc(size);

	return nvchan->mrc.slots ? 0 : -ENOMEM;
}

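/* Allocate the receive and send buffers, register them with the host as
 * GPADLs, and run the NVSP exchanges that tell the NetVSP about them.
 * Each request is sent with COMPLETION_REQUESTED and the caller blocks
 * on channel_init_wait until netvsc_send_completion() copies the host's
 * response back into channel_init_pkt.  Section sizes and counts
 * reported by the host are validated before use.
 */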
static int netvsc_init_buf(struct hv_device *device,
			   struct netvsc_device *net_device,
			   const struct netvsc_device_info *device_info)
{
	struct nvsp_1_message_send_receive_buffer_complete *resp;
	struct net_device *ndev = hv_get_drvdata(device);
	struct nvsp_message *init_packet;
	unsigned int buf_size;
	int i, ret = 0;

	/* Get receive buffer area. */
	buf_size = device_info->recv_sections * device_info->recv_section_size;
	buf_size = roundup(buf_size, PAGE_SIZE);

	/* Legacy hosts only allow smaller receive buffer */
	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
		buf_size = min_t(unsigned int, buf_size,
				 NETVSC_RECEIVE_BUFFER_SIZE_LEGACY);

	net_device->recv_buf = vzalloc(buf_size);
	if (!net_device->recv_buf) {
		netdev_err(ndev,
			   "unable to allocate receive buffer of size %u\n",
			   buf_size);
		ret = -ENOMEM;
		goto cleanup;
	}

	net_device->recv_buf_size = buf_size;

	/*
	 * Establish the gpadl handle for this buffer on this
	 * channel. Note: This call uses the vmbus connection rather
	 * than the channel to establish the gpadl handle.
	 */
	ret = vmbus_establish_gpadl(device->channel, net_device->recv_buf,
				    buf_size,
				    &net_device->recv_buf_gpadl_handle);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to establish receive buffer's gpadl\n");
		goto cleanup;
	}

	/* Notify the NetVsp of the gpadl handle */
	init_packet = &net_device->channel_init_pkt;
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_RECV_BUF;
	init_packet->msg.v1_msg.send_recv_buf.
		gpadl_handle = net_device->recv_buf_gpadl_handle.gpadl_handle;
	init_packet->msg.v1_msg.
		send_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;

	trace_nvsp_send(ndev, init_packet);

	/* Send the gpadl notification request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to send receive buffer's gpadl to netvsp\n");
		goto cleanup;
	}

	wait_for_completion(&net_device->channel_init_wait);

	/* Check the response */
	resp = &init_packet->msg.v1_msg.send_recv_buf_complete;
	if (resp->status != NVSP_STAT_SUCCESS) {
		netdev_err(ndev,
			   "Unable to complete receive buffer initialization with NetVsp - status %d\n",
			   resp->status);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Parse the response */
	netdev_dbg(ndev, "Receive sections: %u sub_allocs: size %u count: %u\n",
		   resp->num_sections, resp->sections[0].sub_alloc_size,
		   resp->sections[0].num_sub_allocs);

	/* There should only be one section for the entire receive buffer */
	if (resp->num_sections != 1 || resp->sections[0].offset != 0) {
		ret = -EINVAL;
		goto cleanup;
	}

	net_device->recv_section_size = resp->sections[0].sub_alloc_size;
	net_device->recv_section_cnt = resp->sections[0].num_sub_allocs;

	/* Ensure buffer will not overflow */
	if (net_device->recv_section_size < NETVSC_MTU_MIN ||
	    (u64)net_device->recv_section_size *
	    (u64)net_device->recv_section_cnt > (u64)buf_size) {
		netdev_err(ndev, "invalid recv_section_size %u\n",
			   net_device->recv_section_size);
		ret = -EINVAL;
		goto cleanup;
	}

	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		struct netvsc_channel *nvchan = &net_device->chan_table[i];

		nvchan->recv_buf = kzalloc(net_device->recv_section_size, GFP_KERNEL);
		if (nvchan->recv_buf == NULL) {
			ret = -ENOMEM;
			goto cleanup;
		}
	}

	/* Setup receive completion ring.
	 * Add 1 to the recv_section_cnt because at least one entry in a
	 * ring buffer has to be empty.
	 */
	net_device->recv_completion_cnt = net_device->recv_section_cnt + 1;
	ret = netvsc_alloc_recv_comp_ring(net_device, 0);
	if (ret)
		goto cleanup;

	/* Now setup the send buffer. */
	buf_size = device_info->send_sections * device_info->send_section_size;
	buf_size = round_up(buf_size, PAGE_SIZE);

	net_device->send_buf = vzalloc(buf_size);
	if (!net_device->send_buf) {
		netdev_err(ndev, "unable to allocate send buffer of size %u\n",
			   buf_size);
		ret = -ENOMEM;
		goto cleanup;
	}
	net_device->send_buf_size = buf_size;

	/* Establish the gpadl handle for this buffer on this
	 * channel. Note: This call uses the vmbus connection rather
	 * than the channel to establish the gpadl handle.
	 */
	ret = vmbus_establish_gpadl(device->channel, net_device->send_buf,
				    buf_size,
				    &net_device->send_buf_gpadl_handle);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to establish send buffer's gpadl\n");
		goto cleanup;
	}

	/* Notify the NetVsp of the gpadl handle */
	init_packet = &net_device->channel_init_pkt;
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_SEND_BUF;
	init_packet->msg.v1_msg.send_send_buf.gpadl_handle =
		net_device->send_buf_gpadl_handle.gpadl_handle;
	init_packet->msg.v1_msg.send_send_buf.id = NETVSC_SEND_BUFFER_ID;

	trace_nvsp_send(ndev, init_packet);

	/* Send the gpadl notification request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to send send buffer's gpadl to netvsp\n");
		goto cleanup;
	}

	wait_for_completion(&net_device->channel_init_wait);

	/* Check the response */
	if (init_packet->msg.v1_msg.
	    send_send_buf_complete.status != NVSP_STAT_SUCCESS) {
		netdev_err(ndev, "Unable to complete send buffer "
			   "initialization with NetVsp - status %d\n",
			   init_packet->msg.v1_msg.
			   send_send_buf_complete.status);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Parse the response */
	net_device->send_section_size = init_packet->msg.
		v1_msg.send_send_buf_complete.section_size;
	if (net_device->send_section_size < NETVSC_MTU_MIN) {
		netdev_err(ndev, "invalid send_section_size %u\n",
			   net_device->send_section_size);
		ret = -EINVAL;
		goto cleanup;
	}

	/* Section count is simply the size divided by the section size. */
	net_device->send_section_cnt = buf_size / net_device->send_section_size;

	netdev_dbg(ndev, "Send section size: %d, Section count:%d\n",
		   net_device->send_section_size, net_device->send_section_cnt);

	/* Setup state for managing the send buffer. */
	net_device->send_section_map = bitmap_zalloc(net_device->send_section_cnt,
						     GFP_KERNEL);
	if (!net_device->send_section_map) {
		ret = -ENOMEM;
		goto cleanup;
	}

	goto exit;

cleanup:
	netvsc_revoke_recv_buf(device, net_device, ndev);
	netvsc_revoke_send_buf(device, net_device, ndev);
	netvsc_teardown_recv_gpadl(device, net_device, ndev);
	netvsc_teardown_send_gpadl(device, net_device, ndev);

exit:
	return ret;
}

/* Negotiate NVSP protocol version */
static int negotiate_nvsp_ver(struct hv_device *device,
			      struct netvsc_device *net_device,
			      struct nvsp_message *init_packet,
			      u32 nvsp_ver)
{
	struct net_device *ndev = hv_get_drvdata(device);
	int ret;

	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG_TYPE_INIT;
	init_packet->msg.init_msg.init.min_protocol_ver = nvsp_ver;
	init_packet->msg.init_msg.init.max_protocol_ver = nvsp_ver;
	trace_nvsp_send(ndev, init_packet);

	/* Send the init request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       (unsigned long)init_packet,
			       VM_PKT_DATA_INBAND,
			       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);

	if (ret != 0)
		return ret;

	wait_for_completion(&net_device->channel_init_wait);

	if (init_packet->msg.init_msg.init_complete.status !=
	    NVSP_STAT_SUCCESS)
		return -EINVAL;

	if (nvsp_ver == NVSP_PROTOCOL_VERSION_1)
		return 0;

	/* NVSPv2 or later: Send NDIS config */
	memset(init_packet, 0, sizeof(struct nvsp_message));
	init_packet->hdr.msg_type = NVSP_MSG2_TYPE_SEND_NDIS_CONFIG;
	init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu + ETH_HLEN;
	init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;

	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
		if (hv_is_isolation_supported())
			netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n");
		else
			init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;

		/* Teaming bit is needed to receive link speed updates */
		init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
	}

	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61)
		init_packet->msg.v2_msg.send_ndis_config.capability.rsc = 1;

	trace_nvsp_send(ndev, init_packet);

	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       VMBUS_RQST_ID_NO_RESPONSE,
			       VM_PKT_DATA_INBAND, 0);

	return ret;
}

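/* Bring up the NVSP connection: negotiate the protocol version (newest
 * first), report the NDIS version in use, then set up the receive and
 * send buffers.  Hosts supporting isolation require at least
 * NVSP_PROTOCOL_VERSION_61.
 */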
static int netvsc_connect_vsp(struct hv_device *device,
			      struct netvsc_device *net_device,
			      const struct netvsc_device_info *device_info)
{
	struct net_device *ndev = hv_get_drvdata(device);
	static const u32 ver_list[] = {
		NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
		NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
		NVSP_PROTOCOL_VERSION_6, NVSP_PROTOCOL_VERSION_61
	};
	struct nvsp_message *init_packet;
	int ndis_version, i, ret;

	init_packet = &net_device->channel_init_pkt;

	/* Negotiate the latest NVSP protocol supported */
	for (i = ARRAY_SIZE(ver_list) - 1; i >= 0; i--)
		if (negotiate_nvsp_ver(device, net_device, init_packet,
				       ver_list[i]) == 0) {
			net_device->nvsp_version = ver_list[i];
			break;
		}

	if (i < 0) {
		ret = -EPROTO;
		goto cleanup;
	}

	if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
		netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
			   net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
		ret = -EPROTO;
		goto cleanup;
	}

	pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);

	/* Send the ndis version */
	memset(init_packet, 0, sizeof(struct nvsp_message));

	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_4)
		ndis_version = 0x00060001;
	else
		ndis_version = 0x0006001e;

	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_NDIS_VER;
	init_packet->msg.v1_msg.
		send_ndis_ver.ndis_major_ver =
			(ndis_version & 0xFFFF0000) >> 16;
	init_packet->msg.v1_msg.
		send_ndis_ver.ndis_minor_ver =
			ndis_version & 0xFFFF;

	trace_nvsp_send(ndev, init_packet);

	/* Send the init request */
	ret = vmbus_sendpacket(device->channel, init_packet,
			       sizeof(struct nvsp_message),
			       VMBUS_RQST_ID_NO_RESPONSE,
			       VM_PKT_DATA_INBAND, 0);
	if (ret != 0)
		goto cleanup;

	ret = netvsc_init_buf(device, net_device, device_info);

cleanup:
	return ret;
}

/*
 * netvsc_device_remove - Callback when the root bus device is removed
 */
void netvsc_device_remove(struct hv_device *device)
{
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct netvsc_device *net_device
		= rtnl_dereference(net_device_ctx->nvdev);
	int i;

	/*
	 * Revoke receive buffer. If host is pre-Win2016 then tear down
	 * receive buffer GPADL. Do the same for send buffer.
	 */
	netvsc_revoke_recv_buf(device, net_device, ndev);
	if (vmbus_proto_version < VERSION_WIN10)
		netvsc_teardown_recv_gpadl(device, net_device, ndev);

	netvsc_revoke_send_buf(device, net_device, ndev);
	if (vmbus_proto_version < VERSION_WIN10)
		netvsc_teardown_send_gpadl(device, net_device, ndev);

	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);

	/* Disable NAPI and disassociate its context from the device. */
	for (i = 0; i < net_device->num_chn; i++) {
		/* See also vmbus_reset_channel_cb(). */
		napi_disable(&net_device->chan_table[i].napi);
		netif_napi_del(&net_device->chan_table[i].napi);
	}

	/*
	 * At this point, no one should be accessing net_device
	 * except in here
	 */
	netdev_dbg(ndev, "net device safe to remove\n");

	/* Now, we can close the channel safely */
	vmbus_close(device->channel);

	/*
	 * If host is Win2016 or higher then we do the GPADL tear down
	 * here after VMBus is closed.
	 */
	if (vmbus_proto_version >= VERSION_WIN10) {
		netvsc_teardown_recv_gpadl(device, net_device, ndev);
		netvsc_teardown_send_gpadl(device, net_device, ndev);
	}

	/* Release all resources */
	free_netvsc_device_rcu(net_device);
}

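/* Transmit flow control: netvsc_send_pkt() stops a queue when less than
 * RING_AVAIL_PERCENT_LOWATER percent of the ring is writable, and
 * netvsc_send_tx_complete() wakes it once the ring drains back above
 * RING_AVAIL_PERCENT_HIWATER.  The gap between the two watermarks gives
 * hysteresis so the queue doesn't bounce on and off.
 */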
#define RING_AVAIL_PERCENT_HIWATER 20
#define RING_AVAIL_PERCENT_LOWATER 10

static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
					 u32 index)
{
	sync_change_bit(index, net_device->send_section_map);
}

static void netvsc_send_tx_complete(struct net_device *ndev,
				    struct netvsc_device *net_device,
				    struct vmbus_channel *channel,
				    const struct vmpacket_descriptor *desc,
				    int budget)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct sk_buff *skb;
	u16 q_idx = 0;
	int queue_sends;
	u64 cmd_rqst;

	cmd_rqst = channel->request_addr_callback(channel, desc->trans_id);
	if (cmd_rqst == VMBUS_RQST_ERROR) {
		netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
		return;
	}

	skb = (struct sk_buff *)(unsigned long)cmd_rqst;

	/* Notify the layer above us */
	if (likely(skb)) {
		struct hv_netvsc_packet *packet
			= (struct hv_netvsc_packet *)skb->cb;
		u32 send_index = packet->send_buf_index;
		struct netvsc_stats_tx *tx_stats;

		if (send_index != NETVSC_INVALID_INDEX)
			netvsc_free_send_slot(net_device, send_index);
		q_idx = packet->q_idx;

		tx_stats = &net_device->chan_table[q_idx].tx_stats;

		u64_stats_update_begin(&tx_stats->syncp);
		tx_stats->packets += packet->total_packets;
		tx_stats->bytes += packet->total_bytes;
		u64_stats_update_end(&tx_stats->syncp);

		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
		napi_consume_skb(skb, budget);
	}

	queue_sends =
		atomic_dec_return(&net_device->chan_table[q_idx].queue_sends);

	if (unlikely(net_device->destroy)) {
		if (queue_sends == 0)
			wake_up(&net_device->wait_drain);
	} else {
		struct netdev_queue *txq = netdev_get_tx_queue(ndev, q_idx);

		if (netif_tx_queue_stopped(txq) && !net_device->tx_disable &&
		    (hv_get_avail_to_write_percent(&channel->outbound) >
		     RING_AVAIL_PERCENT_HIWATER || queue_sends < 1)) {
			netif_tx_wake_queue(txq);
			ndev_ctx->eth_stats.wake_queue++;
		}
	}
}

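/* Handle a VM_PKT_COMP packet from the host.  A completion without a
 * data payload is matched back to its request via the request id (only
 * the switch-data-path message is expected that way).  RNDIS packet
 * completions are routed to netvsc_send_tx_complete(); every other
 * response is length-checked, copied into channel_init_pkt, and the
 * waiter on channel_init_wait is woken.
 */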
static void netvsc_send_completion(struct net_device *ndev,
				   struct netvsc_device *net_device,
				   struct vmbus_channel *incoming_channel,
				   const struct vmpacket_descriptor *desc,
				   int budget)
{
	const struct nvsp_message *nvsp_packet;
	u32 msglen = hv_pkt_datalen(desc);
	struct nvsp_message *pkt_rqst;
	u64 cmd_rqst;
	u32 status;

	/* First check if this is a VMBUS completion without data payload */
	if (!msglen) {
		cmd_rqst = incoming_channel->request_addr_callback(incoming_channel,
								   desc->trans_id);
		if (cmd_rqst == VMBUS_RQST_ERROR) {
			netdev_err(ndev, "Invalid transaction ID %llx\n", desc->trans_id);
			return;
		}

		pkt_rqst = (struct nvsp_message *)(uintptr_t)cmd_rqst;
		switch (pkt_rqst->hdr.msg_type) {
		case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
			complete(&net_device->channel_init_wait);
			break;

		default:
			netdev_err(ndev, "Unexpected VMBUS completion!!\n");
		}
		return;
	}

	/* Ensure packet is big enough to read header fields */
	if (msglen < sizeof(struct nvsp_message_header)) {
		netdev_err(ndev, "nvsp_message length too small: %u\n", msglen);
		return;
	}

	nvsp_packet = hv_pkt_data(desc);
	switch (nvsp_packet->hdr.msg_type) {
	case NVSP_MSG_TYPE_INIT_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_message_init_complete)) {
			netdev_err(ndev, "nvsp_msg length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_receive_buffer_complete)) {
			netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_send_buffer_complete)) {
			netdev_err(ndev, "nvsp_msg1 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG5_TYPE_SUBCHANNEL:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_5_subchannel_complete)) {
			netdev_err(ndev, "nvsp_msg5 length too small: %u\n",
				   msglen);
			return;
		}
		break;

	case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE:
		if (msglen < sizeof(struct nvsp_message_header) +
		    sizeof(struct nvsp_1_message_send_rndis_packet_complete)) {
			if (net_ratelimit())
				netdev_err(ndev, "nvsp_rndis_pkt_complete length too small: %u\n",
					   msglen);
			return;
		}

		/* If status indicates an error, output a message so we know
		 * there's a problem. But process the completion anyway so the
		 * resources are released.
		 */
		status = nvsp_packet->msg.v1_msg.send_rndis_pkt_complete.status;
		if (status != NVSP_STAT_SUCCESS && net_ratelimit())
			netdev_err(ndev, "nvsp_rndis_pkt_complete error status: %x\n",
				   status);

		netvsc_send_tx_complete(ndev, net_device, incoming_channel,
					desc, budget);
		return;

	default:
		netdev_err(ndev,
			   "Unknown send completion type %d received!!\n",
			   nvsp_packet->hdr.msg_type);
		return;
	}

	/* Copy the response back */
	memcpy(&net_device->channel_init_pkt, nvsp_packet,
	       sizeof(struct nvsp_message));
	complete(&net_device->channel_init_wait);
}

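/* Claim a free section of the pre-registered send buffer.  The bitmap
 * is also updated from the completion path, so sync_test_and_set_bit()
 * is used: a bit that looked clear may already be taken by the time we
 * try to set it, in which case the scan continues.  Sections are
 * released in netvsc_free_send_slot().
 */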
static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
{
	unsigned long *map_addr = net_device->send_section_map;
	unsigned int i;

	for_each_clear_bit(i, map_addr, net_device->send_section_cnt) {
		if (sync_test_and_set_bit(i, map_addr) == 0)
			return i;
	}

	return NETVSC_INVALID_INDEX;
}

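/* Copy a packet (or, for cp_partial, just its RNDIS header pages) into
 * the chosen send-buffer section at offset pend_size.  When the stack
 * has more packets coming (xmit_more), the data is zero-padded up to
 * pkt_align so the next packet in the batch starts aligned.
 */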
static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
				    unsigned int section_index,
				    u32 pend_size,
				    struct hv_netvsc_packet *packet,
				    struct rndis_message *rndis_msg,
				    struct hv_page_buffer *pb,
				    bool xmit_more)
{
	char *start = net_device->send_buf;
	char *dest = start + (section_index * net_device->send_section_size)
		     + pend_size;
	int i;
	u32 padding = 0;
	u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
		packet->page_buf_cnt;
	u32 remain;

	/* Add padding */
	remain = packet->total_data_buflen & (net_device->pkt_align - 1);
	if (xmit_more && remain) {
		padding = net_device->pkt_align - remain;
		rndis_msg->msg_len += padding;
		packet->total_data_buflen += padding;
	}

	for (i = 0; i < page_count; i++) {
		char *src = phys_to_virt(pb[i].pfn << HV_HYP_PAGE_SHIFT);
		u32 offset = pb[i].offset;
		u32 len = pb[i].len;

		memcpy(dest, (src + offset), len);
		dest += len;
	}

	if (padding)
		memset(dest, 0, padding);
}

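/* Undo netvsc_dma_map(): unmap every bounce-buffered range of the
 * packet and free the dma_range array.  A no-op outside isolation VMs
 * or when no mapping was set up for this packet.
 */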
void netvsc_dma_unmap(struct hv_device *hv_dev,
		      struct hv_netvsc_packet *packet)
{
	int i;

	if (!hv_is_isolation_supported())
		return;

	if (!packet->dma_range)
		return;

	for (i = 0; i < packet->page_buf_cnt; i++)
		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
				 packet->dma_range[i].mapping_size,
				 DMA_TO_DEVICE);

	kfree(packet->dma_range);
}

/* netvsc_dma_map - Map the swiotlb bounce buffer with the data pages of
 * a packet sent by vmbus_sendpacket_pagebuffer() in an Isolation VM.
 *
 * In an isolation VM, the netvsc send buffer has been marked visible to
 * the host, so data copied into the send buffer doesn't need to use a
 * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
 * may not have been copied to the send buffer, so those pages need to be
 * mapped with the swiotlb bounce buffer; netvsc_dma_map() does that.
 * The pfns in the struct hv_page_buffer need to be converted to the
 * bounce buffer's pfns. The loop here is necessary because the entries
 * in the page buffer array are not necessarily full pages of data.
 * Each entry in the array has a separate offset and len that may be
 * non-zero, even for entries in the middle of the array. And the
 * entries are not physically contiguous. So each entry must be
 * individually mapped rather than as a contiguous unit, which is why
 * dma_map_sg() is not used here.
 */
static int netvsc_dma_map(struct hv_device *hv_dev,
			  struct hv_netvsc_packet *packet,
			  struct hv_page_buffer *pb)
{
	u32 page_count = packet->page_buf_cnt;
	dma_addr_t dma;
	int i;

	if (!hv_is_isolation_supported())
		return 0;

	packet->dma_range = kcalloc(page_count,
				    sizeof(*packet->dma_range),
				    GFP_ATOMIC);
	if (!packet->dma_range)
		return -ENOMEM;

	for (i = 0; i < page_count; i++) {
		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
					 + pb[i].offset);
		u32 len = pb[i].len;

		dma = dma_map_single(&hv_dev->device, src, len,
				     DMA_TO_DEVICE);
		if (dma_mapping_error(&hv_dev->device, dma)) {
			kfree(packet->dma_range);
			return -ENOMEM;
		}

		/* pb[].offset and pb[].len are not changed during dma mapping
		 * and so are not reassigned.
		 */
		packet->dma_range[i].dma = dma;
		packet->dma_range[i].mapping_size = len;
		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
	}

	return 0;
}

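/* Transmit one NVSP_MSG1_TYPE_SEND_RNDIS_PKT message.  The skb pointer
 * doubles as the VMBus request id so netvsc_send_tx_complete() can
 * recover it from the completion; a NULL skb marks an RNDIS control
 * message.  Pages not already copied into the send buffer are passed by
 * reference via vmbus_sendpacket_pagebuffer().
 */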
static inline int netvsc_send_pkt(
	struct hv_device *device,
	struct hv_netvsc_packet *packet,
	struct netvsc_device *net_device,
	struct hv_page_buffer *pb,
	struct sk_buff *skb)
{
	struct nvsp_message nvmsg;
	struct nvsp_1_message_send_rndis_packet *rpkt =
		&nvmsg.msg.v1_msg.send_rndis_pkt;
	struct netvsc_channel * const nvchan =
		&net_device->chan_table[packet->q_idx];
	struct vmbus_channel *out_channel = nvchan->channel;
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netdev_queue *txq = netdev_get_tx_queue(ndev, packet->q_idx);
	u64 req_id;
	int ret;
	u32 ring_avail = hv_get_avail_to_write_percent(&out_channel->outbound);

	memset(&nvmsg, 0, sizeof(struct nvsp_message));
	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
	if (skb)
		rpkt->channel_type = 0;		/* 0 is RMC_DATA */
	else
		rpkt->channel_type = 1;		/* 1 is RMC_CONTROL */

	rpkt->send_buf_section_index = packet->send_buf_index;
	if (packet->send_buf_index == NETVSC_INVALID_INDEX)
		rpkt->send_buf_section_size = 0;
	else
		rpkt->send_buf_section_size = packet->total_data_buflen;

	req_id = (ulong)skb;

	if (out_channel->rescind)
		return -ENODEV;

	trace_nvsp_send_pkt(ndev, out_channel, rpkt);

	packet->dma_range = NULL;
	if (packet->page_buf_cnt) {
		if (packet->cp_partial)
			pb += packet->rmsg_pgcnt;

		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
		if (ret) {
			ret = -EAGAIN;
			goto exit;
		}

		ret = vmbus_sendpacket_pagebuffer(out_channel,
						  pb, packet->page_buf_cnt,
						  &nvmsg, sizeof(nvmsg),
						  req_id);

		if (ret)
			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
	} else {
		ret = vmbus_sendpacket(out_channel,
				       &nvmsg, sizeof(nvmsg),
				       req_id, VM_PKT_DATA_INBAND,
				       VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
	}

exit:
	if (ret == 0) {
		atomic_inc_return(&nvchan->queue_sends);

		if (ring_avail < RING_AVAIL_PERCENT_LOWATER) {
			netif_tx_stop_queue(txq);
			ndev_ctx->eth_stats.stop_queue++;
		}
	} else if (ret == -EAGAIN) {
		netif_tx_stop_queue(txq);
		ndev_ctx->eth_stats.stop_queue++;
	} else {
		netdev_err(ndev,
			   "Unable to send packet pages %u len %u, ret %d\n",
			   packet->page_buf_cnt, packet->total_data_buflen,
			   ret);
	}

	if (netif_tx_queue_stopped(txq) &&
	    atomic_read(&nvchan->queue_sends) < 1 &&
	    !net_device->tx_disable) {
		netif_tx_wake_queue(txq);
		ndev_ctx->eth_stats.wake_queue++;
		if (ret == -EAGAIN)
			ret = -ENOSPC;
	}

	return ret;
}

/* Move packet out of multi send data (msd), and clear msd */
static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
				struct sk_buff **msd_skb,
				struct multi_send_data *msdp)
{
	*msd_skb = msdp->skb;
	*msd_send = msdp->pkt;
	msdp->skb = NULL;
	msdp->pkt = NULL;
	msdp->count = 0;
}

/* RCU already held by caller */
/* Batching/bouncing logic is designed to attempt to optimize
 * performance.
 *
 * For small, non-LSO packets we copy the packet to a send buffer
 * which is pre-registered with the Hyper-V side. This enables the
 * hypervisor to avoid remapping the aperture to access the packet
 * descriptor and data.
 *
 * If we already started using a buffer and the netdev is transmitting
 * a burst of packets, keep on copying into the buffer until it is
 * full or we are done collecting a burst. If there is an existing
 * buffer with space for the RNDIS descriptor but not the packet, copy
 * the RNDIS descriptor to the buffer, keeping the packet in place.
 *
 * If we do batching and send more than one packet using a single
 * NetVSC message, free the SKBs of the packets copied, except for the
 * last packet. This is done to streamline the handling of the case
 * where the last packet only had the RNDIS descriptor copied to the
 * send buffer, with the data pointers included in the NetVSC message.
 */
int netvsc_send(struct net_device *ndev,
		struct hv_netvsc_packet *packet,
		struct rndis_message *rndis_msg,
		struct hv_page_buffer *pb,
		struct sk_buff *skb,
		bool xdp_tx)
{
	struct net_device_context *ndev_ctx = netdev_priv(ndev);
	struct netvsc_device *net_device
		= rcu_dereference_bh(ndev_ctx->nvdev);
	struct hv_device *device = ndev_ctx->device_ctx;
	int ret = 0;
	struct netvsc_channel *nvchan;
	u32 pktlen = packet->total_data_buflen, msd_len = 0;
	unsigned int section_index = NETVSC_INVALID_INDEX;
	struct multi_send_data *msdp;
	struct hv_netvsc_packet *msd_send = NULL, *cur_send = NULL;
	struct sk_buff *msd_skb = NULL;
	bool try_batch, xmit_more;

	/* If device is rescinded, return error and packet will get dropped. */
	if (unlikely(!net_device || net_device->destroy))
		return -ENODEV;

	nvchan = &net_device->chan_table[packet->q_idx];
	packet->send_buf_index = NETVSC_INVALID_INDEX;
	packet->cp_partial = false;

	/* Send a control message or XDP packet directly without accessing
	 * msd (Multi-Send Data) field which may be changed during data packet
	 * processing.
	 */
	if (!skb || xdp_tx)
		return netvsc_send_pkt(device, packet, net_device, pb, skb);

	/* batch packets in send buffer if possible */
	msdp = &nvchan->msd;
	if (msdp->pkt)
		msd_len = msdp->pkt->total_data_buflen;

	try_batch = msd_len > 0 && msdp->count < net_device->max_pkt;
	if (try_batch && msd_len + pktlen + net_device->pkt_align <
	    net_device->send_section_size) {
		section_index = msdp->pkt->send_buf_index;

	} else if (try_batch && msd_len + packet->rmsg_size <
		   net_device->send_section_size) {
		section_index = msdp->pkt->send_buf_index;
		packet->cp_partial = true;

	} else if (pktlen + net_device->pkt_align <
		   net_device->send_section_size) {
		section_index = netvsc_get_next_send_section(net_device);
		if (unlikely(section_index == NETVSC_INVALID_INDEX)) {
			++ndev_ctx->eth_stats.tx_send_full;
		} else {
			move_pkt_msd(&msd_send, &msd_skb, msdp);
			msd_len = 0;
		}
	}

	/* Keep aggregating only if stack says more data is coming
	 * and not doing mixed modes send and not flow blocked
	 */
	xmit_more = netdev_xmit_more() &&
		!packet->cp_partial &&
		!netif_xmit_stopped(netdev_get_tx_queue(ndev, packet->q_idx));

	if (section_index != NETVSC_INVALID_INDEX) {
		netvsc_copy_to_send_buf(net_device,
					section_index, msd_len,
					packet, rndis_msg, pb, xmit_more);

		packet->send_buf_index = section_index;

		if (packet->cp_partial) {
			packet->page_buf_cnt -= packet->rmsg_pgcnt;
			packet->total_data_buflen = msd_len + packet->rmsg_size;
		} else {
			packet->page_buf_cnt = 0;
			packet->total_data_buflen += msd_len;
		}

		if (msdp->pkt) {
			packet->total_packets += msdp->pkt->total_packets;
			packet->total_bytes += msdp->pkt->total_bytes;
		}

		if (msdp->skb)
			dev_consume_skb_any(msdp->skb);

		if (xmit_more) {
			msdp->skb = skb;
			msdp->pkt = packet;
			msdp->count++;
		} else {
			cur_send = packet;
			msdp->skb = NULL;
			msdp->pkt = NULL;
			msdp->count = 0;
		}
	} else {
		move_pkt_msd(&msd_send, &msd_skb, msdp);
		cur_send = packet;
	}

	if (msd_send) {
		int m_ret = netvsc_send_pkt(device, msd_send, net_device,
					    NULL, msd_skb);

		if (m_ret != 0) {
			netvsc_free_send_slot(net_device,
					      msd_send->send_buf_index);
			dev_kfree_skb_any(msd_skb);
		}
	}

	if (cur_send)
		ret = netvsc_send_pkt(device, cur_send, net_device, pb, skb);

	if (ret != 0 && section_index != NETVSC_INVALID_INDEX)
		netvsc_free_send_slot(net_device, section_index);

	return ret;
}

/* Send pending recv completions */
static int send_recv_completions(struct net_device *ndev,
				 struct netvsc_device *nvdev,
				 struct netvsc_channel *nvchan)
{
	struct multi_recv_comp *mrc = &nvchan->mrc;
	struct recv_comp_msg {
		struct nvsp_message_header hdr;
		u32 status;
	} __packed;
	struct recv_comp_msg msg = {
		.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE,
	};
	int ret;

	while (mrc->first != mrc->next) {
		const struct recv_comp_data *rcd
			= mrc->slots + mrc->first;

		msg.status = rcd->status;
		ret = vmbus_sendpacket(nvchan->channel, &msg, sizeof(msg),
				       rcd->tid, VM_PKT_COMP, 0);
		if (unlikely(ret)) {
			struct net_device_context *ndev_ctx = netdev_priv(ndev);

			++ndev_ctx->eth_stats.rx_comp_busy;
			return ret;
		}

		if (++mrc->first == nvdev->recv_completion_cnt)
			mrc->first = 0;
	}

	/* receive completion ring has been emptied */
	if (unlikely(nvdev->destroy))
		wake_up(&nvdev->wait_drain);

	return 0;
}

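/* Example of the circular-buffer arithmetic below: with
 * recv_completion_cnt == 8, first == 6 and next == 2 the ring has
 * wrapped, so filled = (8 - 6) + 2 = 4 and avail = 8 - 4 - 1 = 3;
 * one slot always stays empty to distinguish full from empty.
 */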
/* Count how many receive completions are outstanding */
static void recv_comp_slot_avail(const struct netvsc_device *nvdev,
				 const struct multi_recv_comp *mrc,
				 u32 *filled, u32 *avail)
{
	u32 count = nvdev->recv_completion_cnt;

	if (mrc->next >= mrc->first)
		*filled = mrc->next - mrc->first;
	else
		*filled = (count - mrc->first) + mrc->next;

	*avail = count - *filled - 1;
}

/* Add receive complete to ring to send to host. */
static void enq_receive_complete(struct net_device *ndev,
				 struct netvsc_device *nvdev, u16 q_idx,
				 u64 tid, u32 status)
{
	struct netvsc_channel *nvchan = &nvdev->chan_table[q_idx];
	struct multi_recv_comp *mrc = &nvchan->mrc;
	struct recv_comp_data *rcd;
	u32 filled, avail;

	recv_comp_slot_avail(nvdev, mrc, &filled, &avail);

	if (unlikely(filled > NAPI_POLL_WEIGHT)) {
		send_recv_completions(ndev, nvdev, nvchan);
		recv_comp_slot_avail(nvdev, mrc, &filled, &avail);
	}

	if (unlikely(!avail)) {
		netdev_err(ndev, "Recv_comp full buf q:%hd, tid:%llx\n",
			   q_idx, tid);
		return;
	}

	rcd = mrc->slots + mrc->next;
	rcd->tid = tid;
	rcd->status = status;

	if (++mrc->next == nvdev->recv_completion_cnt)
		mrc->next = 0;
}

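/* Process one VM_PKT_DATA_USING_XFER_PAGES packet.  Each transfer-page
 * range is an RNDIS message placed by the host inside the receive
 * buffer; since the offsets and lengths come from the host, they are
 * bounds checked against recv_buf_size and recv_section_size before
 * being handed to rndis_filter_receive().  Returns the number of ranges
 * processed, which is charged against the NAPI budget.
 */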
static int netvsc_receive(struct net_device *ndev,
			  struct netvsc_device *net_device,
			  struct netvsc_channel *nvchan,
			  const struct vmpacket_descriptor *desc)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	struct vmbus_channel *channel = nvchan->channel;
	const struct vmtransfer_page_packet_header *vmxferpage_packet
		= container_of(desc, const struct vmtransfer_page_packet_header, d);
	const struct nvsp_message *nvsp = hv_pkt_data(desc);
	u32 msglen = hv_pkt_datalen(desc);
	u16 q_idx = channel->offermsg.offer.sub_channel_index;
	char *recv_buf = net_device->recv_buf;
	u32 status = NVSP_STAT_SUCCESS;
	int i;
	int count = 0;

	/* Ensure packet is big enough to read header fields */
	if (msglen < sizeof(struct nvsp_message_header)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "invalid nvsp header, length too small: %u\n",
			  msglen);
		return 0;
	}

	/* Make sure this is a valid nvsp packet */
	if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Unknown nvsp packet type received %u\n",
			  nvsp->hdr.msg_type);
		return 0;
	}

	/* Validate xfer page pkt header */
	if ((desc->offset8 << 3) < sizeof(struct vmtransfer_page_packet_header)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Invalid xfer page pkt, offset too small: %u\n",
			  desc->offset8 << 3);
		return 0;
	}

	if (unlikely(vmxferpage_packet->xfer_pageset_id != NETVSC_RECEIVE_BUFFER_ID)) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Invalid xfer page set id - expecting %x got %x\n",
			  NETVSC_RECEIVE_BUFFER_ID,
			  vmxferpage_packet->xfer_pageset_id);
		return 0;
	}

	count = vmxferpage_packet->range_cnt;

	/* Check count for a valid value */
	if (NETVSC_XFER_HEADER_SIZE(count) > desc->offset8 << 3) {
		netif_err(net_device_ctx, rx_err, ndev,
			  "Range count is not valid: %d\n",
			  count);
		return 0;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 ethernet frame */
	for (i = 0; i < count; i++) {
		u32 offset = vmxferpage_packet->ranges[i].byte_offset;
		u32 buflen = vmxferpage_packet->ranges[i].byte_count;
		void *data;
		int ret;

		if (unlikely(offset > net_device->recv_buf_size ||
			     buflen > net_device->recv_buf_size - offset)) {
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
			netif_err(net_device_ctx, rx_err, ndev,
				  "Packet offset:%u + len:%u too big\n",
				  offset, buflen);

			continue;
		}

		/* We're going to copy (sections of) the packet into nvchan->recv_buf;
		 * make sure that nvchan->recv_buf is large enough to hold the packet.
		 */
		if (unlikely(buflen > net_device->recv_section_size)) {
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
			netif_err(net_device_ctx, rx_err, ndev,
				  "Packet too big: buflen=%u recv_section_size=%u\n",
				  buflen, net_device->recv_section_size);

			continue;
		}

		data = recv_buf + offset;

		nvchan->rsc.is_last = (i == count - 1);

		trace_rndis_recv(ndev, q_idx, data);

		/* Pass it to the upper layer */
		ret = rndis_filter_receive(ndev, net_device,
					   nvchan, data, buflen);

		if (unlikely(ret != NVSP_STAT_SUCCESS)) {
			/* Drop incomplete packet */
			nvchan->rsc.cnt = 0;
			status = NVSP_STAT_FAIL;
		}
	}

	enq_receive_complete(ndev, net_device, q_idx,
			     vmxferpage_packet->d.trans_id, status);

	return count;
}

static void netvsc_send_table(struct net_device *ndev,
			      struct netvsc_device *nvscdev,
			      const struct nvsp_message *nvmsg,
			      u32 msglen)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);
	u32 count, offset, *tab;
	int i;

	/* Ensure packet is big enough to read send_table fields */
	if (msglen < sizeof(struct nvsp_message_header) +
	    sizeof(struct nvsp_5_send_indirect_table)) {
		netdev_err(ndev, "nvsp_v5_msg length too small: %u\n", msglen);
		return;
	}

	count = nvmsg->msg.v5_msg.send_table.count;
	offset = nvmsg->msg.v5_msg.send_table.offset;

	if (count != VRSS_SEND_TAB_SIZE) {
		netdev_err(ndev, "Received wrong send-table size:%u\n", count);
		return;
	}

	/* If negotiated version <= NVSP_PROTOCOL_VERSION_6, the offset may be
	 * wrong due to a host bug. So fix the offset here.
	 */
	if (nvscdev->nvsp_version <= NVSP_PROTOCOL_VERSION_6 &&
	    msglen >= sizeof(struct nvsp_message_header) +
	    sizeof(union nvsp_6_message_uber) + count * sizeof(u32))
		offset = sizeof(struct nvsp_message_header) +
			 sizeof(union nvsp_6_message_uber);

	/* Boundary check for all versions */
	if (msglen < count * sizeof(u32) || offset > msglen - count * sizeof(u32)) {
		netdev_err(ndev, "Received send-table offset too big:%u\n",
			   offset);
		return;
	}

	tab = (void *)nvmsg + offset;

	for (i = 0; i < count; i++)
		net_device_ctx->tx_table[i] = tab[i];
}

static void netvsc_send_vf(struct net_device *ndev,
			   const struct nvsp_message *nvmsg,
			   u32 msglen)
{
	struct net_device_context *net_device_ctx = netdev_priv(ndev);

	/* Ensure packet is big enough to read its fields */
	if (msglen < sizeof(struct nvsp_message_header) +
	    sizeof(struct nvsp_4_send_vf_association)) {
		netdev_err(ndev, "nvsp_v4_msg length too small: %u\n", msglen);
		return;
	}

	net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
	net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;

	if (net_device_ctx->vf_alloc)
		complete(&net_device_ctx->vf_add);

	netdev_info(ndev, "VF slot %u %s\n",
		    net_device_ctx->vf_serial,
		    net_device_ctx->vf_alloc ? "added" : "removed");
}

static void netvsc_receive_inband(struct net_device *ndev,
				  struct netvsc_device *nvscdev,
				  const struct vmpacket_descriptor *desc)
{
	const struct nvsp_message *nvmsg = hv_pkt_data(desc);
	u32 msglen = hv_pkt_datalen(desc);

	/* Ensure packet is big enough to read header fields */
	if (msglen < sizeof(struct nvsp_message_header)) {
		netdev_err(ndev, "inband nvsp_message length too small: %u\n", msglen);
		return;
	}

	switch (nvmsg->hdr.msg_type) {
	case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE:
		netvsc_send_table(ndev, nvscdev, nvmsg, msglen);
		break;

	case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
		if (hv_is_isolation_supported())
			netdev_err(ndev, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
		else
			netvsc_send_vf(ndev, nvmsg, msglen);
		break;
	}
}

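/* Demultiplex one VMBus packet: completions of our own sends, transfer
 * pages carrying received RNDIS data, or inband control messages.
 * Only the receive path consumes NAPI budget; the other packet types
 * report zero work done.
 */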
static int netvsc_process_raw_pkt(struct hv_device *device,
				  struct netvsc_channel *nvchan,
				  struct netvsc_device *net_device,
				  struct net_device *ndev,
				  const struct vmpacket_descriptor *desc,
				  int budget)
{
	struct vmbus_channel *channel = nvchan->channel;
	const struct nvsp_message *nvmsg = hv_pkt_data(desc);

	trace_nvsp_recv(ndev, channel, nvmsg);

	switch (desc->type) {
	case VM_PKT_COMP:
		netvsc_send_completion(ndev, net_device, channel, desc, budget);
		break;

	case VM_PKT_DATA_USING_XFER_PAGES:
		return netvsc_receive(ndev, net_device, nvchan, desc);

	case VM_PKT_DATA_INBAND:
		netvsc_receive_inband(ndev, net_device, desc);
		break;

	default:
		netdev_err(ndev, "unhandled packet type %d, tid %llx\n",
			   desc->type, desc->trans_id);
		break;
	}

	return 0;
}

static struct hv_device *netvsc_channel_to_device(struct vmbus_channel *channel)
{
	struct vmbus_channel *primary = channel->primary_channel;

	return primary ? primary->device_obj : channel->device_obj;
}

/* Network processing softirq
 * Process data in incoming ring buffer from host
 * Stops when ring is empty or budget is met or exceeded.
 */
int netvsc_poll(struct napi_struct *napi, int budget)
{
	struct netvsc_channel *nvchan
		= container_of(napi, struct netvsc_channel, napi);
	struct netvsc_device *net_device = nvchan->net_device;
	struct vmbus_channel *channel = nvchan->channel;
	struct hv_device *device = netvsc_channel_to_device(channel);
	struct net_device *ndev = hv_get_drvdata(device);
	int work_done = 0;
	int ret;

	/* If starting a new interval */
	if (!nvchan->desc)
		nvchan->desc = hv_pkt_iter_first(channel);

	nvchan->xdp_flush = false;

	while (nvchan->desc && work_done < budget) {
		work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
						    ndev, nvchan->desc, budget);
		nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
	}

	if (nvchan->xdp_flush)
		xdp_do_flush();

	/* Send any pending receive completions */
	ret = send_recv_completions(ndev, net_device, nvchan);

	/* If it did not exhaust NAPI budget this time
	 * and not doing busy poll
	 * then re-enable host interrupts
	 * and reschedule if ring is not empty
	 * or sending receive completion failed.
	 */
	if (work_done < budget &&
	    napi_complete_done(napi, work_done) &&
	    (ret || hv_end_read(&channel->inbound)) &&
	    napi_schedule_prep(napi)) {
		hv_begin_read(&channel->inbound);
		__napi_schedule(napi);
	}

	/* Driver may overshoot since multiple packets per descriptor */
	return min(work_done, budget);
}

/* Call back when data is available in host ring buffer.
 * Processing is deferred until network softirq (NAPI)
 */
void netvsc_channel_cb(void *context)
{
	struct netvsc_channel *nvchan = context;
	struct vmbus_channel *channel = nvchan->channel;
	struct hv_ring_buffer_info *rbi = &channel->inbound;

	/* preload first vmpacket descriptor */
	prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);

	if (napi_schedule_prep(&nvchan->napi)) {
		/* disable interrupts from host */
		hv_begin_read(rbi);

		__napi_schedule_irqoff(&nvchan->napi);
	}
}

/*
 * netvsc_device_add - Callback when the device belonging to this
 * driver is added
 */
struct netvsc_device *netvsc_device_add(struct hv_device *device,
				const struct netvsc_device_info *device_info)
{
	int i, ret = 0;
	struct netvsc_device *net_device;
	struct net_device *ndev = hv_get_drvdata(device);
	struct net_device_context *net_device_ctx = netdev_priv(ndev);

	net_device = alloc_net_device();
	if (!net_device)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < VRSS_SEND_TAB_SIZE; i++)
		net_device_ctx->tx_table[i] = 0;

	/* Because the device uses NAPI, all the interrupt batching and
	 * control is done via Net softirq, not the channel handling
	 */
	set_channel_read_mode(device->channel, HV_CALL_ISR);

	/* If we're reopening the device we may have multiple queues, fill the
	 * chn_table with the default channel to use it before subchannels are
	 * opened.
	 * Initialize the channel state before we open;
	 * we can be interrupted as soon as we open the channel.
	 */

	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
		struct netvsc_channel *nvchan = &net_device->chan_table[i];

		nvchan->channel = device->channel;
		nvchan->net_device = net_device;
		u64_stats_init(&nvchan->tx_stats.syncp);
		u64_stats_init(&nvchan->rx_stats.syncp);

		ret = xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i, 0);

		if (ret) {
			netdev_err(ndev, "xdp_rxq_info_reg fail: %d\n", ret);
			goto cleanup2;
		}

		ret = xdp_rxq_info_reg_mem_model(&nvchan->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);

		if (ret) {
			netdev_err(ndev, "xdp reg_mem_model fail: %d\n", ret);
			goto cleanup2;
		}
	}

	/* Enable NAPI handler before init callbacks */
	netif_napi_add(ndev, &net_device->chan_table[0].napi, netvsc_poll);

	/* Open the channel */
	device->channel->next_request_id_callback = vmbus_next_request_id;
	device->channel->request_addr_callback = vmbus_request_addr;
	device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes);
	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;

	ret = vmbus_open(device->channel, netvsc_ring_bytes,
			 netvsc_ring_bytes, NULL, 0,
			 netvsc_channel_cb, net_device->chan_table);

	if (ret != 0) {
		netdev_err(ndev, "unable to open channel: %d\n", ret);
		goto cleanup;
	}

	/* Channel is opened */
	netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");

	napi_enable(&net_device->chan_table[0].napi);

	/* Connect with the NetVsp */
	ret = netvsc_connect_vsp(device, net_device, device_info);
	if (ret != 0) {
		netdev_err(ndev,
			   "unable to connect to NetVSP - %d\n", ret);
		goto close;
	}

	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
	 * populated.
	 */
	rcu_assign_pointer(net_device_ctx->nvdev, net_device);

	return net_device;

close:
	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
	napi_disable(&net_device->chan_table[0].napi);

	/* Now, we can close the channel safely */
	vmbus_close(device->channel);

cleanup:
	netif_napi_del(&net_device->chan_table[0].napi);

cleanup2:
	free_netvsc_device(&net_device->rcu);

	return ERR_PTR(ret);
}

