netvsc.c source code [linux/drivers/net/hyperv/netvsc.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Copyright (c) 2009, Microsoft Corporation.
4	*
5	* Authors:
6	* Haiyang Zhang <haiyangz@microsoft.com>
7	* Hank Janssen <hjanssen@microsoft.com>
8	*/
9	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11	#include <linux/kernel.h>
12	#include <linux/sched.h>
13	#include <linux/wait.h>
14	#include <linux/mm.h>
15	#include <linux/delay.h>
16	#include <linux/io.h>
17	#include <linux/slab.h>
18	#include <linux/netdevice.h>
19	#include <linux/if_ether.h>
20	#include <linux/vmalloc.h>
21	#include <linux/rtnetlink.h>
22	#include <linux/prefetch.h>
23	#include <linux/filter.h>
24
25	#include <asm/sync_bitops.h>
26	#include <asm/mshyperv.h>
27
28	#include "hyperv_net.h"
29	#include "netvsc_trace.h"
30
31	/*
32	* Switch the data path from the synthetic interface to the VF
33	* interface.
34	*/
35	int netvsc_switch_datapath(struct net_device *ndev, bool vf)
36	{
37	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
38	struct hv_device *dev = net_device_ctx->device_ctx;
39	struct netvsc_device *nv_dev = rtnl_dereference(net_device_ctx->nvdev);
40	struct nvsp_message *init_pkt = &nv_dev->channel_init_pkt;
41	int ret, retry = `0`;
42
43	/ Block sending traffic to VF if it's about to be gone /
44	if (!vf)
45	net_device_ctx->data_path_is_vf = vf;
46
47	memset(init_pkt, `0`, sizeof(struct nvsp_message));
48	init_pkt->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
49	if (vf)
50	init_pkt->msg.v4_msg.active_dp.active_datapath =
51	NVSP_DATAPATH_VF;
52	else
53	init_pkt->msg.v4_msg.active_dp.active_datapath =
54	NVSP_DATAPATH_SYNTHETIC;
55
56	again:
57	trace_nvsp_send(ndev, msg: init_pkt);
58
59	ret = vmbus_sendpacket(channel: dev->channel, buffer: init_pkt,
60	bufferLen: sizeof(struct nvsp_message),
61	requestid: (unsigned long)init_pkt, type: VM_PKT_DATA_INBAND,
62	VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
63
64	/ If failed to switch to/from VF, let data_path_is_vf stay false,*
65	* so we use synthetic path to send data.
66	*/
67	if (ret) {
68	if (ret != -EAGAIN) {
69	netdev_err(dev: ndev,
70	format: "Unable to send sw datapath msg, err: %d\n",
71	ret);
72	return ret;
73	}
74
75	if (retry++ < RETRY_MAX) {
76	usleep_range(RETRY_US_LO, RETRY_US_HI);
77	goto again;
78	} else {
79	netdev_err(
80	dev: ndev,
81	format: "Retry failed to send sw datapath msg, err: %d\n",
82	ret);
83	return ret;
84	}
85	}
86
87	wait_for_completion(&nv_dev->channel_init_wait);
88	net_device_ctx->data_path_is_vf = vf;
89
90	return `0`;
91	}
92
93	/ Worker to setup sub channels on initial setup*
94	* Initial hotplug event occurs in softirq context
95	* and can't wait for channels.
96	*/
97	static void netvsc_subchan_work(struct work_struct *w)
98	{
99	struct netvsc_device *nvdev =
100	container_of(w, struct netvsc_device, subchan_work);
101	struct rndis_device *rdev;
102	int i, ret;
103
104	/ Avoid deadlock with device removal already under RTNL /
105	if (!rtnl_trylock()) {
106	schedule_work(work: w);
107	return;
108	}
109
110	rdev = nvdev->extension;
111	if (rdev) {
112	ret = rndis_set_subchannel(ndev: rdev->ndev, nvdev, NULL);
113	if (ret == `0`) {
114	netif_device_attach(dev: rdev->ndev);
115	} else {
116	/ fallback to only primary channel /
117	for (i = `1`; i < nvdev->num_chn; i++)
118	netif_napi_del(napi: &nvdev->chan_table[i].napi);
119
120	nvdev->max_chn = `1`;
121	nvdev->num_chn = `1`;
122	}
123	}
124
125	rtnl_unlock();
126	}
127
128	static struct netvsc_device alloc_net_device(void*)
129	{
130	struct netvsc_device *net_device;
131
132	net_device = kzalloc(size: sizeof(struct netvsc_device), GFP_KERNEL);
133	if (!net_device)
134	return NULL;
135
136	init_waitqueue_head(&net_device->wait_drain);
137	net_device->destroy = false;
138	net_device->tx_disable = true;
139
140	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
141	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
142
143	init_completion(x: &net_device->channel_init_wait);
144	init_waitqueue_head(&net_device->subchan_open);
145	INIT_WORK(&net_device->subchan_work, netvsc_subchan_work);
146
147	return net_device;
148	}
149
150	static void free_netvsc_device(struct rcu_head *head)
151	{
152	struct netvsc_device *nvdev
153	= container_of(head, struct netvsc_device, rcu);
154	int i;
155
156	kfree(objp: nvdev->extension);
157
158	if (!nvdev->recv_buf_gpadl_handle.decrypted)
159	vfree(addr: nvdev->recv_buf);
160	if (!nvdev->send_buf_gpadl_handle.decrypted)
161	vfree(addr: nvdev->send_buf);
162	bitmap_free(bitmap: nvdev->send_section_map);
163
164	for (i = `0`; i < VRSS_CHANNEL_MAX; i++) {
165	xdp_rxq_info_unreg(xdp_rxq: &nvdev->chan_table[i].xdp_rxq);
166	kfree(objp: nvdev->chan_table[i].recv_buf);
167	vfree(addr: nvdev->chan_table[i].mrc.slots);
168	}
169
170	kfree(objp: nvdev);
171	}
172
173	static void free_netvsc_device_rcu(struct netvsc_device *nvdev)
174	{
175	call_rcu(head: &nvdev->rcu, func: free_netvsc_device);
176	}
177
178	static void netvsc_revoke_recv_buf(struct hv_device *device,
179	struct netvsc_device *net_device,
180	struct net_device *ndev)
181	{
182	struct nvsp_message *revoke_packet;
183	int ret;
184
185	/*
186	* If we got a section count, it means we received a
187	* SendReceiveBufferComplete msg (ie sent
188	* NvspMessage1TypeSendReceiveBuffer msg) therefore, we need
189	* to send a revoke msg here
190	*/
191	if (net_device->recv_section_cnt) {
192	/ Send the revoke receive buffer /
193	revoke_packet = &net_device->revoke_packet;
194	memset(revoke_packet, `0`, sizeof(struct nvsp_message));
195
196	revoke_packet->hdr.msg_type =
197	NVSP_MSG1_TYPE_REVOKE_RECV_BUF;
198	revoke_packet->msg.v1_msg.
199	revoke_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;
200
201	trace_nvsp_send(ndev, msg: revoke_packet);
202
203	ret = vmbus_sendpacket(channel: device->channel,
204	buffer: revoke_packet,
205	bufferLen: sizeof(struct nvsp_message),
206	VMBUS_RQST_ID_NO_RESPONSE,
207	type: VM_PKT_DATA_INBAND, flags: `0`);
208	/ If the failure is because the channel is rescinded;*
209	* ignore the failure since we cannot send on a rescinded
210	* channel. This would allow us to properly cleanup
211	* even when the channel is rescinded.
212	*/
213	if (device->channel->rescind)
214	ret = `0`;
215	/*
216	* If we failed here, we might as well return and
217	* have a leak rather than continue and a bugchk
218	*/
219	if (ret != `0`) {
220	netdev_err(dev: ndev, format: "unable to send "
221	"revoke receive buffer to netvsp\n");
222	return;
223	}
224	net_device->recv_section_cnt = `0`;
225	}
226	}
227
228	static void netvsc_revoke_send_buf(struct hv_device *device,
229	struct netvsc_device *net_device,
230	struct net_device *ndev)
231	{
232	struct nvsp_message *revoke_packet;
233	int ret;
234
235	/ Deal with the send buffer we may have setup.*
236	* If we got a send section size, it means we received a
237	* NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE msg (ie sent
238	* NVSP_MSG1_TYPE_SEND_SEND_BUF msg) therefore, we need
239	* to send a revoke msg here
240	*/
241	if (net_device->send_section_cnt) {
242	/ Send the revoke receive buffer /
243	revoke_packet = &net_device->revoke_packet;
244	memset(revoke_packet, `0`, sizeof(struct nvsp_message));
245
246	revoke_packet->hdr.msg_type =
247	NVSP_MSG1_TYPE_REVOKE_SEND_BUF;
248	revoke_packet->msg.v1_msg.revoke_send_buf.id =
249	NETVSC_SEND_BUFFER_ID;
250
251	trace_nvsp_send(ndev, msg: revoke_packet);
252
253	ret = vmbus_sendpacket(channel: device->channel,
254	buffer: revoke_packet,
255	bufferLen: sizeof(struct nvsp_message),
256	VMBUS_RQST_ID_NO_RESPONSE,
257	type: VM_PKT_DATA_INBAND, flags: `0`);
258
259	/ If the failure is because the channel is rescinded;*
260	* ignore the failure since we cannot send on a rescinded
261	* channel. This would allow us to properly cleanup
262	* even when the channel is rescinded.
263	*/
264	if (device->channel->rescind)
265	ret = `0`;
266
267	/ If we failed here, we might as well return and*
268	* have a leak rather than continue and a bugchk
269	*/
270	if (ret != `0`) {
271	netdev_err(dev: ndev, format: "unable to send "
272	"revoke send buffer to netvsp\n");
273	return;
274	}
275	net_device->send_section_cnt = `0`;
276	}
277	}
278
279	static void netvsc_teardown_recv_gpadl(struct hv_device *device,
280	struct netvsc_device *net_device,
281	struct net_device *ndev)
282	{
283	int ret;
284
285	if (net_device->recv_buf_gpadl_handle.gpadl_handle) {
286	ret = vmbus_teardown_gpadl(channel: device->channel,
287	gpadl: &net_device->recv_buf_gpadl_handle);
288
289	/ If we failed here, we might as well return and have a leak*
290	* rather than continue and a bugchk
291	*/
292	if (ret != `0`) {
293	netdev_err(dev: ndev,
294	format: "unable to teardown receive buffer's gpadl\n");
295	return;
296	}
297	}
298	}
299
300	static void netvsc_teardown_send_gpadl(struct hv_device *device,
301	struct netvsc_device *net_device,
302	struct net_device *ndev)
303	{
304	int ret;
305
306	if (net_device->send_buf_gpadl_handle.gpadl_handle) {
307	ret = vmbus_teardown_gpadl(channel: device->channel,
308	gpadl: &net_device->send_buf_gpadl_handle);
309
310	/ If we failed here, we might as well return and have a leak*
311	* rather than continue and a bugchk
312	*/
313	if (ret != `0`) {
314	netdev_err(dev: ndev,
315	format: "unable to teardown send buffer's gpadl\n");
316	return;
317	}
318	}
319	}
320
321	int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx)
322	{
323	struct netvsc_channel *nvchan = &net_device->chan_table[q_idx];
324	int node = cpu_to_node(cpu: nvchan->channel->target_cpu);
325	size_t size;
326
327	size = net_device->recv_completion_cnt * sizeof(struct recv_comp_data);
328	nvchan->mrc.slots = vzalloc_node(size, node);
329	if (!nvchan->mrc.slots)
330	nvchan->mrc.slots = vzalloc(size);
331
332	return nvchan->mrc.slots ? `0` : -ENOMEM;
333	}
334
335	static int netvsc_init_buf(struct hv_device *device,
336	struct netvsc_device *net_device,
337	const struct netvsc_device_info *device_info)
338	{
339	struct nvsp_1_message_send_receive_buffer_complete *resp;
340	struct net_device *ndev = hv_get_drvdata(dev: device);
341	struct nvsp_message *init_packet;
342	unsigned int buf_size;
343	int i, ret = `0`;
344
345	/ Get receive buffer area. /
346	buf_size = device_info->recv_sections * device_info->recv_section_size;
347	buf_size = roundup(buf_size, PAGE_SIZE);
348
349	/ Legacy hosts only allow smaller receive buffer /
350	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_2)
351	buf_size = min_t(unsigned int, buf_size,
352	NETVSC_RECEIVE_BUFFER_SIZE_LEGACY);
353
354	net_device->recv_buf = vzalloc(size: buf_size);
355	if (!net_device->recv_buf) {
356	netdev_err(dev: ndev,
357	format: "unable to allocate receive buffer of size %u\n",
358	buf_size);
359	ret = -ENOMEM;
360	goto cleanup;
361	}
362
363	net_device->recv_buf_size = buf_size;
364
365	/*
366	* Establish the gpadl handle for this buffer on this
367	* channel. Note: This call uses the vmbus connection rather
368	* than the channel to establish the gpadl handle.
369	*/
370	ret = vmbus_establish_gpadl(channel: device->channel, kbuffer: net_device->recv_buf,
371	size: buf_size,
372	gpadl: &net_device->recv_buf_gpadl_handle);
373	if (ret != `0`) {
374	netdev_err(dev: ndev,
375	format: "unable to establish receive buffer's gpadl\n");
376	goto cleanup;
377	}
378
379	/ Notify the NetVsp of the gpadl handle /
380	init_packet = &net_device->channel_init_pkt;
381	memset(init_packet, `0`, sizeof(struct nvsp_message));
382	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_RECV_BUF;
383	init_packet->msg.v1_msg.send_recv_buf.
384	gpadl_handle = net_device->recv_buf_gpadl_handle.gpadl_handle;
385	init_packet->msg.v1_msg.
386	send_recv_buf.id = NETVSC_RECEIVE_BUFFER_ID;
387
388	trace_nvsp_send(ndev, msg: init_packet);
389
390	/ Send the gpadl notification request /
391	ret = vmbus_sendpacket(channel: device->channel, buffer: init_packet,
392	bufferLen: sizeof(struct nvsp_message),
393	requestid: (unsigned long)init_packet,
394	type: VM_PKT_DATA_INBAND,
395	VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
396	if (ret != `0`) {
397	netdev_err(dev: ndev,
398	format: "unable to send receive buffer's gpadl to netvsp\n");
399	goto cleanup;
400	}
401
402	wait_for_completion(&net_device->channel_init_wait);
403
404	/ Check the response /
405	resp = &init_packet->msg.v1_msg.send_recv_buf_complete;
406	if (resp->status != NVSP_STAT_SUCCESS) {
407	netdev_err(dev: ndev,
408	format: "Unable to complete receive buffer initialization with NetVsp - status %d\n",
409	resp->status);
410	ret = -EINVAL;
411	goto cleanup;
412	}
413
414	/ Parse the response /
415	netdev_dbg(ndev, "Receive sections: %u sub_allocs: size %u count: %u\n",
416	resp->num_sections, resp->sections[`0`].sub_alloc_size,
417	resp->sections[`0`].num_sub_allocs);
418
419	/ There should only be one section for the entire receive buffer /
420	if (resp->num_sections != `1` \|\| resp->sections[`0`].offset != `0`) {
421	ret = -EINVAL;
422	goto cleanup;
423	}
424
425	net_device->recv_section_size = resp->sections[`0`].sub_alloc_size;
426	net_device->recv_section_cnt = resp->sections[`0`].num_sub_allocs;
427
428	/ Ensure buffer will not overflow /
429	if (net_device->recv_section_size < NETVSC_MTU_MIN \|\| (u64)net_device->recv_section_size *
430	(u64)net_device->recv_section_cnt > (u64)buf_size) {
431	netdev_err(dev: ndev, format: "invalid recv_section_size %u\n",
432	net_device->recv_section_size);
433	ret = -EINVAL;
434	goto cleanup;
435	}
436
437	for (i = `0`; i < VRSS_CHANNEL_MAX; i++) {
438	struct netvsc_channel *nvchan = &net_device->chan_table[i];
439
440	nvchan->recv_buf = kzalloc(size: net_device->recv_section_size, GFP_KERNEL);
441	if (nvchan->recv_buf == NULL) {
442	ret = -ENOMEM;
443	goto cleanup;
444	}
445	}
446
447	/ Setup receive completion ring.*
448	* Add 1 to the recv_section_cnt because at least one entry in a
449	* ring buffer has to be empty.
450	*/
451	net_device->recv_completion_cnt = net_device->recv_section_cnt + `1`;
452	ret = netvsc_alloc_recv_comp_ring(net_device, q_idx: `0`);
453	if (ret)
454	goto cleanup;
455
456	/ Now setup the send buffer. /
457	buf_size = device_info->send_sections * device_info->send_section_size;
458	buf_size = round_up(buf_size, PAGE_SIZE);
459
460	net_device->send_buf = vzalloc(size: buf_size);
461	if (!net_device->send_buf) {
462	netdev_err(dev: ndev, format: "unable to allocate send buffer of size %u\n",
463	buf_size);
464	ret = -ENOMEM;
465	goto cleanup;
466	}
467	net_device->send_buf_size = buf_size;
468
469	/ Establish the gpadl handle for this buffer on this*
470	* channel. Note: This call uses the vmbus connection rather
471	* than the channel to establish the gpadl handle.
472	*/
473	ret = vmbus_establish_gpadl(channel: device->channel, kbuffer: net_device->send_buf,
474	size: buf_size,
475	gpadl: &net_device->send_buf_gpadl_handle);
476	if (ret != `0`) {
477	netdev_err(dev: ndev,
478	format: "unable to establish send buffer's gpadl\n");
479	goto cleanup;
480	}
481
482	/ Notify the NetVsp of the gpadl handle /
483	init_packet = &net_device->channel_init_pkt;
484	memset(init_packet, `0`, sizeof(struct nvsp_message));
485	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_SEND_BUF;
486	init_packet->msg.v1_msg.send_send_buf.gpadl_handle =
487	net_device->send_buf_gpadl_handle.gpadl_handle;
488	init_packet->msg.v1_msg.send_send_buf.id = NETVSC_SEND_BUFFER_ID;
489
490	trace_nvsp_send(ndev, msg: init_packet);
491
492	/ Send the gpadl notification request /
493	ret = vmbus_sendpacket(channel: device->channel, buffer: init_packet,
494	bufferLen: sizeof(struct nvsp_message),
495	requestid: (unsigned long)init_packet,
496	type: VM_PKT_DATA_INBAND,
497	VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
498	if (ret != `0`) {
499	netdev_err(dev: ndev,
500	format: "unable to send send buffer's gpadl to netvsp\n");
501	goto cleanup;
502	}
503
504	wait_for_completion(&net_device->channel_init_wait);
505
506	/ Check the response /
507	if (init_packet->msg.v1_msg.
508	send_send_buf_complete.status != NVSP_STAT_SUCCESS) {
509	netdev_err(dev: ndev, format: "Unable to complete send buffer "
510	"initialization with NetVsp - status %d\n",
511	init_packet->msg.v1_msg.
512	send_send_buf_complete.status);
513	ret = -EINVAL;
514	goto cleanup;
515	}
516
517	/ Parse the response /
518	net_device->send_section_size = init_packet->msg.
519	v1_msg.send_send_buf_complete.section_size;
520	if (net_device->send_section_size < NETVSC_MTU_MIN) {
521	netdev_err(dev: ndev, format: "invalid send_section_size %u\n",
522	net_device->send_section_size);
523	ret = -EINVAL;
524	goto cleanup;
525	}
526
527	/ Section count is simply the size divided by the section size. /
528	net_device->send_section_cnt = buf_size / net_device->send_section_size;
529
530	netdev_dbg(ndev, "Send section size: %d, Section count:%d\n",
531	net_device->send_section_size, net_device->send_section_cnt);
532
533	/ Setup state for managing the send buffer. /
534	net_device->send_section_map = bitmap_zalloc(nbits: net_device->send_section_cnt,
535	GFP_KERNEL);
536	if (!net_device->send_section_map) {
537	ret = -ENOMEM;
538	goto cleanup;
539	}
540
541	goto exit;
542
543	cleanup:
544	netvsc_revoke_recv_buf(device, net_device, ndev);
545	netvsc_revoke_send_buf(device, net_device, ndev);
546	netvsc_teardown_recv_gpadl(device, net_device, ndev);
547	netvsc_teardown_send_gpadl(device, net_device, ndev);
548
549	exit:
550	return ret;
551	}
552
553	/ Negotiate NVSP protocol version /
554	static int negotiate_nvsp_ver(struct hv_device *device,
555	struct netvsc_device *net_device,
556	struct nvsp_message *init_packet,
557	u32 nvsp_ver)
558	{
559	struct net_device *ndev = hv_get_drvdata(dev: device);
560	int ret;
561
562	memset(init_packet, `0`, sizeof(struct nvsp_message));
563	init_packet->hdr.msg_type = NVSP_MSG_TYPE_INIT;
564	init_packet->msg.init_msg.init.min_protocol_ver = nvsp_ver;
565	init_packet->msg.init_msg.init.max_protocol_ver = nvsp_ver;
566	trace_nvsp_send(ndev, msg: init_packet);
567
568	/ Send the init request /
569	ret = vmbus_sendpacket(channel: device->channel, buffer: init_packet,
570	bufferLen: sizeof(struct nvsp_message),
571	requestid: (unsigned long)init_packet,
572	type: VM_PKT_DATA_INBAND,
573	VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
574
575	if (ret != `0`)
576	return ret;
577
578	wait_for_completion(&net_device->channel_init_wait);
579
580	if (init_packet->msg.init_msg.init_complete.status !=
581	NVSP_STAT_SUCCESS)
582	return -EINVAL;
583
584	if (nvsp_ver == NVSP_PROTOCOL_VERSION_1)
585	return `0`;
586
587	/ NVSPv2 or later: Send NDIS config /
588	memset(init_packet, `0`, sizeof(struct nvsp_message));
589	init_packet->hdr.msg_type = NVSP_MSG2_TYPE_SEND_NDIS_CONFIG;
590	init_packet->msg.v2_msg.send_ndis_config.mtu = ndev->mtu + ETH_HLEN;
591	init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = `1`;
592
593	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
594	if (hv_is_isolation_supported())
595	netdev_info(dev: ndev, format: "SR-IOV not advertised by guests on the host supporting isolation\n");
596	else
597	init_packet->msg.v2_msg.send_ndis_config.capability.sriov = `1`;
598
599	/ Teaming bit is needed to receive link speed updates /
600	init_packet->msg.v2_msg.send_ndis_config.capability.teaming = `1`;
601	}
602
603	if (nvsp_ver >= NVSP_PROTOCOL_VERSION_61)
604	init_packet->msg.v2_msg.send_ndis_config.capability.rsc = `1`;
605
606	trace_nvsp_send(ndev, msg: init_packet);
607
608	ret = vmbus_sendpacket(channel: device->channel, buffer: init_packet,
609	bufferLen: sizeof(struct nvsp_message),
610	VMBUS_RQST_ID_NO_RESPONSE,
611	type: VM_PKT_DATA_INBAND, flags: `0`);
612
613	return ret;
614	}
615
616	static int netvsc_connect_vsp(struct hv_device *device,
617	struct netvsc_device *net_device,
618	const struct netvsc_device_info *device_info)
619	{
620	struct net_device *ndev = hv_get_drvdata(dev: device);
621	static const u32 ver_list[] = {
622	NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2,
623	NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5,
624	NVSP_PROTOCOL_VERSION_6, NVSP_PROTOCOL_VERSION_61
625	};
626	struct nvsp_message *init_packet;
627	int ndis_version, i, ret;
628
629	init_packet = &net_device->channel_init_pkt;
630
631	/ Negotiate the latest NVSP protocol supported /
632	for (i = ARRAY_SIZE(ver_list) - `1`; i >= `0`; i--)
633	if (negotiate_nvsp_ver(device, net_device, init_packet,
634	nvsp_ver: ver_list[i]) == `0`) {
635	net_device->nvsp_version = ver_list[i];
636	break;
637	}
638
639	if (i < `0`) {
640	ret = -EPROTO;
641	goto cleanup;
642	}
643
644	if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
645	netdev_err(dev: ndev, format: "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
646	net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
647	ret = -EPROTO;
648	goto cleanup;
649	}
650
651	pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);
652
653	/ Send the ndis version /
654	memset(init_packet, `0`, sizeof(struct nvsp_message));
655
656	if (net_device->nvsp_version <= NVSP_PROTOCOL_VERSION_4)
657	ndis_version = `0x00060001`;
658	else
659	ndis_version = `0x0006001e`;
660
661	init_packet->hdr.msg_type = NVSP_MSG1_TYPE_SEND_NDIS_VER;
662	init_packet->msg.v1_msg.
663	send_ndis_ver.ndis_major_ver =
664	(ndis_version & `0xFFFF0000`) >> `16`;
665	init_packet->msg.v1_msg.
666	send_ndis_ver.ndis_minor_ver =
667	ndis_version & `0xFFFF`;
668
669	trace_nvsp_send(ndev, msg: init_packet);
670
671	/ Send the init request /
672	ret = vmbus_sendpacket(channel: device->channel, buffer: init_packet,
673	bufferLen: sizeof(struct nvsp_message),
674	VMBUS_RQST_ID_NO_RESPONSE,
675	type: VM_PKT_DATA_INBAND, flags: `0`);
676	if (ret != `0`)
677	goto cleanup;
678
679
680	ret = netvsc_init_buf(device, net_device, device_info);
681
682	cleanup:
683	return ret;
684	}
685
686	/*
687	* netvsc_device_remove - Callback when the root bus device is removed
688	*/
689	void netvsc_device_remove(struct hv_device *device)
690	{
691	struct net_device *ndev = hv_get_drvdata(dev: device);
692	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
693	struct netvsc_device *net_device
694	= rtnl_dereference(net_device_ctx->nvdev);
695	int i;
696
697	/*
698	* Revoke receive buffer. If host is pre-Win2016 then tear down
699	* receive buffer GPADL. Do the same for send buffer.
700	*/
701	netvsc_revoke_recv_buf(device, net_device, ndev);
702	if (vmbus_proto_version < VERSION_WIN10)
703	netvsc_teardown_recv_gpadl(device, net_device, ndev);
704
705	netvsc_revoke_send_buf(device, net_device, ndev);
706	if (vmbus_proto_version < VERSION_WIN10)
707	netvsc_teardown_send_gpadl(device, net_device, ndev);
708
709	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
710
711	/ Disable NAPI and disassociate its context from the device. /
712	for (i = `0`; i < net_device->num_chn; i++) {
713	/ See also vmbus_reset_channel_cb(). /
714	/ only disable enabled NAPI channel /
715	if (i < ndev->real_num_rx_queues)
716	napi_disable(n: &net_device->chan_table[i].napi);
717
718	netif_napi_del(napi: &net_device->chan_table[i].napi);
719	}
720
721	/*
722	* At this point, no one should be accessing net_device
723	* except in here
724	*/
725	netdev_dbg(ndev, "net device safe to remove\n");
726
727	/ Now, we can close the channel safely /
728	vmbus_close(channel: device->channel);
729
730	/*
731	* If host is Win2016 or higher then we do the GPADL tear down
732	* here after VMBus is closed.
733	*/
734	if (vmbus_proto_version >= VERSION_WIN10) {
735	netvsc_teardown_recv_gpadl(device, net_device, ndev);
736	netvsc_teardown_send_gpadl(device, net_device, ndev);
737	}
738
739	/ Release all resources /
740	free_netvsc_device_rcu(nvdev: net_device);
741	}
742
/*
 * Thresholds on the free space of the outbound ring, in percent.
 * netvsc_send_tx_complete() wakes a stopped TX queue once more than
 * RING_AVAIL_PERCENT_HIWATER percent of the ring is available to write.
 * RING_AVAIL_PERCENT_LOWATER is not referenced in this part of the file.
 */
#define RING_AVAIL_PERCENT_HIWATER 20
#define RING_AVAIL_PERCENT_LOWATER 10
745
746	static inline void netvsc_free_send_slot(struct netvsc_device *net_device,
747	u32 index)
748	{
749	sync_change_bit(nr: index, addr: net_device->send_section_map);
750	}
751
752	static void netvsc_send_tx_complete(struct net_device *ndev,
753	struct netvsc_device *net_device,
754	struct vmbus_channel *channel,
755	const struct vmpacket_descriptor *desc,
756	int budget)
757	{
758	struct net_device_context *ndev_ctx = netdev_priv(dev: ndev);
759	struct sk_buff *skb;
760	u16 q_idx = `0`;
761	int queue_sends;
762	u64 cmd_rqst;
763
764	cmd_rqst = channel->request_addr_callback(channel, desc->trans_id);
765	if (cmd_rqst == VMBUS_RQST_ERROR) {
766	netdev_err(dev: ndev, format: "Invalid transaction ID %llx\n", desc->trans_id);
767	return;
768	}
769
770	skb = (struct sk_buff )(unsigned* long)cmd_rqst;
771
772	/ Notify the layer above us /
773	if (likely(skb)) {
774	struct hv_netvsc_packet *packet
775	= (struct hv_netvsc_packet *)skb->cb;
776	u32 send_index = packet->send_buf_index;
777	struct netvsc_stats_tx *tx_stats;
778
779	if (send_index != NETVSC_INVALID_INDEX)
780	netvsc_free_send_slot(net_device, index: send_index);
781	q_idx = packet->q_idx;
782
783	tx_stats = &net_device->chan_table[q_idx].tx_stats;
784
785	u64_stats_update_begin(syncp: &tx_stats->syncp);
786	tx_stats->packets += packet->total_packets;
787	tx_stats->bytes += packet->total_bytes;
788	u64_stats_update_end(syncp: &tx_stats->syncp);
789
790	netvsc_dma_unmap(hv_dev: ndev_ctx->device_ctx, packet);
791	napi_consume_skb(skb, budget);
792	}
793
794	queue_sends =
795	atomic_dec_return(v: &net_device->chan_table[q_idx].queue_sends);
796
797	if (unlikely(net_device->destroy)) {
798	if (queue_sends == `0`)
799	wake_up(&net_device->wait_drain);
800	} else {
801	struct netdev_queue *txq = netdev_get_tx_queue(dev: ndev, index: q_idx);
802
803	if (netif_tx_queue_stopped(dev_queue: txq) && !net_device->tx_disable &&
804	(hv_get_avail_to_write_percent(rbi: &channel->outbound) >
805	RING_AVAIL_PERCENT_HIWATER \|\| queue_sends < `1`)) {
806	netif_tx_wake_queue(dev_queue: txq);
807	ndev_ctx->eth_stats.wake_queue++;
808	}
809	}
810	}
811
812	static void netvsc_send_completion(struct net_device *ndev,
813	struct netvsc_device *net_device,
814	struct vmbus_channel *incoming_channel,
815	const struct vmpacket_descriptor *desc,
816	int budget)
817	{
818	const struct nvsp_message *nvsp_packet;
819	u32 msglen = hv_pkt_datalen(desc);
820	struct nvsp_message *pkt_rqst;
821	u64 cmd_rqst;
822	u32 status;
823
824	/ First check if this is a VMBUS completion without data payload /
825	if (!msglen) {
826	cmd_rqst = incoming_channel->request_addr_callback(incoming_channel,
827	desc->trans_id);
828	if (cmd_rqst == VMBUS_RQST_ERROR) {
829	netdev_err(dev: ndev, format: "Invalid transaction ID %llx\n", desc->trans_id);
830	return;
831	}
832
833	pkt_rqst = (struct nvsp_message *)(uintptr_t)cmd_rqst;
834	switch (pkt_rqst->hdr.msg_type) {
835	case NVSP_MSG4_TYPE_SWITCH_DATA_PATH:
836	complete(&net_device->channel_init_wait);
837	break;
838
839	default:
840	netdev_err(dev: ndev, format: "Unexpected VMBUS completion!!\n");
841	}
842	return;
843	}
844
845	/ Ensure packet is big enough to read header fields /
846	if (msglen < sizeof(struct nvsp_message_header)) {
847	netdev_err(dev: ndev, format: "nvsp_message length too small: %u\n", msglen);
848	return;
849	}
850
851	nvsp_packet = hv_pkt_data(desc);
852	switch (nvsp_packet->hdr.msg_type) {
853	case NVSP_MSG_TYPE_INIT_COMPLETE:
854	if (msglen < sizeof(struct nvsp_message_header) +
855	sizeof(struct nvsp_message_init_complete)) {
856	netdev_err(dev: ndev, format: "nvsp_msg length too small: %u\n",
857	msglen);
858	return;
859	}
860	break;
861
862	case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE:
863	if (msglen < sizeof(struct nvsp_message_header) +
864	sizeof(struct nvsp_1_message_send_receive_buffer_complete)) {
865	netdev_err(dev: ndev, format: "nvsp_msg1 length too small: %u\n",
866	msglen);
867	return;
868	}
869	break;
870
871	case NVSP_MSG1_TYPE_SEND_SEND_BUF_COMPLETE:
872	if (msglen < sizeof(struct nvsp_message_header) +
873	sizeof(struct nvsp_1_message_send_send_buffer_complete)) {
874	netdev_err(dev: ndev, format: "nvsp_msg1 length too small: %u\n",
875	msglen);
876	return;
877	}
878	break;
879
880	case NVSP_MSG5_TYPE_SUBCHANNEL:
881	if (msglen < sizeof(struct nvsp_message_header) +
882	sizeof(struct nvsp_5_subchannel_complete)) {
883	netdev_err(dev: ndev, format: "nvsp_msg5 length too small: %u\n",
884	msglen);
885	return;
886	}
887	break;
888
889	case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE:
890	if (msglen < sizeof(struct nvsp_message_header) +
891	sizeof(struct nvsp_1_message_send_rndis_packet_complete)) {
892	if (net_ratelimit())
893	netdev_err(dev: ndev, format: "nvsp_rndis_pkt_complete length too small: %u\n",
894	msglen);
895	return;
896	}
897
898	/ If status indicates an error, output a message so we know*
899	* there's a problem. But process the completion anyway so the
900	* resources are released.
901	*/
902	status = nvsp_packet->msg.v1_msg.send_rndis_pkt_complete.status;
903	if (status != NVSP_STAT_SUCCESS && net_ratelimit())
904	netdev_err(dev: ndev, format: "nvsp_rndis_pkt_complete error status: %x\n",
905	status);
906
907	netvsc_send_tx_complete(ndev, net_device, channel: incoming_channel,
908	desc, budget);
909	return;
910
911	default:
912	netdev_err(dev: ndev,
913	format: "Unknown send completion type %d received!!\n",
914	nvsp_packet->hdr.msg_type);
915	return;
916	}
917
918	/ Copy the response back /
919	memcpy(&net_device->channel_init_pkt, nvsp_packet,
920	sizeof(struct nvsp_message));
921	complete(&net_device->channel_init_wait);
922	}
923
924	static u32 netvsc_get_next_send_section(struct netvsc_device *net_device)
925	{
926	unsigned long *map_addr = net_device->send_section_map;
927	unsigned int i;
928
929	for_each_clear_bit(i, map_addr, net_device->send_section_cnt) {
930	if (sync_test_and_set_bit(nr: i, addr: map_addr) == `0`)
931	return i;
932	}
933
934	return NETVSC_INVALID_INDEX;
935	}
936
937	static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
938	unsigned int section_index,
939	u32 pend_size,
940	struct hv_netvsc_packet *packet,
941	struct rndis_message *rndis_msg,
942	struct hv_page_buffer *pb,
943	bool xmit_more)
944	{
945	char *start = net_device->send_buf;
946	char dest = start + (section_index net_device->send_section_size)
947	+ pend_size;
948	int i;
949	u32 padding = `0`;
950	u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt :
951	packet->page_buf_cnt;
952	u32 remain;
953
954	/ Add padding /
955	remain = packet->total_data_buflen & (net_device->pkt_align - `1`);
956	if (xmit_more && remain) {
957	padding = net_device->pkt_align - remain;
958	rndis_msg->msg_len += padding;
959	packet->total_data_buflen += padding;
960	}
961
962	for (i = `0`; i < page_count; i++) {
963	char *src = phys_to_virt(address: pb[i].pfn << HV_HYP_PAGE_SHIFT);
964	u32 offset = pb[i].offset;
965	u32 len = pb[i].len;
966
967	memcpy(dest, (src + offset), len);
968	dest += len;
969	}
970
971	if (padding)
972	memset(dest, `0`, padding);
973	}
974
975	void netvsc_dma_unmap(struct hv_device *hv_dev,
976	struct hv_netvsc_packet *packet)
977	{
978	int i;
979
980	if (!hv_is_isolation_supported())
981	return;
982
983	if (!packet->dma_range)
984	return;
985
986	for (i = `0`; i < packet->page_buf_cnt; i++)
987	dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
988	packet->dma_range[i].mapping_size,
989	DMA_TO_DEVICE);
990
991	kfree(objp: packet->dma_range);
992	}
993
994	/ netvsc_dma_map - Map swiotlb bounce buffer with data page of*
995	* packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
996	* VM.
997	*
998	* In isolation VM, netvsc send buffer has been marked visible to
999	* host and so the data copied to send buffer doesn't need to use
1000	* bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
1001	* may not be copied to send buffer and so these pages need to be
1002	* mapped with swiotlb bounce buffer. netvsc_dma_map() is to do
1003	* that. The pfns in the struct hv_page_buffer need to be converted
1004	* to bounce buffer's pfn. The loop here is necessary because the
1005	* entries in the page buffer array are not necessarily full
1006	* pages of data. Each entry in the array has a separate offset and
1007	* len that may be non-zero, even for entries in the middle of the
1008	* array. And the entries are not physically contiguous. So each
1009	* entry must be individually mapped rather than as a contiguous unit.
1010	* So not use dma_map_sg() here.
1011	*/
1012	static int netvsc_dma_map(struct hv_device *hv_dev,
1013	struct hv_netvsc_packet *packet,
1014	struct hv_page_buffer *pb)
1015	{
1016	u32 page_count = packet->page_buf_cnt;
1017	dma_addr_t dma;
1018	int i;
1019
1020	if (!hv_is_isolation_supported())
1021	return `0`;
1022
1023	packet->dma_range = kcalloc(n: page_count,
1024	size: sizeof(*packet->dma_range),
1025	GFP_ATOMIC);
1026	if (!packet->dma_range)
1027	return -ENOMEM;
1028
1029	for (i = `0`; i < page_count; i++) {
1030	char *src = phys_to_virt(address: (pb[i].pfn << HV_HYP_PAGE_SHIFT)
1031	+ pb[i].offset);
1032	u32 len = pb[i].len;
1033
1034	dma = dma_map_single(&hv_dev->device, src, len,
1035	DMA_TO_DEVICE);
1036	if (dma_mapping_error(dev: &hv_dev->device, dma_addr: dma)) {
1037	kfree(objp: packet->dma_range);
1038	return -ENOMEM;
1039	}
1040
1041	/ pb[].offset and pb[].len are not changed during dma mapping*
1042	* and so not reassign.
1043	*/
1044	packet->dma_range[i].dma = dma;
1045	packet->dma_range[i].mapping_size = len;
1046	pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
1047	}
1048
1049	return `0`;
1050	}
1051
1052	static inline int netvsc_send_pkt(
1053	struct hv_device *device,
1054	struct hv_netvsc_packet *packet,
1055	struct netvsc_device *net_device,
1056	struct hv_page_buffer *pb,
1057	struct sk_buff *skb)
1058	{
1059	struct nvsp_message nvmsg;
1060	struct nvsp_1_message_send_rndis_packet *rpkt =
1061	&nvmsg.msg.v1_msg.send_rndis_pkt;
1062	struct netvsc_channel * const nvchan =
1063	&net_device->chan_table[packet->q_idx];
1064	struct vmbus_channel *out_channel = nvchan->channel;
1065	struct net_device *ndev = hv_get_drvdata(dev: device);
1066	struct net_device_context *ndev_ctx = netdev_priv(dev: ndev);
1067	struct netdev_queue *txq = netdev_get_tx_queue(dev: ndev, index: packet->q_idx);
1068	u64 req_id;
1069	int ret;
1070	u32 ring_avail = hv_get_avail_to_write_percent(rbi: &out_channel->outbound);
1071
1072	memset(&nvmsg, `0`, sizeof(struct nvsp_message));
1073	nvmsg.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT;
1074	if (skb)
1075	rpkt->channel_type = `0`; / 0 is RMC_DATA /
1076	else
1077	rpkt->channel_type = `1`; / 1 is RMC_CONTROL /
1078
1079	rpkt->send_buf_section_index = packet->send_buf_index;
1080	if (packet->send_buf_index == NETVSC_INVALID_INDEX)
1081	rpkt->send_buf_section_size = `0`;
1082	else
1083	rpkt->send_buf_section_size = packet->total_data_buflen;
1084
1085	req_id = (ulong)skb;
1086
1087	if (out_channel->rescind)
1088	return -ENODEV;
1089
1090	trace_nvsp_send_pkt(ndev, chan: out_channel, rpkt);
1091
1092	packet->dma_range = NULL;
1093	if (packet->page_buf_cnt) {
1094	if (packet->cp_partial)
1095	pb += packet->rmsg_pgcnt;
1096
1097	ret = netvsc_dma_map(hv_dev: ndev_ctx->device_ctx, packet, pb);
1098	if (ret) {
1099	ret = -EAGAIN;
1100	goto exit;
1101	}
1102
1103	ret = vmbus_sendpacket_pagebuffer(channel: out_channel,
1104	pagebuffers: pb, pagecount: packet->page_buf_cnt,
1105	buffer: &nvmsg, bufferlen: sizeof(nvmsg),
1106	requestid: req_id);
1107
1108	if (ret)
1109	netvsc_dma_unmap(hv_dev: ndev_ctx->device_ctx, packet);
1110	} else {
1111	ret = vmbus_sendpacket(channel: out_channel,
1112	buffer: &nvmsg, bufferLen: sizeof(nvmsg),
1113	requestid: req_id, type: VM_PKT_DATA_INBAND,
1114	VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1115	}
1116
1117	exit:
1118	if (ret == `0`) {
1119	atomic_inc_return(v: &nvchan->queue_sends);
1120
1121	if (ring_avail < RING_AVAIL_PERCENT_LOWATER) {
1122	netif_tx_stop_queue(dev_queue: txq);
1123	ndev_ctx->eth_stats.stop_queue++;
1124	}
1125	} else if (ret == -EAGAIN) {
1126	netif_tx_stop_queue(dev_queue: txq);
1127	ndev_ctx->eth_stats.stop_queue++;
1128	} else {
1129	netdev_err(dev: ndev,
1130	format: "Unable to send packet pages %u len %u, ret %d\n",
1131	packet->page_buf_cnt, packet->total_data_buflen,
1132	ret);
1133	}
1134
1135	if (netif_tx_queue_stopped(dev_queue: txq) &&
1136	atomic_read(v: &nvchan->queue_sends) < `1` &&
1137	!net_device->tx_disable) {
1138	netif_tx_wake_queue(dev_queue: txq);
1139	ndev_ctx->eth_stats.wake_queue++;
1140	if (ret == -EAGAIN)
1141	ret = -ENOSPC;
1142	}
1143
1144	return ret;
1145	}
1146
1147	/ Move packet out of multi send data (msd), and clear msd /
1148	static inline void move_pkt_msd(struct hv_netvsc_packet **msd_send,
1149	struct sk_buff **msd_skb,
1150	struct multi_send_data *msdp)
1151	{
1152	*msd_skb = msdp->skb;
1153	*msd_send = msdp->pkt;
1154	msdp->skb = NULL;
1155	msdp->pkt = NULL;
1156	msdp->count = `0`;
1157	}
1158
1159	/ RCU already held by caller /
1160	/ Batching/bouncing logic is designed to attempt to optimize*
1161	* performance.
1162	*
1163	* For small, non-LSO packets we copy the packet to a send buffer
1164	* which is pre-registered with the Hyper-V side. This enables the
1165	* hypervisor to avoid remapping the aperture to access the packet
1166	* descriptor and data.
1167	*
1168	* If we already started using a buffer and the netdev is transmitting
1169	* a burst of packets, keep on copying into the buffer until it is
1170	* full or we are done collecting a burst. If there is an existing
1171	* buffer with space for the RNDIS descriptor but not the packet, copy
1172	* the RNDIS descriptor to the buffer, keeping the packet in place.
1173	*
1174	* If we do batching and send more than one packet using a single
1175	* NetVSC message, free the SKBs of the packets copied, except for the
1176	* last packet. This is done to streamline the handling of the case
1177	* where the last packet only had the RNDIS descriptor copied to the
1178	* send buffer, with the data pointers included in the NetVSC message.
1179	*/
1180	int netvsc_send(struct net_device *ndev,
1181	struct hv_netvsc_packet *packet,
1182	struct rndis_message *rndis_msg,
1183	struct hv_page_buffer *pb,
1184	struct sk_buff *skb,
1185	bool xdp_tx)
1186	{
1187	struct net_device_context *ndev_ctx = netdev_priv(dev: ndev);
1188	struct netvsc_device *net_device
1189	= rcu_dereference_bh(ndev_ctx->nvdev);
1190	struct hv_device *device = ndev_ctx->device_ctx;
1191	int ret = `0`;
1192	struct netvsc_channel *nvchan;
1193	u32 pktlen = packet->total_data_buflen, msd_len = `0`;
1194	unsigned int section_index = NETVSC_INVALID_INDEX;
1195	struct multi_send_data *msdp;
1196	struct hv_netvsc_packet msd_send = NULL, cur_send = NULL;
1197	struct sk_buff *msd_skb = NULL;
1198	bool try_batch, xmit_more;
1199
1200	/ If device is rescinded, return error and packet will get dropped. /
1201	if (unlikely(!net_device \|\| net_device->destroy))
1202	return -ENODEV;
1203
1204	nvchan = &net_device->chan_table[packet->q_idx];
1205	packet->send_buf_index = NETVSC_INVALID_INDEX;
1206	packet->cp_partial = false;
1207
1208	/ Send a control message or XDP packet directly without accessing*
1209	* msd (Multi-Send Data) field which may be changed during data packet
1210	* processing.
1211	*/
1212	if (!skb \|\| xdp_tx)
1213	return netvsc_send_pkt(device, packet, net_device, pb, skb);
1214
1215	/ batch packets in send buffer if possible /
1216	msdp = &nvchan->msd;
1217	if (msdp->pkt)
1218	msd_len = msdp->pkt->total_data_buflen;
1219
1220	try_batch = msd_len > `0` && msdp->count < net_device->max_pkt;
1221	if (try_batch && msd_len + pktlen + net_device->pkt_align <
1222	net_device->send_section_size) {
1223	section_index = msdp->pkt->send_buf_index;
1224
1225	} else if (try_batch && msd_len + packet->rmsg_size <
1226	net_device->send_section_size) {
1227	section_index = msdp->pkt->send_buf_index;
1228	packet->cp_partial = true;
1229
1230	} else if (pktlen + net_device->pkt_align <
1231	net_device->send_section_size) {
1232	section_index = netvsc_get_next_send_section(net_device);
1233	if (unlikely(section_index == NETVSC_INVALID_INDEX)) {
1234	++ndev_ctx->eth_stats.tx_send_full;
1235	} else {
1236	move_pkt_msd(msd_send: &msd_send, msd_skb: &msd_skb, msdp);
1237	msd_len = `0`;
1238	}
1239	}
1240
1241	/ Keep aggregating only if stack says more data is coming*
1242	* and not doing mixed modes send and not flow blocked
1243	*/
1244	xmit_more = netdev_xmit_more() &&
1245	!packet->cp_partial &&
1246	!netif_xmit_stopped(dev_queue: netdev_get_tx_queue(dev: ndev, index: packet->q_idx));
1247
1248	if (section_index != NETVSC_INVALID_INDEX) {
1249	netvsc_copy_to_send_buf(net_device,
1250	section_index, pend_size: msd_len,
1251	packet, rndis_msg, pb, xmit_more);
1252
1253	packet->send_buf_index = section_index;
1254
1255	if (packet->cp_partial) {
1256	packet->page_buf_cnt -= packet->rmsg_pgcnt;
1257	packet->total_data_buflen = msd_len + packet->rmsg_size;
1258	} else {
1259	packet->page_buf_cnt = `0`;
1260	packet->total_data_buflen += msd_len;
1261	}
1262
1263	if (msdp->pkt) {
1264	packet->total_packets += msdp->pkt->total_packets;
1265	packet->total_bytes += msdp->pkt->total_bytes;
1266	}
1267
1268	if (msdp->skb)
1269	dev_consume_skb_any(skb: msdp->skb);
1270
1271	if (xmit_more) {
1272	msdp->skb = skb;
1273	msdp->pkt = packet;
1274	msdp->count++;
1275	} else {
1276	cur_send = packet;
1277	msdp->skb = NULL;
1278	msdp->pkt = NULL;
1279	msdp->count = `0`;
1280	}
1281	} else {
1282	move_pkt_msd(msd_send: &msd_send, msd_skb: &msd_skb, msdp);
1283	cur_send = packet;
1284	}
1285
1286	if (msd_send) {
1287	int m_ret = netvsc_send_pkt(device, packet: msd_send, net_device,
1288	NULL, skb: msd_skb);
1289
1290	if (m_ret != `0`) {
1291	netvsc_free_send_slot(net_device,
1292	index: msd_send->send_buf_index);
1293	dev_kfree_skb_any(skb: msd_skb);
1294	}
1295	}
1296
1297	if (cur_send)
1298	ret = netvsc_send_pkt(device, packet: cur_send, net_device, pb, skb);
1299
1300	if (ret != `0` && section_index != NETVSC_INVALID_INDEX)
1301	netvsc_free_send_slot(net_device, index: section_index);
1302
1303	return ret;
1304	}
1305
1306	/ Send pending recv completions /
1307	static int send_recv_completions(struct net_device *ndev,
1308	struct netvsc_device *nvdev,
1309	struct netvsc_channel *nvchan)
1310	{
1311	struct multi_recv_comp *mrc = &nvchan->mrc;
1312	struct recv_comp_msg {
1313	struct nvsp_message_header hdr;
1314	u32 status;
1315	} __packed;
1316	struct recv_comp_msg msg = {
1317	.hdr.msg_type = NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE,
1318	};
1319	int ret;
1320
1321	while (mrc->first != mrc->next) {
1322	const struct recv_comp_data *rcd
1323	= mrc->slots + mrc->first;
1324
1325	msg.status = rcd->status;
1326	ret = vmbus_sendpacket(channel: nvchan->channel, buffer: &msg, bufferLen: sizeof(msg),
1327	requestid: rcd->tid, type: VM_PKT_COMP, flags: `0`);
1328	if (unlikely(ret)) {
1329	struct net_device_context *ndev_ctx = netdev_priv(dev: ndev);
1330
1331	++ndev_ctx->eth_stats.rx_comp_busy;
1332	return ret;
1333	}
1334
1335	if (++mrc->first == nvdev->recv_completion_cnt)
1336	mrc->first = `0`;
1337	}
1338
1339	/ receive completion ring has been emptied /
1340	if (unlikely(nvdev->destroy))
1341	wake_up(&nvdev->wait_drain);
1342
1343	return `0`;
1344	}
1345
1346	/ Count how many receive completions are outstanding /
1347	static void recv_comp_slot_avail(const struct netvsc_device *nvdev,
1348	const struct multi_recv_comp *mrc,
1349	u32 filled, u32 avail)
1350	{
1351	u32 count = nvdev->recv_completion_cnt;
1352
1353	if (mrc->next >= mrc->first)
1354	*filled = mrc->next - mrc->first;
1355	else
1356	*filled = (count - mrc->first) + mrc->next;
1357
1358	avail = count - filled - `1`;
1359	}
1360
1361	/ Add receive complete to ring to send to host. /
1362	static void enq_receive_complete(struct net_device *ndev,
1363	struct netvsc_device *nvdev, u16 q_idx,
1364	u64 tid, u32 status)
1365	{
1366	struct netvsc_channel *nvchan = &nvdev->chan_table[q_idx];
1367	struct multi_recv_comp *mrc = &nvchan->mrc;
1368	struct recv_comp_data *rcd;
1369	u32 filled, avail;
1370
1371	recv_comp_slot_avail(nvdev, mrc, filled: &filled, avail: &avail);
1372
1373	if (unlikely(filled > NAPI_POLL_WEIGHT)) {
1374	send_recv_completions(ndev, nvdev, nvchan);
1375	recv_comp_slot_avail(nvdev, mrc, filled: &filled, avail: &avail);
1376	}
1377
1378	if (unlikely(!avail)) {
1379	netdev_err(dev: ndev, format: "Recv_comp full buf q:%hd, tid:%llx\n",
1380	q_idx, tid);
1381	return;
1382	}
1383
1384	rcd = mrc->slots + mrc->next;
1385	rcd->tid = tid;
1386	rcd->status = status;
1387
1388	if (++mrc->next == nvdev->recv_completion_cnt)
1389	mrc->next = `0`;
1390	}
1391
1392	static int netvsc_receive(struct net_device *ndev,
1393	struct netvsc_device *net_device,
1394	struct netvsc_channel *nvchan,
1395	const struct vmpacket_descriptor *desc)
1396	{
1397	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
1398	struct vmbus_channel *channel = nvchan->channel;
1399	const struct vmtransfer_page_packet_header *vmxferpage_packet
1400	= container_of(desc, const struct vmtransfer_page_packet_header, d);
1401	const struct nvsp_message *nvsp = hv_pkt_data(desc);
1402	u32 msglen = hv_pkt_datalen(desc);
1403	u16 q_idx = channel->offermsg.offer.sub_channel_index;
1404	char *recv_buf = net_device->recv_buf;
1405	u32 status = NVSP_STAT_SUCCESS;
1406	int i;
1407	int count = `0`;
1408
1409	/ Ensure packet is big enough to read header fields /
1410	if (msglen < sizeof(struct nvsp_message_header)) {
1411	netif_err(net_device_ctx, rx_err, ndev,
1412	"invalid nvsp header, length too small: %u\n",
1413	msglen);
1414	return `0`;
1415	}
1416
1417	/ Make sure this is a valid nvsp packet /
1418	if (unlikely(nvsp->hdr.msg_type != NVSP_MSG1_TYPE_SEND_RNDIS_PKT)) {
1419	netif_err(net_device_ctx, rx_err, ndev,
1420	"Unknown nvsp packet type received %u\n",
1421	nvsp->hdr.msg_type);
1422	return `0`;
1423	}
1424
1425	/ Validate xfer page pkt header /
1426	if ((desc->offset8 << `3`) < sizeof(struct vmtransfer_page_packet_header)) {
1427	netif_err(net_device_ctx, rx_err, ndev,
1428	"Invalid xfer page pkt, offset too small: %u\n",
1429	desc->offset8 << `3`);
1430	return `0`;
1431	}
1432
1433	if (unlikely(vmxferpage_packet->xfer_pageset_id != NETVSC_RECEIVE_BUFFER_ID)) {
1434	netif_err(net_device_ctx, rx_err, ndev,
1435	"Invalid xfer page set id - expecting %x got %x\n",
1436	NETVSC_RECEIVE_BUFFER_ID,
1437	vmxferpage_packet->xfer_pageset_id);
1438	return `0`;
1439	}
1440
1441	count = vmxferpage_packet->range_cnt;
1442
1443	/ Check count for a valid value /
1444	if (NETVSC_XFER_HEADER_SIZE(count) > desc->offset8 << `3`) {
1445	netif_err(net_device_ctx, rx_err, ndev,
1446	"Range count is not valid: %d\n",
1447	count);
1448	return `0`;
1449	}
1450
1451	/ Each range represents 1 RNDIS pkt that contains 1 ethernet frame /
1452	for (i = `0`; i < count; i++) {
1453	u32 offset = vmxferpage_packet->ranges[i].byte_offset;
1454	u32 buflen = vmxferpage_packet->ranges[i].byte_count;
1455	void *data;
1456	int ret;
1457
1458	if (unlikely(offset > net_device->recv_buf_size \|\|
1459	buflen > net_device->recv_buf_size - offset)) {
1460	nvchan->rsc.cnt = `0`;
1461	status = NVSP_STAT_FAIL;
1462	netif_err(net_device_ctx, rx_err, ndev,
1463	"Packet offset:%u + len:%u too big\n",
1464	offset, buflen);
1465
1466	continue;
1467	}
1468
1469	/ We're going to copy (sections of) the packet into nvchan->recv_buf;*
1470	* make sure that nvchan->recv_buf is large enough to hold the packet.
1471	*/
1472	if (unlikely(buflen > net_device->recv_section_size)) {
1473	nvchan->rsc.cnt = `0`;
1474	status = NVSP_STAT_FAIL;
1475	netif_err(net_device_ctx, rx_err, ndev,
1476	"Packet too big: buflen=%u recv_section_size=%u\n",
1477	buflen, net_device->recv_section_size);
1478
1479	continue;
1480	}
1481
1482	data = recv_buf + offset;
1483
1484	nvchan->rsc.is_last = (i == count - `1`);
1485
1486	trace_rndis_recv(ndev, q: q_idx, msg: data);
1487
1488	/ Pass it to the upper layer /
1489	ret = rndis_filter_receive(ndev, net_dev: net_device,
1490	nvchan, data, buflen);
1491
1492	if (unlikely(ret != NVSP_STAT_SUCCESS)) {
1493	/ Drop incomplete packet /
1494	nvchan->rsc.cnt = `0`;
1495	status = NVSP_STAT_FAIL;
1496	}
1497	}
1498
1499	enq_receive_complete(ndev, nvdev: net_device, q_idx,
1500	tid: vmxferpage_packet->d.trans_id, status);
1501
1502	return count;
1503	}
1504
1505	static void netvsc_send_table(struct net_device *ndev,
1506	struct netvsc_device *nvscdev,
1507	const struct nvsp_message *nvmsg,
1508	u32 msglen)
1509	{
1510	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
1511	u32 count, offset, *tab;
1512	int i;
1513
1514	/ Ensure packet is big enough to read send_table fields /
1515	if (msglen < sizeof(struct nvsp_message_header) +
1516	sizeof(struct nvsp_5_send_indirect_table)) {
1517	netdev_err(dev: ndev, format: "nvsp_v5_msg length too small: %u\n", msglen);
1518	return;
1519	}
1520
1521	count = nvmsg->msg.v5_msg.send_table.count;
1522	offset = nvmsg->msg.v5_msg.send_table.offset;
1523
1524	if (count != VRSS_SEND_TAB_SIZE) {
1525	netdev_err(dev: ndev, format: "Received wrong send-table size:%u\n", count);
1526	return;
1527	}
1528
1529	/ If negotiated version <= NVSP_PROTOCOL_VERSION_6, the offset may be*
1530	* wrong due to a host bug. So fix the offset here.
1531	*/
1532	if (nvscdev->nvsp_version <= NVSP_PROTOCOL_VERSION_6 &&
1533	msglen >= sizeof(struct nvsp_message_header) +
1534	sizeof(union nvsp_6_message_uber) + count * sizeof(u32))
1535	offset = sizeof(struct nvsp_message_header) +
1536	sizeof(union nvsp_6_message_uber);
1537
1538	/ Boundary check for all versions /
1539	if (msglen < count * sizeof(u32) \|\| offset > msglen - count * sizeof(u32)) {
1540	netdev_err(dev: ndev, format: "Received send-table offset too big:%u\n",
1541	offset);
1542	return;
1543	}
1544
1545	tab = (void *)nvmsg + offset;
1546
1547	for (i = `0`; i < count; i++)
1548	net_device_ctx->tx_table[i] = tab[i];
1549	}
1550
1551	static void netvsc_send_vf(struct net_device *ndev,
1552	const struct nvsp_message *nvmsg,
1553	u32 msglen)
1554	{
1555	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
1556
1557	/ Ensure packet is big enough to read its fields /
1558	if (msglen < sizeof(struct nvsp_message_header) +
1559	sizeof(struct nvsp_4_send_vf_association)) {
1560	netdev_err(dev: ndev, format: "nvsp_v4_msg length too small: %u\n", msglen);
1561	return;
1562	}
1563
1564	net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
1565	net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
1566
1567	if (net_device_ctx->vf_alloc)
1568	complete(&net_device_ctx->vf_add);
1569
1570	netdev_info(dev: ndev, format: "VF slot %u %s\n",
1571	net_device_ctx->vf_serial,
1572	net_device_ctx->vf_alloc ? "added" : "removed");
1573	}
1574
1575	static void netvsc_receive_inband(struct net_device *ndev,
1576	struct netvsc_device *nvscdev,
1577	const struct vmpacket_descriptor *desc)
1578	{
1579	const struct nvsp_message *nvmsg = hv_pkt_data(desc);
1580	u32 msglen = hv_pkt_datalen(desc);
1581
1582	/ Ensure packet is big enough to read header fields /
1583	if (msglen < sizeof(struct nvsp_message_header)) {
1584	netdev_err(dev: ndev, format: "inband nvsp_message length too small: %u\n", msglen);
1585	return;
1586	}
1587
1588	switch (nvmsg->hdr.msg_type) {
1589	case NVSP_MSG5_TYPE_SEND_INDIRECTION_TABLE:
1590	netvsc_send_table(ndev, nvscdev, nvmsg, msglen);
1591	break;
1592
1593	case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
1594	if (hv_is_isolation_supported())
1595	netdev_err(dev: ndev, format: "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
1596	else
1597	netvsc_send_vf(ndev, nvmsg, msglen);
1598	break;
1599	}
1600	}
1601
1602	static int netvsc_process_raw_pkt(struct hv_device *device,
1603	struct netvsc_channel *nvchan,
1604	struct netvsc_device *net_device,
1605	struct net_device *ndev,
1606	const struct vmpacket_descriptor *desc,
1607	int budget)
1608	{
1609	struct vmbus_channel *channel = nvchan->channel;
1610	const struct nvsp_message *nvmsg = hv_pkt_data(desc);
1611
1612	trace_nvsp_recv(ndev, chan: channel, msg: nvmsg);
1613
1614	switch (desc->type) {
1615	case VM_PKT_COMP:
1616	netvsc_send_completion(ndev, net_device, incoming_channel: channel, desc, budget);
1617	break;
1618
1619	case VM_PKT_DATA_USING_XFER_PAGES:
1620	return netvsc_receive(ndev, net_device, nvchan, desc);
1621
1622	case VM_PKT_DATA_INBAND:
1623	netvsc_receive_inband(ndev, nvscdev: net_device, desc);
1624	break;
1625
1626	default:
1627	netdev_err(dev: ndev, format: "unhandled packet type %d, tid %llx\n",
1628	desc->type, desc->trans_id);
1629	break;
1630	}
1631
1632	return `0`;
1633	}
1634
1635	static struct hv_device netvsc_channel_to_device(struct* vmbus_channel *channel)
1636	{
1637	struct vmbus_channel *primary = channel->primary_channel;
1638
1639	return primary ? primary->device_obj : channel->device_obj;
1640	}
1641
1642	/ Network processing softirq*
1643	* Process data in incoming ring buffer from host
1644	* Stops when ring is empty or budget is met or exceeded.
1645	*/
1646	int netvsc_poll(struct napi_struct napi, int* budget)
1647	{
1648	struct netvsc_channel *nvchan
1649	= container_of(napi, struct netvsc_channel, napi);
1650	struct netvsc_device *net_device = nvchan->net_device;
1651	struct vmbus_channel *channel = nvchan->channel;
1652	struct hv_device *device = netvsc_channel_to_device(channel);
1653	struct net_device *ndev = hv_get_drvdata(dev: device);
1654	int work_done = `0`;
1655	int ret;
1656
1657	/ If starting a new interval /
1658	if (!nvchan->desc)
1659	nvchan->desc = hv_pkt_iter_first(channel);
1660
1661	nvchan->xdp_flush = false;
1662
1663	while (nvchan->desc && work_done < budget) {
1664	work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
1665	ndev, desc: nvchan->desc, budget);
1666	nvchan->desc = hv_pkt_iter_next(channel, pkt: nvchan->desc);
1667	}
1668
1669	if (nvchan->xdp_flush)
1670	xdp_do_flush();
1671
1672	/ Send any pending receive completions /
1673	ret = send_recv_completions(ndev, nvdev: net_device, nvchan);
1674
1675	/ If it did not exhaust NAPI budget this time*
1676	* and not doing busy poll
1677	* then re-enable host interrupts
1678	* and reschedule if ring is not empty
1679	* or sending receive completion failed.
1680	*/
1681	if (work_done < budget &&
1682	napi_complete_done(n: napi, work_done) &&
1683	(ret \|\| hv_end_read(rbi: &channel->inbound)) &&
1684	napi_schedule_prep(n: napi)) {
1685	hv_begin_read(rbi: &channel->inbound);
1686	__napi_schedule(n: napi);
1687	}
1688
1689	/ Driver may overshoot since multiple packets per descriptor /
1690	return min(work_done, budget);
1691	}
1692
1693	/ Call back when data is available in host ring buffer.*
1694	* Processing is deferred until network softirq (NAPI)
1695	*/
1696	void netvsc_channel_cb(void *context)
1697	{
1698	struct netvsc_channel *nvchan = context;
1699	struct vmbus_channel *channel = nvchan->channel;
1700	struct hv_ring_buffer_info *rbi = &channel->inbound;
1701
1702	/ preload first vmpacket descriptor /
1703	prefetch(hv_get_ring_buffer(rbi) + rbi->priv_read_index);
1704
1705	if (napi_schedule_prep(n: &nvchan->napi)) {
1706	/ disable interrupts from host /
1707	hv_begin_read(rbi);
1708
1709	__napi_schedule_irqoff(n: &nvchan->napi);
1710	}
1711	}
1712
1713	/*
1714	* netvsc_device_add - Callback when the device belonging to this
1715	* driver is added
1716	*/
1717	struct netvsc_device netvsc_device_add(struct* hv_device *device,
1718	const struct netvsc_device_info *device_info)
1719	{
1720	int i, ret = `0`;
1721	struct netvsc_device *net_device;
1722	struct net_device *ndev = hv_get_drvdata(dev: device);
1723	struct net_device_context *net_device_ctx = netdev_priv(dev: ndev);
1724
1725	net_device = alloc_net_device();
1726	if (!net_device)
1727	return ERR_PTR(error: -ENOMEM);
1728
1729	for (i = `0`; i < VRSS_SEND_TAB_SIZE; i++)
1730	net_device_ctx->tx_table[i] = `0`;
1731
1732	/ Because the device uses NAPI, all the interrupt batching and*
1733	* control is done via Net softirq, not the channel handling
1734	*/
1735	set_channel_read_mode(c: device->channel, mode: HV_CALL_ISR);
1736
1737	/ If we're reopening the device we may have multiple queues, fill the*
1738	* chn_table with the default channel to use it before subchannels are
1739	* opened.
1740	* Initialize the channel state before we open;
1741	* we can be interrupted as soon as we open the channel.
1742	*/
1743
1744	for (i = `0`; i < VRSS_CHANNEL_MAX; i++) {
1745	struct netvsc_channel *nvchan = &net_device->chan_table[i];
1746
1747	nvchan->channel = device->channel;
1748	nvchan->net_device = net_device;
1749	u64_stats_init(syncp: &nvchan->tx_stats.syncp);
1750	u64_stats_init(syncp: &nvchan->rx_stats.syncp);
1751
1752	ret = xdp_rxq_info_reg(xdp_rxq: &nvchan->xdp_rxq, dev: ndev, queue_index: i, napi_id: `0`);
1753
1754	if (ret) {
1755	netdev_err(dev: ndev, format: "xdp_rxq_info_reg fail: %d\n", ret);
1756	goto cleanup2;
1757	}
1758
1759	ret = xdp_rxq_info_reg_mem_model(xdp_rxq: &nvchan->xdp_rxq,
1760	type: MEM_TYPE_PAGE_SHARED, NULL);
1761
1762	if (ret) {
1763	netdev_err(dev: ndev, format: "xdp reg_mem_model fail: %d\n", ret);
1764	goto cleanup2;
1765	}
1766	}
1767
1768	/ Enable NAPI handler before init callbacks /
1769	netif_napi_add(dev: ndev, napi: &net_device->chan_table[`0`].napi, poll: netvsc_poll);
1770
1771	/ Open the channel /
1772	device->channel->next_request_id_callback = vmbus_next_request_id;
1773	device->channel->request_addr_callback = vmbus_request_addr;
1774	device->channel->rqstor_size = netvsc_rqstor_size(ringbytes: netvsc_ring_bytes);
1775	device->channel->max_pkt_size = NETVSC_MAX_PKT_SIZE;
1776
1777	ret = vmbus_open(channel: device->channel, send_ringbuffersize: netvsc_ring_bytes,
1778	recv_ringbuffersize: netvsc_ring_bytes, NULL, userdatalen: `0`,
1779	onchannel_callback: netvsc_channel_cb, context: net_device->chan_table);
1780
1781	if (ret != `0`) {
1782	netdev_err(dev: ndev, format: "unable to open channel: %d\n", ret);
1783	goto cleanup;
1784	}
1785
1786	/ Channel is opened /
1787	netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");
1788
1789	napi_enable(n: &net_device->chan_table[`0`].napi);
1790
1791	/ Connect with the NetVsp /
1792	ret = netvsc_connect_vsp(device, net_device, device_info);
1793	if (ret != `0`) {
1794	netdev_err(dev: ndev,
1795	format: "unable to connect to NetVSP - %d\n", ret);
1796	goto close;
1797	}
1798
1799	/ Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is*
1800	* populated.
1801	*/
1802	rcu_assign_pointer(net_device_ctx->nvdev, net_device);
1803
1804	return net_device;
1805
1806	close:
1807	RCU_INIT_POINTER(net_device_ctx->nvdev, NULL);
1808	napi_disable(n: &net_device->chan_table[`0`].napi);
1809
1810	/ Now, we can close the channel safely /
1811	vmbus_close(channel: device->channel);
1812
1813	cleanup:
1814	netif_napi_del(napi: &net_device->chan_table[`0`].napi);
1815
1816	cleanup2:
1817	free_netvsc_device(head: &net_device->rcu);
1818
1819	return ERR_PTR(error: ret);
1820	}
1821

source code of linux/drivers/net/hyperv/netvsc.c