vrf.c source code [linux/drivers/net/vrf.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* vrf.c: device driver to encapsulate a VRF space
4	*
5	* Copyright (c) 2015 Cumulus Networks. All rights reserved.
6	* Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
7	* Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
8	*
9	* Based on dummy, team and ipvlan drivers
10	*/
11
12	#include <linux/ethtool.h>
13	#include <linux/module.h>
14	#include <linux/kernel.h>
15	#include <linux/netdevice.h>
16	#include <linux/etherdevice.h>
17	#include <linux/ip.h>
18	#include <linux/init.h>
19	#include <linux/moduleparam.h>
20	#include <linux/netfilter.h>
21	#include <linux/rtnetlink.h>
22	#include <net/rtnetlink.h>
23	#include <linux/u64_stats_sync.h>
24	#include <linux/hashtable.h>
25	#include <linux/spinlock_types.h>
26
27	#include <linux/inetdevice.h>
28	#include <net/arp.h>
29	#include <net/ip.h>
30	#include <net/ip_fib.h>
31	#include <net/ip6_fib.h>
32	#include <net/ip6_route.h>
33	#include <net/route.h>
34	#include <net/addrconf.h>
35	#include <net/l3mdev.h>
36	#include <net/fib_rules.h>
37	#include <net/sch_generic.h>
38	#include <net/netns/generic.h>
39	#include <net/netfilter/nf_conntrack.h>
40
/* Driver identification strings. */
#define DRV_NAME	"vrf"
#define DRV_VERSION	"1.1"

#define FIB_RULE_PREF	1000       /* default preference for FIB rules */

/* Size (in bits) of the vrf_map hash table and the seed handed to
 * jhash_1word() when hashing a table id.
 */
#define HT_MAP_BITS	4
#define HASH_INITVAL	((u32)0xcafef00d)
48
49	struct vrf_map {
50	DECLARE_HASHTABLE(ht, HT_MAP_BITS);
51	spinlock_t vmap_lock;
52
53	/ shared_tables:*
54	* count how many distinct tables do not comply with the strict mode
55	* requirement.
56	* shared_tables value must be 0 in order to enable the strict mode.
57	*
58	* example of the evolution of shared_tables:
59	* \| time
60	* add vrf0 --> table 100 shared_tables = 0 \| t0
61	* add vrf1 --> table 101 shared_tables = 0 \| t1
62	* add vrf2 --> table 100 shared_tables = 1 \| t2
63	* add vrf3 --> table 100 shared_tables = 1 \| t3
64	* add vrf4 --> table 101 shared_tables = 2 v t4
65	*
66	* shared_tables is a "step function" (or "staircase function")
67	* and it is increased by one when the second vrf is associated to a
68	* table.
69	*
70	* at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
71	*
72	* at t3, another dev (vrf3) is bound to the same table 100 but the
73	* value of shared_tables is still 1.
74	* This means that no matter how many new vrfs will register on the
75	* table 100, the shared_tables will not increase (considering only
76	* table 100).
77	*
78	* at t4, vrf4 is bound to table 101, and shared_tables = 2.
79	*
80	* Looking at the value of shared_tables we can immediately know if
81	* the strict_mode can or cannot be enforced. Indeed, strict_mode
82	* can be enforced iff shared_tables = 0.
83	*
84	* Conversely, shared_tables is decreased when a vrf is de-associated
85	* from a table with exactly two associated vrfs.
86	*/
87	u32 shared_tables;
88
89	bool strict_mode;
90	};
91
92	struct vrf_map_elem {
93	struct hlist_node hnode;
94	struct list_head vrf_list; / VRFs registered to this table /
95
96	u32 table_id;
97	int users;
98	int ifindex;
99	};
100
101	static unsigned int vrf_net_id;
102
103	/ per netns vrf data /
104	struct netns_vrf {
105	/ protected by rtnl lock /
106	bool add_fib_rules;
107
108	struct vrf_map vmap;
109	struct ctl_table_header *ctl_hdr;
110	};
111
112	struct net_vrf {
113	struct rtable __rcu *rth;
114	struct rt6_info __rcu *rt6;
115	#if IS_ENABLED(CONFIG_IPV6)
116	struct fib6_table *fib6_table;
117	#endif
118	u32 tb_id;
119
120	struct list_head me_list; / entry in vrf_map_elem /
121	int ifindex;
122	};
123
124	static void vrf_rx_stats(struct net_device dev, int* len)
125	{
126	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
127
128	u64_stats_update_begin(syncp: &dstats->syncp);
129	dstats->rx_packets++;
130	dstats->rx_bytes += len;
131	u64_stats_update_end(syncp: &dstats->syncp);
132	}
133
134	static void vrf_tx_error(struct net_device vrf_dev, struct* sk_buff *skb)
135	{
136	vrf_dev->stats.tx_errors++;
137	kfree_skb(skb);
138	}
139
140	static void vrf_get_stats64(struct net_device *dev,
141	struct rtnl_link_stats64 *stats)
142	{
143	int i;
144
145	for_each_possible_cpu(i) {
146	const struct pcpu_dstats *dstats;
147	u64 tbytes, tpkts, tdrops, rbytes, rpkts;
148	unsigned int start;
149
150	dstats = per_cpu_ptr(dev->dstats, i);
151	do {
152	start = u64_stats_fetch_begin(syncp: &dstats->syncp);
153	tbytes = dstats->tx_bytes;
154	tpkts = dstats->tx_packets;
155	tdrops = dstats->tx_drops;
156	rbytes = dstats->rx_bytes;
157	rpkts = dstats->rx_packets;
158	} while (u64_stats_fetch_retry(syncp: &dstats->syncp, start));
159	stats->tx_bytes += tbytes;
160	stats->tx_packets += tpkts;
161	stats->tx_dropped += tdrops;
162	stats->rx_bytes += rbytes;
163	stats->rx_packets += rpkts;
164	}
165	}
166
167	static struct vrf_map netns_vrf_map(struct* net *net)
168	{
169	struct netns_vrf *nn_vrf = net_generic(net, id: vrf_net_id);
170
171	return &nn_vrf->vmap;
172	}
173
/* Return the vrf_map of the network namespace @dev belongs to. */
static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
{
	return netns_vrf_map(dev_net(dev));
}
178
179	static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
180	{
181	struct list_head *me_head = &me->vrf_list;
182	struct net_vrf *vrf;
183
184	if (list_empty(head: me_head))
185	return -ENODEV;
186
187	vrf = list_first_entry(me_head, struct net_vrf, me_list);
188
189	return vrf->ifindex;
190	}
191
192	static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
193	{
194	struct vrf_map_elem *me;
195
196	me = kmalloc(size: sizeof(*me), flags);
197	if (!me)
198	return NULL;
199
200	return me;
201	}
202
/* Release an element obtained from vrf_map_elem_alloc(). */
static void vrf_map_elem_free(struct vrf_map_elem *me)
{
	kfree(me);
}
207
208	static void vrf_map_elem_init(struct vrf_map_elem me, int* table_id,
209	int ifindex, int users)
210	{
211	me->table_id = table_id;
212	me->ifindex = ifindex;
213	me->users = users;
214	INIT_LIST_HEAD(list: &me->vrf_list);
215	}
216
217	static struct vrf_map_elem vrf_map_lookup_elem(struct* vrf_map *vmap,
218	u32 table_id)
219	{
220	struct vrf_map_elem *me;
221	u32 key;
222
223	key = jhash_1word(a: table_id, HASH_INITVAL);
224	hash_for_each_possible(vmap->ht, me, hnode, key) {
225	if (me->table_id == table_id)
226	return me;
227	}
228
229	return NULL;
230	}
231
232	static void vrf_map_add_elem(struct vrf_map vmap, struct* vrf_map_elem *me)
233	{
234	u32 table_id = me->table_id;
235	u32 key;
236
237	key = jhash_1word(a: table_id, HASH_INITVAL);
238	hash_add(vmap->ht, &me->hnode, key);
239	}
240
241	static void vrf_map_del_elem(struct vrf_map_elem *me)
242	{
243	hash_del(node: &me->hnode);
244	}
245
246	static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
247	{
248	spin_lock(lock: &vmap->vmap_lock);
249	}
250
251	static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
252	{
253	spin_unlock(lock: &vmap->vmap_lock);
254	}
255
256	/ called with rtnl lock held /
257	static int
258	vrf_map_register_dev(struct net_device dev, struct* netlink_ext_ack *extack)
259	{
260	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
261	struct net_vrf *vrf = netdev_priv(dev);
262	struct vrf_map_elem new_me, me;
263	u32 table_id = vrf->tb_id;
264	bool free_new_me = false;
265	int users;
266	int res;
267
268	/ we pre-allocate elements used in the spin-locked section (so that we*
269	* keep the spinlock as short as possible).
270	*/
271	new_me = vrf_map_elem_alloc(GFP_KERNEL);
272	if (!new_me)
273	return -ENOMEM;
274
275	vrf_map_elem_init(me: new_me, table_id, ifindex: dev->ifindex, users: `0`);
276
277	vrf_map_lock(vmap);
278
279	me = vrf_map_lookup_elem(vmap, table_id);
280	if (!me) {
281	me = new_me;
282	vrf_map_add_elem(vmap, me);
283	goto link_vrf;
284	}
285
286	/ we already have an entry in the vrf_map, so it means there is (at*
287	* least) a vrf registered on the specific table.
288	*/
289	free_new_me = true;
290	if (vmap->strict_mode) {
291	/ vrfs cannot share the same table /
292	NL_SET_ERR_MSG(extack, "Table is used by another VRF");
293	res = -EBUSY;
294	goto unlock;
295	}
296
297	link_vrf:
298	users = ++me->users;
299	if (users == `2`)
300	++vmap->shared_tables;
301
302	list_add(new: &vrf->me_list, head: &me->vrf_list);
303
304	res = `0`;
305
306	unlock:
307	vrf_map_unlock(vmap);
308
309	/ clean-up, if needed /
310	if (free_new_me)
311	vrf_map_elem_free(me: new_me);
312
313	return res;
314	}
315
316	/ called with rtnl lock held /
317	static void vrf_map_unregister_dev(struct net_device *dev)
318	{
319	struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
320	struct net_vrf *vrf = netdev_priv(dev);
321	u32 table_id = vrf->tb_id;
322	struct vrf_map_elem *me;
323	int users;
324
325	vrf_map_lock(vmap);
326
327	me = vrf_map_lookup_elem(vmap, table_id);
328	if (!me)
329	goto unlock;
330
331	list_del(entry: &vrf->me_list);
332
333	users = --me->users;
334	if (users == `1`) {
335	--vmap->shared_tables;
336	} else if (users == `0`) {
337	vrf_map_del_elem(me);
338
339	/ no one will refer to this element anymore /
340	vrf_map_elem_free(me);
341	}
342
343	unlock:
344	vrf_map_unlock(vmap);
345	}
346
347	/ return the vrf device index associated with the table_id /
348	static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
349	{
350	struct vrf_map *vmap = netns_vrf_map(net);
351	struct vrf_map_elem *me;
352	int ifindex;
353
354	vrf_map_lock(vmap);
355
356	if (!vmap->strict_mode) {
357	ifindex = -EPERM;
358	goto unlock;
359	}
360
361	me = vrf_map_lookup_elem(vmap, table_id);
362	if (!me) {
363	ifindex = -ENODEV;
364	goto unlock;
365	}
366
367	ifindex = vrf_map_elem_get_vrf_ifindex(me);
368
369	unlock:
370	vrf_map_unlock(vmap);
371
372	return ifindex;
373	}
374
375	/ by default VRF devices do not have a qdisc and are expected*
376	* to be created with only a single queue.
377	*/
378	static bool qdisc_tx_is_default(const struct net_device *dev)
379	{
380	struct netdev_queue *txq;
381	struct Qdisc *qdisc;
382
383	if (dev->num_tx_queues > `1`)
384	return false;
385
386	txq = netdev_get_tx_queue(dev, index: `0`);
387	qdisc = rcu_access_pointer(txq->qdisc);
388
389	return !qdisc->enqueue;
390	}
391
392	/ Local traffic destined to local address. Reinsert the packet to rx*
393	* path, similar to loopback handling.
394	*/
395	static int vrf_local_xmit(struct sk_buff skb, struct* net_device *dev,
396	struct dst_entry *dst)
397	{
398	int len = skb->len;
399
400	skb_orphan(skb);
401
402	skb_dst_set(skb, dst);
403
404	/ set pkt_type to avoid skb hitting packet taps twice -*
405	* once on Tx and again in Rx processing
406	*/
407	skb->pkt_type = PACKET_LOOPBACK;
408
409	skb->protocol = eth_type_trans(skb, dev);
410
411	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
412	vrf_rx_stats(dev, len);
413	else
414	this_cpu_inc(dev->dstats->rx_drops);
415
416	return NETDEV_TX_OK;
417	}
418
419	static void vrf_nf_set_untracked(struct sk_buff *skb)
420	{
421	if (skb_get_nfct(skb) == `0`)
422	nf_ct_set(skb, NULL, info: IP_CT_UNTRACKED);
423	}
424
425	static void vrf_nf_reset_ct(struct sk_buff *skb)
426	{
427	if (skb_get_nfct(skb) == IP_CT_UNTRACKED)
428	nf_reset_ct(skb);
429	}
430
431	#if IS_ENABLED(CONFIG_IPV6)
432	static int vrf_ip6_local_out(struct net net, struct* sock *sk,
433	struct sk_buff *skb)
434	{
435	int err;
436
437	vrf_nf_reset_ct(skb);
438
439	err = nf_hook(pf: NFPROTO_IPV6, hook: NF_INET_LOCAL_OUT, net,
440	sk, skb, NULL, outdev: skb_dst(skb)->dev, okfn: dst_output);
441
442	if (likely(err == `1`))
443	err = dst_output(net, sk, skb);
444
445	return err;
446	}
447
448	static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
449	struct net_device *dev)
450	{
451	const struct ipv6hdr *iph;
452	struct net *net = dev_net(dev: skb->dev);
453	struct flowi6 fl6;
454	int ret = NET_XMIT_DROP;
455	struct dst_entry *dst;
456	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
457
458	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
459	goto err;
460
461	iph = ipv6_hdr(skb);
462
463	memset(&fl6, `0`, sizeof(fl6));
464	/ needed to match OIF rule /
465	fl6.flowi6_l3mdev = dev->ifindex;
466	fl6.flowi6_iif = LOOPBACK_IFINDEX;
467	fl6.daddr = iph->daddr;
468	fl6.saddr = iph->saddr;
469	fl6.flowlabel = ip6_flowinfo(hdr: iph);
470	fl6.flowi6_mark = skb->mark;
471	fl6.flowi6_proto = iph->nexthdr;
472
473	dst = ip6_dst_lookup_flow(net, NULL, fl6: &fl6, NULL);
474	if (IS_ERR(ptr: dst) \|\| dst == dst_null)
475	goto err;
476
477	skb_dst_drop(skb);
478
479	/ if dst.dev is the VRF device again this is locally originated traffic*
480	* destined to a local address. Short circuit to Rx path.
481	*/
482	if (dst->dev == dev)
483	return vrf_local_xmit(skb, dev, dst);
484
485	skb_dst_set(skb, dst);
486
487	/ strip the ethernet header added for pass through VRF device /
488	__skb_pull(skb, len: skb_network_offset(skb));
489
490	memset(IP6CB(skb), `0`, sizeof(*IP6CB(skb)));
491	ret = vrf_ip6_local_out(net, sk: skb->sk, skb);
492	if (unlikely(net_xmit_eval(ret)))
493	dev->stats.tx_errors++;
494	else
495	ret = NET_XMIT_SUCCESS;
496
497	return ret;
498	err:
499	vrf_tx_error(vrf_dev: dev, skb);
500	return NET_XMIT_DROP;
501	}
502	#else
503	static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
504	struct net_device *dev)
505	{
506	vrf_tx_error(dev, skb);
507	return NET_XMIT_DROP;
508	}
509	#endif
510
511	/ based on ip_local_out; can't use it b/c the dst is switched pointing to us /
512	static int vrf_ip_local_out(struct net net, struct* sock *sk,
513	struct sk_buff *skb)
514	{
515	int err;
516
517	vrf_nf_reset_ct(skb);
518
519	err = nf_hook(pf: NFPROTO_IPV4, hook: NF_INET_LOCAL_OUT, net, sk,
520	skb, NULL, outdev: skb_dst(skb)->dev, okfn: dst_output);
521	if (likely(err == `1`))
522	err = dst_output(net, sk, skb);
523
524	return err;
525	}
526
527	static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
528	struct net_device *vrf_dev)
529	{
530	struct iphdr *ip4h;
531	int ret = NET_XMIT_DROP;
532	struct flowi4 fl4;
533	struct net *net = dev_net(dev: vrf_dev);
534	struct rtable *rt;
535
536	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
537	goto err;
538
539	ip4h = ip_hdr(skb);
540
541	memset(&fl4, `0`, sizeof(fl4));
542	/ needed to match OIF rule /
543	fl4.flowi4_l3mdev = vrf_dev->ifindex;
544	fl4.flowi4_iif = LOOPBACK_IFINDEX;
545	fl4.flowi4_tos = RT_TOS(ip4h->tos);
546	fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
547	fl4.flowi4_proto = ip4h->protocol;
548	fl4.daddr = ip4h->daddr;
549	fl4.saddr = ip4h->saddr;
550
551	rt = ip_route_output_flow(net, flp: &fl4, NULL);
552	if (IS_ERR(ptr: rt))
553	goto err;
554
555	skb_dst_drop(skb);
556
557	/ if dst.dev is the VRF device again this is locally originated traffic*
558	* destined to a local address. Short circuit to Rx path.
559	*/
560	if (rt->dst.dev == vrf_dev)
561	return vrf_local_xmit(skb, dev: vrf_dev, dst: &rt->dst);
562
563	skb_dst_set(skb, dst: &rt->dst);
564
565	/ strip the ethernet header added for pass through VRF device /
566	__skb_pull(skb, len: skb_network_offset(skb));
567
568	if (!ip4h->saddr) {
569	ip4h->saddr = inet_select_addr(dev: skb_dst(skb)->dev, dst: `0`,
570	scope: RT_SCOPE_LINK);
571	}
572
573	memset(IPCB(skb), `0`, sizeof(*IPCB(skb)));
574	ret = vrf_ip_local_out(net: dev_net(dev: skb_dst(skb)->dev), sk: skb->sk, skb);
575	if (unlikely(net_xmit_eval(ret)))
576	vrf_dev->stats.tx_errors++;
577	else
578	ret = NET_XMIT_SUCCESS;
579
580	out:
581	return ret;
582	err:
583	vrf_tx_error(vrf_dev, skb);
584	goto out;
585	}
586
587	static netdev_tx_t is_ip_tx_frame(struct sk_buff skb, struct* net_device *dev)
588	{
589	switch (skb->protocol) {
590	case htons(ETH_P_IP):
591	return vrf_process_v4_outbound(skb, vrf_dev: dev);
592	case htons(ETH_P_IPV6):
593	return vrf_process_v6_outbound(skb, dev);
594	default:
595	vrf_tx_error(vrf_dev: dev, skb);
596	return NET_XMIT_DROP;
597	}
598	}
599
600	static netdev_tx_t vrf_xmit(struct sk_buff skb, struct* net_device *dev)
601	{
602	int len = skb->len;
603	netdev_tx_t ret = is_ip_tx_frame(skb, dev);
604
605	if (likely(ret == NET_XMIT_SUCCESS \|\| ret == NET_XMIT_CN)) {
606	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
607
608	u64_stats_update_begin(syncp: &dstats->syncp);
609	dstats->tx_packets++;
610	dstats->tx_bytes += len;
611	u64_stats_update_end(syncp: &dstats->syncp);
612	} else {
613	this_cpu_inc(dev->dstats->tx_drops);
614	}
615
616	return ret;
617	}
618
619	static void vrf_finish_direct(struct sk_buff *skb)
620	{
621	struct net_device *vrf_dev = skb->dev;
622
623	if (!list_empty(head: &vrf_dev->ptype_all) &&
624	likely(skb_headroom(skb) >= ETH_HLEN)) {
625	struct ethhdr *eth = skb_push(skb, ETH_HLEN);
626
627	ether_addr_copy(dst: eth->h_source, src: vrf_dev->dev_addr);
628	eth_zero_addr(addr: eth->h_dest);
629	eth->h_proto = skb->protocol;
630
631	dev_queue_xmit_nit(skb, dev: vrf_dev);
632
633	skb_pull(skb, ETH_HLEN);
634	}
635
636	vrf_nf_reset_ct(skb);
637	}
638
639	#if IS_ENABLED(CONFIG_IPV6)
640	/ modelled after ip6_finish_output2 /
641	static int vrf_finish_output6(struct net net, struct* sock *sk,
642	struct sk_buff *skb)
643	{
644	struct dst_entry *dst = skb_dst(skb);
645	struct net_device *dev = dst->dev;
646	const struct in6_addr *nexthop;
647	struct neighbour *neigh;
648	int ret;
649
650	vrf_nf_reset_ct(skb);
651
652	skb->protocol = htons(ETH_P_IPV6);
653	skb->dev = dev;
654
655	rcu_read_lock();
656	nexthop = rt6_nexthop(rt: (struct rt6_info *)dst, daddr: &ipv6_hdr(skb)->daddr);
657	neigh = __ipv6_neigh_lookup_noref(dev: dst->dev, pkey: nexthop);
658	if (unlikely(!neigh))
659	neigh = __neigh_create(tbl: &nd_tbl, pkey: nexthop, dev: dst->dev, want_ref: false);
660	if (!IS_ERR(ptr: neigh)) {
661	sock_confirm_neigh(skb, n: neigh);
662	ret = neigh_output(n: neigh, skb, skip_cache: false);
663	rcu_read_unlock();
664	return ret;
665	}
666	rcu_read_unlock();
667
668	IP6_INC_STATS(dev_net(dst->dev),
669	ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
670	kfree_skb(skb);
671	return -EINVAL;
672	}
673
674	/ modelled after ip6_output /
675	static int vrf_output6(struct net net, struct* sock sk, struct* sk_buff *skb)
676	{
677	return NF_HOOK_COND(pf: NFPROTO_IPV6, hook: NF_INET_POST_ROUTING,
678	net, sk, skb, NULL, out: skb_dst(skb)->dev,
679	okfn: vrf_finish_output6,
680	cond: !(IP6CB(skb)->flags & IP6SKB_REROUTED));
681	}
682
683	/ set dst on skb to send packet to us via dev_xmit path. Allows*
684	* packet to go through device based features such as qdisc, netfilter
685	* hooks and packet sockets with skb->dev set to vrf device.
686	*/
687	static struct sk_buff vrf_ip6_out_redirect(struct* net_device *vrf_dev,
688	struct sk_buff *skb)
689	{
690	struct net_vrf *vrf = netdev_priv(dev: vrf_dev);
691	struct dst_entry *dst = NULL;
692	struct rt6_info *rt6;
693
694	rcu_read_lock();
695
696	rt6 = rcu_dereference(vrf->rt6);
697	if (likely(rt6)) {
698	dst = &rt6->dst;
699	dst_hold(dst);
700	}
701
702	rcu_read_unlock();
703
704	if (unlikely(!dst)) {
705	vrf_tx_error(vrf_dev, skb);
706	return NULL;
707	}
708
709	skb_dst_drop(skb);
710	skb_dst_set(skb, dst);
711
712	return skb;
713	}
714
/* okfn of the POST_ROUTING hook in vrf_output6_direct(): finish the
 * direct path, then send the packet through vrf_ip6_local_out().
 */
static int vrf_output6_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip6_local_out(net, sk, skb);
}
722
723	static int vrf_output6_direct(struct net net, struct* sock *sk,
724	struct sk_buff *skb)
725	{
726	int err = `1`;
727
728	skb->protocol = htons(ETH_P_IPV6);
729
730	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
731	err = nf_hook(pf: NFPROTO_IPV6, hook: NF_INET_POST_ROUTING, net, sk, skb,
732	NULL, outdev: skb->dev, okfn: vrf_output6_direct_finish);
733
734	if (likely(err == `1`))
735	vrf_finish_direct(skb);
736
737	return err;
738	}
739
/* okfn of the LOCAL_OUT hook in vrf_ip6_out_direct(): run the direct
 * output step and, when it returns 1, send via vrf_ip6_local_out().
 */
static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int err;

	err = vrf_output6_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip6_local_out(net, sk, skb);

	return err;
}
751
752	static struct sk_buff vrf_ip6_out_direct(struct* net_device *vrf_dev,
753	struct sock *sk,
754	struct sk_buff *skb)
755	{
756	struct net *net = dev_net(dev: vrf_dev);
757	int err;
758
759	skb->dev = vrf_dev;
760
761	err = nf_hook(pf: NFPROTO_IPV6, hook: NF_INET_LOCAL_OUT, net, sk,
762	skb, NULL, outdev: vrf_dev, okfn: vrf_ip6_out_direct_finish);
763
764	if (likely(err == `1`))
765	err = vrf_output6_direct(net, sk, skb);
766
767	if (likely(err == `1`))
768	return skb;
769
770	return NULL;
771	}
772
773	static struct sk_buff vrf_ip6_out(struct* net_device *vrf_dev,
774	struct sock *sk,
775	struct sk_buff *skb)
776	{
777	/ don't divert link scope packets /
778	if (rt6_need_strict(daddr: &ipv6_hdr(skb)->daddr))
779	return skb;
780
781	vrf_nf_set_untracked(skb);
782
783	if (qdisc_tx_is_default(dev: vrf_dev) \|\|
784	IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
785	return vrf_ip6_out_direct(vrf_dev, sk, skb);
786
787	return vrf_ip6_out_redirect(vrf_dev, skb);
788	}
789
790	/ holding rtnl /
791	static void vrf_rt6_release(struct net_device dev, struct* net_vrf *vrf)
792	{
793	struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
794	struct net *net = dev_net(dev);
795	struct dst_entry *dst;
796
797	RCU_INIT_POINTER(vrf->rt6, NULL);
798	synchronize_rcu();
799
800	/ move dev in dst's to loopback so this VRF device can be deleted*
801	* - based on dst_ifdown
802	*/
803	if (rt6) {
804	dst = &rt6->dst;
805	netdev_ref_replace(odev: dst->dev, ndev: net->loopback_dev,
806	tracker: &dst->dev_tracker, GFP_KERNEL);
807	dst->dev = net->loopback_dev;
808	dst_release(dst);
809	}
810	}
811
812	static int vrf_rt6_create(struct net_device *dev)
813	{
814	int flags = DST_NOPOLICY \| DST_NOXFRM;
815	struct net_vrf *vrf = netdev_priv(dev);
816	struct net *net = dev_net(dev);
817	struct rt6_info *rt6;
818	int rc = -ENOMEM;
819
820	/ IPv6 can be CONFIG enabled and then disabled runtime /
821	if (!ipv6_mod_enabled())
822	return `0`;
823
824	vrf->fib6_table = fib6_new_table(net, id: vrf->tb_id);
825	if (!vrf->fib6_table)
826	goto out;
827
828	/ create a dst for routing packets out a VRF device /
829	rt6 = ip6_dst_alloc(net, dev, flags);
830	if (!rt6)
831	goto out;
832
833	rt6->dst.output = vrf_output6;
834
835	rcu_assign_pointer(vrf->rt6, rt6);
836
837	rc = `0`;
838	out:
839	return rc;
840	}
841	#else
/* Variant built when CONFIG_IPV6 is disabled: the skb is returned as is. */
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	return skb;
}
848
/* Variant built when CONFIG_IPV6 is disabled: nothing to release. */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
}
852
/* Variant built when CONFIG_IPV6 is disabled: always succeeds. */
static int vrf_rt6_create(struct net_device *dev)
{
	return 0;
}
857	#endif
858
859	/ modelled after ip_finish_output2 /
860	static int vrf_finish_output(struct net net, struct* sock sk, struct* sk_buff *skb)
861	{
862	struct dst_entry *dst = skb_dst(skb);
863	struct rtable rt = (struct* rtable *)dst;
864	struct net_device *dev = dst->dev;
865	unsigned int hh_len = LL_RESERVED_SPACE(dev);
866	struct neighbour *neigh;
867	bool is_v6gw = false;
868
869	vrf_nf_reset_ct(skb);
870
871	/ Be paranoid, rather than too clever. /
872	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
873	skb = skb_expand_head(skb, headroom: hh_len);
874	if (!skb) {
875	dev->stats.tx_errors++;
876	return -ENOMEM;
877	}
878	}
879
880	rcu_read_lock();
881
882	neigh = ip_neigh_for_gw(rt, skb, is_v6gw: &is_v6gw);
883	if (!IS_ERR(ptr: neigh)) {
884	int ret;
885
886	sock_confirm_neigh(skb, n: neigh);
887	/ if crossing protocols, can not use the cached header /
888	ret = neigh_output(n: neigh, skb, skip_cache: is_v6gw);
889	rcu_read_unlock();
890	return ret;
891	}
892
893	rcu_read_unlock();
894	vrf_tx_error(vrf_dev: skb->dev, skb);
895	return -EINVAL;
896	}
897
898	static int vrf_output(struct net net, struct* sock sk, struct* sk_buff *skb)
899	{
900	struct net_device *dev = skb_dst(skb)->dev;
901
902	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
903
904	skb->dev = dev;
905	skb->protocol = htons(ETH_P_IP);
906
907	return NF_HOOK_COND(pf: NFPROTO_IPV4, hook: NF_INET_POST_ROUTING,
908	net, sk, skb, NULL, out: dev,
909	okfn: vrf_finish_output,
910	cond: !(IPCB(skb)->flags & IPSKB_REROUTED));
911	}
912
913	/ set dst on skb to send packet to us via dev_xmit path. Allows*
914	* packet to go through device based features such as qdisc, netfilter
915	* hooks and packet sockets with skb->dev set to vrf device.
916	*/
917	static struct sk_buff vrf_ip_out_redirect(struct* net_device *vrf_dev,
918	struct sk_buff *skb)
919	{
920	struct net_vrf *vrf = netdev_priv(dev: vrf_dev);
921	struct dst_entry *dst = NULL;
922	struct rtable *rth;
923
924	rcu_read_lock();
925
926	rth = rcu_dereference(vrf->rth);
927	if (likely(rth)) {
928	dst = &rth->dst;
929	dst_hold(dst);
930	}
931
932	rcu_read_unlock();
933
934	if (unlikely(!dst)) {
935	vrf_tx_error(vrf_dev, skb);
936	return NULL;
937	}
938
939	skb_dst_drop(skb);
940	skb_dst_set(skb, dst);
941
942	return skb;
943	}
944
/* okfn of the POST_ROUTING hook in vrf_output_direct(): finish the
 * direct path, then send the packet through vrf_ip_local_out().
 */
static int vrf_output_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	vrf_finish_direct(skb);

	return vrf_ip_local_out(net, sk, skb);
}
952
953	static int vrf_output_direct(struct net net, struct* sock *sk,
954	struct sk_buff *skb)
955	{
956	int err = `1`;
957
958	skb->protocol = htons(ETH_P_IP);
959
960	if (!(IPCB(skb)->flags & IPSKB_REROUTED))
961	err = nf_hook(pf: NFPROTO_IPV4, hook: NF_INET_POST_ROUTING, net, sk, skb,
962	NULL, outdev: skb->dev, okfn: vrf_output_direct_finish);
963
964	if (likely(err == `1`))
965	vrf_finish_direct(skb);
966
967	return err;
968	}
969
/* okfn of the LOCAL_OUT hook in vrf_ip_out_direct(): run the direct
 * output step and, when it returns 1, send via vrf_ip_local_out().
 */
static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk,
				    struct sk_buff *skb)
{
	int err;

	err = vrf_output_direct(net, sk, skb);
	if (likely(err == 1))
		err = vrf_ip_local_out(net, sk, skb);

	return err;
}
981
982	static struct sk_buff vrf_ip_out_direct(struct* net_device *vrf_dev,
983	struct sock *sk,
984	struct sk_buff *skb)
985	{
986	struct net *net = dev_net(dev: vrf_dev);
987	int err;
988
989	skb->dev = vrf_dev;
990
991	err = nf_hook(pf: NFPROTO_IPV4, hook: NF_INET_LOCAL_OUT, net, sk,
992	skb, NULL, outdev: vrf_dev, okfn: vrf_ip_out_direct_finish);
993
994	if (likely(err == `1`))
995	err = vrf_output_direct(net, sk, skb);
996
997	if (likely(err == `1`))
998	return skb;
999
1000	return NULL;
1001	}
1002
1003	static struct sk_buff vrf_ip_out(struct* net_device *vrf_dev,
1004	struct sock *sk,
1005	struct sk_buff *skb)
1006	{
1007	/ don't divert multicast or local broadcast /
1008	if (ipv4_is_multicast(addr: ip_hdr(skb)->daddr) \|\|
1009	ipv4_is_lbcast(addr: ip_hdr(skb)->daddr))
1010	return skb;
1011
1012	vrf_nf_set_untracked(skb);
1013
1014	if (qdisc_tx_is_default(dev: vrf_dev) \|\|
1015	IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
1016	return vrf_ip_out_direct(vrf_dev, sk, skb);
1017
1018	return vrf_ip_out_redirect(vrf_dev, skb);
1019	}
1020
1021	/ called with rcu lock held /
1022	static struct sk_buff vrf_l3_out(struct* net_device *vrf_dev,
1023	struct sock *sk,
1024	struct sk_buff *skb,
1025	u16 proto)
1026	{
1027	switch (proto) {
1028	case AF_INET:
1029	return vrf_ip_out(vrf_dev, sk, skb);
1030	case AF_INET6:
1031	return vrf_ip6_out(vrf_dev, sk, skb);
1032	}
1033
1034	return skb;
1035	}
1036
1037	/ holding rtnl /
1038	static void vrf_rtable_release(struct net_device dev, struct* net_vrf *vrf)
1039	{
1040	struct rtable *rth = rtnl_dereference(vrf->rth);
1041	struct net *net = dev_net(dev);
1042	struct dst_entry *dst;
1043
1044	RCU_INIT_POINTER(vrf->rth, NULL);
1045	synchronize_rcu();
1046
1047	/ move dev in dst's to loopback so this VRF device can be deleted*
1048	* - based on dst_ifdown
1049	*/
1050	if (rth) {
1051	dst = &rth->dst;
1052	netdev_ref_replace(odev: dst->dev, ndev: net->loopback_dev,
1053	tracker: &dst->dev_tracker, GFP_KERNEL);
1054	dst->dev = net->loopback_dev;
1055	dst_release(dst);
1056	}
1057	}
1058
1059	static int vrf_rtable_create(struct net_device *dev)
1060	{
1061	struct net_vrf *vrf = netdev_priv(dev);
1062	struct rtable *rth;
1063
1064	if (!fib_new_table(net: dev_net(dev), id: vrf->tb_id))
1065	return -ENOMEM;
1066
1067	/ create a dst for routing packets out through a VRF device /
1068	rth = rt_dst_alloc(dev, flags: `0`, type: RTN_UNICAST, noxfrm: `1`);
1069	if (!rth)
1070	return -ENOMEM;
1071
1072	rth->dst.output = vrf_output;
1073
1074	rcu_assign_pointer(vrf->rth, rth);
1075
1076	return `0`;
1077	}
1078
1079	/************************* device handling *****************/
1080
1081	/ cycle interface to flush neighbor cache and move routes across tables /
1082	static void cycle_netdev(struct net_device *dev,
1083	struct netlink_ext_ack *extack)
1084	{
1085	unsigned int flags = dev->flags;
1086	int ret;
1087
1088	if (!netif_running(dev))
1089	return;
1090
1091	ret = dev_change_flags(dev, flags: flags & ~IFF_UP, extack);
1092	if (ret >= `0`)
1093	ret = dev_change_flags(dev, flags, extack);
1094
1095	if (ret < `0`) {
1096	netdev_err(dev,
1097	format: "Failed to cycle device %s; route tables might be wrong!\n",
1098	dev->name);
1099	}
1100	}
1101
1102	static int do_vrf_add_slave(struct net_device dev, struct* net_device *port_dev,
1103	struct netlink_ext_ack *extack)
1104	{
1105	int ret;
1106
1107	/ do not allow loopback device to be enslaved to a VRF.*
1108	* The vrf device acts as the loopback for the vrf.
1109	*/
1110	if (port_dev == dev_net(dev)->loopback_dev) {
1111	NL_SET_ERR_MSG(extack,
1112	"Can not enslave loopback device to a VRF");
1113	return -EOPNOTSUPP;
1114	}
1115
1116	port_dev->priv_flags \|= IFF_L3MDEV_SLAVE;
1117	ret = netdev_master_upper_dev_link(dev: port_dev, upper_dev: dev, NULL, NULL, extack);
1118	if (ret < `0`)
1119	goto err;
1120
1121	cycle_netdev(dev: port_dev, extack);
1122
1123	return `0`;
1124
1125	err:
1126	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
1127	return ret;
1128	}
1129
1130	static int vrf_add_slave(struct net_device dev, struct* net_device *port_dev,
1131	struct netlink_ext_ack *extack)
1132	{
1133	if (netif_is_l3_master(dev: port_dev)) {
1134	NL_SET_ERR_MSG(extack,
1135	"Can not enslave an L3 master device to a VRF");
1136	return -EINVAL;
1137	}
1138
1139	if (netif_is_l3_slave(dev: port_dev))
1140	return -EINVAL;
1141
1142	return do_vrf_add_slave(dev, port_dev, extack);
1143	}
1144
1145	/ inverse of do_vrf_add_slave /
1146	static int do_vrf_del_slave(struct net_device dev, struct* net_device *port_dev)
1147	{
1148	netdev_upper_dev_unlink(dev: port_dev, upper_dev: dev);
1149	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
1150
1151	cycle_netdev(dev: port_dev, NULL);
1152
1153	return `0`;
1154	}
1155
/* ndo_del_slave handler: release @port_dev from the VRF device @dev. */
static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	return do_vrf_del_slave(dev, port_dev);
}
1160
/* ndo_uninit handler: release the IPv4 and IPv6 dsts of the VRF. */
static void vrf_dev_uninit(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	vrf_rtable_release(dev, vrf);
	vrf_rt6_release(dev, vrf);
}
1168
1169	static int vrf_dev_init(struct net_device *dev)
1170	{
1171	struct net_vrf *vrf = netdev_priv(dev);
1172
1173	/ create the default dst which points back to us /
1174	if (vrf_rtable_create(dev) != `0`)
1175	goto out_nomem;
1176
1177	if (vrf_rt6_create(dev) != `0`)
1178	goto out_rth;
1179
1180	dev->flags = IFF_MASTER \| IFF_NOARP;
1181
1182	/ similarly, oper state is irrelevant; set to up to avoid confusion /
1183	dev->operstate = IF_OPER_UP;
1184	netdev_lockdep_set_classes(dev);
1185	return `0`;
1186
1187	out_rth:
1188	vrf_rtable_release(dev, vrf);
1189	out_nomem:
1190	return -ENOMEM;
1191	}
1192
1193	static const struct net_device_ops vrf_netdev_ops = {
1194	.ndo_init = vrf_dev_init,
1195	.ndo_uninit = vrf_dev_uninit,
1196	.ndo_start_xmit = vrf_xmit,
1197	.ndo_set_mac_address = eth_mac_addr,
1198	.ndo_get_stats64 = vrf_get_stats64,
1199	.ndo_add_slave = vrf_add_slave,
1200	.ndo_del_slave = vrf_del_slave,
1201	};
1202
1203	static u32 vrf_fib_table(const struct net_device *dev)
1204	{
1205	struct net_vrf *vrf = netdev_priv(dev);
1206
1207	return vrf->tb_id;
1208	}
1209
/* okfn passed to nf_hook() by vrf_rcv_nfhook(): frees @skb, returns 0. */
static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}
1215
1216	static struct sk_buff vrf_rcv_nfhook(u8 pf, unsigned* int hook,
1217	struct sk_buff *skb,
1218	struct net_device *dev)
1219	{
1220	struct net *net = dev_net(dev);
1221
1222	if (nf_hook(pf, hook, net, NULL, skb, indev: dev, NULL, okfn: vrf_rcv_finish) != `1`)
1223	skb = NULL; / kfree_skb(skb) handled by nf code /
1224
1225	return skb;
1226	}
1227
1228	static int vrf_prepare_mac_header(struct sk_buff *skb,
1229	struct net_device *vrf_dev, u16 proto)
1230	{
1231	struct ethhdr *eth;
1232	int err;
1233
1234	/ in general, we do not know if there is enough space in the head of*
1235	* the packet for hosting the mac header.
1236	*/
1237	err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev));
1238	if (unlikely(err))
1239	/ no space in the skb head /
1240	return -ENOBUFS;
1241
1242	__skb_push(skb, ETH_HLEN);
1243	eth = (struct ethhdr *)skb->data;
1244
1245	skb_reset_mac_header(skb);
1246	skb_reset_mac_len(skb);
1247
1248	/ we set the ethernet destination and the source addresses to the*
1249	* address of the VRF device.
1250	*/
1251	ether_addr_copy(dst: eth->h_dest, src: vrf_dev->dev_addr);
1252	ether_addr_copy(dst: eth->h_source, src: vrf_dev->dev_addr);
1253	eth->h_proto = htons(proto);
1254
1255	/ the destination address of the Ethernet frame corresponds to the*
1256	* address set on the VRF interface; therefore, the packet is intended
1257	* to be processed locally.
1258	*/
1259	skb->protocol = eth->h_proto;
1260	skb->pkt_type = PACKET_HOST;
1261
1262	skb_postpush_rcsum(skb, start: skb->data, ETH_HLEN);
1263
1264	skb_pull_inline(skb, ETH_HLEN);
1265
1266	return `0`;
1267	}
1268
1269	/ prepare and add the mac header to the packet if it was not set previously.*
1270	* In this way, packet sniffers such as tcpdump can parse the packet correctly.
1271	* If the mac header was already set, the original mac header is left
1272	* untouched and the function returns immediately.
1273	*/
1274	static int vrf_add_mac_header_if_unset(struct sk_buff *skb,
1275	struct net_device *vrf_dev,
1276	u16 proto, struct net_device *orig_dev)
1277	{
1278	if (skb_mac_header_was_set(skb) && dev_has_header(dev: orig_dev))
1279	return `0`;
1280
1281	return vrf_prepare_mac_header(skb, vrf_dev, proto);
1282	}
1283
1284	#if IS_ENABLED(CONFIG_IPV6)
1285	/ neighbor handling is done with actual device; do not want*
1286	* to flip skb->dev for those ndisc packets. This really fails
1287	* for multiple next protocols (e.g., NEXTHDR_HOP). But it is
1288	* a start.
1289	*/
1290	static bool ipv6_ndisc_frame(const struct sk_buff *skb)
1291	{
1292	const struct ipv6hdr *iph = ipv6_hdr(skb);
1293	bool rc = false;
1294
1295	if (iph->nexthdr == NEXTHDR_ICMP) {
1296	const struct icmp6hdr *icmph;
1297	struct icmp6hdr _icmph;
1298
1299	icmph = skb_header_pointer(skb, offset: sizeof(*iph),
1300	len: sizeof(_icmph), buffer: &_icmph);
1301	if (!icmph)
1302	goto out;
1303
1304	switch (icmph->icmp6_type) {
1305	case NDISC_ROUTER_SOLICITATION:
1306	case NDISC_ROUTER_ADVERTISEMENT:
1307	case NDISC_NEIGHBOUR_SOLICITATION:
1308	case NDISC_NEIGHBOUR_ADVERTISEMENT:
1309	case NDISC_REDIRECT:
1310	rc = true;
1311	break;
1312	}
1313	}
1314
1315	out:
1316	return rc;
1317	}
1318
1319	static struct rt6_info vrf_ip6_route_lookup(struct* net *net,
1320	const struct net_device *dev,
1321	struct flowi6 *fl6,
1322	int ifindex,
1323	const struct sk_buff *skb,
1324	int flags)
1325	{
1326	struct net_vrf *vrf = netdev_priv(dev);
1327
1328	return ip6_pol_route(net, table: vrf->fib6_table, ifindex, fl6, skb, flags);
1329	}
1330
1331	static void vrf_ip6_input_dst(struct sk_buff skb, struct* net_device *vrf_dev,
1332	int ifindex)
1333	{
1334	const struct ipv6hdr *iph = ipv6_hdr(skb);
1335	struct flowi6 fl6 = {
1336	.flowi6_iif = ifindex,
1337	.flowi6_mark = skb->mark,
1338	.flowi6_proto = iph->nexthdr,
1339	.daddr = iph->daddr,
1340	.saddr = iph->saddr,
1341	.flowlabel = ip6_flowinfo(hdr: iph),
1342	};
1343	struct net *net = dev_net(dev: vrf_dev);
1344	struct rt6_info *rt6;
1345
1346	rt6 = vrf_ip6_route_lookup(net, dev: vrf_dev, fl6: &fl6, ifindex, skb,
1347	RT6_LOOKUP_F_HAS_SADDR \| RT6_LOOKUP_F_IFACE);
1348	if (unlikely(!rt6))
1349	return;
1350
1351	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
1352	return;
1353
1354	skb_dst_set(skb, dst: &rt6->dst);
1355	}
1356
1357	static struct sk_buff vrf_ip6_rcv(struct* net_device *vrf_dev,
1358	struct sk_buff *skb)
1359	{
1360	int orig_iif = skb->skb_iif;
1361	bool need_strict = rt6_need_strict(daddr: &ipv6_hdr(skb)->daddr);
1362	bool is_ndisc = ipv6_ndisc_frame(skb);
1363
1364	/ loopback, multicast & non-ND link-local traffic; do not push through*
1365	* packet taps again. Reset pkt_type for upper layers to process skb.
1366	* For non-loopback strict packets, determine the dst using the original
1367	* ifindex.
1368	*/
1369	if (skb->pkt_type == PACKET_LOOPBACK \|\| (need_strict && !is_ndisc)) {
1370	skb->dev = vrf_dev;
1371	skb->skb_iif = vrf_dev->ifindex;
1372	IP6CB(skb)->flags \|= IP6SKB_L3SLAVE;
1373
1374	if (skb->pkt_type == PACKET_LOOPBACK)
1375	skb->pkt_type = PACKET_HOST;
1376	else
1377	vrf_ip6_input_dst(skb, vrf_dev, ifindex: orig_iif);
1378
1379	goto out;
1380	}
1381
1382	/ if packet is NDISC then keep the ingress interface /
1383	if (!is_ndisc) {
1384	struct net_device *orig_dev = skb->dev;
1385
1386	vrf_rx_stats(dev: vrf_dev, len: skb->len);
1387	skb->dev = vrf_dev;
1388	skb->skb_iif = vrf_dev->ifindex;
1389
1390	if (!list_empty(head: &vrf_dev->ptype_all)) {
1391	int err;
1392
1393	err = vrf_add_mac_header_if_unset(skb, vrf_dev,
1394	ETH_P_IPV6,
1395	orig_dev);
1396	if (likely(!err)) {
1397	skb_push(skb, len: skb->mac_len);
1398	dev_queue_xmit_nit(skb, dev: vrf_dev);
1399	skb_pull(skb, len: skb->mac_len);
1400	}
1401	}
1402
1403	IP6CB(skb)->flags \|= IP6SKB_L3SLAVE;
1404	}
1405
1406	if (need_strict)
1407	vrf_ip6_input_dst(skb, vrf_dev, ifindex: orig_iif);
1408
1409	skb = vrf_rcv_nfhook(pf: NFPROTO_IPV6, hook: NF_INET_PRE_ROUTING, skb, dev: vrf_dev);
1410	out:
1411	return skb;
1412	}
1413
1414	#else
/* Stub used when CONFIG_IPV6 is not enabled: hand @skb back unchanged. */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	return skb;
}
1420	#endif
1421
1422	static struct sk_buff vrf_ip_rcv(struct* net_device *vrf_dev,
1423	struct sk_buff *skb)
1424	{
1425	struct net_device *orig_dev = skb->dev;
1426
1427	skb->dev = vrf_dev;
1428	skb->skb_iif = vrf_dev->ifindex;
1429	IPCB(skb)->flags \|= IPSKB_L3SLAVE;
1430
1431	if (ipv4_is_multicast(addr: ip_hdr(skb)->daddr))
1432	goto out;
1433
1434	/ loopback traffic; do not push through packet taps again.*
1435	* Reset pkt_type for upper layers to process skb
1436	*/
1437	if (skb->pkt_type == PACKET_LOOPBACK) {
1438	skb->pkt_type = PACKET_HOST;
1439	goto out;
1440	}
1441
1442	vrf_rx_stats(dev: vrf_dev, len: skb->len);
1443
1444	if (!list_empty(head: &vrf_dev->ptype_all)) {
1445	int err;
1446
1447	err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP,
1448	orig_dev);
1449	if (likely(!err)) {
1450	skb_push(skb, len: skb->mac_len);
1451	dev_queue_xmit_nit(skb, dev: vrf_dev);
1452	skb_pull(skb, len: skb->mac_len);
1453	}
1454	}
1455
1456	skb = vrf_rcv_nfhook(pf: NFPROTO_IPV4, hook: NF_INET_PRE_ROUTING, skb, dev: vrf_dev);
1457	out:
1458	return skb;
1459	}
1460
1461	/ called with rcu lock held /
1462	static struct sk_buff vrf_l3_rcv(struct* net_device *vrf_dev,
1463	struct sk_buff *skb,
1464	u16 proto)
1465	{
1466	switch (proto) {
1467	case AF_INET:
1468	return vrf_ip_rcv(vrf_dev, skb);
1469	case AF_INET6:
1470	return vrf_ip6_rcv(vrf_dev, skb);
1471	}
1472
1473	return skb;
1474	}
1475
1476	#if IS_ENABLED(CONFIG_IPV6)
1477	/ send to link-local or multicast address via interface enslaved to*
1478	* VRF device. Force lookup to VRF table without changing flow struct
1479	* Note: Caller to this function must hold rcu_read_lock() and no refcnt
1480	* is taken on the dst by this function.
1481	*/
1482	static struct dst_entry vrf_link_scope_lookup(const* struct net_device *dev,
1483	struct flowi6 *fl6)
1484	{
1485	struct net *net = dev_net(dev);
1486	int flags = RT6_LOOKUP_F_IFACE \| RT6_LOOKUP_F_DST_NOREF;
1487	struct dst_entry *dst = NULL;
1488	struct rt6_info *rt;
1489
1490	/ VRF device does not have a link-local address and*
1491	* sending packets to link-local or mcast addresses over
1492	* a VRF device does not make sense
1493	*/
1494	if (fl6->flowi6_oif == dev->ifindex) {
1495	dst = &net->ipv6.ip6_null_entry->dst;
1496	return dst;
1497	}
1498
1499	if (!ipv6_addr_any(a: &fl6->saddr))
1500	flags \|= RT6_LOOKUP_F_HAS_SADDR;
1501
1502	rt = vrf_ip6_route_lookup(net, dev, fl6, ifindex: fl6->flowi6_oif, NULL, flags);
1503	if (rt)
1504	dst = &rt->dst;
1505
1506	return dst;
1507	}
1508	#endif
1509
1510	static const struct l3mdev_ops vrf_l3mdev_ops = {
1511	.l3mdev_fib_table = vrf_fib_table,
1512	.l3mdev_l3_rcv = vrf_l3_rcv,
1513	.l3mdev_l3_out = vrf_l3_out,
1514	#if IS_ENABLED(CONFIG_IPV6)
1515	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
1516	#endif
1517	};
1518
1519	static void vrf_get_drvinfo(struct net_device *dev,
1520	struct ethtool_drvinfo *info)
1521	{
1522	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
1523	strscpy(info->version, DRV_VERSION, sizeof(info->version));
1524	}
1525
1526	static const struct ethtool_ops vrf_ethtool_ops = {
1527	.get_drvinfo = vrf_get_drvinfo,
1528	};
1529
1530	static inline size_t vrf_fib_rule_nl_size(void)
1531	{
1532	size_t sz;
1533
1534	sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
1535	sz += nla_total_size(payload: sizeof(u8)); / FRA_L3MDEV /
1536	sz += nla_total_size(payload: sizeof(u32)); / FRA_PRIORITY /
1537	sz += nla_total_size(payload: sizeof(u8)); / FRA_PROTOCOL /
1538
1539	return sz;
1540	}
1541
1542	static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
1543	{
1544	struct fib_rule_hdr *frh;
1545	struct nlmsghdr *nlh;
1546	struct sk_buff *skb;
1547	int err;
1548
1549	if ((family == AF_INET6 \|\| family == RTNL_FAMILY_IP6MR) &&
1550	!ipv6_mod_enabled())
1551	return `0`;
1552
1553	skb = nlmsg_new(payload: vrf_fib_rule_nl_size(), GFP_KERNEL);
1554	if (!skb)
1555	return -ENOMEM;
1556
1557	nlh = nlmsg_put(skb, portid: `0`, seq: `0`, type: `0`, payload: sizeof(*frh), flags: `0`);
1558	if (!nlh)
1559	goto nla_put_failure;
1560
1561	/ rule only needs to appear once /
1562	nlh->nlmsg_flags \|= NLM_F_EXCL;
1563
1564	frh = nlmsg_data(nlh);
1565	memset(frh, `0`, sizeof(*frh));
1566	frh->family = family;
1567	frh->action = FR_ACT_TO_TBL;
1568
1569	if (nla_put_u8(skb, attrtype: FRA_PROTOCOL, RTPROT_KERNEL))
1570	goto nla_put_failure;
1571
1572	if (nla_put_u8(skb, attrtype: FRA_L3MDEV, value: `1`))
1573	goto nla_put_failure;
1574
1575	if (nla_put_u32(skb, attrtype: FRA_PRIORITY, FIB_RULE_PREF))
1576	goto nla_put_failure;
1577
1578	nlmsg_end(skb, nlh);
1579
1580	/ fib_nl_{new,del}rule handling looks for net from skb->sk /
1581	skb->sk = dev_net(dev)->rtnl;
1582	if (add_it) {
1583	err = fib_nl_newrule(skb, nlh, NULL);
1584	if (err == -EEXIST)
1585	err = `0`;
1586	} else {
1587	err = fib_nl_delrule(skb, nlh, NULL);
1588	if (err == -ENOENT)
1589	err = `0`;
1590	}
1591	nlmsg_free(skb);
1592
1593	return err;
1594
1595	nla_put_failure:
1596	nlmsg_free(skb);
1597
1598	return -EMSGSIZE;
1599	}
1600
1601	static int vrf_add_fib_rules(const struct net_device *dev)
1602	{
1603	int err;
1604
1605	err = vrf_fib_rule(dev, AF_INET, add_it: true);
1606	if (err < `0`)
1607	goto out_err;
1608
1609	err = vrf_fib_rule(dev, AF_INET6, add_it: true);
1610	if (err < `0`)
1611	goto ipv6_err;
1612
1613	#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
1614	err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, add_it: true);
1615	if (err < `0`)
1616	goto ipmr_err;
1617	#endif
1618
1619	#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1620	err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, add_it: true);
1621	if (err < `0`)
1622	goto ip6mr_err;
1623	#endif
1624
1625	return `0`;
1626
1627	#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1628	ip6mr_err:
1629	vrf_fib_rule(dev, RTNL_FAMILY_IPMR, add_it: false);
1630	#endif
1631
1632	#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
1633	ipmr_err:
1634	vrf_fib_rule(dev, AF_INET6, add_it: false);
1635	#endif
1636
1637	ipv6_err:
1638	vrf_fib_rule(dev, AF_INET, add_it: false);
1639
1640	out_err:
1641	netdev_err(dev, format: "Failed to add FIB rules.\n");
1642	return err;
1643	}
1644
1645	static void vrf_setup(struct net_device *dev)
1646	{
1647	ether_setup(dev);
1648
1649	/ Initialize the device structure. /
1650	dev->netdev_ops = &vrf_netdev_ops;
1651	dev->l3mdev_ops = &vrf_l3mdev_ops;
1652	dev->ethtool_ops = &vrf_ethtool_ops;
1653	dev->needs_free_netdev = true;
1654
1655	/ Fill in device structure with ethernet-generic values. /
1656	eth_hw_addr_random(dev);
1657
1658	/ don't acquire vrf device's netif_tx_lock when transmitting /
1659	dev->features \|= NETIF_F_LLTX;
1660
1661	/ don't allow vrf devices to change network namespaces. /
1662	dev->features \|= NETIF_F_NETNS_LOCAL;
1663
1664	/ does not make sense for a VLAN to be added to a vrf device /
1665	dev->features \|= NETIF_F_VLAN_CHALLENGED;
1666
1667	/ enable offload features /
1668	dev->features \|= NETIF_F_GSO_SOFTWARE;
1669	dev->features \|= NETIF_F_RXCSUM \| NETIF_F_HW_CSUM \| NETIF_F_SCTP_CRC;
1670	dev->features \|= NETIF_F_SG \| NETIF_F_FRAGLIST \| NETIF_F_HIGHDMA;
1671
1672	dev->hw_features = dev->features;
1673	dev->hw_enc_features = dev->features;
1674
1675	/ default to no qdisc; user can add if desired /
1676	dev->priv_flags \|= IFF_NO_QUEUE;
1677	dev->priv_flags \|= IFF_NO_RX_HANDLER;
1678	dev->priv_flags \|= IFF_LIVE_ADDR_CHANGE;
1679
1680	/ VRF devices do not care about MTU, but if the MTU is set*
1681	* too low then the ipv4 and ipv6 protocols are disabled
1682	* which breaks networking.
1683	*/
1684	dev->min_mtu = IPV6_MIN_MTU;
1685	dev->max_mtu = IP6_MAX_MTU;
1686	dev->mtu = dev->max_mtu;
1687
1688	dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
1689	}
1690
1691	static int vrf_validate(struct nlattr tb[], struct* nlattr *data[],
1692	struct netlink_ext_ack *extack)
1693	{
1694	if (tb[IFLA_ADDRESS]) {
1695	if (nla_len(nla: tb[IFLA_ADDRESS]) != ETH_ALEN) {
1696	NL_SET_ERR_MSG(extack, "Invalid hardware address");
1697	return -EINVAL;
1698	}
1699	if (!is_valid_ether_addr(addr: nla_data(nla: tb[IFLA_ADDRESS]))) {
1700	NL_SET_ERR_MSG(extack, "Invalid hardware address");
1701	return -EADDRNOTAVAIL;
1702	}
1703	}
1704	return `0`;
1705	}
1706
1707	static void vrf_dellink(struct net_device dev, struct* list_head *head)
1708	{
1709	struct net_device *port_dev;
1710	struct list_head *iter;
1711
1712	netdev_for_each_lower_dev(dev, port_dev, iter)
1713	vrf_del_slave(dev, port_dev);
1714
1715	vrf_map_unregister_dev(dev);
1716
1717	unregister_netdevice_queue(dev, head);
1718	}
1719
1720	static int vrf_newlink(struct net src_net, struct* net_device *dev,
1721	struct nlattr tb[], struct* nlattr *data[],
1722	struct netlink_ext_ack *extack)
1723	{
1724	struct net_vrf *vrf = netdev_priv(dev);
1725	struct netns_vrf *nn_vrf;
1726	bool *add_fib_rules;
1727	struct net *net;
1728	int err;
1729
1730	if (!data \|\| !data[IFLA_VRF_TABLE]) {
1731	NL_SET_ERR_MSG(extack, "VRF table id is missing");
1732	return -EINVAL;
1733	}
1734
1735	vrf->tb_id = nla_get_u32(nla: data[IFLA_VRF_TABLE]);
1736	if (vrf->tb_id == RT_TABLE_UNSPEC) {
1737	NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
1738	"Invalid VRF table id");
1739	return -EINVAL;
1740	}
1741
1742	dev->priv_flags \|= IFF_L3MDEV_MASTER;
1743
1744	err = register_netdevice(dev);
1745	if (err)
1746	goto out;
1747
1748	/ mapping between table_id and vrf;*
1749	* note: such binding could not be done in the dev init function
1750	* because dev->ifindex id is not available yet.
1751	*/
1752	vrf->ifindex = dev->ifindex;
1753
1754	err = vrf_map_register_dev(dev, extack);
1755	if (err) {
1756	unregister_netdevice(dev);
1757	goto out;
1758	}
1759
1760	net = dev_net(dev);
1761	nn_vrf = net_generic(net, id: vrf_net_id);
1762
1763	add_fib_rules = &nn_vrf->add_fib_rules;
1764	if (*add_fib_rules) {
1765	err = vrf_add_fib_rules(dev);
1766	if (err) {
1767	vrf_map_unregister_dev(dev);
1768	unregister_netdevice(dev);
1769	goto out;
1770	}
1771	*add_fib_rules = false;
1772	}
1773
1774	out:
1775	return err;
1776	}
1777
1778	static size_t vrf_nl_getsize(const struct net_device *dev)
1779	{
1780	return nla_total_size(payload: sizeof(u32)); / IFLA_VRF_TABLE /
1781	}
1782
1783	static int vrf_fillinfo(struct sk_buff *skb,
1784	const struct net_device *dev)
1785	{
1786	struct net_vrf *vrf = netdev_priv(dev);
1787
1788	return nla_put_u32(skb, attrtype: IFLA_VRF_TABLE, value: vrf->tb_id);
1789	}
1790
1791	static size_t vrf_get_slave_size(const struct net_device *bond_dev,
1792	const struct net_device *slave_dev)
1793	{
1794	return nla_total_size(payload: sizeof(u32)); / IFLA_VRF_PORT_TABLE /
1795	}
1796
1797	static int vrf_fill_slave_info(struct sk_buff *skb,
1798	const struct net_device *vrf_dev,
1799	const struct net_device *slave_dev)
1800	{
1801	struct net_vrf *vrf = netdev_priv(dev: vrf_dev);
1802
1803	if (nla_put_u32(skb, attrtype: IFLA_VRF_PORT_TABLE, value: vrf->tb_id))
1804	return -EMSGSIZE;
1805
1806	return `0`;
1807	}
1808
1809	static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + `1`] = {
1810	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
1811	};
1812
1813	static struct rtnl_link_ops vrf_link_ops __read_mostly = {
1814	.kind = DRV_NAME,
1815	.priv_size = sizeof(struct net_vrf),
1816
1817	.get_size = vrf_nl_getsize,
1818	.policy = vrf_nl_policy,
1819	.validate = vrf_validate,
1820	.fill_info = vrf_fillinfo,
1821
1822	.get_slave_size = vrf_get_slave_size,
1823	.fill_slave_info = vrf_fill_slave_info,
1824
1825	.newlink = vrf_newlink,
1826	.dellink = vrf_dellink,
1827	.setup = vrf_setup,
1828	.maxtype = IFLA_VRF_MAX,
1829	};
1830
1831	static int vrf_device_event(struct notifier_block *unused,
1832	unsigned long event, void *ptr)
1833	{
1834	struct net_device *dev = netdev_notifier_info_to_dev(info: ptr);
1835
1836	/ only care about unregister events to drop slave references /
1837	if (event == NETDEV_UNREGISTER) {
1838	struct net_device *vrf_dev;
1839
1840	if (!netif_is_l3_slave(dev))
1841	goto out;
1842
1843	vrf_dev = netdev_master_upper_dev_get(dev);
1844	vrf_del_slave(dev: vrf_dev, port_dev: dev);
1845	}
1846	out:
1847	return NOTIFY_DONE;
1848	}
1849
1850	static struct notifier_block vrf_notifier_block __read_mostly = {
1851	.notifier_call = vrf_device_event,
1852	};
1853
1854	static int vrf_map_init(struct vrf_map *vmap)
1855	{
1856	spin_lock_init(&vmap->vmap_lock);
1857	hash_init(vmap->ht);
1858
1859	vmap->strict_mode = false;
1860
1861	return `0`;
1862	}
1863
1864	#ifdef CONFIG_SYSCTL
1865	static bool vrf_strict_mode(struct vrf_map *vmap)
1866	{
1867	bool strict_mode;
1868
1869	vrf_map_lock(vmap);
1870	strict_mode = vmap->strict_mode;
1871	vrf_map_unlock(vmap);
1872
1873	return strict_mode;
1874	}
1875
1876	static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
1877	{
1878	bool *cur_mode;
1879	int res = `0`;
1880
1881	vrf_map_lock(vmap);
1882
1883	cur_mode = &vmap->strict_mode;
1884	if (*cur_mode == new_mode)
1885	goto unlock;
1886
1887	if (*cur_mode) {
1888	/ disable strict mode /
1889	*cur_mode = false;
1890	} else {
1891	if (vmap->shared_tables) {
1892	/ we cannot allow strict_mode because there are some*
1893	* vrfs that share one or more tables.
1894	*/
1895	res = -EBUSY;
1896	goto unlock;
1897	}
1898
1899	/ no tables are shared among vrfs, so we can go back*
1900	* to 1:1 association between a vrf with its table.
1901	*/
1902	*cur_mode = true;
1903	}
1904
1905	unlock:
1906	vrf_map_unlock(vmap);
1907
1908	return res;
1909	}
1910
1911	static int vrf_shared_table_handler(struct ctl_table table, int* write,
1912	void buffer, size_t lenp, loff_t *ppos)
1913	{
1914	struct net net = (struct* net *)table->extra1;
1915	struct vrf_map *vmap = netns_vrf_map(net);
1916	int proc_strict_mode = `0`;
1917	struct ctl_table tmp = {
1918	.procname = table->procname,
1919	.data = &proc_strict_mode,
1920	.maxlen = sizeof(int),
1921	.mode = table->mode,
1922	.extra1 = SYSCTL_ZERO,
1923	.extra2 = SYSCTL_ONE,
1924	};
1925	int ret;
1926
1927	if (!write)
1928	proc_strict_mode = vrf_strict_mode(vmap);
1929
1930	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
1931
1932	if (write && ret == `0`)
1933	ret = vrf_strict_mode_change(vmap, new_mode: (bool)proc_strict_mode);
1934
1935	return ret;
1936	}
1937
1938	static const struct ctl_table vrf_table[] = {
1939	{
1940	.procname = "strict_mode",
1941	.data = NULL,
1942	.maxlen = sizeof(int),
1943	.mode = `0644`,
1944	.proc_handler = vrf_shared_table_handler,
1945	/ set by the vrf_netns_init /
1946	.extra1 = NULL,
1947	},
1948	};
1949
1950	static int vrf_netns_init_sysctl(struct net net, struct* netns_vrf *nn_vrf)
1951	{
1952	struct ctl_table *table;
1953
1954	table = kmemdup(p: vrf_table, size: sizeof(vrf_table), GFP_KERNEL);
1955	if (!table)
1956	return -ENOMEM;
1957
1958	/ init the extra1 parameter with the reference to current netns /
1959	table[`0`].extra1 = net;
1960
1961	nn_vrf->ctl_hdr = register_net_sysctl_sz(net, path: "net/vrf", table,
1962	ARRAY_SIZE(vrf_table));
1963	if (!nn_vrf->ctl_hdr) {
1964	kfree(objp: table);
1965	return -ENOMEM;
1966	}
1967
1968	return `0`;
1969	}
1970
1971	static void vrf_netns_exit_sysctl(struct net *net)
1972	{
1973	struct netns_vrf *nn_vrf = net_generic(net, id: vrf_net_id);
1974	struct ctl_table *table;
1975
1976	table = nn_vrf->ctl_hdr->ctl_table_arg;
1977	unregister_net_sysctl_table(header: nn_vrf->ctl_hdr);
1978	kfree(objp: table);
1979	}
1980	#else
/* Stub used when CONFIG_SYSCTL is not set: nothing to register. */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}
1985
/* Stub used when CONFIG_SYSCTL is not set: nothing to tear down. */
static void vrf_netns_exit_sysctl(struct net *net)
{
}
1989	#endif
1990
1991	/ Initialize per network namespace state /
1992	static int __net_init vrf_netns_init(struct net *net)
1993	{
1994	struct netns_vrf *nn_vrf = net_generic(net, id: vrf_net_id);
1995
1996	nn_vrf->add_fib_rules = true;
1997	vrf_map_init(vmap: &nn_vrf->vmap);
1998
1999	return vrf_netns_init_sysctl(net, nn_vrf);
2000	}
2001
2002	static void __net_exit vrf_netns_exit(struct net *net)
2003	{
2004	vrf_netns_exit_sysctl(net);
2005	}
2006
2007	static struct pernet_operations vrf_net_ops __net_initdata = {
2008	.init = vrf_netns_init,
2009	.exit = vrf_netns_exit,
2010	.id = &vrf_net_id,
2011	.size = sizeof(struct netns_vrf),
2012	};
2013
2014	static int __init vrf_init_module(void)
2015	{
2016	int rc;
2017
2018	register_netdevice_notifier(nb: &vrf_notifier_block);
2019
2020	rc = register_pernet_subsys(&vrf_net_ops);
2021	if (rc < `0`)
2022	goto error;
2023
2024	rc = l3mdev_table_lookup_register(l3type: L3MDEV_TYPE_VRF,
2025	fn: vrf_ifindex_lookup_by_table_id);
2026	if (rc < `0`)
2027	goto unreg_pernet;
2028
2029	rc = rtnl_link_register(ops: &vrf_link_ops);
2030	if (rc < `0`)
2031	goto table_lookup_unreg;
2032
2033	return `0`;
2034
2035	table_lookup_unreg:
2036	l3mdev_table_lookup_unregister(l3type: L3MDEV_TYPE_VRF,
2037	fn: vrf_ifindex_lookup_by_table_id);
2038
2039	unreg_pernet:
2040	unregister_pernet_subsys(&vrf_net_ops);
2041
2042	error:
2043	unregister_netdevice_notifier(nb: &vrf_notifier_block);
2044	return rc;
2045	}
2046
2047	module_init(vrf_init_module);
2048	MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
2049	MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
2050	MODULE_LICENSE("GPL");
2051	MODULE_ALIAS_RTNL_LINK(DRV_NAME);
2052	MODULE_VERSION(DRV_VERSION);
2053

source code of linux/drivers/net/vrf.c