// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 */

#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/mpls.h>
#include <linux/sctp.h>
#include <linux/smp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/mpls.h>
#include <net/ndisc.h>
#include <net/nsh.h>
#include <net/pkt_cls.h>
#include <net/netfilter/nf_conntrack_zones.h>

#include "conntrack.h"
#include "datapath.h"
#include "flow.h"
#include "flow_netlink.h"
#include "vport.h"

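/* Translate a flow's last-used timestamp from jiffies to milliseconds
 * on the monotonic clock, as reported in flow stats to userspace.
 */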
u64 ovs_flow_used_time(unsigned long flow_jiffies)
{
	struct timespec64 cur_ts;
	u64 cur_ms, idle_ms;

	ktime_get_ts64(&cur_ts);
	idle_ms = jiffies_to_msecs(jiffies - flow_jiffies);
	cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC +
		 cur_ts.tv_nsec / NSEC_PER_MSEC;

	return cur_ms - idle_ms;
}

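/* The low 12 bits of the TCP flag word: the reserved bits plus the eight
 * flags CWR through FIN, in network byte order (the data offset nibble
 * is masked off).
 */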
#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF))

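/* Update a flow's statistics for a packet.  Stats are kept per CPU:
 * a CPU-local copy is allocated the first time a CPU other than the
 * last writer of the pre-allocated stats touches the flow.
 */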
void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
{
	struct sw_flow_stats *stats;
	unsigned int cpu = smp_processor_id();
	int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

	stats = rcu_dereference(flow->stats[cpu]);

	/* Check if we already have CPU-specific stats. */
	if (likely(stats)) {
		spin_lock(&stats->lock);
		/* Mark if we write on the pre-allocated stats. */
		if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
			flow->stats_last_writer = cpu;
	} else {
		stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
		spin_lock(&stats->lock);

		/* If the current CPU is the only writer on the
		 * pre-allocated stats keep using them.
		 */
		if (unlikely(flow->stats_last_writer != cpu)) {
			/* A previous locker may have already allocated the
			 * stats, so we need to check again.  If CPU-specific
			 * stats were already allocated, we update the pre-
			 * allocated stats as we have already locked them.
			 */
			if (likely(flow->stats_last_writer != -1) &&
			    likely(!rcu_access_pointer(flow->stats[cpu]))) {
				/* Try to allocate CPU-specific stats. */
				struct sw_flow_stats *new_stats;

				new_stats =
					kmem_cache_alloc_node(flow_stats_cache,
							      GFP_NOWAIT |
							      __GFP_THISNODE |
							      __GFP_NOWARN |
							      __GFP_NOMEMALLOC,
							      numa_node_id());
				if (likely(new_stats)) {
					new_stats->used = jiffies;
					new_stats->packet_count = 1;
					new_stats->byte_count = len;
					new_stats->tcp_flags = tcp_flags;
					spin_lock_init(&new_stats->lock);

					rcu_assign_pointer(flow->stats[cpu],
							   new_stats);
					cpumask_set_cpu(cpu,
							flow->cpu_used_mask);
					goto unlock;
				}
			}
			flow->stats_last_writer = cpu;
		}
	}

	stats->used = jiffies;
	stats->packet_count++;
	stats->byte_count += len;
	stats->tcp_flags |= tcp_flags;
unlock:
	spin_unlock(&stats->lock);
}

/* Must be called with rcu_read_lock or ovs_mutex. */
void ovs_flow_stats_get(const struct sw_flow *flow,
			struct ovs_flow_stats *ovs_stats,
			unsigned long *used, __be16 *tcp_flags)
{
	int cpu;

	*used = 0;
	*tcp_flags = 0;
	memset(ovs_stats, 0, sizeof(*ovs_stats));

	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
		struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);

		if (stats) {
			/* Local CPU may write on non-local stats, so we must
			 * block bottom-halves here.
			 */
			spin_lock_bh(&stats->lock);
			if (!*used || time_after(stats->used, *used))
				*used = stats->used;
			*tcp_flags |= stats->tcp_flags;
			ovs_stats->n_packets += stats->packet_count;
			ovs_stats->n_bytes += stats->byte_count;
			spin_unlock_bh(&stats->lock);
		}
	}
}

/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
	int cpu;

	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids;
	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
		struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);

		if (stats) {
			spin_lock_bh(&stats->lock);
			stats->used = 0;
			stats->packet_count = 0;
			stats->byte_count = 0;
			stats->tcp_flags = 0;
			spin_unlock_bh(&stats->lock);
		}
	}
}

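/* Ensure the packet is at least @len bytes long and that its first @len
 * bytes are in the skb's linear data area.
 */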
static int check_header(struct sk_buff *skb, int len)
{
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;
	return 0;
}

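/* Check that a complete ARP-over-Ethernet header is present and linear. */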
static bool arphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_network_offset(skb) +
				  sizeof(struct arp_eth_header));
}

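/* Validate the IPv4 header length and set the skb's transport header to
 * just past the IP header, including any options.
 */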
static int check_iphdr(struct sk_buff *skb)
{
	unsigned int nh_ofs = skb_network_offset(skb);
	unsigned int ip_len;
	int err;

	err = check_header(skb, nh_ofs + sizeof(struct iphdr));
	if (unlikely(err))
		return err;

	ip_len = ip_hdrlen(skb);
	if (unlikely(ip_len < sizeof(struct iphdr) ||
		     skb->len < nh_ofs + ip_len))
		return -EINVAL;

	skb_set_transport_header(skb, nh_ofs + ip_len);
	return 0;
}

static bool tcphdr_ok(struct sk_buff *skb)
{
	int th_ofs = skb_transport_offset(skb);
	int tcp_len;

	if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))))
		return false;

	tcp_len = tcp_hdrlen(skb);
	if (unlikely(tcp_len < sizeof(struct tcphdr) ||
		     skb->len < th_ofs + tcp_len))
		return false;

	return true;
}

static bool udphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct udphdr));
}

static bool sctphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct sctphdr));
}

static bool icmphdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct icmphdr));
}

/**
 * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags.
 *
 * @skb: buffer where extension header data starts in packet
 * @nh: ipv6 header
 * @ext_hdrs: flags are stored here
 *
 * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header
 * is unexpectedly encountered. (Two destination options headers may be
 * expected and would not cause this bit to be set.)
 *
 * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order
 * preferred (but not required) by RFC 2460:
 *
 * When more than one extension header is used in the same packet, it is
 * recommended that those headers appear in the following order:
 *      IPv6 header
 *      Hop-by-Hop Options header
 *      Destination Options header
 *      Routing header
 *      Fragment header
 *      Authentication header
 *      Encapsulating Security Payload header
 *      Destination Options header
 *      upper-layer header
 */
static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh,
			      u16 *ext_hdrs)
{
	u8 next_type = nh->nexthdr;
	unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	int dest_options_header_count = 0;

	*ext_hdrs = 0;

	while (ipv6_ext_hdr(next_type)) {
		struct ipv6_opt_hdr _hdr, *hp;

		switch (next_type) {
		case IPPROTO_NONE:
			*ext_hdrs |= OFPIEH12_NONEXT;
			/* stop parsing */
			return;

		case IPPROTO_ESP:
			if (*ext_hdrs & OFPIEH12_ESP)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST |
					   OFPIEH12_ROUTER | OFPIEH12_FRAG |
					   OFPIEH12_AUTH | OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_ESP;
			break;

		case IPPROTO_AH:
			if (*ext_hdrs & OFPIEH12_AUTH)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs &
			     ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER |
			       OFPIEH12_FRAG | OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_AUTH;
			break;

		case IPPROTO_DSTOPTS:
			if (dest_options_header_count == 0) {
				if (*ext_hdrs &
				    ~(OFPIEH12_HOP | OFPIEH12_UNREP))
					*ext_hdrs |= OFPIEH12_UNSEQ;
				*ext_hdrs |= OFPIEH12_DEST;
			} else if (dest_options_header_count == 1) {
				if (*ext_hdrs &
				    ~(OFPIEH12_HOP | OFPIEH12_DEST |
				      OFPIEH12_ROUTER | OFPIEH12_FRAG |
				      OFPIEH12_AUTH | OFPIEH12_ESP |
				      OFPIEH12_UNREP)) {
					*ext_hdrs |= OFPIEH12_UNSEQ;
				}
			} else {
				*ext_hdrs |= OFPIEH12_UNREP;
			}
			dest_options_header_count++;
			break;

		case IPPROTO_FRAGMENT:
			if (*ext_hdrs & OFPIEH12_FRAG)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP |
					   OFPIEH12_DEST |
					   OFPIEH12_ROUTER |
					   OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_FRAG;
			break;

		case IPPROTO_ROUTING:
			if (*ext_hdrs & OFPIEH12_ROUTER)
				*ext_hdrs |= OFPIEH12_UNREP;
			if ((*ext_hdrs & ~(OFPIEH12_HOP |
					   OFPIEH12_DEST |
					   OFPIEH12_UNREP)) ||
			    dest_options_header_count >= 2) {
				*ext_hdrs |= OFPIEH12_UNSEQ;
			}
			*ext_hdrs |= OFPIEH12_ROUTER;
			break;

		case IPPROTO_HOPOPTS:
			if (*ext_hdrs & OFPIEH12_HOP)
				*ext_hdrs |= OFPIEH12_UNREP;
			/* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6
			 * extension header is present as the first
			 * extension header in the packet.
			 */
			if (*ext_hdrs == 0)
				*ext_hdrs |= OFPIEH12_HOP;
			else
				*ext_hdrs |= OFPIEH12_UNSEQ;
			break;

		default:
			return;
		}

		hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
		if (!hp)
			break;
		next_type = hp->nexthdr;
		start += ipv6_optlen(hp);
	}
}

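/* Fill in the IPv6 portions of @key.  Returns the length of the network
 * header (base header plus extension headers) on success, %0 for later
 * fragments, or a negative errno value on error.
 */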
static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key)
{
	unsigned short frag_off;
	unsigned int payload_ofs = 0;
	unsigned int nh_ofs = skb_network_offset(skb);
	unsigned int nh_len;
	struct ipv6hdr *nh;
	int err, nexthdr, flags = 0;

	err = check_header(skb, nh_ofs + sizeof(*nh));
	if (unlikely(err))
		return err;

	nh = ipv6_hdr(skb);

	get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs);

	key->ip.proto = NEXTHDR_NONE;
	key->ip.tos = ipv6_get_dsfield(nh);
	key->ip.ttl = nh->hop_limit;
	key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
	key->ipv6.addr.src = nh->saddr;
	key->ipv6.addr.dst = nh->daddr;

	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
	if (flags & IP6_FH_F_FRAG) {
		if (frag_off) {
			key->ip.frag = OVS_FRAG_TYPE_LATER;
			key->ip.proto = NEXTHDR_FRAGMENT;
			return 0;
		}
		key->ip.frag = OVS_FRAG_TYPE_FIRST;
	} else {
		key->ip.frag = OVS_FRAG_TYPE_NONE;
	}

	/* Delayed handling of error in ipv6_find_hdr() as it
	 * always sets flags and frag_off to a valid value which may be
	 * used to set key->ip.frag above.
	 */
	if (unlikely(nexthdr < 0))
		return -EPROTO;

	nh_len = payload_ofs - nh_ofs;
	skb_set_transport_header(skb, nh_ofs + nh_len);
	key->ip.proto = nexthdr;
	return nh_len;
}

static bool icmp6hdr_ok(struct sk_buff *skb)
{
	return pskb_may_pull(skb, skb_transport_offset(skb) +
				  sizeof(struct icmp6hdr));
}

/**
 * parse_vlan_tag - Parse vlan tag from vlan header.
 * @skb: skb containing frame to parse
 * @key_vh: pointer to parsed vlan tag
 * @untag_vlan: should the vlan header be removed from the frame
 *
 * Return: a negative errno on memory error,
 * %0 if it encounters a non-vlan or incomplete packet,
 * %1 after successfully parsing the vlan tag.
 */
static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh,
			  bool untag_vlan)
{
	struct vlan_head *vh = (struct vlan_head *)skb->data;

	if (likely(!eth_type_vlan(vh->tpid)))
		return 0;

	if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16)))
		return 0;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) +
				    sizeof(__be16))))
		return -ENOMEM;

	vh = (struct vlan_head *)skb->data;
	key_vh->tci = vh->tci | htons(VLAN_CFI_MASK);
	key_vh->tpid = vh->tpid;

	if (unlikely(untag_vlan)) {
		int offset = skb->data - skb_mac_header(skb);
		u16 tci;
		int err;

		__skb_push(skb, offset);
		err = __skb_vlan_pop(skb, &tci);
		__skb_pull(skb, offset);
		if (err)
			return err;
		__vlan_hwaccel_put_tag(skb, key_vh->tpid, tci);
	} else {
		__skb_pull(skb, sizeof(struct vlan_head));
	}
	return 1;
}

static void clear_vlan(struct sw_flow_key *key)
{
	key->eth.vlan.tci = 0;
	key->eth.vlan.tpid = 0;
	key->eth.cvlan.tci = 0;
	key->eth.cvlan.tpid = 0;
}

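/* Parse the outer and, when present, the inner (customer) VLAN tag into
 * @key, pulling the tags from the packet data as it goes.
 */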
static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int res;

	if (skb_vlan_tag_present(skb)) {
		key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK);
		key->eth.vlan.tpid = skb->vlan_proto;
	} else {
		/* Parse outer vlan tag in the non-accelerated case. */
		res = parse_vlan_tag(skb, &key->eth.vlan, true);
		if (res <= 0)
			return res;
	}

	/* Parse inner vlan tag. */
	res = parse_vlan_tag(skb, &key->eth.cvlan, false);
	if (res <= 0)
		return res;

	return 0;
}

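/* Return the frame's EtherType: for Ethernet II frames, the type field
 * itself; for 802.3 frames carrying an LLC/SNAP header, the encapsulated
 * EtherType; otherwise %ETH_P_802_2.  Returns htons(0) on memory error.
 */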
static __be16 parse_ethertype(struct sk_buff *skb)
{
	struct llc_snap_hdr {
		u8  dsap;  /* Always 0xAA */
		u8  ssap;  /* Always 0xAA */
		u8  ctrl;
		u8  oui[3];
		__be16 ethertype;
	};
	struct llc_snap_hdr *llc;
	__be16 proto;

	proto = *(__be16 *) skb->data;
	__skb_pull(skb, sizeof(__be16));

	if (eth_proto_is_802_3(proto))
		return proto;

	if (skb->len < sizeof(struct llc_snap_hdr))
		return htons(ETH_P_802_2);

	if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr))))
		return htons(0);

	llc = (struct llc_snap_hdr *) skb->data;
	if (llc->dsap != LLC_SAP_SNAP ||
	    llc->ssap != LLC_SAP_SNAP ||
	    (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0)
		return htons(ETH_P_802_2);

	__skb_pull(skb, sizeof(struct llc_snap_hdr));

	if (eth_proto_is_802_3(llc->ethertype))
		return llc->ethertype;

	return htons(ETH_P_802_2);
}

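/* Extract the ICMPv6 type and code into the transport port fields of
 * @key and, for neighbour discovery messages, the target address and
 * any link-layer address options.
 */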
static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key,
			int nh_len)
{
	struct icmp6hdr *icmp = icmp6_hdr(skb);

	/* The ICMPv6 type and code fields use the 16-bit transport port
	 * fields, so we need to store them in 16-bit network byte order.
	 */
	key->tp.src = htons(icmp->icmp6_type);
	key->tp.dst = htons(icmp->icmp6_code);
	memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd));

	if (icmp->icmp6_code == 0 &&
	    (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
	     icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) {
		int icmp_len = skb->len - skb_transport_offset(skb);
		struct nd_msg *nd;
		int offset;

		/* In order to process neighbor discovery options, we need the
		 * entire packet.
		 */
		if (unlikely(icmp_len < sizeof(*nd)))
			return 0;

		if (unlikely(skb_linearize(skb)))
			return -ENOMEM;

		nd = (struct nd_msg *)skb_transport_header(skb);
		key->ipv6.nd.target = nd->target;

		icmp_len -= sizeof(*nd);
		offset = 0;
		while (icmp_len >= 8) {
			struct nd_opt_hdr *nd_opt =
				(struct nd_opt_hdr *)(nd->opt + offset);
			int opt_len = nd_opt->nd_opt_len * 8;

			if (unlikely(!opt_len || opt_len > icmp_len))
				return 0;

			/* Store the link layer address if the appropriate
			 * option is provided.  It is considered an error if
			 * the same link layer option is specified twice.
			 */
			if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR
			    && opt_len == 8) {
				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll)))
					goto invalid;
				ether_addr_copy(key->ipv6.nd.sll,
						&nd->opt[offset+sizeof(*nd_opt)]);
			} else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR
				   && opt_len == 8) {
				if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll)))
					goto invalid;
				ether_addr_copy(key->ipv6.nd.tll,
						&nd->opt[offset+sizeof(*nd_opt)]);
			}

			icmp_len -= opt_len;
			offset += opt_len;
		}
	}

	return 0;

invalid:
	memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target));
	memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll));
	memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll));

	return 0;
}

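/* Parse the Network Service Header (NSH) base header into @key, along
 * with the fixed-length context for MD type 1 headers.
 */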
static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
{
	struct nshhdr *nh;
	unsigned int nh_ofs = skb_network_offset(skb);
	u8 version, length;
	int err;

	err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN);
	if (unlikely(err))
		return err;

	nh = nsh_hdr(skb);
	version = nsh_get_ver(nh);
	length = nsh_hdr_len(nh);

	if (version != 0)
		return -EINVAL;

	err = check_header(skb, nh_ofs + length);
	if (unlikely(err))
		return err;

	nh = nsh_hdr(skb);
	key->nsh.base.flags = nsh_get_flags(nh);
	key->nsh.base.ttl = nsh_get_ttl(nh);
	key->nsh.base.mdtype = nh->mdtype;
	key->nsh.base.np = nh->np;
	key->nsh.base.path_hdr = nh->path_hdr;
	switch (key->nsh.base.mdtype) {
	case NSH_M_TYPE1:
		if (length != NSH_M_TYPE1_LEN)
			return -EINVAL;
		memcpy(key->nsh.context, nh->md1.context,
		       sizeof(nh->md1));
		break;
	case NSH_M_TYPE2:
		memset(key->nsh.context, 0,
		       sizeof(nh->md1));
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/**
 * key_extract_l3l4 - extracts L3/L4 header information.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 *       L3 header
 * @key: output flow key
 *
 * Return: %0 if successful, otherwise a negative errno value.
 */
static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
	int error;

	/* Network layer. */
	if (key->eth.type == htons(ETH_P_IP)) {
		struct iphdr *nh;
		__be16 offset;

		error = check_iphdr(skb);
		if (unlikely(error)) {
			memset(&key->ip, 0, sizeof(key->ip));
			memset(&key->ipv4, 0, sizeof(key->ipv4));
			if (error == -EINVAL) {
				skb->transport_header = skb->network_header;
				error = 0;
			}
			return error;
		}

		nh = ip_hdr(skb);
		key->ipv4.addr.src = nh->saddr;
		key->ipv4.addr.dst = nh->daddr;

		key->ip.proto = nh->protocol;
		key->ip.tos = nh->tos;
		key->ip.ttl = nh->ttl;

		offset = nh->frag_off & htons(IP_OFFSET);
		if (offset) {
			key->ip.frag = OVS_FRAG_TYPE_LATER;
			memset(&key->tp, 0, sizeof(key->tp));
			return 0;
		}
		if (nh->frag_off & htons(IP_MF) ||
		    skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			key->ip.frag = OVS_FRAG_TYPE_FIRST;
		else
			key->ip.frag = OVS_FRAG_TYPE_NONE;

		/* Transport layer. */
		if (key->ip.proto == IPPROTO_TCP) {
			if (tcphdr_ok(skb)) {
				struct tcphdr *tcp = tcp_hdr(skb);
				key->tp.src = tcp->source;
				key->tp.dst = tcp->dest;
				key->tp.flags = TCP_FLAGS_BE16(tcp);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}

		} else if (key->ip.proto == IPPROTO_UDP) {
			if (udphdr_ok(skb)) {
				struct udphdr *udp = udp_hdr(skb);
				key->tp.src = udp->source;
				key->tp.dst = udp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == IPPROTO_SCTP) {
			if (sctphdr_ok(skb)) {
				struct sctphdr *sctp = sctp_hdr(skb);
				key->tp.src = sctp->source;
				key->tp.dst = sctp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == IPPROTO_ICMP) {
			if (icmphdr_ok(skb)) {
				struct icmphdr *icmp = icmp_hdr(skb);
				/* The ICMP type and code fields use the 16-bit
				 * transport port fields, so we need to store
				 * them in 16-bit network byte order.
				 */
				key->tp.src = htons(icmp->type);
				key->tp.dst = htons(icmp->code);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		}

	} else if (key->eth.type == htons(ETH_P_ARP) ||
		   key->eth.type == htons(ETH_P_RARP)) {
		struct arp_eth_header *arp;
		bool arp_available = arphdr_ok(skb);

		arp = (struct arp_eth_header *)skb_network_header(skb);

		if (arp_available &&
		    arp->ar_hrd == htons(ARPHRD_ETHER) &&
		    arp->ar_pro == htons(ETH_P_IP) &&
		    arp->ar_hln == ETH_ALEN &&
		    arp->ar_pln == 4) {

			/* We only match on the lower 8 bits of the opcode. */
			if (ntohs(arp->ar_op) <= 0xff)
				key->ip.proto = ntohs(arp->ar_op);
			else
				key->ip.proto = 0;

			memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src));
			memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst));
			ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha);
			ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha);
		} else {
			memset(&key->ip, 0, sizeof(key->ip));
			memset(&key->ipv4, 0, sizeof(key->ipv4));
		}
	} else if (eth_p_mpls(key->eth.type)) {
		u8 label_count = 1;

		memset(&key->mpls, 0, sizeof(key->mpls));
		skb_set_inner_network_header(skb, skb->mac_len);
		while (1) {
			__be32 lse;

			error = check_header(skb, skb->mac_len +
					     label_count * MPLS_HLEN);
			if (unlikely(error))
				return 0;

			memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN);

			if (label_count <= MPLS_LABEL_DEPTH)
				memcpy(&key->mpls.lse[label_count - 1], &lse,
				       MPLS_HLEN);

			skb_set_inner_network_header(skb, skb->mac_len +
						     label_count * MPLS_HLEN);
			if (lse & htonl(MPLS_LS_S_MASK))
				break;

			label_count++;
		}
		if (label_count > MPLS_LABEL_DEPTH)
			label_count = MPLS_LABEL_DEPTH;

		key->mpls.num_labels_mask = GENMASK(label_count - 1, 0);
	} else if (key->eth.type == htons(ETH_P_IPV6)) {
		int nh_len;             /* IPv6 Header + Extensions */

		nh_len = parse_ipv6hdr(skb, key);
		if (unlikely(nh_len < 0)) {
			switch (nh_len) {
			case -EINVAL:
				memset(&key->ip, 0, sizeof(key->ip));
				memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
				fallthrough;
			case -EPROTO:
				skb->transport_header = skb->network_header;
				error = 0;
				break;
			default:
				error = nh_len;
			}
			return error;
		}

		if (key->ip.frag == OVS_FRAG_TYPE_LATER) {
			memset(&key->tp, 0, sizeof(key->tp));
			return 0;
		}
		if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
			key->ip.frag = OVS_FRAG_TYPE_FIRST;

		/* Transport layer. */
		if (key->ip.proto == NEXTHDR_TCP) {
			if (tcphdr_ok(skb)) {
				struct tcphdr *tcp = tcp_hdr(skb);
				key->tp.src = tcp->source;
				key->tp.dst = tcp->dest;
				key->tp.flags = TCP_FLAGS_BE16(tcp);
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_UDP) {
			if (udphdr_ok(skb)) {
				struct udphdr *udp = udp_hdr(skb);
				key->tp.src = udp->source;
				key->tp.dst = udp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_SCTP) {
			if (sctphdr_ok(skb)) {
				struct sctphdr *sctp = sctp_hdr(skb);
				key->tp.src = sctp->source;
				key->tp.dst = sctp->dest;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		} else if (key->ip.proto == NEXTHDR_ICMP) {
			if (icmp6hdr_ok(skb)) {
				error = parse_icmpv6(skb, key, nh_len);
				if (error)
					return error;
			} else {
				memset(&key->tp, 0, sizeof(key->tp));
			}
		}
	} else if (key->eth.type == htons(ETH_P_NSH)) {
		error = parse_nsh(skb, key);
		if (error)
			return error;
	}
	return 0;
}

/**
 * key_extract - extracts a flow key from an Ethernet frame.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 *       Ethernet header
 * @key: output flow key
 *
 * The caller must ensure that skb->len >= ETH_HLEN.
 *
 * Initializes @skb header fields as follows:
 *
 *    - skb->mac_header: the L2 header.
 *
 *    - skb->network_header: just past the L2 header, or just past the
 *      VLAN header, to the first byte of the L2 payload.
 *
 *    - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6
 *      on output, then just past the IP header, if one is present and
 *      of a correct length, otherwise the same as skb->network_header.
 *      For other key->eth.type values it is left untouched.
 *
 *    - skb->protocol: the type of the data starting at skb->network_header.
 *      Equal to key->eth.type.
 *
 * Return: %0 if successful, otherwise a negative errno value.
 */
static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
{
	struct ethhdr *eth;

	/* Flags are always used as part of stats */
	key->tp.flags = 0;

	skb_reset_mac_header(skb);

	/* Link layer. */
	clear_vlan(key);
	if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
		if (unlikely(eth_type_vlan(skb->protocol)))
			return -EINVAL;

		skb_reset_network_header(skb);
		key->eth.type = skb->protocol;
	} else {
		eth = eth_hdr(skb);
		ether_addr_copy(key->eth.src, eth->h_source);
		ether_addr_copy(key->eth.dst, eth->h_dest);

		__skb_pull(skb, 2 * ETH_ALEN);
		/* We are going to push all headers that we pull, so no need to
		 * update skb->csum here.
		 */

		if (unlikely(parse_vlan(skb, key)))
			return -ENOMEM;

		key->eth.type = parse_ethertype(skb);
		if (unlikely(key->eth.type == htons(0)))
			return -ENOMEM;

		/* Multiple tagged packets need to retain TPID to satisfy
		 * skb_vlan_pop(), which will later shift the ethertype into
		 * skb->protocol.
		 */
		if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK))
			skb->protocol = key->eth.cvlan.tpid;
		else
			skb->protocol = key->eth.type;

		skb_reset_network_header(skb);
		__skb_push(skb, skb->data - skb_mac_header(skb));
	}

	skb_reset_mac_len(skb);

	/* Fill out L3/L4 key info, if any */
	return key_extract_l3l4(skb, key);
}

/* Conntrack fragment handling expects L3 headers, so provide a helper
 * that extracts only the L3/L4 parts of the key.
 */
int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key)
{
	return key_extract_l3l4(skb, key);
}

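/* Re-extract the flow key after the packet headers may have been
 * modified, clearing the "invalid key" flag on success.
 */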
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
{
	int res;

	res = key_extract(skb, key);
	if (!res)
		key->mac_proto &= ~SW_FLOW_KEY_INVALID;

	return res;
}

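/* Determine the packet's MAC protocol from the receiving device:
 * Ethernet devices, and ETH_P_TEB packets on ARPHRD_NONE devices,
 * carry an Ethernet header; other ARPHRD_NONE packets do not.
 */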
static int key_extract_mac_proto(struct sk_buff *skb)
{
	switch (skb->dev->type) {
	case ARPHRD_ETHER:
		return MAC_PROTO_ETHERNET;
	case ARPHRD_NONE:
		if (skb->protocol == htons(ETH_P_TEB))
			return MAC_PROTO_ETHERNET;
		return MAC_PROTO_NONE;
	}
	WARN_ON_ONCE(1);
	return -EINVAL;
}

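/* Extract a complete flow key from a received packet: tunnel metadata
 * from @tun_info, packet metadata (input port, skb mark, recirc_id),
 * the L2-L4 headers, and finally the conntrack state.
 */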
int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
			 struct sk_buff *skb, struct sw_flow_key *key)
{
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	struct tc_skb_ext *tc_ext;
#endif
	bool post_ct = false, post_ct_snat = false, post_ct_dnat = false;
	int res, err;
	u16 zone = 0;

	/* Extract metadata from packet. */
	if (tun_info) {
		key->tun_proto = ip_tunnel_info_af(tun_info);
		memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));

		if (tun_info->options_len) {
			BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) *
						   8)) - 1
					> sizeof(key->tun_opts));

			ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len),
						tun_info);
			key->tun_opts_len = tun_info->options_len;
		} else {
			key->tun_opts_len = 0;
		}
	} else {
		key->tun_proto = 0;
		key->tun_opts_len = 0;
		memset(&key->tun_key, 0, sizeof(key->tun_key));
	}

	key->phy.priority = skb->priority;
	key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
	key->phy.skb_mark = skb->mark;
	key->ovs_flow_hash = 0;
	res = key_extract_mac_proto(skb);
	if (res < 0)
		return res;
	key->mac_proto = res;

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
	if (tc_skb_ext_tc_enabled()) {
		tc_ext = skb_ext_find(skb, TC_SKB_EXT);
		key->recirc_id = tc_ext && !tc_ext->act_miss ?
				 tc_ext->chain : 0;
		OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0;
		post_ct = tc_ext ? tc_ext->post_ct : false;
		post_ct_snat = post_ct ? tc_ext->post_ct_snat : false;
		post_ct_dnat = post_ct ? tc_ext->post_ct_dnat : false;
		zone = post_ct ? tc_ext->zone : 0;
	} else {
		key->recirc_id = 0;
	}
#else
	key->recirc_id = 0;
#endif

	err = key_extract(skb, key);
	if (!err) {
		ovs_ct_fill_key(skb, key, post_ct);	/* Must be after key_extract(). */
		if (post_ct) {
			if (!skb_get_nfct(skb)) {
				key->ct_zone = zone;
			} else {
				if (!post_ct_dnat)
					key->ct_state &= ~OVS_CS_F_DST_NAT;
				if (!post_ct_snat)
					key->ct_state &= ~OVS_CS_F_SRC_NAT;
			}
		}
	}
	return err;
}

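/* Build a flow key for a packet injected from userspace: metadata comes
 * from the netlink attributes in @attr, the rest from the packet itself.
 */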
int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr,
				   struct sk_buff *skb,
				   struct sw_flow_key *key, bool log)
{
	const struct nlattr *a[OVS_KEY_ATTR_MAX + 1];
	u64 attrs = 0;
	int err;

	err = parse_flow_nlattrs(attr, a, &attrs, log);
	if (err)
		return -EINVAL;

	/* Extract metadata from netlink attributes. */
	err = ovs_nla_get_flow_metadata(net, a, attrs, key, log);
	if (err)
		return err;

	/* key_extract assumes that skb->protocol is set up for
	 * layer 3 packets, which is the case for other callers,
	 * in particular packets received from the network stack.
	 * Here the correct value can be set from the metadata
	 * extracted above.
	 * For L2 packets the key's eth type is zero; skb->protocol
	 * is set to the correct value later during key extraction.
	 */

	skb->protocol = key->eth.type;
	err = key_extract(skb, key);
	if (err)
		return err;

	/* Check that we have conntrack original direction tuple metadata only
	 * for packets for which it makes sense.  Otherwise the key may be
	 * corrupted due to overlapping key fields.
	 */
	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) &&
	    key->eth.type != htons(ETH_P_IP))
		return -EINVAL;
	if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) &&
	    (key->eth.type != htons(ETH_P_IPV6) ||
	     sw_flow_key_is_nd(key)))
		return -EINVAL;

	return 0;
}