// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

102
103typedef int ret_t;
104
105/* This is a bit of a hack. We need a return value which allows us to
106 * indicate that the regular flow of the program should continue,
107 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
108 */
109static const ret_t CONTINUE_PROCESSING = -1;
110
111/* Convenience macro to call functions which return ret_t.
112 */
113#define MAYBE_RETURN(x) \
114 do { \
115 ret_t __ret = x; \
116 if (__ret != CONTINUE_PROCESSING) \
117 return __ret; \
118 } while (0)
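/* Example call site (taken from get_next_hop() below): execution continues
 * past the macro only if the callee returned CONTINUE_PROCESSING; any other
 * value, e.g. a TC_ACT_* verdict, is returned to our caller immediately.
 *
 *	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
 */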

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *     a - b + c
	 * if c is known:
	 *     r? = c
	 *     r? -= b
	 *     r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

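/* Copy len bytes from the current read position into dst and advance the
 * read position. bpf_skb_load_bytes also reaches data in the non-linear
 * part of the skb. Returns false (without advancing) if the load fails.
 */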
static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

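/* Skip over any IPv4 options. ihl counts 32-bit words including the fixed
 * 20-byte header, so ihl == 5 means "no options".
 */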
static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

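/* A packet is a fragment if the More Fragments flag is set or the fragment
 * offset is non-zero.
 */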
static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

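/* Fold a 32-bit ones' complement accumulator into the final 16-bit checksum.
 * For example, an accumulator of 0x12345 folds to 0x2345 + 0x1 = 0x2346,
 * giving a checksum of ~0x2346 = 0xdcb9.
 */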
static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	__pragma_loop_unroll_full
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	__pragma_loop_unroll_full
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

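/* Strip the encapsulation (everything between the Ethernet header and the
 * inner packet) and bounce the result back to the ingress path of the same
 * interface, so the local network stack processes it.
 */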
static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Changing the ethertype if the encapsulated packet is ipv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

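/* Rewrite the encapsulation from GUE to a bare GRE-in-IPv4 header towards
 * next_hop. The inner TTL / hop limit is decremented first so that a
 * misconfigured hop list cannot produce an endless forwarding loop.
 */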
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. Since the TTL (hop limit for IPv6) is the
	 * only field we care about, it is easier to use bpf_skb_load_bytes and
	 * bpf_skb_store_bytes, as they handle split packets if needed (no need
	 * for the data to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, this helper only works on 2- and 4-byte arguments (the
		 * result is the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

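/* Forward the (still encapsulated) packet to next_hop: use plain GRE when
 * this is the last listed hop and the header requests it, otherwise keep
 * the GUE encapsulation and advance the next_hop index.
 */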
static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

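/* Skip over n IPv4 next hop entries in the packet. Only n == 0 and n == 1
 * are supported; anything else is dropped. Note the intentional fallthrough
 * from case 1 into case 0.
 */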
static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
				   encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *	fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

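/* Classify a TCP packet against the local socket table: a socket that is
 * not in LISTEN state means the flow is ESTABLISHED here; a listening
 * socket whose SYN cookie validates against the packet means SYN_COOKIE;
 * anything else is UNKNOWN (and may be forwarded by the caller).
 */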
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

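/* Classify a UDP packet: a connected UDP socket (reported by the kernel as
 * BPF_TCP_ESTABLISHED) means ESTABLISHED, otherwise UNKNOWN.
 */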
static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

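/* Classify the flow referenced by an ICMP error payload by looking up the
 * original TCP or UDP flow it was generated for.
 */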
static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

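/* Parse the inner TCP header and classify the packet. SYNs are reported
 * separately so the caller can decide between accepting them locally and
 * forwarding them (see the forward_syn handling in cls_redirect()).
 */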
static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

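/* Parse the inner IPv4 header and dispatch on the L4 protocol. Fragments
 * cannot be classified (the L4 header may be missing) and are dropped.
 */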
static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

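/* Parse the inner IPv6 header, including any extension headers, and
 * dispatch on the upper-layer protocol. Fragments are dropped, as for IPv4.
 */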
static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

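/* Main TC classifier: validate the GUE/unigue encapsulation, then either
 * accept the inner packet locally or redirect it to the next hop listed in
 * the encapsulation header.
 */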
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}

/* Source: linux/tools/testing/selftests/bpf/progs/test_cls_redirect.c */