// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)

static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

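/* Read the IPv4 header at *offset from the packet dynptr and advance
 * *offset past the header, including any IP options.
 */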
static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

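/* Recompute the checksum of an options-less IPv4 header in place. */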
static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}

static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

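/* Read the fixed IPv6 header at *offset, skip any extension headers, and
 * report the upper layer protocol and whether the packet is a fragment.
 */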
static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

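/* Strip the encapsulation headers (the Ethernet header stays in place) and
 * redirect the inner packet to the ingress path of the interface it arrived
 * on, so the local stack processes it.
 */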
static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Changing the ethertype if the encapsulated packet is ipv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

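/* Rewrite the GUE encapsulation into a GRE tunnel towards next_hop and
 * redirect the packet out of the same interface. The inner TTL (hop limit
 * for IPv6) is decremented first as protection against forwarding loops.
 */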
static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2- and 4-byte arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

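/* Advance *offset over n next hop addresses. Only n == 0 and n == 1 are
 * supported (note the intentional fallthrough from case 1 to case 0);
 * anything else is treated as malformed and dropped.
 */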
static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * *offset points just past the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
			   uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

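/* Classify a TCP packet against local sockets: a match on a non-listening
 * socket means ESTABLISHED, a valid SYN cookie ACK means SYN_COOKIE,
 * anything else is UNKNOWN.
 */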
static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

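/* Handle an encapsulated ICMPv4 message: echo requests return ECHO_REQUEST,
 * "fragmentation needed" errors are classified against the embedded
 * original flow, everything else is flagged INVALID.
 */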
static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

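/* Same as process_icmpv4, but for ICMPv6 and "packet too big" errors. */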
static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}

static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

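/* Main entry point: validate the GUE encapsulation, then either accept the
 * inner packet locally or forward it to the next hop taken from the variable
 * length unigue header.
 */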
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}

/* Source: tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c */