1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | // Copyright (c) 2019, 2020 Cloudflare |
3 | |
4 | #include <stdbool.h> |
5 | #include <stddef.h> |
6 | #include <stdint.h> |
7 | #include <string.h> |
8 | |
9 | #include <linux/bpf.h> |
10 | #include <linux/icmp.h> |
11 | #include <linux/icmpv6.h> |
12 | #include <linux/if_ether.h> |
13 | #include <linux/in.h> |
14 | #include <linux/ip.h> |
15 | #include <linux/ipv6.h> |
16 | #include <linux/pkt_cls.h> |
17 | #include <linux/tcp.h> |
18 | #include <linux/udp.h> |
19 | |
20 | #include <bpf/bpf_helpers.h> |
21 | #include <bpf/bpf_endian.h> |
22 | |
23 | #include "test_cls_redirect.h" |
24 | #include "bpf_kfuncs.h" |
25 | |
/* The encapsulation headers are packed; taking member addresses is deliberate. */
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

/* Offset of the first byte *after* MEMBER inside TYPE. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

/* IPv4 frag_off field: 13-bit fragment offset and the More-Fragments flag.
 * Compared against bpf_htons()-converted header fields below.
 */
#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)
33 | |
/* Dual license so GPL-only BPF helpers remain callable. */
char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 * Loaded as read-only config: userspace sets these before program load.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;
41 | |
/* Per-CPU counters exported through metrics_map; userspace aggregates them.
 * Grouped as: traffic totals, accept/forward outcomes, and error classes.
 */
typedef struct {
	uint64_t processed_packets_total;
	/* L3/L4 protocol breakdown of processed packets. */
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	/* Packets delivered to the local stack, by verdict. */
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	/* Packets re-encapsulated and forwarded to the next hop. */
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	/* Drop reasons; each corresponds to a TC_ACT_SHOT or INVALID path. */
	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;
72 | |
/* Classification result for a decapsulated packet. INVALID means drop
 * (the responsible metric has already been bumped); UNKNOWN means no local
 * socket claims the flow, so it is forwarded to the next hop.
 */
typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;
81 | |
/* L4 source/destination port pair, laid out to overlay the sport/dport
 * members of struct bpf_sock_tuple (see the static asserts below).
 */
typedef struct {
	uint16_t src, dst;
} flow_ports_t;
85 | |
86 | _Static_assert( |
87 | sizeof(flow_ports_t) != |
88 | offsetofend(struct bpf_sock_tuple, ipv4.dport) - |
89 | offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, |
90 | "flow_ports_t must match sport and dport in struct bpf_sock_tuple" ); |
91 | _Static_assert( |
92 | sizeof(flow_ports_t) != |
93 | offsetofend(struct bpf_sock_tuple, ipv6.dport) - |
94 | offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, |
95 | "flow_ports_t must match sport and dport in struct bpf_sock_tuple" ); |
96 | |
/* Points at a parsed struct iphdr or struct ipv6hdr; len disambiguates
 * which one (sizeof(struct iphdr) vs sizeof(struct ipv6hdr)), see fill_tuple().
 */
struct iphdr_info {
	void *hdr;
	__u64 len;
};
101 | |
/* Return type carrying either a TC action (TC_ACT_*, bpf_redirect() result)
 * or the sentinel CONTINUE_PROCESSING below.
 */
typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;
109 | |
/* Convenience macro to call functions which return ret_t.
 * Propagates any verdict other than CONTINUE_PROCESSING to the caller.
 */
#define MAYBE_RETURN(x) \
	do { \
		ret_t __ret = x; \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret; \
	} while (0)
118 | |
119 | static bool ipv4_is_fragment(const struct iphdr *ip) |
120 | { |
121 | uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); |
122 | return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; |
123 | } |
124 | |
125 | static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr) |
126 | { |
127 | if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0)) |
128 | return -1; |
129 | |
130 | *offset += sizeof(*iphdr); |
131 | |
132 | if (iphdr->ihl < 5) |
133 | return -1; |
134 | |
135 | /* skip ipv4 options */ |
136 | *offset += (iphdr->ihl - 5) * 4; |
137 | |
138 | return 0; |
139 | } |
140 | |
141 | /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ |
142 | static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports) |
143 | { |
144 | if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0)) |
145 | return false; |
146 | |
147 | *offset += sizeof(*ports); |
148 | |
149 | /* Ports in the L4 headers are reversed, since we are parsing an ICMP |
150 | * payload which is going towards the eyeball. |
151 | */ |
152 | uint16_t dst = ports->src; |
153 | ports->src = ports->dst; |
154 | ports->dst = dst; |
155 | return true; |
156 | } |
157 | |
/* Fold a 32-bit ones'-complement accumulator into a 16-bit checksum.
 * The highest reasonable value for an IPv4 header checksum requires two
 * folds, so we unconditionally fold twice.
 */
static uint16_t pkt_checksum_fold(uint32_t csum)
{
	for (int i = 0; i < 2; i++)
		csum = (csum & 0xffff) + (csum >> 16);

	return (uint16_t)~csum;
}
167 | |
168 | static void pkt_ipv4_checksum(struct iphdr *iph) |
169 | { |
170 | iph->check = 0; |
171 | |
172 | /* An IP header without options is 20 bytes. Two of those |
173 | * are the checksum, which we always set to zero. Hence, |
174 | * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, |
175 | * which fits in 32 bit. |
176 | */ |
177 | _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes" ); |
178 | uint32_t acc = 0; |
179 | uint16_t *ipw = (uint16_t *)iph; |
180 | |
181 | for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) |
182 | acc += ipw[i]; |
183 | |
184 | iph->check = pkt_checksum_fold(csum: acc); |
185 | } |
186 | |
187 | static bool (struct bpf_dynptr *dynptr, __u64 *offset, |
188 | const struct ipv6hdr *ipv6, uint8_t *upper_proto, |
189 | bool *is_fragment) |
190 | { |
191 | /* We understand five extension headers. |
192 | * https://tools.ietf.org/html/rfc8200#section-4.1 states that all |
193 | * headers should occur once, except Destination Options, which may |
194 | * occur twice. Hence we give up after 6 headers. |
195 | */ |
196 | struct { |
197 | uint8_t next; |
198 | uint8_t len; |
199 | } exthdr = { |
200 | .next = ipv6->nexthdr, |
201 | }; |
202 | *is_fragment = false; |
203 | |
204 | for (int i = 0; i < 6; i++) { |
205 | switch (exthdr.next) { |
206 | case IPPROTO_FRAGMENT: |
207 | *is_fragment = true; |
208 | /* NB: We don't check that hdrlen == 0 as per spec. */ |
209 | /* fallthrough; */ |
210 | |
211 | case IPPROTO_HOPOPTS: |
212 | case IPPROTO_ROUTING: |
213 | case IPPROTO_DSTOPTS: |
214 | case IPPROTO_MH: |
215 | if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0)) |
216 | return false; |
217 | |
218 | /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ |
219 | *offset += (exthdr.len + 1) * 8; |
220 | |
221 | /* Decode next header */ |
222 | break; |
223 | |
224 | default: |
225 | /* The next header is not one of the known extension |
226 | * headers, treat it as the upper layer header. |
227 | * |
228 | * This handles IPPROTO_NONE. |
229 | * |
230 | * Encapsulating Security Payload (50) and Authentication |
231 | * Header (51) also end up here (and will trigger an |
232 | * unknown proto error later). They have a custom header |
233 | * format and seem too esoteric to care about. |
234 | */ |
235 | *upper_proto = exthdr.next; |
236 | return true; |
237 | } |
238 | } |
239 | |
240 | /* We never found an upper layer header. */ |
241 | return false; |
242 | } |
243 | |
244 | static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6, |
245 | uint8_t *proto, bool *is_fragment) |
246 | { |
247 | if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0)) |
248 | return -1; |
249 | |
250 | *offset += sizeof(*ipv6); |
251 | |
252 | if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, upper_proto: proto, is_fragment)) |
253 | return -1; |
254 | |
255 | return 0; |
256 | } |
257 | |
/* Global metrics, per CPU
 * Single-entry per-CPU array; each CPU increments its own copy lock-free and
 * userspace sums across CPUs when reading.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");
266 | |
267 | static metrics_t *get_global_metrics(void) |
268 | { |
269 | uint64_t key = 0; |
270 | return bpf_map_lookup_elem(&metrics_map, &key); |
271 | } |
272 | |
273 | static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) |
274 | { |
275 | const int payload_off = |
276 | sizeof(*encap) + |
277 | sizeof(struct in_addr) * encap->unigue.hop_count; |
278 | int32_t encap_overhead = payload_off - sizeof(struct ethhdr); |
279 | |
280 | /* Changing the ethertype if the encapsulated packet is ipv6 */ |
281 | if (encap->gue.proto_ctype == IPPROTO_IPV6) |
282 | encap->eth.h_proto = bpf_htons(ETH_P_IPV6); |
283 | |
284 | if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, |
285 | BPF_F_ADJ_ROOM_FIXED_GSO | |
286 | BPF_F_ADJ_ROOM_NO_CSUM_RESET) || |
287 | bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) |
288 | return TC_ACT_SHOT; |
289 | |
290 | return bpf_redirect(skb->ifindex, BPF_F_INGRESS); |
291 | } |
292 | |
293 | static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr, |
294 | encap_headers_t *encap, struct in_addr *next_hop, |
295 | metrics_t *metrics) |
296 | { |
297 | const int payload_off = |
298 | sizeof(*encap) + |
299 | sizeof(struct in_addr) * encap->unigue.hop_count; |
300 | int32_t encap_overhead = |
301 | payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); |
302 | int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; |
303 | __u8 encap_buffer[sizeof(encap_gre_t)] = {}; |
304 | uint16_t proto = ETH_P_IP; |
305 | uint32_t mtu_len = 0; |
306 | encap_gre_t *encap_gre; |
307 | |
308 | metrics->forwarded_packets_total_gre++; |
309 | |
310 | /* Loop protection: the inner packet's TTL is decremented as a safeguard |
311 | * against any forwarding loop. As the only interesting field is the TTL |
312 | * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes |
313 | * as they handle the split packets if needed (no need for the data to be |
314 | * in the linear section). |
315 | */ |
316 | if (encap->gue.proto_ctype == IPPROTO_IPV6) { |
317 | proto = ETH_P_IPV6; |
318 | uint8_t ttl; |
319 | int rc; |
320 | |
321 | rc = bpf_skb_load_bytes( |
322 | skb, payload_off + offsetof(struct ipv6hdr, hop_limit), |
323 | &ttl, 1); |
324 | if (rc != 0) { |
325 | metrics->errors_total_malformed_encapsulation++; |
326 | return TC_ACT_SHOT; |
327 | } |
328 | |
329 | if (ttl == 0) { |
330 | metrics->errors_total_redirect_loop++; |
331 | return TC_ACT_SHOT; |
332 | } |
333 | |
334 | ttl--; |
335 | rc = bpf_skb_store_bytes( |
336 | skb, payload_off + offsetof(struct ipv6hdr, hop_limit), |
337 | &ttl, 1, 0); |
338 | if (rc != 0) { |
339 | metrics->errors_total_malformed_encapsulation++; |
340 | return TC_ACT_SHOT; |
341 | } |
342 | } else { |
343 | uint8_t ttl; |
344 | int rc; |
345 | |
346 | rc = bpf_skb_load_bytes( |
347 | skb, payload_off + offsetof(struct iphdr, ttl), &ttl, |
348 | 1); |
349 | if (rc != 0) { |
350 | metrics->errors_total_malformed_encapsulation++; |
351 | return TC_ACT_SHOT; |
352 | } |
353 | |
354 | if (ttl == 0) { |
355 | metrics->errors_total_redirect_loop++; |
356 | return TC_ACT_SHOT; |
357 | } |
358 | |
359 | /* IPv4 also has a checksum to patch. While the TTL is only one byte, |
360 | * this function only works for 2 and 4 bytes arguments (the result is |
361 | * the same). |
362 | */ |
363 | rc = bpf_l3_csum_replace( |
364 | skb, payload_off + offsetof(struct iphdr, check), ttl, |
365 | ttl - 1, 2); |
366 | if (rc != 0) { |
367 | metrics->errors_total_malformed_encapsulation++; |
368 | return TC_ACT_SHOT; |
369 | } |
370 | |
371 | ttl--; |
372 | rc = bpf_skb_store_bytes( |
373 | skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, |
374 | 0); |
375 | if (rc != 0) { |
376 | metrics->errors_total_malformed_encapsulation++; |
377 | return TC_ACT_SHOT; |
378 | } |
379 | } |
380 | |
381 | if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) { |
382 | metrics->errors_total_encap_mtu_violate++; |
383 | return TC_ACT_SHOT; |
384 | } |
385 | |
386 | if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, |
387 | BPF_F_ADJ_ROOM_FIXED_GSO | |
388 | BPF_F_ADJ_ROOM_NO_CSUM_RESET) || |
389 | bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { |
390 | metrics->errors_total_encap_adjust_failed++; |
391 | return TC_ACT_SHOT; |
392 | } |
393 | |
394 | if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { |
395 | metrics->errors_total_encap_buffer_too_small++; |
396 | return TC_ACT_SHOT; |
397 | } |
398 | |
399 | encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer)); |
400 | if (!encap_gre) { |
401 | metrics->errors_total_encap_buffer_too_small++; |
402 | return TC_ACT_SHOT; |
403 | } |
404 | |
405 | encap_gre->ip.protocol = IPPROTO_GRE; |
406 | encap_gre->ip.daddr = next_hop->s_addr; |
407 | encap_gre->ip.saddr = ENCAPSULATION_IP; |
408 | encap_gre->ip.tot_len = |
409 | bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); |
410 | encap_gre->gre.flags = 0; |
411 | encap_gre->gre.protocol = bpf_htons(proto); |
412 | pkt_ipv4_checksum(iph: (void *)&encap_gre->ip); |
413 | |
414 | if (encap_gre == encap_buffer) |
415 | bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0); |
416 | |
417 | return bpf_redirect(skb->ifindex, 0); |
418 | } |
419 | |
420 | static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr, |
421 | encap_headers_t *encap, struct in_addr *next_hop, |
422 | metrics_t *metrics) |
423 | { |
424 | /* swap L2 addresses */ |
425 | /* This assumes that packets are received from a router. |
426 | * So just swapping the MAC addresses here will make the packet go back to |
427 | * the router, which will send it to the appropriate machine. |
428 | */ |
429 | unsigned char temp[ETH_ALEN]; |
430 | memcpy(temp, encap->eth.h_dest, sizeof(temp)); |
431 | memcpy(encap->eth.h_dest, encap->eth.h_source, |
432 | sizeof(encap->eth.h_dest)); |
433 | memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); |
434 | |
435 | if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && |
436 | encap->unigue.last_hop_gre) { |
437 | return forward_with_gre(skb, dynptr, encap, next_hop, metrics); |
438 | } |
439 | |
440 | metrics->forwarded_packets_total_gue++; |
441 | uint32_t old_saddr = encap->ip.saddr; |
442 | encap->ip.saddr = encap->ip.daddr; |
443 | encap->ip.daddr = next_hop->s_addr; |
444 | if (encap->unigue.next_hop < encap->unigue.hop_count) { |
445 | encap->unigue.next_hop++; |
446 | } |
447 | |
448 | /* Remove ip->saddr, add next_hop->s_addr */ |
449 | const uint64_t off = offsetof(typeof(*encap), ip.check); |
450 | int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); |
451 | if (ret < 0) { |
452 | return TC_ACT_SHOT; |
453 | } |
454 | |
455 | return bpf_redirect(skb->ifindex, 0); |
456 | } |
457 | |
458 | static ret_t skip_next_hops(__u64 *offset, int n) |
459 | { |
460 | switch (n) { |
461 | case 1: |
462 | *offset += sizeof(struct in_addr); |
463 | case 0: |
464 | return CONTINUE_PROCESSING; |
465 | |
466 | default: |
467 | return TC_ACT_SHOT; |
468 | } |
469 | } |
470 | |
471 | /* Get the next hop from the GLB header. |
472 | * |
473 | * Sets next_hop->s_addr to 0 if there are no more hops left. |
474 | * pkt is positioned just after the variable length GLB header |
475 | * iff the call is successful. |
476 | */ |
477 | static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap, |
478 | struct in_addr *next_hop) |
479 | { |
480 | if (encap->unigue.next_hop > encap->unigue.hop_count) |
481 | return TC_ACT_SHOT; |
482 | |
483 | /* Skip "used" next hops. */ |
484 | MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop)); |
485 | |
486 | if (encap->unigue.next_hop == encap->unigue.hop_count) { |
487 | /* No more next hops, we are at the end of the GLB header. */ |
488 | next_hop->s_addr = 0; |
489 | return CONTINUE_PROCESSING; |
490 | } |
491 | |
492 | if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0)) |
493 | return TC_ACT_SHOT; |
494 | |
495 | *offset += sizeof(*next_hop); |
496 | |
497 | /* Skip the remainig next hops (may be zero). */ |
498 | return skip_next_hops(offset, n: encap->unigue.hop_count - encap->unigue.next_hop - 1); |
499 | } |
500 | |
/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
511 | static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph, |
512 | uint64_t iphlen, uint16_t sport, uint16_t dport) |
513 | { |
514 | switch (iphlen) { |
515 | case sizeof(struct iphdr): { |
516 | struct iphdr *ipv4 = (struct iphdr *)iph; |
517 | tuple->ipv4.daddr = ipv4->daddr; |
518 | tuple->ipv4.saddr = ipv4->saddr; |
519 | tuple->ipv4.sport = sport; |
520 | tuple->ipv4.dport = dport; |
521 | return sizeof(tuple->ipv4); |
522 | } |
523 | |
524 | case sizeof(struct ipv6hdr): { |
525 | struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph; |
526 | memcpy(&tuple->ipv6.daddr, &ipv6->daddr, |
527 | sizeof(tuple->ipv6.daddr)); |
528 | memcpy(&tuple->ipv6.saddr, &ipv6->saddr, |
529 | sizeof(tuple->ipv6.saddr)); |
530 | tuple->ipv6.sport = sport; |
531 | tuple->ipv6.dport = dport; |
532 | return sizeof(tuple->ipv6); |
533 | } |
534 | |
535 | default: |
536 | return 0; |
537 | } |
538 | } |
539 | |
540 | static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, |
541 | uint64_t tuplen, void *iph, struct tcphdr *tcp) |
542 | { |
543 | struct bpf_sock *sk = |
544 | bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); |
545 | |
546 | if (sk == NULL) |
547 | return UNKNOWN; |
548 | |
549 | if (sk->state != BPF_TCP_LISTEN) { |
550 | bpf_sk_release(sk); |
551 | return ESTABLISHED; |
552 | } |
553 | |
554 | if (iph != NULL && tcp != NULL) { |
555 | /* Kludge: we've run out of arguments, but need the length of the ip header. */ |
556 | uint64_t iphlen = sizeof(struct iphdr); |
557 | |
558 | if (tuplen == sizeof(tuple->ipv6)) |
559 | iphlen = sizeof(struct ipv6hdr); |
560 | |
561 | if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, |
562 | sizeof(*tcp)) == 0) { |
563 | bpf_sk_release(sk); |
564 | return SYN_COOKIE; |
565 | } |
566 | } |
567 | |
568 | bpf_sk_release(sk); |
569 | return UNKNOWN; |
570 | } |
571 | |
572 | static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen) |
573 | { |
574 | struct bpf_sock *sk = |
575 | bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); |
576 | |
577 | if (sk == NULL) |
578 | return UNKNOWN; |
579 | |
580 | if (sk->state == BPF_TCP_ESTABLISHED) { |
581 | bpf_sk_release(sk); |
582 | return ESTABLISHED; |
583 | } |
584 | |
585 | bpf_sk_release(sk); |
586 | return UNKNOWN; |
587 | } |
588 | |
589 | static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple, |
590 | uint64_t tuplen, metrics_t *metrics) |
591 | { |
592 | switch (proto) { |
593 | case IPPROTO_TCP: |
594 | return classify_tcp(skb, tuple, tuplen, NULL, NULL); |
595 | |
596 | case IPPROTO_UDP: |
597 | return classify_udp(skb, tuple, tuplen); |
598 | |
599 | default: |
600 | metrics->errors_total_malformed_icmp++; |
601 | return INVALID; |
602 | } |
603 | } |
604 | |
605 | static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset, |
606 | metrics_t *metrics) |
607 | { |
608 | struct icmphdr icmp; |
609 | struct iphdr ipv4; |
610 | |
611 | if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) { |
612 | metrics->errors_total_malformed_icmp++; |
613 | return INVALID; |
614 | } |
615 | |
616 | *offset += sizeof(icmp); |
617 | |
618 | /* We should never receive encapsulated echo replies. */ |
619 | if (icmp.type == ICMP_ECHOREPLY) { |
620 | metrics->errors_total_icmp_echo_replies++; |
621 | return INVALID; |
622 | } |
623 | |
624 | if (icmp.type == ICMP_ECHO) |
625 | return ECHO_REQUEST; |
626 | |
627 | if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { |
628 | metrics->errors_total_unwanted_icmp++; |
629 | return INVALID; |
630 | } |
631 | |
632 | if (pkt_parse_ipv4(dynptr, offset, iphdr: &ipv4)) { |
633 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
634 | return INVALID; |
635 | } |
636 | |
637 | /* The source address in the outer IP header is from the entity that |
638 | * originated the ICMP message. Use the original IP header to restore |
639 | * the correct flow tuple. |
640 | */ |
641 | struct bpf_sock_tuple tuple; |
642 | tuple.ipv4.saddr = ipv4.daddr; |
643 | tuple.ipv4.daddr = ipv4.saddr; |
644 | |
645 | if (!pkt_parse_icmp_l4_ports(dynptr, offset, ports: (flow_ports_t *)&tuple.ipv4.sport)) { |
646 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
647 | return INVALID; |
648 | } |
649 | |
650 | return classify_icmp(skb, proto: ipv4.protocol, tuple: &tuple, |
651 | tuplen: sizeof(tuple.ipv4), metrics); |
652 | } |
653 | |
654 | static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, |
655 | metrics_t *metrics) |
656 | { |
657 | struct bpf_sock_tuple tuple; |
658 | struct ipv6hdr ipv6; |
659 | struct icmp6hdr icmp6; |
660 | bool is_fragment; |
661 | uint8_t l4_proto; |
662 | |
663 | if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) { |
664 | metrics->errors_total_malformed_icmp++; |
665 | return INVALID; |
666 | } |
667 | |
668 | /* We should never receive encapsulated echo replies. */ |
669 | if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { |
670 | metrics->errors_total_icmp_echo_replies++; |
671 | return INVALID; |
672 | } |
673 | |
674 | if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { |
675 | return ECHO_REQUEST; |
676 | } |
677 | |
678 | if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { |
679 | metrics->errors_total_unwanted_icmp++; |
680 | return INVALID; |
681 | } |
682 | |
683 | if (pkt_parse_ipv6(dynptr, offset, ipv6: &ipv6, proto: &l4_proto, is_fragment: &is_fragment)) { |
684 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
685 | return INVALID; |
686 | } |
687 | |
688 | if (is_fragment) { |
689 | metrics->errors_total_fragmented_ip++; |
690 | return INVALID; |
691 | } |
692 | |
693 | /* Swap source and dest addresses. */ |
694 | memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr)); |
695 | memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr)); |
696 | |
697 | if (!pkt_parse_icmp_l4_ports(dynptr, offset, ports: (flow_ports_t *)&tuple.ipv6.sport)) { |
698 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
699 | return INVALID; |
700 | } |
701 | |
702 | return classify_icmp(skb, proto: l4_proto, tuple: &tuple, tuplen: sizeof(tuple.ipv6), |
703 | metrics); |
704 | } |
705 | |
706 | static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, |
707 | struct iphdr_info *info, metrics_t *metrics) |
708 | { |
709 | struct bpf_sock_tuple tuple; |
710 | struct tcphdr tcp; |
711 | uint64_t tuplen; |
712 | |
713 | metrics->l4_protocol_packets_total_tcp++; |
714 | |
715 | if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) { |
716 | metrics->errors_total_malformed_tcp++; |
717 | return INVALID; |
718 | } |
719 | |
720 | *offset += sizeof(tcp); |
721 | |
722 | if (tcp.syn) |
723 | return SYN; |
724 | |
725 | tuplen = fill_tuple(tuple: &tuple, iph: info->hdr, iphlen: info->len, sport: tcp.source, dport: tcp.dest); |
726 | return classify_tcp(skb, tuple: &tuple, tuplen, iph: info->hdr, tcp: &tcp); |
727 | } |
728 | |
729 | static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb, |
730 | struct iphdr_info *info, metrics_t *metrics) |
731 | { |
732 | struct bpf_sock_tuple tuple; |
733 | struct udphdr udph; |
734 | uint64_t tuplen; |
735 | |
736 | metrics->l4_protocol_packets_total_udp++; |
737 | |
738 | if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) { |
739 | metrics->errors_total_malformed_udp++; |
740 | return INVALID; |
741 | } |
742 | *offset += sizeof(udph); |
743 | |
744 | tuplen = fill_tuple(tuple: &tuple, iph: info->hdr, iphlen: info->len, sport: udph.source, dport: udph.dest); |
745 | return classify_udp(skb, tuple: &tuple, tuplen); |
746 | } |
747 | |
748 | static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, |
749 | __u64 *offset, metrics_t *metrics) |
750 | { |
751 | struct iphdr ipv4; |
752 | struct iphdr_info info = { |
753 | .hdr = &ipv4, |
754 | .len = sizeof(ipv4), |
755 | }; |
756 | |
757 | metrics->l3_protocol_packets_total_ipv4++; |
758 | |
759 | if (pkt_parse_ipv4(dynptr, offset, iphdr: &ipv4)) { |
760 | metrics->errors_total_malformed_ip++; |
761 | return INVALID; |
762 | } |
763 | |
764 | if (ipv4.version != 4) { |
765 | metrics->errors_total_malformed_ip++; |
766 | return INVALID; |
767 | } |
768 | |
769 | if (ipv4_is_fragment(ip: &ipv4)) { |
770 | metrics->errors_total_fragmented_ip++; |
771 | return INVALID; |
772 | } |
773 | |
774 | switch (ipv4.protocol) { |
775 | case IPPROTO_ICMP: |
776 | return process_icmpv4(skb, dynptr, offset, metrics); |
777 | |
778 | case IPPROTO_TCP: |
779 | return process_tcp(dynptr, offset, skb, info: &info, metrics); |
780 | |
781 | case IPPROTO_UDP: |
782 | return process_udp(dynptr, offset, skb, info: &info, metrics); |
783 | |
784 | default: |
785 | metrics->errors_total_unknown_l4_proto++; |
786 | return INVALID; |
787 | } |
788 | } |
789 | |
790 | static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr, |
791 | __u64 *offset, metrics_t *metrics) |
792 | { |
793 | struct ipv6hdr ipv6; |
794 | struct iphdr_info info = { |
795 | .hdr = &ipv6, |
796 | .len = sizeof(ipv6), |
797 | }; |
798 | uint8_t l4_proto; |
799 | bool is_fragment; |
800 | |
801 | metrics->l3_protocol_packets_total_ipv6++; |
802 | |
803 | if (pkt_parse_ipv6(dynptr, offset, ipv6: &ipv6, proto: &l4_proto, is_fragment: &is_fragment)) { |
804 | metrics->errors_total_malformed_ip++; |
805 | return INVALID; |
806 | } |
807 | |
808 | if (ipv6.version != 6) { |
809 | metrics->errors_total_malformed_ip++; |
810 | return INVALID; |
811 | } |
812 | |
813 | if (is_fragment) { |
814 | metrics->errors_total_fragmented_ip++; |
815 | return INVALID; |
816 | } |
817 | |
818 | switch (l4_proto) { |
819 | case IPPROTO_ICMPV6: |
820 | return process_icmpv6(dynptr, offset, skb, metrics); |
821 | |
822 | case IPPROTO_TCP: |
823 | return process_tcp(dynptr, offset, skb, info: &info, metrics); |
824 | |
825 | case IPPROTO_UDP: |
826 | return process_udp(dynptr, offset, skb, info: &info, metrics); |
827 | |
828 | default: |
829 | metrics->errors_total_unknown_l4_proto++; |
830 | return INVALID; |
831 | } |
832 | } |
833 | |
834 | SEC("tc" ) |
835 | int cls_redirect(struct __sk_buff *skb) |
836 | { |
837 | __u8 encap_buffer[sizeof(encap_headers_t)] = {}; |
838 | struct bpf_dynptr dynptr; |
839 | struct in_addr next_hop; |
840 | /* Tracks offset of the dynptr. This will be unnecessary once |
841 | * bpf_dynptr_advance() is available. |
842 | */ |
843 | __u64 off = 0; |
844 | ret_t ret; |
845 | |
846 | bpf_dynptr_from_skb(skb, 0, &dynptr); |
847 | |
848 | metrics_t *metrics = get_global_metrics(); |
849 | if (metrics == NULL) |
850 | return TC_ACT_SHOT; |
851 | |
852 | metrics->processed_packets_total++; |
853 | |
854 | /* Pass bogus packets as long as we're not sure they're |
855 | * destined for us. |
856 | */ |
857 | if (skb->protocol != bpf_htons(ETH_P_IP)) |
858 | return TC_ACT_OK; |
859 | |
860 | encap_headers_t *encap; |
861 | |
862 | /* Make sure that all encapsulation headers are available in |
863 | * the linear portion of the skb. This makes it easy to manipulate them. |
864 | */ |
865 | if (bpf_skb_pull_data(skb, sizeof(*encap))) |
866 | return TC_ACT_OK; |
867 | |
868 | encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer)); |
869 | if (!encap) |
870 | return TC_ACT_OK; |
871 | |
872 | off += sizeof(*encap); |
873 | |
874 | if (encap->ip.ihl != 5) |
875 | /* We never have any options. */ |
876 | return TC_ACT_OK; |
877 | |
878 | if (encap->ip.daddr != ENCAPSULATION_IP || |
879 | encap->ip.protocol != IPPROTO_UDP) |
880 | return TC_ACT_OK; |
881 | |
882 | /* TODO Check UDP length? */ |
883 | if (encap->udp.dest != ENCAPSULATION_PORT) |
884 | return TC_ACT_OK; |
885 | |
886 | /* We now know that the packet is destined to us, we can |
887 | * drop bogus ones. |
888 | */ |
889 | if (ipv4_is_fragment(ip: (void *)&encap->ip)) { |
890 | metrics->errors_total_fragmented_ip++; |
891 | return TC_ACT_SHOT; |
892 | } |
893 | |
894 | if (encap->gue.variant != 0) { |
895 | metrics->errors_total_malformed_encapsulation++; |
896 | return TC_ACT_SHOT; |
897 | } |
898 | |
899 | if (encap->gue.control != 0) { |
900 | metrics->errors_total_malformed_encapsulation++; |
901 | return TC_ACT_SHOT; |
902 | } |
903 | |
904 | if (encap->gue.flags != 0) { |
905 | metrics->errors_total_malformed_encapsulation++; |
906 | return TC_ACT_SHOT; |
907 | } |
908 | |
909 | if (encap->gue.hlen != |
910 | sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { |
911 | metrics->errors_total_malformed_encapsulation++; |
912 | return TC_ACT_SHOT; |
913 | } |
914 | |
915 | if (encap->unigue.version != 0) { |
916 | metrics->errors_total_malformed_encapsulation++; |
917 | return TC_ACT_SHOT; |
918 | } |
919 | |
920 | if (encap->unigue.reserved != 0) |
921 | return TC_ACT_SHOT; |
922 | |
923 | MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop)); |
924 | |
925 | if (next_hop.s_addr == 0) { |
926 | metrics->accepted_packets_total_last_hop++; |
927 | return accept_locally(skb, encap); |
928 | } |
929 | |
930 | verdict_t verdict; |
931 | switch (encap->gue.proto_ctype) { |
932 | case IPPROTO_IPIP: |
933 | verdict = process_ipv4(skb, dynptr: &dynptr, offset: &off, metrics); |
934 | break; |
935 | |
936 | case IPPROTO_IPV6: |
937 | verdict = process_ipv6(skb, dynptr: &dynptr, offset: &off, metrics); |
938 | break; |
939 | |
940 | default: |
941 | metrics->errors_total_unknown_l3_proto++; |
942 | return TC_ACT_SHOT; |
943 | } |
944 | |
945 | switch (verdict) { |
946 | case INVALID: |
947 | /* metrics have already been bumped */ |
948 | return TC_ACT_SHOT; |
949 | |
950 | case UNKNOWN: |
951 | return forward_to_next_hop(skb, dynptr: &dynptr, encap, next_hop: &next_hop, metrics); |
952 | |
953 | case ECHO_REQUEST: |
954 | metrics->accepted_packets_total_icmp_echo_request++; |
955 | break; |
956 | |
957 | case SYN: |
958 | if (encap->unigue.forward_syn) { |
959 | return forward_to_next_hop(skb, dynptr: &dynptr, encap, next_hop: &next_hop, |
960 | metrics); |
961 | } |
962 | |
963 | metrics->accepted_packets_total_syn++; |
964 | break; |
965 | |
966 | case SYN_COOKIE: |
967 | metrics->accepted_packets_total_syn_cookies++; |
968 | break; |
969 | |
970 | case ESTABLISHED: |
971 | metrics->accepted_packets_total_established++; |
972 | break; |
973 | } |
974 | |
975 | ret = accept_locally(skb, encap); |
976 | |
977 | if (encap == encap_buffer) |
978 | bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0); |
979 | |
980 | return ret; |
981 | } |
982 | |