1 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause |
2 | // Copyright (c) 2019, 2020 Cloudflare |
3 | |
4 | #include <stdbool.h> |
5 | #include <stddef.h> |
6 | #include <stdint.h> |
7 | #include <string.h> |
8 | |
9 | #include <linux/bpf.h> |
10 | #include <linux/icmp.h> |
11 | #include <linux/icmpv6.h> |
12 | #include <linux/if_ether.h> |
13 | #include <linux/in.h> |
14 | #include <linux/ip.h> |
15 | #include <linux/ipv6.h> |
16 | #include <linux/pkt_cls.h> |
17 | #include <linux/tcp.h> |
18 | #include <linux/udp.h> |
19 | |
20 | #include <bpf/bpf_helpers.h> |
21 | #include <bpf/bpf_endian.h> |
22 | |
23 | #include "bpf_compiler.h" |
24 | #include "test_cls_redirect.h" |
25 | |
/* Silence the diagnostic about taking the address of a packed struct member. */
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

/* INLINING selects how the helpers marked with it are emitted: when the file
 * is built with SUBPROGS defined they stay separate functions (__noinline),
 * otherwise they are forced inline (__always_inline).
 */
#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif
33 | |
/* Offset, in bytes, of the first byte past MEMBER inside TYPE. */
#define offsetofend(TYPE, MEMBER) \
	(sizeof(((TYPE *)0)->MEMBER) + offsetof(TYPE, MEMBER))
36 | |
/* Host-order masks for the IPv4 frag_off field. ipv4_is_fragment() converts
 * them with bpf_htons() before applying them to the header value.
 */
#define IP_OFFSET_MASK (0x1FFF) /* fragment offset bits */
#define IP_MF (0x2000) /* "more fragments" flag */
39 | |
/* License string, placed in the "license" section of the object file. */
char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 *
 * cls_redirect() compares both directly against fields of the outer packet
 * headers, so they are kept in network byte order (__be16 / __be32).
 * ENCAPSULATION_IP is also written as the source address of the GRE header
 * in forward_with_gre().
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;
47 | |
/* Counters maintained by the classifier. One metrics_t is the value type of
 * metrics_map; the code in this file only ever increments the fields.
 */
typedef struct {
	/* Traffic counters. */
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	/* Packets handed to accept_locally(), by reason. */
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	/* Packets forwarded to the next hop, by encapsulation. */
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	/* Error counters, bumped right before a packet is rejected. */
	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;
78 | |
/* Result of classifying the inner packet; cls_redirect() maps each value to
 * an action.
 */
typedef enum {
	INVALID = 0,	/* dropped; the error metric was bumped by the classifier */
	UNKNOWN,	/* forwarded to the next hop */
	ECHO_REQUEST,	/* accepted locally */
	SYN,		/* forwarded if unigue.forward_syn is set, else accepted */
	SYN_COOKIE,	/* accepted locally */
	ESTABLISHED,	/* accepted locally */
} verdict_t;
87 | |
88 | typedef struct { |
89 | uint16_t src, dst; |
90 | } flow_ports_t; |
91 | |
92 | _Static_assert( |
93 | sizeof(flow_ports_t) != |
94 | offsetofend(struct bpf_sock_tuple, ipv4.dport) - |
95 | offsetof(struct bpf_sock_tuple, ipv4.sport) - 1, |
96 | "flow_ports_t must match sport and dport in struct bpf_sock_tuple" ); |
97 | _Static_assert( |
98 | sizeof(flow_ports_t) != |
99 | offsetofend(struct bpf_sock_tuple, ipv6.dport) - |
100 | offsetof(struct bpf_sock_tuple, ipv6.sport) - 1, |
101 | "flow_ports_t must match sport and dport in struct bpf_sock_tuple" ); |
102 | |
typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 *
 * Evaluates x exactly once; if the result is anything other than
 * CONTINUE_PROCESSING it is returned from the enclosing function.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = (x);                \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
119 | |
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

/* Cursor over the packet data of an skb. */
typedef struct buf {
	/* skb the cursor reads from. */
	struct __sk_buff *skb;
	/* Current read position; advanced by buf_copy/buf_skip/buf_assign. */
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;
137 | |
/* Returns the distance of buf->head from buf->skb->data, i.e. the current
 * read offset of buf within the skb.
 */
static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *     a - b + c
	 * if c is known:
	 *     r? = c
	 *     r? -= b
	 *     r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	/* off -= skb->data, spelled in asm so the compiler cannot reorder it. */
	asm("%0 -= %1" : "+r" (off) : "r" (buf->skb->data));
	return off;
}
157 | |
158 | static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len) |
159 | { |
160 | if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) { |
161 | return false; |
162 | } |
163 | |
164 | buf->head += len; |
165 | return true; |
166 | } |
167 | |
168 | static __always_inline bool buf_skip(buf_t *buf, const size_t len) |
169 | { |
170 | /* Check whether off + len is valid in the non-linear part. */ |
171 | if (buf_off(buf) + len > buf->skb->len) { |
172 | return false; |
173 | } |
174 | |
175 | buf->head += len; |
176 | return true; |
177 | } |
178 | |
179 | /* Returns a pointer to the start of buf, or NULL if len is |
180 | * larger than the remaining data. Consumes len bytes on a successful |
181 | * call. |
182 | * |
183 | * If scratch is not NULL, the function will attempt to load non-linear |
184 | * data via bpf_skb_load_bytes. On success, scratch is returned. |
185 | */ |
186 | static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch) |
187 | { |
188 | if (buf->head + len > buf->tail) { |
189 | if (scratch == NULL) { |
190 | return NULL; |
191 | } |
192 | |
193 | return buf_copy(buf, dst: scratch, len) ? scratch : NULL; |
194 | } |
195 | |
196 | void *ptr = buf->head; |
197 | buf->head += len; |
198 | return ptr; |
199 | } |
200 | |
201 | static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4) |
202 | { |
203 | if (ipv4->ihl <= 5) { |
204 | return true; |
205 | } |
206 | |
207 | return buf_skip(buf, len: (ipv4->ihl - 5) * 4); |
208 | } |
209 | |
210 | static INLINING bool ipv4_is_fragment(const struct iphdr *ip) |
211 | { |
212 | uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK); |
213 | return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0; |
214 | } |
215 | |
216 | static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch) |
217 | { |
218 | struct iphdr *ipv4 = buf_assign(buf: pkt, len: sizeof(*ipv4), scratch); |
219 | if (ipv4 == NULL) { |
220 | return NULL; |
221 | } |
222 | |
223 | if (ipv4->ihl < 5) { |
224 | return NULL; |
225 | } |
226 | |
227 | if (!pkt_skip_ipv4_options(buf: pkt, ipv4)) { |
228 | return NULL; |
229 | } |
230 | |
231 | return ipv4; |
232 | } |
233 | |
234 | /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */ |
235 | static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports) |
236 | { |
237 | if (!buf_copy(buf: pkt, dst: ports, len: sizeof(*ports))) { |
238 | return false; |
239 | } |
240 | |
241 | /* Ports in the L4 headers are reversed, since we are parsing an ICMP |
242 | * payload which is going towards the eyeball. |
243 | */ |
244 | uint16_t dst = ports->src; |
245 | ports->src = ports->dst; |
246 | ports->dst = dst; |
247 | return true; |
248 | } |
249 | |
250 | static INLINING uint16_t pkt_checksum_fold(uint32_t csum) |
251 | { |
252 | /* The highest reasonable value for an IPv4 header |
253 | * checksum requires two folds, so we just do that always. |
254 | */ |
255 | csum = (csum & 0xffff) + (csum >> 16); |
256 | csum = (csum & 0xffff) + (csum >> 16); |
257 | return (uint16_t)~csum; |
258 | } |
259 | |
260 | static INLINING void pkt_ipv4_checksum(struct iphdr *iph) |
261 | { |
262 | iph->check = 0; |
263 | |
264 | /* An IP header without options is 20 bytes. Two of those |
265 | * are the checksum, which we always set to zero. Hence, |
266 | * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7, |
267 | * which fits in 32 bit. |
268 | */ |
269 | _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes" ); |
270 | uint32_t acc = 0; |
271 | uint16_t *ipw = (uint16_t *)iph; |
272 | |
273 | __pragma_loop_unroll_full |
274 | for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) { |
275 | acc += ipw[i]; |
276 | } |
277 | |
278 | iph->check = pkt_checksum_fold(csum: acc); |
279 | } |
280 | |
281 | static INLINING |
282 | bool (buf_t *pkt, |
283 | const struct ipv6hdr *ipv6, |
284 | uint8_t *upper_proto, |
285 | bool *is_fragment) |
286 | { |
287 | /* We understand five extension headers. |
288 | * https://tools.ietf.org/html/rfc8200#section-4.1 states that all |
289 | * headers should occur once, except Destination Options, which may |
290 | * occur twice. Hence we give up after 6 headers. |
291 | */ |
292 | struct { |
293 | uint8_t next; |
294 | uint8_t len; |
295 | } exthdr = { |
296 | .next = ipv6->nexthdr, |
297 | }; |
298 | *is_fragment = false; |
299 | |
300 | __pragma_loop_unroll_full |
301 | for (int i = 0; i < 6; i++) { |
302 | switch (exthdr.next) { |
303 | case IPPROTO_FRAGMENT: |
304 | *is_fragment = true; |
305 | /* NB: We don't check that hdrlen == 0 as per spec. */ |
306 | /* fallthrough; */ |
307 | |
308 | case IPPROTO_HOPOPTS: |
309 | case IPPROTO_ROUTING: |
310 | case IPPROTO_DSTOPTS: |
311 | case IPPROTO_MH: |
312 | if (!buf_copy(buf: pkt, dst: &exthdr, len: sizeof(exthdr))) { |
313 | return false; |
314 | } |
315 | |
316 | /* hdrlen is in 8-octet units, and excludes the first 8 octets. */ |
317 | if (!buf_skip(buf: pkt, |
318 | len: (exthdr.len + 1) * 8 - sizeof(exthdr))) { |
319 | return false; |
320 | } |
321 | |
322 | /* Decode next header */ |
323 | break; |
324 | |
325 | default: |
326 | /* The next header is not one of the known extension |
327 | * headers, treat it as the upper layer header. |
328 | * |
329 | * This handles IPPROTO_NONE. |
330 | * |
331 | * Encapsulating Security Payload (50) and Authentication |
332 | * Header (51) also end up here (and will trigger an |
333 | * unknown proto error later). They have a custom header |
334 | * format and seem too esoteric to care about. |
335 | */ |
336 | *upper_proto = exthdr.next; |
337 | return true; |
338 | } |
339 | } |
340 | |
341 | /* We never found an upper layer header. */ |
342 | return false; |
343 | } |
344 | |
345 | /* This function has to be inlined, because the verifier otherwise rejects it |
346 | * due to returning a pointer to the stack. This is technically correct, since |
347 | * scratch is allocated on the stack. However, this usage should be safe since |
348 | * it's the callers stack after all. |
349 | */ |
350 | static __always_inline struct ipv6hdr * |
351 | pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto, |
352 | bool *is_fragment) |
353 | { |
354 | struct ipv6hdr *ipv6 = buf_assign(buf: pkt, len: sizeof(*ipv6), scratch); |
355 | if (ipv6 == NULL) { |
356 | return NULL; |
357 | } |
358 | |
359 | if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, upper_proto: proto, is_fragment)) { |
360 | return NULL; |
361 | } |
362 | |
363 | return ipv6; |
364 | } |
365 | |
/* Global metrics, per CPU.
 *
 * Per-CPU array with a single entry whose value is a metrics_t;
 * get_global_metrics() looks that entry up.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");
374 | |
375 | static INLINING metrics_t *get_global_metrics(void) |
376 | { |
377 | uint64_t key = 0; |
378 | return bpf_map_lookup_elem(&metrics_map, &key); |
379 | } |
380 | |
381 | static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap) |
382 | { |
383 | const int payload_off = |
384 | sizeof(*encap) + |
385 | sizeof(struct in_addr) * encap->unigue.hop_count; |
386 | int32_t encap_overhead = payload_off - sizeof(struct ethhdr); |
387 | |
388 | // Changing the ethertype if the encapsulated packet is ipv6 |
389 | if (encap->gue.proto_ctype == IPPROTO_IPV6) { |
390 | encap->eth.h_proto = bpf_htons(ETH_P_IPV6); |
391 | } |
392 | |
393 | if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC, |
394 | BPF_F_ADJ_ROOM_FIXED_GSO | |
395 | BPF_F_ADJ_ROOM_NO_CSUM_RESET) || |
396 | bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC)) |
397 | return TC_ACT_SHOT; |
398 | |
399 | return bpf_redirect(skb->ifindex, BPF_F_INGRESS); |
400 | } |
401 | |
402 | static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap, |
403 | struct in_addr *next_hop, metrics_t *metrics) |
404 | { |
405 | metrics->forwarded_packets_total_gre++; |
406 | |
407 | const int payload_off = |
408 | sizeof(*encap) + |
409 | sizeof(struct in_addr) * encap->unigue.hop_count; |
410 | int32_t encap_overhead = |
411 | payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr); |
412 | int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead; |
413 | uint16_t proto = ETH_P_IP; |
414 | uint32_t mtu_len = 0; |
415 | |
416 | /* Loop protection: the inner packet's TTL is decremented as a safeguard |
417 | * against any forwarding loop. As the only interesting field is the TTL |
418 | * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes |
419 | * as they handle the split packets if needed (no need for the data to be |
420 | * in the linear section). |
421 | */ |
422 | if (encap->gue.proto_ctype == IPPROTO_IPV6) { |
423 | proto = ETH_P_IPV6; |
424 | uint8_t ttl; |
425 | int rc; |
426 | |
427 | rc = bpf_skb_load_bytes( |
428 | skb, payload_off + offsetof(struct ipv6hdr, hop_limit), |
429 | &ttl, 1); |
430 | if (rc != 0) { |
431 | metrics->errors_total_malformed_encapsulation++; |
432 | return TC_ACT_SHOT; |
433 | } |
434 | |
435 | if (ttl == 0) { |
436 | metrics->errors_total_redirect_loop++; |
437 | return TC_ACT_SHOT; |
438 | } |
439 | |
440 | ttl--; |
441 | rc = bpf_skb_store_bytes( |
442 | skb, payload_off + offsetof(struct ipv6hdr, hop_limit), |
443 | &ttl, 1, 0); |
444 | if (rc != 0) { |
445 | metrics->errors_total_malformed_encapsulation++; |
446 | return TC_ACT_SHOT; |
447 | } |
448 | } else { |
449 | uint8_t ttl; |
450 | int rc; |
451 | |
452 | rc = bpf_skb_load_bytes( |
453 | skb, payload_off + offsetof(struct iphdr, ttl), &ttl, |
454 | 1); |
455 | if (rc != 0) { |
456 | metrics->errors_total_malformed_encapsulation++; |
457 | return TC_ACT_SHOT; |
458 | } |
459 | |
460 | if (ttl == 0) { |
461 | metrics->errors_total_redirect_loop++; |
462 | return TC_ACT_SHOT; |
463 | } |
464 | |
465 | /* IPv4 also has a checksum to patch. While the TTL is only one byte, |
466 | * this function only works for 2 and 4 bytes arguments (the result is |
467 | * the same). |
468 | */ |
469 | rc = bpf_l3_csum_replace( |
470 | skb, payload_off + offsetof(struct iphdr, check), ttl, |
471 | ttl - 1, 2); |
472 | if (rc != 0) { |
473 | metrics->errors_total_malformed_encapsulation++; |
474 | return TC_ACT_SHOT; |
475 | } |
476 | |
477 | ttl--; |
478 | rc = bpf_skb_store_bytes( |
479 | skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1, |
480 | 0); |
481 | if (rc != 0) { |
482 | metrics->errors_total_malformed_encapsulation++; |
483 | return TC_ACT_SHOT; |
484 | } |
485 | } |
486 | |
487 | if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) { |
488 | metrics->errors_total_encap_mtu_violate++; |
489 | return TC_ACT_SHOT; |
490 | } |
491 | |
492 | if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET, |
493 | BPF_F_ADJ_ROOM_FIXED_GSO | |
494 | BPF_F_ADJ_ROOM_NO_CSUM_RESET) || |
495 | bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) { |
496 | metrics->errors_total_encap_adjust_failed++; |
497 | return TC_ACT_SHOT; |
498 | } |
499 | |
500 | if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) { |
501 | metrics->errors_total_encap_buffer_too_small++; |
502 | return TC_ACT_SHOT; |
503 | } |
504 | |
505 | buf_t pkt = { |
506 | .skb = skb, |
507 | .head = (uint8_t *)(long)skb->data, |
508 | .tail = (uint8_t *)(long)skb->data_end, |
509 | }; |
510 | |
511 | encap_gre_t *encap_gre = buf_assign(buf: &pkt, len: sizeof(encap_gre_t), NULL); |
512 | if (encap_gre == NULL) { |
513 | metrics->errors_total_encap_buffer_too_small++; |
514 | return TC_ACT_SHOT; |
515 | } |
516 | |
517 | encap_gre->ip.protocol = IPPROTO_GRE; |
518 | encap_gre->ip.daddr = next_hop->s_addr; |
519 | encap_gre->ip.saddr = ENCAPSULATION_IP; |
520 | encap_gre->ip.tot_len = |
521 | bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta); |
522 | encap_gre->gre.flags = 0; |
523 | encap_gre->gre.protocol = bpf_htons(proto); |
524 | pkt_ipv4_checksum(iph: (void *)&encap_gre->ip); |
525 | |
526 | return bpf_redirect(skb->ifindex, 0); |
527 | } |
528 | |
529 | static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap, |
530 | struct in_addr *next_hop, metrics_t *metrics) |
531 | { |
532 | /* swap L2 addresses */ |
533 | /* This assumes that packets are received from a router. |
534 | * So just swapping the MAC addresses here will make the packet go back to |
535 | * the router, which will send it to the appropriate machine. |
536 | */ |
537 | unsigned char temp[ETH_ALEN]; |
538 | memcpy(temp, encap->eth.h_dest, sizeof(temp)); |
539 | memcpy(encap->eth.h_dest, encap->eth.h_source, |
540 | sizeof(encap->eth.h_dest)); |
541 | memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source)); |
542 | |
543 | if (encap->unigue.next_hop == encap->unigue.hop_count - 1 && |
544 | encap->unigue.last_hop_gre) { |
545 | return forward_with_gre(skb, encap, next_hop, metrics); |
546 | } |
547 | |
548 | metrics->forwarded_packets_total_gue++; |
549 | uint32_t old_saddr = encap->ip.saddr; |
550 | encap->ip.saddr = encap->ip.daddr; |
551 | encap->ip.daddr = next_hop->s_addr; |
552 | if (encap->unigue.next_hop < encap->unigue.hop_count) { |
553 | encap->unigue.next_hop++; |
554 | } |
555 | |
556 | /* Remove ip->saddr, add next_hop->s_addr */ |
557 | const uint64_t off = offsetof(typeof(*encap), ip.check); |
558 | int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4); |
559 | if (ret < 0) { |
560 | return TC_ACT_SHOT; |
561 | } |
562 | |
563 | return bpf_redirect(skb->ifindex, 0); |
564 | } |
565 | |
566 | static INLINING ret_t skip_next_hops(buf_t *pkt, int n) |
567 | { |
568 | switch (n) { |
569 | case 1: |
570 | if (!buf_skip(buf: pkt, len: sizeof(struct in_addr))) |
571 | return TC_ACT_SHOT; |
572 | case 0: |
573 | return CONTINUE_PROCESSING; |
574 | |
575 | default: |
576 | return TC_ACT_SHOT; |
577 | } |
578 | } |
579 | |
580 | /* Get the next hop from the GLB header. |
581 | * |
582 | * Sets next_hop->s_addr to 0 if there are no more hops left. |
583 | * pkt is positioned just after the variable length GLB header |
584 | * iff the call is successful. |
585 | */ |
586 | static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap, |
587 | struct in_addr *next_hop) |
588 | { |
589 | if (encap->unigue.next_hop > encap->unigue.hop_count) { |
590 | return TC_ACT_SHOT; |
591 | } |
592 | |
593 | /* Skip "used" next hops. */ |
594 | MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop)); |
595 | |
596 | if (encap->unigue.next_hop == encap->unigue.hop_count) { |
597 | /* No more next hops, we are at the end of the GLB header. */ |
598 | next_hop->s_addr = 0; |
599 | return CONTINUE_PROCESSING; |
600 | } |
601 | |
602 | if (!buf_copy(buf: pkt, dst: next_hop, len: sizeof(*next_hop))) { |
603 | return TC_ACT_SHOT; |
604 | } |
605 | |
606 | /* Skip the remaining next hops (may be zero). */ |
607 | return skip_next_hops(pkt, n: encap->unigue.hop_count - |
608 | encap->unigue.next_hop - 1); |
609 | } |
610 | |
/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 *
 * iphlen selects how iph is interpreted: sizeof(struct iphdr) for IPv4,
 * sizeof(struct ipv6hdr) for IPv6. Returns the size of the tuple member that
 * was filled in, or 0 (tuple untouched) for any other iphlen.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		/* Neither an IPv4 nor an IPv6 header length. */
		return 0;
	}
}
649 | |
650 | static INLINING verdict_t classify_tcp(struct __sk_buff *skb, |
651 | struct bpf_sock_tuple *tuple, uint64_t tuplen, |
652 | void *iph, struct tcphdr *tcp) |
653 | { |
654 | struct bpf_sock *sk = |
655 | bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); |
656 | if (sk == NULL) { |
657 | return UNKNOWN; |
658 | } |
659 | |
660 | if (sk->state != BPF_TCP_LISTEN) { |
661 | bpf_sk_release(sk); |
662 | return ESTABLISHED; |
663 | } |
664 | |
665 | if (iph != NULL && tcp != NULL) { |
666 | /* Kludge: we've run out of arguments, but need the length of the ip header. */ |
667 | uint64_t iphlen = sizeof(struct iphdr); |
668 | if (tuplen == sizeof(tuple->ipv6)) { |
669 | iphlen = sizeof(struct ipv6hdr); |
670 | } |
671 | |
672 | if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp, |
673 | sizeof(*tcp)) == 0) { |
674 | bpf_sk_release(sk); |
675 | return SYN_COOKIE; |
676 | } |
677 | } |
678 | |
679 | bpf_sk_release(sk); |
680 | return UNKNOWN; |
681 | } |
682 | |
683 | static INLINING verdict_t classify_udp(struct __sk_buff *skb, |
684 | struct bpf_sock_tuple *tuple, uint64_t tuplen) |
685 | { |
686 | struct bpf_sock *sk = |
687 | bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); |
688 | if (sk == NULL) { |
689 | return UNKNOWN; |
690 | } |
691 | |
692 | if (sk->state == BPF_TCP_ESTABLISHED) { |
693 | bpf_sk_release(sk); |
694 | return ESTABLISHED; |
695 | } |
696 | |
697 | bpf_sk_release(sk); |
698 | return UNKNOWN; |
699 | } |
700 | |
701 | static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, |
702 | struct bpf_sock_tuple *tuple, uint64_t tuplen, |
703 | metrics_t *metrics) |
704 | { |
705 | switch (proto) { |
706 | case IPPROTO_TCP: |
707 | return classify_tcp(skb, tuple, tuplen, NULL, NULL); |
708 | |
709 | case IPPROTO_UDP: |
710 | return classify_udp(skb, tuple, tuplen); |
711 | |
712 | default: |
713 | metrics->errors_total_malformed_icmp++; |
714 | return INVALID; |
715 | } |
716 | } |
717 | |
718 | static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics) |
719 | { |
720 | struct icmphdr icmp; |
721 | if (!buf_copy(buf: pkt, dst: &icmp, len: sizeof(icmp))) { |
722 | metrics->errors_total_malformed_icmp++; |
723 | return INVALID; |
724 | } |
725 | |
726 | /* We should never receive encapsulated echo replies. */ |
727 | if (icmp.type == ICMP_ECHOREPLY) { |
728 | metrics->errors_total_icmp_echo_replies++; |
729 | return INVALID; |
730 | } |
731 | |
732 | if (icmp.type == ICMP_ECHO) { |
733 | return ECHO_REQUEST; |
734 | } |
735 | |
736 | if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) { |
737 | metrics->errors_total_unwanted_icmp++; |
738 | return INVALID; |
739 | } |
740 | |
741 | struct iphdr _ip4; |
742 | const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, scratch: &_ip4); |
743 | if (ipv4 == NULL) { |
744 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
745 | return INVALID; |
746 | } |
747 | |
748 | /* The source address in the outer IP header is from the entity that |
749 | * originated the ICMP message. Use the original IP header to restore |
750 | * the correct flow tuple. |
751 | */ |
752 | struct bpf_sock_tuple tuple; |
753 | tuple.ipv4.saddr = ipv4->daddr; |
754 | tuple.ipv4.daddr = ipv4->saddr; |
755 | |
756 | if (!pkt_parse_icmp_l4_ports(pkt, ports: (flow_ports_t *)&tuple.ipv4.sport)) { |
757 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
758 | return INVALID; |
759 | } |
760 | |
761 | return classify_icmp(skb: pkt->skb, proto: ipv4->protocol, tuple: &tuple, |
762 | tuplen: sizeof(tuple.ipv4), metrics); |
763 | } |
764 | |
765 | static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics) |
766 | { |
767 | struct icmp6hdr icmp6; |
768 | if (!buf_copy(buf: pkt, dst: &icmp6, len: sizeof(icmp6))) { |
769 | metrics->errors_total_malformed_icmp++; |
770 | return INVALID; |
771 | } |
772 | |
773 | /* We should never receive encapsulated echo replies. */ |
774 | if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) { |
775 | metrics->errors_total_icmp_echo_replies++; |
776 | return INVALID; |
777 | } |
778 | |
779 | if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) { |
780 | return ECHO_REQUEST; |
781 | } |
782 | |
783 | if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) { |
784 | metrics->errors_total_unwanted_icmp++; |
785 | return INVALID; |
786 | } |
787 | |
788 | bool is_fragment; |
789 | uint8_t l4_proto; |
790 | struct ipv6hdr _ipv6; |
791 | const struct ipv6hdr *ipv6 = |
792 | pkt_parse_ipv6(pkt, scratch: &_ipv6, proto: &l4_proto, is_fragment: &is_fragment); |
793 | if (ipv6 == NULL) { |
794 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
795 | return INVALID; |
796 | } |
797 | |
798 | if (is_fragment) { |
799 | metrics->errors_total_fragmented_ip++; |
800 | return INVALID; |
801 | } |
802 | |
803 | /* Swap source and dest addresses. */ |
804 | struct bpf_sock_tuple tuple; |
805 | memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr)); |
806 | memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr)); |
807 | |
808 | if (!pkt_parse_icmp_l4_ports(pkt, ports: (flow_ports_t *)&tuple.ipv6.sport)) { |
809 | metrics->errors_total_malformed_icmp_pkt_too_big++; |
810 | return INVALID; |
811 | } |
812 | |
813 | return classify_icmp(skb: pkt->skb, proto: l4_proto, tuple: &tuple, tuplen: sizeof(tuple.ipv6), |
814 | metrics); |
815 | } |
816 | |
817 | static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen, |
818 | metrics_t *metrics) |
819 | { |
820 | metrics->l4_protocol_packets_total_tcp++; |
821 | |
822 | struct tcphdr _tcp; |
823 | struct tcphdr *tcp = buf_assign(buf: pkt, len: sizeof(_tcp), scratch: &_tcp); |
824 | if (tcp == NULL) { |
825 | metrics->errors_total_malformed_tcp++; |
826 | return INVALID; |
827 | } |
828 | |
829 | if (tcp->syn) { |
830 | return SYN; |
831 | } |
832 | |
833 | struct bpf_sock_tuple tuple; |
834 | uint64_t tuplen = |
835 | fill_tuple(tuple: &tuple, iph, iphlen, sport: tcp->source, dport: tcp->dest); |
836 | return classify_tcp(skb: pkt->skb, tuple: &tuple, tuplen, iph, tcp); |
837 | } |
838 | |
839 | static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen, |
840 | metrics_t *metrics) |
841 | { |
842 | metrics->l4_protocol_packets_total_udp++; |
843 | |
844 | struct udphdr _udp; |
845 | struct udphdr *udph = buf_assign(buf: pkt, len: sizeof(_udp), scratch: &_udp); |
846 | if (udph == NULL) { |
847 | metrics->errors_total_malformed_udp++; |
848 | return INVALID; |
849 | } |
850 | |
851 | struct bpf_sock_tuple tuple; |
852 | uint64_t tuplen = |
853 | fill_tuple(tuple: &tuple, iph, iphlen, sport: udph->source, dport: udph->dest); |
854 | return classify_udp(skb: pkt->skb, tuple: &tuple, tuplen); |
855 | } |
856 | |
857 | static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics) |
858 | { |
859 | metrics->l3_protocol_packets_total_ipv4++; |
860 | |
861 | struct iphdr _ip4; |
862 | struct iphdr *ipv4 = pkt_parse_ipv4(pkt, scratch: &_ip4); |
863 | if (ipv4 == NULL) { |
864 | metrics->errors_total_malformed_ip++; |
865 | return INVALID; |
866 | } |
867 | |
868 | if (ipv4->version != 4) { |
869 | metrics->errors_total_malformed_ip++; |
870 | return INVALID; |
871 | } |
872 | |
873 | if (ipv4_is_fragment(ip: ipv4)) { |
874 | metrics->errors_total_fragmented_ip++; |
875 | return INVALID; |
876 | } |
877 | |
878 | switch (ipv4->protocol) { |
879 | case IPPROTO_ICMP: |
880 | return process_icmpv4(pkt, metrics); |
881 | |
882 | case IPPROTO_TCP: |
883 | return process_tcp(pkt, iph: ipv4, iphlen: sizeof(*ipv4), metrics); |
884 | |
885 | case IPPROTO_UDP: |
886 | return process_udp(pkt, iph: ipv4, iphlen: sizeof(*ipv4), metrics); |
887 | |
888 | default: |
889 | metrics->errors_total_unknown_l4_proto++; |
890 | return INVALID; |
891 | } |
892 | } |
893 | |
894 | static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics) |
895 | { |
896 | metrics->l3_protocol_packets_total_ipv6++; |
897 | |
898 | uint8_t l4_proto; |
899 | bool is_fragment; |
900 | struct ipv6hdr _ipv6; |
901 | struct ipv6hdr *ipv6 = |
902 | pkt_parse_ipv6(pkt, scratch: &_ipv6, proto: &l4_proto, is_fragment: &is_fragment); |
903 | if (ipv6 == NULL) { |
904 | metrics->errors_total_malformed_ip++; |
905 | return INVALID; |
906 | } |
907 | |
908 | if (ipv6->version != 6) { |
909 | metrics->errors_total_malformed_ip++; |
910 | return INVALID; |
911 | } |
912 | |
913 | if (is_fragment) { |
914 | metrics->errors_total_fragmented_ip++; |
915 | return INVALID; |
916 | } |
917 | |
918 | switch (l4_proto) { |
919 | case IPPROTO_ICMPV6: |
920 | return process_icmpv6(pkt, metrics); |
921 | |
922 | case IPPROTO_TCP: |
923 | return process_tcp(pkt, iph: ipv6, iphlen: sizeof(*ipv6), metrics); |
924 | |
925 | case IPPROTO_UDP: |
926 | return process_udp(pkt, iph: ipv6, iphlen: sizeof(*ipv6), metrics); |
927 | |
928 | default: |
929 | metrics->errors_total_unknown_l4_proto++; |
930 | return INVALID; |
931 | } |
932 | } |
933 | |
934 | SEC("tc" ) |
935 | int cls_redirect(struct __sk_buff *skb) |
936 | { |
937 | metrics_t *metrics = get_global_metrics(); |
938 | if (metrics == NULL) { |
939 | return TC_ACT_SHOT; |
940 | } |
941 | |
942 | metrics->processed_packets_total++; |
943 | |
944 | /* Pass bogus packets as long as we're not sure they're |
945 | * destined for us. |
946 | */ |
947 | if (skb->protocol != bpf_htons(ETH_P_IP)) { |
948 | return TC_ACT_OK; |
949 | } |
950 | |
951 | encap_headers_t *encap; |
952 | |
953 | /* Make sure that all encapsulation headers are available in |
954 | * the linear portion of the skb. This makes it easy to manipulate them. |
955 | */ |
956 | if (bpf_skb_pull_data(skb, sizeof(*encap))) { |
957 | return TC_ACT_OK; |
958 | } |
959 | |
960 | buf_t pkt = { |
961 | .skb = skb, |
962 | .head = (uint8_t *)(long)skb->data, |
963 | .tail = (uint8_t *)(long)skb->data_end, |
964 | }; |
965 | |
966 | encap = buf_assign(buf: &pkt, len: sizeof(*encap), NULL); |
967 | if (encap == NULL) { |
968 | return TC_ACT_OK; |
969 | } |
970 | |
971 | if (encap->ip.ihl != 5) { |
972 | /* We never have any options. */ |
973 | return TC_ACT_OK; |
974 | } |
975 | |
976 | if (encap->ip.daddr != ENCAPSULATION_IP || |
977 | encap->ip.protocol != IPPROTO_UDP) { |
978 | return TC_ACT_OK; |
979 | } |
980 | |
981 | /* TODO Check UDP length? */ |
982 | if (encap->udp.dest != ENCAPSULATION_PORT) { |
983 | return TC_ACT_OK; |
984 | } |
985 | |
986 | /* We now know that the packet is destined to us, we can |
987 | * drop bogus ones. |
988 | */ |
989 | if (ipv4_is_fragment(ip: (void *)&encap->ip)) { |
990 | metrics->errors_total_fragmented_ip++; |
991 | return TC_ACT_SHOT; |
992 | } |
993 | |
994 | if (encap->gue.variant != 0) { |
995 | metrics->errors_total_malformed_encapsulation++; |
996 | return TC_ACT_SHOT; |
997 | } |
998 | |
999 | if (encap->gue.control != 0) { |
1000 | metrics->errors_total_malformed_encapsulation++; |
1001 | return TC_ACT_SHOT; |
1002 | } |
1003 | |
1004 | if (encap->gue.flags != 0) { |
1005 | metrics->errors_total_malformed_encapsulation++; |
1006 | return TC_ACT_SHOT; |
1007 | } |
1008 | |
1009 | if (encap->gue.hlen != |
1010 | sizeof(encap->unigue) / 4 + encap->unigue.hop_count) { |
1011 | metrics->errors_total_malformed_encapsulation++; |
1012 | return TC_ACT_SHOT; |
1013 | } |
1014 | |
1015 | if (encap->unigue.version != 0) { |
1016 | metrics->errors_total_malformed_encapsulation++; |
1017 | return TC_ACT_SHOT; |
1018 | } |
1019 | |
1020 | if (encap->unigue.reserved != 0) { |
1021 | return TC_ACT_SHOT; |
1022 | } |
1023 | |
1024 | struct in_addr next_hop; |
1025 | MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop)); |
1026 | |
1027 | if (next_hop.s_addr == 0) { |
1028 | metrics->accepted_packets_total_last_hop++; |
1029 | return accept_locally(skb, encap); |
1030 | } |
1031 | |
1032 | verdict_t verdict; |
1033 | switch (encap->gue.proto_ctype) { |
1034 | case IPPROTO_IPIP: |
1035 | verdict = process_ipv4(pkt: &pkt, metrics); |
1036 | break; |
1037 | |
1038 | case IPPROTO_IPV6: |
1039 | verdict = process_ipv6(pkt: &pkt, metrics); |
1040 | break; |
1041 | |
1042 | default: |
1043 | metrics->errors_total_unknown_l3_proto++; |
1044 | return TC_ACT_SHOT; |
1045 | } |
1046 | |
1047 | switch (verdict) { |
1048 | case INVALID: |
1049 | /* metrics have already been bumped */ |
1050 | return TC_ACT_SHOT; |
1051 | |
1052 | case UNKNOWN: |
1053 | return forward_to_next_hop(skb, encap, next_hop: &next_hop, metrics); |
1054 | |
1055 | case ECHO_REQUEST: |
1056 | metrics->accepted_packets_total_icmp_echo_request++; |
1057 | break; |
1058 | |
1059 | case SYN: |
1060 | if (encap->unigue.forward_syn) { |
1061 | return forward_to_next_hop(skb, encap, next_hop: &next_hop, |
1062 | metrics); |
1063 | } |
1064 | |
1065 | metrics->accepted_packets_total_syn++; |
1066 | break; |
1067 | |
1068 | case SYN_COOKIE: |
1069 | metrics->accepted_packets_total_syn_cookies++; |
1070 | break; |
1071 | |
1072 | case ESTABLISHED: |
1073 | metrics->accepted_packets_total_established++; |
1074 | break; |
1075 | } |
1076 | |
1077 | return accept_locally(skb, encap); |
1078 | } |
1079 | |