1 | // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause |
2 | /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ |
3 | |
4 | #include "vmlinux.h" |
5 | |
6 | #include <bpf/bpf_helpers.h> |
7 | #include <bpf/bpf_endian.h> |
8 | #include <asm/errno.h> |
9 | |
10 | #include "bpf_compiler.h" |
11 | |
/* Verdicts returned to the tc hook by syncookie_tc(). */
#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

#define NSEC_PER_SEC 1000000000L

/* Ethernet address length and the EtherTypes handled by tcp_dissect(). */
#define ETH_ALEN 6
#define ETH_P_IP 0x0800
#define ETH_P_IPV6 0x86DD

/* The 32-bit word of the TCP header that carries doff and the flag bits. */
#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])

/* Bits of the IPv4 frag_off field (host byte order). */
#define IP_DF 0x4000
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff

/* IPv6 next-header value for TCP. */
#define NEXTHDR_TCP 6

/* TCP option kinds. */
#define TCPOPT_EOL 0
#define TCPOPT_NOP 1
#define TCPOPT_MSS 2
#define TCPOPT_WINDOW 3
#define TCPOPT_SACK_PERM 4
#define TCPOPT_TIMESTAMP 8

/* Total length in bytes of each TCP option (kind + length + payload). */
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10

/* Timestamp cookie layout: the low TSBITS bits of the timestamp value carry
 * the window scale, SACK and ECN flags; the remaining bits carry the clock.
 */
#define TCP_TS_HZ 1000
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK (1 << 4)
#define TS_OPT_ECN (1 << 5)
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
#define TCP_MAX_WSCALE 14U

/* Largest possible IPv4 and TCP header sizes, in bytes. */
#define IPV4_MAXLEN 60
#define TCP_MAXLEN 60

/* Fallbacks used when the "values" map holds no configuration. */
#define DEFAULT_MSS4 1460
#define DEFAULT_MSS6 1440
#define DEFAULT_WSCALE 7
#define DEFAULT_TTL 64

/* Number of slots in the "allowed_ports" map. */
#define MAX_ALLOWED_PORTS 8

/* Upper bound on a packet offset; see next(). */
#define MAX_PACKET_OFF 0xffff
59 | |
/* Exchange the values held by two lvalues of the same type. */
#define swap(a, b)					\
	do {						\
		typeof(a) __swap_tmp = (a);		\
		(a) = (b);				\
		(b) = __swap_tmp;			\
	} while (0)

/* Read a value of the given type through a packed-struct pointer, so the
 * access does not assume that ptr is naturally aligned.
 */
#define __get_unaligned_t(type, ptr) ({						\
	const struct {								\
		type x;								\
	} __attribute__((__packed__)) *__packed_ptr = (typeof(__packed_ptr))(ptr); \
	__packed_ptr->x;							\
})

/* Unaligned read of *ptr, with the type deduced from the pointer. */
#define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
69 | |
70 | struct { |
71 | __uint(type, BPF_MAP_TYPE_ARRAY); |
72 | __type(key, __u32); |
73 | __type(value, __u64); |
74 | __uint(max_entries, 2); |
75 | } values SEC(".maps" ); |
76 | |
77 | struct { |
78 | __uint(type, BPF_MAP_TYPE_ARRAY); |
79 | __type(key, __u32); |
80 | __type(value, __u16); |
81 | __uint(max_entries, MAX_ALLOWED_PORTS); |
82 | } allowed_ports SEC(".maps" ); |
83 | |
84 | /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in |
85 | * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally. |
86 | */ |
87 | |
88 | struct bpf_ct_opts___local { |
89 | s32 netns_id; |
90 | s32 error; |
91 | u8 l4proto; |
92 | u8 dir; |
93 | u8 reserved[2]; |
94 | } __attribute__((preserve_access_index)); |
95 | |
96 | #define BPF_F_CURRENT_NETNS (-1) |
97 | |
98 | extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, |
99 | struct bpf_sock_tuple *bpf_tuple, |
100 | __u32 len_tuple, |
101 | struct bpf_ct_opts___local *opts, |
102 | __u32 len_opts) __ksym; |
103 | |
104 | extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, |
105 | struct bpf_sock_tuple *bpf_tuple, |
106 | u32 len_tuple, |
107 | struct bpf_ct_opts___local *opts, |
108 | u32 len_opts) __ksym; |
109 | |
110 | extern void bpf_ct_release(struct nf_conn *ct) __ksym; |
111 | |
112 | static __always_inline void swap_eth_addr(__u8 *a, __u8 *b) |
113 | { |
114 | __u8 tmp[ETH_ALEN]; |
115 | |
116 | __builtin_memcpy(tmp, a, ETH_ALEN); |
117 | __builtin_memcpy(a, b, ETH_ALEN); |
118 | __builtin_memcpy(b, tmp, ETH_ALEN); |
119 | } |
120 | |
121 | static __always_inline __u16 csum_fold(__u32 csum) |
122 | { |
123 | csum = (csum & 0xffff) + (csum >> 16); |
124 | csum = (csum & 0xffff) + (csum >> 16); |
125 | return (__u16)~csum; |
126 | } |
127 | |
128 | static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, |
129 | __u32 len, __u8 proto, |
130 | __u32 csum) |
131 | { |
132 | __u64 s = csum; |
133 | |
134 | s += (__u32)saddr; |
135 | s += (__u32)daddr; |
136 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
137 | s += proto + len; |
138 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
139 | s += (proto + len) << 8; |
140 | #else |
141 | #error Unknown endian |
142 | #endif |
143 | s = (s & 0xffffffff) + (s >> 32); |
144 | s = (s & 0xffffffff) + (s >> 32); |
145 | |
146 | return csum_fold((__u32)s); |
147 | } |
148 | |
149 | static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr, |
150 | const struct in6_addr *daddr, |
151 | __u32 len, __u8 proto, __u32 csum) |
152 | { |
153 | __u64 sum = csum; |
154 | int i; |
155 | |
156 | __pragma_loop_unroll |
157 | for (i = 0; i < 4; i++) |
158 | sum += (__u32)saddr->in6_u.u6_addr32[i]; |
159 | |
160 | __pragma_loop_unroll |
161 | for (i = 0; i < 4; i++) |
162 | sum += (__u32)daddr->in6_u.u6_addr32[i]; |
163 | |
164 | /* Don't combine additions to avoid 32-bit overflow. */ |
165 | sum += bpf_htonl(len); |
166 | sum += bpf_htonl(proto); |
167 | |
168 | sum = (sum & 0xffffffff) + (sum >> 32); |
169 | sum = (sum & 0xffffffff) + (sum >> 32); |
170 | |
171 | return csum_fold((__u32)sum); |
172 | } |
173 | |
174 | static __always_inline __u64 tcp_clock_ns(void) |
175 | { |
176 | return bpf_ktime_get_ns(); |
177 | } |
178 | |
179 | static __always_inline __u32 tcp_ns_to_ts(__u64 ns) |
180 | { |
181 | return ns / (NSEC_PER_SEC / TCP_TS_HZ); |
182 | } |
183 | |
184 | static __always_inline __u32 tcp_clock_ms(void) |
185 | { |
186 | return tcp_ns_to_ts(tcp_clock_ns()); |
187 | } |
188 | |
189 | struct tcpopt_context { |
190 | void *data; |
191 | void *data_end; |
192 | __be32 *tsecr; |
193 | __u8 wscale; |
194 | bool option_timestamp; |
195 | bool option_sack; |
196 | __u32 off; |
197 | }; |
198 | |
199 | static __always_inline u8 *next(struct tcpopt_context *ctx, __u32 sz) |
200 | { |
201 | __u64 off = ctx->off; |
202 | __u8 *data; |
203 | |
204 | /* Verifier forbids access to packet when offset exceeds MAX_PACKET_OFF */ |
205 | if (off > MAX_PACKET_OFF - sz) |
206 | return NULL; |
207 | |
208 | data = ctx->data + off; |
209 | barrier_var(data); |
210 | if (data + sz >= ctx->data_end) |
211 | return NULL; |
212 | |
213 | ctx->off += sz; |
214 | return data; |
215 | } |
216 | |
217 | static int tscookie_tcpopt_parse(struct tcpopt_context *ctx) |
218 | { |
219 | __u8 *opcode, *opsize, *wscale, *tsecr; |
220 | __u32 off = ctx->off; |
221 | |
222 | opcode = next(ctx, 1); |
223 | if (!opcode) |
224 | return 1; |
225 | |
226 | if (*opcode == TCPOPT_EOL) |
227 | return 1; |
228 | if (*opcode == TCPOPT_NOP) |
229 | return 0; |
230 | |
231 | opsize = next(ctx, 1); |
232 | if (!opsize || *opsize < 2) |
233 | return 1; |
234 | |
235 | switch (*opcode) { |
236 | case TCPOPT_WINDOW: |
237 | wscale = next(ctx, 1); |
238 | if (!wscale) |
239 | return 1; |
240 | if (*opsize == TCPOLEN_WINDOW) |
241 | ctx->wscale = *wscale < TCP_MAX_WSCALE ? *wscale : TCP_MAX_WSCALE; |
242 | break; |
243 | case TCPOPT_TIMESTAMP: |
244 | tsecr = next(ctx, 4); |
245 | if (!tsecr) |
246 | return 1; |
247 | if (*opsize == TCPOLEN_TIMESTAMP) { |
248 | ctx->option_timestamp = true; |
249 | /* Client's tsval becomes our tsecr. */ |
250 | *ctx->tsecr = get_unaligned((__be32 *)tsecr); |
251 | } |
252 | break; |
253 | case TCPOPT_SACK_PERM: |
254 | if (*opsize == TCPOLEN_SACK_PERM) |
255 | ctx->option_sack = true; |
256 | break; |
257 | } |
258 | |
259 | ctx->off = off + *opsize; |
260 | |
261 | return 0; |
262 | } |
263 | |
264 | static int tscookie_tcpopt_parse_batch(__u32 index, void *context) |
265 | { |
266 | int i; |
267 | |
268 | for (i = 0; i < 7; i++) |
269 | if (tscookie_tcpopt_parse(ctx: context)) |
270 | return 1; |
271 | return 0; |
272 | } |
273 | |
274 | static __always_inline bool tscookie_init(struct tcphdr *, |
275 | __u16 tcp_len, __be32 *tsval, |
276 | __be32 *tsecr, void *data, void *data_end) |
277 | { |
278 | struct tcpopt_context loop_ctx = { |
279 | .data = data, |
280 | .data_end = data_end, |
281 | .tsecr = tsecr, |
282 | .wscale = TS_OPT_WSCALE_MASK, |
283 | .option_timestamp = false, |
284 | .option_sack = false, |
285 | /* Note: currently verifier would track .off as unbound scalar. |
286 | * In case if verifier would at some point get smarter and |
287 | * compute bounded value for this var, beware that it might |
288 | * hinder bpf_loop() convergence validation. |
289 | */ |
290 | .off = (__u8 *)(tcp_header + 1) - (__u8 *)data, |
291 | }; |
292 | u32 cookie; |
293 | |
294 | bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0); |
295 | |
296 | if (!loop_ctx.option_timestamp) |
297 | return false; |
298 | |
299 | cookie = tcp_clock_ms() & ~TSMASK; |
300 | cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK; |
301 | if (loop_ctx.option_sack) |
302 | cookie |= TS_OPT_SACK; |
303 | if (tcp_header->ece && tcp_header->cwr) |
304 | cookie |= TS_OPT_ECN; |
305 | *tsval = bpf_htonl(cookie); |
306 | |
307 | return true; |
308 | } |
309 | |
310 | static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale, |
311 | __u8 *ttl, bool ipv6) |
312 | { |
313 | __u32 key = 0; |
314 | __u64 *value; |
315 | |
316 | value = bpf_map_lookup_elem(&values, &key); |
317 | if (value && *value != 0) { |
318 | if (ipv6) |
319 | *mss = (*value >> 32) & 0xffff; |
320 | else |
321 | *mss = *value & 0xffff; |
322 | *wscale = (*value >> 16) & 0xf; |
323 | *ttl = (*value >> 24) & 0xff; |
324 | return; |
325 | } |
326 | |
327 | *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4; |
328 | *wscale = DEFAULT_WSCALE; |
329 | *ttl = DEFAULT_TTL; |
330 | } |
331 | |
332 | static __always_inline void values_inc_synacks(void) |
333 | { |
334 | __u32 key = 1; |
335 | __u64 *value; |
336 | |
337 | value = bpf_map_lookup_elem(&values, &key); |
338 | if (value) |
339 | __sync_fetch_and_add(value, 1); |
340 | } |
341 | |
342 | static __always_inline bool check_port_allowed(__u16 port) |
343 | { |
344 | __u32 i; |
345 | |
346 | for (i = 0; i < MAX_ALLOWED_PORTS; i++) { |
347 | __u32 key = i; |
348 | __u16 *value; |
349 | |
350 | value = bpf_map_lookup_elem(&allowed_ports, &key); |
351 | |
352 | if (!value) |
353 | break; |
354 | /* 0 is a terminator value. Check it first to avoid matching on |
355 | * a forbidden port == 0 and returning true. |
356 | */ |
357 | if (*value == 0) |
358 | break; |
359 | |
360 | if (*value == port) |
361 | return true; |
362 | } |
363 | |
364 | return false; |
365 | } |
366 | |
367 | struct { |
368 | struct ethhdr *; |
369 | struct iphdr *; |
370 | struct ipv6hdr *; |
371 | struct tcphdr *; |
372 | __u16 ; |
373 | }; |
374 | |
375 | static __always_inline int tcp_dissect(void *data, void *data_end, |
376 | struct header_pointers *hdr) |
377 | { |
378 | hdr->eth = data; |
379 | if (hdr->eth + 1 > data_end) |
380 | return XDP_DROP; |
381 | |
382 | switch (bpf_ntohs(hdr->eth->h_proto)) { |
383 | case ETH_P_IP: |
384 | hdr->ipv6 = NULL; |
385 | |
386 | hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); |
387 | if (hdr->ipv4 + 1 > data_end) |
388 | return XDP_DROP; |
389 | if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4)) |
390 | return XDP_DROP; |
391 | if (hdr->ipv4->version != 4) |
392 | return XDP_DROP; |
393 | |
394 | if (hdr->ipv4->protocol != IPPROTO_TCP) |
395 | return XDP_PASS; |
396 | |
397 | hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; |
398 | break; |
399 | case ETH_P_IPV6: |
400 | hdr->ipv4 = NULL; |
401 | |
402 | hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); |
403 | if (hdr->ipv6 + 1 > data_end) |
404 | return XDP_DROP; |
405 | if (hdr->ipv6->version != 6) |
406 | return XDP_DROP; |
407 | |
408 | /* XXX: Extension headers are not supported and could circumvent |
409 | * XDP SYN flood protection. |
410 | */ |
411 | if (hdr->ipv6->nexthdr != NEXTHDR_TCP) |
412 | return XDP_PASS; |
413 | |
414 | hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); |
415 | break; |
416 | default: |
417 | /* XXX: VLANs will circumvent XDP SYN flood protection. */ |
418 | return XDP_PASS; |
419 | } |
420 | |
421 | if (hdr->tcp + 1 > data_end) |
422 | return XDP_DROP; |
423 | hdr->tcp_len = hdr->tcp->doff * 4; |
424 | if (hdr->tcp_len < sizeof(*hdr->tcp)) |
425 | return XDP_DROP; |
426 | |
427 | return XDP_TX; |
428 | } |
429 | |
430 | static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp) |
431 | { |
432 | struct bpf_ct_opts___local ct_lookup_opts = { |
433 | .netns_id = BPF_F_CURRENT_NETNS, |
434 | .l4proto = IPPROTO_TCP, |
435 | }; |
436 | struct bpf_sock_tuple tup = {}; |
437 | struct nf_conn *ct; |
438 | __u32 tup_size; |
439 | |
440 | if (hdr->ipv4) { |
441 | /* TCP doesn't normally use fragments, and XDP can't reassemble |
442 | * them. |
443 | */ |
444 | if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF)) |
445 | return XDP_DROP; |
446 | |
447 | tup.ipv4.saddr = hdr->ipv4->saddr; |
448 | tup.ipv4.daddr = hdr->ipv4->daddr; |
449 | tup.ipv4.sport = hdr->tcp->source; |
450 | tup.ipv4.dport = hdr->tcp->dest; |
451 | tup_size = sizeof(tup.ipv4); |
452 | } else if (hdr->ipv6) { |
453 | __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr)); |
454 | __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr)); |
455 | tup.ipv6.sport = hdr->tcp->source; |
456 | tup.ipv6.dport = hdr->tcp->dest; |
457 | tup_size = sizeof(tup.ipv6); |
458 | } else { |
459 | /* The verifier can't track that either ipv4 or ipv6 is not |
460 | * NULL. |
461 | */ |
462 | return XDP_ABORTED; |
463 | } |
464 | if (xdp) |
465 | ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); |
466 | else |
467 | ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts)); |
468 | if (ct) { |
469 | unsigned long status = ct->status; |
470 | |
471 | bpf_ct_release(ct); |
472 | if (status & IPS_CONFIRMED) |
473 | return XDP_PASS; |
474 | } else if (ct_lookup_opts.error != -ENOENT) { |
475 | return XDP_ABORTED; |
476 | } |
477 | |
478 | /* error == -ENOENT || !(status & IPS_CONFIRMED) */ |
479 | return XDP_TX; |
480 | } |
481 | |
482 | static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss, |
483 | __u8 wscale) |
484 | { |
485 | __be32 *start = buf; |
486 | |
487 | *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); |
488 | |
489 | if (!tsopt) |
490 | return buf - start; |
491 | |
492 | if (tsopt[0] & bpf_htonl(1 << 4)) |
493 | *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) | |
494 | (TCPOLEN_SACK_PERM << 16) | |
495 | (TCPOPT_TIMESTAMP << 8) | |
496 | TCPOLEN_TIMESTAMP); |
497 | else |
498 | *buf++ = bpf_htonl((TCPOPT_NOP << 24) | |
499 | (TCPOPT_NOP << 16) | |
500 | (TCPOPT_TIMESTAMP << 8) | |
501 | TCPOLEN_TIMESTAMP); |
502 | *buf++ = tsopt[0]; |
503 | *buf++ = tsopt[1]; |
504 | |
505 | if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf)) |
506 | *buf++ = bpf_htonl((TCPOPT_NOP << 24) | |
507 | (TCPOPT_WINDOW << 16) | |
508 | (TCPOLEN_WINDOW << 8) | |
509 | wscale); |
510 | |
511 | return buf - start; |
512 | } |
513 | |
514 | static __always_inline void tcp_gen_synack(struct tcphdr *, |
515 | __u32 cookie, __be32 *tsopt, |
516 | __u16 mss, __u8 wscale) |
517 | { |
518 | void *tcp_options; |
519 | |
520 | tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK; |
521 | if (tsopt && (tsopt[0] & bpf_htonl(1 << 5))) |
522 | tcp_flag_word(tcp_header) |= TCP_FLAG_ECE; |
523 | tcp_header->doff = 5; /* doff is part of tcp_flag_word. */ |
524 | swap(tcp_header->source, tcp_header->dest); |
525 | tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1); |
526 | tcp_header->seq = bpf_htonl(cookie); |
527 | tcp_header->window = 0; |
528 | tcp_header->urg_ptr = 0; |
529 | tcp_header->check = 0; /* Calculate checksum later. */ |
530 | |
531 | tcp_options = (void *)(tcp_header + 1); |
532 | tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale); |
533 | } |
534 | |
535 | static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr, |
536 | __u32 cookie, __be32 *tsopt) |
537 | { |
538 | __u8 wscale; |
539 | __u16 mss; |
540 | __u8 ttl; |
541 | |
542 | values_get_tcpipopts(&mss, &wscale, &ttl, false); |
543 | |
544 | swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); |
545 | |
546 | swap(hdr->ipv4->saddr, hdr->ipv4->daddr); |
547 | hdr->ipv4->check = 0; /* Calculate checksum later. */ |
548 | hdr->ipv4->tos = 0; |
549 | hdr->ipv4->id = 0; |
550 | hdr->ipv4->ttl = ttl; |
551 | |
552 | tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); |
553 | |
554 | hdr->tcp_len = hdr->tcp->doff * 4; |
555 | hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len); |
556 | } |
557 | |
558 | static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr, |
559 | __u32 cookie, __be32 *tsopt) |
560 | { |
561 | __u8 wscale; |
562 | __u16 mss; |
563 | __u8 ttl; |
564 | |
565 | values_get_tcpipopts(&mss, &wscale, &ttl, true); |
566 | |
567 | swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest); |
568 | |
569 | swap(hdr->ipv6->saddr, hdr->ipv6->daddr); |
570 | *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000); |
571 | hdr->ipv6->hop_limit = ttl; |
572 | |
573 | tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale); |
574 | |
575 | hdr->tcp_len = hdr->tcp->doff * 4; |
576 | hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len); |
577 | } |
578 | |
579 | static __always_inline int syncookie_handle_syn(struct header_pointers *hdr, |
580 | void *ctx, |
581 | void *data, void *data_end, |
582 | bool xdp) |
583 | { |
584 | __u32 old_pkt_size, new_pkt_size; |
585 | /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the |
586 | * BPF verifier if tsopt is not volatile. Volatile forces it to store |
587 | * the pointer value and use it directly, otherwise tcp_mkoptions is |
588 | * (mis)compiled like this: |
589 | * if (!tsopt) |
590 | * return buf - start; |
591 | * reg = stored_return_value_of_tscookie_init; |
592 | * if (reg) |
593 | * tsopt = tsopt_buf; |
594 | * else |
595 | * tsopt = NULL; |
596 | * ... |
597 | * *buf++ = tsopt[1]; |
598 | * It creates a dead branch where tsopt is assigned NULL, but the |
599 | * verifier can't prove it's dead and blocks the program. |
600 | */ |
601 | __be32 * volatile tsopt = NULL; |
602 | __be32 tsopt_buf[2] = {}; |
603 | __u16 ip_len; |
604 | __u32 cookie; |
605 | __s64 value; |
606 | |
607 | /* Checksum is not yet verified, but both checksum failure and TCP |
608 | * header checks return XDP_DROP, so the order doesn't matter. |
609 | */ |
610 | if (hdr->tcp->fin || hdr->tcp->rst) |
611 | return XDP_DROP; |
612 | |
613 | /* Issue SYN cookies on allowed ports, drop SYN packets on blocked |
614 | * ports. |
615 | */ |
616 | if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest))) |
617 | return XDP_DROP; |
618 | |
619 | if (hdr->ipv4) { |
620 | /* Check the IPv4 and TCP checksums before creating a SYNACK. */ |
621 | value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0); |
622 | if (value < 0) |
623 | return XDP_ABORTED; |
624 | if (csum_fold(value) != 0) |
625 | return XDP_DROP; /* Bad IPv4 checksum. */ |
626 | |
627 | value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); |
628 | if (value < 0) |
629 | return XDP_ABORTED; |
630 | if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr, |
631 | hdr->tcp_len, IPPROTO_TCP, value) != 0) |
632 | return XDP_DROP; /* Bad TCP checksum. */ |
633 | |
634 | ip_len = sizeof(*hdr->ipv4); |
635 | |
636 | value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp, |
637 | hdr->tcp_len); |
638 | } else if (hdr->ipv6) { |
639 | /* Check the TCP checksum before creating a SYNACK. */ |
640 | value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); |
641 | if (value < 0) |
642 | return XDP_ABORTED; |
643 | if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr, |
644 | hdr->tcp_len, IPPROTO_TCP, value) != 0) |
645 | return XDP_DROP; /* Bad TCP checksum. */ |
646 | |
647 | ip_len = sizeof(*hdr->ipv6); |
648 | |
649 | value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp, |
650 | hdr->tcp_len); |
651 | } else { |
652 | return XDP_ABORTED; |
653 | } |
654 | |
655 | if (value < 0) |
656 | return XDP_ABORTED; |
657 | cookie = (__u32)value; |
658 | |
659 | if (tscookie_init((void *)hdr->tcp, hdr->tcp_len, |
660 | &tsopt_buf[0], &tsopt_buf[1], data, data_end)) |
661 | tsopt = tsopt_buf; |
662 | |
663 | /* Check that there is enough space for a SYNACK. It also covers |
664 | * the check that the destination of the __builtin_memmove below |
665 | * doesn't overflow. |
666 | */ |
667 | if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end) |
668 | return XDP_ABORTED; |
669 | |
670 | if (hdr->ipv4) { |
671 | if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) { |
672 | struct tcphdr *; |
673 | |
674 | new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4); |
675 | __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp)); |
676 | hdr->tcp = new_tcp_header; |
677 | |
678 | hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4; |
679 | } |
680 | |
681 | tcpv4_gen_synack(hdr, cookie, tsopt); |
682 | } else if (hdr->ipv6) { |
683 | tcpv6_gen_synack(hdr, cookie, tsopt); |
684 | } else { |
685 | return XDP_ABORTED; |
686 | } |
687 | |
688 | /* Recalculate checksums. */ |
689 | hdr->tcp->check = 0; |
690 | value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0); |
691 | if (value < 0) |
692 | return XDP_ABORTED; |
693 | if (hdr->ipv4) { |
694 | hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr, |
695 | hdr->ipv4->daddr, |
696 | hdr->tcp_len, |
697 | IPPROTO_TCP, |
698 | value); |
699 | |
700 | hdr->ipv4->check = 0; |
701 | value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0); |
702 | if (value < 0) |
703 | return XDP_ABORTED; |
704 | hdr->ipv4->check = csum_fold(value); |
705 | } else if (hdr->ipv6) { |
706 | hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr, |
707 | &hdr->ipv6->daddr, |
708 | hdr->tcp_len, |
709 | IPPROTO_TCP, |
710 | value); |
711 | } else { |
712 | return XDP_ABORTED; |
713 | } |
714 | |
715 | /* Set the new packet size. */ |
716 | old_pkt_size = data_end - data; |
717 | new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4; |
718 | if (xdp) { |
719 | if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size)) |
720 | return XDP_ABORTED; |
721 | } else { |
722 | if (bpf_skb_change_tail(ctx, new_pkt_size, 0)) |
723 | return XDP_ABORTED; |
724 | } |
725 | |
726 | values_inc_synacks(); |
727 | |
728 | return XDP_TX; |
729 | } |
730 | |
731 | static __always_inline int syncookie_handle_ack(struct header_pointers *hdr) |
732 | { |
733 | int err; |
734 | |
735 | if (hdr->tcp->rst) |
736 | return XDP_DROP; |
737 | |
738 | if (hdr->ipv4) |
739 | err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp); |
740 | else if (hdr->ipv6) |
741 | err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp); |
742 | else |
743 | return XDP_ABORTED; |
744 | if (err) |
745 | return XDP_DROP; |
746 | |
747 | return XDP_PASS; |
748 | } |
749 | |
750 | static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end, |
751 | struct header_pointers *hdr, bool xdp) |
752 | { |
753 | int ret; |
754 | |
755 | ret = tcp_dissect(data, data_end, hdr); |
756 | if (ret != XDP_TX) |
757 | return ret; |
758 | |
759 | ret = tcp_lookup(ctx, hdr, xdp); |
760 | if (ret != XDP_TX) |
761 | return ret; |
762 | |
763 | /* Packet is TCP and doesn't belong to an established connection. */ |
764 | |
765 | if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1) |
766 | return XDP_DROP; |
767 | |
768 | /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len |
769 | * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier. |
770 | */ |
771 | if (xdp) { |
772 | if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len)) |
773 | return XDP_ABORTED; |
774 | } else { |
775 | /* Without volatile the verifier throws this error: |
776 | * R9 32-bit pointer arithmetic prohibited |
777 | */ |
778 | volatile u64 old_len = data_end - data; |
779 | |
780 | if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0)) |
781 | return XDP_ABORTED; |
782 | } |
783 | |
784 | return XDP_TX; |
785 | } |
786 | |
787 | static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end, |
788 | struct header_pointers *hdr, bool xdp) |
789 | { |
790 | if (hdr->ipv4) { |
791 | hdr->eth = data; |
792 | hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth); |
793 | /* IPV4_MAXLEN is needed when calculating checksum. |
794 | * At least sizeof(struct iphdr) is needed here to access ihl. |
795 | */ |
796 | if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end) |
797 | return XDP_ABORTED; |
798 | hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4; |
799 | } else if (hdr->ipv6) { |
800 | hdr->eth = data; |
801 | hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth); |
802 | hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6); |
803 | } else { |
804 | return XDP_ABORTED; |
805 | } |
806 | |
807 | if ((void *)hdr->tcp + TCP_MAXLEN > data_end) |
808 | return XDP_ABORTED; |
809 | |
810 | /* We run out of registers, tcp_len gets spilled to the stack, and the |
811 | * verifier forgets its min and max values checked above in tcp_dissect. |
812 | */ |
813 | hdr->tcp_len = hdr->tcp->doff * 4; |
814 | if (hdr->tcp_len < sizeof(*hdr->tcp)) |
815 | return XDP_ABORTED; |
816 | |
817 | return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) : |
818 | syncookie_handle_ack(hdr); |
819 | } |
820 | |
821 | SEC("xdp" ) |
822 | int syncookie_xdp(struct xdp_md *ctx) |
823 | { |
824 | void *data_end = (void *)(long)ctx->data_end; |
825 | void *data = (void *)(long)ctx->data; |
826 | struct header_pointers hdr; |
827 | int ret; |
828 | |
829 | ret = syncookie_part1(ctx, data, data_end, &hdr, true); |
830 | if (ret != XDP_TX) |
831 | return ret; |
832 | |
833 | data_end = (void *)(long)ctx->data_end; |
834 | data = (void *)(long)ctx->data; |
835 | |
836 | return syncookie_part2(ctx, data, data_end, &hdr, true); |
837 | } |
838 | |
839 | SEC("tc" ) |
840 | int syncookie_tc(struct __sk_buff *skb) |
841 | { |
842 | void *data_end = (void *)(long)skb->data_end; |
843 | void *data = (void *)(long)skb->data; |
844 | struct header_pointers hdr; |
845 | int ret; |
846 | |
847 | ret = syncookie_part1(skb, data, data_end, &hdr, false); |
848 | if (ret != XDP_TX) |
849 | return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT; |
850 | |
851 | data_end = (void *)(long)skb->data_end; |
852 | data = (void *)(long)skb->data; |
853 | |
854 | ret = syncookie_part2(skb, data, data_end, &hdr, false); |
855 | switch (ret) { |
856 | case XDP_PASS: |
857 | return TC_ACT_OK; |
858 | case XDP_TX: |
859 | return bpf_redirect(skb->ifindex, 0); |
860 | default: |
861 | return TC_ACT_SHOT; |
862 | } |
863 | } |
864 | |
865 | char _license[] SEC("license" ) = "GPL" ; |
866 | |