1 | // SPDX-License-Identifier: GPL-2.0 |
2 | // Copyright (c) 2017 Facebook |
3 | #include <stddef.h> |
4 | #include <stdbool.h> |
5 | #include <string.h> |
6 | #include <linux/pkt_cls.h> |
7 | #include <linux/bpf.h> |
8 | #include <linux/in.h> |
9 | #include <linux/if_ether.h> |
10 | #include <linux/ip.h> |
11 | #include <linux/ipv6.h> |
12 | #include <linux/icmp.h> |
13 | #include <linux/icmpv6.h> |
14 | #include <linux/tcp.h> |
15 | #include <linux/udp.h> |
16 | #include <bpf/bpf_helpers.h> |
17 | #include "test_iptunnel_common.h" |
18 | #include <bpf/bpf_endian.h> |
19 | |
20 | #include "bpf_kfuncs.h" |
21 | |
/* Rotate the 32-bit value @word left by @shift bit positions. */
static __always_inline __u32 rol32(__u32 word, unsigned int shift)
{
	/* Matching right-shift distance, masked into the range 0..31. */
	const unsigned int rshift = (-shift) & 31;

	return (word >> rshift) | (word << shift);
}
26 | |
/*
 * Copy of jhash from the kernel sources, kept local to make sure LLVM can
 * compile it into a valid sequence of BPF instructions.
 *
 * __jhash_mix - mix the three 32-bit state words a, b and c in place.
 *
 * All three arguments must be modifiable lvalues and are evaluated several
 * times, so do not pass expressions with side effects. The body is wrapped
 * in do { } while (0) so that "__jhash_mix(a, b, c);" is exactly one
 * statement, including when used as the body of an unbraced if/else.
 */
#define __jhash_mix(a, b, c)			\
do {						\
	a -= c;  a ^= rol32(c, 4);  c += b;	\
	b -= a;  b ^= rol32(a, 6);  a += c;	\
	c -= b;  c ^= rol32(b, 8);  b += a;	\
	a -= c;  a ^= rol32(c, 16); c += b;	\
	b -= a;  b ^= rol32(a, 19); a += c;	\
	c -= b;  c ^= rol32(b, 4);  b += a;	\
} while (0)
39 | |
/*
 * __jhash_final - final mixing of the three 32-bit state words into c.
 *
 * All three arguments must be modifiable lvalues and are evaluated several
 * times, so do not pass expressions with side effects. The body is wrapped
 * in do { } while (0) so that "__jhash_final(a, b, c);" is exactly one
 * statement, including when used as the body of an unbraced if/else.
 */
#define __jhash_final(a, b, c)			\
do {						\
	c ^= b; c -= rol32(b, 14);		\
	a ^= c; a -= rol32(c, 11);		\
	b ^= a; b -= rol32(a, 25);		\
	c ^= b; c -= rol32(b, 16);		\
	a ^= c; a -= rol32(c, 4);		\
	b ^= a; b -= rol32(a, 14);		\
	c ^= b; c -= rol32(b, 24);		\
} while (0)
50 | |
/* Constant folded into the initial hash state by jhash() and jhash_2words(). */
#define JHASH_INITVAL 0xdeadbeef

/* Short alias for unsigned int used by the hash helpers in this file. */
typedef unsigned int u32;
54 | |
55 | static __noinline u32 jhash(const void *key, u32 length, u32 initval) |
56 | { |
57 | u32 a, b, c; |
58 | const unsigned char *k = key; |
59 | |
60 | a = b = c = JHASH_INITVAL + length + initval; |
61 | |
62 | while (length > 12) { |
63 | a += *(u32 *)(k); |
64 | b += *(u32 *)(k + 4); |
65 | c += *(u32 *)(k + 8); |
66 | __jhash_mix(a, b, c); |
67 | length -= 12; |
68 | k += 12; |
69 | } |
70 | switch (length) { |
71 | case 12: c += (u32)k[11]<<24; |
72 | case 11: c += (u32)k[10]<<16; |
73 | case 10: c += (u32)k[9]<<8; |
74 | case 9: c += k[8]; |
75 | case 8: b += (u32)k[7]<<24; |
76 | case 7: b += (u32)k[6]<<16; |
77 | case 6: b += (u32)k[5]<<8; |
78 | case 5: b += k[4]; |
79 | case 4: a += (u32)k[3]<<24; |
80 | case 3: a += (u32)k[2]<<16; |
81 | case 2: a += (u32)k[1]<<8; |
82 | case 1: a += k[0]; |
83 | __jhash_final(a, b, c); |
84 | case 0: /* Nothing left to add */ |
85 | break; |
86 | } |
87 | |
88 | return c; |
89 | } |
90 | |
91 | static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) |
92 | { |
93 | a += initval; |
94 | b += initval; |
95 | c += initval; |
96 | __jhash_final(a, b, c); |
97 | return c; |
98 | } |
99 | |
100 | static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval) |
101 | { |
102 | return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); |
103 | } |
104 | |
/*
 * Mask tested against the raw iph->frag_off in process_packet(); a non-zero
 * result drops the packet. 65343 == 0xFF3F.
 * NOTE(review): presumably the byte-swapped form of (IP_MF | IP_OFFSET) as
 * seen on a little-endian host - confirm.
 */
#define PCKT_FRAGMENTED 65343
/* Offset advance past the IPv4 header; process_packet() requires ihl == 5. */
#define IPV4_HDR_LEN_NO_OPT 20
/* Extra offset advance after parse_icmp() returns a negative action. */
#define IPV4_PLUS_ICMP_HDR 28
/* Extra offset advance after parse_icmpv6() returns a negative action. */
#define IPV6_PLUS_ICMP_HDR 48
/* Number of ch_rings slots per VIP (see the key math in get_packet_dst()). */
#define RING_SIZE 2
/* Capacity of vip_map and stats; also the jhash() seed in get_packet_hash(). */
#define MAX_VIPS 12
/* Capacity of the reals map. */
#define MAX_REALS 5
/* Capacity of ctl_array. */
#define CTL_MAP_SIZE 16
/* Capacity of ch_rings; also the jhash_2words() seed in get_packet_hash(). */
#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
/* Tested against struct real_definition.flags in process_packet(). */
#define F_IPV6 (1 << 0)
/* Tested against struct vip_meta.flags in process_packet(). */
#define F_HASH_NO_SRC_PORT (1 << 0)
/* Bits of struct packet_description.flags, set by the parse_*() helpers. */
#define F_ICMP (1 << 0)
#define F_SYN_SET (1 << 1)
118 | |
/*
 * Flow description filled in while parsing a packet and then used for the
 * VIP lookup and for hashing.
 */
struct packet_description {
	/* Source address; the IPv4 and IPv6 forms share storage. */
	union {
		__be32 src;
		__be32 srcv6[4];
	};
	/* Destination address; the IPv4 and IPv6 forms share storage. */
	union {
		__be32 dst;
		__be32 dstv6[4];
	};
	/*
	 * L4 ports. parse_tcp()/parse_udp() store the header's source port in
	 * port16[0] and its dest port in port16[1], swapped when F_ICMP is set.
	 * ports is the same storage viewed as one 32-bit word for hashing.
	 */
	union {
		__u32 ports;
		__u16 port16[2];
	};
	/* L4 protocol number taken from the IP header that was parsed. */
	__u8 proto;
	/* F_ICMP / F_SYN_SET bits. */
	__u8 flags;
};
135 | |
/*
 * Value type of ctl_array. The three members overlap; the code in this file
 * only reads ifindex (in process_packet()).
 */
struct ctl_value {
	union {
		__u64 value;
		__u32 ifindex;
		__u8 mac[6];
	};
};
143 | |
/* Value type of vip_map: per-VIP settings found by process_packet(). */
struct vip_meta {
	/* Tested against F_HASH_NO_SRC_PORT. */
	__u32 flags;
	/* Index into stats; also selects this VIP's slots in ch_rings. */
	__u32 vip_num;
};
148 | |
/*
 * Value type of the reals map: the address process_packet() copies into the
 * tunnel key as the remote endpoint.
 */
struct real_definition {
	/* Remote address; the IPv4 and IPv6 forms share storage. */
	union {
		__be32 dst;
		__be32 dstv6[4];
	};
	/* Tested against F_IPV6 to choose between dstv6 and dst. */
	__u8 flags;
};
156 | |
/* Value type of the stats map: counters updated in process_packet(). */
struct vip_stats {
	/* Incremented by pkt_bytes for each redirected packet. */
	__u64 bytes;
	/* Incremented by one for each redirected packet. */
	__u64 pkts;
};
161 | |
/* Ethernet header layout read at offset 0 of the packet by balancer_ingress(). */
struct eth_hdr {
	unsigned char eth_dest[ETH_ALEN];
	unsigned char eth_source[ETH_ALEN];
	/* Compared against bpf_htons(ETH_P_IP) and bpf_htons(ETH_P_IPV6). */
	unsigned short eth_proto;
};
167 | |
/*
 * Hash map from struct vip to struct vip_meta, holding up to MAX_VIPS
 * entries. process_packet() looks it up with the packet's destination
 * address, destination port and protocol, retrying with dport = 0 on a miss.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_VIPS);
	__type(key, struct vip);
	__type(value, struct vip_meta);
} vip_map SEC(".maps");
174 | |
/*
 * Array of CH_RINGS_SIZE __u32 values. get_packet_dst() indexes it with
 * RING_SIZE * vip_num + hash % RING_SIZE and uses the stored value as the
 * key into the reals map.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, CH_RINGS_SIZE);
	__type(key, __u32);
	__type(value, __u32);
} ch_rings SEC(".maps");
181 | |
/*
 * Array of MAX_REALS struct real_definition entries, indexed by the value
 * that get_packet_dst() reads from ch_rings.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, MAX_REALS);
	__type(key, __u32);
	__type(value, struct real_definition);
} reals SEC(".maps");
188 | |
/*
 * Per-CPU array of MAX_VIPS struct vip_stats entries, indexed by
 * vip_meta.vip_num and updated in process_packet().
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, MAX_VIPS);
	__type(key, __u32);
	__type(value, struct vip_stats);
} stats SEC(".maps");
195 | |
/*
 * Array of CTL_MAP_SIZE struct ctl_value entries. process_packet() reads
 * index 1 for IPv4 backends and index 2 for IPv6 backends, and uses the
 * entry's ifindex as the redirect target.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, CTL_MAP_SIZE);
	__type(key, __u32);
	__type(value, struct ctl_value);
} ctl_array SEC(".maps");
202 | |
203 | static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6) |
204 | { |
205 | if (ipv6) |
206 | return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), |
207 | pckt->ports, CH_RINGS_SIZE); |
208 | else |
209 | return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); |
210 | } |
211 | |
212 | static __noinline bool get_packet_dst(struct real_definition **real, |
213 | struct packet_description *pckt, |
214 | struct vip_meta *vip_info, |
215 | bool is_ipv6) |
216 | { |
217 | __u32 hash = get_packet_hash(pckt, is_ipv6); |
218 | __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE; |
219 | __u32 *real_pos; |
220 | |
221 | if (hash != 0x358459b7 /* jhash of ipv4 packet */ && |
222 | hash != 0x2f4bc6bb /* jhash of ipv6 packet */) |
223 | return false; |
224 | |
225 | real_pos = bpf_map_lookup_elem(&ch_rings, &key); |
226 | if (!real_pos) |
227 | return false; |
228 | key = *real_pos; |
229 | *real = bpf_map_lookup_elem(&reals, &key); |
230 | if (!(*real)) |
231 | return false; |
232 | return true; |
233 | } |
234 | |
235 | static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off, |
236 | struct packet_description *pckt) |
237 | { |
238 | __u8 buffer[sizeof(struct ipv6hdr)] = {}; |
239 | struct icmp6hdr *icmp_hdr; |
240 | struct ipv6hdr *ip6h; |
241 | |
242 | icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer)); |
243 | if (!icmp_hdr) |
244 | return TC_ACT_SHOT; |
245 | |
246 | if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG) |
247 | return TC_ACT_OK; |
248 | off += sizeof(struct icmp6hdr); |
249 | ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer)); |
250 | if (!ip6h) |
251 | return TC_ACT_SHOT; |
252 | pckt->proto = ip6h->nexthdr; |
253 | pckt->flags |= F_ICMP; |
254 | memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16); |
255 | memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16); |
256 | return TC_ACT_UNSPEC; |
257 | } |
258 | |
259 | static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off, |
260 | struct packet_description *pckt) |
261 | { |
262 | __u8 buffer_icmp[sizeof(struct iphdr)] = {}; |
263 | __u8 buffer_ip[sizeof(struct iphdr)] = {}; |
264 | struct icmphdr *icmp_hdr; |
265 | struct iphdr *iph; |
266 | |
267 | icmp_hdr = bpf_dynptr_slice(skb_ptr, off, buffer_icmp, sizeof(buffer_icmp)); |
268 | if (!icmp_hdr) |
269 | return TC_ACT_SHOT; |
270 | if (icmp_hdr->type != ICMP_DEST_UNREACH || |
271 | icmp_hdr->code != ICMP_FRAG_NEEDED) |
272 | return TC_ACT_OK; |
273 | off += sizeof(struct icmphdr); |
274 | iph = bpf_dynptr_slice(skb_ptr, off, buffer_ip, sizeof(buffer_ip)); |
275 | if (!iph || iph->ihl != 5) |
276 | return TC_ACT_SHOT; |
277 | pckt->proto = iph->protocol; |
278 | pckt->flags |= F_ICMP; |
279 | pckt->src = iph->daddr; |
280 | pckt->dst = iph->saddr; |
281 | return TC_ACT_UNSPEC; |
282 | } |
283 | |
284 | static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off, |
285 | struct packet_description *pckt) |
286 | { |
287 | __u8 buffer[sizeof(struct udphdr)] = {}; |
288 | struct udphdr *udp; |
289 | |
290 | udp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer)); |
291 | if (!udp) |
292 | return false; |
293 | |
294 | if (!(pckt->flags & F_ICMP)) { |
295 | pckt->port16[0] = udp->source; |
296 | pckt->port16[1] = udp->dest; |
297 | } else { |
298 | pckt->port16[0] = udp->dest; |
299 | pckt->port16[1] = udp->source; |
300 | } |
301 | return true; |
302 | } |
303 | |
304 | static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off, |
305 | struct packet_description *pckt) |
306 | { |
307 | __u8 buffer[sizeof(struct tcphdr)] = {}; |
308 | struct tcphdr *tcp; |
309 | |
310 | tcp = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer)); |
311 | if (!tcp) |
312 | return false; |
313 | |
314 | if (tcp->syn) |
315 | pckt->flags |= F_SYN_SET; |
316 | |
317 | if (!(pckt->flags & F_ICMP)) { |
318 | pckt->port16[0] = tcp->source; |
319 | pckt->port16[1] = tcp->dest; |
320 | } else { |
321 | pckt->port16[0] = tcp->dest; |
322 | pckt->port16[1] = tcp->source; |
323 | } |
324 | return true; |
325 | } |
326 | |
/*
 * process_packet - parse one IPv4/IPv6 packet, pick a backend for it and
 * redirect it with a tunnel key set.
 * @skb_ptr: dynptr over the skb, used for every header read
 * @eth:     Ethernet header slice obtained by the caller (written to below)
 * @off:     offset of the L3 header within the dynptr
 * @is_ipv6: true to parse an IPv6 header at @off, false for IPv4
 * @skb:     skb context, passed to bpf_skb_set_tunnel_key()
 *
 * Returns TC_ACT_SHOT on any parse or lookup failure, the non-negative
 * action handed back by parse_icmp()/parse_icmpv6() when there is one, and
 * otherwise the result of bpf_redirect().
 */
static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
				     struct eth_hdr *eth, __u64 off,
				     bool is_ipv6, struct __sk_buff *skb)
{
	struct packet_description pckt = {};
	struct bpf_tunnel_key tkey = {};
	struct vip_stats *data_stats;
	struct real_definition *dst;
	struct vip_meta *vip_info;
	struct ctl_value *cval;
	/* ctl_array index read when the chosen backend is IPv4. */
	__u32 v4_intf_pos = 1;
	/* ctl_array index read when the chosen backend is IPv6. */
	__u32 v6_intf_pos = 2;
	struct ipv6hdr *ip6h;
	struct vip vip = {};
	struct iphdr *iph;
	int tun_flag = 0;
	__u16 pkt_bytes;
	__u64 iph_len;
	__u32 ifindex;
	__u8 protocol;
	__u32 vip_num;
	int action;

	tkey.tunnel_ttl = 64;
	if (is_ipv6) {
		__u8 buffer[sizeof(struct ipv6hdr)] = {};

		ip6h = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
		if (!ip6h)
			return TC_ACT_SHOT;

		iph_len = sizeof(struct ipv6hdr);
		protocol = ip6h->nexthdr;
		pckt.proto = protocol;
		pkt_bytes = bpf_ntohs(ip6h->payload_len);
		off += iph_len;
		if (protocol == IPPROTO_FRAGMENT) {
			/* Fragmented IPv6 packets are dropped. */
			return TC_ACT_SHOT;
		} else if (protocol == IPPROTO_ICMPV6) {
			/*
			 * A non-negative result is a final verdict; a negative
			 * one means pckt now describes the embedded packet.
			 */
			action = parse_icmpv6(skb_ptr, off, &pckt);
			if (action >= 0)
				return action;
			/* Skip the ICMPv6 header plus the embedded IPv6 header. */
			off += IPV6_PLUS_ICMP_HDR;
		} else {
			memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
			memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
		}
	} else {
		__u8 buffer[sizeof(struct iphdr)] = {};

		/* Only IPv4 headers without options (ihl == 5) are accepted. */
		iph = bpf_dynptr_slice(skb_ptr, off, buffer, sizeof(buffer));
		if (!iph || iph->ihl != 5)
			return TC_ACT_SHOT;

		protocol = iph->protocol;
		pckt.proto = protocol;
		pkt_bytes = bpf_ntohs(iph->tot_len);
		off += IPV4_HDR_LEN_NO_OPT;

		/* Any bit of the PCKT_FRAGMENTED mask set in frag_off: drop. */
		if (iph->frag_off & PCKT_FRAGMENTED)
			return TC_ACT_SHOT;
		if (protocol == IPPROTO_ICMP) {
			/*
			 * A non-negative result is a final verdict; a negative
			 * one means pckt now describes the embedded packet.
			 */
			action = parse_icmp(skb_ptr, off, &pckt);
			if (action >= 0)
				return action;
			/* Skip the ICMP header plus the embedded IPv4 header. */
			off += IPV4_PLUS_ICMP_HDR;
		} else {
			pckt.src = iph->saddr;
			pckt.dst = iph->daddr;
		}
	}
	/* The ICMP parsers may have replaced proto with the embedded one. */
	protocol = pckt.proto;

	/* Only TCP and UDP are handled; everything else is dropped. */
	if (protocol == IPPROTO_TCP) {
		if (!parse_tcp(skb_ptr, off, &pckt))
			return TC_ACT_SHOT;
	} else if (protocol == IPPROTO_UDP) {
		if (!parse_udp(skb_ptr, off, &pckt))
			return TC_ACT_SHOT;
	} else {
		return TC_ACT_SHOT;
	}

	/* Build the VIP key from destination address, port and protocol. */
	if (is_ipv6)
		memcpy(vip.daddr.v6, pckt.dstv6, 16);
	else
		vip.daddr.v4 = pckt.dst;

	vip.dport = pckt.port16[1];
	vip.protocol = pckt.proto;
	vip_info = bpf_map_lookup_elem(&vip_map, &vip);
	if (!vip_info) {
		/* Retry with dport = 0; on a hit the port is left out of the hash. */
		vip.dport = 0;
		vip_info = bpf_map_lookup_elem(&vip_map, &vip);
		if (!vip_info)
			return TC_ACT_SHOT;
		pckt.port16[1] = 0;
	}

	/* This VIP asks for the source port to be left out of the hash. */
	if (vip_info->flags & F_HASH_NO_SRC_PORT)
		pckt.port16[0] = 0;

	if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
		return TC_ACT_SHOT;

	/* Fill the tunnel key and pick the egress ifindex by backend family. */
	if (dst->flags & F_IPV6) {
		cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
		if (!cval)
			return TC_ACT_SHOT;
		ifindex = cval->ifindex;
		memcpy(tkey.remote_ipv6, dst->dstv6, 16);
		tun_flag = BPF_F_TUNINFO_IPV6;
	} else {
		cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
		if (!cval)
			return TC_ACT_SHOT;
		ifindex = cval->ifindex;
		tkey.remote_ipv4 = dst->dst;
	}
	/* Account the packet against the matched VIP. */
	vip_num = vip_info->vip_num;
	data_stats = bpf_map_lookup_elem(&stats, &vip_num);
	if (!data_stats)
		return TC_ACT_SHOT;
	data_stats->pkts++;
	data_stats->bytes += pkt_bytes;
	/* The helper's return value is not checked. */
	bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
	/*
	 * Overwrites the first four bytes of the destination MAC with
	 * tkey.remote_ipv4.
	 * NOTE(review): presumably lets the userspace side of the test see
	 * which backend was chosen - confirm.
	 */
	*(u32 *)eth->eth_dest = tkey.remote_ipv4;
	return bpf_redirect(ifindex, 0);
}
456 | |
457 | SEC("tc" ) |
458 | int balancer_ingress(struct __sk_buff *ctx) |
459 | { |
460 | __u8 buffer[sizeof(struct eth_hdr)] = {}; |
461 | struct bpf_dynptr ptr; |
462 | struct eth_hdr *eth; |
463 | __u32 eth_proto; |
464 | __u32 nh_off; |
465 | int err; |
466 | |
467 | nh_off = sizeof(struct eth_hdr); |
468 | |
469 | bpf_dynptr_from_skb(ctx, 0, &ptr); |
470 | eth = bpf_dynptr_slice_rdwr(&ptr, 0, buffer, sizeof(buffer)); |
471 | if (!eth) |
472 | return TC_ACT_SHOT; |
473 | eth_proto = eth->eth_proto; |
474 | if (eth_proto == bpf_htons(ETH_P_IP)) |
475 | err = process_packet(&ptr, eth, nh_off, false, ctx); |
476 | else if (eth_proto == bpf_htons(ETH_P_IPV6)) |
477 | err = process_packet(&ptr, eth, nh_off, true, ctx); |
478 | else |
479 | return TC_ACT_SHOT; |
480 | |
481 | if (eth == buffer) |
482 | bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0); |
483 | |
484 | return err; |
485 | } |
486 | |
487 | char _license[] SEC("license" ) = "GPL" ; |
488 | |