// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"
#include "bpf_kfuncs.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

struct iphdr_info {
	void *hdr;
	__u64 len;
};

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)

static bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

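/* Read the IPv4 header at *offset from the packet dynptr and advance
 * *offset past the header, including any IP options.
 */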
static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
{
	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*iphdr);

	if (iphdr->ihl < 5)
		return -1;

	/* skip ipv4 options */
	*offset += (iphdr->ihl - 5) * 4;

	return 0;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
{
	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
		return false;

	*offset += sizeof(*ports);

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

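/* Recompute the checksum of an options-less IPv4 header in place. */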
static void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
		acc += ipw[i];

	iph->check = pkt_checksum_fold(acc);
}

static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
					    bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
				return false;

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			*offset += (exthdr.len + 1) * 8;

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

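/* Read the fixed IPv6 header at *offset, skip any extension headers, and
 * report the upper layer protocol and whether the packet is a fragment.
 */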
static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
			  uint8_t *proto, bool *is_fragment)
{
	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
		return -1;

	*offset += sizeof(*ipv6);

	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
		return -1;

	return 0;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

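/* Strip the encapsulation headers (the Ethernet header stays in place) and
 * redirect the inner packet to the ingress path of the interface it arrived
 * on, so the local stack processes it.
 */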
static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Changing the ethertype if the encapsulated packet is ipv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6)
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

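/* Rewrite the GUE encapsulation into a GRE tunnel towards next_hop and
 * redirect the packet out of the same interface. The inner TTL (hop limit
 * for IPv6) is decremented first as protection against forwarding loops.
 */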
static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      encap_headers_t *encap, struct in_addr *next_hop,
			      metrics_t *metrics)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;
	encap_gre_t *encap_gre;

	metrics->forwarded_packets_total_gre++;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2- and 4-byte arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap_gre) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	if (encap_gre == encap_buffer)
		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return bpf_redirect(skb->ifindex, 0);
}

static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
				 encap_headers_t *encap, struct in_addr *next_hop,
				 metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

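/* Advance *offset over n next hop addresses. Only n == 0 and n == 1 are
 * supported (note the intentional fallthrough from case 1 to case 0);
 * anything else is treated as malformed and dropped.
 */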
static ret_t skip_next_hops(__u64 *offset, int n)
{
	switch (n) {
	case 1:
		*offset += sizeof(struct in_addr);
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * *offset points just past the variable length GLB header
 * iff the call is successful.
 */
static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
			  struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count)
		return TC_ACT_SHOT;

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
		return TC_ACT_SHOT;

	*offset += sizeof(*next_hop);

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
			   uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

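/* Classify a TCP packet against local sockets: a match on a non-listening
 * socket means ESTABLISHED, a valid SYN cookie ACK means SYN_COOKIE,
 * anything else is UNKNOWN.
 */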
static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);

		if (tuplen == sizeof(tuple->ipv6))
			iphlen = sizeof(struct ipv6hdr);

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);

	if (sk == NULL)
		return UNKNOWN;

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
			       uint64_t tuplen, metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

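/* Handle an encapsulated ICMPv4 message: echo requests return ECHO_REQUEST,
 * "fragmentation needed" errors are classified against the embedded
 * original flow, everything else is flagged INVALID.
 */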
static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
				metrics_t *metrics)
{
	struct icmphdr icmp;
	struct iphdr ipv4;

	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	*offset += sizeof(icmp);

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO)
		return ECHO_REQUEST;

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4.daddr;
	tuple.ipv4.daddr = ipv4.saddr;

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, ipv4.protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

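/* Same as process_icmpv4, but for ICMPv6 and "packet too big" errors. */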
static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
				metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct ipv6hdr ipv6;
	struct icmp6hdr icmp6;
	bool is_fragment;
	uint8_t l4_proto;

	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct tcphdr tcp;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_tcp++;

	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	*offset += sizeof(tcp);

	if (tcp.syn)
		return SYN;

	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
}

static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
			     struct iphdr_info *info, metrics_t *metrics)
{
	struct bpf_sock_tuple tuple;
	struct udphdr udph;
	uint64_t tuplen;

	metrics->l4_protocol_packets_total_udp++;

	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}
	*offset += sizeof(udph);

	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
	return classify_udp(skb, &tuple, tuplen);
}

static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct iphdr ipv4;
	struct iphdr_info info = {
		.hdr = &ipv4,
		.len = sizeof(ipv4),
	};

	metrics->l3_protocol_packets_total_ipv4++;

	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4.version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(&ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4.protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(skb, dynptr, offset, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
			      __u64 *offset, metrics_t *metrics)
{
	struct ipv6hdr ipv6;
	struct iphdr_info info = {
		.hdr = &ipv6,
		.len = sizeof(ipv6),
	};
	uint8_t l4_proto;
	bool is_fragment;

	metrics->l3_protocol_packets_total_ipv6++;

	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6.version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(dynptr, offset, skb, metrics);

	case IPPROTO_TCP:
		return process_tcp(dynptr, offset, skb, &info, metrics);

	case IPPROTO_UDP:
		return process_udp(dynptr, offset, skb, &info, metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

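/* Main entry point: validate the GUE encapsulation, then either accept the
 * inner packet locally or forward it to the next hop taken from the variable
 * length unigue header.
 */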
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
	struct bpf_dynptr dynptr;
	struct in_addr next_hop;
	/* Tracks offset of the dynptr. This will be unnecessary once
	 * bpf_dynptr_advance() is available.
	 */
	__u64 off = 0;
	ret_t ret;

	bpf_dynptr_from_skb(skb, 0, &dynptr);

	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL)
		return TC_ACT_SHOT;

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP))
		return TC_ACT_OK;

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap)))
		return TC_ACT_OK;

	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
	if (!encap)
		return TC_ACT_OK;

	off += sizeof(*encap);

	if (encap->ip.ihl != 5)
		/* We never have any options. */
		return TC_ACT_OK;

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP)
		return TC_ACT_OK;

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT)
		return TC_ACT_OK;

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0)
		return TC_ACT_SHOT;

	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(skb, &dynptr, &off, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(skb, &dynptr, &off, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	ret = accept_locally(skb, encap);

	if (encap == encap_buffer)
		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);

	return ret;
}

/* Source: tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c */