// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "bpf_compiler.h"
#include "test_cls_redirect.h"

#pragma GCC diagnostic ignored "-Waddress-of-packed-member"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

102
103typedef int ret_t;
104
105/* This is a bit of a hack. We need a return value which allows us to
106 * indicate that the regular flow of the program should continue,
107 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
108 */
109static const ret_t CONTINUE_PROCESSING = -1;
110
111/* Convenience macro to call functions which return ret_t.
112 */
113#define MAYBE_RETURN(x) \
114 do { \
115 ret_t __ret = x; \
116 if (__ret != CONTINUE_PROCESSING) \
117 return __ret; \
118 } while (0)
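/* Example call site (taken from get_next_hop() below): execution continues
 * past the macro only if the callee returned CONTINUE_PROCESSING; any other
 * value, e.g. a TC_ACT_* verdict, is returned to our caller immediately.
 *
 *	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
 */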

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *     a - b + c
	 * if c is known:
	 *     r? = c
	 *     r? -= b
	 *     r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

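/* Copy len bytes from the current read position into dst and advance the
 * read position. bpf_skb_load_bytes also reaches data in the non-linear
 * part of the skb. Returns false (without advancing) if the load fails.
 */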
static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

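/* Skip over any IPv4 options. ihl counts 32-bit words including the fixed
 * 20-byte header, so ihl == 5 means "no options".
 */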
static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

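/* A packet is a fragment if the More Fragments flag is set or the fragment
 * offset is non-zero.
 */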
static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

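/* Fold a 32-bit ones' complement accumulator into the final 16-bit checksum.
 * For example, an accumulator of 0x12345 folds to 0x2345 + 0x1 = 0x2346,
 * giving a checksum of ~0x2346 = 0xdcb9.
 */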
static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

	__pragma_loop_unroll_full
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

	__pragma_loop_unroll_full
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

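/* Strip the encapsulation (everything between the Ethernet header and the
 * inner packet) and bounce the result back to the ingress path of the same
 * interface, so the local network stack processes it.
 */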
static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Changing the ethertype if the encapsulated packet is ipv6 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

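/* Rewrite the encapsulation from GUE to a bare GRE-in-IPv4 header towards
 * next_hop. The inner TTL / hop limit is decremented first so that a
 * misconfigured hop list cannot produce an endless forwarding loop.
 */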
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. Since the TTL (hop limit for IPv6) is the
	 * only field we care about, it is easier to use bpf_skb_load_bytes and
	 * bpf_skb_store_bytes, as they handle split packets if needed (no need
	 * for the data to be in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one
		 * byte, this helper only works on 2- and 4-byte arguments (the
		 * result is the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

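/* Forward the (still encapsulated) packet to next_hop: use plain GRE when
 * this is the last listed hop and the header requests it, otherwise keep
 * the GUE encapsulation and advance the next_hop index.
 */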
static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

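/* Skip over n IPv4 next hop entries in the packet. Only n == 0 and n == 1
 * are supported; anything else is dropped. Note the intentional fallthrough
 * from case 1 into case 0.
 */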
static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
				   encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *	fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

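/* Classify a TCP packet against the local socket table: a socket that is
 * not in LISTEN state means the flow is ESTABLISHED here; a listening
 * socket whose SYN cookie validates against the packet means SYN_COOKIE;
 * anything else is UNKNOWN (and may be forwarded by the caller).
 */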
static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

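/* Classify a UDP packet: a connected UDP socket (reported by the kernel as
 * BPF_TCP_ESTABLISHED) means ESTABLISHED, otherwise UNKNOWN.
 */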
static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

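/* Classify the flow referenced by an ICMP error payload by looking up the
 * original TCP or UDP flow it was generated for.
 */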
static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

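/* Parse the inner TCP header and classify the packet. SYNs are reported
 * separately so the caller can decide between accepting them locally and
 * forwarding them (see the forward_syn handling in cls_redirect()).
 */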
static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

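/* Parse the inner IPv4 header and dispatch on the L4 protocol. Fragments
 * cannot be classified (the L4 header may be missing) and are dropped.
 */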
static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

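/* Parse the inner IPv6 header, including any extension headers, and
 * dispatch on the upper-layer protocol. Fragments are dropped, as for IPv4.
 */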
static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

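/* Main TC classifier: validate the GUE/unigue encapsulation, then either
 * accept the inner packet locally or redirect it to the next hop listed in
 * the encapsulation header.
 */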
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}

/* Source: linux/tools/testing/selftests/bpf/progs/test_cls_redirect.c */