1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright (c) 2019 Facebook */ |
3 | |
4 | #include <linux/bpf.h> |
5 | #include <netinet/in.h> |
6 | #include <stdbool.h> |
7 | |
8 | #include <bpf/bpf_helpers.h> |
9 | #include <bpf/bpf_endian.h> |
10 | #include "bpf_tcp_helpers.h" |
11 | |
/*
 * Slots of linum_map. Each program in this file owns one slot and
 * RET_LOG() stores there the source line at which the program bailed out.
 */
enum bpf_linum_array_idx {
	EGRESS_LINUM_IDX = 0,		/* egress_read_sock_fields() */
	INGRESS_LINUM_IDX,		/* ingress_read_sock_fields() */
	READ_SK_DST_PORT_LINUM_IDX,	/* read_sk_dst_port() */
	__NR_BPF_LINUM_ARRAY_IDX,	/* slot count; used as max_entries */
};
18 | |
/*
 * Array map with one __u32 slot per bpf_linum_array_idx value.
 * RET_LOG() writes the current source line (__LINE__) into the slot
 * selected by the calling program's linum_idx.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, __NR_BPF_LINUM_ARRAY_IDX);
	__type(key, __u32);
	__type(value, __u32);
} linum_map SEC(".maps" );
25 | |
/*
 * Value type of the two sk-storage maps in this file: a counter
 * together with a spin lock.
 */
struct bpf_spinlock_cnt {
	struct bpf_spin_lock lock;
	__u32 cnt;	/* increased by egress_read_sock_fields() */
};
30 | |
/*
 * Per-socket storage. egress_read_sock_fields() adds 1 to the counter
 * of the matching socket without taking the embedded lock.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt SEC(".maps" );
37 | |
/*
 * Per-socket storage. egress_read_sock_fields() adds 10 to the counter
 * of the matching socket while holding the embedded spin lock.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct bpf_spinlock_cnt);
} sk_pkt_out_cnt10 SEC(".maps" );
44 | |
/*
 * File-scope state used by the programs below. Each note names the code
 * in this file that touches the variable.
 */
struct bpf_tcp_sock listen_tp = {};	/* tpcpy() target in ingress_read_sock_fields() */
struct sockaddr_in6 srv_sa6 = {};	/* only sin6_port is read here; presumably filled in by the loader - TODO confirm */
struct bpf_tcp_sock cli_tp = {};	/* client-side tpcpy() target in egress_read_sock_fields() */
struct bpf_tcp_sock srv_tp = {};	/* server-side tpcpy() target in egress_read_sock_fields() */
struct bpf_sock listen_sk = {};		/* skcpy() target in ingress_read_sock_fields() */
struct bpf_sock srv_sk = {};		/* server-side skcpy() target in egress_read_sock_fields() */
struct bpf_sock cli_sk = {};		/* client-side skcpy() target in egress_read_sock_fields() */
__u64 parent_cg_id = 0;			/* bpf_sk_ancestor_cgroup_id(ktp, 2) of the server socket */
__u64 child_cg_id = 0;			/* bpf_sk_cgroup_id(ktp) of the server socket */
__u64 lsndtime = 0;			/* ktp->lsndtime of the server socket */
55 | |
56 | static bool is_loopback6(__u32 *a6) |
57 | { |
58 | return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); |
59 | } |
60 | |
/*
 * Copy the struct bpf_sock fields listed below from @src to @dst,
 * one assignment per field.
 *
 * NOTE(review): presumably copied field by field rather than with a
 * struct assignment because @src is a verifier-checked socket pointer
 * whose accesses are rewritten individually - confirm.
 */
static void skcpy(struct bpf_sock *dst,
		  const struct bpf_sock *src)
{
	dst->bound_dev_if = src->bound_dev_if;
	dst->family = src->family;
	dst->type = src->type;
	dst->protocol = src->protocol;
	dst->mark = src->mark;
	dst->priority = src->priority;
	/* source address (IPv4 and IPv6 forms) and port */
	dst->src_ip4 = src->src_ip4;
	dst->src_ip6[0] = src->src_ip6[0];
	dst->src_ip6[1] = src->src_ip6[1];
	dst->src_ip6[2] = src->src_ip6[2];
	dst->src_ip6[3] = src->src_ip6[3];
	dst->src_port = src->src_port;
	/* destination address (IPv4 and IPv6 forms) and port */
	dst->dst_ip4 = src->dst_ip4;
	dst->dst_ip6[0] = src->dst_ip6[0];
	dst->dst_ip6[1] = src->dst_ip6[1];
	dst->dst_ip6[2] = src->dst_ip6[2];
	dst->dst_ip6[3] = src->dst_ip6[3];
	dst->dst_port = src->dst_port;
	dst->state = src->state;
}
84 | |
/*
 * Copy the struct bpf_tcp_sock fields listed below from @src to @dst,
 * one assignment per field.
 *
 * NOTE(review): presumably copied field by field for the same reason
 * as skcpy() (individually rewritten accesses to @src) - confirm.
 */
static void tpcpy(struct bpf_tcp_sock *dst,
		  const struct bpf_tcp_sock *src)
{
	dst->snd_cwnd = src->snd_cwnd;
	dst->srtt_us = src->srtt_us;
	dst->rtt_min = src->rtt_min;
	dst->snd_ssthresh = src->snd_ssthresh;
	dst->rcv_nxt = src->rcv_nxt;
	dst->snd_nxt = src->snd_nxt;
	dst->snd_una = src->snd_una;
	dst->mss_cache = src->mss_cache;
	dst->ecn_flags = src->ecn_flags;
	dst->rate_delivered = src->rate_delivered;
	dst->rate_interval_us = src->rate_interval_us;
	dst->packets_out = src->packets_out;
	dst->retrans_out = src->retrans_out;
	dst->total_retrans = src->total_retrans;
	dst->segs_in = src->segs_in;
	dst->data_segs_in = src->data_segs_in;
	dst->segs_out = src->segs_out;
	dst->data_segs_out = src->data_segs_out;
	dst->lost_out = src->lost_out;
	dst->sacked_out = src->sacked_out;
	dst->bytes_received = src->bytes_received;
	dst->bytes_acked = src->bytes_acked;
}
111 | |
/* Verdict returned on every path so that no packet is ever filtered out. */
#define CG_OK 1

/*
 * Record the current source line in the caller's linum_map slot and
 * leave the calling program with CG_OK.
 *
 * The caller must have __u32 locals named linum and linum_idx in scope.
 * The macro contains a return statement, so it has to be used as a
 * statement directly inside the program function.
 */
#define RET_LOG() do {							\
	linum = __LINE__;						\
	bpf_map_update_elem(&linum_map, &linum_idx, &linum, BPF_ANY);	\
	return CG_OK;							\
} while (0)
120 | |
121 | SEC("cgroup_skb/egress" ) |
122 | int egress_read_sock_fields(struct __sk_buff *skb) |
123 | { |
124 | struct bpf_spinlock_cnt cli_cnt_init = { .lock = {}, .cnt = 0xeB9F }; |
125 | struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10; |
126 | struct bpf_tcp_sock *tp, *tp_ret; |
127 | struct bpf_sock *sk, *sk_ret; |
128 | __u32 linum, linum_idx; |
129 | struct tcp_sock *ktp; |
130 | |
131 | linum_idx = EGRESS_LINUM_IDX; |
132 | |
133 | sk = skb->sk; |
134 | if (!sk) |
135 | RET_LOG(); |
136 | |
137 | /* Not testing the egress traffic or the listening socket, |
138 | * which are covered by the cgroup_skb/ingress test program. |
139 | */ |
140 | if (sk->family != AF_INET6 || !is_loopback6(a6: sk->src_ip6) || |
141 | sk->state == BPF_TCP_LISTEN) |
142 | return CG_OK; |
143 | |
144 | if (sk->src_port == bpf_ntohs(srv_sa6.sin6_port)) { |
145 | /* Server socket */ |
146 | sk_ret = &srv_sk; |
147 | tp_ret = &srv_tp; |
148 | } else if (sk->dst_port == srv_sa6.sin6_port) { |
149 | /* Client socket */ |
150 | sk_ret = &cli_sk; |
151 | tp_ret = &cli_tp; |
152 | } else { |
153 | /* Not the testing egress traffic */ |
154 | return CG_OK; |
155 | } |
156 | |
157 | /* It must be a fullsock for cgroup_skb/egress prog */ |
158 | sk = bpf_sk_fullsock(sk); |
159 | if (!sk) |
160 | RET_LOG(); |
161 | |
162 | /* Not the testing egress traffic */ |
163 | if (sk->protocol != IPPROTO_TCP) |
164 | return CG_OK; |
165 | |
166 | tp = bpf_tcp_sock(sk); |
167 | if (!tp) |
168 | RET_LOG(); |
169 | |
170 | skcpy(dst: sk_ret, src: sk); |
171 | tpcpy(dst: tp_ret, src: tp); |
172 | |
173 | if (sk_ret == &srv_sk) { |
174 | ktp = bpf_skc_to_tcp_sock(sk); |
175 | |
176 | if (!ktp) |
177 | RET_LOG(); |
178 | |
179 | lsndtime = ktp->lsndtime; |
180 | |
181 | child_cg_id = bpf_sk_cgroup_id(ktp); |
182 | if (!child_cg_id) |
183 | RET_LOG(); |
184 | |
185 | parent_cg_id = bpf_sk_ancestor_cgroup_id(ktp, 2); |
186 | if (!parent_cg_id) |
187 | RET_LOG(); |
188 | |
189 | /* The userspace has created it for srv sk */ |
190 | pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, ktp, 0, 0); |
191 | pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, ktp, |
192 | 0, 0); |
193 | } else { |
194 | pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk, |
195 | &cli_cnt_init, |
196 | BPF_SK_STORAGE_GET_F_CREATE); |
197 | pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, |
198 | sk, &cli_cnt_init, |
199 | BPF_SK_STORAGE_GET_F_CREATE); |
200 | } |
201 | |
202 | if (!pkt_out_cnt || !pkt_out_cnt10) |
203 | RET_LOG(); |
204 | |
205 | /* Even both cnt and cnt10 have lock defined in their BTF, |
206 | * intentionally one cnt takes lock while one does not |
207 | * as a test for the spinlock support in BPF_MAP_TYPE_SK_STORAGE. |
208 | */ |
209 | pkt_out_cnt->cnt += 1; |
210 | bpf_spin_lock(&pkt_out_cnt10->lock); |
211 | pkt_out_cnt10->cnt += 10; |
212 | bpf_spin_unlock(&pkt_out_cnt10->lock); |
213 | |
214 | return CG_OK; |
215 | } |
216 | |
217 | SEC("cgroup_skb/ingress" ) |
218 | int ingress_read_sock_fields(struct __sk_buff *skb) |
219 | { |
220 | struct bpf_tcp_sock *tp; |
221 | __u32 linum, linum_idx; |
222 | struct bpf_sock *sk; |
223 | |
224 | linum_idx = INGRESS_LINUM_IDX; |
225 | |
226 | sk = skb->sk; |
227 | if (!sk) |
228 | RET_LOG(); |
229 | |
230 | /* Not the testing ingress traffic to the server */ |
231 | if (sk->family != AF_INET6 || !is_loopback6(a6: sk->src_ip6) || |
232 | sk->src_port != bpf_ntohs(srv_sa6.sin6_port)) |
233 | return CG_OK; |
234 | |
235 | /* Only interested in the listening socket */ |
236 | if (sk->state != BPF_TCP_LISTEN) |
237 | return CG_OK; |
238 | |
239 | /* It must be a fullsock for cgroup_skb/ingress prog */ |
240 | sk = bpf_sk_fullsock(sk); |
241 | if (!sk) |
242 | RET_LOG(); |
243 | |
244 | tp = bpf_tcp_sock(sk); |
245 | if (!tp) |
246 | RET_LOG(); |
247 | |
248 | skcpy(dst: &listen_sk, src: sk); |
249 | tpcpy(dst: &listen_tp, src: tp); |
250 | |
251 | return CG_OK; |
252 | } |
253 | |
/*
 * Return true if a 4-byte load at the dst_port offset of @sk equals
 * bpf_htons(0xcafe).
 *
 * NOTE: 4-byte load from bpf_sock at dst_port offset is quirky. It
 * gets rewritten by the access converter to a 2-byte load for
 * backward compatibility. Treating the load result as a be16 value
 * makes the code portable across little- and big-endian platforms.
 */
static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
{
	/* Deliberately read the field through a 32-bit pointer. */
	__u32 *word = (__u32 *)&sk->dst_port;
	return word[0] == bpf_htons(0xcafe);
}
265 | |
/*
 * Return true if a 2-byte load at the dst_port offset of @sk equals
 * bpf_htons(0xcafe).
 */
static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
{
	__u16 *half;

	/* NOTE(review): empty asm statement placed before the pointer is
	 * formed; presumably a barrier that keeps the compiler from
	 * changing how the load below is emitted - confirm.
	 */
	asm volatile ("" );
	half = (__u16 *)&sk->dst_port;
	return half[0] == bpf_htons(0xcafe);
}
274 | |
/*
 * Return true if the first two bytes at the dst_port offset of @sk,
 * read one byte at a time, are 0xca followed by 0xfe.
 */
static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
{
	__u8 *byte = (__u8 *)&sk->dst_port;
	return byte[0] == 0xca && byte[1] == 0xfe;
}
280 | |
281 | SEC("cgroup_skb/egress" ) |
282 | int read_sk_dst_port(struct __sk_buff *skb) |
283 | { |
284 | __u32 linum, linum_idx; |
285 | struct bpf_sock *sk; |
286 | |
287 | linum_idx = READ_SK_DST_PORT_LINUM_IDX; |
288 | |
289 | sk = skb->sk; |
290 | if (!sk) |
291 | RET_LOG(); |
292 | |
293 | /* Ignore everything but the SYN from the client socket */ |
294 | if (sk->state != BPF_TCP_SYN_SENT) |
295 | return CG_OK; |
296 | |
297 | if (!sk_dst_port__load_word(sk)) |
298 | RET_LOG(); |
299 | if (!sk_dst_port__load_half(sk)) |
300 | RET_LOG(); |
301 | if (!sk_dst_port__load_byte(sk)) |
302 | RET_LOG(); |
303 | |
304 | return CG_OK; |
305 | } |
306 | |
/*
 * License string emitted into the "license" section.
 * NOTE(review): presumably what the kernel checks before allowing
 * GPL-only helpers - confirm.
 */
char _license[] SEC("license" ) = "GPL" ;
308 | |