1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Syncookies implementation for the Linux kernel |
4 | * |
5 | * Copyright (C) 1997 Andi Kleen |
6 | * Based on ideas by D.J.Bernstein and Eric Schenk. |
7 | */ |
8 | |
9 | #include <linux/tcp.h> |
10 | #include <linux/siphash.h> |
11 | #include <linux/kernel.h> |
12 | #include <linux/export.h> |
13 | #include <net/secure_seq.h> |
14 | #include <net/tcp.h> |
15 | #include <net/route.h> |
16 | |
17 | static siphash_aligned_key_t syncookie_secret[2]; |
18 | |
19 | #define COOKIEBITS 24 /* Upper bits store count */ |
20 | #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) |
21 | |
/* TCP Timestamp: the TSBITS lowest bits of the timestamp sent in the
 * cookie SYN-ACK store the TCP options:
 *
 *    MSB                               LSB
 *    | 31 ...   6 |  5  |  4   | 3 2 1 0 |
 *    |  Timestamp | ECN | SACK | WScale  |
 *
 * When a valid cookie-ACK arrives, the echoed tsval (if any) tells us
 * which TCP options to use for the rebuilt connection.
 *
 * A WScale field of 0xf (an invalid scaling value) means that the
 * original SYN did not include the TCP window scaling option.
 */
#define TSBITS			6
#define TS_OPT_WSCALE_MASK	0xf
#define TS_OPT_SACK		BIT(4)
#define TS_OPT_ECN		BIT(5)
/* There is no TS_OPT_TIMESTAMP: if the ACK contains a timestamp option,
 * we already know it was requested/supported by the SYN/SYN-ACK exchange.
 */
44 | |
45 | static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, |
46 | u32 count, int c) |
47 | { |
48 | net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); |
49 | return siphash_4u32(a: (__force u32)saddr, b: (__force u32)daddr, |
50 | c: (__force u32)sport << 16 | (__force u32)dport, |
51 | d: count, key: &syncookie_secret[c]); |
52 | } |
53 | |
54 | /* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ |
55 | static u64 tcp_ns_to_ts(bool usec_ts, u64 val) |
56 | { |
57 | if (usec_ts) |
58 | return div_u64(dividend: val, NSEC_PER_USEC); |
59 | |
60 | return div_u64(dividend: val, NSEC_PER_MSEC); |
61 | } |
62 | |
63 | /* |
64 | * when syncookies are in effect and tcp timestamps are enabled we encode |
65 | * tcp options in the lower bits of the timestamp value that will be |
66 | * sent in the syn-ack. |
67 | * Since subsequent timestamps use the normal tcp_time_stamp value, we |
68 | * must make sure that the resulting initial timestamp is <= tcp_time_stamp. |
69 | */ |
70 | u64 cookie_init_timestamp(struct request_sock *req, u64 now) |
71 | { |
72 | const struct inet_request_sock *ireq = inet_rsk(sk: req); |
73 | u64 ts, ts_now = tcp_ns_to_ts(usec_ts: false, val: now); |
74 | u32 options = 0; |
75 | |
76 | options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; |
77 | if (ireq->sack_ok) |
78 | options |= TS_OPT_SACK; |
79 | if (ireq->ecn_ok) |
80 | options |= TS_OPT_ECN; |
81 | |
82 | ts = (ts_now >> TSBITS) << TSBITS; |
83 | ts |= options; |
84 | if (ts > ts_now) |
85 | ts -= (1UL << TSBITS); |
86 | |
87 | if (tcp_rsk(req)->req_usec_ts) |
88 | return ts * NSEC_PER_USEC; |
89 | return ts * NSEC_PER_MSEC; |
90 | } |
91 | |
92 | |
93 | static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, |
94 | __be16 dport, __u32 sseq, __u32 data) |
95 | { |
96 | /* |
97 | * Compute the secure sequence number. |
98 | * The output should be: |
99 | * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) |
100 | * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). |
101 | * Where sseq is their sequence number and count increases every |
102 | * minute by 1. |
103 | * As an extra hack, we add a small "data" value that encodes the |
104 | * MSS into the second hash value. |
105 | */ |
106 | u32 count = tcp_cookie_time(); |
107 | return (cookie_hash(saddr, daddr, sport, dport, count: 0, c: 0) + |
108 | sseq + (count << COOKIEBITS) + |
109 | ((cookie_hash(saddr, daddr, sport, dport, count, c: 1) + data) |
110 | & COOKIEMASK)); |
111 | } |
112 | |
113 | /* |
114 | * This retrieves the small "data" value from the syncookie. |
115 | * If the syncookie is bad, the data returned will be out of |
116 | * range. This must be checked by the caller. |
117 | * |
118 | * The count value used to generate the cookie must be less than |
119 | * MAX_SYNCOOKIE_AGE minutes in the past. |
120 | * The return value (__u32)-1 if this test fails. |
121 | */ |
122 | static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, |
123 | __be16 sport, __be16 dport, __u32 sseq) |
124 | { |
125 | u32 diff, count = tcp_cookie_time(); |
126 | |
127 | /* Strip away the layers from the cookie */ |
128 | cookie -= cookie_hash(saddr, daddr, sport, dport, count: 0, c: 0) + sseq; |
129 | |
130 | /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ |
131 | diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); |
132 | if (diff >= MAX_SYNCOOKIE_AGE) |
133 | return (__u32)-1; |
134 | |
135 | return (cookie - |
136 | cookie_hash(saddr, daddr, sport, dport, count: count - diff, c: 1)) |
137 | & COOKIEMASK; /* Leaving the data behind */ |
138 | } |
139 | |
140 | /* |
141 | * MSS Values are chosen based on the 2011 paper |
142 | * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. |
143 | * Values .. |
144 | * .. lower than 536 are rare (< 0.2%) |
145 | * .. between 537 and 1299 account for less than < 1.5% of observed values |
146 | * .. in the 1300-1349 range account for about 15 to 20% of observed mss values |
147 | * .. exceeding 1460 are very rare (< 0.04%) |
148 | * |
149 | * 1460 is the single most frequently announced mss value (30 to 46% depending |
150 | * on monitor location). Table must be sorted. |
151 | */ |
152 | static __u16 const msstab[] = { |
153 | 536, |
154 | 1300, |
155 | 1440, /* 1440, 1452: PPPoE */ |
156 | 1460, |
157 | }; |
158 | |
159 | /* |
160 | * Generate a syncookie. mssp points to the mss, which is returned |
161 | * rounded down to the value encoded in the cookie. |
162 | */ |
163 | u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, |
164 | u16 *mssp) |
165 | { |
166 | int mssind; |
167 | const __u16 mss = *mssp; |
168 | |
169 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) |
170 | if (mss >= msstab[mssind]) |
171 | break; |
172 | *mssp = msstab[mssind]; |
173 | |
174 | return secure_tcp_syn_cookie(saddr: iph->saddr, daddr: iph->daddr, |
175 | sport: th->source, dport: th->dest, ntohl(th->seq), |
176 | data: mssind); |
177 | } |
178 | EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); |
179 | |
180 | __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) |
181 | { |
182 | const struct iphdr *iph = ip_hdr(skb); |
183 | const struct tcphdr *th = tcp_hdr(skb); |
184 | |
185 | return __cookie_v4_init_sequence(iph, th, mssp); |
186 | } |
187 | |
188 | /* |
189 | * Check if a ack sequence number is a valid syncookie. |
190 | * Return the decoded mss if it is, or 0 if not. |
191 | */ |
192 | int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, |
193 | u32 cookie) |
194 | { |
195 | __u32 seq = ntohl(th->seq) - 1; |
196 | __u32 mssind = check_tcp_syn_cookie(cookie, saddr: iph->saddr, daddr: iph->daddr, |
197 | sport: th->source, dport: th->dest, sseq: seq); |
198 | |
199 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; |
200 | } |
201 | EXPORT_SYMBOL_GPL(__cookie_v4_check); |
202 | |
203 | struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
204 | struct request_sock *req, |
205 | struct dst_entry *dst, u32 tsoff) |
206 | { |
207 | struct inet_connection_sock *icsk = inet_csk(sk); |
208 | struct sock *child; |
209 | bool own_req; |
210 | |
211 | child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, |
212 | NULL, &own_req); |
213 | if (child) { |
214 | refcount_set(r: &req->rsk_refcnt, n: 1); |
215 | tcp_sk(child)->tsoffset = tsoff; |
216 | sock_rps_save_rxhash(sk: child, skb); |
217 | |
218 | if (rsk_drop_req(req)) { |
219 | reqsk_put(req); |
220 | return child; |
221 | } |
222 | |
223 | if (inet_csk_reqsk_queue_add(sk, req, child)) |
224 | return child; |
225 | |
226 | bh_unlock_sock(child); |
227 | sock_put(sk: child); |
228 | } |
229 | __reqsk_free(req); |
230 | |
231 | return NULL; |
232 | } |
233 | EXPORT_SYMBOL(tcp_get_cookie_sock); |
234 | |
235 | /* |
236 | * when syncookies are in effect and tcp timestamps are enabled we stored |
237 | * additional tcp options in the timestamp. |
238 | * This extracts these options from the timestamp echo. |
239 | * |
240 | * return false if we decode a tcp option that is disabled |
241 | * on the host. |
242 | */ |
243 | bool cookie_timestamp_decode(const struct net *net, |
244 | struct tcp_options_received *tcp_opt) |
245 | { |
246 | /* echoed timestamp, lowest bits contain options */ |
247 | u32 options = tcp_opt->rcv_tsecr; |
248 | |
249 | if (!tcp_opt->saw_tstamp) { |
250 | tcp_clear_options(rx_opt: tcp_opt); |
251 | return true; |
252 | } |
253 | |
254 | if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) |
255 | return false; |
256 | |
257 | tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; |
258 | |
259 | if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) |
260 | return false; |
261 | |
262 | if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) |
263 | return true; /* no window scaling */ |
264 | |
265 | tcp_opt->wscale_ok = 1; |
266 | tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; |
267 | |
268 | return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; |
269 | } |
270 | EXPORT_SYMBOL(cookie_timestamp_decode); |
271 | |
272 | bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, |
273 | const struct net *net, const struct dst_entry *dst) |
274 | { |
275 | bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN; |
276 | |
277 | if (!ecn_ok) |
278 | return false; |
279 | |
280 | if (READ_ONCE(net->ipv4.sysctl_tcp_ecn)) |
281 | return true; |
282 | |
283 | return dst_feature(dst, RTAX_FEATURE_ECN); |
284 | } |
285 | EXPORT_SYMBOL(cookie_ecn_ok); |
286 | |
287 | struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, |
288 | const struct tcp_request_sock_ops *af_ops, |
289 | struct sock *sk, |
290 | struct sk_buff *skb) |
291 | { |
292 | struct tcp_request_sock *treq; |
293 | struct request_sock *req; |
294 | |
295 | if (sk_is_mptcp(sk)) |
296 | req = mptcp_subflow_reqsk_alloc(ops, sk_listener: sk, attach_listener: false); |
297 | else |
298 | req = inet_reqsk_alloc(ops, sk_listener: sk, attach_listener: false); |
299 | |
300 | if (!req) |
301 | return NULL; |
302 | |
303 | treq = tcp_rsk(req); |
304 | |
305 | /* treq->af_specific might be used to perform TCP_MD5 lookup */ |
306 | treq->af_specific = af_ops; |
307 | |
308 | treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
309 | treq->req_usec_ts = -1; |
310 | |
311 | #if IS_ENABLED(CONFIG_MPTCP) |
312 | treq->is_mptcp = sk_is_mptcp(sk); |
313 | if (treq->is_mptcp) { |
314 | int err = mptcp_subflow_init_cookie_req(req, sk_listener: sk, skb); |
315 | |
316 | if (err) { |
317 | reqsk_free(req); |
318 | return NULL; |
319 | } |
320 | } |
321 | #endif |
322 | |
323 | return req; |
324 | } |
325 | EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); |
326 | |
327 | /* On input, sk is a listener. |
328 | * Output is listener if incoming packet would not create a child |
329 | * NULL if memory could not be allocated. |
330 | */ |
331 | struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) |
332 | { |
333 | struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; |
334 | struct tcp_options_received tcp_opt; |
335 | struct inet_request_sock *ireq; |
336 | struct tcp_request_sock *treq; |
337 | struct tcp_sock *tp = tcp_sk(sk); |
338 | const struct tcphdr *th = tcp_hdr(skb); |
339 | __u32 cookie = ntohl(th->ack_seq) - 1; |
340 | struct sock *ret = sk; |
341 | struct request_sock *req; |
342 | int full_space, mss; |
343 | struct rtable *rt; |
344 | __u8 rcv_wscale; |
345 | struct flowi4 fl4; |
346 | u32 tsoff = 0; |
347 | int l3index; |
348 | |
349 | if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) || |
350 | !th->ack || th->rst) |
351 | goto out; |
352 | |
353 | if (tcp_synq_no_recent_overflow(sk)) |
354 | goto out; |
355 | |
356 | mss = __cookie_v4_check(ip_hdr(skb), th, cookie); |
357 | if (mss == 0) { |
358 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); |
359 | goto out; |
360 | } |
361 | |
362 | __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); |
363 | |
364 | /* check for timestamp cookie support */ |
365 | memset(&tcp_opt, 0, sizeof(tcp_opt)); |
366 | tcp_parse_options(net: sock_net(sk), skb, opt_rx: &tcp_opt, estab: 0, NULL); |
367 | |
368 | if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { |
369 | tsoff = secure_tcp_ts_off(net: sock_net(sk), |
370 | saddr: ip_hdr(skb)->daddr, |
371 | daddr: ip_hdr(skb)->saddr); |
372 | tcp_opt.rcv_tsecr -= tsoff; |
373 | } |
374 | |
375 | if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) |
376 | goto out; |
377 | |
378 | ret = NULL; |
379 | req = cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, |
380 | &tcp_request_sock_ipv4_ops, sk, skb); |
381 | if (!req) |
382 | goto out; |
383 | |
384 | ireq = inet_rsk(sk: req); |
385 | treq = tcp_rsk(req); |
386 | treq->rcv_isn = ntohl(th->seq) - 1; |
387 | treq->snt_isn = cookie; |
388 | treq->ts_off = 0; |
389 | treq->txhash = net_tx_rndhash(); |
390 | req->mss = mss; |
391 | ireq->ir_num = ntohs(th->dest); |
392 | ireq->ir_rmt_port = th->source; |
393 | sk_rcv_saddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->daddr); |
394 | sk_daddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->saddr); |
395 | ireq->ir_mark = inet_request_mark(sk, skb); |
396 | ireq->snd_wscale = tcp_opt.snd_wscale; |
397 | ireq->sack_ok = tcp_opt.sack_ok; |
398 | ireq->wscale_ok = tcp_opt.wscale_ok; |
399 | ireq->tstamp_ok = tcp_opt.saw_tstamp; |
400 | req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; |
401 | treq->snt_synack = 0; |
402 | treq->tfo_listener = false; |
403 | |
404 | if (IS_ENABLED(CONFIG_SMC)) |
405 | ireq->smc_ok = 0; |
406 | |
407 | ireq->ir_iif = inet_request_bound_dev_if(sk, skb); |
408 | |
409 | l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk), ifindex: ireq->ir_iif); |
410 | tcp_ao_syncookie(sk, skb, treq, AF_INET, l3index); |
411 | |
412 | /* We throwed the options of the initial SYN away, so we hope |
413 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
414 | */ |
415 | RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb)); |
416 | |
417 | if (security_inet_conn_request(sk, skb, req)) { |
418 | reqsk_free(req); |
419 | goto out; |
420 | } |
421 | |
422 | req->num_retrans = 0; |
423 | |
424 | /* |
425 | * We need to lookup the route here to get at the correct |
426 | * window size. We should better make sure that the window size |
427 | * hasn't changed since we received the original syn, but I see |
428 | * no easy way to do this. |
429 | */ |
430 | flowi4_init_output(fl4: &fl4, oif: ireq->ir_iif, mark: ireq->ir_mark, |
431 | tos: ip_sock_rt_tos(sk), scope: ip_sock_rt_scope(sk), |
432 | IPPROTO_TCP, flags: inet_sk_flowi_flags(sk), |
433 | daddr: opt->srr ? opt->faddr : ireq->ir_rmt_addr, |
434 | saddr: ireq->ir_loc_addr, dport: th->source, sport: th->dest, uid: sk->sk_uid); |
435 | security_req_classify_flow(req, flic: flowi4_to_flowi_common(fl4: &fl4)); |
436 | rt = ip_route_output_key(net: sock_net(sk), flp: &fl4); |
437 | if (IS_ERR(ptr: rt)) { |
438 | reqsk_free(req); |
439 | goto out; |
440 | } |
441 | |
442 | /* Try to redo what tcp_v4_send_synack did. */ |
443 | req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst: &rt->dst, RTAX_WINDOW); |
444 | /* limit the window selection if the user enforce a smaller rx buffer */ |
445 | full_space = tcp_full_space(sk); |
446 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && |
447 | (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) |
448 | req->rsk_window_clamp = full_space; |
449 | |
450 | tcp_select_initial_window(sk, space: full_space, mss: req->mss, |
451 | rcv_wnd: &req->rsk_rcv_wnd, window_clamp: &req->rsk_window_clamp, |
452 | wscale_ok: ireq->wscale_ok, rcv_wscale: &rcv_wscale, |
453 | init_rcv_wnd: dst_metric(dst: &rt->dst, RTAX_INITRWND)); |
454 | |
455 | ireq->rcv_wscale = rcv_wscale; |
456 | ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); |
457 | |
458 | ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff); |
459 | /* ip_queue_xmit() depends on our flow being setup |
460 | * Normal sockets get it right from inet_csk_route_child_sock() |
461 | */ |
462 | if (ret) |
463 | inet_sk(ret)->cork.fl.u.ip4 = fl4; |
464 | out: return ret; |
465 | } |
466 | |