tcp_input.c source code [linux/net/ipv4/tcp_input.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*/
21
22	/*
23	* Changes:
24	* Pedro Roque : Fast Retransmit/Recovery.
25	* Two receive queues.
26	* Retransmit queue handled by TCP.
27	* Better retransmit timer handling.
28	* New congestion avoidance.
29	* Header prediction.
30	* Variable renaming.
31	*
32	* Eric : Fast Retransmit.
33	* Randy Scott : MSS option defines.
34	* Eric Schenk : Fixes to slow start algorithm.
35	* Eric Schenk : Yet another double ACK bug.
36	* Eric Schenk : Delayed ACK bug fixes.
37	* Eric Schenk : Floyd style fast retrans war avoidance.
38	* David S. Miller : Don't allow zero congestion window.
39	* Eric Schenk : Fix retransmitter so that it sends
40	* next packet on ack of previous packet.
41	* Andi Kleen : Moved open_request checking here
42	* and process RSTs for open_requests.
43	* Andi Kleen : Better prune_queue, and other fixes.
44	* Andrey Savochkin: Fix RTT measurements in the presence of
45	* timestamps.
46	* Andrey Savochkin: Check sequence numbers correctly when
47	* removing SACKs due to in sequence incoming
48	* data segments.
49	* Andi Kleen: Make sure we never ack data there is not
50	* enough room for. Also make this condition
51	* a fatal error if it might still happen.
52	* Andi Kleen: Add tcp_measure_rcv_mss to make
53	* connections with MSS<min(MTU,ann. MSS)
54	* work without delayed acks.
55	* Andi Kleen: Process packets with PSH set in the
56	* fast path.
57	* J Hadi Salim: ECN support
58	* Andrei Gurtov,
59	* Pasi Sarolahti,
60	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
61	* engine. Lots of bugs are found.
62	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
63	*/
64
65	#define pr_fmt(fmt) "TCP: " fmt
66
67	#include <linux/mm.h>
68	#include <linux/slab.h>
69	#include <linux/module.h>
70	#include <linux/sysctl.h>
71	#include <linux/kernel.h>
72	#include <linux/prefetch.h>
73	#include <net/dst.h>
74	#include <net/tcp.h>
75	#include <net/inet_common.h>
76	#include <linux/ipsec.h>
77	#include <asm/unaligned.h>
78	#include <linux/errqueue.h>
79	#include <trace/events/tcp.h>
80	#include <linux/jump_label_ratelimit.h>
81	#include <net/busy_poll.h>
82	#include <net/mptcp.h>
83
84	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
85
/* Per-ACK processing flags (bit mask). */
#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */

/* Composite masks built from the single-bit flags above. */
#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

/* TCP header flag-word masks. */
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
116
117	#if IS_ENABLED(CONFIG_TLS_DEVICE)
118	static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
119
120	void clean_acked_data_enable(struct inet_connection_sock *icsk,
121	void (cad)(struct* sock *sk, u32 ack_seq))
122	{
123	icsk->icsk_clean_acked = cad;
124	static_branch_deferred_inc(&clean_acked_data_enabled);
125	}
126	EXPORT_SYMBOL_GPL(clean_acked_data_enable);
127
128	void clean_acked_data_disable(struct inet_connection_sock *icsk)
129	{
130	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
131	icsk->icsk_clean_acked = NULL;
132	}
133	EXPORT_SYMBOL_GPL(clean_acked_data_disable);
134
135	void clean_acked_data_flush(void)
136	{
137	static_key_deferred_flush(&clean_acked_data_enabled);
138	}
139	EXPORT_SYMBOL_GPL(clean_acked_data_flush);
140	#endif
141
#ifdef CONFIG_CGROUP_BPF
/* Run the BPF_SOCK_OPS_PARSE_HDR_OPT_CB sock_ops program for @skb.
 *
 * Nothing is done unless the socket asked for all header options to be
 * parsed, or asked for unknown options and rx_opt.saw_unknown is set.
 * Sockets in SYN_RECV, SYN_SENT or LISTEN state are skipped as well.
 */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
	bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
		BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
				       BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
	bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
						    BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
	struct bpf_sock_ops_kern sock_ops;

	if (likely(!unknown_opt && !parse_all_opt))
		return;

	/* The skb will be handled in the
	 * bpf_skops_established() or
	 * bpf_skops_write_hdr_opt().
	 */
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_SYN_SENT:
	case TCP_LISTEN:
		return;
	}

	sock_owned_by_me(sk);

	/* Only the part of sock_ops that precedes ->temp is zeroed. */
	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}

/* Run the sock_ops program for operation @bpf_op on @sk.
 * @skb may be NULL, in which case no skb is attached to the sock_ops context.
 */
static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
	struct bpf_sock_ops_kern sock_ops;

	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = bpf_op;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
/* No-op stubs used when CONFIG_CGROUP_BPF is not set. */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}

static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
}
#endif
204
205	static void tcp_gro_dev_warn(struct sock sk, const* struct sk_buff *skb,
206	unsigned int len)
207	{
208	static bool __once __read_mostly;
209
210	if (!__once) {
211	struct net_device *dev;
212
213	__once = true;
214
215	rcu_read_lock();
216	dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex: skb->skb_iif);
217	if (!dev \|\| len >= dev->mtu)
218	pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
219	dev ? dev->name : "Unknown driver");
220	rcu_read_unlock();
221	}
222	}
223
224	/ Adapt the MSS value used to make delayed ack decision to the*
225	* real world.
226	*/
227	static void tcp_measure_rcv_mss(struct sock sk, const* struct sk_buff *skb)
228	{
229	struct inet_connection_sock *icsk = inet_csk(sk);
230	const unsigned int lss = icsk->icsk_ack.last_seg_size;
231	unsigned int len;
232
233	icsk->icsk_ack.last_seg_size = `0`;
234
235	/ skb->len may jitter because of SACKs, even if peer*
236	* sends good full-sized frames.
237	*/
238	len = skb_shinfo(skb)->gso_size ? : skb->len;
239	if (len >= icsk->icsk_ack.rcv_mss) {
240	/ Note: divides are still a bit expensive.*
241	* For the moment, only adjust scaling_ratio
242	* when we update icsk_ack.rcv_mss.
243	*/
244	if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
245	u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
246
247	do_div(val, skb->truesize);
248	tcp_sk(sk)->scaling_ratio = val ? val : `1`;
249	}
250	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
251	tcp_sk(sk)->advmss);
252	/ Account for possibly-removed options /
253	if (unlikely(len > icsk->icsk_ack.rcv_mss +
254	MAX_TCP_OPTION_SPACE))
255	tcp_gro_dev_warn(sk, skb, len);
256	/ If the skb has a len of exactly 1MSS and has the PSH bit
257	* set then it is likely the end of an application write. So
258	* more data may not be arriving soon, and yet the data sender
259	* may be waiting for an ACK if cwnd-bound or using TX zero
260	* copy. So we set ICSK_ACK_PUSHED here so that
261	* tcp_cleanup_rbuf() will send an ACK immediately if the app
262	* reads all of the data and is not ping-pong. If len > MSS
263	* then this logic does not matter (and does not hurt) because
264	* tcp_cleanup_rbuf() will always ACK immediately if the app
265	* reads data and there is more than an MSS of unACKed data.
266	*/
267	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
268	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
269	} else {
270	/ Otherwise, we make more careful check taking into account,*
271	* that SACKs block is variable.
272	*
273	* "len" is invariant segment length, including TCP header.
274	*/
275	len += skb->data - skb_transport_header(skb);
276	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
277	/ If PSH is not set, packet should be*
278	* full sized, provided peer TCP is not badly broken.
279	* This observation (if it is correct 8)) allows
280	* to handle super-low mtu links fairly.
281	*/
282	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
283	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
284	/ Subtract also invariant (if peer is RFC compliant),*
285	* tcp header plus fixed timestamp option length.
286	* Resulting "len" is MSS free of SACK jitter.
287	*/
288	len -= tcp_sk(sk)->tcp_header_len;
289	icsk->icsk_ack.last_seg_size = len;
290	if (len == lss) {
291	icsk->icsk_ack.rcv_mss = len;
292	return;
293	}
294	}
295	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
296	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
297	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
298	}
299	}
300
301	static void tcp_incr_quickack(struct sock sk, unsigned* int max_quickacks)
302	{
303	struct inet_connection_sock *icsk = inet_csk(sk);
304	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (`2` * icsk->icsk_ack.rcv_mss);
305
306	if (quickacks == `0`)
307	quickacks = `2`;
308	quickacks = min(quickacks, max_quickacks);
309	if (quickacks > icsk->icsk_ack.quick)
310	icsk->icsk_ack.quick = quickacks;
311	}
312
313	static void tcp_enter_quickack_mode(struct sock sk, unsigned* int max_quickacks)
314	{
315	struct inet_connection_sock *icsk = inet_csk(sk);
316
317	tcp_incr_quickack(sk, max_quickacks);
318	inet_csk_exit_pingpong_mode(sk);
319	icsk->icsk_ack.ato = TCP_ATO_MIN;
320	}
321
322	/ Send ACKs quickly, if "quick" count is not exhausted*
323	* and the session is not interactive.
324	*/
325
326	static bool tcp_in_quickack_mode(struct sock *sk)
327	{
328	const struct inet_connection_sock *icsk = inet_csk(sk);
329	const struct dst_entry *dst = __sk_dst_get(sk);
330
331	return (dst && dst_metric(dst, RTAX_QUICKACK)) \|\|
332	(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
333	}
334
335	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
336	{
337	if (tp->ecn_flags & TCP_ECN_OK)
338	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
339	}
340
341	static void tcp_ecn_accept_cwr(struct sock sk, const* struct sk_buff *skb)
342	{
343	if (tcp_hdr(skb)->cwr) {
344	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
345
346	/ If the sender is telling us it has entered CWR, then its*
347	* cwnd may be very low (even just 1 packet), so we should ACK
348	* immediately.
349	*/
350	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
351	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
352	}
353	}
354
355	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
356	{
357	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
358	}
359
360	static void __tcp_ecn_check_ce(struct sock sk, const* struct sk_buff *skb)
361	{
362	struct tcp_sock *tp = tcp_sk(sk);
363
364	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
365	case INET_ECN_NOT_ECT:
366	/ Funny extension: if ECT is not set on a segment,*
367	* and we already seen ECT on a previous segment,
368	* it is probably a retransmit.
369	*/
370	if (tp->ecn_flags & TCP_ECN_SEEN)
371	tcp_enter_quickack_mode(sk, max_quickacks: `2`);
372	break;
373	case INET_ECN_CE:
374	if (tcp_ca_needs_ecn(sk))
375	tcp_ca_event(sk, event: CA_EVENT_ECN_IS_CE);
376
377	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
378	/ Better not delay acks, sender can have a very low cwnd /
379	tcp_enter_quickack_mode(sk, max_quickacks: `2`);
380	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
381	}
382	tp->ecn_flags \|= TCP_ECN_SEEN;
383	break;
384	default:
385	if (tcp_ca_needs_ecn(sk))
386	tcp_ca_event(sk, event: CA_EVENT_ECN_NO_CE);
387	tp->ecn_flags \|= TCP_ECN_SEEN;
388	break;
389	}
390	}
391
392	static void tcp_ecn_check_ce(struct sock sk, const* struct sk_buff *skb)
393	{
394	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
395	__tcp_ecn_check_ce(sk, skb);
396	}
397
398	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const* struct tcphdr *th)
399	{
400	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| th->cwr))
401	tp->ecn_flags &= ~TCP_ECN_OK;
402	}
403
404	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const* struct tcphdr *th)
405	{
406	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece \|\| !th->cwr))
407	tp->ecn_flags &= ~TCP_ECN_OK;
408	}
409
410	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const* struct tcphdr *th)
411	{
412	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
413	return true;
414	return false;
415	}
416
417	/ Buffer size and advertised window tuning.*
418	*
419	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
420	*/
421
422	static void tcp_sndbuf_expand(struct sock *sk)
423	{
424	const struct tcp_sock *tp = tcp_sk(sk);
425	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
426	int sndmem, per_mss;
427	u32 nr_segs;
428
429	/ Worst case is non GSO/TSO : each frame consumes one skb*
430	* and skb->head is kmalloced using power of two area of memory
431	*/
432	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
433	MAX_TCP_HEADER +
434	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
435
436	per_mss = roundup_pow_of_two(per_mss) +
437	SKB_DATA_ALIGN(sizeof(struct sk_buff));
438
439	nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
440	nr_segs = max_t(u32, nr_segs, tp->reordering + `1`);
441
442	/ Fast Recovery (RFC 5681 3.2) :*
443	* Cubic needs 1.7 factor, rounded to 2 to include
444	* extra cushion (application might react slowly to EPOLLOUT)
445	*/
446	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : `2`;
447	sndmem = nr_segs per_mss;
448
449	if (sk->sk_sndbuf < sndmem)
450	WRITE_ONCE(sk->sk_sndbuf,
451	min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[`2`])));
452	}
453
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */
478
479	/ Slow part of check#2. /
480	static int __tcp_grow_window(const struct sock sk, const* struct sk_buff *skb,
481	unsigned int skbtruesize)
482	{
483	const struct tcp_sock *tp = tcp_sk(sk);
484	/ Optimize this! /
485	int truesize = tcp_win_from_space(sk, space: skbtruesize) >> `1`;
486	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`])) >> `1`;
487
488	while (tp->rcv_ssthresh <= window) {
489	if (truesize <= skb->len)
490	return `2` * inet_csk(sk)->icsk_ack.rcv_mss;
491
492	truesize >>= `1`;
493	window >>= `1`;
494	}
495	return `0`;
496	}
497
498	/ Even if skb appears to have a bad len/truesize ratio, TCP coalescing*
499	* can play nice with us, as sk_buff and skb->head might be either
500	* freed or shared with up to MAX_SKB_FRAGS segments.
501	* Only give a boost to drivers using page frag(s) to hold the frame(s),
502	* and if no payload was pulled in skb->head before reaching us.
503	*/
504	static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
505	{
506	u32 truesize = skb->truesize;
507
508	if (adjust && !skb_headlen(skb)) {
509	truesize -= SKB_TRUESIZE(skb_end_offset(skb));
510	/ paranoid check, some drivers might be buggy /
511	if (unlikely((int)truesize < (int)skb->len))
512	truesize = skb->truesize;
513	}
514	return truesize;
515	}
516
517	static void tcp_grow_window(struct sock sk, const* struct sk_buff *skb,
518	bool adjust)
519	{
520	struct tcp_sock *tp = tcp_sk(sk);
521	int room;
522
523	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
524
525	if (room <= `0`)
526	return;
527
528	/ Check #1 /
529	if (!tcp_under_memory_pressure(sk)) {
530	unsigned int truesize = truesize_adjust(adjust, skb);
531	int incr;
532
533	/ Check #2. Increase window, if skb with such overhead*
534	* will fit to rcvbuf in future.
535	*/
536	if (tcp_win_from_space(sk, space: truesize) <= skb->len)
537	incr = `2` * tp->advmss;
538	else
539	incr = __tcp_grow_window(sk, skb, skbtruesize: truesize);
540
541	if (incr) {
542	incr = max_t(int, incr, `2` * skb->len);
543	tp->rcv_ssthresh += min(room, incr);
544	inet_csk(sk)->icsk_ack.quick \|= `1`;
545	}
546	} else {
547	/ Under pressure:*
548	* Adjust rcv_ssthresh according to reserved mem
549	*/
550	tcp_adjust_rcv_ssthresh(sk);
551	}
552	}
553
554	/ 3. Try to fixup all. It is made immediately after connection enters*
555	* established state.
556	*/
557	static void tcp_init_buffer_space(struct sock *sk)
558	{
559	int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
560	struct tcp_sock *tp = tcp_sk(sk);
561	int maxwin;
562
563	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
564	tcp_sndbuf_expand(sk);
565
566	tcp_mstamp_refresh(tp);
567	tp->rcvq_space.time = tp->tcp_mstamp;
568	tp->rcvq_space.seq = tp->copied_seq;
569
570	maxwin = tcp_full_space(sk);
571
572	if (tp->window_clamp >= maxwin) {
573	tp->window_clamp = maxwin;
574
575	if (tcp_app_win && maxwin > `4` * tp->advmss)
576	tp->window_clamp = max(maxwin -
577	(maxwin >> tcp_app_win),
578	`4` * tp->advmss);
579	}
580
581	/ Force reservation of one segment. /
582	if (tcp_app_win &&
583	tp->window_clamp > `2` * tp->advmss &&
584	tp->window_clamp + tp->advmss > maxwin)
585	tp->window_clamp = max(`2` * tp->advmss, maxwin - tp->advmss);
586
587	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
588	tp->snd_cwnd_stamp = tcp_jiffies32;
589	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
590	(u32)TCP_INIT_CWND * tp->advmss);
591	}
592
593	/ 4. Recalculate window clamp after socket hit its memory bounds. /
594	static void tcp_clamp_window(struct sock *sk)
595	{
596	struct tcp_sock *tp = tcp_sk(sk);
597	struct inet_connection_sock *icsk = inet_csk(sk);
598	struct net *net = sock_net(sk);
599	int rmem2;
600
601	icsk->icsk_ack.quick = `0`;
602	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[`2`]);
603
604	if (sk->sk_rcvbuf < rmem2 &&
605	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
606	!tcp_under_memory_pressure(sk) &&
607	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, index: `0`)) {
608	WRITE_ONCE(sk->sk_rcvbuf,
609	min(atomic_read(&sk->sk_rmem_alloc), rmem2));
610	}
611	if (atomic_read(v: &sk->sk_rmem_alloc) > sk->sk_rcvbuf)
612	tp->rcv_ssthresh = min(tp->window_clamp, `2U` * tp->advmss);
613	}
614
615	/ Initialize RCV_MSS value.*
616	* RCV_MSS is an our guess about MSS used by the peer.
617	* We haven't any direct information about the MSS.
618	* It's better to underestimate the RCV_MSS rather than overestimate.
619	* Overestimations make us ACKing less frequently than needed.
620	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
621	*/
622	void tcp_initialize_rcv_mss(struct sock *sk)
623	{
624	const struct tcp_sock *tp = tcp_sk(sk);
625	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
626
627	hint = min(hint, tp->rcv_wnd / `2`);
628	hint = min(hint, TCP_MSS_DEFAULT);
629	hint = max(hint, TCP_MIN_MSS);
630
631	inet_csk(sk)->icsk_ack.rcv_mss = hint;
632	}
633	EXPORT_SYMBOL(tcp_initialize_rcv_mss);
634
635	/ Receiver "autotuning" code.*
636	*
637	* The algorithm for RTT estimation w/o timestamps is based on
638	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
639	* <https://public.lanl.gov/radiant/pubs.html#DRS>
640	*
641	* More detail on this code can be found at
642	* <http://staff.psc.edu/jheffner/>,
643	* though this reference is out of date. A new paper
644	* is pending.
645	*/
646	static void tcp_rcv_rtt_update(struct tcp_sock tp, u32 sample, int* win_dep)
647	{
648	u32 new_sample = tp->rcv_rtt_est.rtt_us;
649	long m = sample;
650
651	if (new_sample != `0`) {
652	/ If we sample in larger samples in the non-timestamp*
653	* case, we could grossly overestimate the RTT especially
654	* with chatty applications or bulk transfer apps which
655	* are stalled on filesystem I/O.
656	*
657	* Also, since we are only going for a minimum in the
658	* non-timestamp case, we do not smooth things out
659	* else with timestamps disabled convergence takes too
660	* long.
661	*/
662	if (!win_dep) {
663	m -= (new_sample >> `3`);
664	new_sample += m;
665	} else {
666	m <<= `3`;
667	if (m < new_sample)
668	new_sample = m;
669	}
670	} else {
671	/ No previous measure. /
672	new_sample = m << `3`;
673	}
674
675	tp->rcv_rtt_est.rtt_us = new_sample;
676	}
677
678	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
679	{
680	u32 delta_us;
681
682	if (tp->rcv_rtt_est.time == `0`)
683	goto new_measure;
684	if (before(seq1: tp->rcv_nxt, seq2: tp->rcv_rtt_est.seq))
685	return;
686	delta_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: tp->rcv_rtt_est.time);
687	if (!delta_us)
688	delta_us = `1`;
689	tcp_rcv_rtt_update(tp, sample: delta_us, win_dep: `1`);
690
691	new_measure:
692	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
693	tp->rcv_rtt_est.time = tp->tcp_mstamp;
694	}
695
696	static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
697	{
698	u32 delta, delta_us;
699
700	delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
701	if (tp->tcp_usec_ts)
702	return delta;
703
704	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
705	if (!delta)
706	delta = `1`;
707	delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
708	return delta_us;
709	}
710	return -`1`;
711	}
712
713	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
714	const struct sk_buff *skb)
715	{
716	struct tcp_sock *tp = tcp_sk(sk);
717
718	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
719	return;
720	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
721
722	if (TCP_SKB_CB(skb)->end_seq -
723	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
724	s32 delta = tcp_rtt_tsopt_us(tp);
725
726	if (delta >= `0`)
727	tcp_rcv_rtt_update(tp, sample: delta, win_dep: `0`);
728	}
729	}
730
731	/*
732	* This function should be called every time data is copied to user space.
733	* It calculates the appropriate TCP receive buffer space.
734	*/
735	void tcp_rcv_space_adjust(struct sock *sk)
736	{
737	struct tcp_sock *tp = tcp_sk(sk);
738	u32 copied;
739	int time;
740
741	trace_tcp_rcv_space_adjust(sk);
742
743	tcp_mstamp_refresh(tp);
744	time = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: tp->rcvq_space.time);
745	if (time < (tp->rcv_rtt_est.rtt_us >> `3`) \|\| tp->rcv_rtt_est.rtt_us == `0`)
746	return;
747
748	/ Number of bytes copied to user in last RTT /
749	copied = tp->copied_seq - tp->rcvq_space.seq;
750	if (copied <= tp->rcvq_space.space)
751	goto new_measure;
752
753	/ A bit of theory :*
754	* copied = bytes received in previous RTT, our base window
755	* To cope with packet losses, we need a 2x factor
756	* To cope with slow start, and sender growing its cwin by 100 %
757	* every RTT, we need a 4x factor, because the ACK we are sending
758	* now is for the next RTT, not the current one :
759	* <prev RTT . ><current RTT .. ><next RTT .... >
760	*/
761
762	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
763	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
764	u64 rcvwin, grow;
765	int rcvbuf;
766
767	/ minimal window to cope with packet losses, assuming*
768	* steady state. Add some cushion because of small variations.
769	*/
770	rcvwin = ((u64)copied << `1`) + `16` * tp->advmss;
771
772	/ Accommodate for sender rate increase (eg. slow start) /
773	grow = rcvwin * (copied - tp->rcvq_space.space);
774	do_div(grow, tp->rcvq_space.space);
775	rcvwin += (grow << `1`);
776
777	rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
778	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`]));
779	if (rcvbuf > sk->sk_rcvbuf) {
780	WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
781
782	/ Make the window clamp follow along. /
783	tp->window_clamp = tcp_win_from_space(sk, space: rcvbuf);
784	}
785	}
786	tp->rcvq_space.space = copied;
787
788	new_measure:
789	tp->rcvq_space.seq = tp->copied_seq;
790	tp->rcvq_space.time = tp->tcp_mstamp;
791	}
792
793	static void tcp_save_lrcv_flowlabel(struct sock sk, const* struct sk_buff *skb)
794	{
795	#if IS_ENABLED(CONFIG_IPV6)
796	struct inet_connection_sock *icsk = inet_csk(sk);
797
798	if (skb->protocol == htons(ETH_P_IPV6))
799	icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
800	#endif
801	}
802
803	/ There is something which you must keep in mind when you analyze the*
804	* behavior of the tp->ato delayed ack timeout interval. When a
805	* connection starts up, we want to ack as quickly as possible. The
806	* problem is that "good" TCP's do slow start at the beginning of data
807	* transmission. The means that until we send the first few ACK's the
808	* sender will sit on his end and only queue most of his data, because
809	* he can only send snd_cwnd unacked packets at any given time. For
810	* each ACK we send, he increments snd_cwnd and transmits more of his
811	* queue. -DaveM
812	*/
813	static void tcp_event_data_recv(struct sock sk, struct* sk_buff *skb)
814	{
815	struct tcp_sock *tp = tcp_sk(sk);
816	struct inet_connection_sock *icsk = inet_csk(sk);
817	u32 now;
818
819	inet_csk_schedule_ack(sk);
820
821	tcp_measure_rcv_mss(sk, skb);
822
823	tcp_rcv_rtt_measure(tp);
824
825	now = tcp_jiffies32;
826
827	if (!icsk->icsk_ack.ato) {
828	/ The _first_ data packet received, initialize*
829	* delayed ACK engine.
830	*/
831	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
832	icsk->icsk_ack.ato = TCP_ATO_MIN;
833	} else {
834	int m = now - icsk->icsk_ack.lrcvtime;
835
836	if (m <= TCP_ATO_MIN / `2`) {
837	/ The fastest case is the first. /
838	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> `1`) + TCP_ATO_MIN / `2`;
839	} else if (m < icsk->icsk_ack.ato) {
840	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> `1`) + m;
841	if (icsk->icsk_ack.ato > icsk->icsk_rto)
842	icsk->icsk_ack.ato = icsk->icsk_rto;
843	} else if (m > icsk->icsk_rto) {
844	/ Too long gap. Apparently sender failed to*
845	* restart window, so that we send ACKs quickly.
846	*/
847	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
848	}
849	}
850	icsk->icsk_ack.lrcvtime = now;
851	tcp_save_lrcv_flowlabel(sk, skb);
852
853	tcp_ecn_check_ce(sk, skb);
854
855	if (skb->len >= `128`)
856	tcp_grow_window(sk, skb, adjust: true);
857	}
858
859	/ Called to compute a smoothed rtt estimate. The data fed to this*
860	* routine either comes from timestamps, or from segments that were
861	* known _not_ to have been retransmitted [see Karn/Partridge
862	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
863	* piece by Van Jacobson.
864	* NOTE: the next three routines used to be one big routine.
865	* To save cycles in the RFC 1323 implementation it was better to break
866	* it up into three procedures. -- erics
867	*/
868	static void tcp_rtt_estimator(struct sock sk, long* mrtt_us)
869	{
870	struct tcp_sock *tp = tcp_sk(sk);
871	long m = mrtt_us; / RTT /
872	u32 srtt = tp->srtt_us;
873
874	/ The following amusing code comes from Jacobson's*
875	* article in SIGCOMM '88. Note that rtt and mdev
876	* are scaled versions of rtt and mean deviation.
877	* This is designed to be as fast as possible
878	* m stands for "measurement".
879	*
880	* On a 1990 paper the rto value is changed to:
881	* RTO = rtt + 4 * mdev
882	*
883	* Funny. This algorithm seems to be very broken.
884	* These formulae increase RTO, when it should be decreased, increase
885	* too slowly, when it should be increased quickly, decrease too quickly
886	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
887	* does not matter how to _calculate_ it. Seems, it was trap
888	* that VJ failed to avoid. 8)
889	*/
890	if (srtt != `0`) {
891	m -= (srtt >> `3`); / m is now error in rtt est /
892	srtt += m; / rtt = 7/8 rtt + 1/8 new /
893	if (m < `0`) {
894	m = -m; / m is now abs(error) /
895	m -= (tp->mdev_us >> `2`); / similar update on mdev /
896	/ This is similar to one of Eifel findings.*
897	* Eifel blocks mdev updates when rtt decreases.
898	* This solution is a bit different: we use finer gain
899	* for mdev in this case (alpha*beta).
900	* Like Eifel it also prevents growth of rto,
901	* but also it limits too fast rto decreases,
902	* happening in pure Eifel.
903	*/
904	if (m > `0`)
905	m >>= `3`;
906	} else {
907	m -= (tp->mdev_us >> `2`); / similar update on mdev /
908	}
909	tp->mdev_us += m; / mdev = 3/4 mdev + 1/4 new /
910	if (tp->mdev_us > tp->mdev_max_us) {
911	tp->mdev_max_us = tp->mdev_us;
912	if (tp->mdev_max_us > tp->rttvar_us)
913	tp->rttvar_us = tp->mdev_max_us;
914	}
915	if (after(tp->snd_una, tp->rtt_seq)) {
916	if (tp->mdev_max_us < tp->rttvar_us)
917	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> `2`;
918	tp->rtt_seq = tp->snd_nxt;
919	tp->mdev_max_us = tcp_rto_min_us(sk);
920
921	tcp_bpf_rtt(sk);
922	}
923	} else {
924	/ no previous measure. /
925	srtt = m << `3`; / take the measured time to be rtt /
926	tp->mdev_us = m << `1`; / make sure rto = 3rtt /*
927	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
928	tp->mdev_max_us = tp->rttvar_us;
929	tp->rtt_seq = tp->snd_nxt;
930
931	tcp_bpf_rtt(sk);
932	}
933	tp->srtt_us = max(`1U`, srtt);
934	}
935
936	static void tcp_update_pacing_rate(struct sock *sk)
937	{
938	const struct tcp_sock *tp = tcp_sk(sk);
939	u64 rate;
940
941	/ set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) /
942	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / `100`) << `3`);
943
944	/ current rate is (cwnd * mss) / srtt*
945	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
946	* In Congestion Avoidance phase, set it to 120 % the current rate.
947	*
948	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
949	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
950	* end of slow start and should slow down.
951	*/
952	if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / `2`)
953	rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
954	else
955	rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
956
957	rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
958
959	if (likely(tp->srtt_us))
960	do_div(rate, tp->srtt_us);
961
962	/ WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate*
963	* without any lock. We want to make sure compiler wont store
964	* intermediate values in this location.
965	*/
966	WRITE_ONCE(sk->sk_pacing_rate,
967	min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
968	}
969
970	/ Calculate rto without backoff. This is the second half of Van Jacobson's*
971	* routine referred to above.
972	*/
973	static void tcp_set_rto(struct sock *sk)
974	{
975	const struct tcp_sock *tp = tcp_sk(sk);
976	/ Old crap is replaced with new one. 8)*
977	*
978	* More seriously:
979	* 1. If rtt variance happened to be less 50msec, it is hallucination.
980	* It cannot be less due to utterly erratic ACK generation made
981	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
982	* to do with delayed acks, because at cwnd>2 true delack timeout
983	* is invisible. Actually, Linux-2.4 also generates erratic
984	* ACKs in some circumstances.
985	*/
986	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
987
988	/ 2. Fixups made earlier cannot be right.*
989	* If we do not estimate RTO correctly without them,
990	* all the algo is pure shit and should be replaced
991	* with correct one. It is exactly, which we pretend to do.
992	*/
993
994	/ NOTE: clamping at TCP_RTO_MIN is not required, current algo*
995	* guarantees that rto is higher.
996	*/
997	tcp_bound_rto(sk);
998	}
999
1000	__u32 tcp_init_cwnd(const struct tcp_sock tp, const* struct dst_entry *dst)
1001	{
1002	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : `0`);
1003
1004	if (!cwnd)
1005	cwnd = TCP_INIT_CWND;
1006	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
1007	}
1008
1009	struct tcp_sacktag_state {
1010	/ Timestamps for earliest and latest never-retransmitted segment*
1011	* that was SACKed. RTO needs the earliest RTT to stay conservative,
1012	* but congestion control should still get an accurate delay signal.
1013	*/
1014	u64 first_sackt;
1015	u64 last_sackt;
1016	u32 reord;
1017	u32 sack_delivered;
1018	int flag;
1019	unsigned int mss_now;
1020	struct rate_sample *rate;
1021	};
1022
1023	/ Take a notice that peer is sending D-SACKs. Skip update of data delivery*
1024	* and spurious retransmission information if this DSACK is unlikely caused by
1025	* sender's action:
1026	* - DSACKed sequence range is larger than maximum receiver's window.
1027	* - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
1028	*/
1029	static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
1030	u32 end_seq, struct tcp_sacktag_state *state)
1031	{
1032	u32 seq_len, dup_segs = `1`;
1033
1034	if (!before(seq1: start_seq, seq2: end_seq))
1035	return `0`;
1036
1037	seq_len = end_seq - start_seq;
1038	/ Dubious DSACK: DSACKed range greater than maximum advertised rwnd /
1039	if (seq_len > tp->max_window)
1040	return `0`;
1041	if (seq_len > tp->mss_cache)
1042	dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1043	else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
1044	state->flag \|= FLAG_DSACK_TLP;
1045
1046	tp->dsack_dups += dup_segs;
1047	/ Skip the DSACK if dup segs weren't retransmitted by sender /
1048	if (tp->dsack_dups > tp->total_retrans)
1049	return `0`;
1050
1051	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
1052	/ We increase the RACK ordering window in rounds where we receive*
1053	* DSACKs that may have been due to reordering causing RACK to trigger
1054	* a spurious fast recovery. Thus RACK ignores DSACKs that happen
1055	* without having seen reordering, or that match TLP probes (TLP
1056	* is timer-driven, not triggered by RACK).
1057	*/
1058	if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
1059	tp->rack.dsack_seen = `1`;
1060
1061	state->flag \|= FLAG_DSACKING_ACK;
1062	/ A spurious retransmission is delivered /
1063	state->sack_delivered += dup_segs;
1064
1065	return dup_segs;
1066	}
1067
1068	/ It's reordering when higher sequence was delivered (i.e. sacked) before*
1069	* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
1070	* distance is approximated in full-mss packet distance ("reordering").
1071	*/
1072	static void tcp_check_sack_reordering(struct sock sk, const* u32 low_seq,
1073	const int ts)
1074	{
1075	struct tcp_sock *tp = tcp_sk(sk);
1076	const u32 mss = tp->mss_cache;
1077	u32 fack, metric;
1078
1079	fack = tcp_highest_sack_seq(tp);
1080	if (!before(seq1: low_seq, seq2: fack))
1081	return;
1082
1083	metric = fack - low_seq;
1084	if ((metric > tp->reordering * mss) && mss) {
1085	#if FASTRETRANS_DEBUG > 1
1086	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
1087	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
1088	tp->reordering,
1089	`0`,
1090	tp->sacked_out,
1091	tp->undo_marker ? tp->undo_retrans : `0`);
1092	#endif
1093	tp->reordering = min_t(u32, (metric + mss - `1`) / mss,
1094	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
1095	}
1096
1097	/ This exciting event is worth to be remembered. 8) /
1098	tp->reord_seen++;
1099	NET_INC_STATS(sock_net(sk),
1100	ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
1101	}
1102
1103	/ This must be called before lost_out or retrans_out are updated*
1104	* on a new loss, because we want to know if all skbs previously
1105	* known to be lost have already been retransmitted, indicating
1106	* that this newly lost skb is our next skb to retransmit.
1107	*/
1108	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct* sk_buff *skb)
1109	{
1110	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) \|\|
1111	(tp->retransmit_skb_hint &&
1112	before(TCP_SKB_CB(skb)->seq,
1113	TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
1114	tp->retransmit_skb_hint = skb;
1115	}
1116
1117	/ Sum the number of packets on the wire we have marked as lost, and*
1118	* notify the congestion control module that the given skb was marked lost.
1119	*/
1120	static void tcp_notify_skb_loss_event(struct tcp_sock tp, const* struct sk_buff *skb)
1121	{
1122	tp->lost += tcp_skb_pcount(skb);
1123	}
1124
1125	void tcp_mark_skb_lost(struct sock sk, struct* sk_buff *skb)
1126	{
1127	__u8 sacked = TCP_SKB_CB(skb)->sacked;
1128	struct tcp_sock *tp = tcp_sk(sk);
1129
1130	if (sacked & TCPCB_SACKED_ACKED)
1131	return;
1132
1133	tcp_verify_retransmit_hint(tp, skb);
1134	if (sacked & TCPCB_LOST) {
1135	if (sacked & TCPCB_SACKED_RETRANS) {
1136	/ Account for retransmits that are lost again /
1137	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1138	tp->retrans_out -= tcp_skb_pcount(skb);
1139	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1140	tcp_skb_pcount(skb));
1141	tcp_notify_skb_loss_event(tp, skb);
1142	}
1143	} else {
1144	tp->lost_out += tcp_skb_pcount(skb);
1145	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
1146	tcp_notify_skb_loss_event(tp, skb);
1147	}
1148	}
1149
1150	/ Updates the delivered and delivered_ce counts /
1151	static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
1152	bool ece_ack)
1153	{
1154	tp->delivered += delivered;
1155	if (ece_ack)
1156	tp->delivered_ce += delivered;
1157	}
1158
1159	/* This procedure tags the retransmission queue when SACKs arrive.
1160	*
1161	* We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
1162	* Packets in queue with these bits set are counted in variables
1163	* sacked_out, retrans_out and lost_out, correspondingly.
1164	*
1165	* Valid combinations are:
1166	* Tag InFlight Description
1167	* 0 1 - orig segment is in flight.
1168	* S 0 - nothing flies, orig reached receiver.
1169	* L 0 - nothing flies, orig lost by net.
1170	* R 2 - both orig and retransmit are in flight.
1171	* L|R 1 - orig is lost, retransmit is in flight.
1172	* S|R 1 - orig reached receiver, retrans is still in flight.
1173	* (L|S|R is logically valid, it could occur when L|R is sacked,
1174	* but it is equivalent to plain S and code short-circuits it to S.
1175	* L|S is logically invalid, it would mean -1 packet in flight 8))
1176	*
1177	* These 6 states form finite state machine, controlled by the following events:
1178	* 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
1179	* 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
1180	* 3. Loss detection event of two flavors:
1181	* A. Scoreboard estimator decided the packet is lost.
1182	* A'. Reno "three dupacks" marks head of queue lost.
1183	* B. SACK arrives sacking SND.NXT at the moment, when the
1184	* segment was retransmitted.
1185	* 4. D-SACK added new rule: D-SACK changes any tag to S.
1186	*
1187	* It is pleasant to note, that state diagram turns out to be commutative,
1188	* so that we are allowed not to be bothered by order of our actions,
1189	* when multiple events arrive simultaneously. (see the function below).
1190	*
1191	* Reordering detection.
1192	* --------------------
1193	* Reordering metric is maximal distance, which a packet can be displaced
1194	* in packet stream. With SACKs we can estimate it:
1195	*
1196	* 1. SACK fills old hole and the corresponding segment was not
1197	* ever retransmitted -> reordering. Alas, we cannot use it
1198	* when segment was retransmitted.
1199	* 2. The last flaw is solved with D-SACK. D-SACK arrives
1200	* for retransmitted and already SACKed segment -> reordering..
1201	* Both of these heuristics are not used in Loss state, when we cannot
1202	* account for retransmits accurately.
1203	*
1204	* SACK block validation.
1205	* ----------------------
1206	*
1207	* SACK block range validation checks that the received SACK block fits to
1208	* the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
1209	* Note that SND.UNA is not included to the range though being valid because
1210	* it means that the receiver is rather inconsistent with itself reporting
1211	* SACK reneging when it should advance SND.UNA. Such a SACK block is
1212	* perfectly valid, however, in light of RFC2018 which explicitly states
1213	* that "SACK block MUST reflect the newest segment. Even if the newest
1214	* segment is going to be discarded ...", not that it looks very clever
1215	* in case of head skb. Due to potential receiver driven attacks, we
1216	* choose to avoid immediate execution of a walk in write queue due to
1217	* reneging and defer head skb's loss recovery to standard loss recovery
1218	* procedure that will eventually trigger (nothing forbids us doing this).
1219	*
1220	* Implements also blockage to start_seq wrap-around. Problem lies in the
1221	* fact that though start_seq (s) is before end_seq (i.e., not reversed),
1222	* there's no guarantee that it will be before snd_nxt (n). The problem
1223	* happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
1224	* wrap (s_w):
1225	*
1226	*         <- outs wnd ->                          <- wrapzone ->
1227	*         u     e      n                         u_w   e_w  s n_w
1228	*         |     |      |                          |     |   |  |
1229	* |<------------+------+----- TCP seqno space --------------+---------->|
1230	* ...-- <2^31 ->|                                           |<--------...
1231	* ...---- >2^31 ------>|                                    |<--------...
1232	*
1233	* Current code wouldn't be vulnerable but it's better still to discard such
1234	* crazy SACK blocks. Doing this check for start_seq alone closes somewhat
1235	* similar case (end_seq after snd_nxt wrap) as earlier reversed check in
1236	* snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
1237	* equal to the ideal case (infinite seqno space without wrap caused issues).
1238	*
1239	* With D-SACK the lower bound is extended to cover sequence space below
1240	* SND.UNA down to undo_marker, which is the last point of interest. Yet
1241	* again, D-SACK block must not to go across snd_una (for the same reason as
1242	* for the normal SACK blocks, explained above). But there all simplicity
1243	* ends, TCP might receive valid D-SACKs below that. As long as they reside
1244	* fully below undo_marker they do not affect behavior in anyway and can
1245	* therefore be safely ignored. In rare cases (which are more or less
1246	* theoretical ones), the D-SACK will nicely cross that boundary due to skb
1247	* fragmentation and packet reordering past skb's retransmission. To consider
1248	* them correctly, the acceptable range must be extended even more though
1249	* the exact amount is rather hard to quantify. However, tp->max_window can
1250	* be used as an exaggerated estimate.
1251	*/
1252	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1253	u32 start_seq, u32 end_seq)
1254	{
1255	/ Too far in future, or reversed (interpretation is ambiguous) /
1256	if (after(end_seq, tp->snd_nxt) \|\| !before(seq1: start_seq, seq2: end_seq))
1257	return false;
1258
1259	/ Nasty start_seq wrap-around check (see comments above) /
1260	if (!before(seq1: start_seq, seq2: tp->snd_nxt))
1261	return false;
1262
1263	/ In outstanding window? ...This is valid exit for D-SACKs too.*
1264	* start_seq == snd_una is non-sensical (see comments above)
1265	*/
1266	if (after(start_seq, tp->snd_una))
1267	return true;
1268
1269	if (!is_dsack \|\| !tp->undo_marker)
1270	return false;
1271
1272	/ ...Then it's D-SACK, and must reside below snd_una completely /
1273	if (after(end_seq, tp->snd_una))
1274	return false;
1275
1276	if (!before(seq1: start_seq, seq2: tp->undo_marker))
1277	return true;
1278
1279	/ Too old /
1280	if (!after(end_seq, tp->undo_marker))
1281	return false;
1282
1283	/ Undo_marker boundary crossing (overestimates a lot). Known already:*
1284	* start_seq < undo_marker and end_seq >= undo_marker.
1285	*/
1286	return !before(seq1: start_seq, seq2: end_seq - tp->max_window);
1287	}
1288
1289	static bool tcp_check_dsack(struct sock sk, const* struct sk_buff *ack_skb,
1290	struct tcp_sack_block_wire sp, int* num_sacks,
1291	u32 prior_snd_una, struct tcp_sacktag_state *state)
1292	{
1293	struct tcp_sock *tp = tcp_sk(sk);
1294	u32 start_seq_0 = get_unaligned_be32(p: &sp[`0`].start_seq);
1295	u32 end_seq_0 = get_unaligned_be32(p: &sp[`0`].end_seq);
1296	u32 dup_segs;
1297
1298	if (before(seq1: start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1299	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1300	} else if (num_sacks > `1`) {
1301	u32 end_seq_1 = get_unaligned_be32(p: &sp[`1`].end_seq);
1302	u32 start_seq_1 = get_unaligned_be32(p: &sp[`1`].start_seq);
1303
1304	if (after(end_seq_0, end_seq_1) \|\| before(seq1: start_seq_0, seq2: start_seq_1))
1305	return false;
1306	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1307	} else {
1308	return false;
1309	}
1310
1311	dup_segs = tcp_dsack_seen(tp, start_seq: start_seq_0, end_seq: end_seq_0, state);
1312	if (!dup_segs) { / Skip dubious DSACK /
1313	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1314	return false;
1315	}
1316
1317	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1318
1319	/ D-SACK for already forgotten data... Do dumb counting. /
1320	if (tp->undo_marker && tp->undo_retrans > `0` &&
1321	!after(end_seq_0, prior_snd_una) &&
1322	after(end_seq_0, tp->undo_marker))
1323	tp->undo_retrans = max_t(int, `0`, tp->undo_retrans - dup_segs);
1324
1325	return true;
1326	}
1327
1328	/ Check if skb is fully within the SACK block. In presence of GSO skbs,*
1329	* the incoming SACK may not exactly match but we can find smaller MSS
1330	* aligned portion of it that matches. Therefore we might need to fragment
1331	* which may fail and creates some hassle (caller must handle error case
1332	* returns).
1333	*
1334	* FIXME: this could be merged to shift decision code
1335	*/
1336	static int tcp_match_skb_to_sack(struct sock sk, struct* sk_buff *skb,
1337	u32 start_seq, u32 end_seq)
1338	{
1339	int err;
1340	bool in_sack;
1341	unsigned int pkt_len;
1342	unsigned int mss;
1343
1344	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1345	!before(seq1: end_seq, TCP_SKB_CB(skb)->end_seq);
1346
1347	if (tcp_skb_pcount(skb) > `1` && !in_sack &&
1348	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1349	mss = tcp_skb_mss(skb);
1350	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1351
1352	if (!in_sack) {
1353	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1354	if (pkt_len < mss)
1355	pkt_len = mss;
1356	} else {
1357	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1358	if (pkt_len < mss)
1359	return -EINVAL;
1360	}
1361
1362	/ Round if necessary so that SACKs cover only full MSSes*
1363	* and/or the remaining small portion (if present)
1364	*/
1365	if (pkt_len > mss) {
1366	unsigned int new_len = (pkt_len / mss) * mss;
1367	if (!in_sack && new_len < pkt_len)
1368	new_len += mss;
1369	pkt_len = new_len;
1370	}
1371
1372	if (pkt_len >= skb->len && !in_sack)
1373	return `0`;
1374
1375	err = tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_RTX_QUEUE, skb,
1376	len: pkt_len, mss_now: mss, GFP_ATOMIC);
1377	if (err < `0`)
1378	return err;
1379	}
1380
1381	return in_sack;
1382	}
1383
1384	/ Mark the given newly-SACKed range as such, adjusting counters and hints. /
1385	static u8 tcp_sacktag_one(struct sock *sk,
1386	struct tcp_sacktag_state *state, u8 sacked,
1387	u32 start_seq, u32 end_seq,
1388	int dup_sack, int pcount,
1389	u64 xmit_time)
1390	{
1391	struct tcp_sock *tp = tcp_sk(sk);
1392
1393	/ Account D-SACK for retransmitted packet. /
1394	if (dup_sack && (sacked & TCPCB_RETRANS)) {
1395	if (tp->undo_marker && tp->undo_retrans > `0` &&
1396	after(end_seq, tp->undo_marker))
1397	tp->undo_retrans = max_t(int, `0`, tp->undo_retrans - pcount);
1398	if ((sacked & TCPCB_SACKED_ACKED) &&
1399	before(seq1: start_seq, seq2: state->reord))
1400	state->reord = start_seq;
1401	}
1402
1403	/ Nothing to do; acked frame is about to be dropped (was ACKed). /
1404	if (!after(end_seq, tp->snd_una))
1405	return sacked;
1406
1407	if (!(sacked & TCPCB_SACKED_ACKED)) {
1408	tcp_rack_advance(tp, sacked, end_seq, xmit_time);
1409
1410	if (sacked & TCPCB_SACKED_RETRANS) {
1411	/ If the segment is not tagged as lost,*
1412	* we do not clear RETRANS, believing
1413	* that retransmission is still in flight.
1414	*/
1415	if (sacked & TCPCB_LOST) {
1416	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
1417	tp->lost_out -= pcount;
1418	tp->retrans_out -= pcount;
1419	}
1420	} else {
1421	if (!(sacked & TCPCB_RETRANS)) {
1422	/ New sack for not retransmitted frame,*
1423	* which was in hole. It is reordering.
1424	*/
1425	if (before(seq1: start_seq,
1426	seq2: tcp_highest_sack_seq(tp)) &&
1427	before(seq1: start_seq, seq2: state->reord))
1428	state->reord = start_seq;
1429
1430	if (!after(end_seq, tp->high_seq))
1431	state->flag \|= FLAG_ORIG_SACK_ACKED;
1432	if (state->first_sackt == `0`)
1433	state->first_sackt = xmit_time;
1434	state->last_sackt = xmit_time;
1435	}
1436
1437	if (sacked & TCPCB_LOST) {
1438	sacked &= ~TCPCB_LOST;
1439	tp->lost_out -= pcount;
1440	}
1441	}
1442
1443	sacked \|= TCPCB_SACKED_ACKED;
1444	state->flag \|= FLAG_DATA_SACKED;
1445	tp->sacked_out += pcount;
1446	/ Out-of-order packets delivered /
1447	state->sack_delivered += pcount;
1448
1449	/ Lost marker hint past SACKed? Tweak RFC3517 cnt /
1450	if (tp->lost_skb_hint &&
1451	before(seq1: start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1452	tp->lost_cnt_hint += pcount;
1453	}
1454
1455	/ D-SACK. We can detect redundant retransmission in S\|R and plain R*
1456	* frames and clear it. undo_retrans is decreased above, L\|R frames
1457	* are accounted above as well.
1458	*/
1459	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1460	sacked &= ~TCPCB_SACKED_RETRANS;
1461	tp->retrans_out -= pcount;
1462	}
1463
1464	return sacked;
1465	}
1466
1467	/ Shift newly-SACKed bytes from this skb to the immediately previous*
1468	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1469	*/
1470	static bool tcp_shifted_skb(struct sock sk, struct* sk_buff *prev,
1471	struct sk_buff *skb,
1472	struct tcp_sacktag_state *state,
1473	unsigned int pcount, int shifted, int mss,
1474	bool dup_sack)
1475	{
1476	struct tcp_sock *tp = tcp_sk(sk);
1477	u32 start_seq = TCP_SKB_CB(skb)->seq; / start of newly-SACKed /
1478	u32 end_seq = start_seq + shifted; / end of newly-SACKed /
1479
1480	BUG_ON(!pcount);
1481
1482	/ Adjust counters and hints for the newly sacked sequence*
1483	* range but discard the return value since prev is already
1484	* marked. We must tag the range first because the seq
1485	* advancement below implicitly advances
1486	* tcp_highest_sack_seq() when skb is highest_sack.
1487	*/
1488	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1489	start_seq, end_seq, dup_sack, pcount,
1490	xmit_time: tcp_skb_timestamp_us(skb));
1491	tcp_rate_skb_delivered(sk, skb, rs: state->rate);
1492
1493	if (skb == tp->lost_skb_hint)
1494	tp->lost_cnt_hint += pcount;
1495
1496	TCP_SKB_CB(prev)->end_seq += shifted;
1497	TCP_SKB_CB(skb)->seq += shifted;
1498
1499	tcp_skb_pcount_add(skb: prev, segs: pcount);
1500	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
1501	tcp_skb_pcount_add(skb, segs: -pcount);
1502
1503	/ When we're adding to gso_segs == 1, gso_size will be zero,*
1504	* in theory this shouldn't be necessary but as long as DSACK
1505	* code can come after this skb later on it's better to keep
1506	* setting gso_size to something.
1507	*/
1508	if (!TCP_SKB_CB(prev)->tcp_gso_size)
1509	TCP_SKB_CB(prev)->tcp_gso_size = mss;
1510
1511	/ CHECKME: To clear or not to clear? Mimics normal skb currently /
1512	if (tcp_skb_pcount(skb) <= `1`)
1513	TCP_SKB_CB(skb)->tcp_gso_size = `0`;
1514
1515	/ Difference in this won't matter, both ACKed by the same cumul. ACK /
1516	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1517
1518	if (skb->len > `0`) {
1519	BUG_ON(!tcp_skb_pcount(skb));
1520	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1521	return false;
1522	}
1523
1524	/ Whole SKB was eaten :-) /
1525
1526	if (skb == tp->retransmit_skb_hint)
1527	tp->retransmit_skb_hint = prev;
1528	if (skb == tp->lost_skb_hint) {
1529	tp->lost_skb_hint = prev;
1530	tp->lost_cnt_hint -= tcp_skb_pcount(skb: prev);
1531	}
1532
1533	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
1534	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
1535	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1536	TCP_SKB_CB(prev)->end_seq++;
1537
1538	if (skb == tcp_highest_sack(sk))
1539	tcp_advance_highest_sack(sk, skb);
1540
1541	tcp_skb_collapse_tstamp(skb: prev, next_skb: skb);
1542	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1543	TCP_SKB_CB(prev)->tx.delivered_mstamp = `0`;
1544
1545	tcp_rtx_queue_unlink_and_free(skb, sk);
1546
1547	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1548
1549	return true;
1550	}
1551
1552	/ I wish gso_size would have a bit more sane initialization than*
1553	* something-or-zero which complicates things
1554	*/
1555	static int tcp_skb_seglen(const struct sk_buff *skb)
1556	{
1557	return tcp_skb_pcount(skb) == `1` ? skb->len : tcp_skb_mss(skb);
1558	}
1559
/* Shifting pages past head area doesn't work.
 *
 * Returns non-zero only for a non-linear skb with an empty head area.
 */
static int skb_can_shift(const struct sk_buff *skb)
{
	return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
1565
1566	int tcp_skb_shift(struct sk_buff to, struct* sk_buff *from,
1567	int pcount, int shiftlen)
1568	{
1569	/ TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)*
1570	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
1571	* to make sure not storing more than 65535 * 8 bytes per skb,
1572	* even if current MSS is bigger.
1573	*/
1574	if (unlikely(to->len + shiftlen >= `65535` * TCP_MIN_GSO_SIZE))
1575	return `0`;
1576	if (unlikely(tcp_skb_pcount(to) + pcount > `65535`))
1577	return `0`;
1578	return skb_shift(tgt: to, skb: from, shiftlen);
1579	}
1580
1581	/ Try collapsing SACK blocks spanning across multiple skbs to a single*
1582	* skb.
1583	*/
1584	static struct sk_buff tcp_shift_skb_data(struct* sock sk, struct* sk_buff *skb,
1585	struct tcp_sacktag_state *state,
1586	u32 start_seq, u32 end_seq,
1587	bool dup_sack)
1588	{
1589	struct tcp_sock *tp = tcp_sk(sk);
1590	struct sk_buff *prev;
1591	int mss;
1592	int pcount = `0`;
1593	int len;
1594	int in_sack;
1595
1596	/ Normally R but no L won't result in plain S /
1597	if (!dup_sack &&
1598	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1599	goto fallback;
1600	if (!skb_can_shift(skb))
1601	goto fallback;
1602	/ This frame is about to be dropped (was ACKed). /
1603	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1604	goto fallback;
1605
1606	/ Can only happen with delayed DSACK + discard craziness /
1607	prev = skb_rb_prev(skb);
1608	if (!prev)
1609	goto fallback;
1610
1611	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1612	goto fallback;
1613
1614	if (!tcp_skb_can_collapse(to: prev, from: skb))
1615	goto fallback;
1616
1617	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1618	!before(seq1: end_seq, TCP_SKB_CB(skb)->end_seq);
1619
1620	if (in_sack) {
1621	len = skb->len;
1622	pcount = tcp_skb_pcount(skb);
1623	mss = tcp_skb_seglen(skb);
1624
1625	/ TODO: Fix DSACKs to not fragment already SACKed and we can*
1626	* drop this restriction as unnecessary
1627	*/
1628	if (mss != tcp_skb_seglen(skb: prev))
1629	goto fallback;
1630	} else {
1631	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1632	goto noop;
1633	/ CHECKME: This is non-MSS split case only?, this will*
1634	* cause skipped skbs due to advancing loop btw, original
1635	* has that feature too
1636	*/
1637	if (tcp_skb_pcount(skb) <= `1`)
1638	goto noop;
1639
1640	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1641	if (!in_sack) {
1642	/ TODO: head merge to next could be attempted here*
1643	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1644	* though it might not be worth of the additional hassle
1645	*
1646	* ...we can probably just fallback to what was done
1647	* previously. We could try merging non-SACKed ones
1648	* as well but it probably isn't going to buy off
1649	* because later SACKs might again split them, and
1650	* it would make skb timestamp tracking considerably
1651	* harder problem.
1652	*/
1653	goto fallback;
1654	}
1655
1656	len = end_seq - TCP_SKB_CB(skb)->seq;
1657	BUG_ON(len < `0`);
1658	BUG_ON(len > skb->len);
1659
1660	/ MSS boundaries should be honoured or else pcount will*
1661	* severely break even though it makes things bit trickier.
1662	* Optimize common case to avoid most of the divides
1663	*/
1664	mss = tcp_skb_mss(skb);
1665
1666	/ TODO: Fix DSACKs to not fragment already SACKed and we can*
1667	* drop this restriction as unnecessary
1668	*/
1669	if (mss != tcp_skb_seglen(skb: prev))
1670	goto fallback;
1671
1672	if (len == mss) {
1673	pcount = `1`;
1674	} else if (len < mss) {
1675	goto noop;
1676	} else {
1677	pcount = len / mss;
1678	len = pcount * mss;
1679	}
1680	}
1681
1682	/ tcp_sacktag_one() won't SACK-tag ranges below snd_una /
1683	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1684	goto fallback;
1685
1686	if (!tcp_skb_shift(to: prev, from: skb, pcount, shiftlen: len))
1687	goto fallback;
1688	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, shifted: len, mss, dup_sack))
1689	goto out;
1690
1691	/ Hole filled allows collapsing with the next as well, this is very*
1692	* useful when hole on every nth skb pattern happens
1693	*/
1694	skb = skb_rb_next(prev);
1695	if (!skb)
1696	goto out;
1697
1698	if (!skb_can_shift(skb) \|\|
1699	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
1700	(mss != tcp_skb_seglen(skb)))
1701	goto out;
1702
1703	if (!tcp_skb_can_collapse(to: prev, from: skb))
1704	goto out;
1705	len = skb->len;
1706	pcount = tcp_skb_pcount(skb);
1707	if (tcp_skb_shift(to: prev, from: skb, pcount, shiftlen: len))
1708	tcp_shifted_skb(sk, prev, skb, state, pcount,
1709	shifted: len, mss, dup_sack: `0`);
1710
1711	out:
1712	return prev;
1713
1714	noop:
1715	return skb;
1716
1717	fallback:
1718	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1719	return NULL;
1720	}
1721
1722	static struct sk_buff tcp_sacktag_walk(struct* sk_buff skb, struct* sock *sk,
1723	struct tcp_sack_block *next_dup,
1724	struct tcp_sacktag_state *state,
1725	u32 start_seq, u32 end_seq,
1726	bool dup_sack_in)
1727	{
1728	struct tcp_sock *tp = tcp_sk(sk);
1729	struct sk_buff *tmp;
1730
1731	skb_rbtree_walk_from(skb) {
1732	int in_sack = `0`;
1733	bool dup_sack = dup_sack_in;
1734
1735	/ queue is in-order => we can short-circuit the walk early /
1736	if (!before(TCP_SKB_CB(skb)->seq, seq2: end_seq))
1737	break;
1738
1739	if (next_dup &&
1740	before(TCP_SKB_CB(skb)->seq, seq2: next_dup->end_seq)) {
1741	in_sack = tcp_match_skb_to_sack(sk, skb,
1742	start_seq: next_dup->start_seq,
1743	end_seq: next_dup->end_seq);
1744	if (in_sack > `0`)
1745	dup_sack = true;
1746	}
1747
1748	/ skb reference here is a bit tricky to get right, since*
1749	* shifting can eat and free both this skb and the next,
1750	* so not even _safe variant of the loop is enough.
1751	*/
1752	if (in_sack <= `0`) {
1753	tmp = tcp_shift_skb_data(sk, skb, state,
1754	start_seq, end_seq, dup_sack);
1755	if (tmp) {
1756	if (tmp != skb) {
1757	skb = tmp;
1758	continue;
1759	}
1760
1761	in_sack = `0`;
1762	} else {
1763	in_sack = tcp_match_skb_to_sack(sk, skb,
1764	start_seq,
1765	end_seq);
1766	}
1767	}
1768
1769	if (unlikely(in_sack < `0`))
1770	break;
1771
1772	if (in_sack) {
1773	TCP_SKB_CB(skb)->sacked =
1774	tcp_sacktag_one(sk,
1775	state,
1776	TCP_SKB_CB(skb)->sacked,
1777	TCP_SKB_CB(skb)->seq,
1778	TCP_SKB_CB(skb)->end_seq,
1779	dup_sack,
1780	pcount: tcp_skb_pcount(skb),
1781	xmit_time: tcp_skb_timestamp_us(skb));
1782	tcp_rate_skb_delivered(sk, skb, rs: state->rate);
1783	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1784	list_del_init(entry: &skb->tcp_tsorted_anchor);
1785
1786	if (!before(TCP_SKB_CB(skb)->seq,
1787	seq2: tcp_highest_sack_seq(tp)))
1788	tcp_advance_highest_sack(sk, skb);
1789	}
1790	}
1791	return skb;
1792	}
1793
1794	static struct sk_buff tcp_sacktag_bsearch(struct* sock *sk, u32 seq)
1795	{
1796	struct rb_node parent, *p = &sk->tcp_rtx_queue.rb_node;
1797	struct sk_buff *skb;
1798
1799	while (*p) {
1800	parent = *p;
1801	skb = rb_to_skb(parent);
1802	if (before(seq1: seq, TCP_SKB_CB(skb)->seq)) {
1803	p = &parent->rb_left;
1804	continue;
1805	}
1806	if (!before(seq1: seq, TCP_SKB_CB(skb)->end_seq)) {
1807	p = &parent->rb_right;
1808	continue;
1809	}
1810	return skb;
1811	}
1812	return NULL;
1813	}
1814
1815	static struct sk_buff tcp_sacktag_skip(struct* sk_buff skb, struct* sock *sk,
1816	u32 skip_to_seq)
1817	{
1818	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
1819	return skb;
1820
1821	return tcp_sacktag_bsearch(sk, seq: skip_to_seq);
1822	}
1823
1824	static struct sk_buff tcp_maybe_skipping_dsack(struct* sk_buff *skb,
1825	struct sock *sk,
1826	struct tcp_sack_block *next_dup,
1827	struct tcp_sacktag_state *state,
1828	u32 skip_to_seq)
1829	{
1830	if (!next_dup)
1831	return skb;
1832
1833	if (before(seq1: next_dup->start_seq, seq2: skip_to_seq)) {
1834	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: next_dup->start_seq);
1835	skb = tcp_sacktag_walk(skb, sk, NULL, state,
1836	start_seq: next_dup->start_seq, end_seq: next_dup->end_seq,
1837	dup_sack_in: `1`);
1838	}
1839
1840	return skb;
1841	}
1842
1843	static int tcp_sack_cache_ok(const struct tcp_sock tp, const* struct tcp_sack_block *cache)
1844	{
1845	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1846	}
1847
1848	static int
1849	tcp_sacktag_write_queue(struct sock sk, const* struct sk_buff *ack_skb,
1850	u32 prior_snd_una, struct tcp_sacktag_state *state)
1851	{
1852	struct tcp_sock *tp = tcp_sk(sk);
1853	const unsigned char *ptr = (skb_transport_header(skb: ack_skb) +
1854	TCP_SKB_CB(ack_skb)->sacked);
1855	struct tcp_sack_block_wire sp_wire = (struct* tcp_sack_block_wire *)(ptr+`2`);
1856	struct tcp_sack_block sp[TCP_NUM_SACKS];
1857	struct tcp_sack_block *cache;
1858	struct sk_buff *skb;
1859	int num_sacks = min(TCP_NUM_SACKS, (ptr[`1`] - TCPOLEN_SACK_BASE) >> `3`);
1860	int used_sacks;
1861	bool found_dup_sack = false;
1862	int i, j;
1863	int first_sack_index;
1864
1865	state->flag = `0`;
1866	state->reord = tp->snd_nxt;
1867
1868	if (!tp->sacked_out)
1869	tcp_highest_sack_reset(sk);
1870
1871	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp: sp_wire,
1872	num_sacks, prior_snd_una, state);
1873
1874	/ Eliminate too old ACKs, but take into*
1875	* account more or less fresh ones, they can
1876	* contain valid SACK info.
1877	*/
1878	if (before(TCP_SKB_CB(ack_skb)->ack_seq, seq2: prior_snd_una - tp->max_window))
1879	return `0`;
1880
1881	if (!tp->packets_out)
1882	goto out;
1883
1884	used_sacks = `0`;
1885	first_sack_index = `0`;
1886	for (i = `0`; i < num_sacks; i++) {
1887	bool dup_sack = !i && found_dup_sack;
1888
1889	sp[used_sacks].start_seq = get_unaligned_be32(p: &sp_wire[i].start_seq);
1890	sp[used_sacks].end_seq = get_unaligned_be32(p: &sp_wire[i].end_seq);
1891
1892	if (!tcp_is_sackblock_valid(tp, is_dsack: dup_sack,
1893	start_seq: sp[used_sacks].start_seq,
1894	end_seq: sp[used_sacks].end_seq)) {
1895	int mib_idx;
1896
1897	if (dup_sack) {
1898	if (!tp->undo_marker)
1899	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1900	else
1901	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1902	} else {
1903	/ Don't count olds caused by ACK reordering /
1904	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1905	!after(sp[used_sacks].end_seq, tp->snd_una))
1906	continue;
1907	mib_idx = LINUX_MIB_TCPSACKDISCARD;
1908	}
1909
1910	NET_INC_STATS(sock_net(sk), mib_idx);
1911	if (i == `0`)
1912	first_sack_index = -`1`;
1913	continue;
1914	}
1915
1916	/ Ignore very old stuff early /
1917	if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
1918	if (i == `0`)
1919	first_sack_index = -`1`;
1920	continue;
1921	}
1922
1923	used_sacks++;
1924	}
1925
1926	/ order SACK blocks to allow in order walk of the retrans queue /
1927	for (i = used_sacks - `1`; i > `0`; i--) {
1928	for (j = `0`; j < i; j++) {
1929	if (after(sp[j].start_seq, sp[j + `1`].start_seq)) {
1930	swap(sp[j], sp[j + `1`]);
1931
1932	/ Track where the first SACK block goes to /
1933	if (j == first_sack_index)
1934	first_sack_index = j + `1`;
1935	}
1936	}
1937	}
1938
1939	state->mss_now = tcp_current_mss(sk);
1940	skb = NULL;
1941	i = `0`;
1942
1943	if (!tp->sacked_out) {
1944	/ It's already past, so skip checking against it /
1945	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1946	} else {
1947	cache = tp->recv_sack_cache;
1948	/ Skip empty blocks in at head of the cache /
1949	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1950	!cache->end_seq)
1951	cache++;
1952	}
1953
1954	while (i < used_sacks) {
1955	u32 start_seq = sp[i].start_seq;
1956	u32 end_seq = sp[i].end_seq;
1957	bool dup_sack = (found_dup_sack && (i == first_sack_index));
1958	struct tcp_sack_block *next_dup = NULL;
1959
1960	if (found_dup_sack && ((i + `1`) == first_sack_index))
1961	next_dup = &sp[i + `1`];
1962
1963	/ Skip too early cached blocks /
1964	while (tcp_sack_cache_ok(tp, cache) &&
1965	!before(seq1: start_seq, seq2: cache->end_seq))
1966	cache++;
1967
1968	/ Can skip some work by looking recv_sack_cache? /
1969	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1970	after(end_seq, cache->start_seq)) {
1971
1972	/ Head todo? /
1973	if (before(seq1: start_seq, seq2: cache->start_seq)) {
1974	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: start_seq);
1975	skb = tcp_sacktag_walk(skb, sk, next_dup,
1976	state,
1977	start_seq,
1978	end_seq: cache->start_seq,
1979	dup_sack_in: dup_sack);
1980	}
1981
1982	/ Rest of the block already fully processed? /
1983	if (!after(end_seq, cache->end_seq))
1984	goto advance_sp;
1985
1986	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1987	state,
1988	skip_to_seq: cache->end_seq);
1989
1990	/ ...tail remains todo... /
1991	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1992	/ ...but better entrypoint exists! /
1993	skb = tcp_highest_sack(sk);
1994	if (!skb)
1995	break;
1996	cache++;
1997	goto walk;
1998	}
1999
2000	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: cache->end_seq);
2001	/ Check overlap against next cached too (past this one already) /
2002	cache++;
2003	continue;
2004	}
2005
2006	if (!before(seq1: start_seq, seq2: tcp_highest_sack_seq(tp))) {
2007	skb = tcp_highest_sack(sk);
2008	if (!skb)
2009	break;
2010	}
2011	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: start_seq);
2012
2013	walk:
2014	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
2015	start_seq, end_seq, dup_sack_in: dup_sack);
2016
2017	advance_sp:
2018	i++;
2019	}
2020
2021	/ Clear the head of the cache sack blocks so we can skip it next time /
2022	for (i = `0`; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
2023	tp->recv_sack_cache[i].start_seq = `0`;
2024	tp->recv_sack_cache[i].end_seq = `0`;
2025	}
2026	for (j = `0`; j < used_sacks; j++)
2027	tp->recv_sack_cache[i++] = sp[j];
2028
2029	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss \|\| tp->undo_marker)
2030	tcp_check_sack_reordering(sk, low_seq: state->reord, ts: `0`);
2031
2032	tcp_verify_left_out(tp);
2033	out:
2034
2035	#if FASTRETRANS_DEBUG > 0
2036	WARN_ON((int)tp->sacked_out < `0`);
2037	WARN_ON((int)tp->lost_out < `0`);
2038	WARN_ON((int)tp->retrans_out < `0`);
2039	WARN_ON((int)tcp_packets_in_flight(tp) < `0`);
2040	#endif
2041	return state->flag;
2042	}
2043
2044	/ Limits sacked_out so that sum with lost_out isn't ever larger than*
2045	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
2046	*/
2047	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
2048	{
2049	u32 holes;
2050
2051	holes = max(tp->lost_out, `1U`);
2052	holes = min(holes, tp->packets_out);
2053
2054	if ((tp->sacked_out + holes) > tp->packets_out) {
2055	tp->sacked_out = tp->packets_out - holes;
2056	return true;
2057	}
2058	return false;
2059	}
2060
2061	/ If we receive more dupacks than we expected counting segments*
2062	* in assumption of absent reordering, interpret this as reordering.
2063	* The only another reason could be bug in receiver TCP.
2064	*/
2065	static void tcp_check_reno_reordering(struct sock sk, const* int addend)
2066	{
2067	struct tcp_sock *tp = tcp_sk(sk);
2068
2069	if (!tcp_limit_reno_sacked(tp))
2070	return;
2071
2072	tp->reordering = min_t(u32, tp->packets_out + addend,
2073	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
2074	tp->reord_seen++;
2075	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
2076	}
2077
2078	/ Emulate SACKs for SACKless connection: account for a new dupack. /
2079
2080	static void tcp_add_reno_sack(struct sock sk, int* num_dupack, bool ece_ack)
2081	{
2082	if (num_dupack) {
2083	struct tcp_sock *tp = tcp_sk(sk);
2084	u32 prior_sacked = tp->sacked_out;
2085	s32 delivered;
2086
2087	tp->sacked_out += num_dupack;
2088	tcp_check_reno_reordering(sk, addend: `0`);
2089	delivered = tp->sacked_out - prior_sacked;
2090	if (delivered > `0`)
2091	tcp_count_delivered(tp, delivered, ece_ack);
2092	tcp_verify_left_out(tp);
2093	}
2094	}
2095
2096	/ Account for ACK, ACKing some data in Reno Recovery phase. /
2097
2098	static void tcp_remove_reno_sacks(struct sock sk, int* acked, bool ece_ack)
2099	{
2100	struct tcp_sock *tp = tcp_sk(sk);
2101
2102	if (acked > `0`) {
2103	/ One ACK acked hole. The rest eat duplicate ACKs. /
2104	tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, `1`),
2105	ece_ack);
2106	if (acked - `1` >= tp->sacked_out)
2107	tp->sacked_out = `0`;
2108	else
2109	tp->sacked_out -= acked - `1`;
2110	}
2111	tcp_check_reno_reordering(sk, addend: acked);
2112	tcp_verify_left_out(tp);
2113	}
2114
2115	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
2116	{
2117	tp->sacked_out = `0`;
2118	}
2119
2120	void tcp_clear_retrans(struct tcp_sock *tp)
2121	{
2122	tp->retrans_out = `0`;
2123	tp->lost_out = `0`;
2124	tp->undo_marker = `0`;
2125	tp->undo_retrans = -`1`;
2126	tp->sacked_out = `0`;
2127	tp->rto_stamp = `0`;
2128	tp->total_rto = `0`;
2129	tp->total_rto_recoveries = `0`;
2130	tp->total_rto_time = `0`;
2131	}
2132
2133	static inline void tcp_init_undo(struct tcp_sock *tp)
2134	{
2135	tp->undo_marker = tp->snd_una;
2136	/ Retransmission still in flight may cause DSACKs later. /
2137	tp->undo_retrans = tp->retrans_out ? : -`1`;
2138	}
2139
2140	static bool tcp_is_rack(const struct sock *sk)
2141	{
2142	return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2143	TCP_RACK_LOSS_DETECTION;
2144	}
2145
2146	/ If we detect SACK reneging, forget all SACK information*
2147	* and reset tags completely, otherwise preserve SACKs. If receiver
2148	* dropped its ofo queue, we will know this due to reneging detection.
2149	*/
2150	static void tcp_timeout_mark_lost(struct sock *sk)
2151	{
2152	struct tcp_sock *tp = tcp_sk(sk);
2153	struct sk_buff skb, head;
2154	bool is_reneg; / is receiver reneging on SACKs? /
2155
2156	head = tcp_rtx_queue_head(sk);
2157	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
2158	if (is_reneg) {
2159	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
2160	tp->sacked_out = `0`;
2161	/ Mark SACK reneging until we recover from this loss event. /
2162	tp->is_sack_reneg = `1`;
2163	} else if (tcp_is_reno(tp)) {
2164	tcp_reset_reno_sack(tp);
2165	}
2166
2167	skb = head;
2168	skb_rbtree_walk_from(skb) {
2169	if (is_reneg)
2170	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
2171	else if (tcp_is_rack(sk) && skb != head &&
2172	tcp_rack_skb_timeout(tp, skb, reo_wnd: `0`) > `0`)
2173	continue; / Don't mark recently sent ones lost yet /
2174	tcp_mark_skb_lost(sk, skb);
2175	}
2176	tcp_verify_left_out(tp);
2177	tcp_clear_all_retrans_hints(tp);
2178	}
2179
2180	/ Enter Loss state. /
2181	void tcp_enter_loss(struct sock *sk)
2182	{
2183	const struct inet_connection_sock *icsk = inet_csk(sk);
2184	struct tcp_sock *tp = tcp_sk(sk);
2185	struct net *net = sock_net(sk);
2186	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2187	u8 reordering;
2188
2189	tcp_timeout_mark_lost(sk);
2190
2191	/ Reduce ssthresh if it has not yet been made inside this window. /
2192	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
2193	!after(tp->high_seq, tp->snd_una) \|\|
2194	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
2195	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2196	tp->prior_cwnd = tcp_snd_cwnd(tp);
2197	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
2198	tcp_ca_event(sk, event: CA_EVENT_LOSS);
2199	tcp_init_undo(tp);
2200	}
2201	tcp_snd_cwnd_set(tp, val: tcp_packets_in_flight(tp) + `1`);
2202	tp->snd_cwnd_cnt = `0`;
2203	tp->snd_cwnd_stamp = tcp_jiffies32;
2204
2205	/ Timeout in disordered state after receiving substantial DUPACKs*
2206	* suggests that the degree of reordering is over-estimated.
2207	*/
2208	reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
2209	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2210	tp->sacked_out >= reordering)
2211	tp->reordering = min_t(unsigned int, tp->reordering,
2212	reordering);
2213
2214	tcp_set_ca_state(sk, ca_state: TCP_CA_Loss);
2215	tp->high_seq = tp->snd_nxt;
2216	tcp_ecn_queue_cwr(tp);
2217
2218	/ F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous*
2219	* loss recovery is underway except recurring timeout(s) on
2220	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2221	*/
2222	tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
2223	(new_recovery \|\| icsk->icsk_retransmits) &&
2224	!inet_csk(sk)->icsk_mtup.probe_size;
2225	}
2226
2227	/ If ACK arrived pointing to a remembered SACK, it means that our*
2228	* remembered SACKs do not reflect real state of receiver i.e.
2229	* receiver _host_ is heavily congested (or buggy).
2230	*
2231	* To avoid big spurious retransmission bursts due to transient SACK
2232	* scoreboard oddities that look like reneging, we give the receiver a
2233	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
2234	* restore sanity to the SACK scoreboard. If the apparent reneging
2235	* persists until this RTO then we'll clear the SACK scoreboard.
2236	*/
2237	static bool tcp_check_sack_reneging(struct sock sk, int* *ack_flag)
2238	{
2239	if (*ack_flag & FLAG_SACK_RENEGING &&
2240	*ack_flag & FLAG_SND_UNA_ADVANCED) {
2241	struct tcp_sock *tp = tcp_sk(sk);
2242	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> `4`),
2243	msecs_to_jiffies(`10`));
2244
2245	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2246	when: delay, TCP_RTO_MAX);
2247	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
2248	return true;
2249	}
2250	return false;
2251	}
2252
2253	/ Heurestics to calculate number of duplicate ACKs. There's no dupACKs*
2254	* counter when SACK is enabled (without SACK, sacked_out is used for
2255	* that purpose).
2256	*
2257	* With reordering, holes may still be in flight, so RFC3517 recovery
2258	* uses pure sacked_out (total number of SACKed segments) even though
2259	* it violates the RFC that uses duplicate ACKs, often these are equal
2260	* but when e.g. out-of-window ACKs or packet duplication occurs,
2261	* they differ. Since neither occurs due to loss, TCP should really
2262	* ignore them.
2263	*/
2264	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2265	{
2266	return tp->sacked_out + `1`;
2267	}
2268
/* Linux NewReno/SACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"      Normal state, no dubious events, fast path.
 * "Disorder"  In all the respects it is "Open",
 *             but requires a bit more attention. It is entered when
 *             we see some SACKs or dupacks. It is split off from "Open"
 *             mainly to move some processing from fast path to slow one.
 * "CWR"       CWND was reduced due to some Congestion Notification event.
 *             It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"  CWND was reduced, we are fast-retransmitting.
 * "Loss"      CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *     * SACK
 *     * Duplicate ACK.
 *     * ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *     in_flight = packets_out - left_out + retrans_out
 *
 *     packets_out is SND.NXT-SND.UNA counted in packets.
 *
 *     retrans_out is number of retransmitted segments.
 *
 *     left_out is number of segments left network, but not ACKed yet.
 *
 *         left_out = sacked_out + lost_out
 *
 *     sacked_out: Packets, which arrived to receiver out of order
 *                 and hence not ACKed. With SACKs this number is simply
 *                 amount of SACKed data. Even without SACKs
 *                 it is easy to give pretty reliable estimate of this number,
 *                 counting duplicate ACKs.
 *
 *       lost_out: Packets lost by network. TCP has no explicit
 *                 "loss notification" feedback from network (for now).
 *                 It means that this number can be only _guessed_.
 *                 Actually, it is the heuristics to predict lossage that
 *                 distinguishes different algorithms.
 *
 *     F.e. after RTO, when all the queue is considered as lost,
 *     lost_out = packets_out and in_flight = retrans_out.
 *
 *         Essentially, we have now a few algorithms detecting
 *         lost packets.
 *
 *         If the receiver supports SACK:
 *
 *         RFC6675/3517: It is the conventional algorithm. A packet is
 *         considered lost if the number of higher sequence packets
 *         SACKed is greater than or equal to the DUPACK threshold
 *         (reordering). This is implemented in tcp_mark_head_lost and
 *         tcp_update_scoreboard.
 *
 *         RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
 *         (2017-) that checks timing instead of counting DUPACKs.
 *         Essentially a packet is considered lost if it's not S/ACKed
 *         after RTT + reordering_window, where both metrics are
 *         dynamically measured and adjusted. This is implemented in
 *         tcp_rack_mark_lost.
 *
 *         If the receiver does not support SACK:
 *
 *         NewReno (RFC6582): in Recovery we assume that one segment
 *         is lost (classic Reno). While we are in Recovery and
 *         a partial ACK arrives, we assume that one more packet
 *         is lost (NewReno). These heuristics are the same in NewReno
 *         and SACK.
 *
 * Really tricky (and requiring careful tuning) part of algorithm
 * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 * The first determines the moment _when_ we should reduce CWND and,
 * hence, slow down forward transmission. In fact, it determines the moment
 * when we decide that hole is caused by loss, rather than by a reorder.
 *
 * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
 * holes, caused by lost packets.
 *
 * And the most logically complicated part of algorithm is undo
 * heuristics. We detect false retransmits due to both too early
 * fast retransmit (reordering) and underestimated RTO, analyzing
 * timestamps and D-SACKs. When we detect that some segments were
 * retransmitted by mistake and CWND reduction was wrong, we undo
 * window reduction and abort recovery phase. This logic is hidden
 * inside several functions named tcp_try_undo_<something>.
 */
2359
2360	/ This function decides, when we should leave Disordered state*
2361	* and enter Recovery phase, reducing congestion window.
2362	*
2363	* Main question: may we further continue forward transmission
2364	* with the same cwnd?
2365	*/
2366	static bool tcp_time_to_recover(struct sock sk, int* flag)
2367	{
2368	struct tcp_sock *tp = tcp_sk(sk);
2369
2370	/ Trick#1: The loss is proven. /
2371	if (tp->lost_out)
2372	return true;
2373
2374	/ Not-A-Trick#2 : Classic rule... /
2375	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
2376	return true;
2377
2378	return false;
2379	}
2380
2381	/ Detect loss in event "A" above by marking head of queue up as lost.*
2382	* For RFC3517 SACK, a segment is considered lost if it
2383	* has at least tp->reordering SACKed seqments above it; "packets" refers to
2384	* the maximum SACKed segments to pass before reaching this limit.
2385	*/
2386	static void tcp_mark_head_lost(struct sock sk, int* packets, int mark_head)
2387	{
2388	struct tcp_sock *tp = tcp_sk(sk);
2389	struct sk_buff *skb;
2390	int cnt;
2391	/ Use SACK to deduce losses of new sequences sent during recovery /
2392	const u32 loss_high = tp->snd_nxt;
2393
2394	WARN_ON(packets > tp->packets_out);
2395	skb = tp->lost_skb_hint;
2396	if (skb) {
2397	/ Head already handled? /
2398	if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
2399	return;
2400	cnt = tp->lost_cnt_hint;
2401	} else {
2402	skb = tcp_rtx_queue_head(sk);
2403	cnt = `0`;
2404	}
2405
2406	skb_rbtree_walk_from(skb) {
2407	/ TODO: do this better /
2408	/ this is not the most efficient way to do this... /
2409	tp->lost_skb_hint = skb;
2410	tp->lost_cnt_hint = cnt;
2411
2412	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2413	break;
2414
2415	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2416	cnt += tcp_skb_pcount(skb);
2417
2418	if (cnt > packets)
2419	break;
2420
2421	if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2422	tcp_mark_skb_lost(sk, skb);
2423
2424	if (mark_head)
2425	break;
2426	}
2427	tcp_verify_left_out(tp);
2428	}
2429
2430	/ Account newly detected lost packet(s) /
2431
2432	static void tcp_update_scoreboard(struct sock sk, int* fast_rexmit)
2433	{
2434	struct tcp_sock *tp = tcp_sk(sk);
2435
2436	if (tcp_is_sack(tp)) {
2437	int sacked_upto = tp->sacked_out - tp->reordering;
2438	if (sacked_upto >= `0`)
2439	tcp_mark_head_lost(sk, packets: sacked_upto, mark_head: `0`);
2440	else if (fast_rexmit)
2441	tcp_mark_head_lost(sk, packets: `1`, mark_head: `1`);
2442	}
2443	}
2444
2445	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
2446	{
2447	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2448	before(seq1: tp->rx_opt.rcv_tsecr, seq2: when);
2449	}
2450
2451	/ skb is spurious retransmitted if the returned timestamp echo*
2452	* reply is prior to the skb transmission time
2453	*/
2454	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2455	const struct sk_buff *skb)
2456	{
2457	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2458	tcp_tsopt_ecr_before(tp, when: tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb));
2459	}
2460
2461	/ Nothing was retransmitted or returned timestamp is less*
2462	* than timestamp of the first retransmission.
2463	*/
2464	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2465	{
2466	return tp->retrans_stamp &&
2467	tcp_tsopt_ecr_before(tp, when: tp->retrans_stamp);
2468	}
2469
/* Undo procedures. */
2471
2472	/ We can clear retrans_stamp when there are no retransmissions in the*
2473	* window. It would seem that it is trivially available for us in
2474	* tp->retrans_out, however, that kind of assumptions doesn't consider
2475	* what will happen if errors occur when sending retransmission for the
2476	* second time. ...It could the that such segment has only
2477	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
2478	* the head skb is enough except for some reneging corner cases that
2479	* are not worth the effort.
2480	*
2481	* Main reason for all this complexity is the fact that connection dying
2482	* time now depends on the validity of the retrans_stamp, in particular,
2483	* that successive retransmissions of a segment must not advance
2484	* retrans_stamp under any conditions.
2485	*/
2486	static bool tcp_any_retrans_done(const struct sock *sk)
2487	{
2488	const struct tcp_sock *tp = tcp_sk(sk);
2489	struct sk_buff *skb;
2490
2491	if (tp->retrans_out)
2492	return true;
2493
2494	skb = tcp_rtx_queue_head(sk);
2495	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2496	return true;
2497
2498	return false;
2499	}
2500
/* Debug trace for undo events: with FASTRETRANS_DEBUG > 1, prints @msg
 * together with the peer address/port and the cwnd, left_out, ssthresh,
 * prior_ssthresh and packets_out values. Compiles to an empty function
 * otherwise.
 */
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);

	if (sk->sk_family == AF_INET) {
		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &inet->inet_daddr, ntohs(inet->inet_dport),
			 tcp_snd_cwnd(tp), tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (sk->sk_family == AF_INET6) {
		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
			 tcp_snd_cwnd(tp), tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#endif
#endif
}
2527
2528	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2529	{
2530	struct tcp_sock *tp = tcp_sk(sk);
2531
2532	if (unmark_loss) {
2533	struct sk_buff *skb;
2534
2535	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2536	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2537	}
2538	tp->lost_out = `0`;
2539	tcp_clear_all_retrans_hints(tp);
2540	}
2541
2542	if (tp->prior_ssthresh) {
2543	const struct inet_connection_sock *icsk = inet_csk(sk);
2544
2545	tcp_snd_cwnd_set(tp, val: icsk->icsk_ca_ops->undo_cwnd(sk));
2546
2547	if (tp->prior_ssthresh > tp->snd_ssthresh) {
2548	tp->snd_ssthresh = tp->prior_ssthresh;
2549	tcp_ecn_withdraw_cwr(tp);
2550	}
2551	}
2552	tp->snd_cwnd_stamp = tcp_jiffies32;
2553	tp->undo_marker = `0`;
2554	tp->rack.advanced = `1`; / Force RACK to re-exam losses /
2555	}
2556
2557	static inline bool tcp_may_undo(const struct tcp_sock *tp)
2558	{
2559	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
2560	}
2561
2562	static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2563	{
2564	struct tcp_sock *tp = tcp_sk(sk);
2565
2566	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2567	/ Hold old state until something above high_seq*
2568	* is ACKed. For Reno it is MUST to prevent false
2569	* fast retransmits (RFC2582). SACK TCP is safe. */
2570	if (!tcp_any_retrans_done(sk))
2571	tp->retrans_stamp = `0`;
2572	return true;
2573	}
2574	return false;
2575	}
2576
2577	/ People celebrate: "We love our President!" /
2578	static bool tcp_try_undo_recovery(struct sock *sk)
2579	{
2580	struct tcp_sock *tp = tcp_sk(sk);
2581
2582	if (tcp_may_undo(tp)) {
2583	int mib_idx;
2584
2585	/ Happy end! We did not retransmit anything*
2586	* or our original transmission succeeded.
2587	*/
2588	DBGUNDO(sk, msg: inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2589	tcp_undo_cwnd_reduction(sk, unmark_loss: false);
2590	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2591	mib_idx = LINUX_MIB_TCPLOSSUNDO;
2592	else
2593	mib_idx = LINUX_MIB_TCPFULLUNDO;
2594
2595	NET_INC_STATS(sock_net(sk), mib_idx);
2596	} else if (tp->rack.reo_wnd_persist) {
2597	tp->rack.reo_wnd_persist--;
2598	}
2599	if (tcp_is_non_sack_preventing_reopen(sk))
2600	return true;
2601	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
2602	tp->is_sack_reneg = `0`;
2603	return false;
2604	}
2605
2606	/ Try to undo cwnd reduction, because D-SACKs acked all retransmitted data /
2607	static bool tcp_try_undo_dsack(struct sock *sk)
2608	{
2609	struct tcp_sock *tp = tcp_sk(sk);
2610
2611	if (tp->undo_marker && !tp->undo_retrans) {
2612	tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
2613	tp->rack.reo_wnd_persist + `1`);
2614	DBGUNDO(sk, msg: "D-SACK");
2615	tcp_undo_cwnd_reduction(sk, unmark_loss: false);
2616	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2617	return true;
2618	}
2619	return false;
2620	}
2621
2622	/ Undo during loss recovery after partial ACK or using F-RTO. /
2623	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2624	{
2625	struct tcp_sock *tp = tcp_sk(sk);
2626
2627	if (frto_undo \|\| tcp_may_undo(tp)) {
2628	tcp_undo_cwnd_reduction(sk, unmark_loss: true);
2629
2630	DBGUNDO(sk, msg: "partial loss");
2631	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2632	if (frto_undo)
2633	NET_INC_STATS(sock_net(sk),
2634	LINUX_MIB_TCPSPURIOUSRTOS);
2635	inet_csk(sk)->icsk_retransmits = `0`;
2636	if (tcp_is_non_sack_preventing_reopen(sk))
2637	return true;
2638	if (frto_undo \|\| tcp_is_sack(tp)) {
2639	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
2640	tp->is_sack_reneg = `0`;
2641	}
2642	return true;
2643	}
2644	return false;
2645	}
2646
2647	/ The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.*
2648	* It computes the number of packets to send (sndcnt) based on packets newly
2649	* delivered:
2650	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
2651	* cwnd reductions across a full RTT.
2652	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
2653	* But when SND_UNA is acked without further losses,
2654	* slow starts cwnd up to ssthresh to speed up the recovery.
2655	*/
2656	static void tcp_init_cwnd_reduction(struct sock *sk)
2657	{
2658	struct tcp_sock *tp = tcp_sk(sk);
2659
2660	tp->high_seq = tp->snd_nxt;
2661	tp->tlp_high_seq = `0`;
2662	tp->snd_cwnd_cnt = `0`;
2663	tp->prior_cwnd = tcp_snd_cwnd(tp);
2664	tp->prr_delivered = `0`;
2665	tp->prr_out = `0`;
2666	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2667	tcp_ecn_queue_cwr(tp);
2668	}
2669
2670	void tcp_cwnd_reduction(struct sock sk, int* newly_acked_sacked, int newly_lost, int flag)
2671	{
2672	struct tcp_sock *tp = tcp_sk(sk);
2673	int sndcnt = `0`;
2674	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2675
2676	if (newly_acked_sacked <= `0` \|\| WARN_ON_ONCE(!tp->prior_cwnd))
2677	return;
2678
2679	tp->prr_delivered += newly_acked_sacked;
2680	if (delta < `0`) {
2681	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2682	tp->prior_cwnd - `1`;
2683	sndcnt = div_u64(dividend, divisor: tp->prior_cwnd) - tp->prr_out;
2684	} else {
2685	sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
2686	newly_acked_sacked);
2687	if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
2688	sndcnt++;
2689	sndcnt = min(delta, sndcnt);
2690	}
2691	/ Force a fast retransmit upon entering fast recovery /
2692	sndcnt = max(sndcnt, (tp->prr_out ? `0` : `1`));
2693	tcp_snd_cwnd_set(tp, val: tcp_packets_in_flight(tp) + sndcnt);
2694	}
2695
2696	static inline void tcp_end_cwnd_reduction(struct sock *sk)
2697	{
2698	struct tcp_sock *tp = tcp_sk(sk);
2699
2700	if (inet_csk(sk)->icsk_ca_ops->cong_control)
2701	return;
2702
2703	/ Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) /
2704	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
2705	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
2706	tcp_snd_cwnd_set(tp, val: tp->snd_ssthresh);
2707	tp->snd_cwnd_stamp = tcp_jiffies32;
2708	}
2709	tcp_ca_event(sk, event: CA_EVENT_COMPLETE_CWR);
2710	}
2711
2712	/ Enter CWR state. Disable cwnd undo since congestion is proven with ECN /
2713	void tcp_enter_cwr(struct sock *sk)
2714	{
2715	struct tcp_sock *tp = tcp_sk(sk);
2716
2717	tp->prior_ssthresh = `0`;
2718	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2719	tp->undo_marker = `0`;
2720	tcp_init_cwnd_reduction(sk);
2721	tcp_set_ca_state(sk, ca_state: TCP_CA_CWR);
2722	}
2723	}
2724	EXPORT_SYMBOL(tcp_enter_cwr);
2725
2726	static void tcp_try_keep_open(struct sock *sk)
2727	{
2728	struct tcp_sock *tp = tcp_sk(sk);
2729	int state = TCP_CA_Open;
2730
2731	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
2732	state = TCP_CA_Disorder;
2733
2734	if (inet_csk(sk)->icsk_ca_state != state) {
2735	tcp_set_ca_state(sk, ca_state: state);
2736	tp->high_seq = tp->snd_nxt;
2737	}
2738	}
2739
2740	static void tcp_try_to_open(struct sock sk, int* flag)
2741	{
2742	struct tcp_sock *tp = tcp_sk(sk);
2743
2744	tcp_verify_left_out(tp);
2745
2746	if (!tcp_any_retrans_done(sk))
2747	tp->retrans_stamp = `0`;
2748
2749	if (flag & FLAG_ECE)
2750	tcp_enter_cwr(sk);
2751
2752	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2753	tcp_try_keep_open(sk);
2754	}
2755	}
2756
2757	static void tcp_mtup_probe_failed(struct sock *sk)
2758	{
2759	struct inet_connection_sock *icsk = inet_csk(sk);
2760
2761	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - `1`;
2762	icsk->icsk_mtup.probe_size = `0`;
2763	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
2764	}
2765
2766	static void tcp_mtup_probe_success(struct sock *sk)
2767	{
2768	struct tcp_sock *tp = tcp_sk(sk);
2769	struct inet_connection_sock *icsk = inet_csk(sk);
2770	u64 val;
2771
2772	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2773
2774	val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, mss: tp->mss_cache);
2775	do_div(val, icsk->icsk_mtup.probe_size);
2776	DEBUG_NET_WARN_ON_ONCE((u32)val != val);
2777	tcp_snd_cwnd_set(tp, max_t(u32, `1U`, val));
2778
2779	tp->snd_cwnd_cnt = `0`;
2780	tp->snd_cwnd_stamp = tcp_jiffies32;
2781	tp->snd_ssthresh = tcp_current_ssthresh(sk);
2782
2783	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2784	icsk->icsk_mtup.probe_size = `0`;
2785	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
2786	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
2787	}
2788
2789	/ Do a simple retransmit without using the backoff mechanisms in*
2790	* tcp_timer. This is used for path mtu discovery.
2791	* The socket is already locked here.
2792	*/
2793	void tcp_simple_retransmit(struct sock *sk)
2794	{
2795	const struct inet_connection_sock *icsk = inet_csk(sk);
2796	struct tcp_sock *tp = tcp_sk(sk);
2797	struct sk_buff *skb;
2798	int mss;
2799
2800	/ A fastopen SYN request is stored as two separate packets within*
2801	* the retransmit queue, this is done by tcp_send_syn_data().
2802	* As a result simply checking the MSS of the frames in the queue
2803	* will not work for the SYN packet.
2804	*
2805	* Us being here is an indication of a path MTU issue so we can
2806	* assume that the fastopen SYN was lost and just mark all the
2807	* frames in the retransmit queue as lost. We will use an MSS of
2808	* -1 to mark all frames as lost, otherwise compute the current MSS.
2809	*/
2810	if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
2811	mss = -`1`;
2812	else
2813	mss = tcp_current_mss(sk);
2814
2815	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2816	if (tcp_skb_seglen(skb) > mss)
2817	tcp_mark_skb_lost(sk, skb);
2818	}
2819
2820	tcp_clear_retrans_hints_partial(tp);
2821
2822	if (!tp->lost_out)
2823	return;
2824
2825	if (tcp_is_reno(tp))
2826	tcp_limit_reno_sacked(tp);
2827
2828	tcp_verify_left_out(tp);
2829
2830	/ Don't muck with the congestion window here.*
2831	* Reason is that we do not increase amount of _data_
2832	* in network, but units changed and effective
2833	* cwnd/ssthresh really reduced now.
2834	*/
2835	if (icsk->icsk_ca_state != TCP_CA_Loss) {
2836	tp->high_seq = tp->snd_nxt;
2837	tp->snd_ssthresh = tcp_current_ssthresh(sk);
2838	tp->prior_ssthresh = `0`;
2839	tp->undo_marker = `0`;
2840	tcp_set_ca_state(sk, ca_state: TCP_CA_Loss);
2841	}
2842	tcp_xmit_retransmit_queue(sk);
2843	}
2844	EXPORT_SYMBOL(tcp_simple_retransmit);
2845
2846	void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2847	{
2848	struct tcp_sock *tp = tcp_sk(sk);
2849	int mib_idx;
2850
2851	if (tcp_is_reno(tp))
2852	mib_idx = LINUX_MIB_TCPRENORECOVERY;
2853	else
2854	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2855
2856	NET_INC_STATS(sock_net(sk), mib_idx);
2857
2858	tp->prior_ssthresh = `0`;
2859	tcp_init_undo(tp);
2860
2861	if (!tcp_in_cwnd_reduction(sk)) {
2862	if (!ece_ack)
2863	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2864	tcp_init_cwnd_reduction(sk);
2865	}
2866	tcp_set_ca_state(sk, ca_state: TCP_CA_Recovery);
2867	}
2868
2869	static void tcp_update_rto_time(struct tcp_sock *tp)
2870	{
2871	if (tp->rto_stamp) {
2872	tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
2873	tp->rto_stamp = `0`;
2874	}
2875	}
2876
2877	/ Process an ACK in CA_Loss state. Move to CA_Open if lost data are*
2878	* recovered or spurious. Otherwise retransmits more on partial ACKs.
2879	*/
2880	static void tcp_process_loss(struct sock sk, int* flag, int num_dupack,
2881	int *rexmit)
2882	{
2883	struct tcp_sock *tp = tcp_sk(sk);
2884	bool recovered = !before(seq1: tp->snd_una, seq2: tp->high_seq);
2885
2886	if ((flag & FLAG_SND_UNA_ADVANCED \|\| rcu_access_pointer(tp->fastopen_rsk)) &&
2887	tcp_try_undo_loss(sk, frto_undo: false))
2888	return;
2889
2890	if (tp->frto) { / F-RTO RFC5682 sec 3.1 (sack enhanced version). /
2891	/ Step 3.b. A timeout is spurious if not all data are*
2892	* lost, i.e., never-retransmitted data are (s)acked.
2893	*/
2894	if ((flag & FLAG_ORIG_SACK_ACKED) &&
2895	tcp_try_undo_loss(sk, frto_undo: true))
2896	return;
2897
2898	if (after(tp->snd_nxt, tp->high_seq)) {
2899	if (flag & FLAG_DATA_SACKED \|\| num_dupack)
2900	tp->frto = `0`; / Step 3.a. loss was real /
2901	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2902	tp->high_seq = tp->snd_nxt;
2903	/ Step 2.b. Try send new data (but deferred until cwnd*
2904	* is updated in tcp_ack()). Otherwise fall back to
2905	* the conventional recovery.
2906	*/
2907	if (!tcp_write_queue_empty(sk) &&
2908	after(tcp_wnd_end(tp), tp->snd_nxt)) {
2909	*rexmit = REXMIT_NEW;
2910	return;
2911	}
2912	tp->frto = `0`;
2913	}
2914	}
2915
2916	if (recovered) {
2917	/ F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a /
2918	tcp_try_undo_recovery(sk);
2919	return;
2920	}
2921	if (tcp_is_reno(tp)) {
2922	/ A Reno DUPACK means new data in F-RTO step 2.b above are*
2923	* delivered. Lower inflight to clock out (re)transmissions.
2924	*/
2925	if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2926	tcp_add_reno_sack(sk, num_dupack, ece_ack: flag & FLAG_ECE);
2927	else if (flag & FLAG_SND_UNA_ADVANCED)
2928	tcp_reset_reno_sack(tp);
2929	}
2930	*rexmit = REXMIT_LOST;
2931	}
2932
2933	static bool tcp_force_fast_retransmit(struct sock *sk)
2934	{
2935	struct tcp_sock *tp = tcp_sk(sk);
2936
2937	return after(tcp_highest_sack_seq(tp),
2938	tp->snd_una + tp->reordering * tp->mss_cache);
2939	}
2940
2941	/ Undo during fast recovery after partial ACK. /
2942	static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
2943	bool *do_lost)
2944	{
2945	struct tcp_sock *tp = tcp_sk(sk);
2946
2947	if (tp->undo_marker && tcp_packet_delayed(tp)) {
2948	/ Plain luck! Hole if filled with delayed*
2949	* packet, rather than with a retransmit. Check reordering.
2950	*/
2951	tcp_check_sack_reordering(sk, low_seq: prior_snd_una, ts: `1`);
2952
2953	/ We are getting evidence that the reordering degree is higher*
2954	* than we realized. If there are no retransmits out then we
2955	* can undo. Otherwise we clock out new packets but do not
2956	* mark more packets lost or retransmit more.
2957	*/
2958	if (tp->retrans_out)
2959	return true;
2960
2961	if (!tcp_any_retrans_done(sk))
2962	tp->retrans_stamp = `0`;
2963
2964	DBGUNDO(sk, msg: "partial recovery");
2965	tcp_undo_cwnd_reduction(sk, unmark_loss: true);
2966	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
2967	tcp_try_keep_open(sk);
2968	} else {
2969	/ Partial ACK arrived. Force fast retransmit. /
2970	*do_lost = tcp_force_fast_retransmit(sk);
2971	}
2972	return false;
2973	}
2974
2975	static void tcp_identify_packet_loss(struct sock sk, int* *ack_flag)
2976	{
2977	struct tcp_sock *tp = tcp_sk(sk);
2978
2979	if (tcp_rtx_queue_empty(sk))
2980	return;
2981
2982	if (unlikely(tcp_is_reno(tp))) {
2983	tcp_newreno_mark_lost(sk, snd_una_advanced: *ack_flag & FLAG_SND_UNA_ADVANCED);
2984	} else if (tcp_is_rack(sk)) {
2985	u32 prior_retrans = tp->retrans_out;
2986
2987	if (tcp_rack_mark_lost(sk))
2988	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
2989	if (prior_retrans > tp->retrans_out)
2990	*ack_flag \|= FLAG_LOST_RETRANS;
2991	}
2992	}
2993
2994	/ Process an event, which can update packets-in-flight not trivially.*
2995	* Main goal of this function is to calculate new estimate for left_out,
2996	* taking into account both packets sitting in receiver's buffer and
2997	* packets lost by network.
2998	*
2999	* Besides that it updates the congestion state when packet loss or ECN
3000	* is detected. But it does not reduce the cwnd, it is done by the
3001	* congestion control later.
3002	*
3003	* It does _not_ decide what to send, it is made in function
3004	* tcp_xmit_retransmit_queue().
3005	*/
3006	static void tcp_fastretrans_alert(struct sock sk, const* u32 prior_snd_una,
3007	int num_dupack, int ack_flag, int* *rexmit)
3008	{
3009	struct inet_connection_sock *icsk = inet_csk(sk);
3010	struct tcp_sock *tp = tcp_sk(sk);
3011	int fast_rexmit = `0`, flag = *ack_flag;
3012	bool ece_ack = flag & FLAG_ECE;
3013	bool do_lost = num_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
3014	tcp_force_fast_retransmit(sk));
3015
3016	if (!tp->packets_out && tp->sacked_out)
3017	tp->sacked_out = `0`;
3018
3019	/ Now state machine starts.*
3020	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
3021	if (ece_ack)
3022	tp->prior_ssthresh = `0`;
3023
3024	/ B. In all the states check for reneging SACKs. /
3025	if (tcp_check_sack_reneging(sk, ack_flag))
3026	return;
3027
3028	/ C. Check consistency of the current state. /
3029	tcp_verify_left_out(tp);
3030
3031	/ D. Check state exit conditions. State can be terminated*
3032	* when high_seq is ACKed. */
3033	if (icsk->icsk_ca_state == TCP_CA_Open) {
3034	WARN_ON(tp->retrans_out != `0` && !tp->syn_data);
3035	tp->retrans_stamp = `0`;
3036	} else if (!before(seq1: tp->snd_una, seq2: tp->high_seq)) {
3037	switch (icsk->icsk_ca_state) {
3038	case TCP_CA_CWR:
3039	/ CWR is to be held something above high_seq*
3040	* is ACKed for CWR bit to reach receiver. */
3041	if (tp->snd_una != tp->high_seq) {
3042	tcp_end_cwnd_reduction(sk);
3043	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
3044	}
3045	break;
3046
3047	case TCP_CA_Recovery:
3048	if (tcp_is_reno(tp))
3049	tcp_reset_reno_sack(tp);
3050	if (tcp_try_undo_recovery(sk))
3051	return;
3052	tcp_end_cwnd_reduction(sk);
3053	break;
3054	}
3055	}
3056
3057	/ E. Process state. /
3058	switch (icsk->icsk_ca_state) {
3059	case TCP_CA_Recovery:
3060	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
3061	if (tcp_is_reno(tp))
3062	tcp_add_reno_sack(sk, num_dupack, ece_ack);
3063	} else if (tcp_try_undo_partial(sk, prior_snd_una, do_lost: &do_lost))
3064	return;
3065
3066	if (tcp_try_undo_dsack(sk))
3067	tcp_try_keep_open(sk);
3068
3069	tcp_identify_packet_loss(sk, ack_flag);
3070	if (icsk->icsk_ca_state != TCP_CA_Recovery) {
3071	if (!tcp_time_to_recover(sk, flag))
3072	return;
3073	/ Undo reverts the recovery state. If loss is evident,*
3074	* starts a new recovery (e.g. reordering then loss);
3075	*/
3076	tcp_enter_recovery(sk, ece_ack);
3077	}
3078	break;
3079	case TCP_CA_Loss:
3080	tcp_process_loss(sk, flag, num_dupack, rexmit);
3081	if (icsk->icsk_ca_state != TCP_CA_Loss)
3082	tcp_update_rto_time(tp);
3083	tcp_identify_packet_loss(sk, ack_flag);
3084	if (!(icsk->icsk_ca_state == TCP_CA_Open \|\|
3085	(*ack_flag & FLAG_LOST_RETRANS)))
3086	return;
3087	/ Change state if cwnd is undone or retransmits are lost /
3088	fallthrough;
3089	default:
3090	if (tcp_is_reno(tp)) {
3091	if (flag & FLAG_SND_UNA_ADVANCED)
3092	tcp_reset_reno_sack(tp);
3093	tcp_add_reno_sack(sk, num_dupack, ece_ack);
3094	}
3095
3096	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3097	tcp_try_undo_dsack(sk);
3098
3099	tcp_identify_packet_loss(sk, ack_flag);
3100	if (!tcp_time_to_recover(sk, flag)) {
3101	tcp_try_to_open(sk, flag);
3102	return;
3103	}
3104
3105	/ MTU probe failure: don't reduce cwnd /
3106	if (icsk->icsk_ca_state < TCP_CA_CWR &&
3107	icsk->icsk_mtup.probe_size &&
3108	tp->snd_una == tp->mtu_probe.probe_seq_start) {
3109	tcp_mtup_probe_failed(sk);
3110	/ Restores the reduction we did in tcp_mtup_probe() /
3111	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) + `1`);
3112	tcp_simple_retransmit(sk);
3113	return;
3114	}
3115
3116	/ Otherwise enter Recovery state /
3117	tcp_enter_recovery(sk, ece_ack);
3118	fast_rexmit = `1`;
3119	}
3120
3121	if (!tcp_is_rack(sk) && do_lost)
3122	tcp_update_scoreboard(sk, fast_rexmit);
3123	*rexmit = REXMIT_LOST;
3124	}
3125
3126	static void tcp_update_rtt_min(struct sock sk, u32 rtt_us, const* int flag)
3127	{
3128	u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
3129	struct tcp_sock *tp = tcp_sk(sk);
3130
3131	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
3132	/ If the remote keeps returning delayed ACKs, eventually*
3133	* the min filter would pick it up and overestimate the
3134	* prop. delay when it expires. Skip suspected delayed ACKs.
3135	*/
3136	return;
3137	}
3138	minmax_running_min(m: &tp->rtt_min, win: wlen, tcp_jiffies32,
3139	meas: rtt_us ? : jiffies_to_usecs(j: `1`));
3140	}
3141
3142	static bool tcp_ack_update_rtt(struct sock sk, const* int flag,
3143	long seq_rtt_us, long sack_rtt_us,
3144	long ca_rtt_us, struct rate_sample *rs)
3145	{
3146	const struct tcp_sock *tp = tcp_sk(sk);
3147
3148	/ Prefer RTT measured from ACK's timing to TS-ECR. This is because*
3149	* broken middle-boxes or peers may corrupt TS-ECR fields. But
3150	* Karn's algorithm forbids taking RTT if some retransmitted data
3151	* is acked (RFC6298).
3152	*/
3153	if (seq_rtt_us < `0`)
3154	seq_rtt_us = sack_rtt_us;
3155
3156	/ RTTM Rule: A TSecr value received in a segment is used to*
3157	* update the averaged RTT measurement only if the segment
3158	* acknowledges some new data, i.e., only if it advances the
3159	* left edge of the send window.
3160	* See draft-ietf-tcplw-high-performance-00, section 3.3.
3161	*/
3162	if (seq_rtt_us < `0` && tp->rx_opt.saw_tstamp &&
3163	tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
3164	seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
3165
3166	rs->rtt_us = ca_rtt_us; / RTT of last (S)ACKed packet (or -1) /
3167	if (seq_rtt_us < `0`)
3168	return false;
3169
3170	/ ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is*
3171	* always taken together with ACK, SACK, or TS-opts. Any negative
3172	* values will be skipped with the seq_rtt_us < 0 check above.
3173	*/
3174	tcp_update_rtt_min(sk, rtt_us: ca_rtt_us, flag);
3175	tcp_rtt_estimator(sk, mrtt_us: seq_rtt_us);
3176	tcp_set_rto(sk);
3177
3178	/ RFC6298: only reset backoff on valid RTT measurement. /
3179	inet_csk(sk)->icsk_backoff = `0`;
3180	return true;
3181	}
3182
3183	/ Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. /
3184	void tcp_synack_rtt_meas(struct sock sk, struct* request_sock *req)
3185	{
3186	struct rate_sample rs;
3187	long rtt_us = -`1L`;
3188
3189	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
3190	rtt_us = tcp_stamp_us_delta(t1: tcp_clock_us(), t0: tcp_rsk(req)->snt_synack);
3191
3192	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us: rtt_us, sack_rtt_us: -`1L`, ca_rtt_us: rtt_us, rs: &rs);
3193	}
3194
3195
3196	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
3197	{
3198	const struct inet_connection_sock *icsk = inet_csk(sk);
3199
3200	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
3201	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
3202	}
3203
3204	/ Restart timer after forward progress on connection.*
3205	* RFC2988 recommends to restart timer to now+rto.
3206	*/
3207	void tcp_rearm_rto(struct sock *sk)
3208	{
3209	const struct inet_connection_sock *icsk = inet_csk(sk);
3210	struct tcp_sock *tp = tcp_sk(sk);
3211
3212	/ If the retrans timer is currently being used by Fast Open*
3213	* for SYN-ACK retrans purpose, stay put.
3214	*/
3215	if (rcu_access_pointer(tp->fastopen_rsk))
3216	return;
3217
3218	if (!tp->packets_out) {
3219	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3220	} else {
3221	u32 rto = inet_csk(sk)->icsk_rto;
3222	/ Offset the time elapsed after installing regular RTO /
3223	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
3224	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3225	s64 delta_us = tcp_rto_delta_us(sk);
3226	/ delta_us may not be positive if the socket is locked*
3227	* when the retrans timer fires and is rescheduled.
3228	*/
3229	rto = usecs_to_jiffies(max_t(int, delta_us, `1`));
3230	}
3231	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: rto,
3232	TCP_RTO_MAX);
3233	}
3234	}
3235
3236	/ Try to schedule a loss probe; if that doesn't work, then schedule an RTO. /
3237	static void tcp_set_xmit_timer(struct sock *sk)
3238	{
3239	if (!tcp_schedule_loss_probe(sk, advancing_rto: true))
3240	tcp_rearm_rto(sk);
3241	}
3242
3243	/ If we get here, the whole TSO packet has not been acked. /
3244	static u32 tcp_tso_acked(struct sock sk, struct* sk_buff *skb)
3245	{
3246	struct tcp_sock *tp = tcp_sk(sk);
3247	u32 packets_acked;
3248
3249	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3250
3251	packets_acked = tcp_skb_pcount(skb);
3252	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3253	return `0`;
3254	packets_acked -= tcp_skb_pcount(skb);
3255
3256	if (packets_acked) {
3257	BUG_ON(tcp_skb_pcount(skb) == `0`);
3258	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3259	}
3260
3261	return packets_acked;
3262	}
3263
3264	static void tcp_ack_tstamp(struct sock sk, struct* sk_buff *skb,
3265	const struct sk_buff *ack_skb, u32 prior_snd_una)
3266	{
3267	const struct skb_shared_info *shinfo;
3268
3269	/ Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags /
3270	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
3271	return;
3272
3273	shinfo = skb_shinfo(skb);
3274	if (!before(seq1: shinfo->tskey, seq2: prior_snd_una) &&
3275	before(seq1: shinfo->tskey, tcp_sk(sk)->snd_una)) {
3276	tcp_skb_tsorted_save(skb) {
3277	__skb_tstamp_tx(orig_skb: skb, ack_skb, NULL, sk, tstype: SCM_TSTAMP_ACK);
3278	} tcp_skb_tsorted_restore(skb);
3279	}
3280	}
3281
3282	/ Remove acknowledged frames from the retransmission queue. If our packet*
3283	* is before the ack sequence we can discard it as it's confirmed to have
3284	* arrived at the other end.
3285	*/
3286	static int tcp_clean_rtx_queue(struct sock sk, const* struct sk_buff *ack_skb,
3287	u32 prior_fack, u32 prior_snd_una,
3288	struct tcp_sacktag_state *sack, bool ece_ack)
3289	{
3290	const struct inet_connection_sock *icsk = inet_csk(sk);
3291	u64 first_ackt, last_ackt;
3292	struct tcp_sock *tp = tcp_sk(sk);
3293	u32 prior_sacked = tp->sacked_out;
3294	u32 reord = tp->snd_nxt; / lowest acked un-retx un-sacked seq /
3295	struct sk_buff skb, next;
3296	bool fully_acked = true;
3297	long sack_rtt_us = -`1L`;
3298	long seq_rtt_us = -`1L`;
3299	long ca_rtt_us = -`1L`;
3300	u32 pkts_acked = `0`;
3301	bool rtt_update;
3302	int flag = `0`;
3303
3304	first_ackt = `0`;
3305
3306	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
3307	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3308	const u32 start_seq = scb->seq;
3309	u8 sacked = scb->sacked;
3310	u32 acked_pcount;
3311
3312	/ Determine how many packets and what bytes were acked, tso and else /
3313	if (after(scb->end_seq, tp->snd_una)) {
3314	if (tcp_skb_pcount(skb) == `1` \|\|
3315	!after(tp->snd_una, scb->seq))
3316	break;
3317
3318	acked_pcount = tcp_tso_acked(sk, skb);
3319	if (!acked_pcount)
3320	break;
3321	fully_acked = false;
3322	} else {
3323	acked_pcount = tcp_skb_pcount(skb);
3324	}
3325
3326	if (unlikely(sacked & TCPCB_RETRANS)) {
3327	if (sacked & TCPCB_SACKED_RETRANS)
3328	tp->retrans_out -= acked_pcount;
3329	flag \|= FLAG_RETRANS_DATA_ACKED;
3330	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
3331	last_ackt = tcp_skb_timestamp_us(skb);
3332	WARN_ON_ONCE(last_ackt == `0`);
3333	if (!first_ackt)
3334	first_ackt = last_ackt;
3335
3336	if (before(seq1: start_seq, seq2: reord))
3337	reord = start_seq;
3338	if (!after(scb->end_seq, tp->high_seq))
3339	flag \|= FLAG_ORIG_SACK_ACKED;
3340	}
3341
3342	if (sacked & TCPCB_SACKED_ACKED) {
3343	tp->sacked_out -= acked_pcount;
3344	} else if (tcp_is_sack(tp)) {
3345	tcp_count_delivered(tp, delivered: acked_pcount, ece_ack);
3346	if (!tcp_skb_spurious_retrans(tp, skb))
3347	tcp_rack_advance(tp, sacked, end_seq: scb->end_seq,
3348	xmit_time: tcp_skb_timestamp_us(skb));
3349	}
3350	if (sacked & TCPCB_LOST)
3351	tp->lost_out -= acked_pcount;
3352
3353	tp->packets_out -= acked_pcount;
3354	pkts_acked += acked_pcount;
3355	tcp_rate_skb_delivered(sk, skb, rs: sack->rate);
3356
3357	/ Initial outgoing SYN's get put onto the write_queue*
3358	* just like anything else we transmit. It is not
3359	* true data, and if we misinform our callers that
3360	* this ACK acks real data, we will erroneously exit
3361	* connection startup slow start one packet too
3362	* quickly. This is severely frowned upon behavior.
3363	*/
3364	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3365	flag \|= FLAG_DATA_ACKED;
3366	} else {
3367	flag \|= FLAG_SYN_ACKED;
3368	tp->retrans_stamp = `0`;
3369	}
3370
3371	if (!fully_acked)
3372	break;
3373
3374	tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3375
3376	next = skb_rb_next(skb);
3377	if (unlikely(skb == tp->retransmit_skb_hint))
3378	tp->retransmit_skb_hint = NULL;
3379	if (unlikely(skb == tp->lost_skb_hint))
3380	tp->lost_skb_hint = NULL;
3381	tcp_highest_sack_replace(sk, old: skb, new: next);
3382	tcp_rtx_queue_unlink_and_free(skb, sk);
3383	}
3384
3385	if (!skb)
3386	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
3387
3388	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3389	tp->snd_up = tp->snd_una;
3390
3391	if (skb) {
3392	tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3393	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3394	flag \|= FLAG_SACK_RENEGING;
3395	}
3396
3397	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3398	seq_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: first_ackt);
3399	ca_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: last_ackt);
3400
3401	if (pkts_acked == `1` && fully_acked && !prior_sacked &&
3402	(tp->snd_una - prior_snd_una) < tp->mss_cache &&
3403	sack->rate->prior_delivered + `1` == tp->delivered &&
3404	!(flag & (FLAG_CA_ALERT \| FLAG_SYN_ACKED))) {
3405	/ Conservatively mark a delayed ACK. It's typically*
3406	* from a lone runt packet over the round trip to
3407	* a receiver w/o out-of-order or CE events.
3408	*/
3409	flag \|= FLAG_ACK_MAYBE_DELAYED;
3410	}
3411	}
3412	if (sack->first_sackt) {
3413	sack_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: sack->first_sackt);
3414	ca_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: sack->last_sackt);
3415	}
3416	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3417	ca_rtt_us, rs: sack->rate);
3418
3419	if (flag & FLAG_ACKED) {
3420	flag \|= FLAG_SET_XMIT_TIMER; / set TLP or RTO timer /
3421	if (unlikely(icsk->icsk_mtup.probe_size &&
3422	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3423	tcp_mtup_probe_success(sk);
3424	}
3425
3426	if (tcp_is_reno(tp)) {
3427	tcp_remove_reno_sacks(sk, acked: pkts_acked, ece_ack);
3428
3429	/ If any of the cumulatively ACKed segments was*
3430	* retransmitted, non-SACK case cannot confirm that
3431	* progress was due to original transmission due to
3432	* lack of TCPCB_SACKED_ACKED bits even if some of
3433	* the packets may have been never retransmitted.
3434	*/
3435	if (flag & FLAG_RETRANS_DATA_ACKED)
3436	flag &= ~FLAG_ORIG_SACK_ACKED;
3437	} else {
3438	int delta;
3439
3440	/ Non-retransmitted hole got filled? That's reordering /
3441	if (before(seq1: reord, seq2: prior_fack))
3442	tcp_check_sack_reordering(sk, low_seq: reord, ts: `0`);
3443
3444	delta = prior_sacked - tp->sacked_out;
3445	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3446	}
3447	} else if (skb && rtt_update && sack_rtt_us >= `0` &&
3448	sack_rtt_us > tcp_stamp_us_delta(t1: tp->tcp_mstamp,
3449	t0: tcp_skb_timestamp_us(skb))) {
3450	/ Do not re-arm RTO if the sack RTT is measured from data sent*
3451	* after when the head was last (re)transmitted. Otherwise the
3452	* timeout may continue to extend in loss recovery.
3453	*/
3454	flag \|= FLAG_SET_XMIT_TIMER; / set TLP or RTO timer /
3455	}
3456
3457	if (icsk->icsk_ca_ops->pkts_acked) {
3458	struct ack_sample sample = { .pkts_acked = pkts_acked,
3459	.rtt_us = sack->rate->rtt_us };
3460
3461	sample.in_flight = tp->mss_cache *
3462	(tp->delivered - sack->rate->prior_delivered);
3463	icsk->icsk_ca_ops->pkts_acked(sk, &sample);
3464	}
3465
3466	#if FASTRETRANS_DEBUG > 0
3467	WARN_ON((int)tp->sacked_out < `0`);
3468	WARN_ON((int)tp->lost_out < `0`);
3469	WARN_ON((int)tp->retrans_out < `0`);
3470	if (!tp->packets_out && tcp_is_sack(tp)) {
3471	icsk = inet_csk(sk);
3472	if (tp->lost_out) {
3473	pr_debug("Leak l=%u %d\n",
3474	tp->lost_out, icsk->icsk_ca_state);
3475	tp->lost_out = `0`;
3476	}
3477	if (tp->sacked_out) {
3478	pr_debug("Leak s=%u %d\n",
3479	tp->sacked_out, icsk->icsk_ca_state);
3480	tp->sacked_out = `0`;
3481	}
3482	if (tp->retrans_out) {
3483	pr_debug("Leak r=%u %d\n",
3484	tp->retrans_out, icsk->icsk_ca_state);
3485	tp->retrans_out = `0`;
3486	}
3487	}
3488	#endif
3489	return flag;
3490	}
3491
3492	static void tcp_ack_probe(struct sock *sk)
3493	{
3494	struct inet_connection_sock *icsk = inet_csk(sk);
3495	struct sk_buff *head = tcp_send_head(sk);
3496	const struct tcp_sock *tp = tcp_sk(sk);
3497
3498	/ Was it a usable window open? /
3499	if (!head)
3500	return;
3501	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
3502	icsk->icsk_backoff = `0`;
3503	icsk->icsk_probes_tstamp = `0`;
3504	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3505	/ Socket must be waked up by subsequent tcp_data_snd_check().*
3506	* This function is not for random using!
3507	*/
3508	} else {
3509	unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
3510
3511	when = tcp_clamp_probe0_to_user_timeout(sk, when);
3512	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
3513	}
3514	}
3515
3516	static inline bool tcp_ack_is_dubious(const struct sock sk, const* int flag)
3517	{
3518	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
3519	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3520	}
3521
3522	/ Decide wheather to run the increase function of congestion control. /
3523	static inline bool tcp_may_raise_cwnd(const struct sock sk, const* int flag)
3524	{
3525	/ If reordering is high then always grow cwnd whenever data is*
3526	* delivered regardless of its ordering. Otherwise stay conservative
3527	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
3528	* new SACK or ECE mark may first advance cwnd here and later reduce
3529	* cwnd in tcp_fastretrans_alert() based on more states.
3530	*/
3531	if (tcp_sk(sk)->reordering >
3532	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
3533	return flag & FLAG_FORWARD_PROGRESS;
3534
3535	return flag & FLAG_DATA_ACKED;
3536	}
3537
3538	/ The "ultimate" congestion control function that aims to replace the rigid*
3539	* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
3540	* It's called toward the end of processing an ACK with precise rate
3541	* information. All transmission or retransmission are delayed afterwards.
3542	*/
3543	static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3544	int flag, const struct rate_sample *rs)
3545	{
3546	const struct inet_connection_sock *icsk = inet_csk(sk);
3547
3548	if (icsk->icsk_ca_ops->cong_control) {
3549	icsk->icsk_ca_ops->cong_control(sk, rs);
3550	return;
3551	}
3552
3553	if (tcp_in_cwnd_reduction(sk)) {
3554	/ Reduce cwnd if state mandates /
3555	tcp_cwnd_reduction(sk, newly_acked_sacked: acked_sacked, newly_lost: rs->losses, flag);
3556	} else if (tcp_may_raise_cwnd(sk, flag)) {
3557	/ Advance cwnd if state allows /
3558	tcp_cong_avoid(sk, ack, acked: acked_sacked);
3559	}
3560	tcp_update_pacing_rate(sk);
3561	}
3562
3563	/ Check that window update is acceptable.*
3564	* The function assumes that snd_una<=ack<=snd_next.
3565	*/
3566	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3567	const u32 ack, const u32 ack_seq,
3568	const u32 nwin)
3569	{
3570	return after(ack, tp->snd_una) \|\|
3571	after(ack_seq, tp->snd_wl1) \|\|
3572	(ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd \|\| !nwin));
3573	}
3574
3575	static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
3576	{
3577	#ifdef CONFIG_TCP_AO
3578	struct tcp_ao_info *ao;
3579
3580	if (!static_branch_unlikely(&tcp_ao_needed.key))
3581	return;
3582
3583	ao = rcu_dereference_protected(tp->ao_info,
3584	lockdep_sock_is_held((struct sock *)tp));
3585	if (ao && ack < tp->snd_una)
3586	ao->snd_sne++;
3587	#endif
3588	}
3589
3590	/ If we update tp->snd_una, also update tp->bytes_acked /
3591	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3592	{
3593	u32 delta = ack - tp->snd_una;
3594
3595	sock_owned_by_me(sk: (struct sock *)tp);
3596	tp->bytes_acked += delta;
3597	tcp_snd_sne_update(tp, ack);
3598	tp->snd_una = ack;
3599	}
3600
3601	static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
3602	{
3603	#ifdef CONFIG_TCP_AO
3604	struct tcp_ao_info *ao;
3605
3606	if (!static_branch_unlikely(&tcp_ao_needed.key))
3607	return;
3608
3609	ao = rcu_dereference_protected(tp->ao_info,
3610	lockdep_sock_is_held((struct sock *)tp));
3611	if (ao && seq < tp->rcv_nxt)
3612	ao->rcv_sne++;
3613	#endif
3614	}
3615
3616	/ If we update tp->rcv_nxt, also update tp->bytes_received /
3617	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3618	{
3619	u32 delta = seq - tp->rcv_nxt;
3620
3621	sock_owned_by_me(sk: (struct sock *)tp);
3622	tp->bytes_received += delta;
3623	tcp_rcv_sne_update(tp, seq);
3624	WRITE_ONCE(tp->rcv_nxt, seq);
3625	}
3626
3627	/ Update our send window.*
3628	*
3629	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
3630	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
3631	*/
3632	static int tcp_ack_update_window(struct sock sk, const* struct sk_buff *skb, u32 ack,
3633	u32 ack_seq)
3634	{
3635	struct tcp_sock *tp = tcp_sk(sk);
3636	int flag = `0`;
3637	u32 nwin = ntohs(tcp_hdr(skb)->window);
3638
3639	if (likely(!tcp_hdr(skb)->syn))
3640	nwin <<= tp->rx_opt.snd_wscale;
3641
3642	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3643	flag \|= FLAG_WIN_UPDATE;
3644	tcp_update_wl(tp, seq: ack_seq);
3645
3646	if (tp->snd_wnd != nwin) {
3647	tp->snd_wnd = nwin;
3648
3649	/ Note, it is the only place, where*
3650	* fast path is recovered for sending TCP.
3651	*/
3652	tp->pred_flags = `0`;
3653	tcp_fast_path_check(sk);
3654
3655	if (!tcp_write_queue_empty(sk))
3656	tcp_slow_start_after_idle_check(sk);
3657
3658	if (nwin > tp->max_window) {
3659	tp->max_window = nwin;
3660	tcp_sync_mss(sk, pmtu: inet_csk(sk)->icsk_pmtu_cookie);
3661	}
3662	}
3663	}
3664
3665	tcp_snd_una_update(tp, ack);
3666
3667	return flag;
3668	}
3669
3670	static bool __tcp_oow_rate_limited(struct net net, int* mib_idx,
3671	u32 *last_oow_ack_time)
3672	{
3673	/ Paired with the WRITE_ONCE() in this function. /
3674	u32 val = READ_ONCE(*last_oow_ack_time);
3675
3676	if (val) {
3677	s32 elapsed = (s32)(tcp_jiffies32 - val);
3678
3679	if (`0` <= elapsed &&
3680	elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
3681	NET_INC_STATS(net, mib_idx);
3682	return true; / rate-limited: don't send yet! /
3683	}
3684	}
3685
3686	/ Paired with the prior READ_ONCE() and with itself,*
3687	* as we might be lockless.
3688	*/
3689	WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
3690
3691	return false; / not rate-limited: go ahead, send dupack now! /
3692	}
3693
3694	/ Return true if we're currently rate-limiting out-of-window ACKs and*
3695	* thus shouldn't send a dupack right now. We rate-limit dupacks in
3696	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
3697	* attacks that send repeated SYNs or ACKs for the same connection. To
3698	* do this, we do not send a duplicate SYNACK or ACK if the remote
3699	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
3700	*/
3701	bool tcp_oow_rate_limited(struct net net, const* struct sk_buff *skb,
3702	int mib_idx, u32 *last_oow_ack_time)
3703	{
3704	/ Data packets without SYNs are not likely part of an ACK loop. /
3705	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3706	!tcp_hdr(skb)->syn)
3707	return false;
3708
3709	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3710	}
3711
3712	/ RFC 5961 7 [ACK Throttling] /
3713	static void tcp_send_challenge_ack(struct sock *sk)
3714	{
3715	struct tcp_sock *tp = tcp_sk(sk);
3716	struct net *net = sock_net(sk);
3717	u32 count, now, ack_limit;
3718
3719	/ First check our per-socket dupack rate limit. /
3720	if (__tcp_oow_rate_limited(net,
3721	mib_idx: LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3722	last_oow_ack_time: &tp->last_oow_ack_time))
3723	return;
3724
3725	ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
3726	if (ack_limit == INT_MAX)
3727	goto send_ack;
3728
3729	/ Then check host-wide RFC 5961 rate limit. /
3730	now = jiffies / HZ;
3731	if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
3732	u32 half = (ack_limit + `1`) >> `1`;
3733
3734	WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
3735	WRITE_ONCE(net->ipv4.tcp_challenge_count,
3736	get_random_u32_inclusive(half, ack_limit + half - `1`));
3737	}
3738	count = READ_ONCE(net->ipv4.tcp_challenge_count);
3739	if (count > `0`) {
3740	WRITE_ONCE(net->ipv4.tcp_challenge_count, count - `1`);
3741	send_ack:
3742	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
3743	tcp_send_ack(sk);
3744	}
3745	}
3746
3747	static void tcp_store_ts_recent(struct tcp_sock *tp)
3748	{
3749	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3750	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
3751	}
3752
3753	static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3754	{
3755	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3756	/ PAWS bug workaround wrt. ACK frames, the PAWS discard*
3757	* extra check below makes sure this can only happen
3758	* for pure ACK frames. -DaveM
3759	*
3760	* Not only, also it occurs for expired timestamps.
3761	*/
3762
3763	if (tcp_paws_check(rx_opt: &tp->rx_opt, paws_win: `0`))
3764	tcp_store_ts_recent(tp);
3765	}
3766	}
3767
3768	/ This routine deals with acks during a TLP episode and ends an episode by*
3769	* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
3770	*/
3771	static void tcp_process_tlp_ack(struct sock sk, u32 ack, int* flag)
3772	{
3773	struct tcp_sock *tp = tcp_sk(sk);
3774
3775	if (before(seq1: ack, seq2: tp->tlp_high_seq))
3776	return;
3777
3778	if (!tp->tlp_retrans) {
3779	/ TLP of new data has been acknowledged /
3780	tp->tlp_high_seq = `0`;
3781	} else if (flag & FLAG_DSACK_TLP) {
3782	/ This DSACK means original and TLP probe arrived; no loss /
3783	tp->tlp_high_seq = `0`;
3784	} else if (after(ack, tp->tlp_high_seq)) {
3785	/ ACK advances: there was a loss, so reduce cwnd. Reset*
3786	* tlp_high_seq in tcp_init_cwnd_reduction()
3787	*/
3788	tcp_init_cwnd_reduction(sk);
3789	tcp_set_ca_state(sk, ca_state: TCP_CA_CWR);
3790	tcp_end_cwnd_reduction(sk);
3791	tcp_try_keep_open(sk);
3792	NET_INC_STATS(sock_net(sk),
3793	LINUX_MIB_TCPLOSSPROBERECOVERY);
3794	} else if (!(flag & (FLAG_SND_UNA_ADVANCED \|
3795	FLAG_NOT_DUP \| FLAG_DATA_SACKED))) {
3796	/ Pure dupack: original and TLP probe arrived; no loss /
3797	tp->tlp_high_seq = `0`;
3798	}
3799	}
3800
3801	static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
3802	{
3803	const struct inet_connection_sock *icsk = inet_csk(sk);
3804
3805	if (icsk->icsk_ca_ops->in_ack_event)
3806	icsk->icsk_ca_ops->in_ack_event(sk, flags);
3807	}
3808
3809	/ Congestion control has updated the cwnd already. So if we're in*
3810	* loss recovery then now we do any new sends (for FRTO) or
3811	* retransmits (for CA_Loss or CA_recovery) that make sense.
3812	*/
3813	static void tcp_xmit_recovery(struct sock sk, int* rexmit)
3814	{
3815	struct tcp_sock *tp = tcp_sk(sk);
3816
3817	if (rexmit == REXMIT_NONE \|\| sk->sk_state == TCP_SYN_SENT)
3818	return;
3819
3820	if (unlikely(rexmit == REXMIT_NEW)) {
3821	__tcp_push_pending_frames(sk, cur_mss: tcp_current_mss(sk),
3822	TCP_NAGLE_OFF);
3823	if (after(tp->snd_nxt, tp->high_seq))
3824	return;
3825	tp->frto = `0`;
3826	}
3827	tcp_xmit_retransmit_queue(sk);
3828	}
3829
3830	/ Returns the number of packets newly acked or sacked by the current ACK /
3831	static u32 tcp_newly_delivered(struct sock sk, u32 prior_delivered, int* flag)
3832	{
3833	const struct net *net = sock_net(sk);
3834	struct tcp_sock *tp = tcp_sk(sk);
3835	u32 delivered;
3836
3837	delivered = tp->delivered - prior_delivered;
3838	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3839	if (flag & FLAG_ECE)
3840	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3841
3842	return delivered;
3843	}
3844
3845	/ This routine deals with incoming acks, but not outgoing ones. /
3846	static int tcp_ack(struct sock sk, const* struct sk_buff skb, int* flag)
3847	{
3848	struct inet_connection_sock *icsk = inet_csk(sk);
3849	struct tcp_sock *tp = tcp_sk(sk);
3850	struct tcp_sacktag_state sack_state;
3851	struct rate_sample rs = { .prior_delivered = `0` };
3852	u32 prior_snd_una = tp->snd_una;
3853	bool is_sack_reneg = tp->is_sack_reneg;
3854	u32 ack_seq = TCP_SKB_CB(skb)->seq;
3855	u32 ack = TCP_SKB_CB(skb)->ack_seq;
3856	int num_dupack = `0`;
3857	int prior_packets = tp->packets_out;
3858	u32 delivered = tp->delivered;
3859	u32 lost = tp->lost;
3860	int rexmit = REXMIT_NONE; / Flag to (re)transmit to recover losses /
3861	u32 prior_fack;
3862
3863	sack_state.first_sackt = `0`;
3864	sack_state.rate = &rs;
3865	sack_state.sack_delivered = `0`;
3866
3867	/ We very likely will need to access rtx queue. /
3868	prefetch(sk->tcp_rtx_queue.rb_node);
3869
3870	/ If the ack is older than previous acks*
3871	* then we can probably ignore it.
3872	*/
3873	if (before(seq1: ack, seq2: prior_snd_una)) {
3874	/ RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] /
3875	if (before(seq1: ack, seq2: prior_snd_una - tp->max_window)) {
3876	if (!(flag & FLAG_NO_CHALLENGE_ACK))
3877	tcp_send_challenge_ack(sk);
3878	return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
3879	}
3880	goto old_ack;
3881	}
3882
3883	/ If the ack includes data we haven't sent yet, discard*
3884	* this segment (RFC793 Section 3.9).
3885	*/
3886	if (after(ack, tp->snd_nxt))
3887	return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
3888
3889	if (after(ack, prior_snd_una)) {
3890	flag \|= FLAG_SND_UNA_ADVANCED;
3891	icsk->icsk_retransmits = `0`;
3892
3893	#if IS_ENABLED(CONFIG_TLS_DEVICE)
3894	if (static_branch_unlikely(&clean_acked_data_enabled.key))
3895	if (icsk->icsk_clean_acked)
3896	icsk->icsk_clean_acked(sk, ack);
3897	#endif
3898	}
3899
3900	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
3901	rs.prior_in_flight = tcp_packets_in_flight(tp);
3902
3903	/ ts_recent update must be made after we are sure that the packet*
3904	* is in window.
3905	*/
3906	if (flag & FLAG_UPDATE_TS_RECENT)
3907	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3908
3909	if ((flag & (FLAG_SLOWPATH \| FLAG_SND_UNA_ADVANCED)) ==
3910	FLAG_SND_UNA_ADVANCED) {
3911	/ Window is constant, pure forward advance.*
3912	* No more checks are required.
3913	* Note, we use the fact that SND.UNA>=SND.WL2.
3914	*/
3915	tcp_update_wl(tp, seq: ack_seq);
3916	tcp_snd_una_update(tp, ack);
3917	flag \|= FLAG_WIN_UPDATE;
3918
3919	tcp_in_ack_event(sk, flags: CA_ACK_WIN_UPDATE);
3920
3921	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
3922	} else {
3923	u32 ack_ev_flags = CA_ACK_SLOWPATH;
3924
3925	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
3926	flag \|= FLAG_DATA;
3927	else
3928	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3929
3930	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
3931
3932	if (TCP_SKB_CB(skb)->sacked)
3933	flag \|= tcp_sacktag_write_queue(sk, ack_skb: skb, prior_snd_una,
3934	state: &sack_state);
3935
3936	if (tcp_ecn_rcv_ecn_echo(tp, th: tcp_hdr(skb))) {
3937	flag \|= FLAG_ECE;
3938	ack_ev_flags \|= CA_ACK_ECE;
3939	}
3940
3941	if (sack_state.sack_delivered)
3942	tcp_count_delivered(tp, delivered: sack_state.sack_delivered,
3943	ece_ack: flag & FLAG_ECE);
3944
3945	if (flag & FLAG_WIN_UPDATE)
3946	ack_ev_flags \|= CA_ACK_WIN_UPDATE;
3947
3948	tcp_in_ack_event(sk, flags: ack_ev_flags);
3949	}
3950
3951	/ This is a deviation from RFC3168 since it states that:*
3952	* "When the TCP data sender is ready to set the CWR bit after reducing
3953	* the congestion window, it SHOULD set the CWR bit only on the first
3954	* new data packet that it transmits."
3955	* We accept CWR on pure ACKs to be more robust
3956	* with widely-deployed TCP implementations that do this.
3957	*/
3958	tcp_ecn_accept_cwr(sk, skb);
3959
3960	/ We passed data and got it acked, remove any soft error*
3961	* log. Something worked...
3962	*/
3963	WRITE_ONCE(sk->sk_err_soft, `0`);
3964	icsk->icsk_probes_out = `0`;
3965	tp->rcv_tstamp = tcp_jiffies32;
3966	if (!prior_packets)
3967	goto no_queue;
3968
3969	/ See if we can take anything off of the retransmit queue. /
3970	flag \|= tcp_clean_rtx_queue(sk, ack_skb: skb, prior_fack, prior_snd_una,
3971	sack: &sack_state, ece_ack: flag & FLAG_ECE);
3972
3973	tcp_rack_update_reo_wnd(sk, rs: &rs);
3974
3975	if (tp->tlp_high_seq)
3976	tcp_process_tlp_ack(sk, ack, flag);
3977
3978	if (tcp_ack_is_dubious(sk, flag)) {
3979	if (!(flag & (FLAG_SND_UNA_ADVANCED \|
3980	FLAG_NOT_DUP \| FLAG_DSACKING_ACK))) {
3981	num_dupack = `1`;
3982	/ Consider if pure acks were aggregated in tcp_add_backlog() /
3983	if (!(flag & FLAG_DATA))
3984	num_dupack = max_t(u16, `1`, skb_shinfo(skb)->gso_segs);
3985	}
3986	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
3987	rexmit: &rexmit);
3988	}
3989
3990	/ If needed, reset TLP/RTO timer when RACK doesn't set. /
3991	if (flag & FLAG_SET_XMIT_TIMER)
3992	tcp_set_xmit_timer(sk);
3993
3994	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
3995	sk_dst_confirm(sk);
3996
3997	delivered = tcp_newly_delivered(sk, prior_delivered: delivered, flag);
3998	lost = tp->lost - lost; / freshly marked lost /
3999	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
4000	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, rs: sack_state.rate);
4001	tcp_cong_control(sk, ack, acked_sacked: delivered, flag, rs: sack_state.rate);
4002	tcp_xmit_recovery(sk, rexmit);
4003	return `1`;
4004
4005	no_queue:
4006	/ If data was DSACKed, see if we can undo a cwnd reduction. /
4007	if (flag & FLAG_DSACKING_ACK) {
4008	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
4009	rexmit: &rexmit);
4010	tcp_newly_delivered(sk, prior_delivered: delivered, flag);
4011	}
4012	/ If this ack opens up a zero window, clear backoff. It was*
4013	* being used to time the probes, and is probably far higher than
4014	* it needs to be for normal retransmission.
4015	*/
4016	tcp_ack_probe(sk);
4017
4018	if (tp->tlp_high_seq)
4019	tcp_process_tlp_ack(sk, ack, flag);
4020	return `1`;
4021
4022	old_ack:
4023	/ If data was SACKed, tag it and see if we should send more data.*
4024	* If data was DSACKed, see if we can undo a cwnd reduction.
4025	*/
4026	if (TCP_SKB_CB(skb)->sacked) {
4027	flag \|= tcp_sacktag_write_queue(sk, ack_skb: skb, prior_snd_una,
4028	state: &sack_state);
4029	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
4030	rexmit: &rexmit);
4031	tcp_newly_delivered(sk, prior_delivered: delivered, flag);
4032	tcp_xmit_recovery(sk, rexmit);
4033	}
4034
4035	return `0`;
4036	}
4037
4038	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
4039	bool syn, struct tcp_fastopen_cookie *foc,
4040	bool exp_opt)
4041	{
4042	/ Valid only in SYN or SYN-ACK with an even length. /
4043	if (!foc \|\| !syn \|\| len < `0` \|\| (len & `1`))
4044	return;
4045
4046	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
4047	len <= TCP_FASTOPEN_COOKIE_MAX)
4048	memcpy(foc->val, cookie, len);
4049	else if (len != `0`)
4050	len = -`1`;
4051	foc->len = len;
4052	foc->exp = exp_opt;
4053	}
4054
4055	static bool smc_parse_options(const struct tcphdr *th,
4056	struct tcp_options_received *opt_rx,
4057	const unsigned char *ptr,
4058	int opsize)
4059	{
4060	#if IS_ENABLED(CONFIG_SMC)
4061	if (static_branch_unlikely(&tcp_have_smc)) {
4062	if (th->syn && !(opsize & `1`) &&
4063	opsize >= TCPOLEN_EXP_SMC_BASE &&
4064	get_unaligned_be32(p: ptr) == TCPOPT_SMC_MAGIC) {
4065	opt_rx->smc_ok = `1`;
4066	return true;
4067	}
4068	}
4069	#endif
4070	return false;
4071	}
4072
4073	/ Try to parse the MSS option from the TCP header. Return 0 on failure, clamped*
4074	* value on success.
4075	*/
4076	u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
4077	{
4078	const unsigned char ptr = (const* unsigned char *)(th + `1`);
4079	int length = (th->doff * `4`) - sizeof(struct tcphdr);
4080	u16 mss = `0`;
4081
4082	while (length > `0`) {
4083	int opcode = *ptr++;
4084	int opsize;
4085
4086	switch (opcode) {
4087	case TCPOPT_EOL:
4088	return mss;
4089	case TCPOPT_NOP: / Ref: RFC 793 section 3.1 /
4090	length--;
4091	continue;
4092	default:
4093	if (length < `2`)
4094	return mss;
4095	opsize = *ptr++;
4096	if (opsize < `2`) / "silly options" /
4097	return mss;
4098	if (opsize > length)
4099	return mss; / fail on partial options /
4100	if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
4101	u16 in_mss = get_unaligned_be16(p: ptr);
4102
4103	if (in_mss) {
4104	if (user_mss && user_mss < in_mss)
4105	in_mss = user_mss;
4106	mss = in_mss;
4107	}
4108	}
4109	ptr += opsize - `2`;
4110	length -= opsize;
4111	}
4112	}
4113	return mss;
4114	}
4115	EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
4116
4117	/ Look for tcp options. Normally only called on SYN and SYNACK packets.*
4118	* But, this can also be called on packets in the established flow when
4119	* the fast version below fails.
4120	*/
4121	void tcp_parse_options(const struct net *net,
4122	const struct sk_buff *skb,
4123	struct tcp_options_received opt_rx, int* estab,
4124	struct tcp_fastopen_cookie *foc)
4125	{
4126	const unsigned char *ptr;
4127	const struct tcphdr *th = tcp_hdr(skb);
4128	int length = (th->doff * `4`) - sizeof(struct tcphdr);
4129
4130	ptr = (const unsigned char *)(th + `1`);
4131	opt_rx->saw_tstamp = `0`;
4132	opt_rx->saw_unknown = `0`;
4133
4134	while (length > `0`) {
4135	int opcode = *ptr++;
4136	int opsize;
4137
4138	switch (opcode) {
4139	case TCPOPT_EOL:
4140	return;
4141	case TCPOPT_NOP: / Ref: RFC 793 section 3.1 /
4142	length--;
4143	continue;
4144	default:
4145	if (length < `2`)
4146	return;
4147	opsize = *ptr++;
4148	if (opsize < `2`) / "silly options" /
4149	return;
4150	if (opsize > length)
4151	return; / don't parse partial options /
4152	switch (opcode) {
4153	case TCPOPT_MSS:
4154	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
4155	u16 in_mss = get_unaligned_be16(p: ptr);
4156	if (in_mss) {
4157	if (opt_rx->user_mss &&
4158	opt_rx->user_mss < in_mss)
4159	in_mss = opt_rx->user_mss;
4160	opt_rx->mss_clamp = in_mss;
4161	}
4162	}
4163	break;
4164	case TCPOPT_WINDOW:
4165	if (opsize == TCPOLEN_WINDOW && th->syn &&
4166	!estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
4167	__u8 snd_wscale = (__u8 )ptr;
4168	opt_rx->wscale_ok = `1`;
4169	if (snd_wscale > TCP_MAX_WSCALE) {
4170	net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
4171	__func__,
4172	snd_wscale,
4173	TCP_MAX_WSCALE);
4174	snd_wscale = TCP_MAX_WSCALE;
4175	}
4176	opt_rx->snd_wscale = snd_wscale;
4177	}
4178	break;
4179	case TCPOPT_TIMESTAMP:
4180	if ((opsize == TCPOLEN_TIMESTAMP) &&
4181	((estab && opt_rx->tstamp_ok) \|\|
4182	(!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
4183	opt_rx->saw_tstamp = `1`;
4184	opt_rx->rcv_tsval = get_unaligned_be32(p: ptr);
4185	opt_rx->rcv_tsecr = get_unaligned_be32(p: ptr + `4`);
4186	}
4187	break;
4188	case TCPOPT_SACK_PERM:
4189	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
4190	!estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
4191	opt_rx->sack_ok = TCP_SACK_SEEN;
4192	tcp_sack_reset(rx_opt: opt_rx);
4193	}
4194	break;
4195
4196	case TCPOPT_SACK:
4197	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
4198	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
4199	opt_rx->sack_ok) {
4200	TCP_SKB_CB(skb)->sacked = (ptr - `2`) - (unsigned char *)th;
4201	}
4202	break;
4203	#ifdef CONFIG_TCP_MD5SIG
4204	case TCPOPT_MD5SIG:
4205	/ The MD5 Hash has already been*
4206	* checked (see tcp_v{4,6}_rcv()).
4207	*/
4208	break;
4209	#endif
4210	case TCPOPT_FASTOPEN:
4211	tcp_parse_fastopen_option(
4212	len: opsize - TCPOLEN_FASTOPEN_BASE,
4213	cookie: ptr, syn: th->syn, foc, exp_opt: false);
4214	break;
4215
4216	case TCPOPT_EXP:
4217	/ Fast Open option shares code 254 using a*
4218	* 16 bits magic number.
4219	*/
4220	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
4221	get_unaligned_be16(p: ptr) ==
4222	TCPOPT_FASTOPEN_MAGIC) {
4223	tcp_parse_fastopen_option(len: opsize -
4224	TCPOLEN_EXP_FASTOPEN_BASE,
4225	cookie: ptr + `2`, syn: th->syn, foc, exp_opt: true);
4226	break;
4227	}
4228
4229	if (smc_parse_options(th, opt_rx, ptr, opsize))
4230	break;
4231
4232	opt_rx->saw_unknown = `1`;
4233	break;
4234
4235	default:
4236	opt_rx->saw_unknown = `1`;
4237	}
4238	ptr += opsize-`2`;
4239	length -= opsize;
4240	}
4241	}
4242	}
4243	EXPORT_SYMBOL(tcp_parse_options);
4244
4245	static bool tcp_parse_aligned_timestamp(struct tcp_sock tp, const* struct tcphdr *th)
4246	{
4247	const __be32 ptr = (const* __be32 *)(th + `1`);
4248
4249	if (*ptr == htonl((TCPOPT_NOP << `24`) \| (TCPOPT_NOP << `16`)
4250	\| (TCPOPT_TIMESTAMP << `8`) \| TCPOLEN_TIMESTAMP)) {
4251	tp->rx_opt.saw_tstamp = `1`;
4252	++ptr;
4253	tp->rx_opt.rcv_tsval = ntohl(*ptr);
4254	++ptr;
4255	if (*ptr)
4256	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
4257	else
4258	tp->rx_opt.rcv_tsecr = `0`;
4259	return true;
4260	}
4261	return false;
4262	}
4263
4264	/ Fast parse options. This hopes to only see timestamps.*
4265	* If it is wrong it falls back on tcp_parse_options().
4266	*/
4267	static bool tcp_fast_parse_options(const struct net *net,
4268	const struct sk_buff *skb,
4269	const struct tcphdr th, struct* tcp_sock *tp)
4270	{
4271	/ In the spirit of fast parsing, compare doff directly to constant*
4272	* values. Because equality is used, short doff can be ignored here.
4273	*/
4274	if (th->doff == (sizeof(*th) / `4`)) {
4275	tp->rx_opt.saw_tstamp = `0`;
4276	return false;
4277	} else if (tp->rx_opt.tstamp_ok &&
4278	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / `4`)) {
4279	if (tcp_parse_aligned_timestamp(tp, th))
4280	return true;
4281	}
4282
4283	tcp_parse_options(net, skb, &tp->rx_opt, `1`, NULL);
4284	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
4285	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
4286
4287	return true;
4288	}
4289
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
 * Parse Signature options
 *
 * @th:       TCP header whose options are scanned
 * @md5_hash: out; set to the MD5 option payload, or NULL if absent
 * @ao_hash:  out; set to the AO option payload, or NULL if absent
 *
 * Returns 0 on success (including when neither option is present),
 * -EINVAL on a malformed option, and -EEXIST when a second signature
 * option is found after one has already been recorded.
 */
int tcp_do_parse_auth_options(const struct tcphdr *th,
			      const u8 **md5_hash, const u8 **ao_hash)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);
	unsigned int minlen = TCPOLEN_MD5SIG;

	if (IS_ENABLED(CONFIG_TCP_AO))
		minlen = sizeof(struct tcp_ao_hdr) + 1;

	*md5_hash = NULL;
	*ao_hash = NULL;

	/* If not enough data remaining, we can short cut */
	while (length >= minlen) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return 0;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return -EINVAL;
			if (opcode == TCPOPT_MD5SIG) {
				if (opsize != TCPOLEN_MD5SIG)
					return -EINVAL;
				/* Test the stored results, not the out
				 * pointers themselves.
				 */
				if (unlikely(*md5_hash || *ao_hash))
					return -EEXIST;
				*md5_hash = ptr;
			} else if (opcode == TCPOPT_AO) {
				if (opsize <= sizeof(struct tcp_ao_hdr))
					return -EINVAL;
				if (unlikely(*md5_hash || *ao_hash))
					return -EEXIST;
				*ao_hash = ptr;
			}
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_do_parse_auth_options);
#endif
4343
4344	/ Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM*
4345	*
4346	* It is not fatal. If this ACK does _not_ change critical state (seqs, window)
4347	* it can pass through stack. So, the following predicate verifies that
4348	* this segment is not used for anything but congestion avoidance or
4349	* fast retransmit. Moreover, we even are able to eliminate most of such
4350	* second order effects, if we apply some small "replay" window (~RTO)
4351	* to timestamp space.
4352	*
4353	* All these measures still do not guarantee that we reject wrapped ACKs
4354	* on networks with high bandwidth, when sequence space is recycled fastly,
4355	* but it guarantees that such events will be very rare and do not affect
4356	* connection seriously. This doesn't look nice, but alas, PAWS is really
4357	* buggy extension.
4358	*
4359	* [ Later note. Even worse! It is buggy for segments _with_ data. RFC
4360	* states that events when retransmit arrives after original data are rare.
4361	* It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
4362	* the biggest problem on large power networks even with minor reordering.
4363	* OK, let's give it small replay window. If peer clock is even 1hz, it is safe
4364	* up to bandwidth of 18Gigabit/sec. 8) ]
4365	*/
4366
4367	static int tcp_disordered_ack(const struct sock sk, const* struct sk_buff *skb)
4368	{
4369	const struct tcp_sock *tp = tcp_sk(sk);
4370	const struct tcphdr *th = tcp_hdr(skb);
4371	u32 seq = TCP_SKB_CB(skb)->seq;
4372	u32 ack = TCP_SKB_CB(skb)->ack_seq;
4373
4374	return (/ 1. Pure ACK with correct sequence number. /
4375	(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
4376
4377	/ 2. ... and duplicate ACK. /
4378	ack == tp->snd_una &&
4379
4380	/ 3. ... and does not update window. /
4381	!tcp_may_update_window(tp, ack, ack_seq: seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
4382
4383	/ 4. ... and sits in replay window. /
4384	(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * `1024`) / HZ);
4385	}
4386
4387	static inline bool tcp_paws_discard(const struct sock *sk,
4388	const struct sk_buff *skb)
4389	{
4390	const struct tcp_sock *tp = tcp_sk(sk);
4391
4392	return !tcp_paws_check(rx_opt: &tp->rx_opt, TCP_PAWS_WINDOW) &&
4393	!tcp_disordered_ack(sk, skb);
4394	}
4395
4396	/ Check segment sequence number for validity.*
4397	*
4398	* Segment controls are considered valid, if the segment
4399	* fits to the window after truncation to the window. Acceptability
4400	* of data (and SYN, FIN, of course) is checked separately.
4401	* See tcp_data_queue(), for example.
4402	*
4403	* Also, controls (RST is main one) are accepted using RCV.WUP instead
4404	* of RCV.NXT. Peer still did not advance his SND.UNA when we
4405	* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
4406	* (borrowed from freebsd)
4407	*/
4408
4409	static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
4410	u32 seq, u32 end_seq)
4411	{
4412	if (before(seq1: end_seq, seq2: tp->rcv_wup))
4413	return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
4414
4415	if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
4416	return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
4417
4418	return SKB_NOT_DROPPED_YET;
4419	}
4420
/* When we get a reset we do this.
 *
 * Sets sk_err according to the current state, purges the write queue,
 * finishes the connection with tcp_done(), and reports the error unless
 * the socket is already dead. Nothing is done for a TCP_CLOSE socket
 * beyond the trace point and the MPTCP option hook.
 */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	trace_tcp_receive_reset(sk);

	/* mptcp can't tell us to ignore reset pkts,
	 * so just ignore the return value of mptcp_incoming_options().
	 */
	if (sk_is_mptcp(sk))
		mptcp_incoming_options(sk, skb);

	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
		WRITE_ONCE(sk->sk_err, ECONNREFUSED);
		break;
	case TCP_CLOSE_WAIT:
		WRITE_ONCE(sk->sk_err, EPIPE);
		break;
	case TCP_CLOSE:
		return;
	default:
		WRITE_ONCE(sk->sk_err, ECONNRESET);
	}
	/* This barrier is coupled with smp_rmb() in tcp_poll() */
	smp_wmb();

	tcp_write_queue_purge(sk);
	tcp_done(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		sk_error_report(sk);
}
4454
4455	/*
4456	* Process the FIN bit. This now behaves as it is supposed to work
4457	* and the FIN takes effect when it is validly part of sequence
4458	* space. Not before when we get holes.
4459	*
4460	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
4461	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
4462	* TIME-WAIT)
4463	*
4464	* If we are in FINWAIT-1, a received FIN indicates simultaneous
4465	* close and we go into CLOSING (and later onto TIME-WAIT)
4466	*
4467	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4468	*/
4469	void tcp_fin(struct sock *sk)
4470	{
4471	struct tcp_sock *tp = tcp_sk(sk);
4472
4473	inet_csk_schedule_ack(sk);
4474
4475	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| RCV_SHUTDOWN);
4476	sock_set_flag(sk, flag: SOCK_DONE);
4477
4478	switch (sk->sk_state) {
4479	case TCP_SYN_RECV:
4480	case TCP_ESTABLISHED:
4481	/ Move to CLOSE_WAIT /
4482	tcp_set_state(sk, state: TCP_CLOSE_WAIT);
4483	inet_csk_enter_pingpong_mode(sk);
4484	break;
4485
4486	case TCP_CLOSE_WAIT:
4487	case TCP_CLOSING:
4488	/ Received a retransmission of the FIN, do*
4489	* nothing.
4490	*/
4491	break;
4492	case TCP_LAST_ACK:
4493	/ RFC793: Remain in the LAST-ACK state. /
4494	break;
4495
4496	case TCP_FIN_WAIT1:
4497	/ This case occurs when a simultaneous close*
4498	* happens, we must ack the received FIN and
4499	* enter the CLOSING state.
4500	*/
4501	tcp_send_ack(sk);
4502	tcp_set_state(sk, state: TCP_CLOSING);
4503	break;
4504	case TCP_FIN_WAIT2:
4505	/ Received a FIN -- send ACK and enter TIME_WAIT. /
4506	tcp_send_ack(sk);
4507	tcp_time_wait(sk, state: TCP_TIME_WAIT, timeo: `0`);
4508	break;
4509	default:
4510	/ Only TCP_LISTEN and TCP_CLOSE are left, in these*
4511	* cases we should never reach this piece of code.
4512	*/
4513	pr_err("%s: Impossible, sk->sk_state=%d\n",
4514	__func__, sk->sk_state);
4515	break;
4516	}
4517
4518	/ It _is_ possible, that we have something out-of-order _after_ FIN.*
4519	* Probably, we should reset in this case. For now drop them.
4520	*/
4521	skb_rbtree_purge(root: &tp->out_of_order_queue);
4522	if (tcp_is_sack(tp))
4523	tcp_sack_reset(rx_opt: &tp->rx_opt);
4524
4525	if (!sock_flag(sk, flag: SOCK_DEAD)) {
4526	sk->sk_state_change(sk);
4527
4528	/ Do not send POLL_HUP for half duplex close. /
4529	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
4530	sk->sk_state == TCP_CLOSE)
4531	sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_HUP);
4532	else
4533	sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_IN);
4534	}
4535	}
4536
4537	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4538	u32 end_seq)
4539	{
4540	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4541	if (before(seq1: seq, seq2: sp->start_seq))
4542	sp->start_seq = seq;
4543	if (after(end_seq, sp->end_seq))
4544	sp->end_seq = end_seq;
4545	return true;
4546	}
4547	return false;
4548	}
4549
4550	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4551	{
4552	struct tcp_sock *tp = tcp_sk(sk);
4553
4554	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4555	int mib_idx;
4556
4557	if (before(seq1: seq, seq2: tp->rcv_nxt))
4558	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4559	else
4560	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4561
4562	NET_INC_STATS(sock_net(sk), mib_idx);
4563
4564	tp->rx_opt.dsack = `1`;
4565	tp->duplicate_sack[`0`].start_seq = seq;
4566	tp->duplicate_sack[`0`].end_seq = end_seq;
4567	}
4568	}
4569
4570	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4571	{
4572	struct tcp_sock *tp = tcp_sk(sk);
4573
4574	if (!tp->rx_opt.dsack)
4575	tcp_dsack_set(sk, seq, end_seq);
4576	else
4577	tcp_sack_extend(sp: tp->duplicate_sack, seq, end_seq);
4578	}
4579
4580	static void tcp_rcv_spurious_retrans(struct sock sk, const* struct sk_buff *skb)
4581	{
4582	/ When the ACK path fails or drops most ACKs, the sender would*
4583	* timeout and spuriously retransmit the same segment repeatedly.
4584	* If it seems our ACKs are not reaching the other side,
4585	* based on receiving a duplicate data segment with new flowlabel
4586	* (suggesting the sender suffered an RTO), and we are not already
4587	* repathing due to our own RTO, then rehash the socket to repath our
4588	* packets.
4589	*/
4590	#if IS_ENABLED(CONFIG_IPV6)
4591	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
4592	skb->protocol == htons(ETH_P_IPV6) &&
4593	(tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
4594	ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
4595	sk_rethink_txhash(sk))
4596	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4597
4598	/ Save last flowlabel after a spurious retrans. /
4599	tcp_save_lrcv_flowlabel(sk, skb);
4600	#endif
4601	}
4602
4603	static void tcp_send_dupack(struct sock sk, const* struct sk_buff *skb)
4604	{
4605	struct tcp_sock *tp = tcp_sk(sk);
4606
4607	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4608	before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
4609	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4610	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
4611
4612	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4613	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4614
4615	tcp_rcv_spurious_retrans(sk, skb);
4616	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4617	end_seq = tp->rcv_nxt;
4618	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4619	}
4620	}
4621
4622	tcp_send_ack(sk);
4623	}
4624
4625	/ These routines update the SACK block as out-of-order packets arrive or*
4626	* in-order packets close up the sequence space.
4627	*/
4628	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4629	{
4630	int this_sack;
4631	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4632	struct tcp_sack_block *swalk = sp + `1`;
4633
4634	/ See if the recent change to the first SACK eats into*
4635	* or hits the sequence space of other SACK blocks, if so coalesce.
4636	*/
4637	for (this_sack = `1`; this_sack < tp->rx_opt.num_sacks;) {
4638	if (tcp_sack_extend(sp, seq: swalk->start_seq, end_seq: swalk->end_seq)) {
4639	int i;
4640
4641	/ Zap SWALK, by moving every further SACK up by one slot.*
4642	* Decrease num_sacks.
4643	*/
4644	tp->rx_opt.num_sacks--;
4645	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4646	sp[i] = sp[i + `1`];
4647	continue;
4648	}
4649	this_sack++;
4650	swalk++;
4651	}
4652	}
4653
4654	void tcp_sack_compress_send_ack(struct sock *sk)
4655	{
4656	struct tcp_sock *tp = tcp_sk(sk);
4657
4658	if (!tp->compressed_ack)
4659	return;
4660
4661	if (hrtimer_try_to_cancel(timer: &tp->compressed_ack_timer) == `1`)
4662	__sock_put(sk);
4663
4664	/ Since we have to send one ack finally,*
4665	* substract one from tp->compressed_ack to keep
4666	* LINUX_MIB_TCPACKCOMPRESSED accurate.
4667	*/
4668	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4669	tp->compressed_ack - `1`);
4670
4671	tp->compressed_ack = `0`;
4672	tcp_send_ack(sk);
4673	}
4674
4675	/ Reasonable amount of sack blocks included in TCP SACK option*
4676	* The max is 4, but this becomes 3 if TCP timestamps are there.
4677	* Given that SACK packets might be lost, be conservative and use 2.
4678	*/
4679	#define TCP_SACK_BLOCKS_EXPECTED 2
4680
4681	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4682	{
4683	struct tcp_sock *tp = tcp_sk(sk);
4684	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4685	int cur_sacks = tp->rx_opt.num_sacks;
4686	int this_sack;
4687
4688	if (!cur_sacks)
4689	goto new_sack;
4690
4691	for (this_sack = `0`; this_sack < cur_sacks; this_sack++, sp++) {
4692	if (tcp_sack_extend(sp, seq, end_seq)) {
4693	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4694	tcp_sack_compress_send_ack(sk);
4695	/ Rotate this_sack to the first one. /
4696	for (; this_sack > `0`; this_sack--, sp--)
4697	swap(sp, (sp - `1`));
4698	if (cur_sacks > `1`)
4699	tcp_sack_maybe_coalesce(tp);
4700	return;
4701	}
4702	}
4703
4704	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4705	tcp_sack_compress_send_ack(sk);
4706
4707	/ Could not find an adjacent existing SACK, build a new one,*
4708	* put it at the front, and shift everyone else down. We
4709	* always know there is at least one SACK present already here.
4710	*
4711	* If the sack array is full, forget about the last one.
4712	*/
4713	if (this_sack >= TCP_NUM_SACKS) {
4714	this_sack--;
4715	tp->rx_opt.num_sacks--;
4716	sp--;
4717	}
4718	for (; this_sack > `0`; this_sack--, sp--)
4719	sp = (sp - `1`);
4720
4721	new_sack:
4722	/ Build the new head SACK, and we're done. /
4723	sp->start_seq = seq;
4724	sp->end_seq = end_seq;
4725	tp->rx_opt.num_sacks++;
4726	}
4727
4728	/ RCV.NXT advances, some SACKs should be eaten. /
4729
4730	static void tcp_sack_remove(struct tcp_sock *tp)
4731	{
4732	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4733	int num_sacks = tp->rx_opt.num_sacks;
4734	int this_sack;
4735
4736	/ Empty ofo queue, hence, all the SACKs are eaten. Clear. /
4737	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4738	tp->rx_opt.num_sacks = `0`;
4739	return;
4740	}
4741
4742	for (this_sack = `0`; this_sack < num_sacks;) {
4743	/ Check if the start of the sack is covered by RCV.NXT. /
4744	if (!before(seq1: tp->rcv_nxt, seq2: sp->start_seq)) {
4745	int i;
4746
4747	/ RCV.NXT must cover all the block! /
4748	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4749
4750	/ Zap this SACK, by moving forward any other SACKS. /
4751	for (i = this_sack+`1`; i < num_sacks; i++)
4752	tp->selective_acks[i-`1`] = tp->selective_acks[i];
4753	num_sacks--;
4754	continue;
4755	}
4756	this_sack++;
4757	sp++;
4758	}
4759	tp->rx_opt.num_sacks = num_sacks;
4760	}
4761
4762	/**
4763	* tcp_try_coalesce - try to merge skb to prior one
4764	* @sk: socket
4765	* @to: prior buffer
4766	* @from: buffer to add in queue
4767	* @fragstolen: pointer to boolean
4768	*
4769	* Before queueing skb @from after @to, try to merge them
4770	* to reduce overall memory use and queue lengths, if cost is small.
4771	* Packets in ofo or receive queues can stay a long time.
4772	* Better try to coalesce them right now to avoid future collapses.
4773	* Returns true if caller should free @from instead of queueing it
4774	*/
4775	static bool tcp_try_coalesce(struct sock *sk,
4776	struct sk_buff *to,
4777	struct sk_buff *from,
4778	bool *fragstolen)
4779	{
4780	int delta;
4781
4782	*fragstolen = false;
4783
4784	/ Its possible this segment overlaps with prior segment in queue /
4785	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4786	return false;
4787
4788	if (!mptcp_skb_can_collapse(to, from))
4789	return false;
4790
4791	#ifdef CONFIG_TLS_DEVICE
4792	if (from->decrypted != to->decrypted)
4793	return false;
4794	#endif
4795
4796	if (!skb_try_coalesce(to, from, fragstolen, delta_truesize: &delta))
4797	return false;
4798
4799	atomic_add(i: delta, v: &sk->sk_rmem_alloc);
4800	sk_mem_charge(sk, size: delta);
4801	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4802	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4803	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4804	TCP_SKB_CB(to)->tcp_flags \|= TCP_SKB_CB(from)->tcp_flags;
4805
4806	if (TCP_SKB_CB(from)->has_rxtstamp) {
4807	TCP_SKB_CB(to)->has_rxtstamp = true;
4808	to->tstamp = from->tstamp;
4809	skb_hwtstamps(skb: to)->hwtstamp = skb_hwtstamps(skb: from)->hwtstamp;
4810	}
4811
4812	return true;
4813	}
4814
4815	static bool tcp_ooo_try_coalesce(struct sock *sk,
4816	struct sk_buff *to,
4817	struct sk_buff *from,
4818	bool *fragstolen)
4819	{
4820	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
4821
4822	/ In case tcp_drop_reason() is called later, update to->gso_segs /
4823	if (res) {
4824	u32 gso_segs = max_t(u16, `1`, skb_shinfo(to)->gso_segs) +
4825	max_t(u16, `1`, skb_shinfo(from)->gso_segs);
4826
4827	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, `0xFFFF`);
4828	}
4829	return res;
4830	}
4831
4832	static void tcp_drop_reason(struct sock sk, struct* sk_buff *skb,
4833	enum skb_drop_reason reason)
4834	{
4835	sk_drops_add(sk, skb);
4836	kfree_skb_reason(skb, reason);
4837	}
4838
4839	/ This one checks to see if we can put data from the*
4840	* out_of_order queue into the receive_queue.
4841	*/
4842	static void tcp_ofo_queue(struct sock *sk)
4843	{
4844	struct tcp_sock *tp = tcp_sk(sk);
4845	__u32 dsack_high = tp->rcv_nxt;
4846	bool fin, fragstolen, eaten;
4847	struct sk_buff skb, tail;
4848	struct rb_node *p;
4849
4850	p = rb_first(&tp->out_of_order_queue);
4851	while (p) {
4852	skb = rb_to_skb(p);
4853	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4854	break;
4855
4856	if (before(TCP_SKB_CB(skb)->seq, seq2: dsack_high)) {
4857	__u32 dsack = dsack_high;
4858	if (before(TCP_SKB_CB(skb)->end_seq, seq2: dsack_high))
4859	dsack_high = TCP_SKB_CB(skb)->end_seq;
4860	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, end_seq: dsack);
4861	}
4862	p = rb_next(p);
4863	rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4864
4865	if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4866	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_TCP_OFO_DROP);
4867	continue;
4868	}
4869
4870	tail = skb_peek_tail(list_: &sk->sk_receive_queue);
4871	eaten = tail && tcp_try_coalesce(sk, to: tail, from: skb, fragstolen: &fragstolen);
4872	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4873	fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4874	if (!eaten)
4875	__skb_queue_tail(list: &sk->sk_receive_queue, newsk: skb);
4876	else
4877	kfree_skb_partial(skb, head_stolen: fragstolen);
4878
4879	if (unlikely(fin)) {
4880	tcp_fin(sk);
4881	/ tcp_fin() purges tp->out_of_order_queue,*
4882	* so we must end this loop right now.
4883	*/
4884	break;
4885	}
4886	}
4887	}
4888
4889	static bool tcp_prune_ofo_queue(struct sock sk, const* struct sk_buff *in_skb);
4890	static int tcp_prune_queue(struct sock sk, const* struct sk_buff *in_skb);
4891
4892	static int tcp_try_rmem_schedule(struct sock sk, struct* sk_buff *skb,
4893	unsigned int size)
4894	{
4895	if (atomic_read(v: &sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
4896	!sk_rmem_schedule(sk, skb, size)) {
4897
4898	if (tcp_prune_queue(sk, in_skb: skb) < `0`)
4899	return -`1`;
4900
4901	while (!sk_rmem_schedule(sk, skb, size)) {
4902	if (!tcp_prune_ofo_queue(sk, in_skb: skb))
4903	return -`1`;
4904	}
4905	}
4906	return `0`;
4907	}
4908
4909	static void tcp_data_queue_ofo(struct sock sk, struct* sk_buff *skb)
4910	{
4911	struct tcp_sock *tp = tcp_sk(sk);
4912	struct rb_node *p, parent;
4913	struct sk_buff *skb1;
4914	u32 seq, end_seq;
4915	bool fragstolen;
4916
4917	tcp_save_lrcv_flowlabel(sk, skb);
4918	tcp_ecn_check_ce(sk, skb);
4919
4920	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4921	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
4922	sk->sk_data_ready(sk);
4923	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_PROTO_MEM);
4924	return;
4925	}
4926
4927	/ Disable header prediction. /
4928	tp->pred_flags = `0`;
4929	inet_csk_schedule_ack(sk);
4930
4931	tp->rcv_ooopack += max_t(u16, `1`, skb_shinfo(skb)->gso_segs);
4932	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
4933	seq = TCP_SKB_CB(skb)->seq;
4934	end_seq = TCP_SKB_CB(skb)->end_seq;
4935
4936	p = &tp->out_of_order_queue.rb_node;
4937	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4938	/ Initial out of order segment, build 1 SACK. /
4939	if (tcp_is_sack(tp)) {
4940	tp->rx_opt.num_sacks = `1`;
4941	tp->selective_acks[`0`].start_seq = seq;
4942	tp->selective_acks[`0`].end_seq = end_seq;
4943	}
4944	rb_link_node(node: &skb->rbnode, NULL, rb_link: p);
4945	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
4946	tp->ooo_last_skb = skb;
4947	goto end;
4948	}
4949
4950	/ In the typical case, we are adding an skb to the end of the list.*
4951	* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
4952	*/
4953	if (tcp_ooo_try_coalesce(sk, to: tp->ooo_last_skb,
4954	from: skb, fragstolen: &fragstolen)) {
4955	coalesce_done:
4956	/ For non sack flows, do not grow window to force DUPACK*
4957	* and trigger fast retransmit.
4958	*/
4959	if (tcp_is_sack(tp))
4960	tcp_grow_window(sk, skb, adjust: true);
4961	kfree_skb_partial(skb, head_stolen: fragstolen);
4962	skb = NULL;
4963	goto add_sack;
4964	}
4965	/ Can avoid an rbtree lookup if we are adding skb after ooo_last_skb /
4966	if (!before(seq1: seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
4967	parent = &tp->ooo_last_skb->rbnode;
4968	p = &parent->rb_right;
4969	goto insert;
4970	}
4971
4972	/ Find place to insert this segment. Handle overlaps on the way. /
4973	parent = NULL;
4974	while (*p) {
4975	parent = *p;
4976	skb1 = rb_to_skb(parent);
4977	if (before(seq1: seq, TCP_SKB_CB(skb1)->seq)) {
4978	p = &parent->rb_left;
4979	continue;
4980	}
4981	if (before(seq1: seq, TCP_SKB_CB(skb1)->end_seq)) {
4982	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
4983	/ All the bits are present. Drop. /
4984	NET_INC_STATS(sock_net(sk),
4985	LINUX_MIB_TCPOFOMERGE);
4986	tcp_drop_reason(sk, skb,
4987	reason: SKB_DROP_REASON_TCP_OFOMERGE);
4988	skb = NULL;
4989	tcp_dsack_set(sk, seq, end_seq);
4990	goto add_sack;
4991	}
4992	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
4993	/ Partial overlap. /
4994	tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
4995	} else {
4996	/ skb's seq == skb1's seq and skb covers skb1.*
4997	* Replace skb1 with skb.
4998	*/
4999	rb_replace_node(victim: &skb1->rbnode, new: &skb->rbnode,
5000	root: &tp->out_of_order_queue);
5001	tcp_dsack_extend(sk,
5002	TCP_SKB_CB(skb1)->seq,
5003	TCP_SKB_CB(skb1)->end_seq);
5004	NET_INC_STATS(sock_net(sk),
5005	LINUX_MIB_TCPOFOMERGE);
5006	tcp_drop_reason(sk, skb: skb1,
5007	reason: SKB_DROP_REASON_TCP_OFOMERGE);
5008	goto merge_right;
5009	}
5010	} else if (tcp_ooo_try_coalesce(sk, to: skb1,
5011	from: skb, fragstolen: &fragstolen)) {
5012	goto coalesce_done;
5013	}
5014	p = &parent->rb_right;
5015	}
5016	insert:
5017	/ Insert segment into RB tree. /
5018	rb_link_node(node: &skb->rbnode, parent, rb_link: p);
5019	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
5020
5021	merge_right:
5022	/ Remove other segments covered by skb. /
5023	while ((skb1 = skb_rb_next(skb)) != NULL) {
5024	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
5025	break;
5026	if (before(seq1: end_seq, TCP_SKB_CB(skb1)->end_seq)) {
5027	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
5028	end_seq);
5029	break;
5030	}
5031	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
5032	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
5033	TCP_SKB_CB(skb1)->end_seq);
5034	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
5035	tcp_drop_reason(sk, skb: skb1, reason: SKB_DROP_REASON_TCP_OFOMERGE);
5036	}
5037	/ If there is no skb after us, we are the last_skb ! /
5038	if (!skb1)
5039	tp->ooo_last_skb = skb;
5040
5041	add_sack:
5042	if (tcp_is_sack(tp))
5043	tcp_sack_new_ofo_skb(sk, seq, end_seq);
5044	end:
5045	if (skb) {
5046	/ For non sack flows, do not grow window to force DUPACK*
5047	* and trigger fast retransmit.
5048	*/
5049	if (tcp_is_sack(tp))
5050	tcp_grow_window(sk, skb, adjust: false);
5051	skb_condense(skb);
5052	skb_set_owner_r(skb, sk);
5053	}
5054	}
5055
5056	static int __must_check tcp_queue_rcv(struct sock sk, struct* sk_buff *skb,
5057	bool *fragstolen)
5058	{
5059	int eaten;
5060	struct sk_buff *tail = skb_peek_tail(list_: &sk->sk_receive_queue);
5061
5062	eaten = (tail &&
5063	tcp_try_coalesce(sk, to: tail,
5064	from: skb, fragstolen)) ? `1` : `0`;
5065	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
5066	if (!eaten) {
5067	__skb_queue_tail(list: &sk->sk_receive_queue, newsk: skb);
5068	skb_set_owner_r(skb, sk);
5069	}
5070	return eaten;
5071	}
5072
5073	int tcp_send_rcvq(struct sock sk, struct* msghdr *msg, size_t size)
5074	{
5075	struct sk_buff *skb;
5076	int err = -ENOMEM;
5077	int data_len = `0`;
5078	bool fragstolen;
5079
5080	if (size == `0`)
5081	return `0`;
5082
5083	if (size > PAGE_SIZE) {
5084	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
5085
5086	data_len = npages << PAGE_SHIFT;
5087	size = data_len + (size & ~PAGE_MASK);
5088	}
5089	skb = alloc_skb_with_frags(header_len: size - data_len, data_len,
5090	PAGE_ALLOC_COSTLY_ORDER,
5091	errcode: &err, gfp_mask: sk->sk_allocation);
5092	if (!skb)
5093	goto err;
5094
5095	skb_put(skb, len: size - data_len);
5096	skb->data_len = data_len;
5097	skb->len = size;
5098
5099	if (tcp_try_rmem_schedule(sk, skb, size: skb->truesize)) {
5100	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
5101	goto err_free;
5102	}
5103
5104	err = skb_copy_datagram_from_iter(skb, offset: `0`, from: &msg->msg_iter, len: size);
5105	if (err)
5106	goto err_free;
5107
5108	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
5109	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
5110	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - `1`;
5111
5112	if (tcp_queue_rcv(sk, skb, fragstolen: &fragstolen)) {
5113	WARN_ON_ONCE(fragstolen); / should not happen /
5114	__kfree_skb(skb);
5115	}
5116	return size;
5117
5118	err_free:
5119	kfree_skb(skb);
5120	err:
5121	return err;
5122
5123	}
5124
5125	void tcp_data_ready(struct sock *sk)
5126	{
5127	if (tcp_epollin_ready(sk, target: sk->sk_rcvlowat) \|\| sock_flag(sk, flag: SOCK_DONE))
5128	sk->sk_data_ready(sk);
5129	}
5130
5131	static void tcp_data_queue(struct sock sk, struct* sk_buff *skb)
5132	{
5133	struct tcp_sock *tp = tcp_sk(sk);
5134	enum skb_drop_reason reason;
5135	bool fragstolen;
5136	int eaten;
5137
5138	/ If a subflow has been reset, the packet should not continue*
5139	* to be processed, drop the packet.
5140	*/
5141	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
5142	__kfree_skb(skb);
5143	return;
5144	}
5145
5146	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
5147	__kfree_skb(skb);
5148	return;
5149	}
5150	skb_dst_drop(skb);
5151	__skb_pull(skb, len: tcp_hdr(skb)->doff * `4`);
5152
5153	reason = SKB_DROP_REASON_NOT_SPECIFIED;
5154	tp->rx_opt.dsack = `0`;
5155
5156	/ Queue data for delivery to the user.*
5157	* Packets in sequence go to the receive queue.
5158	* Out of sequence packets to the out_of_order_queue.
5159	*/
5160	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
5161	if (tcp_receive_window(tp) == `0`) {
5162	reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
5163	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
5164	goto out_of_window;
5165	}
5166
5167	/ Ok. In sequence. In window. /
5168	queue_and_out:
5169	if (tcp_try_rmem_schedule(sk, skb, size: skb->truesize)) {
5170	/ TODO: maybe ratelimit these WIN 0 ACK ? /
5171	inet_csk(sk)->icsk_ack.pending \|=
5172	(ICSK_ACK_NOMEM \| ICSK_ACK_NOW);
5173	inet_csk_schedule_ack(sk);
5174	sk->sk_data_ready(sk);
5175
5176	if (skb_queue_len(list_: &sk->sk_receive_queue)) {
5177	reason = SKB_DROP_REASON_PROTO_MEM;
5178	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
5179	goto drop;
5180	}
5181	sk_forced_mem_schedule(sk, size: skb->truesize);
5182	}
5183
5184	eaten = tcp_queue_rcv(sk, skb, fragstolen: &fragstolen);
5185	if (skb->len)
5186	tcp_event_data_recv(sk, skb);
5187	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5188	tcp_fin(sk);
5189
5190	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5191	tcp_ofo_queue(sk);
5192
5193	/ RFC5681. 4.2. SHOULD send immediate ACK, when*
5194	* gap in queue is filled.
5195	*/
5196	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5197	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
5198	}
5199
5200	if (tp->rx_opt.num_sacks)
5201	tcp_sack_remove(tp);
5202
5203	tcp_fast_path_check(sk);
5204
5205	if (eaten > `0`)
5206	kfree_skb_partial(skb, head_stolen: fragstolen);
5207	if (!sock_flag(sk, flag: SOCK_DEAD))
5208	tcp_data_ready(sk);
5209	return;
5210	}
5211
5212	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5213	tcp_rcv_spurious_retrans(sk, skb);
5214	/ A retransmit, 2nd most common case. Force an immediate ack. /
5215	reason = SKB_DROP_REASON_TCP_OLD_DATA;
5216	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
5217	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
5218
5219	out_of_window:
5220	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
5221	inet_csk_schedule_ack(sk);
5222	drop:
5223	tcp_drop_reason(sk, skb, reason);
5224	return;
5225	}
5226
5227	/ Out of window. F.e. zero window probe. /
5228	if (!before(TCP_SKB_CB(skb)->seq,
5229	seq2: tp->rcv_nxt + tcp_receive_window(tp))) {
5230	reason = SKB_DROP_REASON_TCP_OVERWINDOW;
5231	goto out_of_window;
5232	}
5233
5234	if (before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
5235	/ Partial packet, seq < rcv_next < end_seq /
5236	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq: tp->rcv_nxt);
5237
5238	/ If window is closed, drop tail of packet. But after*
5239	* remembering D-SACK for its head made in previous line.
5240	*/
5241	if (!tcp_receive_window(tp)) {
5242	reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
5243	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
5244	goto out_of_window;
5245	}
5246	goto queue_and_out;
5247	}
5248
5249	tcp_data_queue_ofo(sk, skb);
5250	}
5251
5252	static struct sk_buff tcp_skb_next(struct* sk_buff skb, struct* sk_buff_head *list)
5253	{
5254	if (list)
5255	return !skb_queue_is_last(list, skb) ? skb->next : NULL;
5256
5257	return skb_rb_next(skb);
5258	}
5259
5260	static struct sk_buff tcp_collapse_one(struct* sock sk, struct* sk_buff *skb,
5261	struct sk_buff_head *list,
5262	struct rb_root *root)
5263	{
5264	struct sk_buff *next = tcp_skb_next(skb, list);
5265
5266	if (list)
5267	__skb_unlink(skb, list);
5268	else
5269	rb_erase(&skb->rbnode, root);
5270
5271	__kfree_skb(skb);
5272	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
5273
5274	return next;
5275	}
5276
5277	/ Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq /
5278	void tcp_rbtree_insert(struct rb_root root, struct* sk_buff *skb)
5279	{
5280	struct rb_node **p = &root->rb_node;
5281	struct rb_node *parent = NULL;
5282	struct sk_buff *skb1;
5283
5284	while (*p) {
5285	parent = *p;
5286	skb1 = rb_to_skb(parent);
5287	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
5288	p = &parent->rb_left;
5289	else
5290	p = &parent->rb_right;
5291	}
5292	rb_link_node(node: &skb->rbnode, parent, rb_link: p);
5293	rb_insert_color(&skb->rbnode, root);
5294	}
5295
5296	/ Collapse contiguous sequence of skbs head..tail with*
5297	* sequence numbers start..end.
5298	*
5299	* If tail is NULL, this means until the end of the queue.
5300	*
5301	* Segments with FIN/SYN are not collapsed (only because this
5302	* simplifies code)
5303	*/
5304	static void
5305	tcp_collapse(struct sock sk, struct* sk_buff_head list, struct* rb_root *root,
5306	struct sk_buff head, struct* sk_buff *tail, u32 start, u32 end)
5307	{
5308	struct sk_buff skb = head, n;
5309	struct sk_buff_head tmp;
5310	bool end_of_skbs;
5311
5312	/ First, check that queue is collapsible and find*
5313	* the point where collapsing can be useful.
5314	*/
5315	restart:
5316	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
5317	n = tcp_skb_next(skb, list);
5318
5319	/ No new bits? It is possible on ofo queue. /
5320	if (!before(seq1: start, TCP_SKB_CB(skb)->end_seq)) {
5321	skb = tcp_collapse_one(sk, skb, list, root);
5322	if (!skb)
5323	break;
5324	goto restart;
5325	}
5326
5327	/ The first skb to collapse is:*
5328	* - not SYN/FIN and
5329	* - bloated or contains data before "start" or
5330	* overlaps to the next one and mptcp allow collapsing.
5331	*/
5332	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
5333	(tcp_win_from_space(sk, space: skb->truesize) > skb->len \|\|
5334	before(TCP_SKB_CB(skb)->seq, seq2: start))) {
5335	end_of_skbs = false;
5336	break;
5337	}
5338
5339	if (n && n != tail && mptcp_skb_can_collapse(to: skb, from: n) &&
5340	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
5341	end_of_skbs = false;
5342	break;
5343	}
5344
5345	/ Decided to skip this, advance start seq. /
5346	start = TCP_SKB_CB(skb)->end_seq;
5347	}
5348	if (end_of_skbs \|\|
5349	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
5350	return;
5351
5352	__skb_queue_head_init(list: &tmp);
5353
5354	while (before(seq1: start, seq2: end)) {
5355	int copy = min_t(int, SKB_MAX_ORDER(`0`, `0`), end - start);
5356	struct sk_buff *nskb;
5357
5358	nskb = alloc_skb(size: copy, GFP_ATOMIC);
5359	if (!nskb)
5360	break;
5361
5362	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
5363	#ifdef CONFIG_TLS_DEVICE
5364	nskb->decrypted = skb->decrypted;
5365	#endif
5366	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
5367	if (list)
5368	__skb_queue_before(list, next: skb, newsk: nskb);
5369	else
5370	__skb_queue_tail(list: &tmp, newsk: nskb); / defer rbtree insertion /
5371	skb_set_owner_r(skb: nskb, sk);
5372	mptcp_skb_ext_move(to: nskb, from: skb);
5373
5374	/ Copy data, releasing collapsed skbs. /
5375	while (copy > `0`) {
5376	int offset = start - TCP_SKB_CB(skb)->seq;
5377	int size = TCP_SKB_CB(skb)->end_seq - start;
5378
5379	BUG_ON(offset < `0`);
5380	if (size > `0`) {
5381	size = min(copy, size);
5382	if (skb_copy_bits(skb, offset, to: skb_put(skb: nskb, len: size), len: size))
5383	BUG();
5384	TCP_SKB_CB(nskb)->end_seq += size;
5385	copy -= size;
5386	start += size;
5387	}
5388	if (!before(seq1: start, TCP_SKB_CB(skb)->end_seq)) {
5389	skb = tcp_collapse_one(sk, skb, list, root);
5390	if (!skb \|\|
5391	skb == tail \|\|
5392	!mptcp_skb_can_collapse(to: nskb, from: skb) \|\|
5393	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)))
5394	goto end;
5395	#ifdef CONFIG_TLS_DEVICE
5396	if (skb->decrypted != nskb->decrypted)
5397	goto end;
5398	#endif
5399	}
5400	}
5401	}
5402	end:
5403	skb_queue_walk_safe(&tmp, skb, n)
5404	tcp_rbtree_insert(root, skb);
5405	}
5406
5407	/ Collapse ofo queue. Algorithm: select contiguous sequence of skbs*
5408	* and tcp_collapse() them until all the queue is collapsed.
5409	*/
5410	static void tcp_collapse_ofo_queue(struct sock *sk)
5411	{
5412	struct tcp_sock *tp = tcp_sk(sk);
5413	u32 range_truesize, sum_tiny = `0`;
5414	struct sk_buff skb, head;
5415	u32 start, end;
5416
5417	skb = skb_rb_first(&tp->out_of_order_queue);
5418	new_range:
5419	if (!skb) {
5420	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
5421	return;
5422	}
5423	start = TCP_SKB_CB(skb)->seq;
5424	end = TCP_SKB_CB(skb)->end_seq;
5425	range_truesize = skb->truesize;
5426
5427	for (head = skb;;) {
5428	skb = skb_rb_next(skb);
5429
5430	/ Range is terminated when we see a gap or when*
5431	* we are at the queue end.
5432	*/
5433	if (!skb \|\|
5434	after(TCP_SKB_CB(skb)->seq, end) \|\|
5435	before(TCP_SKB_CB(skb)->end_seq, seq2: start)) {
5436	/ Do not attempt collapsing tiny skbs /
5437	if (range_truesize != head->truesize \|\|
5438	end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
5439	tcp_collapse(sk, NULL, root: &tp->out_of_order_queue,
5440	head, tail: skb, start, end);
5441	} else {
5442	sum_tiny += range_truesize;
5443	if (sum_tiny > sk->sk_rcvbuf >> `3`)
5444	return;
5445	}
5446	goto new_range;
5447	}
5448
5449	range_truesize += skb->truesize;
5450	if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
5451	start = TCP_SKB_CB(skb)->seq;
5452	if (after(TCP_SKB_CB(skb)->end_seq, end))
5453	end = TCP_SKB_CB(skb)->end_seq;
5454	}
5455	}
5456
5457	/*
5458	* Clean the out-of-order queue to make room.
5459	* We drop high sequences packets to :
5460	* 1) Let a chance for holes to be filled.
5461	* This means we do not drop packets from ooo queue if their sequence
5462	* is before incoming packet sequence.
5463	* 2) not add too big latencies if thousands of packets sit there.
5464	* (But if application shrinks SO_RCVBUF, we could still end up
5465	* freeing whole queue here)
5466	* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
5467	*
5468	* Return true if queue has shrunk.
5469	*/
5470	static bool tcp_prune_ofo_queue(struct sock sk, const* struct sk_buff *in_skb)
5471	{
5472	struct tcp_sock *tp = tcp_sk(sk);
5473	struct rb_node node, prev;
5474	bool pruned = false;
5475	int goal;
5476
5477	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5478	return false;
5479
5480	goal = sk->sk_rcvbuf >> `3`;
5481	node = &tp->ooo_last_skb->rbnode;
5482
5483	do {
5484	struct sk_buff *skb = rb_to_skb(node);
5485
5486	/ If incoming skb would land last in ofo queue, stop pruning. /
5487	if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
5488	break;
5489	pruned = true;
5490	prev = rb_prev(node);
5491	rb_erase(node, &tp->out_of_order_queue);
5492	goal -= skb->truesize;
5493	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
5494	tp->ooo_last_skb = rb_to_skb(prev);
5495	if (!prev \|\| goal <= `0`) {
5496	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
5497	!tcp_under_memory_pressure(sk))
5498	break;
5499	goal = sk->sk_rcvbuf >> `3`;
5500	}
5501	node = prev;
5502	} while (node);
5503
5504	if (pruned) {
5505	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
5506	/ Reset SACK state. A conforming SACK implementation will*
5507	* do the same at a timeout based retransmit. When a connection
5508	* is in a sad state like this, we care only about integrity
5509	* of the connection not performance.
5510	*/
5511	if (tp->rx_opt.sack_ok)
5512	tcp_sack_reset(rx_opt: &tp->rx_opt);
5513	}
5514	return pruned;
5515	}
5516
5517	/ Reduce allocated memory if we can, trying to get*
5518	* the socket within its memory limits again.
5519	*
5520	* Return less than zero if we should start dropping frames
5521	* until the socket owning process reads some of the data
5522	* to stabilize the situation.
5523	*/
5524	static int tcp_prune_queue(struct sock sk, const* struct sk_buff *in_skb)
5525	{
5526	struct tcp_sock *tp = tcp_sk(sk);
5527
5528	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
5529
5530	if (atomic_read(v: &sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
5531	tcp_clamp_window(sk);
5532	else if (tcp_under_memory_pressure(sk))
5533	tcp_adjust_rcv_ssthresh(sk);
5534
5535	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5536	return `0`;
5537
5538	tcp_collapse_ofo_queue(sk);
5539	if (!skb_queue_empty(list: &sk->sk_receive_queue))
5540	tcp_collapse(sk, list: &sk->sk_receive_queue, NULL,
5541	head: skb_peek(list_: &sk->sk_receive_queue),
5542	NULL,
5543	start: tp->copied_seq, end: tp->rcv_nxt);
5544
5545	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5546	return `0`;
5547
5548	/ Collapsing did not help, destructive actions follow.*
5549	* This must not ever occur. */
5550
5551	tcp_prune_ofo_queue(sk, in_skb);
5552
5553	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5554	return `0`;
5555
5556	/ If we are really being abused, tell the caller to silently*
5557	* drop receive data on the floor. It will get retransmitted
5558	* and hopefully then we'll have sufficient space.
5559	*/
5560	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
5561
5562	/ Massive buffer overcommit. /
5563	tp->pred_flags = `0`;
5564	return -`1`;
5565	}
5566
5567	static bool tcp_should_expand_sndbuf(struct sock *sk)
5568	{
5569	const struct tcp_sock *tp = tcp_sk(sk);
5570
5571	/ If the user specified a specific send buffer setting, do*
5572	* not modify it.
5573	*/
5574	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
5575	return false;
5576
5577	/ If we are under global TCP memory pressure, do not expand. /
5578	if (tcp_under_memory_pressure(sk)) {
5579	int unused_mem = sk_unused_reserved_mem(sk);
5580
5581	/ Adjust sndbuf according to reserved mem. But make sure*
5582	* it never goes below SOCK_MIN_SNDBUF.
5583	* See sk_stream_moderate_sndbuf() for more details.
5584	*/
5585	if (unused_mem > SOCK_MIN_SNDBUF)
5586	WRITE_ONCE(sk->sk_sndbuf, unused_mem);
5587
5588	return false;
5589	}
5590
5591	/ If we are under soft global TCP memory pressure, do not expand. /
5592	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, index: `0`))
5593	return false;
5594
5595	/ If we filled the congestion window, do not expand. /
5596	if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
5597	return false;
5598
5599	return true;
5600	}
5601
5602	static void tcp_new_space(struct sock *sk)
5603	{
5604	struct tcp_sock *tp = tcp_sk(sk);
5605
5606	if (tcp_should_expand_sndbuf(sk)) {
5607	tcp_sndbuf_expand(sk);
5608	tp->snd_cwnd_stamp = tcp_jiffies32;
5609	}
5610
5611	INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
5612	}
5613
5614	/ Caller made space either from:*
5615	* 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5616	* 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5617	*
5618	* We might be able to generate EPOLLOUT to the application if:
5619	* 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5620	* 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5621	* small enough that tcp_stream_memory_free() decides it
5622	* is time to generate EPOLLOUT.
5623	*/
5624	void tcp_check_space(struct sock *sk)
5625	{
5626	/ pairs with tcp_poll() /
5627	smp_mb();
5628	if (sk->sk_socket &&
5629	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5630	tcp_new_space(sk);
5631	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5632	tcp_chrono_stop(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
5633	}
5634	}
5635
/* Push any pending frames out, then check whether send space freed up. */
static inline void tcp_data_snd_check(struct sock *sk)
{
	tcp_push_pending_frames(sk);
	tcp_check_space(sk);
}
5641
5642	/*
5643	* Check if sending an ack is needed.
5644	*/
5645	static void __tcp_ack_snd_check(struct sock sk, int* ofo_possible)
5646	{
5647	struct tcp_sock *tp = tcp_sk(sk);
5648	unsigned long rtt, delay;
5649
5650	/ More than one full frame received... /
5651	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5652	/ ... and right edge of window advances far enough.*
5653	* (tcp_recvmsg() will send ACK otherwise).
5654	* If application uses SO_RCVLOWAT, we want send ack now if
5655	* we have not received enough bytes to satisfy the condition.
5656	*/
5657	(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat \|\|
5658	__tcp_select_window(sk) >= tp->rcv_wnd)) \|\|
5659	/ We ACK each frame or... /
5660	tcp_in_quickack_mode(sk) \|\|
5661	/ Protocol state mandates a one-time immediate ACK /
5662	inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
5663	/ If we are running from __release_sock() in user context,*
5664	* Defer the ack until tcp_release_cb().
5665	*/
5666	if (sock_owned_by_user_nocheck(sk) &&
5667	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
5668	set_bit(nr: TCP_ACK_DEFERRED, addr: &sk->sk_tsq_flags);
5669	return;
5670	}
5671	send_now:
5672	tcp_send_ack(sk);
5673	return;
5674	}
5675
5676	if (!ofo_possible \|\| RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5677	tcp_send_delayed_ack(sk);
5678	return;
5679	}
5680
5681	if (!tcp_is_sack(tp) \|\|
5682	tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
5683	goto send_now;
5684
5685	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
5686	tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5687	tp->dup_ack_counter = `0`;
5688	}
5689	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5690	tp->dup_ack_counter++;
5691	goto send_now;
5692	}
5693	tp->compressed_ack++;
5694	if (hrtimer_is_queued(timer: &tp->compressed_ack_timer))
5695	return;
5696
5697	/ compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns /
5698
5699	rtt = tp->rcv_rtt_est.rtt_us;
5700	if (tp->srtt_us && tp->srtt_us < rtt)
5701	rtt = tp->srtt_us;
5702
5703	delay = min_t(unsigned long,
5704	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
5705	rtt * (NSEC_PER_USEC >> `3`)/`20`);
5706	sock_hold(sk);
5707	hrtimer_start_range_ns(timer: &tp->compressed_ack_timer, tim: ns_to_ktime(ns: delay),
5708	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5709	mode: HRTIMER_MODE_REL_PINNED_SOFT);
5710	}
5711
/* Run the ACK decision logic, but only if an ACK is scheduled at all.
 * Out-of-order handling is enabled (ofo_possible = 1).
 */
static inline void tcp_ack_snd_check(struct sock *sk)
{
	if (!inet_csk_ack_scheduled(sk)) {
		/* We sent a data segment already. */
		return;
	}
	__tcp_ack_snd_check(sk, 1);
}
5720
5721	/*
5722	* This routine is only called when we have urgent data
5723	* signaled. Its the 'slow' part of tcp_urg. It could be
5724	* moved inline now as tcp_urg is only called from one
5725	* place. We handle URGent data wrong. We have to - as
5726	* BSD still doesn't use the correction from RFC961.
5727	* For 1003.1g we should support a new option TCP_STDURG to permit
5728	* either form (or just set the sysctl tcp_stdurg).
5729	*/
5730
5731	static void tcp_check_urg(struct sock sk, const* struct tcphdr *th)
5732	{
5733	struct tcp_sock *tp = tcp_sk(sk);
5734	u32 ptr = ntohs(th->urg_ptr);
5735
5736	if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
5737	ptr--;
5738	ptr += ntohl(th->seq);
5739
5740	/ Ignore urgent data that we've already seen and read. /
5741	if (after(tp->copied_seq, ptr))
5742	return;
5743
5744	/ Do not replay urg ptr.*
5745	*
5746	* NOTE: interesting situation not covered by specs.
5747	* Misbehaving sender may send urg ptr, pointing to segment,
5748	* which we already have in ofo queue. We are not able to fetch
5749	* such data and will stay in TCP_URG_NOTYET until will be eaten
5750	* by recvmsg(). Seems, we are not obliged to handle such wicked
5751	* situations. But it is worth to think about possibility of some
5752	* DoSes using some hypothetical application level deadlock.
5753	*/
5754	if (before(seq1: ptr, seq2: tp->rcv_nxt))
5755	return;
5756
5757	/ Do we already have a newer (or duplicate) urgent pointer? /
5758	if (tp->urg_data && !after(ptr, tp->urg_seq))
5759	return;
5760
5761	/ Tell the world about our new urgent pointer. /
5762	sk_send_sigurg(sk);
5763
5764	/ We may be adding urgent data when the last byte read was*
5765	* urgent. To do this requires some care. We cannot just ignore
5766	* tp->copied_seq since we would read the last urgent byte again
5767	* as data, nor can we alter copied_seq until this data arrives
5768	* or we break the semantics of SIOCATMARK (and thus sockatmark())
5769	*
5770	* NOTE. Double Dutch. Rendering to plain English: author of comment
5771	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
5772	* and expect that both A and B disappear from stream. This is _wrong_.
5773	* Though this happens in BSD with high probability, this is occasional.
5774	* Any application relying on this is buggy. Note also, that fix "works"
5775	* only in this artificial test. Insert some normal data between A and B and we will
5776	* decline of BSD again. Verdict: it is better to remove to trap
5777	* buggy users.
5778	*/
5779	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5780	!sock_flag(sk, flag: SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5781	struct sk_buff *skb = skb_peek(list_: &sk->sk_receive_queue);
5782	tp->copied_seq++;
5783	if (skb && !before(seq1: tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5784	__skb_unlink(skb, list: &sk->sk_receive_queue);
5785	__kfree_skb(skb);
5786	}
5787	}
5788
5789	WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
5790	WRITE_ONCE(tp->urg_seq, ptr);
5791
5792	/ Disable header prediction. /
5793	tp->pred_flags = `0`;
5794	}
5795
5796	/ This is the 'fast' part of urgent handling. /
5797	static void tcp_urg(struct sock sk, struct* sk_buff skb, const* struct tcphdr *th)
5798	{
5799	struct tcp_sock *tp = tcp_sk(sk);
5800
5801	/ Check if we get a new urgent pointer - normally not. /
5802	if (unlikely(th->urg))
5803	tcp_check_urg(sk, th);
5804
5805	/ Do we wait for any urgent data? - normally not... /
5806	if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
5807	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * `4`) -
5808	th->syn;
5809
5810	/ Is the urgent pointer pointing into this packet? /
5811	if (ptr < skb->len) {
5812	u8 tmp;
5813	if (skb_copy_bits(skb, offset: ptr, to: &tmp, len: `1`))
5814	BUG();
5815	WRITE_ONCE(tp->urg_data, TCP_URG_VALID \| tmp);
5816	if (!sock_flag(sk, flag: SOCK_DEAD))
5817	sk->sk_data_ready(sk);
5818	}
5819	}
5820	}
5821
5822	/ Accept RST for rcv_nxt - 1 after a FIN.*
5823	* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
5824	* FIN is sent followed by a RST packet. The RST is sent with the same
5825	* sequence number as the FIN, and thus according to RFC 5961 a challenge
5826	* ACK should be sent. However, Mac OSX rate limits replies to challenge
5827	* ACKs on the closed socket. In addition middleboxes can drop either the
5828	* challenge ACK or a subsequent RST.
5829	*/
5830	static bool tcp_reset_check(const struct sock sk, const* struct sk_buff *skb)
5831	{
5832	const struct tcp_sock *tp = tcp_sk(sk);
5833
5834	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - `1`) &&
5835	(`1` << sk->sk_state) & (TCPF_CLOSE_WAIT \| TCPF_LAST_ACK \|
5836	TCPF_CLOSING));
5837	}
5838
5839	/ Does PAWS and seqno based validation of an incoming segment, flags will*
5840	* play significant role here.
5841	*/
5842	static bool tcp_validate_incoming(struct sock sk, struct* sk_buff *skb,
5843	const struct tcphdr th, int* syn_inerr)
5844	{
5845	struct tcp_sock *tp = tcp_sk(sk);
5846	SKB_DR(reason);
5847
5848	/ RFC1323: H1. Apply PAWS check first. /
5849	if (tcp_fast_parse_options(net: sock_net(sk), skb, th, tp) &&
5850	tp->rx_opt.saw_tstamp &&
5851	tcp_paws_discard(sk, skb)) {
5852	if (!th->rst) {
5853	if (unlikely(th->syn))
5854	goto syn_challenge;
5855	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
5856	if (!tcp_oow_rate_limited(net: sock_net(sk), skb,
5857	mib_idx: LINUX_MIB_TCPACKSKIPPEDPAWS,
5858	last_oow_ack_time: &tp->last_oow_ack_time))
5859	tcp_send_dupack(sk, skb);
5860	SKB_DR_SET(reason, TCP_RFC7323_PAWS);
5861	goto discard;
5862	}
5863	/ Reset is accepted even if it did not pass PAWS. /
5864	}
5865
5866	/ Step 1: check sequence number /
5867	reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
5868	if (reason) {
5869	/ RFC793, page 37: "In all states except SYN-SENT, all reset*
5870	* (RST) segments are validated by checking their SEQ-fields."
5871	* And page 69: "If an incoming segment is not acceptable,
5872	* an acknowledgment should be sent in reply (unless the RST
5873	* bit is set, if so drop the segment and return)".
5874	*/
5875	if (!th->rst) {
5876	if (th->syn)
5877	goto syn_challenge;
5878	if (!tcp_oow_rate_limited(net: sock_net(sk), skb,
5879	mib_idx: LINUX_MIB_TCPACKSKIPPEDSEQ,
5880	last_oow_ack_time: &tp->last_oow_ack_time))
5881	tcp_send_dupack(sk, skb);
5882	} else if (tcp_reset_check(sk, skb)) {
5883	goto reset;
5884	}
5885	goto discard;
5886	}
5887
5888	/ Step 2: check RST bit /
5889	if (th->rst) {
5890	/ RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a*
5891	* FIN and SACK too if available):
5892	* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
5893	* the right-most SACK block,
5894	* then
5895	* RESET the connection
5896	* else
5897	* Send a challenge ACK
5898	*/
5899	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt \|\|
5900	tcp_reset_check(sk, skb))
5901	goto reset;
5902
5903	if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > `0`) {
5904	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
5905	int max_sack = sp[`0`].end_seq;
5906	int this_sack;
5907
5908	for (this_sack = `1`; this_sack < tp->rx_opt.num_sacks;
5909	++this_sack) {
5910	max_sack = after(sp[this_sack].end_seq,
5911	max_sack) ?
5912	sp[this_sack].end_seq : max_sack;
5913	}
5914
5915	if (TCP_SKB_CB(skb)->seq == max_sack)
5916	goto reset;
5917	}
5918
5919	/ Disable TFO if RST is out-of-order*
5920	* and no data has been received
5921	* for current active TFO socket
5922	*/
5923	if (tp->syn_fastopen && !tp->data_segs_in &&
5924	sk->sk_state == TCP_ESTABLISHED)
5925	tcp_fastopen_active_disable(sk);
5926	tcp_send_challenge_ack(sk);
5927	SKB_DR_SET(reason, TCP_RESET);
5928	goto discard;
5929	}
5930
5931	/ step 3: check security and precedence [ignored] /
5932
5933	/ step 4: Check for a SYN*
5934	* RFC 5961 4.2 : Send a challenge ack
5935	*/
5936	if (th->syn) {
5937	syn_challenge:
5938	if (syn_inerr)
5939	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
5940	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
5941	tcp_send_challenge_ack(sk);
5942	SKB_DR_SET(reason, TCP_INVALID_SYN);
5943	goto discard;
5944	}
5945
5946	bpf_skops_parse_hdr(sk, skb);
5947
5948	return true;
5949
5950	discard:
5951	tcp_drop_reason(sk, skb, reason);
5952	return false;
5953
5954	reset:
5955	tcp_reset(sk, skb);
5956	__kfree_skb(skb);
5957	return false;
5958	}
5959
5960	/*
5961	* TCP receive function for the ESTABLISHED state.
5962	*
5963	* It is split into a fast path and a slow path. The fast path is
5964	* disabled when:
5965	* - A zero window was announced from us - zero window probing
5966	* is only handled properly in the slow path.
5967	* - Out of order segments arrived.
5968	* - Urgent data is expected.
5969	* - There is no buffer space left
5970	* - Unexpected TCP flags/window values/header lengths are received
5971	* (detected by checking the TCP header against pred_flags)
5972	* - Data is sent in both directions. Fast path only supports pure senders
5973	* or pure receivers (this means either the sequence number or the ack
5974	* value must stay constant)
5975	* - Unexpected TCP option.
5976	*
5977	* When these conditions are not satisfied it drops into a standard
5978	* receive procedure patterned after RFC793 to handle all cases.
5979	* The first three cases are guaranteed by proper pred_flags setting,
5980	* the rest is checked inline. Fast processing is turned on in
5981	* tcp_data_queue when everything is OK.
5982	*/
5983	void tcp_rcv_established(struct sock sk, struct* sk_buff *skb)
5984	{
5985	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
5986	const struct tcphdr th = (const* struct tcphdr *)skb->data;
5987	struct tcp_sock *tp = tcp_sk(sk);
5988	unsigned int len = skb->len;
5989
5990	/ TCP congestion window tracking /
5991	trace_tcp_probe(sk, skb);
5992
5993	tcp_mstamp_refresh(tp);
5994	if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
5995	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5996	/*
5997	* Header prediction.
5998	* The code loosely follows the one in the famous
5999	* "30 instruction TCP receive" Van Jacobson mail.
6000	*
6001	* Van's trick is to deposit buffers into socket queue
6002	* on a device interrupt, to call tcp_recv function
6003	* on the receive process context and checksum and copy
6004	* the buffer to user space. smart...
6005	*
6006	* Our current scheme is not silly either but we take the
6007	* extra cost of the net_bh soft interrupt processing...
6008	* We do checksum and copy also but from device to kernel.
6009	*/
6010
6011	tp->rx_opt.saw_tstamp = `0`;
6012
6013	/ pred_flags is 0xS?10 << 16 + snd_wnd*
6014	* if header_prediction is to be made
6015	* 'S' will always be tp->tcp_header_len >> 2
6016	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
6017	* turn it off (when there are holes in the receive
6018	* space for instance)
6019	* PSH flag is ignored.
6020	*/
6021
6022	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
6023	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
6024	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6025	int tcp_header_len = tp->tcp_header_len;
6026
6027	/ Timestamp header prediction: tcp_header_len*
6028	* is automatically equal to th->doff*4 due to pred_flags
6029	* match.
6030	*/
6031
6032	/ Check timestamp /
6033	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
6034	/ No? Slow path! /
6035	if (!tcp_parse_aligned_timestamp(tp, th))
6036	goto slow_path;
6037
6038	/ If PAWS failed, check it more carefully in slow path /
6039	if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < `0`)
6040	goto slow_path;
6041
6042	/ DO NOT update ts_recent here, if checksum fails*
6043	* and timestamp was corrupted part, it will result
6044	* in a hung connection since we will drop all
6045	* future packets due to the PAWS test.
6046	*/
6047	}
6048
6049	if (len <= tcp_header_len) {
6050	/ Bulk data transfer: sender /
6051	if (len == tcp_header_len) {
6052	/ Predicted packet is in window by definition.*
6053	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
6054	* Hence, check seq<=rcv_wup reduces to:
6055	*/
6056	if (tcp_header_len ==
6057	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
6058	tp->rcv_nxt == tp->rcv_wup)
6059	tcp_store_ts_recent(tp);
6060
6061	/ We know that such packets are checksummed*
6062	* on entry.
6063	*/
6064	tcp_ack(sk, skb, flag: `0`);
6065	__kfree_skb(skb);
6066	tcp_data_snd_check(sk);
6067	/ When receiving pure ack in fast path, update*
6068	* last ts ecr directly instead of calling
6069	* tcp_rcv_rtt_measure_ts()
6070	*/
6071	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
6072	return;
6073	} else { / Header too small /
6074	reason = SKB_DROP_REASON_PKT_TOO_SMALL;
6075	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
6076	goto discard;
6077	}
6078	} else {
6079	int eaten = `0`;
6080	bool fragstolen = false;
6081
6082	if (tcp_checksum_complete(skb))
6083	goto csum_error;
6084
6085	if ((int)skb->truesize > sk->sk_forward_alloc)
6086	goto step5;
6087
6088	/ Predicted packet is in window by definition.*
6089	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
6090	* Hence, check seq<=rcv_wup reduces to:
6091	*/
6092	if (tcp_header_len ==
6093	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
6094	tp->rcv_nxt == tp->rcv_wup)
6095	tcp_store_ts_recent(tp);
6096
6097	tcp_rcv_rtt_measure_ts(sk, skb);
6098
6099	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
6100
6101	/ Bulk data transfer: receiver /
6102	skb_dst_drop(skb);
6103	__skb_pull(skb, len: tcp_header_len);
6104	eaten = tcp_queue_rcv(sk, skb, fragstolen: &fragstolen);
6105
6106	tcp_event_data_recv(sk, skb);
6107
6108	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
6109	/ Well, only one small jumplet in fast path... /
6110	tcp_ack(sk, skb, FLAG_DATA);
6111	tcp_data_snd_check(sk);
6112	if (!inet_csk_ack_scheduled(sk))
6113	goto no_ack;
6114	} else {
6115	tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
6116	}
6117
6118	__tcp_ack_snd_check(sk, ofo_possible: `0`);
6119	no_ack:
6120	if (eaten)
6121	kfree_skb_partial(skb, head_stolen: fragstolen);
6122	tcp_data_ready(sk);
6123	return;
6124	}
6125	}
6126
6127	slow_path:
6128	if (len < (th->doff << `2`) \|\| tcp_checksum_complete(skb))
6129	goto csum_error;
6130
6131	if (!th->ack && !th->rst && !th->syn) {
6132	reason = SKB_DROP_REASON_TCP_FLAGS;
6133	goto discard;
6134	}
6135
6136	/*
6137	* Standard slow path.
6138	*/
6139
6140	if (!tcp_validate_incoming(sk, skb, th, syn_inerr: `1`))
6141	return;
6142
6143	step5:
6144	reason = tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT);
6145	if ((int)reason < `0`) {
6146	reason = -reason;
6147	goto discard;
6148	}
6149	tcp_rcv_rtt_measure_ts(sk, skb);
6150
6151	/ Process urgent data. /
6152	tcp_urg(sk, skb, th);
6153
6154	/ step 7: process the segment text /
6155	tcp_data_queue(sk, skb);
6156
6157	tcp_data_snd_check(sk);
6158	tcp_ack_snd_check(sk);
6159	return;
6160
6161	csum_error:
6162	reason = SKB_DROP_REASON_TCP_CSUM;
6163	trace_tcp_bad_csum(skb);
6164	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
6165	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
6166
6167	discard:
6168	tcp_drop_reason(sk, skb, reason);
6169	}
6170	EXPORT_SYMBOL(tcp_rcv_established);
6171
6172	void tcp_init_transfer(struct sock sk, int* bpf_op, struct sk_buff *skb)
6173	{
6174	struct inet_connection_sock *icsk = inet_csk(sk);
6175	struct tcp_sock *tp = tcp_sk(sk);
6176
6177	tcp_mtup_init(sk);
6178	icsk->icsk_af_ops->rebuild_header(sk);
6179	tcp_init_metrics(sk);
6180
6181	/ Initialize the congestion window to start the transfer.*
6182	* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
6183	* retransmitted. In light of RFC6298 more aggressive 1sec
6184	* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
6185	* retransmission has occurred.
6186	*/
6187	if (tp->total_retrans > `1` && tp->undo_marker)
6188	tcp_snd_cwnd_set(tp, val: `1`);
6189	else
6190	tcp_snd_cwnd_set(tp, val: tcp_init_cwnd(tp, dst: __sk_dst_get(sk)));
6191	tp->snd_cwnd_stamp = tcp_jiffies32;
6192
6193	bpf_skops_established(sk, bpf_op, skb);
6194	/ Initialize congestion control unless BPF initialized it already: /
6195	if (!icsk->icsk_ca_initialized)
6196	tcp_init_congestion_control(sk);
6197	tcp_init_buffer_space(sk);
6198	}
6199
6200	void tcp_finish_connect(struct sock sk, struct* sk_buff *skb)
6201	{
6202	struct tcp_sock *tp = tcp_sk(sk);
6203	struct inet_connection_sock *icsk = inet_csk(sk);
6204
6205	tcp_ao_finish_connect(sk, skb);
6206	tcp_set_state(sk, state: TCP_ESTABLISHED);
6207	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
6208
6209	if (skb) {
6210	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
6211	security_inet_conn_established(sk, skb);
6212	sk_mark_napi_id(sk, skb);
6213	}
6214
6215	tcp_init_transfer(sk, bpf_op: BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
6216
6217	/ Prevent spurious tcp_cwnd_restart() on first data*
6218	* packet.
6219	*/
6220	tp->lsndtime = tcp_jiffies32;
6221
6222	if (sock_flag(sk, flag: SOCK_KEEPOPEN))
6223	inet_csk_reset_keepalive_timer(sk, timeout: keepalive_time_when(tp));
6224
6225	if (!tp->rx_opt.snd_wscale)
6226	__tcp_fast_path_on(tp, snd_wnd: tp->snd_wnd);
6227	else
6228	tp->pred_flags = `0`;
6229	}
6230
6231	static bool tcp_rcv_fastopen_synack(struct sock sk, struct* sk_buff *synack,
6232	struct tcp_fastopen_cookie *cookie)
6233	{
6234	struct tcp_sock *tp = tcp_sk(sk);
6235	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
6236	u16 mss = tp->rx_opt.mss_clamp, try_exp = `0`;
6237	bool syn_drop = false;
6238
6239	if (mss == tp->rx_opt.user_mss) {
6240	struct tcp_options_received opt;
6241
6242	/ Get original SYNACK MSS value if user MSS sets mss_clamp /
6243	tcp_clear_options(rx_opt: &opt);
6244	opt.user_mss = opt.mss_clamp = `0`;
6245	tcp_parse_options(sock_net(sk), synack, &opt, `0`, NULL);
6246	mss = opt.mss_clamp;
6247	}
6248
6249	if (!tp->syn_fastopen) {
6250	/ Ignore an unsolicited cookie /
6251	cookie->len = -`1`;
6252	} else if (tp->total_retrans) {
6253	/ SYN timed out and the SYN-ACK neither has a cookie nor*
6254	* acknowledges data. Presumably the remote received only
6255	* the retransmitted (regular) SYNs: either the original
6256	* SYN-data or the corresponding SYN-ACK was dropped.
6257	*/
6258	syn_drop = (cookie->len < `0` && data);
6259	} else if (cookie->len < `0` && !tp->syn_data) {
6260	/ We requested a cookie but didn't get it. If we did not use*
6261	* the (old) exp opt format then try so next time (try_exp=1).
6262	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
6263	*/
6264	try_exp = tp->syn_fastopen_exp ? `2` : `1`;
6265	}
6266
6267	tcp_fastopen_cache_set(sk, mss, cookie, syn_lost: syn_drop, try_exp);
6268
6269	if (data) { / Retransmit unacked data in SYN /
6270	if (tp->total_retrans)
6271	tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6272	else
6273	tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
6274	skb_rbtree_walk_from(data)
6275	tcp_mark_skb_lost(sk, skb: data);
6276	tcp_xmit_retransmit_queue(sk);
6277	NET_INC_STATS(sock_net(sk),
6278	LINUX_MIB_TCPFASTOPENACTIVEFAIL);
6279	return true;
6280	}
6281	tp->syn_data_acked = tp->syn_data;
6282	if (tp->syn_data_acked) {
6283	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
6284	/ SYN-data is counted as two separate packets in tcp_ack() /
6285	if (tp->delivered > `1`)
6286	--tp->delivered;
6287	}
6288
6289	tcp_fastopen_add_skb(sk, skb: synack);
6290
6291	return false;
6292	}
6293
6294	static void smc_check_reset_syn(struct tcp_sock *tp)
6295	{
6296	#if IS_ENABLED(CONFIG_SMC)
6297	if (static_branch_unlikely(&tcp_have_smc)) {
6298	if (tp->syn_smc && !tp->rx_opt.smc_ok)
6299	tp->syn_smc = `0`;
6300	}
6301	#endif
6302	}
6303
6304	static void tcp_try_undo_spurious_syn(struct sock *sk)
6305	{
6306	struct tcp_sock *tp = tcp_sk(sk);
6307	u32 syn_stamp;
6308
6309	/ undo_marker is set when SYN or SYNACK times out. The timeout is*
6310	* spurious if the ACK's timestamp option echo value matches the
6311	* original SYN timestamp.
6312	*/
6313	syn_stamp = tp->retrans_stamp;
6314	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6315	syn_stamp == tp->rx_opt.rcv_tsecr)
6316	tp->undo_marker = `0`;
6317	}
6318
6319	static int tcp_rcv_synsent_state_process(struct sock sk, struct* sk_buff *skb,
6320	const struct tcphdr *th)
6321	{
6322	struct inet_connection_sock *icsk = inet_csk(sk);
6323	struct tcp_sock *tp = tcp_sk(sk);
6324	struct tcp_fastopen_cookie foc = { .len = -`1` };
6325	int saved_clamp = tp->rx_opt.mss_clamp;
6326	bool fastopen_fail;
6327	SKB_DR(reason);
6328
6329	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, `0`, &foc);
6330	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
6331	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
6332
6333	if (th->ack) {
6334	/ rfc793:*
6335	* "If the state is SYN-SENT then
6336	* first check the ACK bit
6337	* If the ACK bit is set
6338	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
6339	* a reset (unless the RST bit is set, if so drop
6340	* the segment and return)"
6341	*/
6342	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
6343	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6344	/ Previous FIN/ACK or RST/ACK might be ignored. /
6345	if (icsk->icsk_retransmits == `0`)
6346	inet_csk_reset_xmit_timer(sk,
6347	ICSK_TIME_RETRANS,
6348	TCP_TIMEOUT_MIN, TCP_RTO_MAX);
6349	goto reset_and_undo;
6350	}
6351
6352	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
6353	!between(seq1: tp->rx_opt.rcv_tsecr, seq2: tp->retrans_stamp,
6354	seq3: tcp_time_stamp_ts(tp))) {
6355	NET_INC_STATS(sock_net(sk),
6356	LINUX_MIB_PAWSACTIVEREJECTED);
6357	goto reset_and_undo;
6358	}
6359
6360	/ Now ACK is acceptable.*
6361	*
6362	* "If the RST bit is set
6363	* If the ACK was acceptable then signal the user "error:
6364	* connection reset", drop the segment, enter CLOSED state,
6365	* delete TCB, and return."
6366	*/
6367
6368	if (th->rst) {
6369	tcp_reset(sk, skb);
6370	consume:
6371	__kfree_skb(skb);
6372	return `0`;
6373	}
6374
6375	/ rfc793:*
6376	* "fifth, if neither of the SYN or RST bits is set then
6377	* drop the segment and return."
6378	*
6379	* See note below!
6380	* --ANK(990513)
6381	*/
6382	if (!th->syn) {
6383	SKB_DR_SET(reason, TCP_FLAGS);
6384	goto discard_and_undo;
6385	}
6386	/ rfc793:*
6387	* "If the SYN bit is on ...
6388	* are acceptable then ...
6389	* (our SYN has been ACKed), change the connection
6390	* state to ESTABLISHED..."
6391	*/
6392
6393	tcp_ecn_rcv_synack(tp, th);
6394
6395	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6396	tcp_try_undo_spurious_syn(sk);
6397	tcp_ack(sk, skb, FLAG_SLOWPATH);
6398
6399	/ Ok.. it's good. Set up sequence numbers and*
6400	* move to established.
6401	*/
6402	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + `1`);
6403	tp->rcv_wup = TCP_SKB_CB(skb)->seq + `1`;
6404
6405	/ RFC1323: The window in SYN & SYN/ACK segments is*
6406	* never scaled.
6407	*/
6408	tp->snd_wnd = ntohs(th->window);
6409
6410	if (!tp->rx_opt.wscale_ok) {
6411	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = `0`;
6412	tp->window_clamp = min(tp->window_clamp, `65535U`);
6413	}
6414
6415	if (tp->rx_opt.saw_tstamp) {
6416	tp->rx_opt.tstamp_ok = `1`;
6417	tp->tcp_header_len =
6418	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6419	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6420	tcp_store_ts_recent(tp);
6421	} else {
6422	tp->tcp_header_len = sizeof(struct tcphdr);
6423	}
6424
6425	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
6426	tcp_initialize_rcv_mss(sk);
6427
6428	/ Remember, tcp_poll() does not lock socket!*
6429	* Change state from SYN-SENT only after copied_seq
6430	* is initialized. */
6431	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6432
6433	smc_check_reset_syn(tp);
6434
6435	smp_mb();
6436
6437	tcp_finish_connect(sk, skb);
6438
6439	fastopen_fail = (tp->syn_fastopen \|\| tp->syn_data) &&
6440	tcp_rcv_fastopen_synack(sk, synack: skb, cookie: &foc);
6441
6442	if (!sock_flag(sk, flag: SOCK_DEAD)) {
6443	sk->sk_state_change(sk);
6444	sk_wake_async(sk, how: SOCK_WAKE_IO, POLL_OUT);
6445	}
6446	if (fastopen_fail)
6447	return -`1`;
6448	if (sk->sk_write_pending \|\|
6449	READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) \|\|
6450	inet_csk_in_pingpong_mode(sk)) {
6451	/ Save one ACK. Data will be ready after*
6452	* several ticks, if write_pending is set.
6453	*
6454	* It may be deleted, but with this feature tcpdumps
6455	* look so _wonderfully_ clever, that I was not able
6456	* to stand against the temptation 8) --ANK
6457	*/
6458	inet_csk_schedule_ack(sk);
6459	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
6460	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
6461	TCP_DELACK_MAX, TCP_RTO_MAX);
6462	goto consume;
6463	}
6464	tcp_send_ack(sk);
6465	return -`1`;
6466	}
6467
6468	/ No ACK in the segment /
6469
6470	if (th->rst) {
6471	/ rfc793:*
6472	* "If the RST bit is set
6473	*
6474	* Otherwise (no ACK) drop the segment and return."
6475	*/
6476	SKB_DR_SET(reason, TCP_RESET);
6477	goto discard_and_undo;
6478	}
6479
6480	/ PAWS check. /
6481	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
6482	tcp_paws_reject(rx_opt: &tp->rx_opt, rst: `0`)) {
6483	SKB_DR_SET(reason, TCP_RFC7323_PAWS);
6484	goto discard_and_undo;
6485	}
6486	if (th->syn) {
6487	/ We see SYN without ACK. It is attempt of*
6488	* simultaneous connect with crossed SYNs.
6489	* Particularly, it can be connect to self.
6490	*/
6491	#ifdef CONFIG_TCP_AO
6492	struct tcp_ao_info *ao;
6493
6494	ao = rcu_dereference_protected(tp->ao_info,
6495	lockdep_sock_is_held(sk));
6496	if (ao) {
6497	WRITE_ONCE(ao->risn, th->seq);
6498	ao->rcv_sne = `0`;
6499	}
6500	#endif
6501	tcp_set_state(sk, state: TCP_SYN_RECV);
6502
6503	if (tp->rx_opt.saw_tstamp) {
6504	tp->rx_opt.tstamp_ok = `1`;
6505	tcp_store_ts_recent(tp);
6506	tp->tcp_header_len =
6507	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6508	} else {
6509	tp->tcp_header_len = sizeof(struct tcphdr);
6510	}
6511
6512	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + `1`);
6513	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6514	tp->rcv_wup = TCP_SKB_CB(skb)->seq + `1`;
6515
6516	/ RFC1323: The window in SYN & SYN/ACK segments is*
6517	* never scaled.
6518	*/
6519	tp->snd_wnd = ntohs(th->window);
6520	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
6521	tp->max_window = tp->snd_wnd;
6522
6523	tcp_ecn_rcv_syn(tp, th);
6524
6525	tcp_mtup_init(sk);
6526	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
6527	tcp_initialize_rcv_mss(sk);
6528
6529	tcp_send_synack(sk);
6530	#if 0
6531	/ Note, we could accept data and URG from this segment.*
6532	* There are no obstacles to make this (except that we must
6533	* either change tcp_recvmsg() to prevent it from returning data
6534	* before 3WHS completes per RFC793, or employ TCP Fast Open).
6535	*
6536	* However, if we ignore data in ACKless segments sometimes,
6537	* we have no reasons to accept it sometimes.
6538	* Also, seems the code doing it in step6 of tcp_rcv_state_process
6539	* is not flawless. So, discard packet for sanity.
6540	* Uncomment this return to process the data.
6541	*/
6542	return -`1`;
6543	#else
6544	goto consume;
6545	#endif
6546	}
6547	/ "fifth, if neither of the SYN or RST bits is set then*
6548	* drop the segment and return."
6549	*/
6550
6551	discard_and_undo:
6552	tcp_clear_options(rx_opt: &tp->rx_opt);
6553	tp->rx_opt.mss_clamp = saved_clamp;
6554	tcp_drop_reason(sk, skb, reason);
6555	return `0`;
6556
6557	reset_and_undo:
6558	tcp_clear_options(rx_opt: &tp->rx_opt);
6559	tp->rx_opt.mss_clamp = saved_clamp;
6560	return `1`;
6561	}
6562
6563	static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6564	{
6565	struct tcp_sock *tp = tcp_sk(sk);
6566	struct request_sock *req;
6567
6568	/ If we are still handling the SYNACK RTO, see if timestamp ECR allows*
6569	* undo. If peer SACKs triggered fast recovery, we can't undo here.
6570	*/
6571	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
6572	tcp_try_undo_recovery(sk);
6573
6574	/ Reset rtx states to prevent spurious retransmits_timed_out() /
6575	tcp_update_rto_time(tp);
6576	tp->retrans_stamp = `0`;
6577	inet_csk(sk)->icsk_retransmits = `0`;
6578
6579	/ Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,*
6580	* we no longer need req so release it.
6581	*/
6582	req = rcu_dereference_protected(tp->fastopen_rsk,
6583	lockdep_sock_is_held(sk));
6584	reqsk_fastopen_remove(sk, req, reset: false);
6585
6586	/ Re-arm the timer because data may have been sent out.*
6587	* This is similar to the regular data transmission case
6588	* when new data has just been ack'ed.
6589	*
6590	* (TFO) - we could try to be more aggressive and
6591	* retransmitting any data sooner based on when they
6592	* are sent out.
6593	*/
6594	tcp_rearm_rto(sk);
6595	}
6596
6597	/*
6598	* This function implements the receiving procedure of RFC 793 for
6599	* all states except ESTABLISHED and TIME_WAIT.
6600	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
6601	* address independent.
6602	*/
6603
6604	int tcp_rcv_state_process(struct sock sk, struct* sk_buff *skb)
6605	{
6606	struct tcp_sock *tp = tcp_sk(sk);
6607	struct inet_connection_sock *icsk = inet_csk(sk);
6608	const struct tcphdr *th = tcp_hdr(skb);
6609	struct request_sock *req;
6610	int queued = `0`;
6611	bool acceptable;
6612	SKB_DR(reason);
6613
6614	switch (sk->sk_state) {
6615	case TCP_CLOSE:
6616	SKB_DR_SET(reason, TCP_CLOSE);
6617	goto discard;
6618
6619	case TCP_LISTEN:
6620	if (th->ack)
6621	return `1`;
6622
6623	if (th->rst) {
6624	SKB_DR_SET(reason, TCP_RESET);
6625	goto discard;
6626	}
6627	if (th->syn) {
6628	if (th->fin) {
6629	SKB_DR_SET(reason, TCP_FLAGS);
6630	goto discard;
6631	}
6632	/ It is possible that we process SYN packets from backlog,*
6633	* so we need to make sure to disable BH and RCU right there.
6634	*/
6635	rcu_read_lock();
6636	local_bh_disable();
6637	acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= `0`;
6638	local_bh_enable();
6639	rcu_read_unlock();
6640
6641	if (!acceptable)
6642	return `1`;
6643	consume_skb(skb);
6644	return `0`;
6645	}
6646	SKB_DR_SET(reason, TCP_FLAGS);
6647	goto discard;
6648
6649	case TCP_SYN_SENT:
6650	tp->rx_opt.saw_tstamp = `0`;
6651	tcp_mstamp_refresh(tp);
6652	queued = tcp_rcv_synsent_state_process(sk, skb, th);
6653	if (queued >= `0`)
6654	return queued;
6655
6656	/ Do step6 onward by hand. /
6657	tcp_urg(sk, skb, th);
6658	__kfree_skb(skb);
6659	tcp_data_snd_check(sk);
6660	return `0`;
6661	}
6662
6663	tcp_mstamp_refresh(tp);
6664	tp->rx_opt.saw_tstamp = `0`;
6665	req = rcu_dereference_protected(tp->fastopen_rsk,
6666	lockdep_sock_is_held(sk));
6667	if (req) {
6668	bool req_stolen;
6669
6670	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
6671	sk->sk_state != TCP_FIN_WAIT1);
6672
6673	if (!tcp_check_req(sk, skb, req, fastopen: true, lost_race: &req_stolen)) {
6674	SKB_DR_SET(reason, TCP_FASTOPEN);
6675	goto discard;
6676	}
6677	}
6678
6679	if (!th->ack && !th->rst && !th->syn) {
6680	SKB_DR_SET(reason, TCP_FLAGS);
6681	goto discard;
6682	}
6683	if (!tcp_validate_incoming(sk, skb, th, syn_inerr: `0`))
6684	return `0`;
6685
6686	/ step 5: check the ACK field /
6687	acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH \|
6688	FLAG_UPDATE_TS_RECENT \|
6689	FLAG_NO_CHALLENGE_ACK) > `0`;
6690
6691	if (!acceptable) {
6692	if (sk->sk_state == TCP_SYN_RECV)
6693	return `1`; / send one RST /
6694	tcp_send_challenge_ack(sk);
6695	SKB_DR_SET(reason, TCP_OLD_ACK);
6696	goto discard;
6697	}
6698	switch (sk->sk_state) {
6699	case TCP_SYN_RECV:
6700	tp->delivered++; / SYN-ACK delivery isn't tracked in tcp_ack /
6701	if (!tp->srtt_us)
6702	tcp_synack_rtt_meas(sk, req);
6703
6704	if (req) {
6705	tcp_rcv_synrecv_state_fastopen(sk);
6706	} else {
6707	tcp_try_undo_spurious_syn(sk);
6708	tp->retrans_stamp = `0`;
6709	tcp_init_transfer(sk, bpf_op: BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6710	skb);
6711	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6712	}
6713	tcp_ao_established(sk);
6714	smp_mb();
6715	tcp_set_state(sk, state: TCP_ESTABLISHED);
6716	sk->sk_state_change(sk);
6717
6718	/ Note, that this wakeup is only for marginal crossed SYN case.*
6719	* Passively open sockets are not waked up, because
6720	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
6721	*/
6722	if (sk->sk_socket)
6723	sk_wake_async(sk, how: SOCK_WAKE_IO, POLL_OUT);
6724
6725	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
6726	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
6727	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6728
6729	if (tp->rx_opt.tstamp_ok)
6730	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6731
6732	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
6733	tcp_update_pacing_rate(sk);
6734
6735	/ Prevent spurious tcp_cwnd_restart() on first data packet /
6736	tp->lsndtime = tcp_jiffies32;
6737
6738	tcp_initialize_rcv_mss(sk);
6739	tcp_fast_path_on(tp);
6740	break;
6741
6742	case TCP_FIN_WAIT1: {
6743	int tmo;
6744
6745	if (req)
6746	tcp_rcv_synrecv_state_fastopen(sk);
6747
6748	if (tp->snd_una != tp->write_seq)
6749	break;
6750
6751	tcp_set_state(sk, state: TCP_FIN_WAIT2);
6752	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| SEND_SHUTDOWN);
6753
6754	sk_dst_confirm(sk);
6755
6756	if (!sock_flag(sk, flag: SOCK_DEAD)) {
6757	/ Wake up lingering close() /
6758	sk->sk_state_change(sk);
6759	break;
6760	}
6761
6762	if (READ_ONCE(tp->linger2) < `0`) {
6763	tcp_done(sk);
6764	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6765	return `1`;
6766	}
6767	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6768	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6769	/ Receive out of order FIN after close() /
6770	if (tp->syn_fastopen && th->fin)
6771	tcp_fastopen_active_disable(sk);
6772	tcp_done(sk);
6773	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6774	return `1`;
6775	}
6776
6777	tmo = tcp_fin_time(sk);
6778	if (tmo > TCP_TIMEWAIT_LEN) {
6779	inet_csk_reset_keepalive_timer(sk, timeout: tmo - TCP_TIMEWAIT_LEN);
6780	} else if (th->fin \|\| sock_owned_by_user(sk)) {
6781	/ Bad case. We could lose such FIN otherwise.*
6782	* It is not a big problem, but it looks confusing
6783	* and not so rare event. We still can lose it now,
6784	* if it spins in bh_lock_sock(), but it is really
6785	* marginal case.
6786	*/
6787	inet_csk_reset_keepalive_timer(sk, timeout: tmo);
6788	} else {
6789	tcp_time_wait(sk, state: TCP_FIN_WAIT2, timeo: tmo);
6790	goto consume;
6791	}
6792	break;
6793	}
6794
6795	case TCP_CLOSING:
6796	if (tp->snd_una == tp->write_seq) {
6797	tcp_time_wait(sk, state: TCP_TIME_WAIT, timeo: `0`);
6798	goto consume;
6799	}
6800	break;
6801
6802	case TCP_LAST_ACK:
6803	if (tp->snd_una == tp->write_seq) {
6804	tcp_update_metrics(sk);
6805	tcp_done(sk);
6806	goto consume;
6807	}
6808	break;
6809	}
6810
6811	/ step 6: check the URG bit /
6812	tcp_urg(sk, skb, th);
6813
6814	/ step 7: process the segment text /
6815	switch (sk->sk_state) {
6816	case TCP_CLOSE_WAIT:
6817	case TCP_CLOSING:
6818	case TCP_LAST_ACK:
6819	if (!before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
6820	/ If a subflow has been reset, the packet should not*
6821	* continue to be processed, drop the packet.
6822	*/
6823	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
6824	goto discard;
6825	break;
6826	}
6827	fallthrough;
6828	case TCP_FIN_WAIT1:
6829	case TCP_FIN_WAIT2:
6830	/ RFC 793 says to queue data in these states,*
6831	* RFC 1122 says we MUST send a reset.
6832	* BSD 4.4 also does reset.
6833	*/
6834	if (sk->sk_shutdown & RCV_SHUTDOWN) {
6835	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6836	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6837	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6838	tcp_reset(sk, skb);
6839	return `1`;
6840	}
6841	}
6842	fallthrough;
6843	case TCP_ESTABLISHED:
6844	tcp_data_queue(sk, skb);
6845	queued = `1`;
6846	break;
6847	}
6848
6849	/ tcp_data could move socket to TIME-WAIT /
6850	if (sk->sk_state != TCP_CLOSE) {
6851	tcp_data_snd_check(sk);
6852	tcp_ack_snd_check(sk);
6853	}
6854
6855	if (!queued) {
6856	discard:
6857	tcp_drop_reason(sk, skb, reason);
6858	}
6859	return `0`;
6860
6861	consume:
6862	__kfree_skb(skb);
6863	return `0`;
6864	}
6865	EXPORT_SYMBOL(tcp_rcv_state_process);
6866
6867	static inline void pr_drop_req(struct request_sock req, __u16 port, int* family)
6868	{
6869	struct inet_request_sock *ireq = inet_rsk(sk: req);
6870
6871	if (family == AF_INET)
6872	net_dbg_ratelimited("drop open request from %pI4/%u\n",
6873	&ireq->ir_rmt_addr, port);
6874	#if IS_ENABLED(CONFIG_IPV6)
6875	else if (family == AF_INET6)
6876	net_dbg_ratelimited("drop open request from %pI6/%u\n",
6877	&ireq->ir_v6_rmt_addr, port);
6878	#endif
6879	}
6880
6881	/ RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set*
6882	*
6883	* If we receive a SYN packet with these bits set, it means a
6884	* network is playing bad games with TOS bits. In order to
6885	* avoid possible false congestion notifications, we disable
6886	* TCP ECN negotiation.
6887	*
6888	* Exception: tcp_ca wants ECN. This is required for DCTCP
6889	* congestion control: Linux DCTCP asserts ECT on all packets,
6890	* including SYN, which is most optimal solution; however,
6891	* others, such as FreeBSD do not.
6892	*
6893	* Exception: At least one of the reserved bits of the TCP header (th->res1) is
6894	* set, indicating the use of a future TCP extension (such as AccECN). See
6895	* RFC8311 §4.3 which updates RFC3168 to allow the development of such
6896	* extensions.
6897	*/
6898	static void tcp_ecn_create_request(struct request_sock *req,
6899	const struct sk_buff *skb,
6900	const struct sock *listen_sk,
6901	const struct dst_entry *dst)
6902	{
6903	const struct tcphdr *th = tcp_hdr(skb);
6904	const struct net *net = sock_net(sk: listen_sk);
6905	bool th_ecn = th->ece && th->cwr;
6906	bool ect, ecn_ok;
6907	u32 ecn_ok_dst;
6908
6909	if (!th_ecn)
6910	return;
6911
6912	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
6913	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
6914	ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) \|\| ecn_ok_dst;
6915
6916	if (((!ect \|\| th->res1) && ecn_ok) \|\| tcp_ca_needs_ecn(sk: listen_sk) \|\|
6917	(ecn_ok_dst & DST_FEATURE_ECN_CA) \|\|
6918	tcp_bpf_ca_needs_ecn(sk: (struct sock *)req))
6919	inet_rsk(sk: req)->ecn_ok = `1`;
6920	}
6921
6922	static void tcp_openreq_init(struct request_sock *req,
6923	const struct tcp_options_received *rx_opt,
6924	struct sk_buff skb, const* struct sock *sk)
6925	{
6926	struct inet_request_sock *ireq = inet_rsk(sk: req);
6927
6928	req->rsk_rcv_wnd = `0`; / So that tcp_send_synack() knows! /
6929	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
6930	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + `1`;
6931	tcp_rsk(req)->snt_synack = `0`;
6932	tcp_rsk(req)->last_oow_ack_time = `0`;
6933	req->mss = rx_opt->mss_clamp;
6934	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : `0`;
6935	ireq->tstamp_ok = rx_opt->tstamp_ok;
6936	ireq->sack_ok = rx_opt->sack_ok;
6937	ireq->snd_wscale = rx_opt->snd_wscale;
6938	ireq->wscale_ok = rx_opt->wscale_ok;
6939	ireq->acked = `0`;
6940	ireq->ecn_ok = `0`;
6941	ireq->ir_rmt_port = tcp_hdr(skb)->source;
6942	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
6943	ireq->ir_mark = inet_request_mark(sk, skb);
6944	#if IS_ENABLED(CONFIG_SMC)
6945	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
6946	tcp_sk(sk)->smc_hs_congested(sk));
6947	#endif
6948	}
6949
6950	struct request_sock inet_reqsk_alloc(const* struct request_sock_ops *ops,
6951	struct sock *sk_listener,
6952	bool attach_listener)
6953	{
6954	struct request_sock *req = reqsk_alloc(ops, sk_listener,
6955	attach_listener);
6956
6957	if (req) {
6958	struct inet_request_sock *ireq = inet_rsk(sk: req);
6959
6960	ireq->ireq_opt = NULL;
6961	#if IS_ENABLED(CONFIG_IPV6)
6962	ireq->pktopts = NULL;
6963	#endif
6964	atomic64_set(v: &ireq->ir_cookie, i: `0`);
6965	ireq->ireq_state = TCP_NEW_SYN_RECV;
6966	write_pnet(pnet: &ireq->ireq_net, net: sock_net(sk: sk_listener));
6967	ireq->ireq_family = sk_listener->sk_family;
6968	req->timeout = TCP_TIMEOUT_INIT;
6969	}
6970
6971	return req;
6972	}
6973	EXPORT_SYMBOL(inet_reqsk_alloc);
6974
6975	/*
6976	* Return true if a syncookie should be sent
6977	*/
6978	static bool tcp_syn_flood_action(const struct sock sk, const* char *proto)
6979	{
6980	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
6981	const char *msg = "Dropping request";
6982	struct net *net = sock_net(sk);
6983	bool want_cookie = false;
6984	u8 syncookies;
6985
6986	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
6987
6988	#ifdef CONFIG_SYN_COOKIES
6989	if (syncookies) {
6990	msg = "Sending cookies";
6991	want_cookie = true;
6992	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
6993	} else
6994	#endif
6995	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
6996
6997	if (!READ_ONCE(queue->synflood_warned) && syncookies != `2` &&
6998	xchg(&queue->synflood_warned, `1`) == `0`) {
6999	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
7000	net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
7001	proto, inet6_rcv_saddr(sk),
7002	sk->sk_num, msg);
7003	} else {
7004	net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
7005	proto, &sk->sk_rcv_saddr,
7006	sk->sk_num, msg);
7007	}
7008	}
7009
7010	return want_cookie;
7011	}
7012
7013	static void tcp_reqsk_record_syn(const struct sock *sk,
7014	struct request_sock *req,
7015	const struct sk_buff *skb)
7016	{
7017	if (tcp_sk(sk)->save_syn) {
7018	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
7019	struct saved_syn *saved_syn;
7020	u32 mac_hdrlen;
7021	void *base;
7022
7023	if (tcp_sk(sk)->save_syn == `2`) { / Save full header. /
7024	base = skb_mac_header(skb);
7025	mac_hdrlen = skb_mac_header_len(skb);
7026	len += mac_hdrlen;
7027	} else {
7028	base = skb_network_header(skb);
7029	mac_hdrlen = `0`;
7030	}
7031
7032	saved_syn = kmalloc(struct_size(saved_syn, data, len),
7033	GFP_ATOMIC);
7034	if (saved_syn) {
7035	saved_syn->mac_hdrlen = mac_hdrlen;
7036	saved_syn->network_hdrlen = skb_network_header_len(skb);
7037	saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
7038	memcpy(saved_syn->data, base, len);
7039	req->saved_syn = saved_syn;
7040	}
7041	}
7042	}
7043
7044	/ If a SYN cookie is required and supported, returns a clamped MSS value to be*
7045	* used for SYN cookie generation.
7046	*/
7047	u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
7048	const struct tcp_request_sock_ops *af_ops,
7049	struct sock sk, struct* tcphdr *th)
7050	{
7051	struct tcp_sock *tp = tcp_sk(sk);
7052	u16 mss;
7053
7054	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != `2` &&
7055	!inet_csk_reqsk_queue_is_full(sk))
7056	return `0`;
7057
7058	if (!tcp_syn_flood_action(sk, proto: rsk_ops->slab_name))
7059	return `0`;
7060
7061	if (sk_acceptq_is_full(sk)) {
7062	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
7063	return `0`;
7064	}
7065
7066	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
7067	if (!mss)
7068	mss = af_ops->mss_clamp;
7069
7070	return mss;
7071	}
7072	EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
7073
7074	int tcp_conn_request(struct request_sock_ops *rsk_ops,
7075	const struct tcp_request_sock_ops *af_ops,
7076	struct sock sk, struct* sk_buff *skb)
7077	{
7078	struct tcp_fastopen_cookie foc = { .len = -`1` };
7079	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
7080	struct tcp_options_received tmp_opt;
7081	struct tcp_sock *tp = tcp_sk(sk);
7082	struct net *net = sock_net(sk);
7083	struct sock *fastopen_sk = NULL;
7084	struct request_sock *req;
7085	bool want_cookie = false;
7086	struct dst_entry *dst;
7087	struct flowi fl;
7088	u8 syncookies;
7089
7090	#ifdef CONFIG_TCP_AO
7091	const struct tcp_ao_hdr *aoh;
7092	#endif
7093
7094	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
7095
7096	/ TW buckets are converted to open requests without*
7097	* limitations, they conserve resources and peer is
7098	* evidently real one.
7099	*/
7100	if ((syncookies == `2` \|\| inet_csk_reqsk_queue_is_full(sk)) && !isn) {
7101	want_cookie = tcp_syn_flood_action(sk, proto: rsk_ops->slab_name);
7102	if (!want_cookie)
7103	goto drop;
7104	}
7105
7106	if (sk_acceptq_is_full(sk)) {
7107	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
7108	goto drop;
7109	}
7110
7111	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
7112	if (!req)
7113	goto drop;
7114
7115	req->syncookie = want_cookie;
7116	tcp_rsk(req)->af_specific = af_ops;
7117	tcp_rsk(req)->ts_off = `0`;
7118	tcp_rsk(req)->req_usec_ts = -`1`;
7119	#if IS_ENABLED(CONFIG_MPTCP)
7120	tcp_rsk(req)->is_mptcp = `0`;
7121	#endif
7122
7123	tcp_clear_options(rx_opt: &tmp_opt);
7124	tmp_opt.mss_clamp = af_ops->mss_clamp;
7125	tmp_opt.user_mss = tp->rx_opt.user_mss;
7126	tcp_parse_options(sock_net(sk), skb, &tmp_opt, `0`,
7127	want_cookie ? NULL : &foc);
7128
7129	if (want_cookie && !tmp_opt.saw_tstamp)
7130	tcp_clear_options(rx_opt: &tmp_opt);
7131
7132	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
7133	tmp_opt.smc_ok = `0`;
7134
7135	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
7136	tcp_openreq_init(req, rx_opt: &tmp_opt, skb, sk);
7137	inet_rsk(sk: req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
7138
7139	/ Note: tcp_v6_init_req() might override ir_iif for link locals /
7140	inet_rsk(sk: req)->ir_iif = inet_request_bound_dev_if(sk, skb);
7141
7142	dst = af_ops->route_req(sk, skb, &fl, req);
7143	if (!dst)
7144	goto drop_and_free;
7145
7146	if (tmp_opt.tstamp_ok)
7147	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
7148
7149	if (!want_cookie && !isn) {
7150	int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
7151
7152	/ Kill the following clause, if you dislike this way. /
7153	if (!syncookies &&
7154	(max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
7155	(max_syn_backlog >> `2`)) &&
7156	!tcp_peer_is_proven(req, dst)) {
7157	/ Without syncookies last quarter of*
7158	* backlog is filled with destinations,
7159	* proven to be alive.
7160	* It means that we continue to communicate
7161	* to destinations, already remembered
7162	* to the moment of synflood.
7163	*/
7164	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
7165	family: rsk_ops->family);
7166	goto drop_and_release;
7167	}
7168
7169	isn = af_ops->init_seq(skb);
7170	}
7171
7172	tcp_ecn_create_request(req, skb, listen_sk: sk, dst);
7173
7174	if (want_cookie) {
7175	isn = cookie_init_sequence(ops: af_ops, sk, skb, mss: &req->mss);
7176	if (!tmp_opt.tstamp_ok)
7177	inet_rsk(sk: req)->ecn_ok = `0`;
7178	}
7179
7180	#ifdef CONFIG_TCP_AO
7181	if (tcp_parse_auth_options(th: tcp_hdr(skb), NULL, aoh: &aoh))
7182	goto drop_and_release; / Invalid TCP options /
7183	if (aoh) {
7184	tcp_rsk(req)->maclen = aoh->length - sizeof(struct tcp_ao_hdr);
7185	tcp_rsk(req)->ao_rcv_next = aoh->keyid;
7186	tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
7187	} else {
7188	tcp_rsk(req)->maclen = `0`;
7189	}
7190	#endif
7191	tcp_rsk(req)->snt_isn = isn;
7192	tcp_rsk(req)->txhash = net_tx_rndhash();
7193	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
7194	tcp_openreq_init_rwin(req, sk_listener: sk, dst);
7195	sk_rx_queue_set(sk: req_to_sk(req), skb);
7196	if (!want_cookie) {
7197	tcp_reqsk_record_syn(sk, req, skb);
7198	fastopen_sk = tcp_try_fastopen(sk, skb, req, foc: &foc, dst);
7199	}
7200	if (fastopen_sk) {
7201	af_ops->send_synack(fastopen_sk, dst, &fl, req,
7202	&foc, TCP_SYNACK_FASTOPEN, skb);
7203	/ Add the child socket directly into the accept queue /
7204	if (!inet_csk_reqsk_queue_add(sk, req, child: fastopen_sk)) {
7205	reqsk_fastopen_remove(sk: fastopen_sk, req, reset: false);
7206	bh_unlock_sock(fastopen_sk);
7207	sock_put(sk: fastopen_sk);
7208	goto drop_and_free;
7209	}
7210	sk->sk_data_ready(sk);
7211	bh_unlock_sock(fastopen_sk);
7212	sock_put(sk: fastopen_sk);
7213	} else {
7214	tcp_rsk(req)->tfo_listener = false;
7215	if (!want_cookie) {
7216	req->timeout = tcp_timeout_init(sk: (struct sock *)req);
7217	inet_csk_reqsk_queue_hash_add(sk, req, timeout: req->timeout);
7218	}
7219	af_ops->send_synack(sk, dst, &fl, req, &foc,
7220	!want_cookie ? TCP_SYNACK_NORMAL :
7221	TCP_SYNACK_COOKIE,
7222	skb);
7223	if (want_cookie) {
7224	reqsk_free(req);
7225	return `0`;
7226	}
7227	}
7228	reqsk_put(req);
7229	return `0`;
7230
7231	drop_and_release:
7232	dst_release(dst);
7233	drop_and_free:
7234	__reqsk_free(req);
7235	drop:
7236	tcp_listendrop(sk);
7237	return `0`;
7238	}
7239	EXPORT_SYMBOL(tcp_conn_request);
7240

source code of linux/net/ipv4/tcp_input.c