tcp_output.c source code [linux/net/ipv4/tcp_output.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*/
21
22	/*
23	* Changes: Pedro Roque : Retransmit queue handled by TCP.
24	* : Fragmentation on mtu decrease
25	* : Segment collapse on retransmit
26	* : AF independence
27	*
28	* Linus Torvalds : send_delayed_ack
29	* David S. Miller : Charge memory using the right skb
30	* during syn/ack processing.
31	* David S. Miller : Output engine completely rewritten.
32	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
33	* Cacophonix Gaul : draft-minshall-nagle-01
34	* J Hadi Salim : ECN support
35	*
36	*/
37
38	#define pr_fmt(fmt) "TCP: " fmt
39
40	#include <net/tcp.h>
41	#include <net/mptcp.h>
42
43	#include <linux/compiler.h>
44	#include <linux/gfp.h>
45	#include <linux/module.h>
46	#include <linux/static_key.h>
47
48	#include <trace/events/tcp.h>
49
50	/ Refresh clocks of a TCP socket,*
51	* ensuring monotically increasing values.
52	*/
53	void tcp_mstamp_refresh(struct tcp_sock *tp)
54	{
55	u64 val = tcp_clock_ns();
56
57	tp->tcp_clock_cache = val;
58	tp->tcp_mstamp = div_u64(dividend: val, NSEC_PER_USEC);
59	}
60
61	static bool tcp_write_xmit(struct sock sk, unsigned* int mss_now, int nonagle,
62	int push_one, gfp_t gfp);
63
64	/ Account for new data that has been sent to the network. /
65	static void tcp_event_new_data_sent(struct sock sk, struct* sk_buff *skb)
66	{
67	struct inet_connection_sock *icsk = inet_csk(sk);
68	struct tcp_sock *tp = tcp_sk(sk);
69	unsigned int prior_packets = tp->packets_out;
70
71	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
72
73	__skb_unlink(skb, list: &sk->sk_write_queue);
74	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb);
75
76	if (tp->highest_sack == NULL)
77	tp->highest_sack = skb;
78
79	tp->packets_out += tcp_skb_pcount(skb);
80	if (!prior_packets \|\| icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
81	tcp_rearm_rto(sk);
82
83	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
84	tcp_skb_pcount(skb));
85	tcp_check_space(sk);
86	}
87
88	/ SND.NXT, if window was not shrunk or the amount of shrunk was less than one*
89	* window scaling factor due to loss of precision.
90	* If window has been shrunk, what should we make? It is not clear at all.
91	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
92	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
93	* invalid. OK, let's make this for now:
94	*/
95	static inline __u32 tcp_acceptable_seq(const struct sock *sk)
96	{
97	const struct tcp_sock *tp = tcp_sk(sk);
98
99	if (!before(seq1: tcp_wnd_end(tp), seq2: tp->snd_nxt) \|\|
100	(tp->rx_opt.wscale_ok &&
101	((tp->snd_nxt - tcp_wnd_end(tp)) < (`1` << tp->rx_opt.rcv_wscale))))
102	return tp->snd_nxt;
103	else
104	return tcp_wnd_end(tp);
105	}
106
107	/ Calculate mss to advertise in SYN segment.*
108	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
109	*
110	* 1. It is independent of path mtu.
111	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
112	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
113	* attached devices, because some buggy hosts are confused by
114	* large MSS.
115	* 4. We do not make 3, we advertise MSS, calculated from first
116	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
117	* This may be overridden via information stored in routing table.
118	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
119	* probably even Jumbo".
120	*/
121	static __u16 tcp_advertise_mss(struct sock *sk)
122	{
123	struct tcp_sock *tp = tcp_sk(sk);
124	const struct dst_entry *dst = __sk_dst_get(sk);
125	int mss = tp->advmss;
126
127	if (dst) {
128	unsigned int metric = dst_metric_advmss(dst);
129
130	if (metric < mss) {
131	mss = metric;
132	tp->advmss = mss;
133	}
134	}
135
136	return (__u16)mss;
137	}
138
139	/ RFC2861. Reset CWND after idle period longer RTO to "restart window".*
140	* This is the first part of cwnd validation mechanism.
141	*/
142	void tcp_cwnd_restart(struct sock *sk, s32 delta)
143	{
144	struct tcp_sock *tp = tcp_sk(sk);
145	u32 restart_cwnd = tcp_init_cwnd(tp, dst: __sk_dst_get(sk));
146	u32 cwnd = tcp_snd_cwnd(tp);
147
148	tcp_ca_event(sk, event: CA_EVENT_CWND_RESTART);
149
150	tp->snd_ssthresh = tcp_current_ssthresh(sk);
151	restart_cwnd = min(restart_cwnd, cwnd);
152
153	while ((delta -= inet_csk(sk)->icsk_rto) > `0` && cwnd > restart_cwnd)
154	cwnd >>= `1`;
155	tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
156	tp->snd_cwnd_stamp = tcp_jiffies32;
157	tp->snd_cwnd_used = `0`;
158	}
159
160	/ Congestion state accounting after a packet has been sent. /
161	static void tcp_event_data_sent(struct tcp_sock *tp,
162	struct sock *sk)
163	{
164	struct inet_connection_sock *icsk = inet_csk(sk);
165	const u32 now = tcp_jiffies32;
166
167	if (tcp_packets_in_flight(tp) == `0`)
168	tcp_ca_event(sk, event: CA_EVENT_TX_START);
169
170	tp->lsndtime = now;
171
172	/ If it is a reply for ato after last received*
173	* packet, increase pingpong count.
174	*/
175	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
176	inet_csk_inc_pingpong_cnt(sk);
177	}
178
179	/ Account for an ACK we sent. /
180	static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
181	{
182	struct tcp_sock *tp = tcp_sk(sk);
183
184	if (unlikely(tp->compressed_ack)) {
185	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
186	tp->compressed_ack);
187	tp->compressed_ack = `0`;
188	if (hrtimer_try_to_cancel(timer: &tp->compressed_ack_timer) == `1`)
189	__sock_put(sk);
190	}
191
192	if (unlikely(rcv_nxt != tp->rcv_nxt))
193	return; / Special ACK sent by DCTCP to reflect ECN /
194	tcp_dec_quickack_mode(sk);
195	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
196	}
197
198	/ Determine a window scaling and initial window to offer.*
199	* Based on the assumption that the given amount of space
200	* will be offered. Store the results in the tp structure.
201	* NOTE: for smooth operation initial space offering should
202	* be a multiple of mss if possible. We assume here that mss >= 1.
203	* This MUST be enforced by all callers.
204	*/
205	void tcp_select_initial_window(const struct sock sk, int* __space, __u32 mss,
206	__u32 rcv_wnd, __u32 window_clamp,
207	int wscale_ok, __u8 *rcv_wscale,
208	__u32 init_rcv_wnd)
209	{
210	unsigned int space = (__space < `0` ? `0` : __space);
211
212	/ If no clamp set the clamp to the max possible scaled window /
213	if (*window_clamp == `0`)
214	(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
215	space = min(*window_clamp, space);
216
217	/ Quantize space offering to a multiple of mss if possible. /
218	if (space > mss)
219	space = rounddown(space, mss);
220
221	/ NOTE: offering an initial window larger than 32767*
222	* will break some buggy TCP stacks. If the admin tells us
223	* it is likely we could be speaking with such a buggy stack
224	* we will truncate our initial window offering to 32K-1
225	* unless the remote has sent us a window scaling option,
226	* which we interpret as a sign the remote TCP is not
227	* misinterpreting the window field as a signed quantity.
228	*/
229	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
230	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
231	else
232	(*rcv_wnd) = min_t(u32, space, U16_MAX);
233
234	if (init_rcv_wnd)
235	rcv_wnd = min(rcv_wnd, init_rcv_wnd * mss);
236
237	*rcv_wscale = `0`;
238	if (wscale_ok) {
239	/ Set window scaling on max possible window /
240	space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`]));
241	space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
242	space = min_t(u32, space, *window_clamp);
243	rcv_wscale = clamp_t(int*, ilog2(space) - `15`,
244	`0`, TCP_MAX_WSCALE);
245	}
246	/ Set the clamp no higher than max representable value /
247	(window_clamp) = min_t(__u32, U16_MAX << (rcv_wscale), *window_clamp);
248	}
249	EXPORT_SYMBOL(tcp_select_initial_window);
250
251	/ Chose a new window to advertise, update state in tcp_sock for the*
252	* socket, and return result with RFC1323 scaling applied. The return
253	* value can be stuffed directly into th->window for an outgoing
254	* frame.
255	*/
256	static u16 tcp_select_window(struct sock *sk)
257	{
258	struct tcp_sock *tp = tcp_sk(sk);
259	struct net *net = sock_net(sk);
260	u32 old_win = tp->rcv_wnd;
261	u32 cur_win, new_win;
262
263	/ Make the window 0 if we failed to queue the data because we*
264	* are out of memory. The window is temporary, so we don't store
265	* it on the socket.
266	*/
267	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
268	return `0`;
269
270	cur_win = tcp_receive_window(tp);
271	new_win = __tcp_select_window(sk);
272	if (new_win < cur_win) {
273	/ Danger Will Robinson!*
274	* Don't update rcv_wup/rcv_wnd here or else
275	* we will not be able to advertise a zero
276	* window in time. --DaveM
277	*
278	* Relax Will Robinson.
279	*/
280	if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) \|\| !tp->rx_opt.rcv_wscale) {
281	/ Never shrink the offered window /
282	if (new_win == `0`)
283	NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
284	new_win = ALIGN(cur_win, `1` << tp->rx_opt.rcv_wscale);
285	}
286	}
287
288	tp->rcv_wnd = new_win;
289	tp->rcv_wup = tp->rcv_nxt;
290
291	/ Make sure we do not exceed the maximum possible*
292	* scaled window.
293	*/
294	if (!tp->rx_opt.rcv_wscale &&
295	READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
296	new_win = min(new_win, MAX_TCP_WINDOW);
297	else
298	new_win = min(new_win, (`65535U` << tp->rx_opt.rcv_wscale));
299
300	/ RFC1323 scaling applied /
301	new_win >>= tp->rx_opt.rcv_wscale;
302
303	/ If we advertise zero window, disable fast path. /
304	if (new_win == `0`) {
305	tp->pred_flags = `0`;
306	if (old_win)
307	NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
308	} else if (old_win == `0`) {
309	NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
310	}
311
312	return new_win;
313	}
314
315	/ Packet ECN state for a SYN-ACK /
316	static void tcp_ecn_send_synack(struct sock sk, struct* sk_buff *skb)
317	{
318	const struct tcp_sock *tp = tcp_sk(sk);
319
320	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
321	if (!(tp->ecn_flags & TCP_ECN_OK))
322	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
323	else if (tcp_ca_needs_ecn(sk) \|\|
324	tcp_bpf_ca_needs_ecn(sk))
325	INET_ECN_xmit(sk);
326	}
327
328	/ Packet ECN state for a SYN. /
329	static void tcp_ecn_send_syn(struct sock sk, struct* sk_buff *skb)
330	{
331	struct tcp_sock *tp = tcp_sk(sk);
332	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
333	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == `1` \|\|
334	tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn;
335
336	if (!use_ecn) {
337	const struct dst_entry *dst = __sk_dst_get(sk);
338
339	if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
340	use_ecn = true;
341	}
342
343	tp->ecn_flags = `0`;
344
345	if (use_ecn) {
346	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ECE \| TCPHDR_CWR;
347	tp->ecn_flags = TCP_ECN_OK;
348	if (tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn)
349	INET_ECN_xmit(sk);
350	}
351	}
352
353	static void tcp_ecn_clear_syn(struct sock sk, struct* sk_buff *skb)
354	{
355	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
356	/ tp->ecn_flags are cleared at a later point in time when*
357	* SYN ACK is ultimatively being received.
358	*/
359	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE \| TCPHDR_CWR);
360	}
361
362	static void
363	tcp_ecn_make_synack(const struct request_sock req, struct* tcphdr *th)
364	{
365	if (inet_rsk(sk: req)->ecn_ok)
366	th->ece = `1`;
367	}
368
369	/ Set up ECN state for a packet on a ESTABLISHED socket that is about to*
370	* be sent.
371	*/
372	static void tcp_ecn_send(struct sock sk, struct* sk_buff *skb,
373	struct tcphdr th, int* tcp_header_len)
374	{
375	struct tcp_sock *tp = tcp_sk(sk);
376
377	if (tp->ecn_flags & TCP_ECN_OK) {
378	/ Not-retransmitted data segment: set ECT and inject CWR. /
379	if (skb->len != tcp_header_len &&
380	!before(TCP_SKB_CB(skb)->seq, seq2: tp->snd_nxt)) {
381	INET_ECN_xmit(sk);
382	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
383	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
384	th->cwr = `1`;
385	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
386	}
387	} else if (!tcp_ca_needs_ecn(sk)) {
388	/ ACK or retransmitted segment: clear ECT\|CE /
389	INET_ECN_dontxmit(sk);
390	}
391	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
392	th->ece = `1`;
393	}
394	}
395
396	/ Constructs common control bits of non-data skb. If SYN/FIN is present,*
397	* auto increment end seqno.
398	*/
399	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
400	{
401	skb->ip_summed = CHECKSUM_PARTIAL;
402
403	TCP_SKB_CB(skb)->tcp_flags = flags;
404
405	tcp_skb_pcount_set(skb, segs: `1`);
406
407	TCP_SKB_CB(skb)->seq = seq;
408	if (flags & (TCPHDR_SYN \| TCPHDR_FIN))
409	seq++;
410	TCP_SKB_CB(skb)->end_seq = seq;
411	}
412
413	static inline bool tcp_urg_mode(const struct tcp_sock *tp)
414	{
415	return tp->snd_una != tp->snd_up;
416	}
417
/* Bits for tcp_out_options.options: which TCP options to emit. */
#define OPTION_SACK_ADVERTISE	BIT(0)
#define OPTION_TS		BIT(1)
#define OPTION_MD5		BIT(2)
#define OPTION_WSCALE		BIT(3)
#define OPTION_FAST_OPEN_COOKIE	BIT(8)
#define OPTION_SMC		BIT(9)
#define OPTION_MPTCP		BIT(10)
#define OPTION_AO		BIT(11)
426
427	static void smc_options_write(__be32 ptr, u16 options)
428	{
429	#if IS_ENABLED(CONFIG_SMC)
430	if (static_branch_unlikely(&tcp_have_smc)) {
431	if (unlikely(OPTION_SMC & *options)) {
432	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
433	(TCPOPT_NOP << `16`) \|
434	(TCPOPT_EXP << `8`) \|
435	(TCPOLEN_EXP_SMC_BASE));
436	*ptr++ = htonl(TCPOPT_SMC_MAGIC);
437	}
438	}
439	#endif
440	}
441
442	struct tcp_out_options {
443	u16 options; / bit field of OPTION_* /
444	u16 mss; / 0 to disable /
445	u8 ws; / window scale, 0 to disable /
446	u8 num_sack_blocks; / number of SACK blocks to include /
447	u8 hash_size; / bytes in hash_location /
448	u8 bpf_opt_len; / length of BPF hdr option /
449	__u8 hash_location; /* temporary pointer, overloaded /
450	__u32 tsval, tsecr; / need to include OPTION_TS /
451	struct tcp_fastopen_cookie fastopen_cookie; /* Fast open cookie /
452	struct mptcp_out_options mptcp;
453	};
454
455	static void mptcp_options_write(struct tcphdr th, __be32 ptr,
456	struct tcp_sock *tp,
457	struct tcp_out_options *opts)
458	{
459	#if IS_ENABLED(CONFIG_MPTCP)
460	if (unlikely(OPTION_MPTCP & opts->options))
461	mptcp_write_options(th, ptr, tp, opts: &opts->mptcp);
462	#endif
463	}
464
465	#ifdef CONFIG_CGROUP_BPF
466	static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
467	enum tcp_synack_type synack_type)
468	{
469	if (unlikely(!skb))
470	return BPF_WRITE_HDR_TCP_CURRENT_MSS;
471
472	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
473	return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
474
475	return `0`;
476	}
477
478	/ req, syn_skb and synack_type are used when writing synack /
479	static void bpf_skops_hdr_opt_len(struct sock sk, struct* sk_buff *skb,
480	struct request_sock *req,
481	struct sk_buff *syn_skb,
482	enum tcp_synack_type synack_type,
483	struct tcp_out_options *opts,
484	unsigned int *remaining)
485	{
486	struct bpf_sock_ops_kern sock_ops;
487	int err;
488
489	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
490	BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) \|\|
491	!*remaining)
492	return;
493
494	/ remaining has already been aligned to 4 bytes, so remaining >= 4 /
495
496	/ init sock_ops /
497	memset(&sock_ops, `0`, offsetof(struct bpf_sock_ops_kern, temp));
498
499	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
500
501	if (req) {
502	/ The listen "sk" cannot be passed here because*
503	* it is not locked. It would not make too much
504	* sense to do bpf_setsockopt(listen_sk) based
505	* on individual connection request also.
506	*
507	* Thus, "req" is passed here and the cgroup-bpf-progs
508	* of the listen "sk" will be run.
509	*
510	* "req" is also used here for fastopen even the "sk" here is
511	* a fullsock "child" sk. It is to keep the behavior
512	* consistent between fastopen and non-fastopen on
513	* the bpf programming side.
514	*/
515	sock_ops.sk = (struct sock *)req;
516	sock_ops.syn_skb = syn_skb;
517	} else {
518	sock_owned_by_me(sk);
519
520	sock_ops.is_fullsock = `1`;
521	sock_ops.sk = sk;
522	}
523
524	sock_ops.args[`0`] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
525	sock_ops.remaining_opt_len = *remaining;
526	/ tcp_current_mss() does not pass a skb /
527	if (skb)
528	bpf_skops_init_skb(skops: &sock_ops, skb, end_offset: `0`);
529
530	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
531
532	if (err \|\| sock_ops.remaining_opt_len == *remaining)
533	return;
534
535	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
536	/ round up to 4 bytes /
537	opts->bpf_opt_len = (opts->bpf_opt_len + `3`) & ~`3`;
538
539	*remaining -= opts->bpf_opt_len;
540	}
541
542	static void bpf_skops_write_hdr_opt(struct sock sk, struct* sk_buff *skb,
543	struct request_sock *req,
544	struct sk_buff *syn_skb,
545	enum tcp_synack_type synack_type,
546	struct tcp_out_options *opts)
547	{
548	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
549	struct bpf_sock_ops_kern sock_ops;
550	int err;
551
552	if (likely(!max_opt_len))
553	return;
554
555	memset(&sock_ops, `0`, offsetof(struct bpf_sock_ops_kern, temp));
556
557	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
558
559	if (req) {
560	sock_ops.sk = (struct sock *)req;
561	sock_ops.syn_skb = syn_skb;
562	} else {
563	sock_owned_by_me(sk);
564
565	sock_ops.is_fullsock = `1`;
566	sock_ops.sk = sk;
567	}
568
569	sock_ops.args[`0`] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
570	sock_ops.remaining_opt_len = max_opt_len;
571	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
572	bpf_skops_init_skb(skops: &sock_ops, skb, end_offset: first_opt_off);
573
574	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
575
576	if (err)
577	nr_written = `0`;
578	else
579	nr_written = max_opt_len - sock_ops.remaining_opt_len;
580
581	if (nr_written < max_opt_len)
582	memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
583	max_opt_len - nr_written);
584	}
585	#else
586	static void bpf_skops_hdr_opt_len(struct sock sk, struct* sk_buff *skb,
587	struct request_sock *req,
588	struct sk_buff *syn_skb,
589	enum tcp_synack_type synack_type,
590	struct tcp_out_options *opts,
591	unsigned int *remaining)
592	{
593	}
594
595	static void bpf_skops_write_hdr_opt(struct sock sk, struct* sk_buff *skb,
596	struct request_sock *req,
597	struct sk_buff *syn_skb,
598	enum tcp_synack_type synack_type,
599	struct tcp_out_options *opts)
600	{
601	}
602	#endif
603
604	/ Write previously computed TCP options to the packet.*
605	*
606	* Beware: Something in the Internet is very sensitive to the ordering of
607	* TCP options, we learned this through the hard way, so be careful here.
608	* Luckily we can at least blame others for their non-compliance but from
609	* inter-operability perspective it seems that we're somewhat stuck with
610	* the ordering which we have been using if we want to keep working with
611	* those broken things (not that it currently hurts anybody as there isn't
612	* particular reason why the ordering would need to be changed).
613	*
614	* At least SACK_PERM as the first option is known to lead to a disaster
615	* (but it may well be that other scenarios fail similarly).
616	*/
617	static void tcp_options_write(struct tcphdr th, struct* tcp_sock *tp,
618	const struct tcp_request_sock *tcprsk,
619	struct tcp_out_options *opts,
620	struct tcp_key *key)
621	{
622	__be32 ptr = (__be32 )(th + `1`);
623	u16 options = opts->options; / mungable copy /
624
625	if (tcp_key_is_md5(key)) {
626	*ptr++ = htonl((TCPOPT_NOP << `24`) \| (TCPOPT_NOP << `16`) \|
627	(TCPOPT_MD5SIG << `8`) \| TCPOLEN_MD5SIG);
628	/ overload cookie hash location /
629	opts->hash_location = (__u8 *)ptr;
630	ptr += `4`;
631	} else if (tcp_key_is_ao(key)) {
632	#ifdef CONFIG_TCP_AO
633	u8 maclen = tcp_ao_maclen(key: key->ao_key);
634
635	if (tcprsk) {
636	u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
637
638	*ptr++ = htonl((TCPOPT_AO << `24`) \| (aolen << `16`) \|
639	(tcprsk->ao_keyid << `8`) \|
640	(tcprsk->ao_rcv_next));
641	} else {
642	struct tcp_ao_key *rnext_key;
643	struct tcp_ao_info *ao_info;
644
645	ao_info = rcu_dereference_check(tp->ao_info,
646	lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
647	rnext_key = READ_ONCE(ao_info->rnext_key);
648	if (WARN_ON_ONCE(!rnext_key))
649	goto out_ao;
650	*ptr++ = htonl((TCPOPT_AO << `24`) \|
651	(tcp_ao_len(key->ao_key) << `16`) \|
652	(key->ao_key->sndid << `8`) \|
653	(rnext_key->rcvid));
654	}
655	opts->hash_location = (__u8 *)ptr;
656	ptr += maclen / sizeof(*ptr);
657	if (unlikely(maclen % sizeof(*ptr))) {
658	memset(ptr, TCPOPT_NOP, sizeof(*ptr));
659	ptr++;
660	}
661	out_ao:
662	#endif
663	}
664	if (unlikely(opts->mss)) {
665	*ptr++ = htonl((TCPOPT_MSS << `24`) \|
666	(TCPOLEN_MSS << `16`) \|
667	opts->mss);
668	}
669
670	if (likely(OPTION_TS & options)) {
671	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
672	*ptr++ = htonl((TCPOPT_SACK_PERM << `24`) \|
673	(TCPOLEN_SACK_PERM << `16`) \|
674	(TCPOPT_TIMESTAMP << `8`) \|
675	TCPOLEN_TIMESTAMP);
676	options &= ~OPTION_SACK_ADVERTISE;
677	} else {
678	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
679	(TCPOPT_NOP << `16`) \|
680	(TCPOPT_TIMESTAMP << `8`) \|
681	TCPOLEN_TIMESTAMP);
682	}
683	*ptr++ = htonl(opts->tsval);
684	*ptr++ = htonl(opts->tsecr);
685	}
686
687	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
688	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
689	(TCPOPT_NOP << `16`) \|
690	(TCPOPT_SACK_PERM << `8`) \|
691	TCPOLEN_SACK_PERM);
692	}
693
694	if (unlikely(OPTION_WSCALE & options)) {
695	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
696	(TCPOPT_WINDOW << `16`) \|
697	(TCPOLEN_WINDOW << `8`) \|
698	opts->ws);
699	}
700
701	if (unlikely(opts->num_sack_blocks)) {
702	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
703	tp->duplicate_sack : tp->selective_acks;
704	int this_sack;
705
706	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
707	(TCPOPT_NOP << `16`) \|
708	(TCPOPT_SACK << `8`) \|
709	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
710	TCPOLEN_SACK_PERBLOCK)));
711
712	for (this_sack = `0`; this_sack < opts->num_sack_blocks;
713	++this_sack) {
714	*ptr++ = htonl(sp[this_sack].start_seq);
715	*ptr++ = htonl(sp[this_sack].end_seq);
716	}
717
718	tp->rx_opt.dsack = `0`;
719	}
720
721	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
722	struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
723	u8 p = (u8 )ptr;
724	u32 len; / Fast Open option length /
725
726	if (foc->exp) {
727	len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
728	*ptr = htonl((TCPOPT_EXP << `24`) \| (len << `16`) \|
729	TCPOPT_FASTOPEN_MAGIC);
730	p += TCPOLEN_EXP_FASTOPEN_BASE;
731	} else {
732	len = TCPOLEN_FASTOPEN_BASE + foc->len;
733	*p++ = TCPOPT_FASTOPEN;
734	*p++ = len;
735	}
736
737	memcpy(p, foc->val, foc->len);
738	if ((len & `3`) == `2`) {
739	p[foc->len] = TCPOPT_NOP;
740	p[foc->len + `1`] = TCPOPT_NOP;
741	}
742	ptr += (len + `3`) >> `2`;
743	}
744
745	smc_options_write(ptr, options: &options);
746
747	mptcp_options_write(th, ptr, tp, opts);
748	}
749
750	static void smc_set_option(const struct tcp_sock *tp,
751	struct tcp_out_options *opts,
752	unsigned int *remaining)
753	{
754	#if IS_ENABLED(CONFIG_SMC)
755	if (static_branch_unlikely(&tcp_have_smc)) {
756	if (tp->syn_smc) {
757	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
758	opts->options \|= OPTION_SMC;
759	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
760	}
761	}
762	}
763	#endif
764	}
765
766	static void smc_set_option_cond(const struct tcp_sock *tp,
767	const struct inet_request_sock *ireq,
768	struct tcp_out_options *opts,
769	unsigned int *remaining)
770	{
771	#if IS_ENABLED(CONFIG_SMC)
772	if (static_branch_unlikely(&tcp_have_smc)) {
773	if (tp->syn_smc && ireq->smc_ok) {
774	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
775	opts->options \|= OPTION_SMC;
776	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
777	}
778	}
779	}
780	#endif
781	}
782
783	static void mptcp_set_option_cond(const struct request_sock *req,
784	struct tcp_out_options *opts,
785	unsigned int *remaining)
786	{
787	if (rsk_is_mptcp(req)) {
788	unsigned int size;
789
790	if (mptcp_synack_options(req, size: &size, opts: &opts->mptcp)) {
791	if (*remaining >= size) {
792	opts->options \|= OPTION_MPTCP;
793	*remaining -= size;
794	}
795	}
796	}
797	}
798
799	/ Compute TCP options for SYN packets. This is not the final*
800	* network wire format yet.
801	*/
802	static unsigned int tcp_syn_options(struct sock sk, struct* sk_buff *skb,
803	struct tcp_out_options *opts,
804	struct tcp_key *key)
805	{
806	struct tcp_sock *tp = tcp_sk(sk);
807	unsigned int remaining = MAX_TCP_OPTION_SPACE;
808	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
809	bool timestamps;
810
811	/ Better than switch (key.type) as it has static branches /
812	if (tcp_key_is_md5(key)) {
813	timestamps = false;
814	opts->options \|= OPTION_MD5;
815	remaining -= TCPOLEN_MD5SIG_ALIGNED;
816	} else {
817	timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
818	if (tcp_key_is_ao(key)) {
819	opts->options \|= OPTION_AO;
820	remaining -= tcp_ao_len(key: key->ao_key);
821	}
822	}
823
824	/ We always get an MSS option. The option bytes which will be seen in*
825	* normal data packets should timestamps be used, must be in the MSS
826	* advertised. But we subtract them from tp->mss_cache so that
827	* calculations in tcp_sendmsg are simpler etc. So account for this
828	* fact here if necessary. If we don't do this correctly, as a
829	* receiver we won't recognize data packets as being full sized when we
830	* should, and thus we won't abide by the delayed ACK rules correctly.
831	* SACKs don't matter, we never delay an ACK when we have any of those
832	* going out. */
833	opts->mss = tcp_advertise_mss(sk);
834	remaining -= TCPOLEN_MSS_ALIGNED;
835
836	if (likely(timestamps)) {
837	opts->options \|= OPTION_TS;
838	opts->tsval = tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb) + tp->tsoffset;
839	opts->tsecr = tp->rx_opt.ts_recent;
840	remaining -= TCPOLEN_TSTAMP_ALIGNED;
841	}
842	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
843	opts->ws = tp->rx_opt.rcv_wscale;
844	opts->options \|= OPTION_WSCALE;
845	remaining -= TCPOLEN_WSCALE_ALIGNED;
846	}
847	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
848	opts->options \|= OPTION_SACK_ADVERTISE;
849	if (unlikely(!(OPTION_TS & opts->options)))
850	remaining -= TCPOLEN_SACKPERM_ALIGNED;
851	}
852
853	if (fastopen && fastopen->cookie.len >= `0`) {
854	u32 need = fastopen->cookie.len;
855
856	need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
857	TCPOLEN_FASTOPEN_BASE;
858	need = (need + `3`) & ~`3U`; / Align to 32 bits /
859	if (remaining >= need) {
860	opts->options \|= OPTION_FAST_OPEN_COOKIE;
861	opts->fastopen_cookie = &fastopen->cookie;
862	remaining -= need;
863	tp->syn_fastopen = `1`;
864	tp->syn_fastopen_exp = fastopen->cookie.exp ? `1` : `0`;
865	}
866	}
867
868	smc_set_option(tp, opts, remaining: &remaining);
869
870	if (sk_is_mptcp(sk)) {
871	unsigned int size;
872
873	if (mptcp_syn_options(sk, skb, size: &size, opts: &opts->mptcp)) {
874	opts->options \|= OPTION_MPTCP;
875	remaining -= size;
876	}
877	}
878
879	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, synack_type: `0`, opts, remaining: &remaining);
880
881	return MAX_TCP_OPTION_SPACE - remaining;
882	}
883
884	/ Set up TCP options for SYN-ACKs. /
885	static unsigned int tcp_synack_options(const struct sock *sk,
886	struct request_sock *req,
887	unsigned int mss, struct sk_buff *skb,
888	struct tcp_out_options *opts,
889	const struct tcp_key *key,
890	struct tcp_fastopen_cookie *foc,
891	enum tcp_synack_type synack_type,
892	struct sk_buff *syn_skb)
893	{
894	struct inet_request_sock *ireq = inet_rsk(sk: req);
895	unsigned int remaining = MAX_TCP_OPTION_SPACE;
896
897	if (tcp_key_is_md5(key)) {
898	opts->options \|= OPTION_MD5;
899	remaining -= TCPOLEN_MD5SIG_ALIGNED;
900
901	/ We can't fit any SACK blocks in a packet with MD5 + TS*
902	* options. There was discussion about disabling SACK
903	* rather than TS in order to fit in better with old,
904	* buggy kernels, but that was deemed to be unnecessary.
905	*/
906	if (synack_type != TCP_SYNACK_COOKIE)
907	ireq->tstamp_ok &= !ireq->sack_ok;
908	} else if (tcp_key_is_ao(key)) {
909	opts->options \|= OPTION_AO;
910	remaining -= tcp_ao_len(key: key->ao_key);
911	ireq->tstamp_ok &= !ireq->sack_ok;
912	}
913
914	/ We always send an MSS option. /
915	opts->mss = mss;
916	remaining -= TCPOLEN_MSS_ALIGNED;
917
918	if (likely(ireq->wscale_ok)) {
919	opts->ws = ireq->rcv_wscale;
920	opts->options \|= OPTION_WSCALE;
921	remaining -= TCPOLEN_WSCALE_ALIGNED;
922	}
923	if (likely(ireq->tstamp_ok)) {
924	opts->options \|= OPTION_TS;
925	opts->tsval = tcp_skb_timestamp_ts(usec_ts: tcp_rsk(req)->req_usec_ts, skb) +
926	tcp_rsk(req)->ts_off;
927	opts->tsecr = READ_ONCE(req->ts_recent);
928	remaining -= TCPOLEN_TSTAMP_ALIGNED;
929	}
930	if (likely(ireq->sack_ok)) {
931	opts->options \|= OPTION_SACK_ADVERTISE;
932	if (unlikely(!ireq->tstamp_ok))
933	remaining -= TCPOLEN_SACKPERM_ALIGNED;
934	}
935	if (foc != NULL && foc->len >= `0`) {
936	u32 need = foc->len;
937
938	need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
939	TCPOLEN_FASTOPEN_BASE;
940	need = (need + `3`) & ~`3U`; / Align to 32 bits /
941	if (remaining >= need) {
942	opts->options \|= OPTION_FAST_OPEN_COOKIE;
943	opts->fastopen_cookie = foc;
944	remaining -= need;
945	}
946	}
947
948	mptcp_set_option_cond(req, opts, remaining: &remaining);
949
950	smc_set_option_cond(tcp_sk(sk), ireq, opts, remaining: &remaining);
951
952	bpf_skops_hdr_opt_len(sk: (struct sock *)sk, skb, req, syn_skb,
953	synack_type, opts, remaining: &remaining);
954
955	return MAX_TCP_OPTION_SPACE - remaining;
956	}
957
958	/ Compute TCP options for ESTABLISHED sockets. This is not the*
959	* final wire format yet.
960	*/
961	static unsigned int tcp_established_options(struct sock sk, struct* sk_buff *skb,
962	struct tcp_out_options *opts,
963	struct tcp_key *key)
964	{
965	struct tcp_sock *tp = tcp_sk(sk);
966	unsigned int size = `0`;
967	unsigned int eff_sacks;
968
969	opts->options = `0`;
970
971	/ Better than switch (key.type) as it has static branches /
972	if (tcp_key_is_md5(key)) {
973	opts->options \|= OPTION_MD5;
974	size += TCPOLEN_MD5SIG_ALIGNED;
975	} else if (tcp_key_is_ao(key)) {
976	opts->options \|= OPTION_AO;
977	size += tcp_ao_len(key: key->ao_key);
978	}
979
980	if (likely(tp->rx_opt.tstamp_ok)) {
981	opts->options \|= OPTION_TS;
982	opts->tsval = skb ? tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb) +
983	tp->tsoffset : `0`;
984	opts->tsecr = tp->rx_opt.ts_recent;
985	size += TCPOLEN_TSTAMP_ALIGNED;
986	}
987
988	/ MPTCP options have precedence over SACK for the limited TCP*
989	* option space because a MPTCP connection would be forced to
990	* fall back to regular TCP if a required multipath option is
991	* missing. SACK still gets a chance to use whatever space is
992	* left.
993	*/
994	if (sk_is_mptcp(sk)) {
995	unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
996	unsigned int opt_size = `0`;
997
998	if (mptcp_established_options(sk, skb, size: &opt_size, remaining,
999	opts: &opts->mptcp)) {
1000	opts->options \|= OPTION_MPTCP;
1001	size += opt_size;
1002	}
1003	}
1004
1005	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
1006	if (unlikely(eff_sacks)) {
1007	const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
1008	if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
1009	TCPOLEN_SACK_PERBLOCK))
1010	return size;
1011
1012	opts->num_sack_blocks =
1013	min_t(unsigned int, eff_sacks,
1014	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
1015	TCPOLEN_SACK_PERBLOCK);
1016
1017	size += TCPOLEN_SACK_BASE_ALIGNED +
1018	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
1019	}
1020
1021	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
1022	BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
1023	unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
1024
1025	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, synack_type: `0`, opts, remaining: &remaining);
1026
1027	size = MAX_TCP_OPTION_SPACE - remaining;
1028	}
1029
1030	return size;
1031	}
1032
1033
1034	/ TCP SMALL QUEUES (TSQ)*
1035	*
1036	* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
1037	* to reduce RTT and bufferbloat.
1038	* We do this using a special skb destructor (tcp_wfree).
1039	*
1040	* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
1041	* needs to be reallocated in a driver.
1042	* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
1043	*
1044	* Since transmit from skb destructor is forbidden, we use a tasklet
1045	* to process all sockets that eventually need to send more skbs.
1046	* We use one tasklet per cpu, with its own queue of sockets.
1047	*/
1048	struct tsq_tasklet {
1049	struct tasklet_struct tasklet;
1050	struct list_head head; / queue of tcp sockets /
1051	};
1052	static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
1053
1054	static void tcp_tsq_write(struct sock *sk)
1055	{
1056	if ((`1` << sk->sk_state) &
1057	(TCPF_ESTABLISHED \| TCPF_FIN_WAIT1 \| TCPF_CLOSING \|
1058	TCPF_CLOSE_WAIT \| TCPF_LAST_ACK)) {
1059	struct tcp_sock *tp = tcp_sk(sk);
1060
1061	if (tp->lost_out > tp->retrans_out &&
1062	tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
1063	tcp_mstamp_refresh(tp);
1064	tcp_xmit_retransmit_queue(sk);
1065	}
1066
1067	tcp_write_xmit(sk, mss_now: tcp_current_mss(sk), nonagle: tp->nonagle,
1068	push_one: `0`, GFP_ATOMIC);
1069	}
1070	}
1071
1072	static void tcp_tsq_handler(struct sock *sk)
1073	{
1074	bh_lock_sock(sk);
1075	if (!sock_owned_by_user(sk))
1076	tcp_tsq_write(sk);
1077	else if (!test_and_set_bit(nr: TCP_TSQ_DEFERRED, addr: &sk->sk_tsq_flags))
1078	sock_hold(sk);
1079	bh_unlock_sock(sk);
1080	}
1081	/*
1082	* One tasklet per cpu tries to send more skbs.
1083	* We run in tasklet context but need to disable irqs when
1084	* transferring tsq->head because tcp_wfree() might
1085	* interrupt us (non NAPI drivers)
1086	*/
1087	static void tcp_tasklet_func(struct tasklet_struct *t)
1088	{
1089	struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
1090	LIST_HEAD(list);
1091	unsigned long flags;
1092	struct list_head q, n;
1093	struct tcp_sock *tp;
1094	struct sock *sk;
1095
1096	local_irq_save(flags);
1097	list_splice_init(list: &tsq->head, head: &list);
1098	local_irq_restore(flags);
1099
1100	list_for_each_safe(q, n, &list) {
1101	tp = list_entry(q, struct tcp_sock, tsq_node);
1102	list_del(entry: &tp->tsq_node);
1103
1104	sk = (struct sock *)tp;
1105	smp_mb__before_atomic();
1106	clear_bit(nr: TSQ_QUEUED, addr: &sk->sk_tsq_flags);
1107
1108	tcp_tsq_handler(sk);
1109	sk_free(sk);
1110	}
1111	}
1112
/* All sk_tsq_flags bits that tcp_release_cb() consumes. */
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			  TCPF_WRITE_TIMER_DEFERRED |	\
			  TCPF_DELACK_TIMER_DEFERRED |	\
			  TCPF_MTU_REDUCED_DEFERRED |	\
			  TCPF_ACK_DEFERRED)
1118	/**
1119	* tcp_release_cb - tcp release_sock() callback
1120	* @sk: socket
1121	*
1122	* called from release_sock() to perform protocol dependent
1123	* actions before socket release.
1124	*/
1125	void tcp_release_cb(struct sock *sk)
1126	{
1127	unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
1128	unsigned long nflags;
1129
1130	/ perform an atomic operation only if at least one flag is set /
1131	do {
1132	if (!(flags & TCP_DEFERRED_ALL))
1133	return;
1134	nflags = flags & ~TCP_DEFERRED_ALL;
1135	} while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
1136
1137	if (flags & TCPF_TSQ_DEFERRED) {
1138	tcp_tsq_write(sk);
1139	__sock_put(sk);
1140	}
1141
1142	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
1143	tcp_write_timer_handler(sk);
1144	__sock_put(sk);
1145	}
1146	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
1147	tcp_delack_timer_handler(sk);
1148	__sock_put(sk);
1149	}
1150	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
1151	inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
1152	__sock_put(sk);
1153	}
1154	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
1155	tcp_send_ack(sk);
1156	}
1157	EXPORT_SYMBOL(tcp_release_cb);
1158
1159	void __init tcp_tasklet_init(void)
1160	{
1161	int i;
1162
1163	for_each_possible_cpu(i) {
1164	struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
1165
1166	INIT_LIST_HEAD(list: &tsq->head);
1167	tasklet_setup(t: &tsq->tasklet, callback: tcp_tasklet_func);
1168	}
1169	}
1170
1171	/*
1172	* Write buffer destructor automatically called from kfree_skb.
1173	* We can't xmit new skbs from this context, as we might already
1174	* hold qdisc lock.
1175	*/
1176	void tcp_wfree(struct sk_buff *skb)
1177	{
1178	struct sock *sk = skb->sk;
1179	struct tcp_sock *tp = tcp_sk(sk);
1180	unsigned long flags, nval, oval;
1181	struct tsq_tasklet *tsq;
1182	bool empty;
1183
1184	/ Keep one reference on sk_wmem_alloc.*
1185	* Will be released by sk_free() from here or tcp_tasklet_func()
1186	*/
1187	WARN_ON(refcount_sub_and_test(skb->truesize - `1`, &sk->sk_wmem_alloc));
1188
1189	/ If this softirq is serviced by ksoftirqd, we are likely under stress.*
1190	* Wait until our queues (qdisc + devices) are drained.
1191	* This gives :
1192	* - less callbacks to tcp_write_xmit(), reducing stress (batches)
1193	* - chance for incoming ACK (processed by another cpu maybe)
1194	* to migrate this flow (skb->ooo_okay will be eventually set)
1195	*/
1196	if (refcount_read(r: &sk->sk_wmem_alloc) >= SKB_TRUESIZE(`1`) && this_cpu_ksoftirqd() == current)
1197	goto out;
1198
1199	oval = smp_load_acquire(&sk->sk_tsq_flags);
1200	do {
1201	if (!(oval & TSQF_THROTTLED) \|\| (oval & TSQF_QUEUED))
1202	goto out;
1203
1204	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED;
1205	} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
1206
1207	/ queue this socket to tasklet queue /
1208	local_irq_save(flags);
1209	tsq = this_cpu_ptr(&tsq_tasklet);
1210	empty = list_empty(head: &tsq->head);
1211	list_add(new: &tp->tsq_node, head: &tsq->head);
1212	if (empty)
1213	tasklet_schedule(t: &tsq->tasklet);
1214	local_irq_restore(flags);
1215	return;
1216	out:
1217	sk_free(sk);
1218	}
1219
1220	/ Note: Called under soft irq.*
1221	* We can call TCP stack right away, unless socket is owned by user.
1222	*/
1223	enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1224	{
1225	struct tcp_sock tp = container_of(timer, struct* tcp_sock, pacing_timer);
1226	struct sock sk = (struct* sock *)tp;
1227
1228	tcp_tsq_handler(sk);
1229	sock_put(sk);
1230
1231	return HRTIMER_NORESTART;
1232	}
1233
1234	static void tcp_update_skb_after_send(struct sock sk, struct* sk_buff *skb,
1235	u64 prior_wstamp)
1236	{
1237	struct tcp_sock *tp = tcp_sk(sk);
1238
1239	if (sk->sk_pacing_status != SK_PACING_NONE) {
1240	unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
1241
1242	/ Original sch_fq does not pace first 10 MSS*
1243	* Note that tp->data_segs_out overflows after 2^32 packets,
1244	* this is a minor annoyance.
1245	*/
1246	if (rate != ~`0UL` && rate && tp->data_segs_out >= `10`) {
1247	u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1248	u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1249
1250	/ take into account OS jitter /
1251	len_ns -= min_t(u64, len_ns / `2`, credit);
1252	tp->tcp_wstamp_ns += len_ns;
1253	}
1254	}
1255	list_move_tail(list: &skb->tcp_tsorted_anchor, head: &tp->tsorted_sent_queue);
1256	}
1257
1258	INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock sk, struct* sk_buff skb, struct* flowi *fl));
1259	INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock sk, struct* sk_buff skb, struct* flowi *fl));
1260	INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock sk, struct* sk_buff *skb));
1261
1262	/ This routine actually transmits TCP packets queued in by*
1263	* tcp_do_sendmsg(). This is used by both the initial
1264	* transmission and possible later retransmissions.
1265	* All SKB's seen here are completely headerless. It is our
1266	* job to build the TCP header, and pass the packet down to
1267	* IP so it can do the same plus pass the packet off to the
1268	* device.
1269	*
1270	* We are working here with either a clone of the original
1271	* SKB, or a fresh unique copy made by the retransmit engine.
1272	*/
1273	static int __tcp_transmit_skb(struct sock sk, struct* sk_buff *skb,
1274	int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1275	{
1276	const struct inet_connection_sock *icsk = inet_csk(sk);
1277	struct inet_sock *inet;
1278	struct tcp_sock *tp;
1279	struct tcp_skb_cb *tcb;
1280	struct tcp_out_options opts;
1281	unsigned int tcp_options_size, tcp_header_size;
1282	struct sk_buff *oskb = NULL;
1283	struct tcp_key key;
1284	struct tcphdr *th;
1285	u64 prior_wstamp;
1286	int err;
1287
1288	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
1289	tp = tcp_sk(sk);
1290	prior_wstamp = tp->tcp_wstamp_ns;
1291	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1292	skb_set_delivery_time(skb, kt: tp->tcp_wstamp_ns, mono: true);
1293	if (clone_it) {
1294	oskb = skb;
1295
1296	tcp_skb_tsorted_save(oskb) {
1297	if (unlikely(skb_cloned(oskb)))
1298	skb = pskb_copy(skb: oskb, gfp_mask);
1299	else
1300	skb = skb_clone(skb: oskb, priority: gfp_mask);
1301	} tcp_skb_tsorted_restore(oskb);
1302
1303	if (unlikely(!skb))
1304	return -ENOBUFS;
1305	/ retransmit skbs might have a non zero value in skb->dev*
1306	* because skb->dev is aliased with skb->rbnode.rb_left
1307	*/
1308	skb->dev = NULL;
1309	}
1310
1311	inet = inet_sk(sk);
1312	tcb = TCP_SKB_CB(skb);
1313	memset(&opts, `0`, sizeof(opts));
1314
1315	tcp_get_current_key(sk, out: &key);
1316	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
1317	tcp_options_size = tcp_syn_options(sk, skb, opts: &opts, key: &key);
1318	} else {
1319	tcp_options_size = tcp_established_options(sk, skb, opts: &opts, key: &key);
1320	/ Force a PSH flag on all (GSO) packets to expedite GRO flush*
1321	* at receiver : This slightly improve GRO performance.
1322	* Note that we do not force the PSH flag for non GSO packets,
1323	* because they might be sent under high congestion events,
1324	* and in this case it is better to delay the delivery of 1-MSS
1325	* packets and thus the corresponding ACK packet that would
1326	* release the following packet.
1327	*/
1328	if (tcp_skb_pcount(skb) > `1`)
1329	tcb->tcp_flags \|= TCPHDR_PSH;
1330	}
1331	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1332
1333	/ We set skb->ooo_okay to one if this packet can select*
1334	* a different TX queue than prior packets of this flow,
1335	* to avoid self inflicted reorders.
1336	* The 'other' queue decision is based on current cpu number
1337	* if XPS is enabled, or sk->sk_txhash otherwise.
1338	* We can switch to another (and better) queue if:
1339	* 1) No packet with payload is in qdisc/device queues.
1340	* Delays in TX completion can defeat the test
1341	* even if packets were already sent.
1342	* 2) Or rtx queue is empty.
1343	* This mitigates above case if ACK packets for
1344	* all prior packets were already processed.
1345	*/
1346	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(`1`) \|\|
1347	tcp_rtx_queue_empty(sk);
1348
1349	/ If we had to use memory reserve to allocate this skb,*
1350	* this might cause drops if packet is looped back :
1351	* Other socket might not have SOCK_MEMALLOC.
1352	* Packets not looped back do not care about pfmemalloc.
1353	*/
1354	skb->pfmemalloc = `0`;
1355
1356	skb_push(skb, len: tcp_header_size);
1357	skb_reset_transport_header(skb);
1358
1359	skb_orphan(skb);
1360	skb->sk = sk;
1361	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1362	refcount_add(i: skb->truesize, r: &sk->sk_wmem_alloc);
1363
1364	skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
1365
1366	/ Build TCP header and checksum it. /
1367	th = (struct tcphdr *)skb->data;
1368	th->source = inet->inet_sport;
1369	th->dest = inet->inet_dport;
1370	th->seq = htonl(tcb->seq);
1371	th->ack_seq = htonl(rcv_nxt);
1372	(((__be16 )th) + `6`) = htons(((tcp_header_size >> `2`) << `12`) \|
1373	tcb->tcp_flags);
1374
1375	th->check = `0`;
1376	th->urg_ptr = `0`;
1377
1378	/ The urg_mode check is necessary during a below snd_una win probe /
1379	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1380	if (before(seq1: tp->snd_up, seq2: tcb->seq + `0x10000`)) {
1381	th->urg_ptr = htons(tp->snd_up - tcb->seq);
1382	th->urg = `1`;
1383	} else if (after(tcb->seq + `0xFFFF`, tp->snd_nxt)) {
1384	th->urg_ptr = htons(`0xFFFF`);
1385	th->urg = `1`;
1386	}
1387	}
1388
1389	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1390	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1391	th->window = htons(tcp_select_window(sk));
1392	tcp_ecn_send(sk, skb, th, tcp_header_len: tcp_header_size);
1393	} else {
1394	/ RFC1323: The window in SYN & SYN/ACK segments*
1395	* is never scaled.
1396	*/
1397	th->window = htons(min(tp->rcv_wnd, `65535U`));
1398	}
1399
1400	tcp_options_write(th, tp, NULL, opts: &opts, key: &key);
1401
1402	if (tcp_key_is_md5(key: &key)) {
1403	#ifdef CONFIG_TCP_MD5SIG
1404	/ Calculate the MD5 hash, as we have all we need now /
1405	sk_gso_disable(sk);
1406	tp->af_specific->calc_md5_hash(opts.hash_location,
1407	key.md5_key, sk, skb);
1408	#endif
1409	} else if (tcp_key_is_ao(key: &key)) {
1410	int err;
1411
1412	err = tcp_ao_transmit_skb(sk, skb, key: key.ao_key, th,
1413	hash_location: opts.hash_location);
1414	if (err) {
1415	kfree_skb_reason(skb, reason: SKB_DROP_REASON_NOT_SPECIFIED);
1416	return -ENOMEM;
1417	}
1418	}
1419
1420	/ BPF prog is the last one writing header option /
1421	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, synack_type: `0`, opts: &opts);
1422
1423	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1424	tcp_v6_send_check, tcp_v4_send_check,
1425	sk, skb);
1426
1427	if (likely(tcb->tcp_flags & TCPHDR_ACK))
1428	tcp_event_ack_sent(sk, rcv_nxt);
1429
1430	if (skb->len != tcp_header_size) {
1431	tcp_event_data_sent(tp, sk);
1432	tp->data_segs_out += tcp_skb_pcount(skb);
1433	tp->bytes_sent += skb->len - tcp_header_size;
1434	}
1435
1436	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
1437	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1438	tcp_skb_pcount(skb));
1439
1440	tp->segs_out += tcp_skb_pcount(skb);
1441	skb_set_hash_from_sk(skb, sk);
1442	/ OK, its time to fill skb_shinfo(skb)->gso_{segs\|size} /
1443	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1444	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1445
1446	/ Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) /
1447
1448	/ Cleanup our debris for IP stacks /
1449	memset(skb->cb, `0`, max(sizeof(struct inet_skb_parm),
1450	sizeof(struct inet6_skb_parm)));
1451
1452	tcp_add_tx_delay(skb, tp);
1453
1454	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1455	inet6_csk_xmit, ip_queue_xmit,
1456	sk, skb, &inet->cork.fl);
1457
1458	if (unlikely(err > `0`)) {
1459	tcp_enter_cwr(sk);
1460	err = net_xmit_eval(err);
1461	}
1462	if (!err && oskb) {
1463	tcp_update_skb_after_send(sk, skb: oskb, prior_wstamp);
1464	tcp_rate_skb_sent(sk, skb: oskb);
1465	}
1466	return err;
1467	}
1468
1469	static int tcp_transmit_skb(struct sock sk, struct* sk_buff skb, int* clone_it,
1470	gfp_t gfp_mask)
1471	{
1472	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1473	tcp_sk(sk)->rcv_nxt);
1474	}
1475
1476	/ This routine just queues the buffer for sending.*
1477	*
1478	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
1479	* otherwise socket can stall.
1480	*/
1481	static void tcp_queue_skb(struct sock sk, struct* sk_buff *skb)
1482	{
1483	struct tcp_sock *tp = tcp_sk(sk);
1484
1485	/ Advance write_seq and place onto the write_queue. /
1486	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1487	__skb_header_release(skb);
1488	tcp_add_write_queue_tail(sk, skb);
1489	sk_wmem_queued_add(sk, val: skb->truesize);
1490	sk_mem_charge(sk, size: skb->truesize);
1491	}
1492
1493	/ Initialize TSO segments for a packet. /
1494	static void tcp_set_skb_tso_segs(struct sk_buff skb, unsigned* int mss_now)
1495	{
1496	if (skb->len <= mss_now) {
1497	/ Avoid the costly divide in the normal*
1498	* non-TSO case.
1499	*/
1500	tcp_skb_pcount_set(skb, segs: `1`);
1501	TCP_SKB_CB(skb)->tcp_gso_size = `0`;
1502	} else {
1503	tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1504	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1505	}
1506	}
1507
1508	/ Pcount in the middle of the write queue got changed, we need to do various*
1509	* tweaks to fix counters
1510	*/
1511	static void tcp_adjust_pcount(struct sock sk, const* struct sk_buff skb, int* decr)
1512	{
1513	struct tcp_sock *tp = tcp_sk(sk);
1514
1515	tp->packets_out -= decr;
1516
1517	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1518	tp->sacked_out -= decr;
1519	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1520	tp->retrans_out -= decr;
1521	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1522	tp->lost_out -= decr;
1523
1524	/ Reno case is special. Sigh... /
1525	if (tcp_is_reno(tp) && decr > `0`)
1526	tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1527
1528	if (tp->lost_skb_hint &&
1529	before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1530	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1531	tp->lost_cnt_hint -= decr;
1532
1533	tcp_verify_left_out(tp);
1534	}
1535
1536	static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1537	{
1538	return TCP_SKB_CB(skb)->txstamp_ack \|\|
1539	(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1540	}
1541
1542	static void tcp_fragment_tstamp(struct sk_buff skb, struct* sk_buff *skb2)
1543	{
1544	struct skb_shared_info *shinfo = skb_shinfo(skb);
1545
1546	if (unlikely(tcp_has_tx_tstamp(skb)) &&
1547	!before(seq1: shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1548	struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1549	u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1550
1551	shinfo->tx_flags &= ~tsflags;
1552	shinfo2->tx_flags \|= tsflags;
1553	swap(shinfo->tskey, shinfo2->tskey);
1554	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1555	TCP_SKB_CB(skb)->txstamp_ack = `0`;
1556	}
1557	}
1558
1559	static void tcp_skb_fragment_eor(struct sk_buff skb, struct* sk_buff *skb2)
1560	{
1561	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1562	TCP_SKB_CB(skb)->eor = `0`;
1563	}
1564
1565	/ Insert buff after skb on the write or rtx queue of sk. /
1566	static void tcp_insert_write_queue_after(struct sk_buff *skb,
1567	struct sk_buff *buff,
1568	struct sock *sk,
1569	enum tcp_queue tcp_queue)
1570	{
1571	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1572	__skb_queue_after(list: &sk->sk_write_queue, prev: skb, newsk: buff);
1573	else
1574	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: buff);
1575	}
1576
1577	/ Function to create two new TCP segments. Shrinks the given segment*
1578	* to the specified size and appends a new segment with the rest of the
1579	* packet to the list. This won't be called frequently, I hope.
1580	* Remember, these are still headerless SKBs at this point.
1581	*/
1582	int tcp_fragment(struct sock sk, enum* tcp_queue tcp_queue,
1583	struct sk_buff *skb, u32 len,
1584	unsigned int mss_now, gfp_t gfp)
1585	{
1586	struct tcp_sock *tp = tcp_sk(sk);
1587	struct sk_buff *buff;
1588	int old_factor;
1589	long limit;
1590	int nlen;
1591	u8 flags;
1592
1593	if (WARN_ON(len > skb->len))
1594	return -EINVAL;
1595
1596	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
1597
1598	/ tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.*
1599	* We need some allowance to not penalize applications setting small
1600	* SO_SNDBUF values.
1601	* Also allow first and last skb in retransmit queue to be split.
1602	*/
1603	limit = sk->sk_sndbuf + `2` * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
1604	if (unlikely((sk->sk_wmem_queued >> `1`) > limit &&
1605	tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1606	skb != tcp_rtx_queue_head(sk) &&
1607	skb != tcp_rtx_queue_tail(sk))) {
1608	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1609	return -ENOMEM;
1610	}
1611
1612	if (skb_unclone_keeptruesize(skb, pri: gfp))
1613	return -ENOMEM;
1614
1615	/ Get a new skb... force flag on. /
1616	buff = tcp_stream_alloc_skb(sk, gfp, force_schedule: true);
1617	if (!buff)
1618	return -ENOMEM; / We'll just try again later. /
1619	skb_copy_decrypted(to: buff, from: skb);
1620	mptcp_skb_ext_copy(to: buff, from: skb);
1621
1622	sk_wmem_queued_add(sk, val: buff->truesize);
1623	sk_mem_charge(sk, size: buff->truesize);
1624	nlen = skb->len - len;
1625	buff->truesize += nlen;
1626	skb->truesize -= nlen;
1627
1628	/ Correct the sequence numbers. /
1629	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1630	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1631	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1632
1633	/ PSH and FIN should only be set in the second packet. /
1634	flags = TCP_SKB_CB(skb)->tcp_flags;
1635	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
1636	TCP_SKB_CB(buff)->tcp_flags = flags;
1637	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1638	tcp_skb_fragment_eor(skb, skb2: buff);
1639
1640	skb_split(skb, skb1: buff, len);
1641
1642	skb_set_delivery_time(skb: buff, kt: skb->tstamp, mono: true);
1643	tcp_fragment_tstamp(skb, skb2: buff);
1644
1645	old_factor = tcp_skb_pcount(skb);
1646
1647	/ Fix up tso_factor for both original and new SKB. /
1648	tcp_set_skb_tso_segs(skb, mss_now);
1649	tcp_set_skb_tso_segs(skb: buff, mss_now);
1650
1651	/ Update delivered info for the new segment /
1652	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1653
1654	/ If this packet has been sent out already, we must*
1655	* adjust the various packet counters.
1656	*/
1657	if (!before(seq1: tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1658	int diff = old_factor - tcp_skb_pcount(skb) -
1659	tcp_skb_pcount(skb: buff);
1660
1661	if (diff)
1662	tcp_adjust_pcount(sk, skb, decr: diff);
1663	}
1664
1665	/ Link BUFF into the send queue. /
1666	__skb_header_release(skb: buff);
1667	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1668	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1669	list_add(new: &buff->tcp_tsorted_anchor, head: &skb->tcp_tsorted_anchor);
1670
1671	return `0`;
1672	}
1673
1674	/ This is similar to __pskb_pull_tail(). The difference is that pulled*
1675	* data is not copied, but immediately discarded.
1676	*/
1677	static int __pskb_trim_head(struct sk_buff skb, int* len)
1678	{
1679	struct skb_shared_info *shinfo;
1680	int i, k, eat;
1681
1682	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
1683	eat = len;
1684	k = `0`;
1685	shinfo = skb_shinfo(skb);
1686	for (i = `0`; i < shinfo->nr_frags; i++) {
1687	int size = skb_frag_size(frag: &shinfo->frags[i]);
1688
1689	if (size <= eat) {
1690	skb_frag_unref(skb, f: i);
1691	eat -= size;
1692	} else {
1693	shinfo->frags[k] = shinfo->frags[i];
1694	if (eat) {
1695	skb_frag_off_add(frag: &shinfo->frags[k], delta: eat);
1696	skb_frag_size_sub(frag: &shinfo->frags[k], delta: eat);
1697	eat = `0`;
1698	}
1699	k++;
1700	}
1701	}
1702	shinfo->nr_frags = k;
1703
1704	skb->data_len -= len;
1705	skb->len = skb->data_len;
1706	return len;
1707	}
1708
1709	/ Remove acked data from a packet in the transmit queue. /
1710	int tcp_trim_head(struct sock sk, struct* sk_buff *skb, u32 len)
1711	{
1712	u32 delta_truesize;
1713
1714	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
1715	return -ENOMEM;
1716
1717	delta_truesize = __pskb_trim_head(skb, len);
1718
1719	TCP_SKB_CB(skb)->seq += len;
1720
1721	skb->truesize -= delta_truesize;
1722	sk_wmem_queued_add(sk, val: -delta_truesize);
1723	if (!skb_zcopy_pure(skb))
1724	sk_mem_uncharge(sk, size: delta_truesize);
1725
1726	/ Any change of skb->len requires recalculation of tso factor. /
1727	if (tcp_skb_pcount(skb) > `1`)
1728	tcp_set_skb_tso_segs(skb, mss_now: tcp_skb_mss(skb));
1729
1730	return `0`;
1731	}
1732
1733	/ Calculate MSS not accounting any TCP options. /
1734	static inline int __tcp_mtu_to_mss(struct sock sk, int* pmtu)
1735	{
1736	const struct tcp_sock *tp = tcp_sk(sk);
1737	const struct inet_connection_sock *icsk = inet_csk(sk);
1738	int mss_now;
1739
1740	/ Calculate base mss without TCP options:*
1741	It is MMS_S - sizeof(tcphdr) of rfc1122
1742	*/
1743	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1744
1745	/ Clamp it (mss_clamp does not include tcp options) /
1746	if (mss_now > tp->rx_opt.mss_clamp)
1747	mss_now = tp->rx_opt.mss_clamp;
1748
1749	/ Now subtract optional transport overhead /
1750	mss_now -= icsk->icsk_ext_hdr_len;
1751
1752	/ Then reserve room for full set of TCP options and 8 bytes of data /
1753	mss_now = max(mss_now,
1754	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
1755	return mss_now;
1756	}
1757
1758	/ Calculate MSS. Not accounting for SACKs here. /
1759	int tcp_mtu_to_mss(struct sock sk, int* pmtu)
1760	{
1761	/ Subtract TCP options size, not including SACKs /
1762	return __tcp_mtu_to_mss(sk, pmtu) -
1763	(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1764	}
1765	EXPORT_SYMBOL(tcp_mtu_to_mss);
1766
1767	/ Inverse of above /
1768	int tcp_mss_to_mtu(struct sock sk, int* mss)
1769	{
1770	const struct tcp_sock *tp = tcp_sk(sk);
1771	const struct inet_connection_sock *icsk = inet_csk(sk);
1772
1773	return mss +
1774	tp->tcp_header_len +
1775	icsk->icsk_ext_hdr_len +
1776	icsk->icsk_af_ops->net_header_len;
1777	}
1778	EXPORT_SYMBOL(tcp_mss_to_mtu);
1779
1780	/ MTU probing init per socket /
1781	void tcp_mtup_init(struct sock *sk)
1782	{
1783	struct tcp_sock *tp = tcp_sk(sk);
1784	struct inet_connection_sock *icsk = inet_csk(sk);
1785	struct net *net = sock_net(sk);
1786
1787	icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > `1`;
1788	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1789	icsk->icsk_af_ops->net_header_len;
1790	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
1791	icsk->icsk_mtup.probe_size = `0`;
1792	if (icsk->icsk_mtup.enabled)
1793	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1794	}
1795	EXPORT_SYMBOL(tcp_mtup_init);
1796
1797	/ This function synchronize snd mss to current pmtu/exthdr set.*
1798
1799	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
1800	for TCP options, but includes only bare TCP header.
1801
1802	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1803	It is minimum of user_mss and mss received with SYN.
1804	It also does not include TCP options.
1805
1806	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1807
1808	tp->mss_cache is current effective sending mss, including
1809	all tcp options except for SACKs. It is evaluated,
1810	taking into account current pmtu, but never exceeds
1811	tp->rx_opt.mss_clamp.
1812
1813	NOTE1. rfc1122 clearly states that advertised MSS
1814	DOES NOT include either tcp or ip options.
1815
1816	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1817	are READ ONLY outside this function. --ANK (980731)
1818	*/
1819	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1820	{
1821	struct tcp_sock *tp = tcp_sk(sk);
1822	struct inet_connection_sock *icsk = inet_csk(sk);
1823	int mss_now;
1824
1825	if (icsk->icsk_mtup.search_high > pmtu)
1826	icsk->icsk_mtup.search_high = pmtu;
1827
1828	mss_now = tcp_mtu_to_mss(sk, pmtu);
1829	mss_now = tcp_bound_to_half_wnd(tp, pktsize: mss_now);
1830
1831	/ And store cached results /
1832	icsk->icsk_pmtu_cookie = pmtu;
1833	if (icsk->icsk_mtup.enabled)
1834	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1835	tp->mss_cache = mss_now;
1836
1837	return mss_now;
1838	}
1839	EXPORT_SYMBOL(tcp_sync_mss);
1840
1841	/ Compute the current effective MSS, taking SACKs and IP options,*
1842	* and even PMTU discovery events into account.
1843	*/
1844	unsigned int tcp_current_mss(struct sock *sk)
1845	{
1846	const struct tcp_sock *tp = tcp_sk(sk);
1847	const struct dst_entry *dst = __sk_dst_get(sk);
1848	u32 mss_now;
1849	unsigned int header_len;
1850	struct tcp_out_options opts;
1851	struct tcp_key key;
1852
1853	mss_now = tp->mss_cache;
1854
1855	if (dst) {
1856	u32 mtu = dst_mtu(dst);
1857	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1858	mss_now = tcp_sync_mss(sk, mtu);
1859	}
1860	tcp_get_current_key(sk, out: &key);
1861	header_len = tcp_established_options(sk, NULL, opts: &opts, key: &key) +
1862	sizeof(struct tcphdr);
1863	/ The mss_cache is sized based on tp->tcp_header_len, which assumes*
1864	* some common options. If this is an odd packet (because we have SACK
1865	* blocks etc) then our calculated header_len will be different, and
1866	* we have to adjust mss_now correspondingly */
1867	if (header_len != tp->tcp_header_len) {
1868	int delta = (int) header_len - tp->tcp_header_len;
1869	mss_now -= delta;
1870	}
1871
1872	return mss_now;
1873	}
1874
1875	/ RFC2861, slow part. Adjust cwnd, after it was not full during one rto.*
1876	* As additional protections, we do not touch cwnd in retransmission phases,
1877	* and if application hit its sndbuf limit recently.
1878	*/
1879	static void tcp_cwnd_application_limited(struct sock *sk)
1880	{
1881	struct tcp_sock *tp = tcp_sk(sk);
1882
1883	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1884	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1885	/ Limited by application or receiver window. /
1886	u32 init_win = tcp_init_cwnd(tp, dst: __sk_dst_get(sk));
1887	u32 win_used = max(tp->snd_cwnd_used, init_win);
1888	if (win_used < tcp_snd_cwnd(tp)) {
1889	tp->snd_ssthresh = tcp_current_ssthresh(sk);
1890	tcp_snd_cwnd_set(tp, val: (tcp_snd_cwnd(tp) + win_used) >> `1`);
1891	}
1892	tp->snd_cwnd_used = `0`;
1893	}
1894	tp->snd_cwnd_stamp = tcp_jiffies32;
1895	}
1896
1897	static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1898	{
1899	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1900	struct tcp_sock *tp = tcp_sk(sk);
1901
1902	/ Track the strongest available signal of the degree to which the cwnd*
1903	* is fully utilized. If cwnd-limited then remember that fact for the
1904	* current window. If not cwnd-limited then track the maximum number of
1905	* outstanding packets in the current window. (If cwnd-limited then we
1906	* chose to not update tp->max_packets_out to avoid an extra else
1907	* clause with no functional impact.)
1908	*/
1909	if (!before(seq1: tp->snd_una, seq2: tp->cwnd_usage_seq) \|\|
1910	is_cwnd_limited \|\|
1911	(!tp->is_cwnd_limited &&
1912	tp->packets_out > tp->max_packets_out)) {
1913	tp->is_cwnd_limited = is_cwnd_limited;
1914	tp->max_packets_out = tp->packets_out;
1915	tp->cwnd_usage_seq = tp->snd_nxt;
1916	}
1917
1918	if (tcp_is_cwnd_limited(sk)) {
1919	/ Network is feed fully. /
1920	tp->snd_cwnd_used = `0`;
1921	tp->snd_cwnd_stamp = tcp_jiffies32;
1922	} else {
1923	/ Network starves. /
1924	if (tp->packets_out > tp->snd_cwnd_used)
1925	tp->snd_cwnd_used = tp->packets_out;
1926
1927	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
1928	(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1929	!ca_ops->cong_control)
1930	tcp_cwnd_application_limited(sk);
1931
1932	/ The following conditions together indicate the starvation*
1933	* is caused by insufficient sender buffer:
1934	* 1) just sent some data (see tcp_write_xmit)
1935	* 2) not cwnd limited (this else condition)
1936	* 3) no more data to send (tcp_write_queue_empty())
1937	* 4) application is hitting buffer limit (SOCK_NOSPACE)
1938	*/
1939	if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1940	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1941	(`1` << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
1942	tcp_chrono_start(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
1943	}
1944	}
1945
1946	/ Minshall's variant of the Nagle send check. /
1947	static bool tcp_minshall_check(const struct tcp_sock *tp)
1948	{
1949	return after(tp->snd_sml, tp->snd_una) &&
1950	!after(tp->snd_sml, tp->snd_nxt);
1951	}
1952
1953	/ Update snd_sml if this skb is under mss*
1954	* Note that a TSO packet might end with a sub-mss segment
1955	* The test is really :
1956	* if ((skb->len % mss) != 0)
1957	* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1958	* But we can avoid doing the divide again given we already have
1959	* skb_pcount = skb->len / mss_now
1960	*/
1961	static void tcp_minshall_update(struct tcp_sock tp, unsigned* int mss_now,
1962	const struct sk_buff *skb)
1963	{
1964	if (skb->len < tcp_skb_pcount(skb) * mss_now)
1965	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1966	}
1967
1968	/ Return false, if packet can be sent now without violation Nagle's rules:*
1969	* 1. It is full sized. (provided by caller in %partial bool)
1970	* 2. Or it contains FIN. (already checked by caller)
1971	* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1972	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1973	* With Minshall's modification: all sent small packets are ACKed.
1974	*/
1975	static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1976	int nonagle)
1977	{
1978	return partial &&
1979	((nonagle & TCP_NAGLE_CORK) \|\|
1980	(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1981	}
1982
1983	/ Return how many segs we'd like on a TSO packet,*
1984	* depending on current pacing rate, and how close the peer is.
1985	*
1986	* Rationale is:
1987	* - For close peers, we rather send bigger packets to reduce
1988	* cpu costs, because occasional losses will be repaired fast.
1989	* - For long distance/rtt flows, we would like to get ACK clocking
1990	* with 1 ACK per ms.
1991	*
1992	* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
1993	* in bigger TSO bursts. We we cut the RTT-based allowance in half
1994	* for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
1995	* is below 1500 bytes after 6 * ~500 usec = 3ms.
1996	*/
1997	static u32 tcp_tso_autosize(const struct sock sk, unsigned* int mss_now,
1998	int min_tso_segs)
1999	{
2000	unsigned long bytes;
2001	u32 r;
2002
2003	bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
2004
2005	r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
2006	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
2007	bytes += sk->sk_gso_max_size >> r;
2008
2009	bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
2010
2011	return max_t(u32, bytes / mss_now, min_tso_segs);
2012	}
2013
2014	/ Return the number of segments we want in the skb we are transmitting.*
2015	* See if congestion control module wants to decide; otherwise, autosize.
2016	*/
2017	static u32 tcp_tso_segs(struct sock sk, unsigned* int mss_now)
2018	{
2019	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2020	u32 min_tso, tso_segs;
2021
2022	min_tso = ca_ops->min_tso_segs ?
2023	ca_ops->min_tso_segs(sk) :
2024	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
2025
2026	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso_segs: min_tso);
2027	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
2028	}
2029
2030	/ Returns the portion of skb which can be sent right away /
2031	static unsigned int tcp_mss_split_point(const struct sock *sk,
2032	const struct sk_buff *skb,
2033	unsigned int mss_now,
2034	unsigned int max_segs,
2035	int nonagle)
2036	{
2037	const struct tcp_sock *tp = tcp_sk(sk);
2038	u32 partial, needed, window, max_len;
2039
2040	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2041	max_len = mss_now * max_segs;
2042
2043	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
2044	return max_len;
2045
2046	needed = min(skb->len, window);
2047
2048	if (max_len <= needed)
2049	return max_len;
2050
2051	partial = needed % mss_now;
2052	/ If last segment is not a full MSS, check if Nagle rules allow us*
2053	* to include this last segment in this skb.
2054	* Otherwise, we'll split the skb at last MSS boundary
2055	*/
2056	if (tcp_nagle_check(partial: partial != `0`, tp, nonagle))
2057	return needed - partial;
2058
2059	return needed;
2060	}
2061
2062	/ Can at least one segment of SKB be sent right now, according to the*
2063	* congestion window rules? If so, return how many segments are allowed.
2064	*/
2065	static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
2066	const struct sk_buff *skb)
2067	{
2068	u32 in_flight, cwnd, halfcwnd;
2069
2070	/ Don't be strict about the congestion window for the final FIN. /
2071	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
2072	tcp_skb_pcount(skb) == `1`)
2073	return `1`;
2074
2075	in_flight = tcp_packets_in_flight(tp);
2076	cwnd = tcp_snd_cwnd(tp);
2077	if (in_flight >= cwnd)
2078	return `0`;
2079
2080	/ For better scheduling, ensure we have at least*
2081	* 2 GSO packets in flight.
2082	*/
2083	halfcwnd = max(cwnd >> `1`, `1U`);
2084	return min(halfcwnd, cwnd - in_flight);
2085	}
2086
/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 *
 * Returns the skb's segment count, recomputed when it is zero or when a
 * multi-segment skb was sized for a different MSS than @mss_now.
 */
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_set_skb_tso_segs(skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}
2101
2102
2103	/ Return true if the Nagle test allows this packet to be*
2104	* sent now.
2105	*/
2106	static inline bool tcp_nagle_test(const struct tcp_sock tp, const* struct sk_buff *skb,
2107	unsigned int cur_mss, int nonagle)
2108	{
2109	/ Nagle rule does not apply to frames, which sit in the middle of the*
2110	* write_queue (they have no chances to get new data).
2111	*
2112	* This is implemented in the callers, where they modify the 'nonagle'
2113	* argument based upon the location of SKB in the send queue.
2114	*/
2115	if (nonagle & TCP_NAGLE_PUSH)
2116	return true;
2117
2118	/ Don't use the nagle rule for urgent data (or for the final FIN). /
2119	if (tcp_urg_mode(tp) \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
2120	return true;
2121
2122	if (!tcp_nagle_check(partial: skb->len < cur_mss, tp, nonagle))
2123	return true;
2124
2125	return false;
2126	}
2127
2128	/ Does at least the first segment of SKB fit into the send window? /
2129	static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
2130	const struct sk_buff *skb,
2131	unsigned int cur_mss)
2132	{
2133	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
2134
2135	if (skb->len > cur_mss)
2136	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
2137
2138	return !after(end_seq, tcp_wnd_end(tp));
2139	}
2140
2141	/ Trim TSO SKB to LEN bytes, put the remaining data into a new packet*
2142	* which is put after SKB on the list. It is very much like
2143	* tcp_fragment() except that it may make several kinds of assumptions
2144	* in order to speed up the splitting operation. In particular, we
2145	* know that all the data is in scatter-gather pages, and that the
2146	* packet has never been sent out before (and thus is not cloned).
2147	*/
2148	static int tso_fragment(struct sock sk, struct* sk_buff skb, unsigned* int len,
2149	unsigned int mss_now, gfp_t gfp)
2150	{
2151	int nlen = skb->len - len;
2152	struct sk_buff *buff;
2153	u8 flags;
2154
2155	/ All of a TSO frame must be composed of paged data. /
2156	DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
2157
2158	buff = tcp_stream_alloc_skb(sk, gfp, force_schedule: true);
2159	if (unlikely(!buff))
2160	return -ENOMEM;
2161	skb_copy_decrypted(to: buff, from: skb);
2162	mptcp_skb_ext_copy(to: buff, from: skb);
2163
2164	sk_wmem_queued_add(sk, val: buff->truesize);
2165	sk_mem_charge(sk, size: buff->truesize);
2166	buff->truesize += nlen;
2167	skb->truesize -= nlen;
2168
2169	/ Correct the sequence numbers. /
2170	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
2171	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
2172	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
2173
2174	/ PSH and FIN should only be set in the second packet. /
2175	flags = TCP_SKB_CB(skb)->tcp_flags;
2176	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
2177	TCP_SKB_CB(buff)->tcp_flags = flags;
2178
2179	tcp_skb_fragment_eor(skb, skb2: buff);
2180
2181	skb_split(skb, skb1: buff, len);
2182	tcp_fragment_tstamp(skb, skb2: buff);
2183
2184	/ Fix up tso_factor for both original and new SKB. /
2185	tcp_set_skb_tso_segs(skb, mss_now);
2186	tcp_set_skb_tso_segs(skb: buff, mss_now);
2187
2188	/ Link BUFF into the send queue. /
2189	__skb_header_release(skb: buff);
2190	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue: TCP_FRAG_IN_WRITE_QUEUE);
2191
2192	return `0`;
2193	}
2194
2195	/ Try to defer sending, if possible, in order to minimize the amount*
2196	* of TSO splitting we do. View it as a kind of TSO Nagle test.
2197	*
2198	* This algorithm is from John Heffner.
2199	*/
2200	static bool tcp_tso_should_defer(struct sock sk, struct* sk_buff *skb,
2201	bool *is_cwnd_limited,
2202	bool *is_rwnd_limited,
2203	u32 max_segs)
2204	{
2205	const struct inet_connection_sock *icsk = inet_csk(sk);
2206	u32 send_win, cong_win, limit, in_flight;
2207	struct tcp_sock *tp = tcp_sk(sk);
2208	struct sk_buff *head;
2209	int win_divisor;
2210	s64 delta;
2211
2212	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2213	goto send_now;
2214
2215	/ Avoid bursty behavior by allowing defer*
2216	* only if the last write was recent (1 ms).
2217	* Note that tp->tcp_wstamp_ns can be in the future if we have
2218	* packets waiting in a qdisc or device for EDT delivery.
2219	*/
2220	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2221	if (delta > `0`)
2222	goto send_now;
2223
2224	in_flight = tcp_packets_in_flight(tp);
2225
2226	BUG_ON(tcp_skb_pcount(skb) <= `1`);
2227	BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
2228
2229	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2230
2231	/ From in_flight test above, we know that cwnd > in_flight. /
2232	cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
2233
2234	limit = min(send_win, cong_win);
2235
2236	/ If a full-sized TSO skb can be sent, do it. /
2237	if (limit >= max_segs * tp->mss_cache)
2238	goto send_now;
2239
2240	/ Middle in queue won't get any more data, full sendable already? /
2241	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2242	goto send_now;
2243
2244	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2245	if (win_divisor) {
2246	u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
2247
2248	/ If at least some fraction of a window is available,*
2249	* just use it.
2250	*/
2251	chunk /= win_divisor;
2252	if (limit >= chunk)
2253	goto send_now;
2254	} else {
2255	/ Different approach, try not to defer past a single*
2256	* ACK. Receiver should ACK every other full sized
2257	* frame, so if we have space for more than 3 frames
2258	* then send now.
2259	*/
2260	if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2261	goto send_now;
2262	}
2263
2264	/ TODO : use tsorted_sent_queue ? /
2265	head = tcp_rtx_queue_head(sk);
2266	if (!head)
2267	goto send_now;
2268	delta = tp->tcp_clock_cache - head->tstamp;
2269	/ If next ACK is likely to come too late (half srtt), do not defer /
2270	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> `4`)) < `0`)
2271	goto send_now;
2272
2273	/ Ok, it looks like it is advisable to defer.*
2274	* Three cases are tracked :
2275	* 1) We are cwnd-limited
2276	* 2) We are rwnd-limited
2277	* 3) We are application limited.
2278	*/
2279	if (cong_win < send_win) {
2280	if (cong_win <= skb->len) {
2281	*is_cwnd_limited = true;
2282	return true;
2283	}
2284	} else {
2285	if (send_win <= skb->len) {
2286	*is_rwnd_limited = true;
2287	return true;
2288	}
2289	}
2290
2291	/ If this packet won't get more data, do not wait. /
2292	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) \|\|
2293	TCP_SKB_CB(skb)->eor)
2294	goto send_now;
2295
2296	return true;
2297
2298	send_now:
2299	return false;
2300	}
2301
2302	static inline void tcp_mtu_check_reprobe(struct sock *sk)
2303	{
2304	struct inet_connection_sock *icsk = inet_csk(sk);
2305	struct tcp_sock *tp = tcp_sk(sk);
2306	struct net *net = sock_net(sk);
2307	u32 interval;
2308	s32 delta;
2309
2310	interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
2311	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2312	if (unlikely(delta >= interval * HZ)) {
2313	int mss = tcp_current_mss(sk);
2314
2315	/ Update current search range /
2316	icsk->icsk_mtup.probe_size = `0`;
2317	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2318	sizeof(struct tcphdr) +
2319	icsk->icsk_af_ops->net_header_len;
2320	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2321
2322	/ Update probe time stamp /
2323	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2324	}
2325	}
2326
2327	static bool tcp_can_coalesce_send_queue_head(struct sock sk, int* len)
2328	{
2329	struct sk_buff skb, next;
2330
2331	skb = tcp_send_head(sk);
2332	tcp_for_write_queue_from_safe(skb, next, sk) {
2333	if (len <= skb->len)
2334	break;
2335
2336	if (unlikely(TCP_SKB_CB(skb)->eor) \|\|
2337	tcp_has_tx_tstamp(skb) \|\|
2338	!skb_pure_zcopy_same(skb1: skb, skb2: next))
2339	return false;
2340
2341	len -= skb->len;
2342	}
2343
2344	return true;
2345	}
2346
2347	static int tcp_clone_payload(struct sock sk, struct* sk_buff *to,
2348	int probe_size)
2349	{
2350	skb_frag_t lastfrag = NULL, fragto = skb_shinfo(to)->frags;
2351	int i, todo, len = `0`, nr_frags = `0`;
2352	const struct sk_buff *skb;
2353
2354	if (!sk_wmem_schedule(sk, size: to->truesize + probe_size))
2355	return -ENOMEM;
2356
2357	skb_queue_walk(&sk->sk_write_queue, skb) {
2358	const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
2359
2360	if (skb_headlen(skb))
2361	return -EINVAL;
2362
2363	for (i = `0`; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
2364	if (len >= probe_size)
2365	goto commit;
2366	todo = min_t(int, skb_frag_size(fragfrom),
2367	probe_size - len);
2368	len += todo;
2369	if (lastfrag &&
2370	skb_frag_page(frag: fragfrom) == skb_frag_page(frag: lastfrag) &&
2371	skb_frag_off(frag: fragfrom) == skb_frag_off(frag: lastfrag) +
2372	skb_frag_size(frag: lastfrag)) {
2373	skb_frag_size_add(frag: lastfrag, delta: todo);
2374	continue;
2375	}
2376	if (unlikely(nr_frags == MAX_SKB_FRAGS))
2377	return -E2BIG;
2378	skb_frag_page_copy(fragto, fragfrom);
2379	skb_frag_off_copy(fragto, fragfrom);
2380	skb_frag_size_set(frag: fragto, size: todo);
2381	nr_frags++;
2382	lastfrag = fragto++;
2383	}
2384	}
2385	commit:
2386	WARN_ON_ONCE(len != probe_size);
2387	for (i = `0`; i < nr_frags; i++)
2388	skb_frag_ref(skb: to, f: i);
2389
2390	skb_shinfo(to)->nr_frags = nr_frags;
2391	to->truesize += probe_size;
2392	to->len += probe_size;
2393	to->data_len += probe_size;
2394	__skb_header_release(skb: to);
2395	return `0`;
2396	}
2397
2398	/ Create a new MTU probe if we are ready.*
2399	* MTU probe is regularly attempting to increase the path MTU by
2400	* deliberately sending larger packets. This discovers routing
2401	* changes resulting in larger path MTUs.
2402	*
2403	* Returns 0 if we should wait to probe (no cwnd available),
2404	* 1 if a probe was sent,
2405	* -1 otherwise
2406	*/
2407	static int tcp_mtu_probe(struct sock *sk)
2408	{
2409	struct inet_connection_sock *icsk = inet_csk(sk);
2410	struct tcp_sock *tp = tcp_sk(sk);
2411	struct sk_buff skb, nskb, *next;
2412	struct net *net = sock_net(sk);
2413	int probe_size;
2414	int size_needed;
2415	int copy, len;
2416	int mss_now;
2417	int interval;
2418
2419	/ Not currently probing/verifying,*
2420	* not in recovery,
2421	* have enough cwnd, and
2422	* not SACKing (the variable headers throw things off)
2423	*/
2424	if (likely(!icsk->icsk_mtup.enabled \|\|
2425	icsk->icsk_mtup.probe_size \|\|
2426	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
2427	tcp_snd_cwnd(tp) < `11` \|\|
2428	tp->rx_opt.num_sacks \|\| tp->rx_opt.dsack))
2429	return -`1`;
2430
2431	/ Use binary search for probe_size between tcp_mss_base,*
2432	* and current mss_clamp. if (search_high - search_low)
2433	* smaller than a threshold, backoff from probing.
2434	*/
2435	mss_now = tcp_current_mss(sk);
2436	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2437	icsk->icsk_mtup.search_low) >> `1`);
2438	size_needed = probe_size + (tp->reordering + `1`) * tp->mss_cache;
2439	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2440	/ When misfortune happens, we are reprobing actively,*
2441	* and then reprobe timer has expired. We stick with current
2442	* probing process by not resetting search range to its orignal.
2443	*/
2444	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) \|\|
2445	interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
2446	/ Check whether enough time has elaplased for*
2447	* another round of probing.
2448	*/
2449	tcp_mtu_check_reprobe(sk);
2450	return -`1`;
2451	}
2452
2453	/ Have enough data in the send queue to probe? /
2454	if (tp->write_seq - tp->snd_nxt < size_needed)
2455	return -`1`;
2456
2457	if (tp->snd_wnd < size_needed)
2458	return -`1`;
2459	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2460	return `0`;
2461
2462	/ Do we need to wait to drain cwnd? With none in flight, don't stall /
2463	if (tcp_packets_in_flight(tp) + `2` > tcp_snd_cwnd(tp)) {
2464	if (!tcp_packets_in_flight(tp))
2465	return -`1`;
2466	else
2467	return `0`;
2468	}
2469
2470	if (!tcp_can_coalesce_send_queue_head(sk, len: probe_size))
2471	return -`1`;
2472
2473	/ We're allowed to probe. Build it now. /
2474	nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, force_schedule: false);
2475	if (!nskb)
2476	return -`1`;
2477
2478	/ build the payload, and be prepared to abort if this fails. /
2479	if (tcp_clone_payload(sk, to: nskb, probe_size)) {
2480	tcp_skb_tsorted_anchor_cleanup(skb: nskb);
2481	consume_skb(skb: nskb);
2482	return -`1`;
2483	}
2484	sk_wmem_queued_add(sk, val: nskb->truesize);
2485	sk_mem_charge(sk, size: nskb->truesize);
2486
2487	skb = tcp_send_head(sk);
2488	skb_copy_decrypted(to: nskb, from: skb);
2489	mptcp_skb_ext_copy(to: nskb, from: skb);
2490
2491	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2492	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2493	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2494
2495	tcp_insert_write_queue_before(new: nskb, skb, sk);
2496	tcp_highest_sack_replace(sk, old: skb, new: nskb);
2497
2498	len = `0`;
2499	tcp_for_write_queue_from_safe(skb, next, sk) {
2500	copy = min_t(int, skb->len, probe_size - len);
2501
2502	if (skb->len <= copy) {
2503	/ We've eaten all the data from this skb.*
2504	* Throw it away. */
2505	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
2506	/ If this is the last SKB we copy and eor is set*
2507	* we need to propagate it to the new skb.
2508	*/
2509	TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2510	tcp_skb_collapse_tstamp(skb: nskb, next_skb: skb);
2511	tcp_unlink_write_queue(skb, sk);
2512	tcp_wmem_free_skb(sk, skb);
2513	} else {
2514	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags &
2515	~(TCPHDR_FIN\|TCPHDR_PSH);
2516	__pskb_trim_head(skb, len: copy);
2517	tcp_set_skb_tso_segs(skb, mss_now);
2518	TCP_SKB_CB(skb)->seq += copy;
2519	}
2520
2521	len += copy;
2522
2523	if (len >= probe_size)
2524	break;
2525	}
2526	tcp_init_tso_segs(skb: nskb, mss_now: nskb->len);
2527
2528	/ We're ready to send. If this fails, the probe will*
2529	* be resegmented into mss-sized pieces by tcp_write_xmit().
2530	*/
2531	if (!tcp_transmit_skb(sk, skb: nskb, clone_it: `1`, GFP_ATOMIC)) {
2532	/ Decrement cwnd here because we are sending*
2533	* effectively two packets. */
2534	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) - `1`);
2535	tcp_event_new_data_sent(sk, skb: nskb);
2536
2537	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2538	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2539	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2540
2541	return `1`;
2542	}
2543
2544	return -`1`;
2545	}
2546
2547	static bool tcp_pacing_check(struct sock *sk)
2548	{
2549	struct tcp_sock *tp = tcp_sk(sk);
2550
2551	if (!tcp_needs_internal_pacing(sk))
2552	return false;
2553
2554	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2555	return false;
2556
2557	if (!hrtimer_is_queued(timer: &tp->pacing_timer)) {
2558	hrtimer_start(timer: &tp->pacing_timer,
2559	tim: ns_to_ktime(ns: tp->tcp_wstamp_ns),
2560	mode: HRTIMER_MODE_ABS_PINNED_SOFT);
2561	sock_hold(sk);
2562	}
2563	return true;
2564	}
2565
2566	static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
2567	{
2568	const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
2569
2570	/ No skb in the rtx queue. /
2571	if (!node)
2572	return true;
2573
2574	/ Only one skb in rtx queue. /
2575	return !node->rb_left && !node->rb_right;
2576	}
2577
2578	/ TCP Small Queues :*
2579	* Control number of packets in qdisc/devices to two packets / or ~1 ms.
2580	* (These limits are doubled for retransmits)
2581	* This allows for :
2582	* - better RTT estimation and ACK scheduling
2583	* - faster recovery
2584	* - high rates
2585	* Alas, some drivers / subsystems require a fair amount
2586	* of queued bytes to ensure line rate.
2587	* One example is wifi aggregation (802.11 AMPDU)
2588	*/
2589	static bool tcp_small_queue_check(struct sock sk, const* struct sk_buff *skb,
2590	unsigned int factor)
2591	{
2592	unsigned long limit;
2593
2594	limit = max_t(unsigned long,
2595	`2` * skb->truesize,
2596	READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
2597	if (sk->sk_pacing_status == SK_PACING_NONE)
2598	limit = min_t(unsigned long, limit,
2599	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
2600	limit <<= factor;
2601
2602	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2603	tcp_sk(sk)->tcp_tx_delay) {
2604	u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
2605	tcp_sk(sk)->tcp_tx_delay;
2606
2607	/ TSQ is based on skb truesize sum (sk_wmem_alloc), so we*
2608	* approximate our needs assuming an ~100% skb->truesize overhead.
2609	* USEC_PER_SEC is approximated by 2^20.
2610	* do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2611	*/
2612	extra_bytes >>= (`20` - `1`);
2613	limit += extra_bytes;
2614	}
2615	if (refcount_read(r: &sk->sk_wmem_alloc) > limit) {
2616	/ Always send skb if rtx queue is empty or has one skb.*
2617	* No need to wait for TX completion to call us back,
2618	* after softirq/tasklet schedule.
2619	* This helps when TX completions are delayed too much.
2620	*/
2621	if (tcp_rtx_queue_empty_or_single_skb(sk))
2622	return false;
2623
2624	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
2625	/ It is possible TX completion already happened*
2626	* before we set TSQ_THROTTLED, so we must
2627	* test again the condition.
2628	*/
2629	smp_mb__after_atomic();
2630	if (refcount_read(r: &sk->sk_wmem_alloc) > limit)
2631	return true;
2632	}
2633	return false;
2634	}
2635
2636	static void tcp_chrono_set(struct tcp_sock tp, const* enum tcp_chrono new)
2637	{
2638	const u32 now = tcp_jiffies32;
2639	enum tcp_chrono old = tp->chrono_type;
2640
2641	if (old > TCP_CHRONO_UNSPEC)
2642	tp->chrono_stat[old - `1`] += now - tp->chrono_start;
2643	tp->chrono_start = now;
2644	tp->chrono_type = new;
2645	}
2646
2647	void tcp_chrono_start(struct sock sk, const* enum tcp_chrono type)
2648	{
2649	struct tcp_sock *tp = tcp_sk(sk);
2650
2651	/ If there are multiple conditions worthy of tracking in a*
2652	* chronograph then the highest priority enum takes precedence
2653	* over the other conditions. So that if something "more interesting"
2654	* starts happening, stop the previous chrono and start a new one.
2655	*/
2656	if (type > tp->chrono_type)
2657	tcp_chrono_set(tp, new: type);
2658	}
2659
2660	void tcp_chrono_stop(struct sock sk, const* enum tcp_chrono type)
2661	{
2662	struct tcp_sock *tp = tcp_sk(sk);
2663
2664
2665	/ There are multiple conditions worthy of tracking in a*
2666	* chronograph, so that the highest priority enum takes
2667	* precedence over the other conditions (see tcp_chrono_start).
2668	* If a condition stops, we only stop chrono tracking if
2669	* it's the "most interesting" or current chrono we are
2670	* tracking and starts busy chrono if we have pending data.
2671	*/
2672	if (tcp_rtx_and_write_queues_empty(sk))
2673	tcp_chrono_set(tp, new: TCP_CHRONO_UNSPEC);
2674	else if (type == tp->chrono_type)
2675	tcp_chrono_set(tp, new: TCP_CHRONO_BUSY);
2676	}
2677
2678	/ This routine writes packets to the network. It advances the*
2679	* send_head. This happens as incoming acks open up the remote
2680	* window for us.
2681	*
2682	* LARGESEND note: !tcp_urg_mode is overkill, only frames between
2683	* snd_up-64k-mss .. snd_up cannot be large. However, taking into
2684	* account rare use of URG, this is not a big flaw.
2685	*
2686	* Send at most one packet when push_one > 0. Temporarily ignore
2687	* cwnd limit to force at most one packet out when push_one == 2.
2688
2689	* Returns true, if no segments are in flight and we have queued segments,
2690	* but cannot send anything now because of SWS or another problem.
2691	*/
2692	static bool tcp_write_xmit(struct sock sk, unsigned* int mss_now, int nonagle,
2693	int push_one, gfp_t gfp)
2694	{
2695	struct tcp_sock *tp = tcp_sk(sk);
2696	struct sk_buff *skb;
2697	unsigned int tso_segs, sent_pkts;
2698	int cwnd_quota;
2699	int result;
2700	bool is_cwnd_limited = false, is_rwnd_limited = false;
2701	u32 max_segs;
2702
2703	sent_pkts = `0`;
2704
2705	tcp_mstamp_refresh(tp);
2706	if (!push_one) {
2707	/ Do MTU probing. /
2708	result = tcp_mtu_probe(sk);
2709	if (!result) {
2710	return false;
2711	} else if (result > `0`) {
2712	sent_pkts = `1`;
2713	}
2714	}
2715
2716	max_segs = tcp_tso_segs(sk, mss_now);
2717	while ((skb = tcp_send_head(sk))) {
2718	unsigned int limit;
2719
2720	if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2721	/ "skb_mstamp_ns" is used as a start point for the retransmit timer /
2722	tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2723	skb_set_delivery_time(skb, kt: tp->tcp_wstamp_ns, mono: true);
2724	list_move_tail(list: &skb->tcp_tsorted_anchor, head: &tp->tsorted_sent_queue);
2725	tcp_init_tso_segs(skb, mss_now);
2726	goto repair; / Skip network transmission /
2727	}
2728
2729	if (tcp_pacing_check(sk))
2730	break;
2731
2732	tso_segs = tcp_init_tso_segs(skb, mss_now);
2733	BUG_ON(!tso_segs);
2734
2735	cwnd_quota = tcp_cwnd_test(tp, skb);
2736	if (!cwnd_quota) {
2737	if (push_one == `2`)
2738	/ Force out a loss probe pkt. /
2739	cwnd_quota = `1`;
2740	else
2741	break;
2742	}
2743
2744	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2745	is_rwnd_limited = true;
2746	break;
2747	}
2748
2749	if (tso_segs == `1`) {
2750	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2751	(tcp_skb_is_last(sk, skb) ?
2752	nonagle : TCP_NAGLE_PUSH))))
2753	break;
2754	} else {
2755	if (!push_one &&
2756	tcp_tso_should_defer(sk, skb, is_cwnd_limited: &is_cwnd_limited,
2757	is_rwnd_limited: &is_rwnd_limited, max_segs))
2758	break;
2759	}
2760
2761	limit = mss_now;
2762	if (tso_segs > `1` && !tcp_urg_mode(tp))
2763	limit = tcp_mss_split_point(sk, skb, mss_now,
2764	min_t(unsigned int,
2765	cwnd_quota,
2766	max_segs),
2767	nonagle);
2768
2769	if (skb->len > limit &&
2770	unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2771	break;
2772
2773	if (tcp_small_queue_check(sk, skb, factor: `0`))
2774	break;
2775
2776	/ Argh, we hit an empty skb(), presumably a thread*
2777	* is sleeping in sendmsg()/sk_stream_wait_memory().
2778	* We do not want to send a pure-ack packet and have
2779	* a strange looking rtx queue with empty packet(s).
2780	*/
2781	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2782	break;
2783
2784	if (unlikely(tcp_transmit_skb(sk, skb, `1`, gfp)))
2785	break;
2786
2787	repair:
2788	/ Advance the send_head. This one is sent out.*
2789	* This call will increment packets_out.
2790	*/
2791	tcp_event_new_data_sent(sk, skb);
2792
2793	tcp_minshall_update(tp, mss_now, skb);
2794	sent_pkts += tcp_skb_pcount(skb);
2795
2796	if (push_one)
2797	break;
2798	}
2799
2800	if (is_rwnd_limited)
2801	tcp_chrono_start(sk, type: TCP_CHRONO_RWND_LIMITED);
2802	else
2803	tcp_chrono_stop(sk, type: TCP_CHRONO_RWND_LIMITED);
2804
2805	is_cwnd_limited \|= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
2806	if (likely(sent_pkts \|\| is_cwnd_limited))
2807	tcp_cwnd_validate(sk, is_cwnd_limited);
2808
2809	if (likely(sent_pkts)) {
2810	if (tcp_in_cwnd_reduction(sk))
2811	tp->prr_out += sent_pkts;
2812
2813	/ Send one loss probe per tail loss episode. /
2814	if (push_one != `2`)
2815	tcp_schedule_loss_probe(sk, advancing_rto: false);
2816	return false;
2817	}
2818	return !tp->packets_out && !tcp_write_queue_empty(sk);
2819	}
2820
2821	bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2822	{
2823	struct inet_connection_sock *icsk = inet_csk(sk);
2824	struct tcp_sock *tp = tcp_sk(sk);
2825	u32 timeout, timeout_us, rto_delta_us;
2826	int early_retrans;
2827
2828	/ Don't do any loss probe on a Fast Open connection before 3WHS*
2829	* finishes.
2830	*/
2831	if (rcu_access_pointer(tp->fastopen_rsk))
2832	return false;
2833
2834	early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
2835	/ Schedule a loss probe in 2RTT for SACK capable connections
2836	* not in loss recovery, that are either limited by cwnd or application.
2837	*/
2838	if ((early_retrans != `3` && early_retrans != `4`) \|\|
2839	!tp->packets_out \|\| !tcp_is_sack(tp) \|\|
2840	(icsk->icsk_ca_state != TCP_CA_Open &&
2841	icsk->icsk_ca_state != TCP_CA_CWR))
2842	return false;
2843
2844	/ Probe timeout is 2rtt. Add minimum RTO to account
2845	* for delayed ack when there's one outstanding packet. If no RTT
2846	* sample is available then probe after TCP_TIMEOUT_INIT.
2847	*/
2848	if (tp->srtt_us) {
2849	timeout_us = tp->srtt_us >> `2`;
2850	if (tp->packets_out == `1`)
2851	timeout_us += tcp_rto_min_us(sk);
2852	else
2853	timeout_us += TCP_TIMEOUT_MIN_US;
2854	timeout = usecs_to_jiffies(u: timeout_us);
2855	} else {
2856	timeout = TCP_TIMEOUT_INIT;
2857	}
2858
2859	/ If the RTO formula yields an earlier time, then use that time. /
2860	rto_delta_us = advancing_rto ?
2861	jiffies_to_usecs(j: inet_csk(sk)->icsk_rto) :
2862	tcp_rto_delta_us(sk); / How far in future is RTO? /
2863	if (rto_delta_us > `0`)
2864	timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2865
2866	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, when: timeout, TCP_RTO_MAX);
2867	return true;
2868	}
2869
2870	/ Thanks to skb fast clones, we can detect if a prior transmit of*
2871	* a packet is still in a qdisc or driver queue.
2872	* In this case, there is very little point doing a retransmit !
2873	*/
2874	static bool skb_still_in_host_queue(struct sock *sk,
2875	const struct sk_buff *skb)
2876	{
2877	if (unlikely(skb_fclone_busy(sk, skb))) {
2878	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
2879	smp_mb__after_atomic();
2880	if (skb_fclone_busy(sk, skb)) {
2881	NET_INC_STATS(sock_net(sk),
2882	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2883	return true;
2884	}
2885	}
2886	return false;
2887	}
2888
2889	/ When probe timeout (PTO) fires, try send a new segment if possible, else*
2890	* retransmit the last segment.
2891	*/
2892	void tcp_send_loss_probe(struct sock *sk)
2893	{
2894	struct tcp_sock *tp = tcp_sk(sk);
2895	struct sk_buff *skb;
2896	int pcount;
2897	int mss = tcp_current_mss(sk);
2898
2899	/ At most one outstanding TLP /
2900	if (tp->tlp_high_seq)
2901	goto rearm_timer;
2902
2903	tp->tlp_retrans = `0`;
2904	skb = tcp_send_head(sk);
2905	if (skb && tcp_snd_wnd_test(tp, skb, cur_mss: mss)) {
2906	pcount = tp->packets_out;
2907	tcp_write_xmit(sk, mss_now: mss, TCP_NAGLE_OFF, push_one: `2`, GFP_ATOMIC);
2908	if (tp->packets_out > pcount)
2909	goto probe_sent;
2910	goto rearm_timer;
2911	}
2912	skb = skb_rb_last(&sk->tcp_rtx_queue);
2913	if (unlikely(!skb)) {
2914	WARN_ONCE(tp->packets_out,
2915	"invalid inflight: %u state %u cwnd %u mss %d\n",
2916	tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
2917	inet_csk(sk)->icsk_pending = `0`;
2918	return;
2919	}
2920
2921	if (skb_still_in_host_queue(sk, skb))
2922	goto rearm_timer;
2923
2924	pcount = tcp_skb_pcount(skb);
2925	if (WARN_ON(!pcount))
2926	goto rearm_timer;
2927
2928	if ((pcount > `1`) && (skb->len > (pcount - `1`) * mss)) {
2929	if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2930	(pcount - `1`) * mss, mss,
2931	GFP_ATOMIC)))
2932	goto rearm_timer;
2933	skb = skb_rb_next(skb);
2934	}
2935
2936	if (WARN_ON(!skb \|\| !tcp_skb_pcount(skb)))
2937	goto rearm_timer;
2938
2939	if (__tcp_retransmit_skb(sk, skb, segs: `1`))
2940	goto rearm_timer;
2941
2942	tp->tlp_retrans = `1`;
2943
2944	probe_sent:
2945	/ Record snd_nxt for loss detection. /
2946	tp->tlp_high_seq = tp->snd_nxt;
2947
2948	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2949	/ Reset s.t. tcp_rearm_rto will restart timer from now /
2950	inet_csk(sk)->icsk_pending = `0`;
2951	rearm_timer:
2952	tcp_rearm_rto(sk);
2953	}
2954
2955	/ Push out any pending frames which were held back due to*
2956	* TCP_CORK or attempt at coalescing tiny packets.
2957	* The socket must be locked by the caller.
2958	*/
2959	void __tcp_push_pending_frames(struct sock sk, unsigned* int cur_mss,
2960	int nonagle)
2961	{
2962	/ If we are closed, the bytes will have to remain here.*
2963	* In time closedown will finish, we empty the write queue and
2964	* all will be happy.
2965	*/
2966	if (unlikely(sk->sk_state == TCP_CLOSE))
2967	return;
2968
2969	if (tcp_write_xmit(sk, mss_now: cur_mss, nonagle, push_one: `0`,
2970	gfp: sk_gfp_mask(sk, GFP_ATOMIC)))
2971	tcp_check_probe_timer(sk);
2972	}
2973
2974	/ Send _single_ skb sitting at the send head. This function requires*
2975	* true push pending frames to setup probe timer etc.
2976	*/
2977	void tcp_push_one(struct sock sk, unsigned* int mss_now)
2978	{
2979	struct sk_buff *skb = tcp_send_head(sk);
2980
2981	BUG_ON(!skb \|\| skb->len < mss_now);
2982
2983	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, push_one: `1`, gfp: sk->sk_allocation);
2984	}
2985
2986	/ This function returns the amount that we can raise the*
2987	* usable window based on the following constraints
2988	*
2989	* 1. The window can never be shrunk once it is offered (RFC 793)
2990	* 2. We limit memory per socket
2991	*
2992	* RFC 1122:
2993	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
2994	* RECV.NEXT + RCV.WIN fixed until:
2995	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
2996	*
2997	* i.e. don't raise the right edge of the window until you can raise
2998	* it at least MSS bytes.
2999	*
3000	* Unfortunately, the recommended algorithm breaks header prediction,
3001	* since header prediction assumes th->window stays fixed.
3002	*
3003	* Strictly speaking, keeping th->window fixed violates the receiver
3004	* side SWS prevention criteria. The problem is that under this rule
3005	* a stream of single byte packets will cause the right side of the
3006	* window to always advance by a single byte.
3007	*
3008	* Of course, if the sender implements sender side SWS prevention
3009	* then this will not be a problem.
3010	*
3011	* BSD seems to make the following compromise:
3012	*
3013	* If the free space is less than the 1/4 of the maximum
3014	* space available and the free space is less than 1/2 mss,
3015	* then set the window to 0.
3016	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
3017	* Otherwise, just prevent the window from shrinking
3018	* and from being larger than the largest representable value.
3019	*
3020	* This prevents incremental opening of the window in the regime
3021	* where TCP is limited by the speed of the reader side taking
3022	* data out of the TCP receive queue. It does nothing about
3023	* those cases where the window is constrained on the sender side
3024	* because the pipeline is full.
3025	*
3026	* BSD also seems to "accidentally" limit itself to windows that are a
3027	* multiple of MSS, at least until the free space gets quite small.
3028	* This would appear to be a side effect of the mbuf implementation.
3029	* Combining these two algorithms results in the observed behavior
3030	* of having a fixed window size at almost all times.
3031	*
3032	* Below we obtain similar behavior by forcing the offered window to
3033	* a multiple of the mss when it is feasible to do so.
3034	*
3035	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
3036	* Regular options like TIMESTAMP are taken into account.
3037	*/
3038	u32 __tcp_select_window(struct sock *sk)
3039	{
3040	struct inet_connection_sock *icsk = inet_csk(sk);
3041	struct tcp_sock *tp = tcp_sk(sk);
3042	struct net *net = sock_net(sk);
3043	/ MSS for the peer's data. Previous versions used mss_clamp*
3044	* here. I don't know if the value based on our guesses
3045	* of peer's MSS is better for the performance. It's more correct
3046	* but may be worse for the performance because of rcv_mss
3047	* fluctuations. --SAW 1998/11/1
3048	*/
3049	int mss = icsk->icsk_ack.rcv_mss;
3050	int free_space = tcp_space(sk);
3051	int allowed_space = tcp_full_space(sk);
3052	int full_space, window;
3053
3054	if (sk_is_mptcp(sk))
3055	mptcp_space(ssk: sk, space: &free_space, full_space: &allowed_space);
3056
3057	full_space = min_t(int, tp->window_clamp, allowed_space);
3058
3059	if (unlikely(mss > full_space)) {
3060	mss = full_space;
3061	if (mss <= `0`)
3062	return `0`;
3063	}
3064
3065	/ Only allow window shrink if the sysctl is enabled and we have*
3066	* a non-zero scaling factor in effect.
3067	*/
3068	if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
3069	goto shrink_window_allowed;
3070
3071	/ do not allow window to shrink /
3072
3073	if (free_space < (full_space >> `1`)) {
3074	icsk->icsk_ack.quick = `0`;
3075
3076	if (tcp_under_memory_pressure(sk))
3077	tcp_adjust_rcv_ssthresh(sk);
3078
3079	/ free_space might become our new window, make sure we don't*
3080	* increase it due to wscale.
3081	*/
3082	free_space = round_down(free_space, `1` << tp->rx_opt.rcv_wscale);
3083
3084	/ if free space is less than mss estimate, or is below 1/16th*
3085	* of the maximum allowed, try to move to zero-window, else
3086	* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
3087	* new incoming data is dropped due to memory limits.
3088	* With large window, mss test triggers way too late in order
3089	* to announce zero window in time before rmem limit kicks in.
3090	*/
3091	if (free_space < (allowed_space >> `4`) \|\| free_space < mss)
3092	return `0`;
3093	}
3094
3095	if (free_space > tp->rcv_ssthresh)
3096	free_space = tp->rcv_ssthresh;
3097
3098	/ Don't do rounding if we are using window scaling, since the*
3099	* scaled window will not line up with the MSS boundary anyway.
3100	*/
3101	if (tp->rx_opt.rcv_wscale) {
3102	window = free_space;
3103
3104	/ Advertise enough space so that it won't get scaled away.*
3105	* Import case: prevent zero window announcement if
3106	* 1<<rcv_wscale > mss.
3107	*/
3108	window = ALIGN(window, (`1` << tp->rx_opt.rcv_wscale));
3109	} else {
3110	window = tp->rcv_wnd;
3111	/ Get the largest window that is a nice multiple of mss.*
3112	* Window clamp already applied above.
3113	* If our current window offering is within 1 mss of the
3114	* free space we just keep it. This prevents the divide
3115	* and multiply from happening most of the time.
3116	* We also don't do any window rounding when the free space
3117	* is too small.
3118	*/
3119	if (window <= free_space - mss \|\| window > free_space)
3120	window = rounddown(free_space, mss);
3121	else if (mss == full_space &&
3122	free_space > window + (full_space >> `1`))
3123	window = free_space;
3124	}
3125
3126	return window;
3127
3128	shrink_window_allowed:
3129	/ new window should always be an exact multiple of scaling factor /
3130	free_space = round_down(free_space, `1` << tp->rx_opt.rcv_wscale);
3131
3132	if (free_space < (full_space >> `1`)) {
3133	icsk->icsk_ack.quick = `0`;
3134
3135	if (tcp_under_memory_pressure(sk))
3136	tcp_adjust_rcv_ssthresh(sk);
3137
3138	/ if free space is too low, return a zero window /
3139	if (free_space < (allowed_space >> `4`) \|\| free_space < mss \|\|
3140	free_space < (`1` << tp->rx_opt.rcv_wscale))
3141	return `0`;
3142	}
3143
3144	if (free_space > tp->rcv_ssthresh) {
3145	free_space = tp->rcv_ssthresh;
3146	/ new window should always be an exact multiple of scaling factor*
3147	*
3148	* For this case, we ALIGN "up" (increase free_space) because
3149	* we know free_space is not zero here, it has been reduced from
3150	* the memory-based limit, and rcv_ssthresh is not a hard limit
3151	* (unlike sk_rcvbuf).
3152	*/
3153	free_space = ALIGN(free_space, (`1` << tp->rx_opt.rcv_wscale));
3154	}
3155
3156	return free_space;
3157	}
3158
3159	void tcp_skb_collapse_tstamp(struct sk_buff *skb,
3160	const struct sk_buff *next_skb)
3161	{
3162	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
3163	const struct skb_shared_info *next_shinfo =
3164	skb_shinfo(next_skb);
3165	struct skb_shared_info *shinfo = skb_shinfo(skb);
3166
3167	shinfo->tx_flags \|= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
3168	shinfo->tskey = next_shinfo->tskey;
3169	TCP_SKB_CB(skb)->txstamp_ack \|=
3170	TCP_SKB_CB(next_skb)->txstamp_ack;
3171	}
3172	}
3173
3174	/ Collapses two adjacent SKB's during retransmission. /
3175	static bool tcp_collapse_retrans(struct sock sk, struct* sk_buff *skb)
3176	{
3177	struct tcp_sock *tp = tcp_sk(sk);
3178	struct sk_buff *next_skb = skb_rb_next(skb);
3179	int next_skb_size;
3180
3181	next_skb_size = next_skb->len;
3182
3183	BUG_ON(tcp_skb_pcount(skb) != `1` \|\| tcp_skb_pcount(next_skb) != `1`);
3184
3185	if (next_skb_size && !tcp_skb_shift(to: skb, from: next_skb, pcount: `1`, shiftlen: next_skb_size))
3186	return false;
3187
3188	tcp_highest_sack_replace(sk, old: next_skb, new: skb);
3189
3190	/ Update sequence range on original skb. /
3191	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
3192
3193	/ Merge over control information. This moves PSH/FIN etc. over /
3194	TCP_SKB_CB(skb)->tcp_flags \|= TCP_SKB_CB(next_skb)->tcp_flags;
3195
3196	/ All done, get rid of second SKB and account for it so*
3197	* packet counting does not break.
3198	*/
3199	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
3200	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
3201
3202	/ changed transmit queue under us so clear hints /
3203	tcp_clear_retrans_hints_partial(tp);
3204	if (next_skb == tp->retransmit_skb_hint)
3205	tp->retransmit_skb_hint = skb;
3206
3207	tcp_adjust_pcount(sk, skb: next_skb, decr: tcp_skb_pcount(skb: next_skb));
3208
3209	tcp_skb_collapse_tstamp(skb, next_skb);
3210
3211	tcp_rtx_queue_unlink_and_free(skb: next_skb, sk);
3212	return true;
3213	}
3214
3215	/ Check if coalescing SKBs is legal. /
3216	static bool tcp_can_collapse(const struct sock sk, const* struct sk_buff *skb)
3217	{
3218	if (tcp_skb_pcount(skb) > `1`)
3219	return false;
3220	if (skb_cloned(skb))
3221	return false;
3222	/ Some heuristics for collapsing over SACK'd could be invented /
3223	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3224	return false;
3225
3226	return true;
3227	}
3228
3229	/ Collapse packets in the retransmit queue to make to create*
3230	* less packets on the wire. This is only done on retransmission.
3231	*/
3232	static void tcp_retrans_try_collapse(struct sock sk, struct* sk_buff *to,
3233	int space)
3234	{
3235	struct tcp_sock *tp = tcp_sk(sk);
3236	struct sk_buff skb = to, tmp;
3237	bool first = true;
3238
3239	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
3240	return;
3241	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3242	return;
3243
3244	skb_rbtree_walk_from_safe(skb, tmp) {
3245	if (!tcp_can_collapse(sk, skb))
3246	break;
3247
3248	if (!tcp_skb_can_collapse(to, from: skb))
3249	break;
3250
3251	space -= skb->len;
3252
3253	if (first) {
3254	first = false;
3255	continue;
3256	}
3257
3258	if (space < `0`)
3259	break;
3260
3261	if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
3262	break;
3263
3264	if (!tcp_collapse_retrans(sk, skb: to))
3265	break;
3266	}
3267	}
3268
3269	/ This retransmits one SKB. Policy decisions and retransmit queue*
3270	* state updates are done by the caller. Returns non-zero if an
3271	* error occurred which prevented the send.
3272	*/
3273	int __tcp_retransmit_skb(struct sock sk, struct* sk_buff skb, int* segs)
3274	{
3275	struct inet_connection_sock *icsk = inet_csk(sk);
3276	struct tcp_sock *tp = tcp_sk(sk);
3277	unsigned int cur_mss;
3278	int diff, len, err;
3279	int avail_wnd;
3280
3281	/ Inconclusive MTU probe /
3282	if (icsk->icsk_mtup.probe_size)
3283	icsk->icsk_mtup.probe_size = `0`;
3284
3285	if (skb_still_in_host_queue(sk, skb))
3286	return -EBUSY;
3287
3288	if (before(TCP_SKB_CB(skb)->seq, seq2: tp->snd_una)) {
3289	if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
3290	WARN_ON_ONCE(`1`);
3291	return -EINVAL;
3292	}
3293	if (tcp_trim_head(sk, skb, len: tp->snd_una - TCP_SKB_CB(skb)->seq))
3294	return -ENOMEM;
3295	}
3296
3297	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3298	return -EHOSTUNREACH; / Routing failure or similar. /
3299
3300	cur_mss = tcp_current_mss(sk);
3301	avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3302
3303	/ If receiver has shrunk his window, and skb is out of*
3304	* new window, do not retransmit it. The exception is the
3305	* case, when window is shrunk to zero. In this case
3306	* our retransmit of one segment serves as a zero window probe.
3307	*/
3308	if (avail_wnd <= `0`) {
3309	if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3310	return -EAGAIN;
3311	avail_wnd = cur_mss;
3312	}
3313
3314	len = cur_mss * segs;
3315	if (len > avail_wnd) {
3316	len = rounddown(avail_wnd, cur_mss);
3317	if (!len)
3318	len = avail_wnd;
3319	}
3320	if (skb->len > len) {
3321	if (tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_RTX_QUEUE, skb, len,
3322	mss_now: cur_mss, GFP_ATOMIC))
3323	return -ENOMEM; / We'll try again later. /
3324	} else {
3325	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
3326	return -ENOMEM;
3327
3328	diff = tcp_skb_pcount(skb);
3329	tcp_set_skb_tso_segs(skb, mss_now: cur_mss);
3330	diff -= tcp_skb_pcount(skb);
3331	if (diff)
3332	tcp_adjust_pcount(sk, skb, decr: diff);
3333	avail_wnd = min_t(int, avail_wnd, cur_mss);
3334	if (skb->len < avail_wnd)
3335	tcp_retrans_try_collapse(sk, to: skb, space: avail_wnd);
3336	}
3337
3338	/ RFC3168, section 6.1.1.1. ECN fallback /
3339	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3340	tcp_ecn_clear_syn(sk, skb);
3341
3342	/ Update global and local TCP statistics. /
3343	segs = tcp_skb_pcount(skb);
3344	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3345	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3346	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3347	tp->total_retrans += segs;
3348	tp->bytes_retrans += skb->len;
3349
3350	/ make sure skb->data is aligned on arches that require it*
3351	* and check if ack-trimming & collapsing extended the headroom
3352	* beyond what csum_start can cover.
3353	*/
3354	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & `3`)) \|\|
3355	skb_headroom(skb) >= `0xFFFF`)) {
3356	struct sk_buff *nskb;
3357
3358	tcp_skb_tsorted_save(skb) {
3359	nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3360	if (nskb) {
3361	nskb->dev = NULL;
3362	err = tcp_transmit_skb(sk, skb: nskb, clone_it: `0`, GFP_ATOMIC);
3363	} else {
3364	err = -ENOBUFS;
3365	}
3366	} tcp_skb_tsorted_restore(skb);
3367
3368	if (!err) {
3369	tcp_update_skb_after_send(sk, skb, prior_wstamp: tp->tcp_wstamp_ns);
3370	tcp_rate_skb_sent(sk, skb);
3371	}
3372	} else {
3373	err = tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
3374	}
3375
3376	/ To avoid taking spuriously low RTT samples based on a timestamp*
3377	* for a transmit that never happened, always mark EVER_RETRANS
3378	*/
3379	TCP_SKB_CB(skb)->sacked \|= TCPCB_EVER_RETRANS;
3380
3381	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3382	tcp_call_bpf_3arg(sk, op: BPF_SOCK_OPS_RETRANS_CB,
3383	TCP_SKB_CB(skb)->seq, arg2: segs, arg3: err);
3384
3385	if (likely(!err)) {
3386	trace_tcp_retransmit_skb(sk, skb);
3387	} else if (err != -EBUSY) {
3388	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
3389	}
3390	return err;
3391	}
3392
3393	int tcp_retransmit_skb(struct sock sk, struct* sk_buff skb, int* segs)
3394	{
3395	struct tcp_sock *tp = tcp_sk(sk);
3396	int err = __tcp_retransmit_skb(sk, skb, segs);
3397
3398	if (err == `0`) {
3399	#if FASTRETRANS_DEBUG > 0
3400	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3401	net_dbg_ratelimited("retrans_out leaked\n");
3402	}
3403	#endif
3404	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
3405	tp->retrans_out += tcp_skb_pcount(skb);
3406	}
3407
3408	/ Save stamp of the first (attempted) retransmit. /
3409	if (!tp->retrans_stamp)
3410	tp->retrans_stamp = tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb);
3411
3412	if (tp->undo_retrans < `0`)
3413	tp->undo_retrans = `0`;
3414	tp->undo_retrans += tcp_skb_pcount(skb);
3415	return err;
3416	}
3417
3418	/ This gets called after a retransmit timeout, and the initially*
3419	* retransmitted data is acknowledged. It tries to continue
3420	* resending the rest of the retransmit queue, until either
3421	* we've sent it all or the congestion window limit is reached.
3422	*/
3423	void tcp_xmit_retransmit_queue(struct sock *sk)
3424	{
3425	const struct inet_connection_sock *icsk = inet_csk(sk);
3426	struct sk_buff skb, rtx_head, *hole = NULL;
3427	struct tcp_sock *tp = tcp_sk(sk);
3428	bool rearm_timer = false;
3429	u32 max_segs;
3430	int mib_idx;
3431
3432	if (!tp->packets_out)
3433	return;
3434
3435	rtx_head = tcp_rtx_queue_head(sk);
3436	skb = tp->retransmit_skb_hint ?: rtx_head;
3437	max_segs = tcp_tso_segs(sk, mss_now: tcp_current_mss(sk));
3438	skb_rbtree_walk_from(skb) {
3439	__u8 sacked;
3440	int segs;
3441
3442	if (tcp_pacing_check(sk))
3443	break;
3444
3445	/ we could do better than to assign each time /
3446	if (!hole)
3447	tp->retransmit_skb_hint = skb;
3448
3449	segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
3450	if (segs <= `0`)
3451	break;
3452	sacked = TCP_SKB_CB(skb)->sacked;
3453	/ In case tcp_shift_skb_data() have aggregated large skbs,*
3454	* we need to make sure not sending too bigs TSO packets
3455	*/
3456	segs = min_t(int, segs, max_segs);
3457
3458	if (tp->retrans_out >= tp->lost_out) {
3459	break;
3460	} else if (!(sacked & TCPCB_LOST)) {
3461	if (!hole && !(sacked & (TCPCB_SACKED_RETRANS\|TCPCB_SACKED_ACKED)))
3462	hole = skb;
3463	continue;
3464
3465	} else {
3466	if (icsk->icsk_ca_state != TCP_CA_Loss)
3467	mib_idx = LINUX_MIB_TCPFASTRETRANS;
3468	else
3469	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3470	}
3471
3472	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
3473	continue;
3474
3475	if (tcp_small_queue_check(sk, skb, factor: `1`))
3476	break;
3477
3478	if (tcp_retransmit_skb(sk, skb, segs))
3479	break;
3480
3481	NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3482
3483	if (tcp_in_cwnd_reduction(sk))
3484	tp->prr_out += tcp_skb_pcount(skb);
3485
3486	if (skb == rtx_head &&
3487	icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3488	rearm_timer = true;
3489
3490	}
3491	if (rearm_timer)
3492	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3493	when: inet_csk(sk)->icsk_rto,
3494	TCP_RTO_MAX);
3495	}
3496
3497	/ We allow to exceed memory limits for FIN packets to expedite*
3498	* connection tear down and (memory) recovery.
3499	* Otherwise tcp_send_fin() could be tempted to either delay FIN
3500	* or even be forced to close flow without any FIN.
3501	* In general, we want to allow one skb per socket to avoid hangs
3502	* with edge trigger epoll()
3503	*/
3504	void sk_forced_mem_schedule(struct sock sk, int* size)
3505	{
3506	int delta, amt;
3507
3508	delta = size - sk->sk_forward_alloc;
3509	if (delta <= `0`)
3510	return;
3511	amt = sk_mem_pages(amt: delta);
3512	sk_forward_alloc_add(sk, val: amt << PAGE_SHIFT);
3513	sk_memory_allocated_add(sk, amt);
3514
3515	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3516	mem_cgroup_charge_skmem(memcg: sk->sk_memcg, nr_pages: amt,
3517	gfp_mask: gfp_memcg_charge() \| __GFP_NOFAIL);
3518	}
3519
3520	/ Send a FIN. The caller locks the socket for us.*
3521	* We should try to send a FIN packet really hard, but eventually give up.
3522	*/
3523	void tcp_send_fin(struct sock *sk)
3524	{
3525	struct sk_buff skb, tskb, *tail = tcp_write_queue_tail(sk);
3526	struct tcp_sock *tp = tcp_sk(sk);
3527
3528	/ Optimization, tack on the FIN if we have one skb in write queue and*
3529	* this skb was not yet sent, or we are under memory pressure.
3530	* Note: in the latter case, FIN packet will be sent after a timeout,
3531	* as TCP stack thinks it has already been transmitted.
3532	*/
3533	tskb = tail;
3534	if (!tskb && tcp_under_memory_pressure(sk))
3535	tskb = skb_rb_last(&sk->tcp_rtx_queue);
3536
3537	if (tskb) {
3538	TCP_SKB_CB(tskb)->tcp_flags \|= TCPHDR_FIN;
3539	TCP_SKB_CB(tskb)->end_seq++;
3540	tp->write_seq++;
3541	if (!tail) {
3542	/ This means tskb was already sent.*
3543	* Pretend we included the FIN on previous transmit.
3544	* We need to set tp->snd_nxt to the value it would have
3545	* if FIN had been sent. This is because retransmit path
3546	* does not change tp->snd_nxt.
3547	*/
3548	WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + `1`);
3549	return;
3550	}
3551	} else {
3552	skb = alloc_skb_fclone(MAX_TCP_HEADER, priority: sk->sk_allocation);
3553	if (unlikely(!skb))
3554	return;
3555
3556	INIT_LIST_HEAD(list: &skb->tcp_tsorted_anchor);
3557	skb_reserve(skb, MAX_TCP_HEADER);
3558	sk_forced_mem_schedule(sk, size: skb->truesize);
3559	/ FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). /
3560	tcp_init_nondata_skb(skb, seq: tp->write_seq,
3561	TCPHDR_ACK \| TCPHDR_FIN);
3562	tcp_queue_skb(sk, skb);
3563	}
3564	__tcp_push_pending_frames(sk, cur_mss: tcp_current_mss(sk), TCP_NAGLE_OFF);
3565	}
3566
3567	/ We get here when a process closes a file descriptor (either due to*
3568	* an explicit close() or as a byproduct of exit()'ing) and there
3569	* was unread data in the receive queue. This behavior is recommended
3570	* by RFC 2525, section 2.17. -DaveM
3571	*/
3572	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3573	{
3574	struct sk_buff *skb;
3575
3576	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3577
3578	/ NOTE: No TCP options attached and we never retransmit this. /
3579	skb = alloc_skb(MAX_TCP_HEADER, priority);
3580	if (!skb) {
3581	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3582	return;
3583	}
3584
3585	/ Reserve space for headers and prepare control bits. /
3586	skb_reserve(skb, MAX_TCP_HEADER);
3587	tcp_init_nondata_skb(skb, seq: tcp_acceptable_seq(sk),
3588	TCPHDR_ACK \| TCPHDR_RST);
3589	tcp_mstamp_refresh(tcp_sk(sk));
3590	/ Send it off. /
3591	if (tcp_transmit_skb(sk, skb, clone_it: `0`, gfp_mask: priority))
3592	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3593
3594	/ skb of trace_tcp_send_reset() keeps the skb that caused RST,*
3595	* skb here is different to the troublesome skb, so use NULL
3596	*/
3597	trace_tcp_send_reset(sk, NULL);
3598	}
3599
3600	/ Send a crossed SYN-ACK during socket establishment.*
3601	* WARNING: This routine must only be called when we have already sent
3602	* a SYN packet that crossed the incoming SYN that caused this routine
3603	* to get called. If this assumption fails then the initial rcv_wnd
3604	* and rcv_wscale values will not be correct.
3605	*/
3606	int tcp_send_synack(struct sock *sk)
3607	{
3608	struct sk_buff *skb;
3609
3610	skb = tcp_rtx_queue_head(sk);
3611	if (!skb \|\| !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3612	pr_err("%s: wrong queue state\n", __func__);
3613	return -EFAULT;
3614	}
3615	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3616	if (skb_cloned(skb)) {
3617	struct sk_buff *nskb;
3618
3619	tcp_skb_tsorted_save(skb) {
3620	nskb = skb_copy(skb, GFP_ATOMIC);
3621	} tcp_skb_tsorted_restore(skb);
3622	if (!nskb)
3623	return -ENOMEM;
3624	INIT_LIST_HEAD(list: &nskb->tcp_tsorted_anchor);
3625	tcp_highest_sack_replace(sk, old: skb, new: nskb);
3626	tcp_rtx_queue_unlink_and_free(skb, sk);
3627	__skb_header_release(skb: nskb);
3628	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: nskb);
3629	sk_wmem_queued_add(sk, val: nskb->truesize);
3630	sk_mem_charge(sk, size: nskb->truesize);
3631	skb = nskb;
3632	}
3633
3634	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ACK;
3635	tcp_ecn_send_synack(sk, skb);
3636	}
3637	return tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
3638	}
3639
3640	/**
3641	* tcp_make_synack - Allocate one skb and build a SYNACK packet.
3642	* @sk: listener socket
3643	* @dst: dst entry attached to the SYNACK. It is consumed and caller
3644	* should not use it again.
3645	* @req: request_sock pointer
3646	* @foc: cookie for tcp fast open
3647	* @synack_type: Type of synack to prepare
3648	* @syn_skb: SYN packet just received. It could be NULL for rtx case.
3649	*/
3650	struct sk_buff tcp_make_synack(const* struct sock sk, struct* dst_entry *dst,
3651	struct request_sock *req,
3652	struct tcp_fastopen_cookie *foc,
3653	enum tcp_synack_type synack_type,
3654	struct sk_buff *syn_skb)
3655	{
3656	struct inet_request_sock *ireq = inet_rsk(sk: req);
3657	const struct tcp_sock *tp = tcp_sk(sk);
3658	struct tcp_out_options opts;
3659	struct tcp_key key = {};
3660	struct sk_buff *skb;
3661	int tcp_header_size;
3662	struct tcphdr *th;
3663	int mss;
3664	u64 now;
3665
3666	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3667	if (unlikely(!skb)) {
3668	dst_release(dst);
3669	return NULL;
3670	}
3671	/ Reserve space for headers. /
3672	skb_reserve(skb, MAX_TCP_HEADER);
3673
3674	switch (synack_type) {
3675	case TCP_SYNACK_NORMAL:
3676	skb_set_owner_w(skb, sk: req_to_sk(req));
3677	break;
3678	case TCP_SYNACK_COOKIE:
3679	/ Under synflood, we do not attach skb to a socket,*
3680	* to avoid false sharing.
3681	*/
3682	break;
3683	case TCP_SYNACK_FASTOPEN:
3684	/ sk is a const pointer, because we want to express multiple*
3685	* cpu might call us concurrently.
3686	* sk->sk_wmem_alloc in an atomic, we can promote to rw.
3687	*/
3688	skb_set_owner_w(skb, sk: (struct sock *)sk);
3689	break;
3690	}
3691	skb_dst_set(skb, dst);
3692
3693	mss = tcp_mss_clamp(tp, mss: dst_metric_advmss(dst));
3694
3695	memset(&opts, `0`, sizeof(opts));
3696	if (tcp_rsk(req)->req_usec_ts < `0`)
3697	tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
3698	now = tcp_clock_ns();
3699	#ifdef CONFIG_SYN_COOKIES
3700	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3701	skb_set_delivery_time(skb, kt: cookie_init_timestamp(req, now),
3702	mono: true);
3703	else
3704	#endif
3705	{
3706	skb_set_delivery_time(skb, kt: now, mono: true);
3707	if (!tcp_rsk(req)->snt_synack) / Timestamp first SYNACK /
3708	tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3709	}
3710
3711	#if defined(CONFIG_TCP_MD5SIG) \|\| defined(CONFIG_TCP_AO)
3712	rcu_read_lock();
3713	#endif
3714	if (tcp_rsk_used_ao(req)) {
3715	#ifdef CONFIG_TCP_AO
3716	struct tcp_ao_key *ao_key = NULL;
3717	u8 maclen = tcp_rsk(req)->maclen;
3718	u8 keyid = tcp_rsk(req)->ao_keyid;
3719
3720	ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
3721	keyid, -`1`);
3722	/ If there is no matching key - avoid sending anything,*
3723	* especially usigned segments. It could try harder and lookup
3724	* for another peer-matching key, but the peer has requested
3725	* ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
3726	*/
3727	if (unlikely(!ao_key \|\| tcp_ao_maclen(ao_key) != maclen)) {
3728	u8 key_maclen = ao_key ? tcp_ao_maclen(key: ao_key) : `0`;
3729
3730	rcu_read_unlock();
3731	kfree_skb(skb);
3732	net_warn_ratelimited("TCP-AO: the keyid %u with maclen %u\|%u from SYN packet is not present - not sending SYNACK\n",
3733	keyid, maclen, key_maclen);
3734	return NULL;
3735	}
3736	key.ao_key = ao_key;
3737	key.type = TCP_KEY_AO;
3738	#endif
3739	} else {
3740	#ifdef CONFIG_TCP_MD5SIG
3741	key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
3742	req_to_sk(req));
3743	if (key.md5_key)
3744	key.type = TCP_KEY_MD5;
3745	#endif
3746	}
3747	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), type: PKT_HASH_TYPE_L4);
3748	/ bpf program will be interested in the tcp_flags /
3749	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN \| TCPHDR_ACK;
3750	tcp_header_size = tcp_synack_options(sk, req, mss, skb, opts: &opts,
3751	key: &key, foc, synack_type, syn_skb)
3752	+ sizeof(*th);
3753
3754	skb_push(skb, len: tcp_header_size);
3755	skb_reset_transport_header(skb);
3756
3757	th = (struct tcphdr *)skb->data;
3758	memset(th, `0`, sizeof(struct tcphdr));
3759	th->syn = `1`;
3760	th->ack = `1`;
3761	tcp_ecn_make_synack(req, th);
3762	th->source = htons(ireq->ir_num);
3763	th->dest = ireq->ir_rmt_port;
3764	skb->mark = ireq->ir_mark;
3765	skb->ip_summed = CHECKSUM_PARTIAL;
3766	th->seq = htonl(tcp_rsk(req)->snt_isn);
3767	/ XXX data is queued and acked as is. No buffer/window check /
3768	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3769
3770	/ RFC1323: The window in SYN & SYN/ACK segments is never scaled. /
3771	th->window = htons(min(req->rsk_rcv_wnd, `65535U`));
3772	tcp_options_write(th, NULL, tcprsk: tcp_rsk(req), opts: &opts, key: &key);
3773	th->doff = (tcp_header_size >> `2`);
3774	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3775
3776	/ Okay, we have all we need - do the md5 hash if needed /
3777	if (tcp_key_is_md5(key: &key)) {
3778	#ifdef CONFIG_TCP_MD5SIG
3779	tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3780	key.md5_key, req_to_sk(req), skb);
3781	#endif
3782	} else if (tcp_key_is_ao(key: &key)) {
3783	#ifdef CONFIG_TCP_AO
3784	tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
3785	key.ao_key, req, skb,
3786	opts.hash_location - (u8 *)th, `0`);
3787	#endif
3788	}
3789	#if defined(CONFIG_TCP_MD5SIG) \|\| defined(CONFIG_TCP_AO)
3790	rcu_read_unlock();
3791	#endif
3792
3793	bpf_skops_write_hdr_opt(sk: (struct sock *)sk, skb, req, syn_skb,
3794	synack_type, opts: &opts);
3795
3796	skb_set_delivery_time(skb, kt: now, mono: true);
3797	tcp_add_tx_delay(skb, tp);
3798
3799	return skb;
3800	}
3801	EXPORT_SYMBOL(tcp_make_synack);
3802
3803	static void tcp_ca_dst_init(struct sock sk, const* struct dst_entry *dst)
3804	{
3805	struct inet_connection_sock *icsk = inet_csk(sk);
3806	const struct tcp_congestion_ops *ca;
3807	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3808
3809	if (ca_key == TCP_CA_UNSPEC)
3810	return;
3811
3812	rcu_read_lock();
3813	ca = tcp_ca_find_key(key: ca_key);
3814	if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3815	bpf_module_put(data: icsk->icsk_ca_ops, owner: icsk->icsk_ca_ops->owner);
3816	icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3817	icsk->icsk_ca_ops = ca;
3818	}
3819	rcu_read_unlock();
3820	}
3821
3822	/ Do all connect socket setups that can be done AF independent. /
3823	static void tcp_connect_init(struct sock *sk)
3824	{
3825	const struct dst_entry *dst = __sk_dst_get(sk);
3826	struct tcp_sock *tp = tcp_sk(sk);
3827	__u8 rcv_wscale;
3828	u32 rcv_wnd;
3829
3830	/ We'll fix this up when we get a response from the other end.*
3831	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
3832	*/
3833	tp->tcp_header_len = sizeof(struct tcphdr);
3834	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
3835	tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3836
3837	tcp_ao_connect_init(sk);
3838
3839	/ If user gave his TCP_MAXSEG, record it to clamp /
3840	if (tp->rx_opt.user_mss)
3841	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3842	tp->max_window = `0`;
3843	tcp_mtup_init(sk);
3844	tcp_sync_mss(sk, dst_mtu(dst));
3845
3846	tcp_ca_dst_init(sk, dst);
3847
3848	if (!tp->window_clamp)
3849	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3850	tp->advmss = tcp_mss_clamp(tp, mss: dst_metric_advmss(dst));
3851
3852	tcp_initialize_rcv_mss(sk);
3853
3854	/ limit the window selection if the user enforce a smaller rx buffer /
3855	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3856	(tp->window_clamp > tcp_full_space(sk) \|\| tp->window_clamp == `0`))
3857	tp->window_clamp = tcp_full_space(sk);
3858
3859	rcv_wnd = tcp_rwnd_init_bpf(sk);
3860	if (rcv_wnd == `0`)
3861	rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3862
3863	tcp_select_initial_window(sk, tcp_full_space(sk),
3864	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : `0`),
3865	&tp->rcv_wnd,
3866	&tp->window_clamp,
3867	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
3868	&rcv_wscale,
3869	rcv_wnd);
3870
3871	tp->rx_opt.rcv_wscale = rcv_wscale;
3872	tp->rcv_ssthresh = tp->rcv_wnd;
3873
3874	WRITE_ONCE(sk->sk_err, `0`);
3875	sock_reset_flag(sk, flag: SOCK_DONE);
3876	tp->snd_wnd = `0`;
3877	tcp_init_wl(tp, seq: `0`);
3878	tcp_write_queue_purge(sk);
3879	tp->snd_una = tp->write_seq;
3880	tp->snd_sml = tp->write_seq;
3881	tp->snd_up = tp->write_seq;
3882	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3883
3884	if (likely(!tp->repair))
3885	tp->rcv_nxt = `0`;
3886	else
3887	tp->rcv_tstamp = tcp_jiffies32;
3888	tp->rcv_wup = tp->rcv_nxt;
3889	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3890
3891	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3892	inet_csk(sk)->icsk_retransmits = `0`;
3893	tcp_clear_retrans(tp);
3894	}
3895
3896	static void tcp_connect_queue_skb(struct sock sk, struct* sk_buff *skb)
3897	{
3898	struct tcp_sock *tp = tcp_sk(sk);
3899	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3900
3901	tcb->end_seq += skb->len;
3902	__skb_header_release(skb);
3903	sk_wmem_queued_add(sk, val: skb->truesize);
3904	sk_mem_charge(sk, size: skb->truesize);
3905	WRITE_ONCE(tp->write_seq, tcb->end_seq);
3906	tp->packets_out += tcp_skb_pcount(skb);
3907	}
3908
3909	/ Build and send a SYN with data and (cached) Fast Open cookie. However,*
3910	* queue a data-only packet after the regular SYN, such that regular SYNs
3911	* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
3912	* only the SYN sequence, the data are retransmitted in the first ACK.
3913	* If cookie is not cached or other error occurs, falls back to send a
3914	* regular SYN with Fast Open cookie request option.
3915	*/
3916	static int tcp_send_syn_data(struct sock sk, struct* sk_buff *syn)
3917	{
3918	struct inet_connection_sock *icsk = inet_csk(sk);
3919	struct tcp_sock *tp = tcp_sk(sk);
3920	struct tcp_fastopen_request *fo = tp->fastopen_req;
3921	struct page_frag *pfrag = sk_page_frag(sk);
3922	struct sk_buff *syn_data;
3923	int space, err = `0`;
3924
3925	tp->rx_opt.mss_clamp = tp->advmss; / If MSS is not cached /
3926	if (!tcp_fastopen_cookie_check(sk, mss: &tp->rx_opt.mss_clamp, cookie: &fo->cookie))
3927	goto fallback;
3928
3929	/ MSS for SYN-data is based on cached MSS and bounded by PMTU and*
3930	* user-MSS. Reserve maximum option space for middleboxes that add
3931	* private TCP options. The cost is reduced data space in SYN :(
3932	*/
3933	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, mss: tp->rx_opt.mss_clamp);
3934	/ Sync mss_cache after updating the mss_clamp /
3935	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
3936
3937	space = __tcp_mtu_to_mss(sk, pmtu: icsk->icsk_pmtu_cookie) -
3938	MAX_TCP_OPTION_SPACE;
3939
3940	space = min_t(size_t, space, fo->size);
3941
3942	if (space &&
3943	!skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
3944	pfrag, prio: sk->sk_allocation))
3945	goto fallback;
3946	syn_data = tcp_stream_alloc_skb(sk, gfp: sk->sk_allocation, force_schedule: false);
3947	if (!syn_data)
3948	goto fallback;
3949	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3950	if (space) {
3951	space = min_t(size_t, space, pfrag->size - pfrag->offset);
3952	space = tcp_wmem_schedule(sk, copy: space);
3953	}
3954	if (space) {
3955	space = copy_page_from_iter(page: pfrag->page, offset: pfrag->offset,
3956	bytes: space, i: &fo->data->msg_iter);
3957	if (unlikely(!space)) {
3958	tcp_skb_tsorted_anchor_cleanup(skb: syn_data);
3959	kfree_skb(skb: syn_data);
3960	goto fallback;
3961	}
3962	skb_fill_page_desc(skb: syn_data, i: `0`, page: pfrag->page,
3963	off: pfrag->offset, size: space);
3964	page_ref_inc(page: pfrag->page);
3965	pfrag->offset += space;
3966	skb_len_add(skb: syn_data, delta: space);
3967	skb_zcopy_set(skb: syn_data, uarg: fo->uarg, NULL);
3968	}
3969	/ No more data pending in inet_wait_for_connect() /
3970	if (space == fo->size)
3971	fo->data = NULL;
3972	fo->copied = space;
3973
3974	tcp_connect_queue_skb(sk, skb: syn_data);
3975	if (syn_data->len)
3976	tcp_chrono_start(sk, type: TCP_CHRONO_BUSY);
3977
3978	err = tcp_transmit_skb(sk, skb: syn_data, clone_it: `1`, gfp_mask: sk->sk_allocation);
3979
3980	skb_set_delivery_time(skb: syn, kt: syn_data->skb_mstamp_ns, mono: true);
3981
3982	/ Now full SYN+DATA was cloned and sent (or not),*
3983	* remove the SYN from the original skb (syn_data)
3984	* we keep in write queue in case of a retransmit, as we
3985	* also have the SYN packet (with no data) in the same queue.
3986	*/
3987	TCP_SKB_CB(syn_data)->seq++;
3988	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK \| TCPHDR_PSH;
3989	if (!err) {
3990	tp->syn_data = (fo->copied > `0`);
3991	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: syn_data);
3992	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
3993	goto done;
3994	}
3995
3996	/ data was not sent, put it in write_queue /
3997	__skb_queue_tail(list: &sk->sk_write_queue, newsk: syn_data);
3998	tp->packets_out -= tcp_skb_pcount(skb: syn_data);
3999
4000	fallback:
4001	/ Send a regular SYN with Fast Open cookie request option /
4002	if (fo->cookie.len > `0`)
4003	fo->cookie.len = `0`;
4004	err = tcp_transmit_skb(sk, skb: syn, clone_it: `1`, gfp_mask: sk->sk_allocation);
4005	if (err)
4006	tp->syn_fastopen = `0`;
4007	done:
4008	fo->cookie.len = -`1`; / Exclude Fast Open option for SYN retries /
4009	return err;
4010	}
4011
/* Build a SYN and send it off.
 *
 * Returns 0 when the SYN was queued and the retransmit timer armed (or,
 * in repair mode, when the connection was finished directly), or a
 * negative errno:
 *   -EKEYREJECTED  MD5/TCP-AO key configuration does not allow connecting
 *   -EHOSTUNREACH  rebuild_header() failed
 *   -ENOBUFS       the SYN skb could not be allocated
 *   -ECONNREFUSED  propagated from the transmit path
 * Any other transmit error is not reported to the caller; the SYN stays
 * queued and the retransmit timer is armed.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
	/* Has to be checked late, after setting daddr/saddr/ops.
	 * Return error if the peer has both a md5 and a tcp-ao key
	 * configured as this is ambiguous.
	 */
	if (unlikely(rcu_dereference_protected(tp->md5sig_info,
					       lockdep_sock_is_held(sk)))) {
		bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
		bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
		struct tcp_ao_info *ao_info;

		ao_info = rcu_dereference_check(tp->ao_info,
						lockdep_sock_is_held(sk));
		if (ao_info) {
			/* This is an extra check: tcp_ao_required() in
			 * tcp_v{4,6}_parse_md5_keys() should prevent adding
			 * md5 keys on ao_required socket.
			 */
			needs_ao |= ao_info->ao_required;
			WARN_ON_ONCE(ao_info->ao_required && needs_md5);
		}
		if (needs_md5 && needs_ao)
			return -EKEYREJECTED;

		/* If we have a matching md5 key and no matching tcp-ao key
		 * then free up ao_info if allocated.
		 */
		if (needs_md5) {
			tcp_ao_destroy_sock(sk, false);
		} else if (needs_ao) {
			tcp_clear_md5_list(sk);
			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
						  lockdep_sock_is_held(sk)));
		}
	}
#endif
#ifdef CONFIG_TCP_AO
	if (unlikely(rcu_dereference_protected(tp->ao_info,
					       lockdep_sock_is_held(sk)))) {
		/* Don't allow connecting if ao is configured but no
		 * matching key is found.
		 */
		if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
			return -EKEYREJECTED;
	}
#endif

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* Repair mode: no SYN is sent, the connection is finished directly. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* The SYN consumes one sequence number (write_seq is post-incremented). */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp_ts(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
	tp->pushed_seq = tp->write_seq;
	/* If something is still waiting in the write queue, snd_nxt and
	 * pushed_seq must point at the start of that unsent skb instead.
	 */
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
		tp->pushed_seq = TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);
4113
4114	u32 tcp_delack_max(const struct sock *sk)
4115	{
4116	const struct dst_entry *dst = __sk_dst_get(sk);
4117	u32 delack_max = inet_csk(sk)->icsk_delack_max;
4118
4119	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
4120	u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
4121	u32 delack_from_rto_min = max_t(int, `1`, rto_min - `1`);
4122
4123	delack_max = min_t(u32, delack_max, delack_from_rto_min);
4124	}
4125	return delack_max;
4126	}
4127
4128	/ Send out a delayed ack, the caller does the policy checking*
4129	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
4130	* for details.
4131	*/
4132	void tcp_send_delayed_ack(struct sock *sk)
4133	{
4134	struct inet_connection_sock *icsk = inet_csk(sk);
4135	int ato = icsk->icsk_ack.ato;
4136	unsigned long timeout;
4137
4138	if (ato > TCP_DELACK_MIN) {
4139	const struct tcp_sock *tp = tcp_sk(sk);
4140	int max_ato = HZ / `2`;
4141
4142	if (inet_csk_in_pingpong_mode(sk) \|\|
4143	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
4144	max_ato = TCP_DELACK_MAX;
4145
4146	/ Slow path, intersegment interval is "high". /
4147
4148	/ If some rtt estimate is known, use it to bound delayed ack.*
4149	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
4150	* directly.
4151	*/
4152	if (tp->srtt_us) {
4153	int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> `3`),
4154	TCP_DELACK_MIN);
4155
4156	if (rtt < max_ato)
4157	max_ato = rtt;
4158	}
4159
4160	ato = min(ato, max_ato);
4161	}
4162
4163	ato = min_t(u32, ato, tcp_delack_max(sk));
4164
4165	/ Stay within the limit we were given /
4166	timeout = jiffies + ato;
4167
4168	/ Use new timeout only if there wasn't a older one earlier. /
4169	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
4170	/ If delack timer is about to expire, send ACK now. /
4171	if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> `2`))) {
4172	tcp_send_ack(sk);
4173	return;
4174	}
4175
4176	if (!time_before(timeout, icsk->icsk_ack.timeout))
4177	timeout = icsk->icsk_ack.timeout;
4178	}
4179	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
4180	icsk->icsk_ack.timeout = timeout;
4181	sk_reset_timer(sk, timer: &icsk->icsk_delack_timer, expires: timeout);
4182	}
4183
4184	/ This routine sends an ack and also updates the window. /
4185	void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
4186	{
4187	struct sk_buff *buff;
4188
4189	/ If we have been reset, we may not send again. /
4190	if (sk->sk_state == TCP_CLOSE)
4191	return;
4192
4193	/ We are not putting this on the write queue, so*
4194	* tcp_transmit_skb() will set the ownership to this
4195	* sock.
4196	*/
4197	buff = alloc_skb(MAX_TCP_HEADER,
4198	priority: sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
4199	if (unlikely(!buff)) {
4200	struct inet_connection_sock *icsk = inet_csk(sk);
4201	unsigned long delay;
4202
4203	delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
4204	if (delay < TCP_RTO_MAX)
4205	icsk->icsk_ack.retry++;
4206	inet_csk_schedule_ack(sk);
4207	icsk->icsk_ack.ato = TCP_ATO_MIN;
4208	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, when: delay, TCP_RTO_MAX);
4209	return;
4210	}
4211
4212	/ Reserve space for headers and prepare control bits. /
4213	skb_reserve(skb: buff, MAX_TCP_HEADER);
4214	tcp_init_nondata_skb(skb: buff, seq: tcp_acceptable_seq(sk), TCPHDR_ACK);
4215
4216	/ We do not want pure acks influencing TCP Small Queues or fq/pacing*
4217	* too much.
4218	* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
4219	*/
4220	skb_set_tcp_pure_ack(skb: buff);
4221
4222	/ Send it off, this clears delayed acks for us. /
4223	__tcp_transmit_skb(sk, skb: buff, clone_it: `0`, gfp_mask: (__force gfp_t)`0`, rcv_nxt);
4224	}
4225	EXPORT_SYMBOL_GPL(__tcp_send_ack);
4226
4227	void tcp_send_ack(struct sock *sk)
4228	{
4229	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
4230	}
4231
4232	/ This routine sends a packet with an out of date sequence*
4233	* number. It assumes the other end will try to ack it.
4234	*
4235	* Question: what should we make while urgent mode?
4236	* 4.4BSD forces sending single byte of data. We cannot send
4237	* out of window data, because we have SND.NXT==SND.MAX...
4238	*
4239	* Current solution: to send TWO zero-length segments in urgent mode:
4240	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
4241	* out-of-date with SND.UNA-1 to probe window.
4242	*/
4243	static int tcp_xmit_probe_skb(struct sock sk, int* urgent, int mib)
4244	{
4245	struct tcp_sock *tp = tcp_sk(sk);
4246	struct sk_buff *skb;
4247
4248	/ We don't queue it, tcp_transmit_skb() sets ownership. /
4249	skb = alloc_skb(MAX_TCP_HEADER,
4250	priority: sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
4251	if (!skb)
4252	return -`1`;
4253
4254	/ Reserve space for headers and set control bits. /
4255	skb_reserve(skb, MAX_TCP_HEADER);
4256	/ Use a previous sequence. This should cause the other*
4257	* end to send an ack. Don't queue or clone SKB, just
4258	* send it.
4259	*/
4260	tcp_init_nondata_skb(skb, seq: tp->snd_una - !urgent, TCPHDR_ACK);
4261	NET_INC_STATS(sock_net(sk), mib);
4262	return tcp_transmit_skb(sk, skb, clone_it: `0`, gfp_mask: (__force gfp_t)`0`);
4263	}
4264
4265	/ Called from setsockopt( ... TCP_REPAIR ) /
4266	void tcp_send_window_probe(struct sock *sk)
4267	{
4268	if (sk->sk_state == TCP_ESTABLISHED) {
4269	tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - `1`;
4270	tcp_mstamp_refresh(tcp_sk(sk));
4271	tcp_xmit_probe_skb(sk, urgent: `0`, mib: LINUX_MIB_TCPWINPROBE);
4272	}
4273	}
4274
4275	/ Initiate keepalive or window probe from timer. /
4276	int tcp_write_wakeup(struct sock sk, int* mib)
4277	{
4278	struct tcp_sock *tp = tcp_sk(sk);
4279	struct sk_buff *skb;
4280
4281	if (sk->sk_state == TCP_CLOSE)
4282	return -`1`;
4283
4284	skb = tcp_send_head(sk);
4285	if (skb && before(TCP_SKB_CB(skb)->seq, seq2: tcp_wnd_end(tp))) {
4286	int err;
4287	unsigned int mss = tcp_current_mss(sk);
4288	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4289
4290	if (before(seq1: tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
4291	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
4292
4293	/ We are probing the opening of a window*
4294	* but the window size is != 0
4295	* must have been a result SWS avoidance ( sender )
4296	*/
4297	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
4298	skb->len > mss) {
4299	seg_size = min(seg_size, mss);
4300	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
4301	if (tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_WRITE_QUEUE,
4302	skb, len: seg_size, mss_now: mss, GFP_ATOMIC))
4303	return -`1`;
4304	} else if (!tcp_skb_pcount(skb))
4305	tcp_set_skb_tso_segs(skb, mss_now: mss);
4306
4307	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
4308	err = tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
4309	if (!err)
4310	tcp_event_new_data_sent(sk, skb);
4311	return err;
4312	} else {
4313	if (between(seq1: tp->snd_up, seq2: tp->snd_una + `1`, seq3: tp->snd_una + `0xFFFF`))
4314	tcp_xmit_probe_skb(sk, urgent: `1`, mib);
4315	return tcp_xmit_probe_skb(sk, urgent: `0`, mib);
4316	}
4317	}
4318
4319	/ A window probe timeout has occurred. If window is not closed send*
4320	* a partial packet else a zero probe.
4321	*/
4322	void tcp_send_probe0(struct sock *sk)
4323	{
4324	struct inet_connection_sock *icsk = inet_csk(sk);
4325	struct tcp_sock *tp = tcp_sk(sk);
4326	struct net *net = sock_net(sk);
4327	unsigned long timeout;
4328	int err;
4329
4330	err = tcp_write_wakeup(sk, mib: LINUX_MIB_TCPWINPROBE);
4331
4332	if (tp->packets_out \|\| tcp_write_queue_empty(sk)) {
4333	/ Cancel probe timer, if it is not required. /
4334	icsk->icsk_probes_out = `0`;
4335	icsk->icsk_backoff = `0`;
4336	icsk->icsk_probes_tstamp = `0`;
4337	return;
4338	}
4339
4340	icsk->icsk_probes_out++;
4341	if (err <= `0`) {
4342	if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
4343	icsk->icsk_backoff++;
4344	timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
4345	} else {
4346	/ If packet was not sent due to local congestion,*
4347	* Let senders fight for local resources conservatively.
4348	*/
4349	timeout = TCP_RESOURCE_PROBE_INTERVAL;
4350	}
4351
4352	timeout = tcp_clamp_probe0_to_user_timeout(sk, when: timeout);
4353	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when: timeout, TCP_RTO_MAX);
4354	}
4355
4356	int tcp_rtx_synack(const struct sock sk, struct* request_sock *req)
4357	{
4358	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
4359	struct flowi fl;
4360	int res;
4361
4362	/ Paired with WRITE_ONCE() in sock_setsockopt() /
4363	if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
4364	WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
4365	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4366	NULL);
4367	if (!res) {
4368	TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4369	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4370	if (unlikely(tcp_passive_fastopen(sk))) {
4371	/ sk has const attribute because listeners are lockless.*
4372	* However in this case, we are dealing with a passive fastopen
4373	* socket thus we can change total_retrans value.
4374	*/
4375	tcp_sk_rw(sk)->total_retrans++;
4376	}
4377	trace_tcp_retransmit_synack(sk, req);
4378	}
4379	return res;
4380	}
4381	EXPORT_SYMBOL(tcp_rtx_synack);
4382

source code of linux/net/ipv4/tcp_output.c