tcp.c source code [linux/net/ipv4/tcp.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*
21	* Fixes:
22	* Alan Cox : Numerous verify_area() calls
23	* Alan Cox : Set the ACK bit on a reset
24	* Alan Cox : Stopped it crashing if it closed while
25	* sk->inuse=1 and was trying to connect
26	* (tcp_err()).
27	* Alan Cox : All icmp error handling was broken
28	* pointers passed where wrong and the
29	* socket was looked up backwards. Nobody
30	* tested any icmp error code obviously.
31	* Alan Cox : tcp_err() now handled properly. It
32	* wakes people on errors. poll
33	* behaves and the icmp error race
34	* has gone by moving it into sock.c
35	* Alan Cox : tcp_send_reset() fixed to work for
36	* everything not just packets for
37	* unknown sockets.
38	* Alan Cox : tcp option processing.
39	* Alan Cox : Reset tweaked (still not 100%) [Had
40	* syn rule wrong]
41	* Herp Rosmanith : More reset fixes
42	* Alan Cox : No longer acks invalid rst frames.
43	* Acking any kind of RST is right out.
44	* Alan Cox : Sets an ignore me flag on an rst
45	* receive otherwise odd bits of prattle
46	* escape still
47	* Alan Cox : Fixed another acking RST frame bug.
48	* Should stop LAN workplace lockups.
49	* Alan Cox : Some tidyups using the new skb list
50	* facilities
51	* Alan Cox : sk->keepopen now seems to work
52	* Alan Cox : Pulls options out correctly on accepts
53	* Alan Cox : Fixed assorted sk->rqueue->next errors
54	* Alan Cox : PSH doesn't end a TCP read. Switched a
55	* bit to skb ops.
56	* Alan Cox : Tidied tcp_data to avoid a potential
57	* nasty.
58	* Alan Cox : Added some better commenting, as the
59	* tcp is hard to follow
60	* Alan Cox : Removed incorrect check for 20 * psh
61	* Michael O'Reilly : ack < copied bug fix.
62	* Johannes Stille : Misc tcp fixes (not all in yet).
63	* Alan Cox : FIN with no memory -> CRASH
64	* Alan Cox : Added socket option proto entries.
65	* Also added awareness of them to accept.
66	* Alan Cox : Added TCP options (SOL_TCP)
67	* Alan Cox : Switched wakeup calls to callbacks,
68	* so the kernel can layer network
69	* sockets.
70	* Alan Cox : Use ip_tos/ip_ttl settings.
71	* Alan Cox : Handle FIN (more) properly (we hope).
72	* Alan Cox : RST frames sent on unsynchronised
73	* state ack error.
74	* Alan Cox : Put in missing check for SYN bit.
75	* Alan Cox : Added tcp_select_window() aka NET2E
76	* window non shrink trick.
77	* Alan Cox : Added a couple of small NET2E timer
78	* fixes
79	* Charles Hedrick : TCP fixes
80	* Toomas Tamm : TCP window fixes
81	* Alan Cox : Small URG fix to rlogin ^C ack fight
82	* Charles Hedrick : Rewrote most of it to actually work
83	* Linus : Rewrote tcp_read() and URG handling
84	* completely
85	* Gerhard Koerting: Fixed some missing timer handling
86	* Matthew Dillon : Reworked TCP machine states as per RFC
87	* Gerhard Koerting: PC/TCP workarounds
88	* Adam Caldwell : Assorted timer/timing errors
89	* Matthew Dillon : Fixed another RST bug
90	* Alan Cox : Move to kernel side addressing changes.
91	* Alan Cox : Beginning work on TCP fastpathing
92	* (not yet usable)
93	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
94	* Alan Cox : TCP fast path debugging
95	* Alan Cox : Window clamping
96	* Michael Riepe : Bug in tcp_check()
97	* Matt Dillon : More TCP improvements and RST bug fixes
98	* Matt Dillon : Yet more small nasties remove from the
99	* TCP code (Be very nice to this man if
100	* tcp finally works 100%) 8)
101	* Alan Cox : BSD accept semantics.
102	* Alan Cox : Reset on closedown bug.
103	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
104	* Michael Pall : Handle poll() after URG properly in
105	* all cases.
106	* Michael Pall : Undo the last fix in tcp_read_urg()
107	* (multi URG PUSH broke rlogin).
108	* Michael Pall : Fix the multi URG PUSH problem in
109	* tcp_readable(), poll() after URG
110	* works now.
111	* Michael Pall : recv(...,MSG_OOB) never blocks in the
112	* BSD api.
113	* Alan Cox : Changed the semantics of sk->socket to
114	* fix a race and a signal problem with
115	* accept() and async I/O.
116	* Alan Cox : Relaxed the rules on tcp_sendto().
117	* Yury Shevchuk : Really fixed accept() blocking problem.
118	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
119	* clients/servers which listen in on
120	* fixed ports.
121	* Alan Cox : Cleaned the above up and shrank it to
122	* a sensible code size.
123	* Alan Cox : Self connect lockup fix.
124	* Alan Cox : No connect to multicast.
125	* Ross Biro : Close unaccepted children on master
126	* socket close.
127	* Alan Cox : Reset tracing code.
128	* Alan Cox : Spurious resets on shutdown.
129	* Alan Cox : Giant 15 minute/60 second timer error
130	* Alan Cox : Small whoops in polling before an
131	* accept.
132	* Alan Cox : Kept the state trace facility since
133	* it's handy for debugging.
134	* Alan Cox : More reset handler fixes.
135	* Alan Cox : Started rewriting the code based on
136	* the RFC's for other useful protocol
137	* references see: Comer, KA9Q NOS, and
138	* for a reference on the difference
139	* between specifications and how BSD
140	* works see the 4.4lite source.
141	* A.N.Kuznetsov : Don't time wait on completion of tidy
142	* close.
143	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
144	* Linus Torvalds : Fixed BSD port reuse to work first syn
145	* Alan Cox : Reimplemented timers as per the RFC
146	* and using multiple timers for sanity.
147	* Alan Cox : Small bug fixes, and a lot of new
148	* comments.
149	* Alan Cox : Fixed dual reader crash by locking
150	* the buffers (much like datagram.c)
151	* Alan Cox : Fixed stuck sockets in probe. A probe
152	* now gets fed up of retrying without
153	* (even a no space) answer.
154	* Alan Cox : Extracted closing code better
155	* Alan Cox : Fixed the closing state machine to
156	* resemble the RFC.
157	* Alan Cox : More 'per spec' fixes.
158	* Jorge Cwik : Even faster checksumming.
159	* Alan Cox : tcp_data() doesn't ack illegal PSH
160	* only frames. At least one pc tcp stack
161	* generates them.
162	* Alan Cox : Cache last socket.
163	* Alan Cox : Per route irtt.
164	* Matt Day : poll()->select() match BSD precisely on error
165	* Alan Cox : New buffers
166	* Marc Tamsky : Various sk->prot->retransmits and
167	* sk->retransmits misupdating fixed.
168	* Fixed tcp_write_timeout: stuck close,
169	* and TCP syn retries gets used now.
170	* Mark Yarvis : In tcp_read_wakeup(), don't send an
171	* ack if state is TCP_CLOSED.
172	* Alan Cox : Look up device on a retransmit - routes may
173	* change. Doesn't yet cope with MSS shrink right
174	* but it's a start!
175	* Marc Tamsky : Closing in closing fixes.
176	* Mike Shaver : RFC1122 verifications.
177	* Alan Cox : rcv_saddr errors.
178	* Alan Cox : Block double connect().
179	* Alan Cox : Small hooks for enSKIP.
180	* Alexey Kuznetsov: Path MTU discovery.
181	* Alan Cox : Support soft errors.
182	* Alan Cox : Fix MTU discovery pathological case
183	* when the remote claims no mtu!
184	* Marc Tamsky : TCP_CLOSE fix.
185	* Colin (G3TNE) : Send a reset on syn ack replies in
186	* window but wrong (fixes NT lpd problems)
187	* Pedro Roque : Better TCP window handling, delayed ack.
188	* Joerg Reuter : No modification of locked buffers in
189	* tcp_do_retransmit()
190	* Eric Schenk : Changed receiver side silly window
191	* avoidance algorithm to BSD style
192	* algorithm. This doubles throughput
193	* against machines running Solaris,
194	* and seems to result in general
195	* improvement.
196	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
197	* Willy Konynenberg : Transparent proxying support.
198	* Mike McLagan : Routing by source
199	* Keith Owens : Do proper merging with partial SKB's in
200	* tcp_do_sendmsg to avoid burstiness.
201	* Eric Schenk : Fix fast close down bug with
202	* shutdown() followed by close().
203	* Andi Kleen : Make poll agree with SIGIO
204	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
205	* lingertime == 0 (RFC 793 ABORT Call)
206	* Hirokazu Takahashi : Use copy_from_user() instead of
207	* csum_and_copy_from_user() if possible.
208	*
209	* Description of States:
210	*
211	* TCP_SYN_SENT sent a connection request, waiting for ack
212	*
213	* TCP_SYN_RECV received a connection request, sent ack,
214	* waiting for final ack in three-way handshake.
215	*
216	* TCP_ESTABLISHED connection established
217	*
218	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
219	* transmission of remaining buffered data
220	*
221	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
222	* to shutdown
223	*
224	* TCP_CLOSING both sides have shutdown but we still have
225	* data we have to finish sending
226	*
227	* TCP_TIME_WAIT timeout to catch resent junk before entering
228	* closed, can only be entered from FIN_WAIT2
229	* or CLOSING. Required because the other end
230	* may not have gotten our last ACK causing it
231	* to retransmit the data packet (which we ignore)
232	*
233	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
234	* us to finish writing our data and to shutdown
235	* (we have to close() to move on to LAST_ACK)
236	*
237	* TCP_LAST_ACK our side has shutdown after remote has
238	* shutdown. There may still be data in our
239	* buffer that we have to finish sending
240	*
241	* TCP_CLOSE socket is finished
242	*/
243
244	#define pr_fmt(fmt) "TCP: " fmt
245
246	#include <crypto/hash.h>
247	#include <linux/kernel.h>
248	#include <linux/module.h>
249	#include <linux/types.h>
250	#include <linux/fcntl.h>
251	#include <linux/poll.h>
252	#include <linux/inet_diag.h>
253	#include <linux/init.h>
254	#include <linux/fs.h>
255	#include <linux/skbuff.h>
256	#include <linux/scatterlist.h>
257	#include <linux/splice.h>
258	#include <linux/net.h>
259	#include <linux/socket.h>
260	#include <linux/random.h>
261	#include <linux/memblock.h>
262	#include <linux/highmem.h>
263	#include <linux/cache.h>
264	#include <linux/err.h>
265	#include <linux/time.h>
266	#include <linux/slab.h>
267	#include <linux/errqueue.h>
268	#include <linux/static_key.h>
269	#include <linux/btf.h>
270
271	#include <net/icmp.h>
272	#include <net/inet_common.h>
273	#include <net/tcp.h>
274	#include <net/mptcp.h>
275	#include <net/xfrm.h>
276	#include <net/ip.h>
277	#include <net/sock.h>
278
279	#include <linux/uaccess.h>
280	#include <asm/ioctls.h>
281	#include <net/busy_poll.h>
282
/* Track pending CMSGs.
 *
 * The two values are distinct single bits (1 and 2).
 * NOTE(review): presumably so they can be OR-ed into one pending mask;
 * the users are outside this excerpt - confirm.
 */
enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};
288
289	DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
290	EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
291
292	long sysctl_tcp_mem[`3`] __read_mostly;
293	EXPORT_SYMBOL(sysctl_tcp_mem);
294
295	atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; / Current allocated memory. /
296	EXPORT_SYMBOL(tcp_memory_allocated);
297	DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
298	EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
299
300	#if IS_ENABLED(CONFIG_SMC)
301	DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
302	EXPORT_SYMBOL(tcp_have_smc);
303	#endif
304
305	/*
306	* Current number of TCP sockets.
307	*/
308	struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
309	EXPORT_SYMBOL(tcp_sockets_allocated);
310
/*
 * TCP splice context
 *
 * Carried through read_descriptor_t.arg.data from tcp_splice_read()
 * down to tcp_splice_data_recv().
 */
struct tcp_splice_state {
	struct pipe_inode_info *pipe;	/* destination pipe */
	size_t len;			/* bytes still to be spliced */
	unsigned int flags;		/* splice modifier flags */
};
319
320	/*
321	* Pressure flag: try to collapse.
322	* Technical note: it is used by multiple contexts non atomically.
323	* All the __sk_mem_schedule() is of this nature: accounting
324	* is strict, actions are advisory and have some latency.
325	*/
326	unsigned long tcp_memory_pressure __read_mostly;
327	EXPORT_SYMBOL_GPL(tcp_memory_pressure);
328
329	void tcp_enter_memory_pressure(struct sock *sk)
330	{
331	unsigned long val;
332
333	if (READ_ONCE(tcp_memory_pressure))
334	return;
335	val = jiffies;
336
337	if (!val)
338	val--;
339	if (!cmpxchg(&tcp_memory_pressure, `0`, val))
340	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
341	}
342	EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
343
344	void tcp_leave_memory_pressure(struct sock *sk)
345	{
346	unsigned long val;
347
348	if (!READ_ONCE(tcp_memory_pressure))
349	return;
350	val = xchg(&tcp_memory_pressure, `0`);
351	if (val)
352	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
353	jiffies_to_msecs(jiffies - val));
354	}
355	EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
356
/* Convert seconds to retransmits based on initial and max timeout
 *
 * @seconds: total time budget; values <= 0 yield 0 retransmits
 * @timeout: initial timeout, doubled after each retransmit
 * @rto_max: ceiling applied to the doubled timeout
 *
 * Returns the number of retransmits (at least 1 for a positive budget,
 * capped at 255) whose cumulative timeouts cover @seconds.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		/* res < 255 keeps the u8 counter from wrapping. */
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}
376
/* Convert retransmits to seconds based on initial and max timeout
 *
 * @retrans: number of retransmits; 0 yields 0
 * @timeout: initial timeout, doubled after each retransmit
 * @rto_max: ceiling applied to the doubled timeout
 *
 * Returns the sum of the first @retrans timeouts of the backoff series.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		/* The first timeout is already counted; add the remaining ones. */
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
393
394	static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
395	{
396	u32 rate = READ_ONCE(tp->rate_delivered);
397	u32 intv = READ_ONCE(tp->rate_interval_us);
398	u64 rate64 = `0`;
399
400	if (rate && intv) {
401	rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
402	do_div(rate64, intv);
403	}
404	return rate64;
405	}
406
407	/ Address-family independent initialization for a tcp_sock.*
408	*
409	* NOTE: A lot of things set to zero explicitly by call to
410	* sk_alloc() so need not be done here.
411	*/
412	void tcp_init_sock(struct sock *sk)
413	{
414	struct inet_connection_sock *icsk = inet_csk(sk);
415	struct tcp_sock *tp = tcp_sk(sk);
416
417	tp->out_of_order_queue = RB_ROOT;
418	sk->tcp_rtx_queue = RB_ROOT;
419	tcp_init_xmit_timers(sk);
420	INIT_LIST_HEAD(list: &tp->tsq_node);
421	INIT_LIST_HEAD(list: &tp->tsorted_sent_queue);
422
423	icsk->icsk_rto = TCP_TIMEOUT_INIT;
424	icsk->icsk_rto_min = TCP_RTO_MIN;
425	icsk->icsk_delack_max = TCP_DELACK_MAX;
426	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
427	minmax_reset(m: &tp->rtt_min, tcp_jiffies32, meas: ~`0U`);
428
429	/ So many TCP implementations out there (incorrectly) count the*
430	* initial SYN frame in their delayed-ACK and congestion control
431	* algorithms that we must have the following bandaid to talk
432	* efficiently to them. -DaveM
433	*/
434	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
435
436	/ There's a bubble in the pipe until at least the first ACK. /
437	tp->app_limited = ~`0U`;
438	tp->rate_app_limited = `1`;
439
440	/ See draft-stevens-tcpca-spec-01 for discussion of the*
441	* initialization of these values.
442	*/
443	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
444	tp->snd_cwnd_clamp = ~`0`;
445	tp->mss_cache = TCP_MSS_DEFAULT;
446
447	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
448	tcp_assign_congestion_control(sk);
449
450	tp->tsoffset = `0`;
451	tp->rack.reo_wnd_steps = `1`;
452
453	sk->sk_write_space = sk_stream_write_space;
454	sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
455
456	icsk->icsk_sync_mss = tcp_sync_mss;
457
458	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[`1`]));
459	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`1`]));
460	tcp_scaling_ratio_init(sk);
461
462	set_bit(SOCK_SUPPORT_ZC, addr: &sk->sk_socket->flags);
463	sk_sockets_allocated_inc(sk);
464	}
465	EXPORT_SYMBOL(tcp_init_sock);
466
467	static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
468	{
469	struct sk_buff *skb = tcp_write_queue_tail(sk);
470
471	if (tsflags && skb) {
472	struct skb_shared_info *shinfo = skb_shinfo(skb);
473	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
474
475	sock_tx_timestamp(sk, tsflags, tx_flags: &shinfo->tx_flags);
476	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
477	tcb->txstamp_ack = `1`;
478	if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
479	shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - `1`;
480	}
481	}
482
483	static bool tcp_stream_is_readable(struct sock sk, int* target)
484	{
485	if (tcp_epollin_ready(sk, target))
486	return true;
487	return sk_is_readable(sk);
488	}
489
490	/*
491	* Wait for a TCP event.
492	*
493	* Note that we don't need to lock the socket, as the upper poll layers
494	* take care of normal races (between the test and the event) and we don't
495	* go look at any of the socket buffers directly.
496	*/
497	__poll_t tcp_poll(struct file file, struct* socket sock, poll_table wait)
498	{
499	__poll_t mask;
500	struct sock *sk = sock->sk;
501	const struct tcp_sock *tp = tcp_sk(sk);
502	u8 shutdown;
503	int state;
504
505	sock_poll_wait(filp: file, sock, p: wait);
506
507	state = inet_sk_state_load(sk);
508	if (state == TCP_LISTEN)
509	return inet_csk_listen_poll(sk);
510
511	/ Socket is not locked. We are protected from async events*
512	* by poll logic and correct handling of state changes
513	* made by other threads is impossible in any case.
514	*/
515
516	mask = `0`;
517
518	/*
519	* EPOLLHUP is certainly not done right. But poll() doesn't
520	* have a notion of HUP in just one direction, and for a
521	* socket the read side is more interesting.
522	*
523	* Some poll() documentation says that EPOLLHUP is incompatible
524	* with the EPOLLOUT/POLLWR flags, so somebody should check this
525	* all. But careful, it tends to be safer to return too many
526	* bits than too few, and you can easily break real applications
527	* if you don't tell them that something has hung up!
528	*
529	* Check-me.
530	*
531	* Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
532	* our fs/select.c). It means that after we received EOF,
533	* poll always returns immediately, making impossible poll() on write()
534	* in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
535	* if and only if shutdown has been made in both directions.
536	* Actually, it is interesting to look how Solaris and DUX
537	* solve this dilemma. I would prefer, if EPOLLHUP were maskable,
538	* then we could set it on SND_SHUTDOWN. BTW examples given
539	* in Stevens' books assume exactly this behaviour, it explains
540	* why EPOLLHUP is incompatible with EPOLLOUT. --ANK
541	*
542	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
543	* blocking on fresh not-connected or disconnected socket. --ANK
544	*/
545	shutdown = READ_ONCE(sk->sk_shutdown);
546	if (shutdown == SHUTDOWN_MASK \|\| state == TCP_CLOSE)
547	mask \|= EPOLLHUP;
548	if (shutdown & RCV_SHUTDOWN)
549	mask \|= EPOLLIN \| EPOLLRDNORM \| EPOLLRDHUP;
550
551	/ Connected or passive Fast Open socket? /
552	if (state != TCP_SYN_SENT &&
553	(state != TCP_SYN_RECV \|\| rcu_access_pointer(tp->fastopen_rsk))) {
554	int target = sock_rcvlowat(sk, waitall: `0`, INT_MAX);
555	u16 urg_data = READ_ONCE(tp->urg_data);
556
557	if (unlikely(urg_data) &&
558	READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
559	!sock_flag(sk, flag: SOCK_URGINLINE))
560	target++;
561
562	if (tcp_stream_is_readable(sk, target))
563	mask \|= EPOLLIN \| EPOLLRDNORM;
564
565	if (!(shutdown & SEND_SHUTDOWN)) {
566	if (__sk_stream_is_writeable(sk, wake: `1`)) {
567	mask \|= EPOLLOUT \| EPOLLWRNORM;
568	} else { / send SIGIO later /
569	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
570	set_bit(SOCK_NOSPACE, addr: &sk->sk_socket->flags);
571
572	/ Race breaker. If space is freed after*
573	* wspace test but before the flags are set,
574	* IO signal will be lost. Memory barrier
575	* pairs with the input side.
576	*/
577	smp_mb__after_atomic();
578	if (__sk_stream_is_writeable(sk, wake: `1`))
579	mask \|= EPOLLOUT \| EPOLLWRNORM;
580	}
581	} else
582	mask \|= EPOLLOUT \| EPOLLWRNORM;
583
584	if (urg_data & TCP_URG_VALID)
585	mask \|= EPOLLPRI;
586	} else if (state == TCP_SYN_SENT &&
587	inet_test_bit(DEFER_CONNECT, sk)) {
588	/ Active TCP fastopen socket with defer_connect*
589	* Return EPOLLOUT so application can call write()
590	* in order for kernel to generate SYN+data
591	*/
592	mask \|= EPOLLOUT \| EPOLLWRNORM;
593	}
594	/ This barrier is coupled with smp_wmb() in tcp_reset() /
595	smp_rmb();
596	if (READ_ONCE(sk->sk_err) \|\|
597	!skb_queue_empty_lockless(list: &sk->sk_error_queue))
598	mask \|= EPOLLERR;
599
600	return mask;
601	}
602	EXPORT_SYMBOL(tcp_poll);
603
604	int tcp_ioctl(struct sock sk, int* cmd, int *karg)
605	{
606	struct tcp_sock *tp = tcp_sk(sk);
607	int answ;
608	bool slow;
609
610	switch (cmd) {
611	case SIOCINQ:
612	if (sk->sk_state == TCP_LISTEN)
613	return -EINVAL;
614
615	slow = lock_sock_fast(sk);
616	answ = tcp_inq(sk);
617	unlock_sock_fast(sk, slow);
618	break;
619	case SIOCATMARK:
620	answ = READ_ONCE(tp->urg_data) &&
621	READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
622	break;
623	case SIOCOUTQ:
624	if (sk->sk_state == TCP_LISTEN)
625	return -EINVAL;
626
627	if ((`1` << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
628	answ = `0`;
629	else
630	answ = READ_ONCE(tp->write_seq) - tp->snd_una;
631	break;
632	case SIOCOUTQNSD:
633	if (sk->sk_state == TCP_LISTEN)
634	return -EINVAL;
635
636	if ((`1` << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
637	answ = `0`;
638	else
639	answ = READ_ONCE(tp->write_seq) -
640	READ_ONCE(tp->snd_nxt);
641	break;
642	default:
643	return -ENOIOCTLCMD;
644	}
645
646	*karg = answ;
647	return `0`;
648	}
649	EXPORT_SYMBOL(tcp_ioctl);
650
651	void tcp_mark_push(struct tcp_sock tp, struct* sk_buff *skb)
652	{
653	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
654	tp->pushed_seq = tp->write_seq;
655	}
656
657	static inline bool forced_push(const struct tcp_sock *tp)
658	{
659	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> `1`));
660	}
661
662	void tcp_skb_entail(struct sock sk, struct* sk_buff *skb)
663	{
664	struct tcp_sock *tp = tcp_sk(sk);
665	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
666
667	tcb->seq = tcb->end_seq = tp->write_seq;
668	tcb->tcp_flags = TCPHDR_ACK;
669	__skb_header_release(skb);
670	tcp_add_write_queue_tail(sk, skb);
671	sk_wmem_queued_add(sk, val: skb->truesize);
672	sk_mem_charge(sk, size: skb->truesize);
673	if (tp->nonagle & TCP_NAGLE_PUSH)
674	tp->nonagle &= ~TCP_NAGLE_PUSH;
675
676	tcp_slow_start_after_idle_check(sk);
677	}
678
679	static inline void tcp_mark_urg(struct tcp_sock tp, int* flags)
680	{
681	if (flags & MSG_OOB)
682	tp->snd_up = tp->write_seq;
683	}
684
685	/ If a not yet filled skb is pushed, do not send it if*
686	* we have data packets in Qdisc or NIC queues :
687	* Because TX completion will happen shortly, it gives a chance
688	* to coalesce future sendmsg() payload into this skb, without
689	* need for a timer, and with no latency trade off.
690	* As packets containing data payload have a bigger truesize
691	* than pure acks (dataless) packets, the last checks prevent
692	* autocorking if we only have an ACK in Qdisc/NIC queues,
693	* or if TX completion was delayed after we processed ACK packet.
694	*/
695	static bool tcp_should_autocork(struct sock sk, struct* sk_buff *skb,
696	int size_goal)
697	{
698	return skb->len < size_goal &&
699	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
700	!tcp_rtx_queue_empty(sk) &&
701	refcount_read(r: &sk->sk_wmem_alloc) > skb->truesize &&
702	tcp_skb_can_collapse_to(skb);
703	}
704
705	void tcp_push(struct sock sk, int* flags, int mss_now,
706	int nonagle, int size_goal)
707	{
708	struct tcp_sock *tp = tcp_sk(sk);
709	struct sk_buff *skb;
710
711	skb = tcp_write_queue_tail(sk);
712	if (!skb)
713	return;
714	if (!(flags & MSG_MORE) \|\| forced_push(tp))
715	tcp_mark_push(tp, skb);
716
717	tcp_mark_urg(tp, flags);
718
719	if (tcp_should_autocork(sk, skb, size_goal)) {
720
721	/ avoid atomic op if TSQ_THROTTLED bit is already set /
722	if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
723	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
724	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
725	}
726	/ It is possible TX completion already happened*
727	* before we set TSQ_THROTTLED.
728	*/
729	if (refcount_read(r: &sk->sk_wmem_alloc) > skb->truesize)
730	return;
731	}
732
733	if (flags & MSG_MORE)
734	nonagle = TCP_NAGLE_CORK;
735
736	__tcp_push_pending_frames(sk, cur_mss: mss_now, nonagle);
737	}
738
739	static int tcp_splice_data_recv(read_descriptor_t rd_desc, struct* sk_buff *skb,
740	unsigned int offset, size_t len)
741	{
742	struct tcp_splice_state *tss = rd_desc->arg.data;
743	int ret;
744
745	ret = skb_splice_bits(skb, sk: skb->sk, offset, pipe: tss->pipe,
746	min(rd_desc->count, len), flags: tss->flags);
747	if (ret > `0`)
748	rd_desc->count -= ret;
749	return ret;
750	}
751
752	static int __tcp_splice_read(struct sock sk, struct* tcp_splice_state *tss)
753	{
754	/ Store TCP splice context information in read_descriptor_t. /
755	read_descriptor_t rd_desc = {
756	.arg.data = tss,
757	.count = tss->len,
758	};
759
760	return tcp_read_sock(sk, desc: &rd_desc, recv_actor: tcp_splice_data_recv);
761	}
762
763	/**
764	* tcp_splice_read - splice data from TCP socket to a pipe
765	* @sock: socket to splice from
766	* @ppos: position (not valid)
767	* @pipe: pipe to splice to
768	* @len: number of bytes to splice
769	* @flags: splice modifier flags
770	*
771	* Description:
772	* Will read pages from given socket and fill them into a pipe.
773	*
774	**/
775	ssize_t tcp_splice_read(struct socket sock, loff_t ppos,
776	struct pipe_inode_info *pipe, size_t len,
777	unsigned int flags)
778	{
779	struct sock *sk = sock->sk;
780	struct tcp_splice_state tss = {
781	.pipe = pipe,
782	.len = len,
783	.flags = flags,
784	};
785	long timeo;
786	ssize_t spliced;
787	int ret;
788
789	sock_rps_record_flow(sk);
790	/*
791	* We can't seek on a socket input
792	*/
793	if (unlikely(*ppos))
794	return -ESPIPE;
795
796	ret = spliced = `0`;
797
798	lock_sock(sk);
799
800	timeo = sock_rcvtimeo(sk, noblock: sock->file->f_flags & O_NONBLOCK);
801	while (tss.len) {
802	ret = __tcp_splice_read(sk, tss: &tss);
803	if (ret < `0`)
804	break;
805	else if (!ret) {
806	if (spliced)
807	break;
808	if (sock_flag(sk, flag: SOCK_DONE))
809	break;
810	if (sk->sk_err) {
811	ret = sock_error(sk);
812	break;
813	}
814	if (sk->sk_shutdown & RCV_SHUTDOWN)
815	break;
816	if (sk->sk_state == TCP_CLOSE) {
817	/*
818	* This occurs when user tries to read
819	* from never connected socket.
820	*/
821	ret = -ENOTCONN;
822	break;
823	}
824	if (!timeo) {
825	ret = -EAGAIN;
826	break;
827	}
828	/ if __tcp_splice_read() got nothing while we have*
829	* an skb in receive queue, we do not want to loop.
830	* This might happen with URG data.
831	*/
832	if (!skb_queue_empty(list: &sk->sk_receive_queue))
833	break;
834	ret = sk_wait_data(sk, timeo: &timeo, NULL);
835	if (ret < `0`)
836	break;
837	if (signal_pending(current)) {
838	ret = sock_intr_errno(timeo);
839	break;
840	}
841	continue;
842	}
843	tss.len -= ret;
844	spliced += ret;
845
846	if (!tss.len \|\| !timeo)
847	break;
848	release_sock(sk);
849	lock_sock(sk);
850
851	if (sk->sk_err \|\| sk->sk_state == TCP_CLOSE \|\|
852	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
853	signal_pending(current))
854	break;
855	}
856
857	release_sock(sk);
858
859	if (spliced)
860	return spliced;
861
862	return ret;
863	}
864	EXPORT_SYMBOL(tcp_splice_read);
865
866	struct sk_buff tcp_stream_alloc_skb(struct* sock *sk, gfp_t gfp,
867	bool force_schedule)
868	{
869	struct sk_buff *skb;
870
871	skb = alloc_skb_fclone(MAX_TCP_HEADER, priority: gfp);
872	if (likely(skb)) {
873	bool mem_scheduled;
874
875	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
876	if (force_schedule) {
877	mem_scheduled = true;
878	sk_forced_mem_schedule(sk, size: skb->truesize);
879	} else {
880	mem_scheduled = sk_wmem_schedule(sk, size: skb->truesize);
881	}
882	if (likely(mem_scheduled)) {
883	skb_reserve(skb, MAX_TCP_HEADER);
884	skb->ip_summed = CHECKSUM_PARTIAL;
885	INIT_LIST_HEAD(list: &skb->tcp_tsorted_anchor);
886	return skb;
887	}
888	__kfree_skb(skb);
889	} else {
890	sk->sk_prot->enter_memory_pressure(sk);
891	sk_stream_moderate_sndbuf(sk);
892	}
893	return NULL;
894	}
895
896	static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
897	int large_allowed)
898	{
899	struct tcp_sock *tp = tcp_sk(sk);
900	u32 new_size_goal, size_goal;
901
902	if (!large_allowed)
903	return mss_now;
904
905	/ Note : tcp_tso_autosize() will eventually split this later /
906	new_size_goal = tcp_bound_to_half_wnd(tp, pktsize: sk->sk_gso_max_size);
907
908	/ We try hard to avoid divides here /
909	size_goal = tp->gso_segs * mss_now;
910	if (unlikely(new_size_goal < size_goal \|\|
911	new_size_goal >= size_goal + mss_now)) {
912	tp->gso_segs = min_t(u16, new_size_goal / mss_now,
913	sk->sk_gso_max_segs);
914	size_goal = tp->gso_segs * mss_now;
915	}
916
917	return max(size_goal, mss_now);
918	}
919
920	int tcp_send_mss(struct sock sk, int* size_goal, int* flags)
921	{
922	int mss_now;
923
924	mss_now = tcp_current_mss(sk);
925	*size_goal = tcp_xmit_size_goal(sk, mss_now, large_allowed: !(flags & MSG_OOB));
926
927	return mss_now;
928	}
929
930	/ In some cases, sendmsg() could have added an skb to the write queue,*
931	* but failed adding payload on it. We need to remove it to consume less
932	* memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
933	* epoll() users. Another reason is that tcp_write_xmit() does not like
934	* finding an empty skb in the write queue.
935	*/
936	void tcp_remove_empty_skb(struct sock *sk)
937	{
938	struct sk_buff *skb = tcp_write_queue_tail(sk);
939
940	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
941	tcp_unlink_write_queue(skb, sk);
942	if (tcp_write_queue_empty(sk))
943	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
944	tcp_wmem_free_skb(sk, skb);
945	}
946	}
947
948	/ skb changing from pure zc to mixed, must charge zc /
949	static int tcp_downgrade_zcopy_pure(struct sock sk, struct* sk_buff *skb)
950	{
951	if (unlikely(skb_zcopy_pure(skb))) {
952	u32 extra = skb->truesize -
953	SKB_TRUESIZE(skb_end_offset(skb));
954
955	if (!sk_wmem_schedule(sk, size: extra))
956	return -ENOMEM;
957
958	sk_mem_charge(sk, size: extra);
959	skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
960	}
961	return `0`;
962	}
963
964
965	int tcp_wmem_schedule(struct sock sk, int* copy)
966	{
967	int left;
968
969	if (likely(sk_wmem_schedule(sk, copy)))
970	return copy;
971
972	/ We could be in trouble if we have nothing queued.*
973	* Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
974	* to guarantee some progress.
975	*/
976	left = sock_net(sk)->ipv4.sysctl_tcp_wmem[`0`] - sk->sk_wmem_queued;
977	if (left > `0`)
978	sk_forced_mem_schedule(sk, min(left, copy));
979	return min(copy, sk->sk_forward_alloc);
980	}
981
982	void tcp_free_fastopen_req(struct tcp_sock *tp)
983	{
984	if (tp->fastopen_req) {
985	kfree(objp: tp->fastopen_req);
986	tp->fastopen_req = NULL;
987	}
988	}
989
990	int tcp_sendmsg_fastopen(struct sock sk, struct* msghdr msg, int* *copied,
991	size_t size, struct ubuf_info *uarg)
992	{
993	struct tcp_sock *tp = tcp_sk(sk);
994	struct inet_sock *inet = inet_sk(sk);
995	struct sockaddr *uaddr = msg->msg_name;
996	int err, flags;
997
998	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
999	TFO_CLIENT_ENABLE) \|\|
1000	(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1001	uaddr->sa_family == AF_UNSPEC))
1002	return -EOPNOTSUPP;
1003	if (tp->fastopen_req)
1004	return -EALREADY; / Another Fast Open is in progress /
1005
1006	tp->fastopen_req = kzalloc(size: sizeof(struct tcp_fastopen_request),
1007	flags: sk->sk_allocation);
1008	if (unlikely(!tp->fastopen_req))
1009	return -ENOBUFS;
1010	tp->fastopen_req->data = msg;
1011	tp->fastopen_req->size = size;
1012	tp->fastopen_req->uarg = uarg;
1013
1014	if (inet_test_bit(DEFER_CONNECT, sk)) {
1015	err = tcp_connect(sk);
1016	/ Same failure procedure as in tcp_v4/6_connect /
1017	if (err) {
1018	tcp_set_state(sk, state: TCP_CLOSE);
1019	inet->inet_dport = `0`;
1020	sk->sk_route_caps = `0`;
1021	}
1022	}
1023	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : `0`;
1024	err = __inet_stream_connect(sock: sk->sk_socket, uaddr,
1025	addr_len: msg->msg_namelen, flags, is_sendmsg: `1`);
1026	/ fastopen_req could already be freed in __inet_stream_connect*
1027	* if the connection times out or gets rst
1028	*/
1029	if (tp->fastopen_req) {
1030	*copied = tp->fastopen_req->copied;
1031	tcp_free_fastopen_req(tp);
1032	inet_clear_bit(DEFER_CONNECT, sk);
1033	}
1034	return err;
1035	}
1036
1037	int tcp_sendmsg_locked(struct sock sk, struct* msghdr *msg, size_t size)
1038	{
1039	struct tcp_sock *tp = tcp_sk(sk);
1040	struct ubuf_info *uarg = NULL;
1041	struct sk_buff *skb;
1042	struct sockcm_cookie sockc;
1043	int flags, err, copied = `0`;
1044	int mss_now = `0`, size_goal, copied_syn = `0`;
1045	int process_backlog = `0`;
1046	int zc = `0`;
1047	long timeo;
1048
1049	flags = msg->msg_flags;
1050
1051	if ((flags & MSG_ZEROCOPY) && size) {
1052	if (msg->msg_ubuf) {
1053	uarg = msg->msg_ubuf;
1054	if (sk->sk_route_caps & NETIF_F_SG)
1055	zc = MSG_ZEROCOPY;
1056	} else if (sock_flag(sk, flag: SOCK_ZEROCOPY)) {
1057	skb = tcp_write_queue_tail(sk);
1058	uarg = msg_zerocopy_realloc(sk, size, uarg: skb_zcopy(skb));
1059	if (!uarg) {
1060	err = -ENOBUFS;
1061	goto out_err;
1062	}
1063	if (sk->sk_route_caps & NETIF_F_SG)
1064	zc = MSG_ZEROCOPY;
1065	else
1066	uarg_to_msgzc(uarg)->zerocopy = `0`;
1067	}
1068	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
1069	if (sk->sk_route_caps & NETIF_F_SG)
1070	zc = MSG_SPLICE_PAGES;
1071	}
1072
1073	if (unlikely(flags & MSG_FASTOPEN \|\|
1074	inet_test_bit(DEFER_CONNECT, sk)) &&
1075	!tp->repair) {
1076	err = tcp_sendmsg_fastopen(sk, msg, copied: &copied_syn, size, uarg);
1077	if (err == -EINPROGRESS && copied_syn > `0`)
1078	goto out;
1079	else if (err)
1080	goto out_err;
1081	}
1082
1083	timeo = sock_sndtimeo(sk, noblock: flags & MSG_DONTWAIT);
1084
1085	tcp_rate_check_app_limited(sk); / is sending application-limited? /
1086
1087	/ Wait for a connection to finish. One exception is TCP Fast Open*
1088	* (passive side) where data is allowed to be sent before a connection
1089	* is fully established.
1090	*/
1091	if (((`1` << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT)) &&
1092	!tcp_passive_fastopen(sk)) {
1093	err = sk_stream_wait_connect(sk, timeo_p: &timeo);
1094	if (err != `0`)
1095	goto do_error;
1096	}
1097
1098	if (unlikely(tp->repair)) {
1099	if (tp->repair_queue == TCP_RECV_QUEUE) {
1100	copied = tcp_send_rcvq(sk, msg, size);
1101	goto out_nopush;
1102	}
1103
1104	err = -EINVAL;
1105	if (tp->repair_queue == TCP_NO_QUEUE)
1106	goto out_err;
1107
1108	/ 'common' sending to sendq /
1109	}
1110
1111	sockcm_init(sockc: &sockc, sk);
1112	if (msg->msg_controllen) {
1113	err = sock_cmsg_send(sk, msg, sockc: &sockc);
1114	if (unlikely(err)) {
1115	err = -EINVAL;
1116	goto out_err;
1117	}
1118	}
1119
1120	/ This should be in poll /
1121	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1122
1123	/ Ok commence sending. /
1124	copied = `0`;
1125
1126	restart:
1127	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags);
1128
1129	err = -EPIPE;
1130	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
1131	goto do_error;
1132
1133	while (msg_data_left(msg)) {
1134	ssize_t copy = `0`;
1135
1136	skb = tcp_write_queue_tail(sk);
1137	if (skb)
1138	copy = size_goal - skb->len;
1139
1140	if (copy <= `0` \|\| !tcp_skb_can_collapse_to(skb)) {
1141	bool first_skb;
1142
1143	new_segment:
1144	if (!sk_stream_memory_free(sk))
1145	goto wait_for_space;
1146
1147	if (unlikely(process_backlog >= `16`)) {
1148	process_backlog = `0`;
1149	if (sk_flush_backlog(sk))
1150	goto restart;
1151	}
1152	first_skb = tcp_rtx_and_write_queues_empty(sk);
1153	skb = tcp_stream_alloc_skb(sk, gfp: sk->sk_allocation,
1154	force_schedule: first_skb);
1155	if (!skb)
1156	goto wait_for_space;
1157
1158	process_backlog++;
1159
1160	tcp_skb_entail(sk, skb);
1161	copy = size_goal;
1162
1163	/ All packets are restored as if they have*
1164	* already been sent. skb_mstamp_ns isn't set to
1165	* avoid wrong rtt estimation.
1166	*/
1167	if (tp->repair)
1168	TCP_SKB_CB(skb)->sacked \|= TCPCB_REPAIRED;
1169	}
1170
1171	/ Try to append data to the end of skb. /
1172	if (copy > msg_data_left(msg))
1173	copy = msg_data_left(msg);
1174
1175	if (zc == `0`) {
1176	bool merge = true;
1177	int i = skb_shinfo(skb)->nr_frags;
1178	struct page_frag *pfrag = sk_page_frag(sk);
1179
1180	if (!sk_page_frag_refill(sk, pfrag))
1181	goto wait_for_space;
1182
1183	if (!skb_can_coalesce(skb, i, page: pfrag->page,
1184	off: pfrag->offset)) {
1185	if (i >= READ_ONCE(sysctl_max_skb_frags)) {
1186	tcp_mark_push(tp, skb);
1187	goto new_segment;
1188	}
1189	merge = false;
1190	}
1191
1192	copy = min_t(int, copy, pfrag->size - pfrag->offset);
1193
1194	if (unlikely(skb_zcopy_pure(skb) \|\| skb_zcopy_managed(skb))) {
1195	if (tcp_downgrade_zcopy_pure(sk, skb))
1196	goto wait_for_space;
1197	skb_zcopy_downgrade_managed(skb);
1198	}
1199
1200	copy = tcp_wmem_schedule(sk, copy);
1201	if (!copy)
1202	goto wait_for_space;
1203
1204	err = skb_copy_to_page_nocache(sk, from: &msg->msg_iter, skb,
1205	page: pfrag->page,
1206	off: pfrag->offset,
1207	copy);
1208	if (err)
1209	goto do_error;
1210
1211	/ Update the skb. /
1212	if (merge) {
1213	skb_frag_size_add(frag: &skb_shinfo(skb)->frags[i - `1`], delta: copy);
1214	} else {
1215	skb_fill_page_desc(skb, i, page: pfrag->page,
1216	off: pfrag->offset, size: copy);
1217	page_ref_inc(page: pfrag->page);
1218	}
1219	pfrag->offset += copy;
1220	} else if (zc == MSG_ZEROCOPY) {
1221	/ First append to a fragless skb builds initial*
1222	* pure zerocopy skb
1223	*/
1224	if (!skb->len)
1225	skb_shinfo(skb)->flags \|= SKBFL_PURE_ZEROCOPY;
1226
1227	if (!skb_zcopy_pure(skb)) {
1228	copy = tcp_wmem_schedule(sk, copy);
1229	if (!copy)
1230	goto wait_for_space;
1231	}
1232
1233	err = skb_zerocopy_iter_stream(sk, skb, msg, len: copy, uarg);
1234	if (err == -EMSGSIZE \|\| err == -EEXIST) {
1235	tcp_mark_push(tp, skb);
1236	goto new_segment;
1237	}
1238	if (err < `0`)
1239	goto do_error;
1240	copy = err;
1241	} else if (zc == MSG_SPLICE_PAGES) {
1242	/ Splice in data if we can; copy if we can't. /
1243	if (tcp_downgrade_zcopy_pure(sk, skb))
1244	goto wait_for_space;
1245	copy = tcp_wmem_schedule(sk, copy);
1246	if (!copy)
1247	goto wait_for_space;
1248
1249	err = skb_splice_from_iter(skb, iter: &msg->msg_iter, maxsize: copy,
1250	gfp: sk->sk_allocation);
1251	if (err < `0`) {
1252	if (err == -EMSGSIZE) {
1253	tcp_mark_push(tp, skb);
1254	goto new_segment;
1255	}
1256	goto do_error;
1257	}
1258	copy = err;
1259
1260	if (!(flags & MSG_NO_SHARED_FRAGS))
1261	skb_shinfo(skb)->flags \|= SKBFL_SHARED_FRAG;
1262
1263	sk_wmem_queued_add(sk, val: copy);
1264	sk_mem_charge(sk, size: copy);
1265	}
1266
1267	if (!copied)
1268	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1269
1270	WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1271	TCP_SKB_CB(skb)->end_seq += copy;
1272	tcp_skb_pcount_set(skb, segs: `0`);
1273
1274	copied += copy;
1275	if (!msg_data_left(msg)) {
1276	if (unlikely(flags & MSG_EOR))
1277	TCP_SKB_CB(skb)->eor = `1`;
1278	goto out;
1279	}
1280
1281	if (skb->len < size_goal \|\| (flags & MSG_OOB) \|\| unlikely(tp->repair))
1282	continue;
1283
1284	if (forced_push(tp)) {
1285	tcp_mark_push(tp, skb);
1286	__tcp_push_pending_frames(sk, cur_mss: mss_now, TCP_NAGLE_PUSH);
1287	} else if (skb == tcp_send_head(sk))
1288	tcp_push_one(sk, mss_now);
1289	continue;
1290
1291	wait_for_space:
1292	set_bit(SOCK_NOSPACE, addr: &sk->sk_socket->flags);
1293	tcp_remove_empty_skb(sk);
1294	if (copied)
1295	tcp_push(sk, flags: flags & ~MSG_MORE, mss_now,
1296	TCP_NAGLE_PUSH, size_goal);
1297
1298	err = sk_stream_wait_memory(sk, timeo_p: &timeo);
1299	if (err != `0`)
1300	goto do_error;
1301
1302	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags);
1303	}
1304
1305	out:
1306	if (copied) {
1307	tcp_tx_timestamp(sk, tsflags: sockc.tsflags);
1308	tcp_push(sk, flags, mss_now, nonagle: tp->nonagle, size_goal);
1309	}
1310	out_nopush:
1311	/ msg->msg_ubuf is pinned by the caller so we don't take extra refs /
1312	if (uarg && !msg->msg_ubuf)
1313	net_zcopy_put(uarg);
1314	return copied + copied_syn;
1315
1316	do_error:
1317	tcp_remove_empty_skb(sk);
1318
1319	if (copied + copied_syn)
1320	goto out;
1321	out_err:
1322	/ msg->msg_ubuf is pinned by the caller so we don't take extra refs /
1323	if (uarg && !msg->msg_ubuf)
1324	net_zcopy_put_abort(uarg, have_uref: true);
1325	err = sk_stream_error(sk, flags, err);
1326	/ make sure we wake any epoll edge trigger waiter /
1327	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1328	sk->sk_write_space(sk);
1329	tcp_chrono_stop(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
1330	}
1331	return err;
1332	}
1333	EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1334
1335	int tcp_sendmsg(struct sock sk, struct* msghdr *msg, size_t size)
1336	{
1337	int ret;
1338
1339	lock_sock(sk);
1340	ret = tcp_sendmsg_locked(sk, msg, size);
1341	release_sock(sk);
1342
1343	return ret;
1344	}
1345	EXPORT_SYMBOL(tcp_sendmsg);
1346
1347	void tcp_splice_eof(struct socket *sock)
1348	{
1349	struct sock *sk = sock->sk;
1350	struct tcp_sock *tp = tcp_sk(sk);
1351	int mss_now, size_goal;
1352
1353	if (!tcp_write_queue_tail(sk))
1354	return;
1355
1356	lock_sock(sk);
1357	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags: `0`);
1358	tcp_push(sk, flags: `0`, mss_now, nonagle: tp->nonagle, size_goal);
1359	release_sock(sk);
1360	}
1361	EXPORT_SYMBOL_GPL(tcp_splice_eof);
1362
1363	/*
1364	* Handle reading urgent data. BSD has very simple semantics for
1365	* this, no blocking and very strange errors 8)
1366	*/
1367
1368	static int tcp_recv_urg(struct sock sk, struct* msghdr msg, int* len, int flags)
1369	{
1370	struct tcp_sock *tp = tcp_sk(sk);
1371
1372	/ No URG data to read. /
1373	if (sock_flag(sk, flag: SOCK_URGINLINE) \|\| !tp->urg_data \|\|
1374	tp->urg_data == TCP_URG_READ)
1375	return -EINVAL; / Yes this is right ! /
1376
1377	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, flag: SOCK_DONE))
1378	return -ENOTCONN;
1379
1380	if (tp->urg_data & TCP_URG_VALID) {
1381	int err = `0`;
1382	char c = tp->urg_data;
1383
1384	if (!(flags & MSG_PEEK))
1385	WRITE_ONCE(tp->urg_data, TCP_URG_READ);
1386
1387	/ Read urgent data. /
1388	msg->msg_flags \|= MSG_OOB;
1389
1390	if (len > `0`) {
1391	if (!(flags & MSG_TRUNC))
1392	err = memcpy_to_msg(msg, data: &c, len: `1`);
1393	len = `1`;
1394	} else
1395	msg->msg_flags \|= MSG_TRUNC;
1396
1397	return err ? -EFAULT : len;
1398	}
1399
1400	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
1401	return `0`;
1402
1403	/ Fixed the recv(..., MSG_OOB) behaviour. BSD docs and*
1404	* the available implementations agree in this case:
1405	* this call should never block, independent of the
1406	* blocking state of the socket.
1407	* Mike <pall@rz.uni-karlsruhe.de>
1408	*/
1409	return -EAGAIN;
1410	}
1411
1412	static int tcp_peek_sndq(struct sock sk, struct* msghdr msg, int* len)
1413	{
1414	struct sk_buff *skb;
1415	int copied = `0`, err = `0`;
1416
1417	/ XXX -- need to support SO_PEEK_OFF /
1418
1419	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1420	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: skb->len);
1421	if (err)
1422	return err;
1423	copied += skb->len;
1424	}
1425
1426	skb_queue_walk(&sk->sk_write_queue, skb) {
1427	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: skb->len);
1428	if (err)
1429	break;
1430
1431	copied += skb->len;
1432	}
1433
1434	return err ?: copied;
1435	}
1436
1437	/ Clean up the receive buffer for full frames taken by the user,*
1438	* then send an ACK if necessary. COPIED is the number of bytes
1439	* tcp_recvmsg has given to the user so far, it speeds up the
1440	* calculation of whether or not we must ACK for the sake of
1441	* a window update.
1442	*/
1443	void __tcp_cleanup_rbuf(struct sock sk, int* copied)
1444	{
1445	struct tcp_sock *tp = tcp_sk(sk);
1446	bool time_to_ack = false;
1447
1448	if (inet_csk_ack_scheduled(sk)) {
1449	const struct inet_connection_sock *icsk = inet_csk(sk);
1450
1451	if (/ Once-per-two-segments ACK was not sent by tcp_input.c /
1452	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
1453	/*
1454	* If this read emptied read buffer, we send ACK, if
1455	* connection is not bidirectional, user drained
1456	* receive buffer and there was a small segment
1457	* in queue.
1458	*/
1459	(copied > `0` &&
1460	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) \|\|
1461	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1462	!inet_csk_in_pingpong_mode(sk))) &&
1463	!atomic_read(v: &sk->sk_rmem_alloc)))
1464	time_to_ack = true;
1465	}
1466
1467	/ We send an ACK if we can now advertise a non-zero window*
1468	* which has been raised "significantly".
1469	*
1470	* Even if window raised up to infinity, do not send window open ACK
1471	* in states, where we will not receive more. It is useless.
1472	*/
1473	if (copied > `0` && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1474	__u32 rcv_window_now = tcp_receive_window(tp);
1475
1476	/ Optimize, __tcp_select_window() is not cheap. /
1477	if (`2`*rcv_window_now <= tp->window_clamp) {
1478	__u32 new_window = __tcp_select_window(sk);
1479
1480	/ Send ACK now, if this read freed lots of space*
1481	* in our buffer. Certainly, new_window is new window.
1482	* We can advertise it now, if it is not less than current one.
1483	* "Lots" means "at least twice" here.
1484	*/
1485	if (new_window && new_window >= `2` * rcv_window_now)
1486	time_to_ack = true;
1487	}
1488	}
1489	if (time_to_ack)
1490	tcp_send_ack(sk);
1491	}
1492
1493	void tcp_cleanup_rbuf(struct sock sk, int* copied)
1494	{
1495	struct sk_buff *skb = skb_peek(list_: &sk->sk_receive_queue);
1496	struct tcp_sock *tp = tcp_sk(sk);
1497
1498	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1499	"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1500	tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1501	__tcp_cleanup_rbuf(sk, copied);
1502	}
1503
1504	static void tcp_eat_recv_skb(struct sock sk, struct* sk_buff *skb)
1505	{
1506	__skb_unlink(skb, list: &sk->sk_receive_queue);
1507	if (likely(skb->destructor == sock_rfree)) {
1508	sock_rfree(skb);
1509	skb->destructor = NULL;
1510	skb->sk = NULL;
1511	return skb_attempt_defer_free(skb);
1512	}
1513	__kfree_skb(skb);
1514	}
1515
1516	struct sk_buff tcp_recv_skb(struct* sock sk, u32 seq, u32 off)
1517	{
1518	struct sk_buff *skb;
1519	u32 offset;
1520
1521	while ((skb = skb_peek(list_: &sk->sk_receive_queue)) != NULL) {
1522	offset = seq - TCP_SKB_CB(skb)->seq;
1523	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1524	pr_err_once("%s: found a SYN, please report !\n", __func__);
1525	offset--;
1526	}
1527	if (offset < skb->len \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1528	*off = offset;
1529	return skb;
1530	}
1531	/ This looks weird, but this can happen if TCP collapsing*
1532	* splitted a fat GRO packet, while we released socket lock
1533	* in skb_splice_bits()
1534	*/
1535	tcp_eat_recv_skb(sk, skb);
1536	}
1537	return NULL;
1538	}
1539	EXPORT_SYMBOL(tcp_recv_skb);
1540
1541	/*
1542	* This routine provides an alternative to tcp_recvmsg() for routines
1543	* that would like to handle copying from skbuffs directly in 'sendfile'
1544	* fashion.
1545	* Note:
1546	* - It is assumed that the socket was locked by the caller.
1547	* - The routine does not block.
1548	* - At present, there is no support for reading OOB data
1549	* or for 'peeking' the socket using this routine
1550	* (although both would be easy to implement).
1551	*/
1552	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
1553	sk_read_actor_t recv_actor)
1554	{
1555	struct sk_buff *skb;
1556	struct tcp_sock *tp = tcp_sk(sk);
1557	u32 seq = tp->copied_seq;
1558	u32 offset;
1559	int copied = `0`;
1560
1561	if (sk->sk_state == TCP_LISTEN)
1562	return -ENOTCONN;
1563	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1564	if (offset < skb->len) {
1565	int used;
1566	size_t len;
1567
1568	len = skb->len - offset;
1569	/ Stop reading if we hit a patch of urgent data /
1570	if (unlikely(tp->urg_data)) {
1571	u32 urg_offset = tp->urg_seq - seq;
1572	if (urg_offset < len)
1573	len = urg_offset;
1574	if (!len)
1575	break;
1576	}
1577	used = recv_actor(desc, skb, offset, len);
1578	if (used <= `0`) {
1579	if (!copied)
1580	copied = used;
1581	break;
1582	}
1583	if (WARN_ON_ONCE(used > len))
1584	used = len;
1585	seq += used;
1586	copied += used;
1587	offset += used;
1588
1589	/ If recv_actor drops the lock (e.g. TCP splice*
1590	* receive) the skb pointer might be invalid when
1591	* getting here: tcp_collapse might have deleted it
1592	* while aggregating skbs from the socket queue.
1593	*/
1594	skb = tcp_recv_skb(sk, seq - `1`, &offset);
1595	if (!skb)
1596	break;
1597	/ TCP coalescing might have appended data to the skb.*
1598	* Try to splice more frags
1599	*/
1600	if (offset + `1` != skb->len)
1601	continue;
1602	}
1603	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1604	tcp_eat_recv_skb(sk, skb);
1605	++seq;
1606	break;
1607	}
1608	tcp_eat_recv_skb(sk, skb);
1609	if (!desc->count)
1610	break;
1611	WRITE_ONCE(tp->copied_seq, seq);
1612	}
1613	WRITE_ONCE(tp->copied_seq, seq);
1614
1615	tcp_rcv_space_adjust(sk);
1616
1617	/ Clean up data we have read: This will do ACK frames. /
1618	if (copied > `0`) {
1619	tcp_recv_skb(sk, seq, &offset);
1620	tcp_cleanup_rbuf(sk, copied);
1621	}
1622	return copied;
1623	}
1624	EXPORT_SYMBOL(tcp_read_sock);
1625
1626	int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
1627	{
1628	struct sk_buff *skb;
1629	int copied = `0`;
1630
1631	if (sk->sk_state == TCP_LISTEN)
1632	return -ENOTCONN;
1633
1634	while ((skb = skb_peek(list_: &sk->sk_receive_queue)) != NULL) {
1635	u8 tcp_flags;
1636	int used;
1637
1638	__skb_unlink(skb, list: &sk->sk_receive_queue);
1639	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
1640	tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
1641	used = recv_actor(sk, skb);
1642	if (used < `0`) {
1643	if (!copied)
1644	copied = used;
1645	break;
1646	}
1647	copied += used;
1648
1649	if (tcp_flags & TCPHDR_FIN)
1650	break;
1651	}
1652	return copied;
1653	}
1654	EXPORT_SYMBOL(tcp_read_skb);
1655
1656	void tcp_read_done(struct sock *sk, size_t len)
1657	{
1658	struct tcp_sock *tp = tcp_sk(sk);
1659	u32 seq = tp->copied_seq;
1660	struct sk_buff *skb;
1661	size_t left;
1662	u32 offset;
1663
1664	if (sk->sk_state == TCP_LISTEN)
1665	return;
1666
1667	left = len;
1668	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1669	int used;
1670
1671	used = min_t(size_t, skb->len - offset, left);
1672	seq += used;
1673	left -= used;
1674
1675	if (skb->len > offset + used)
1676	break;
1677
1678	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1679	tcp_eat_recv_skb(sk, skb);
1680	++seq;
1681	break;
1682	}
1683	tcp_eat_recv_skb(sk, skb);
1684	}
1685	WRITE_ONCE(tp->copied_seq, seq);
1686
1687	tcp_rcv_space_adjust(sk);
1688
1689	/ Clean up data we have read: This will do ACK frames. /
1690	if (left != len)
1691	tcp_cleanup_rbuf(sk, copied: len - left);
1692	}
1693	EXPORT_SYMBOL(tcp_read_done);
1694
1695	int tcp_peek_len(struct socket *sock)
1696	{
1697	return tcp_inq(sk: sock->sk);
1698	}
1699	EXPORT_SYMBOL(tcp_peek_len);
1700
1701	/ Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint /
1702	int tcp_set_rcvlowat(struct sock sk, int* val)
1703	{
1704	int space, cap;
1705
1706	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1707	cap = sk->sk_rcvbuf >> `1`;
1708	else
1709	cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`]) >> `1`;
1710	val = min(val, cap);
1711	WRITE_ONCE(sk->sk_rcvlowat, val ? : `1`);
1712
1713	/ Check if we need to signal EPOLLIN right now /
1714	tcp_data_ready(sk);
1715
1716	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1717	return `0`;
1718
1719	space = tcp_space_from_win(sk, win: val);
1720	if (space > sk->sk_rcvbuf) {
1721	WRITE_ONCE(sk->sk_rcvbuf, space);
1722	tcp_sk(sk)->window_clamp = val;
1723	}
1724	return `0`;
1725	}
1726	EXPORT_SYMBOL(tcp_set_rcvlowat);
1727
1728	void tcp_update_recv_tstamps(struct sk_buff *skb,
1729	struct scm_timestamping_internal *tss)
1730	{
1731	if (skb->tstamp)
1732	tss->ts[`0`] = ktime_to_timespec64(skb->tstamp);
1733	else
1734	tss->ts[`0`] = (struct timespec64) {`0`};
1735
1736	if (skb_hwtstamps(skb)->hwtstamp)
1737	tss->ts[`2`] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1738	else
1739	tss->ts[`2`] = (struct timespec64) {`0`};
1740	}
1741
1742	#ifdef CONFIG_MMU
1743	static const struct vm_operations_struct tcp_vm_ops = {
1744	};
1745
1746	int tcp_mmap(struct file file, struct* socket *sock,
1747	struct vm_area_struct *vma)
1748	{
1749	if (vma->vm_flags & (VM_WRITE \| VM_EXEC))
1750	return -EPERM;
1751	vm_flags_clear(vma, VM_MAYWRITE \| VM_MAYEXEC);
1752
1753	/ Instruct vm_insert_page() to not mmap_read_lock(mm) /
1754	vm_flags_set(vma, VM_MIXEDMAP);
1755
1756	vma->vm_ops = &tcp_vm_ops;
1757	return `0`;
1758	}
1759	EXPORT_SYMBOL(tcp_mmap);
1760
1761	static skb_frag_t skb_advance_to_frag(struct* sk_buff *skb, u32 offset_skb,
1762	u32 *offset_frag)
1763	{
1764	skb_frag_t *frag;
1765
1766	if (unlikely(offset_skb >= skb->len))
1767	return NULL;
1768
1769	offset_skb -= skb_headlen(skb);
1770	if ((int)offset_skb < `0` \|\| skb_has_frag_list(skb))
1771	return NULL;
1772
1773	frag = skb_shinfo(skb)->frags;
1774	while (offset_skb) {
1775	if (skb_frag_size(frag) > offset_skb) {
1776	*offset_frag = offset_skb;
1777	return frag;
1778	}
1779	offset_skb -= skb_frag_size(frag);
1780	++frag;
1781	}
1782	*offset_frag = `0`;
1783	return frag;
1784	}
1785
1786	static bool can_map_frag(const skb_frag_t *frag)
1787	{
1788	return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
1789	}
1790
1791	static int find_next_mappable_frag(const skb_frag_t *frag,
1792	int remaining_in_skb)
1793	{
1794	int offset = `0`;
1795
1796	if (likely(can_map_frag(frag)))
1797	return `0`;
1798
1799	while (offset < remaining_in_skb && !can_map_frag(frag)) {
1800	offset += skb_frag_size(frag);
1801	++frag;
1802	}
1803	return offset;
1804	}
1805
1806	static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1807	struct tcp_zerocopy_receive *zc,
1808	struct sk_buff *skb, u32 offset)
1809	{
1810	u32 frag_offset, partial_frag_remainder = `0`;
1811	int mappable_offset;
1812	skb_frag_t *frag;
1813
1814	/ worst case: skip to next skb. try to improve on this case below /
1815	zc->recv_skip_hint = skb->len - offset;
1816
1817	/ Find the frag containing this offset (and how far into that frag) /
1818	frag = skb_advance_to_frag(skb, offset_skb: offset, offset_frag: &frag_offset);
1819	if (!frag)
1820	return;
1821
1822	if (frag_offset) {
1823	struct skb_shared_info *info = skb_shinfo(skb);
1824
1825	/ We read part of the last frag, must recvmsg() rest of skb. /
1826	if (frag == &info->frags[info->nr_frags - `1`])
1827	return;
1828
1829	/ Else, we must at least read the remainder in this frag. /
1830	partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1831	zc->recv_skip_hint -= partial_frag_remainder;
1832	++frag;
1833	}
1834
1835	/ partial_frag_remainder: If part way through a frag, must read rest.*
1836	* mappable_offset: Bytes till next mappable frag, not counting bytes
1837	* in partial_frag_remainder.
1838	*/
1839	mappable_offset = find_next_mappable_frag(frag, remaining_in_skb: zc->recv_skip_hint);
1840	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1841	}
1842
1843	static int tcp_recvmsg_locked(struct sock sk, struct* msghdr *msg, size_t len,
1844	int flags, struct scm_timestamping_internal *tss,
1845	int *cmsg_flags);
1846	static int receive_fallback_to_copy(struct sock *sk,
1847	struct tcp_zerocopy_receive zc, int* inq,
1848	struct scm_timestamping_internal *tss)
1849	{
1850	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1851	struct msghdr msg = {};
1852	struct iovec iov;
1853	int err;
1854
1855	zc->length = `0`;
1856	zc->recv_skip_hint = `0`;
1857
1858	if (copy_address != zc->copybuf_address)
1859	return -EINVAL;
1860
1861	err = import_single_range(ITER_DEST, buf: (void __user *)copy_address,
1862	len: inq, iov: &iov, i: &msg.msg_iter);
1863	if (err)
1864	return err;
1865
1866	err = tcp_recvmsg_locked(sk, msg: &msg, len: inq, MSG_DONTWAIT,
1867	tss, cmsg_flags: &zc->msg_flags);
1868	if (err < `0`)
1869	return err;
1870
1871	zc->copybuf_len = err;
1872	if (likely(zc->copybuf_len)) {
1873	struct sk_buff *skb;
1874	u32 offset;
1875
1876	skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1877	if (skb)
1878	tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1879	}
1880	return `0`;
1881	}
1882
1883	static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1884	struct sk_buff *skb, u32 copylen,
1885	u32 offset, u32 seq)
1886	{
1887	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1888	struct msghdr msg = {};
1889	struct iovec iov;
1890	int err;
1891
1892	if (copy_address != zc->copybuf_address)
1893	return -EINVAL;
1894
1895	err = import_single_range(ITER_DEST, buf: (void __user *)copy_address,
1896	len: copylen, iov: &iov, i: &msg.msg_iter);
1897	if (err)
1898	return err;
1899	err = skb_copy_datagram_msg(from: skb, offset: *offset, msg: &msg, size: copylen);
1900	if (err)
1901	return err;
1902	zc->recv_skip_hint -= copylen;
1903	*offset += copylen;
1904	*seq += copylen;
1905	return (__s32)copylen;
1906	}
1907
1908	static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
1909	struct sock *sk,
1910	struct sk_buff *skb,
1911	u32 *seq,
1912	s32 copybuf_len,
1913	struct scm_timestamping_internal *tss)
1914	{
1915	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
1916
1917	if (!copylen)
1918	return `0`;
1919	/ skb is null if inq < PAGE_SIZE. /
1920	if (skb) {
1921	offset = *seq - TCP_SKB_CB(skb)->seq;
1922	} else {
1923	skb = tcp_recv_skb(sk, *seq, &offset);
1924	if (TCP_SKB_CB(skb)->has_rxtstamp) {
1925	tcp_update_recv_tstamps(skb, tss);
1926	zc->msg_flags \|= TCP_CMSG_TS;
1927	}
1928	}
1929
1930	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, offset: &offset,
1931	seq);
1932	return zc->copybuf_len < `0` ? `0` : copylen;
1933	}
1934
1935	static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
1936	struct page **pending_pages,
1937	unsigned long pages_remaining,
1938	unsigned long *address,
1939	u32 *length,
1940	u32 *seq,
1941	struct tcp_zerocopy_receive *zc,
1942	u32 total_bytes_to_map,
1943	int err)
1944	{
1945	/ At least one page did not map. Try zapping if we skipped earlier. /
1946	if (err == -EBUSY &&
1947	zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
1948	u32 maybe_zap_len;
1949
1950	maybe_zap_len = total_bytes_to_map - / All bytes to map /
1951	length + /* Mapped or pending /
1952	(pages_remaining * PAGE_SIZE); / Failed map. /
1953	zap_page_range_single(vma, address: *address, size: maybe_zap_len, NULL);
1954	err = `0`;
1955	}
1956
1957	if (!err) {
1958	unsigned long leftover_pages = pages_remaining;
1959	int bytes_mapped;
1960
1961	/ We called zap_page_range_single, try to reinsert. /
1962	err = vm_insert_pages(vma, addr: *address,
1963	pages: pending_pages,
1964	num: &pages_remaining);
1965	bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
1966	*seq += bytes_mapped;
1967	*address += bytes_mapped;
1968	}
1969	if (err) {
1970	/ Either we were unable to zap, OR we zapped, retried an*
1971	* insert, and still had an issue. Either ways, pages_remaining
1972	* is the number of pages we were unable to map, and we unroll
1973	* some state we speculatively touched before.
1974	*/
1975	const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
1976
1977	*length -= bytes_not_mapped;
1978	zc->recv_skip_hint += bytes_not_mapped;
1979	}
1980	return err;
1981	}
1982
1983	static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
1984	struct page **pages,
1985	unsigned int pages_to_map,
1986	unsigned long *address,
1987	u32 *length,
1988	u32 *seq,
1989	struct tcp_zerocopy_receive *zc,
1990	u32 total_bytes_to_map)
1991	{
1992	unsigned long pages_remaining = pages_to_map;
1993	unsigned int pages_mapped;
1994	unsigned int bytes_mapped;
1995	int err;
1996
1997	err = vm_insert_pages(vma, addr: *address, pages, num: &pages_remaining);
1998	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
1999	bytes_mapped = PAGE_SIZE * pages_mapped;
2000	/ Even if vm_insert_pages fails, it may have partially succeeded in*
2001	* mapping (some but not all of the pages).
2002	*/
2003	*seq += bytes_mapped;
2004	*address += bytes_mapped;
2005
2006	if (likely(!err))
2007	return `0`;
2008
2009	/ Error: maybe zap and retry + rollback state for failed inserts. /
2010	return tcp_zerocopy_vm_insert_batch_error(vma, pending_pages: pages + pages_mapped,
2011	pages_remaining, address, length, seq, zc, total_bytes_to_map,
2012	err);
2013	}
2014
2015	#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
2016	static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2017	struct tcp_zerocopy_receive *zc,
2018	struct scm_timestamping_internal *tss)
2019	{
2020	unsigned long msg_control_addr;
2021	struct msghdr cmsg_dummy;
2022
2023	msg_control_addr = (unsigned long)zc->msg_control;
2024	cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
2025	cmsg_dummy.msg_controllen =
2026	(__kernel_size_t)zc->msg_controllen;
2027	cmsg_dummy.msg_flags = in_compat_syscall()
2028	? MSG_CMSG_COMPAT : `0`;
2029	cmsg_dummy.msg_control_is_user = true;
2030	zc->msg_flags = `0`;
2031	if (zc->msg_control == msg_control_addr &&
2032	zc->msg_controllen == cmsg_dummy.msg_controllen) {
2033	tcp_recv_timestamp(msg: &cmsg_dummy, sk, tss);
2034	zc->msg_control = (__u64)
2035	((uintptr_t)cmsg_dummy.msg_control_user);
2036	zc->msg_controllen =
2037	(__u64)cmsg_dummy.msg_controllen;
2038	zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2039	}
2040	}
2041
2042	static struct vm_area_struct find_tcp_vma(struct* mm_struct *mm,
2043	unsigned long address,
2044	bool *mmap_locked)
2045	{
2046	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
2047
2048	if (vma) {
2049	if (vma->vm_ops != &tcp_vm_ops) {
2050	vma_end_read(vma);
2051	return NULL;
2052	}
2053	*mmap_locked = false;
2054	return vma;
2055	}
2056
2057	mmap_read_lock(mm);
2058	vma = vma_lookup(mm, addr: address);
2059	if (!vma \|\| vma->vm_ops != &tcp_vm_ops) {
2060	mmap_read_unlock(mm);
2061	return NULL;
2062	}
2063	*mmap_locked = true;
2064	return vma;
2065	}
2066
2067	#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
2068	static int tcp_zerocopy_receive(struct sock *sk,
2069	struct tcp_zerocopy_receive *zc,
2070	struct scm_timestamping_internal *tss)
2071	{
2072	u32 length = `0`, offset, vma_len, avail_len, copylen = `0`;
2073	unsigned long address = (unsigned long)zc->address;
2074	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2075	s32 copybuf_len = zc->copybuf_len;
2076	struct tcp_sock *tp = tcp_sk(sk);
2077	const skb_frag_t *frags = NULL;
2078	unsigned int pages_to_map = `0`;
2079	struct vm_area_struct *vma;
2080	struct sk_buff *skb = NULL;
2081	u32 seq = tp->copied_seq;
2082	u32 total_bytes_to_map;
2083	int inq = tcp_inq(sk);
2084	bool mmap_locked;
2085	int ret;
2086
2087	zc->copybuf_len = `0`;
2088	zc->msg_flags = `0`;
2089
2090	if (address & (PAGE_SIZE - `1`) \|\| address != zc->address)
2091	return -EINVAL;
2092
2093	if (sk->sk_state == TCP_LISTEN)
2094	return -ENOTCONN;
2095
2096	sock_rps_record_flow(sk);
2097
2098	if (inq && inq <= copybuf_len)
2099	return receive_fallback_to_copy(sk, zc, inq, tss);
2100
2101	if (inq < PAGE_SIZE) {
2102	zc->length = `0`;
2103	zc->recv_skip_hint = inq;
2104	if (!inq && sock_flag(sk, flag: SOCK_DONE))
2105	return -EIO;
2106	return `0`;
2107	}
2108
2109	vma = find_tcp_vma(current->mm, address, mmap_locked: &mmap_locked);
2110	if (!vma)
2111	return -EINVAL;
2112
2113	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2114	avail_len = min_t(u32, vma_len, inq);
2115	total_bytes_to_map = avail_len & ~(PAGE_SIZE - `1`);
2116	if (total_bytes_to_map) {
2117	if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2118	zap_page_range_single(vma, address, size: total_bytes_to_map,
2119	NULL);
2120	zc->length = total_bytes_to_map;
2121	zc->recv_skip_hint = `0`;
2122	} else {
2123	zc->length = avail_len;
2124	zc->recv_skip_hint = avail_len;
2125	}
2126	ret = `0`;
2127	while (length + PAGE_SIZE <= zc->length) {
2128	int mappable_offset;
2129	struct page *page;
2130
2131	if (zc->recv_skip_hint < PAGE_SIZE) {
2132	u32 offset_frag;
2133
2134	if (skb) {
2135	if (zc->recv_skip_hint > `0`)
2136	break;
2137	skb = skb->next;
2138	offset = seq - TCP_SKB_CB(skb)->seq;
2139	} else {
2140	skb = tcp_recv_skb(sk, seq, &offset);
2141	}
2142
2143	if (TCP_SKB_CB(skb)->has_rxtstamp) {
2144	tcp_update_recv_tstamps(skb, tss);
2145	zc->msg_flags \|= TCP_CMSG_TS;
2146	}
2147	zc->recv_skip_hint = skb->len - offset;
2148	frags = skb_advance_to_frag(skb, offset_skb: offset, offset_frag: &offset_frag);
2149	if (!frags \|\| offset_frag)
2150	break;
2151	}
2152
2153	mappable_offset = find_next_mappable_frag(frag: frags,
2154	remaining_in_skb: zc->recv_skip_hint);
2155	if (mappable_offset) {
2156	zc->recv_skip_hint = mappable_offset;
2157	break;
2158	}
2159	page = skb_frag_page(frag: frags);
2160	prefetchw(x: page);
2161	pages[pages_to_map++] = page;
2162	length += PAGE_SIZE;
2163	zc->recv_skip_hint -= PAGE_SIZE;
2164	frags++;
2165	if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE \|\|
2166	zc->recv_skip_hint < PAGE_SIZE) {
2167	/ Either full batch, or we're about to go to next skb*
2168	* (and we cannot unroll failed ops across skbs).
2169	*/
2170	ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2171	pages_to_map,
2172	address: &address, length: &length,
2173	seq: &seq, zc,
2174	total_bytes_to_map);
2175	if (ret)
2176	goto out;
2177	pages_to_map = `0`;
2178	}
2179	}
2180	if (pages_to_map) {
2181	ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2182	address: &address, length: &length, seq: &seq,
2183	zc, total_bytes_to_map);
2184	}
2185	out:
2186	if (mmap_locked)
2187	mmap_read_unlock(current->mm);
2188	else
2189	vma_end_read(vma);
2190	/ Try to copy straggler data. /
2191	if (!ret)
2192	copylen = tcp_zc_handle_leftover(zc, sk, skb, seq: &seq, copybuf_len, tss);
2193
2194	if (length + copylen) {
2195	WRITE_ONCE(tp->copied_seq, seq);
2196	tcp_rcv_space_adjust(sk);
2197
2198	/ Clean up data we have read: This will do ACK frames. /
2199	tcp_recv_skb(sk, seq, &offset);
2200	tcp_cleanup_rbuf(sk, copied: length + copylen);
2201	ret = `0`;
2202	if (length == zc->length)
2203	zc->recv_skip_hint = `0`;
2204	} else {
2205	if (!zc->recv_skip_hint && sock_flag(sk, flag: SOCK_DONE))
2206	ret = -EIO;
2207	}
2208	zc->length = length;
2209	return ret;
2210	}
2211	#endif
2212
2213	/ Similar to __sock_recv_timestamp, but does not require an skb /
2214	void tcp_recv_timestamp(struct msghdr msg, const* struct sock *sk,
2215	struct scm_timestamping_internal *tss)
2216	{
2217	int new_tstamp = sock_flag(sk, flag: SOCK_TSTAMP_NEW);
2218	bool has_timestamping = false;
2219
2220	if (tss->ts[`0`].tv_sec \|\| tss->ts[`0`].tv_nsec) {
2221	if (sock_flag(sk, flag: SOCK_RCVTSTAMP)) {
2222	if (sock_flag(sk, flag: SOCK_RCVTSTAMPNS)) {
2223	if (new_tstamp) {
2224	struct __kernel_timespec kts = {
2225	.tv_sec = tss->ts[`0`].tv_sec,
2226	.tv_nsec = tss->ts[`0`].tv_nsec,
2227	};
2228	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2229	len: sizeof(kts), data: &kts);
2230	} else {
2231	struct __kernel_old_timespec ts_old = {
2232	.tv_sec = tss->ts[`0`].tv_sec,
2233	.tv_nsec = tss->ts[`0`].tv_nsec,
2234	};
2235	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2236	len: sizeof(ts_old), data: &ts_old);
2237	}
2238	} else {
2239	if (new_tstamp) {
2240	struct __kernel_sock_timeval stv = {
2241	.tv_sec = tss->ts[`0`].tv_sec,
2242	.tv_usec = tss->ts[`0`].tv_nsec / `1000`,
2243	};
2244	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2245	len: sizeof(stv), data: &stv);
2246	} else {
2247	struct __kernel_old_timeval tv = {
2248	.tv_sec = tss->ts[`0`].tv_sec,
2249	.tv_usec = tss->ts[`0`].tv_nsec / `1000`,
2250	};
2251	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2252	len: sizeof(tv), data: &tv);
2253	}
2254	}
2255	}
2256
2257	if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
2258	has_timestamping = true;
2259	else
2260	tss->ts[`0`] = (struct timespec64) {`0`};
2261	}
2262
2263	if (tss->ts[`2`].tv_sec \|\| tss->ts[`2`].tv_nsec) {
2264	if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
2265	has_timestamping = true;
2266	else
2267	tss->ts[`2`] = (struct timespec64) {`0`};
2268	}
2269
2270	if (has_timestamping) {
2271	tss->ts[`1`] = (struct timespec64) {`0`};
2272	if (sock_flag(sk, SOCK_TSTAMP_NEW))
2273	put_cmsg_scm_timestamping64(msg, tss);
2274	else
2275	put_cmsg_scm_timestamping(msg, tss);
2276	}
2277	}
2278
2279	static int tcp_inq_hint(struct sock *sk)
2280	{
2281	const struct tcp_sock *tp = tcp_sk(sk);
2282	u32 copied_seq = READ_ONCE(tp->copied_seq);
2283	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2284	int inq;
2285
2286	inq = rcv_nxt - copied_seq;
2287	if (unlikely(inq < `0` \|\| copied_seq != READ_ONCE(tp->copied_seq))) {
2288	lock_sock(sk);
2289	inq = tp->rcv_nxt - tp->copied_seq;
2290	release_sock(sk);
2291	}
2292	/ After receiving a FIN, tell the user-space to continue reading*
2293	* by returning a non-zero inq.
2294	*/
2295	if (inq == `0` && sock_flag(sk, flag: SOCK_DONE))
2296	inq = `1`;
2297	return inq;
2298	}
2299
2300	/*
2301	* This routine copies from a sock struct into the user buffer.
2302	*
2303	* Technical note: in 2.3 we work on _locked_ socket, so that
2304	* tricks with *seq access order and skb->users are not required.
2305	* Probably, code can be easily improved even more.
2306	*/
2307
2308	static int tcp_recvmsg_locked(struct sock sk, struct* msghdr *msg, size_t len,
2309	int flags, struct scm_timestamping_internal *tss,
2310	int *cmsg_flags)
2311	{
2312	struct tcp_sock *tp = tcp_sk(sk);
2313	int copied = `0`;
2314	u32 peek_seq;
2315	u32 *seq;
2316	unsigned long used;
2317	int err;
2318	int target; / Read at least this many bytes /
2319	long timeo;
2320	struct sk_buff skb, last;
2321	u32 urg_hole = `0`;
2322
2323	err = -ENOTCONN;
2324	if (sk->sk_state == TCP_LISTEN)
2325	goto out;
2326
2327	if (tp->recvmsg_inq) {
2328	*cmsg_flags = TCP_CMSG_INQ;
2329	msg->msg_get_inq = `1`;
2330	}
2331	timeo = sock_rcvtimeo(sk, noblock: flags & MSG_DONTWAIT);
2332
2333	/ Urgent data needs to be handled specially. /
2334	if (flags & MSG_OOB)
2335	goto recv_urg;
2336
2337	if (unlikely(tp->repair)) {
2338	err = -EPERM;
2339	if (!(flags & MSG_PEEK))
2340	goto out;
2341
2342	if (tp->repair_queue == TCP_SEND_QUEUE)
2343	goto recv_sndq;
2344
2345	err = -EINVAL;
2346	if (tp->repair_queue == TCP_NO_QUEUE)
2347	goto out;
2348
2349	/ 'common' recv queue MSG_PEEK-ing /
2350	}
2351
2352	seq = &tp->copied_seq;
2353	if (flags & MSG_PEEK) {
2354	peek_seq = tp->copied_seq;
2355	seq = &peek_seq;
2356	}
2357
2358	target = sock_rcvlowat(sk, waitall: flags & MSG_WAITALL, len);
2359
2360	do {
2361	u32 offset;
2362
2363	/ Are we at urgent data? Stop if we have read anything or have SIGURG pending. /
2364	if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
2365	if (copied)
2366	break;
2367	if (signal_pending(current)) {
2368	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2369	break;
2370	}
2371	}
2372
2373	/ Next get a buffer. /
2374
2375	last = skb_peek_tail(list_: &sk->sk_receive_queue);
2376	skb_queue_walk(&sk->sk_receive_queue, skb) {
2377	last = skb;
2378	/ Now that we have two receive queues this*
2379	* shouldn't happen.
2380	*/
2381	if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2382	"TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2383	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2384	flags))
2385	break;
2386
2387	offset = *seq - TCP_SKB_CB(skb)->seq;
2388	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2389	pr_err_once("%s: found a SYN, please report !\n", __func__);
2390	offset--;
2391	}
2392	if (offset < skb->len)
2393	goto found_ok_skb;
2394	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2395	goto found_fin_ok;
2396	WARN(!(flags & MSG_PEEK),
2397	"TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2398	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2399	}
2400
2401	/ Well, if we have backlog, try to process it now yet. /
2402
2403	if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2404	break;
2405
2406	if (copied) {
2407	if (!timeo \|\|
2408	sk->sk_err \|\|
2409	sk->sk_state == TCP_CLOSE \|\|
2410	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
2411	signal_pending(current))
2412	break;
2413	} else {
2414	if (sock_flag(sk, flag: SOCK_DONE))
2415	break;
2416
2417	if (sk->sk_err) {
2418	copied = sock_error(sk);
2419	break;
2420	}
2421
2422	if (sk->sk_shutdown & RCV_SHUTDOWN)
2423	break;
2424
2425	if (sk->sk_state == TCP_CLOSE) {
2426	/ This occurs when user tries to read*
2427	* from never connected socket.
2428	*/
2429	copied = -ENOTCONN;
2430	break;
2431	}
2432
2433	if (!timeo) {
2434	copied = -EAGAIN;
2435	break;
2436	}
2437
2438	if (signal_pending(current)) {
2439	copied = sock_intr_errno(timeo);
2440	break;
2441	}
2442	}
2443
2444	if (copied >= target) {
2445	/ Do not sleep, just process backlog. /
2446	__sk_flush_backlog(sk);
2447	} else {
2448	tcp_cleanup_rbuf(sk, copied);
2449	err = sk_wait_data(sk, timeo: &timeo, skb: last);
2450	if (err < `0`) {
2451	err = copied ? : err;
2452	goto out;
2453	}
2454	}
2455
2456	if ((flags & MSG_PEEK) &&
2457	(peek_seq - copied - urg_hole != tp->copied_seq)) {
2458	net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2459	current->comm,
2460	task_pid_nr(current));
2461	peek_seq = tp->copied_seq;
2462	}
2463	continue;
2464
2465	found_ok_skb:
2466	/ Ok so how much can we use? /
2467	used = skb->len - offset;
2468	if (len < used)
2469	used = len;
2470
2471	/ Do we have urgent data here? /
2472	if (unlikely(tp->urg_data)) {
2473	u32 urg_offset = tp->urg_seq - *seq;
2474	if (urg_offset < used) {
2475	if (!urg_offset) {
2476	if (!sock_flag(sk, flag: SOCK_URGINLINE)) {
2477	WRITE_ONCE(seq, seq + `1`);
2478	urg_hole++;
2479	offset++;
2480	used--;
2481	if (!used)
2482	goto skip_copy;
2483	}
2484	} else
2485	used = urg_offset;
2486	}
2487	}
2488
2489	if (!(flags & MSG_TRUNC)) {
2490	err = skb_copy_datagram_msg(from: skb, offset, msg, size: used);
2491	if (err) {
2492	/ Exception. Bailout! /
2493	if (!copied)
2494	copied = -EFAULT;
2495	break;
2496	}
2497	}
2498
2499	WRITE_ONCE(seq, seq + used);
2500	copied += used;
2501	len -= used;
2502
2503	tcp_rcv_space_adjust(sk);
2504
2505	skip_copy:
2506	if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
2507	WRITE_ONCE(tp->urg_data, `0`);
2508	tcp_fast_path_check(sk);
2509	}
2510
2511	if (TCP_SKB_CB(skb)->has_rxtstamp) {
2512	tcp_update_recv_tstamps(skb, tss);
2513	*cmsg_flags \|= TCP_CMSG_TS;
2514	}
2515
2516	if (used + offset < skb->len)
2517	continue;
2518
2519	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2520	goto found_fin_ok;
2521	if (!(flags & MSG_PEEK))
2522	tcp_eat_recv_skb(sk, skb);
2523	continue;
2524
2525	found_fin_ok:
2526	/ Process the FIN. /
2527	WRITE_ONCE(seq, seq + `1`);
2528	if (!(flags & MSG_PEEK))
2529	tcp_eat_recv_skb(sk, skb);
2530	break;
2531	} while (len > `0`);
2532
2533	/ According to UNIX98, msg_name/msg_namelen are ignored*
2534	* on connected socket. I was just happy when found this 8) --ANK
2535	*/
2536
2537	/ Clean up data we have read: This will do ACK frames. /
2538	tcp_cleanup_rbuf(sk, copied);
2539	return copied;
2540
2541	out:
2542	return err;
2543
2544	recv_urg:
2545	err = tcp_recv_urg(sk, msg, len, flags);
2546	goto out;
2547
2548	recv_sndq:
2549	err = tcp_peek_sndq(sk, msg, len);
2550	goto out;
2551	}
2552
2553	int tcp_recvmsg(struct sock sk, struct* msghdr msg, size_t len, int* flags,
2554	int *addr_len)
2555	{
2556	int cmsg_flags = `0`, ret;
2557	struct scm_timestamping_internal tss;
2558
2559	if (unlikely(flags & MSG_ERRQUEUE))
2560	return inet_recv_error(sk, msg, len, addr_len);
2561
2562	if (sk_can_busy_loop(sk) &&
2563	skb_queue_empty_lockless(list: &sk->sk_receive_queue) &&
2564	sk->sk_state == TCP_ESTABLISHED)
2565	sk_busy_loop(sk, nonblock: flags & MSG_DONTWAIT);
2566
2567	lock_sock(sk);
2568	ret = tcp_recvmsg_locked(sk, msg, len, flags, tss: &tss, cmsg_flags: &cmsg_flags);
2569	release_sock(sk);
2570
2571	if ((cmsg_flags \|\| msg->msg_get_inq) && ret >= `0`) {
2572	if (cmsg_flags & TCP_CMSG_TS)
2573	tcp_recv_timestamp(msg, sk, tss: &tss);
2574	if (msg->msg_get_inq) {
2575	msg->msg_inq = tcp_inq_hint(sk);
2576	if (cmsg_flags & TCP_CMSG_INQ)
2577	put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
2578	len: sizeof(msg->msg_inq), data: &msg->msg_inq);
2579	}
2580	}
2581	return ret;
2582	}
2583	EXPORT_SYMBOL(tcp_recvmsg);
2584
2585	void tcp_set_state(struct sock sk, int* state)
2586	{
2587	int oldstate = sk->sk_state;
2588
2589	/ We defined a new enum for TCP states that are exported in BPF*
2590	* so as not force the internal TCP states to be frozen. The
2591	* following checks will detect if an internal state value ever
2592	* differs from the BPF value. If this ever happens, then we will
2593	* need to remap the internal value to the BPF value before calling
2594	* tcp_call_bpf_2arg.
2595	*/
2596	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2597	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2598	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2599	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2600	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2601	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2602	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2603	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2604	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2605	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2606	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2607	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2608	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2609
2610	/ bpf uapi header bpf.h defines an anonymous enum with values*
2611	* BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
2612	* is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
2613	* But clang built vmlinux does not have this enum in DWARF
2614	* since clang removes the above code before generating IR/debuginfo.
2615	* Let us explicitly emit the type debuginfo to ensure the
2616	* above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
2617	* regardless of which compiler is used.
2618	*/
2619	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2620
2621	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2622	tcp_call_bpf_2arg(sk, op: BPF_SOCK_OPS_STATE_CB, arg1: oldstate, arg2: state);
2623
2624	switch (state) {
2625	case TCP_ESTABLISHED:
2626	if (oldstate != TCP_ESTABLISHED)
2627	TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2628	break;
2629
2630	case TCP_CLOSE:
2631	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
2632	TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2633
2634	sk->sk_prot->unhash(sk);
2635	if (inet_csk(sk)->icsk_bind_hash &&
2636	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2637	inet_put_port(sk);
2638	fallthrough;
2639	default:
2640	if (oldstate == TCP_ESTABLISHED)
2641	TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2642	}
2643
2644	/ Change state AFTER socket is unhashed to avoid closed*
2645	* socket sitting in hash tables.
2646	*/
2647	inet_sk_state_store(sk, newstate: state);
2648	}
2649	EXPORT_SYMBOL_GPL(tcp_set_state);
2650
2651	/*
2652	* State processing on a close. This implements the state shift for
2653	* sending our FIN frame. Note that we only send a FIN for some
2654	* states. A shutdown() may have already sent the FIN, or we may be
2655	* closed.
2656	*/
2657
2658	static const unsigned char new_state[`16`] = {
2659	/ current state: new state: action: /
2660	[`0` / (Invalid) /] = TCP_CLOSE,
2661	[TCP_ESTABLISHED] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
2662	[TCP_SYN_SENT] = TCP_CLOSE,
2663	[TCP_SYN_RECV] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
2664	[TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2665	[TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2666	[TCP_TIME_WAIT] = TCP_CLOSE,
2667	[TCP_CLOSE] = TCP_CLOSE,
2668	[TCP_CLOSE_WAIT] = TCP_LAST_ACK \| TCP_ACTION_FIN,
2669	[TCP_LAST_ACK] = TCP_LAST_ACK,
2670	[TCP_LISTEN] = TCP_CLOSE,
2671	[TCP_CLOSING] = TCP_CLOSING,
2672	[TCP_NEW_SYN_RECV] = TCP_CLOSE, / should not happen ! /
2673	};
2674
2675	static int tcp_close_state(struct sock *sk)
2676	{
2677	int next = (int)new_state[sk->sk_state];
2678	int ns = next & TCP_STATE_MASK;
2679
2680	tcp_set_state(sk, ns);
2681
2682	return next & TCP_ACTION_FIN;
2683	}
2684
2685	/*
2686	* Shutdown the sending side of a connection. Much like close except
2687	* that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
2688	*/
2689
2690	void tcp_shutdown(struct sock sk, int* how)
2691	{
2692	/ We need to grab some memory, and put together a FIN,*
2693	* and then put it into the queue to be sent.
2694	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
2695	*/
2696	if (!(how & SEND_SHUTDOWN))
2697	return;
2698
2699	/ If we've already sent a FIN, or it's a closed state, skip this. /
2700	if ((`1` << sk->sk_state) &
2701	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
2702	TCPF_SYN_RECV \| TCPF_CLOSE_WAIT)) {
2703	/ Clear out any half completed packets. FIN if needed. /
2704	if (tcp_close_state(sk))
2705	tcp_send_fin(sk);
2706	}
2707	}
2708	EXPORT_SYMBOL(tcp_shutdown);
2709
2710	int tcp_orphan_count_sum(void)
2711	{
2712	int i, total = `0`;
2713
2714	for_each_possible_cpu(i)
2715	total += per_cpu(tcp_orphan_count, i);
2716
2717	return max(total, `0`);
2718	}
2719
2720	static int tcp_orphan_cache;
2721	static struct timer_list tcp_orphan_timer;
2722	#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
2723
2724	static void tcp_orphan_update(struct timer_list *unused)
2725	{
2726	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
2727	mod_timer(timer: &tcp_orphan_timer, expires: jiffies + TCP_ORPHAN_TIMER_PERIOD);
2728	}
2729
2730	static bool tcp_too_many_orphans(int shift)
2731	{
2732	return READ_ONCE(tcp_orphan_cache) << shift >
2733	READ_ONCE(sysctl_tcp_max_orphans);
2734	}
2735
2736	bool tcp_check_oom(struct sock sk, int* shift)
2737	{
2738	bool too_many_orphans, out_of_socket_memory;
2739
2740	too_many_orphans = tcp_too_many_orphans(shift);
2741	out_of_socket_memory = tcp_out_of_memory(sk);
2742
2743	if (too_many_orphans)
2744	net_info_ratelimited("too many orphaned sockets\n");
2745	if (out_of_socket_memory)
2746	net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2747	return too_many_orphans \|\| out_of_socket_memory;
2748	}
2749
2750	void __tcp_close(struct sock sk, long* timeout)
2751	{
2752	struct sk_buff *skb;
2753	int data_was_unread = `0`;
2754	int state;
2755
2756	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
2757
2758	if (sk->sk_state == TCP_LISTEN) {
2759	tcp_set_state(sk, TCP_CLOSE);
2760
2761	/ Special case. /
2762	inet_csk_listen_stop(sk);
2763
2764	goto adjudge_to_death;
2765	}
2766
2767	/ We need to flush the recv. buffs. We do this only on the*
2768	* descriptor close, not protocol-sourced closes, because the
2769	* reader process may not have drained the data yet!
2770	*/
2771	while ((skb = __skb_dequeue(list: &sk->sk_receive_queue)) != NULL) {
2772	u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2773
2774	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2775	len--;
2776	data_was_unread += len;
2777	__kfree_skb(skb);
2778	}
2779
2780	/ If socket has been already reset (e.g. in tcp_reset()) - kill it. /
2781	if (sk->sk_state == TCP_CLOSE)
2782	goto adjudge_to_death;
2783
2784	/ As outlined in RFC 2525, section 2.17, we send a RST here because*
2785	* data was lost. To witness the awful effects of the old behavior of
2786	* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
2787	* GET in an FTP client, suspend the process, wait for the client to
2788	* advertise a zero window, then kill -9 the FTP client, wheee...
2789	* Note: timeout is always zero in such a case.
2790	*/
2791	if (unlikely(tcp_sk(sk)->repair)) {
2792	sk->sk_prot->disconnect(sk, `0`);
2793	} else if (data_was_unread) {
2794	/ Unread data was tossed, zap the connection. /
2795	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2796	tcp_set_state(sk, TCP_CLOSE);
2797	tcp_send_active_reset(sk, priority: sk->sk_allocation);
2798	} else if (sock_flag(sk, flag: SOCK_LINGER) && !sk->sk_lingertime) {
2799	/ Check zero linger _after_ checking for unread data. /
2800	sk->sk_prot->disconnect(sk, `0`);
2801	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2802	} else if (tcp_close_state(sk)) {
2803	/ We FIN if the application ate all the data before*
2804	* zapping the connection.
2805	*/
2806
2807	/ RED-PEN. Formally speaking, we have broken TCP state*
2808	* machine. State transitions:
2809	*
2810	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
2811	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
2812	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
2813	*
2814	* are legal only when FIN has been sent (i.e. in window),
2815	* rather than queued out of window. Purists blame.
2816	*
2817	* F.e. "RFC state" is ESTABLISHED,
2818	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
2819	*
2820	* The visible declinations are that sometimes
2821	* we enter time-wait state, when it is not required really
2822	* (harmless), do not send active resets, when they are
2823	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
2824	* they look as CLOSING or LAST_ACK for Linux)
2825	* Probably, I missed some more holelets.
2826	* --ANK
2827	* XXX (TFO) - To start off we don't support SYN+ACK+FIN
2828	* in a single packet! (May consider it later but will
2829	* probably need API support or TCP_CORK SYN-ACK until
2830	* data is written and socket is closed.)
2831	*/
2832	tcp_send_fin(sk);
2833	}
2834
2835	sk_stream_wait_close(sk, timeo_p: timeout);
2836
2837	adjudge_to_death:
2838	state = sk->sk_state;
2839	sock_hold(sk);
2840	sock_orphan(sk);
2841
2842	local_bh_disable();
2843	bh_lock_sock(sk);
2844	/ remove backlog if any, without releasing ownership. /
2845	__release_sock(sk);
2846
2847	this_cpu_inc(tcp_orphan_count);
2848
2849	/ Have we already been destroyed by a softirq or backlog? /
2850	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2851	goto out;
2852
2853	/ This is a (useful) BSD violating of the RFC. There is a*
2854	* problem with TCP as specified in that the other end could
2855	* keep a socket open forever with no application left this end.
2856	* We use a 1 minute timeout (about the same as BSD) then kill
2857	* our end. If they send after that then tough - BUT: long enough
2858	* that we won't make the old 4*rto = almost no time - whoops
2859	* reset mistake.
2860	*
2861	* Nope, it was not mistake. It is really desired behaviour
2862	* f.e. on http servers, when such sockets are useless, but
2863	* consume significant resources. Let's do it with special
2864	* linger2 option. --ANK
2865	*/
2866
2867	if (sk->sk_state == TCP_FIN_WAIT2) {
2868	struct tcp_sock *tp = tcp_sk(sk);
2869	if (READ_ONCE(tp->linger2) < `0`) {
2870	tcp_set_state(sk, TCP_CLOSE);
2871	tcp_send_active_reset(sk, GFP_ATOMIC);
2872	__NET_INC_STATS(sock_net(sk),
2873	LINUX_MIB_TCPABORTONLINGER);
2874	} else {
2875	const int tmo = tcp_fin_time(sk);
2876
2877	if (tmo > TCP_TIMEWAIT_LEN) {
2878	inet_csk_reset_keepalive_timer(sk,
2879	timeout: tmo - TCP_TIMEWAIT_LEN);
2880	} else {
2881	tcp_time_wait(sk, state: TCP_FIN_WAIT2, timeo: tmo);
2882	goto out;
2883	}
2884	}
2885	}
2886	if (sk->sk_state != TCP_CLOSE) {
2887	if (tcp_check_oom(sk, shift: `0`)) {
2888	tcp_set_state(sk, TCP_CLOSE);
2889	tcp_send_active_reset(sk, GFP_ATOMIC);
2890	__NET_INC_STATS(sock_net(sk),
2891	LINUX_MIB_TCPABORTONMEMORY);
2892	} else if (!check_net(net: sock_net(sk))) {
2893	/ Not possible to send reset; just close /
2894	tcp_set_state(sk, TCP_CLOSE);
2895	}
2896	}
2897
2898	if (sk->sk_state == TCP_CLOSE) {
2899	struct request_sock *req;
2900
2901	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2902	lockdep_sock_is_held(sk));
2903	/ We could get here with a non-NULL req if the socket is*
2904	* aborted (e.g., closed with unread data) before 3WHS
2905	* finishes.
2906	*/
2907	if (req)
2908	reqsk_fastopen_remove(sk, req, reset: false);
2909	inet_csk_destroy_sock(sk);
2910	}
2911	/ Otherwise, socket is reprieved until protocol close. /
2912
2913	out:
2914	bh_unlock_sock(sk);
2915	local_bh_enable();
2916	}
2917
/* Close @sk: run __tcp_close() under the socket lock, then drop one socket
 * reference with sock_put().
 */
void tcp_close(struct sock *sk, long timeout)
{
	lock_sock(sk);
	__tcp_close(sk, timeout);
	release_sock(sk);
	sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);
2926
2927	/ These states need RST on ABORT according to RFC793 /
2928
2929	static inline bool tcp_need_reset(int state)
2930	{
2931	return (`1` << state) &
2932	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
2933	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
2934	}
2935
2936	static void tcp_rtx_queue_purge(struct sock *sk)
2937	{
2938	struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2939
2940	tcp_sk(sk)->highest_sack = NULL;
2941	while (p) {
2942	struct sk_buff *skb = rb_to_skb(p);
2943
2944	p = rb_next(p);
2945	/ Since we are deleting whole queue, no need to*
2946	* list_del(&skb->tcp_tsorted_anchor)
2947	*/
2948	tcp_rtx_queue_unlink(skb, sk);
2949	tcp_wmem_free_skb(sk, skb);
2950	}
2951	}
2952
2953	void tcp_write_queue_purge(struct sock *sk)
2954	{
2955	struct sk_buff *skb;
2956
2957	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
2958	while ((skb = __skb_dequeue(list: &sk->sk_write_queue)) != NULL) {
2959	tcp_skb_tsorted_anchor_cleanup(skb);
2960	tcp_wmem_free_skb(sk, skb);
2961	}
2962	tcp_rtx_queue_purge(sk);
2963	INIT_LIST_HEAD(list: &tcp_sk(sk)->tsorted_sent_queue);
2964	tcp_clear_all_retrans_hints(tcp_sk(sk));
2965	tcp_sk(sk)->packets_out = `0`;
2966	inet_csk(sk)->icsk_backoff = `0`;
2967	}
2968
2969	int tcp_disconnect(struct sock sk, int* flags)
2970	{
2971	struct inet_sock *inet = inet_sk(sk);
2972	struct inet_connection_sock *icsk = inet_csk(sk);
2973	struct tcp_sock *tp = tcp_sk(sk);
2974	int old_state = sk->sk_state;
2975	u32 seq;
2976
2977	if (old_state != TCP_CLOSE)
2978	tcp_set_state(sk, TCP_CLOSE);
2979
2980	/ ABORT function of RFC793 /
2981	if (old_state == TCP_LISTEN) {
2982	inet_csk_listen_stop(sk);
2983	} else if (unlikely(tp->repair)) {
2984	WRITE_ONCE(sk->sk_err, ECONNABORTED);
2985	} else if (tcp_need_reset(state: old_state) \|\|
2986	(tp->snd_nxt != tp->write_seq &&
2987	(`1` << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK))) {
2988	/ The last check adjusts for discrepancy of Linux wrt. RFC*
2989	* states
2990	*/
2991	tcp_send_active_reset(sk, priority: gfp_any());
2992	WRITE_ONCE(sk->sk_err, ECONNRESET);
2993	} else if (old_state == TCP_SYN_SENT)
2994	WRITE_ONCE(sk->sk_err, ECONNRESET);
2995
2996	tcp_clear_xmit_timers(sk);
2997	__skb_queue_purge(list: &sk->sk_receive_queue);
2998	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2999	WRITE_ONCE(tp->urg_data, `0`);
3000	tcp_write_queue_purge(sk);
3001	tcp_fastopen_active_disable_ofo_check(sk);
3002	skb_rbtree_purge(root: &tp->out_of_order_queue);
3003
3004	inet->inet_dport = `0`;
3005
3006	inet_bhash2_reset_saddr(sk);
3007
3008	WRITE_ONCE(sk->sk_shutdown, `0`);
3009	sock_reset_flag(sk, flag: SOCK_DONE);
3010	tp->srtt_us = `0`;
3011	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3012	tp->rcv_rtt_last_tsecr = `0`;
3013
3014	seq = tp->write_seq + tp->max_window + `2`;
3015	if (!seq)
3016	seq = `1`;
3017	WRITE_ONCE(tp->write_seq, seq);
3018
3019	icsk->icsk_backoff = `0`;
3020	icsk->icsk_probes_out = `0`;
3021	icsk->icsk_probes_tstamp = `0`;
3022	icsk->icsk_rto = TCP_TIMEOUT_INIT;
3023	icsk->icsk_rto_min = TCP_RTO_MIN;
3024	icsk->icsk_delack_max = TCP_DELACK_MAX;
3025	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3026	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
3027	tp->snd_cwnd_cnt = `0`;
3028	tp->is_cwnd_limited = `0`;
3029	tp->max_packets_out = `0`;
3030	tp->window_clamp = `0`;
3031	tp->delivered = `0`;
3032	tp->delivered_ce = `0`;
3033	if (icsk->icsk_ca_ops->release)
3034	icsk->icsk_ca_ops->release(sk);
3035	memset(icsk->icsk_ca_priv, `0`, sizeof(icsk->icsk_ca_priv));
3036	icsk->icsk_ca_initialized = `0`;
3037	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
3038	tp->is_sack_reneg = `0`;
3039	tcp_clear_retrans(tp);
3040	tp->total_retrans = `0`;
3041	inet_csk_delack_init(sk);
3042	/ Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0*
3043	* issue in __tcp_select_window()
3044	*/
3045	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3046	memset(&tp->rx_opt, `0`, sizeof(tp->rx_opt));
3047	__sk_dst_reset(sk);
3048	dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL));
3049	tcp_saved_syn_free(tp);
3050	tp->compressed_ack = `0`;
3051	tp->segs_in = `0`;
3052	tp->segs_out = `0`;
3053	tp->bytes_sent = `0`;
3054	tp->bytes_acked = `0`;
3055	tp->bytes_received = `0`;
3056	tp->bytes_retrans = `0`;
3057	tp->data_segs_in = `0`;
3058	tp->data_segs_out = `0`;
3059	tp->duplicate_sack[`0`].start_seq = `0`;
3060	tp->duplicate_sack[`0`].end_seq = `0`;
3061	tp->dsack_dups = `0`;
3062	tp->reord_seen = `0`;
3063	tp->retrans_out = `0`;
3064	tp->sacked_out = `0`;
3065	tp->tlp_high_seq = `0`;
3066	tp->last_oow_ack_time = `0`;
3067	tp->plb_rehash = `0`;
3068	/ There's a bubble in the pipe until at least the first ACK. /
3069	tp->app_limited = ~`0U`;
3070	tp->rate_app_limited = `1`;
3071	tp->rack.mstamp = `0`;
3072	tp->rack.advanced = `0`;
3073	tp->rack.reo_wnd_steps = `1`;
3074	tp->rack.last_delivered = `0`;
3075	tp->rack.reo_wnd_persist = `0`;
3076	tp->rack.dsack_seen = `0`;
3077	tp->syn_data_acked = `0`;
3078	tp->rx_opt.saw_tstamp = `0`;
3079	tp->rx_opt.dsack = `0`;
3080	tp->rx_opt.num_sacks = `0`;
3081	tp->rcv_ooopack = `0`;
3082
3083
3084	/ Clean up fastopen related fields /
3085	tcp_free_fastopen_req(tp);
3086	inet_clear_bit(DEFER_CONNECT, sk);
3087	tp->fastopen_client_fail = `0`;
3088
3089	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3090
3091	if (sk->sk_frag.page) {
3092	put_page(page: sk->sk_frag.page);
3093	sk->sk_frag.page = NULL;
3094	sk->sk_frag.offset = `0`;
3095	}
3096	sk_error_report(sk);
3097	return `0`;
3098	}
3099	EXPORT_SYMBOL(tcp_disconnect);
3100
3101	static inline bool tcp_can_repair_sock(const struct sock *sk)
3102	{
3103	return sockopt_ns_capable(ns: sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3104	(sk->sk_state != TCP_LISTEN);
3105	}
3106
3107	static int tcp_repair_set_window(struct tcp_sock tp, sockptr_t optbuf, int* len)
3108	{
3109	struct tcp_repair_window opt;
3110
3111	if (!tp->repair)
3112	return -EPERM;
3113
3114	if (len != sizeof(opt))
3115	return -EINVAL;
3116
3117	if (copy_from_sockptr(dst: &opt, src: optbuf, size: sizeof(opt)))
3118	return -EFAULT;
3119
3120	if (opt.max_window < opt.snd_wnd)
3121	return -EINVAL;
3122
3123	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3124	return -EINVAL;
3125
3126	if (after(opt.rcv_wup, tp->rcv_nxt))
3127	return -EINVAL;
3128
3129	tp->snd_wl1 = opt.snd_wl1;
3130	tp->snd_wnd = opt.snd_wnd;
3131	tp->max_window = opt.max_window;
3132
3133	tp->rcv_wnd = opt.rcv_wnd;
3134	tp->rcv_wup = opt.rcv_wup;
3135
3136	return `0`;
3137	}
3138
3139	static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3140	unsigned int len)
3141	{
3142	struct tcp_sock *tp = tcp_sk(sk);
3143	struct tcp_repair_opt opt;
3144	size_t offset = `0`;
3145
3146	while (len >= sizeof(opt)) {
3147	if (copy_from_sockptr_offset(dst: &opt, src: optbuf, offset, size: sizeof(opt)))
3148	return -EFAULT;
3149
3150	offset += sizeof(opt);
3151	len -= sizeof(opt);
3152
3153	switch (opt.opt_code) {
3154	case TCPOPT_MSS:
3155	tp->rx_opt.mss_clamp = opt.opt_val;
3156	tcp_mtup_init(sk);
3157	break;
3158	case TCPOPT_WINDOW:
3159	{
3160	u16 snd_wscale = opt.opt_val & `0xFFFF`;
3161	u16 rcv_wscale = opt.opt_val >> `16`;
3162
3163	if (snd_wscale > TCP_MAX_WSCALE \|\| rcv_wscale > TCP_MAX_WSCALE)
3164	return -EFBIG;
3165
3166	tp->rx_opt.snd_wscale = snd_wscale;
3167	tp->rx_opt.rcv_wscale = rcv_wscale;
3168	tp->rx_opt.wscale_ok = `1`;
3169	}
3170	break;
3171	case TCPOPT_SACK_PERM:
3172	if (opt.opt_val != `0`)
3173	return -EINVAL;
3174
3175	tp->rx_opt.sack_ok \|= TCP_SACK_SEEN;
3176	break;
3177	case TCPOPT_TIMESTAMP:
3178	if (opt.opt_val != `0`)
3179	return -EINVAL;
3180
3181	tp->rx_opt.tstamp_ok = `1`;
3182	break;
3183	}
3184	}
3185
3186	return `0`;
3187	}
3188
3189	DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3190	EXPORT_SYMBOL(tcp_tx_delay_enabled);
3191
3192	static void tcp_enable_tx_delay(void)
3193	{
3194	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
3195	static int __tcp_tx_delay_enabled = `0`;
3196
3197	if (cmpxchg(&__tcp_tx_delay_enabled, `0`, `1`) == `0`) {
3198	static_branch_enable(&tcp_tx_delay_enabled);
3199	pr_info("TCP_TX_DELAY enabled\n");
3200	}
3201	}
3202	}
3203
3204	/ When set indicates to always queue non-full frames. Later the user clears*
3205	* this option and we transmit any pending partial frames in the queue. This is
3206	* meant to be used alongside sendfile() to get properly filled frames when the
3207	* user (for example) must write out headers with a write() call first and then
3208	* use sendfile to send out the data parts.
3209	*
3210	* TCP_CORK can be set together with TCP_NODELAY and it is stronger than
3211	* TCP_NODELAY.
3212	*/
3213	void __tcp_sock_set_cork(struct sock *sk, bool on)
3214	{
3215	struct tcp_sock *tp = tcp_sk(sk);
3216
3217	if (on) {
3218	tp->nonagle \|= TCP_NAGLE_CORK;
3219	} else {
3220	tp->nonagle &= ~TCP_NAGLE_CORK;
3221	if (tp->nonagle & TCP_NAGLE_OFF)
3222	tp->nonagle \|= TCP_NAGLE_PUSH;
3223	tcp_push_pending_frames(sk);
3224	}
3225	}
3226
3227	void tcp_sock_set_cork(struct sock *sk, bool on)
3228	{
3229	lock_sock(sk);
3230	__tcp_sock_set_cork(sk, on);
3231	release_sock(sk);
3232	}
3233	EXPORT_SYMBOL(tcp_sock_set_cork);
3234
3235	/ TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is*
3236	* remembered, but it is not activated until cork is cleared.
3237	*
3238	* However, when TCP_NODELAY is set we make an explicit push, which overrides
3239	* even TCP_CORK for currently queued segments.
3240	*/
3241	void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3242	{
3243	if (on) {
3244	tcp_sk(sk)->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
3245	tcp_push_pending_frames(sk);
3246	} else {
3247	tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3248	}
3249	}
3250
3251	void tcp_sock_set_nodelay(struct sock *sk)
3252	{
3253	lock_sock(sk);
3254	__tcp_sock_set_nodelay(sk, on: true);
3255	release_sock(sk);
3256	}
3257	EXPORT_SYMBOL(tcp_sock_set_nodelay);
3258
3259	static void __tcp_sock_set_quickack(struct sock sk, int* val)
3260	{
3261	if (!val) {
3262	inet_csk_enter_pingpong_mode(sk);
3263	return;
3264	}
3265
3266	inet_csk_exit_pingpong_mode(sk);
3267	if ((`1` << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
3268	inet_csk_ack_scheduled(sk)) {
3269	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_PUSHED;
3270	tcp_cleanup_rbuf(sk, copied: `1`);
3271	if (!(val & `1`))
3272	inet_csk_enter_pingpong_mode(sk);
3273	}
3274	}
3275
/* Apply TCP_QUICKACK value @val to @sk under the socket lock; see
 * __tcp_sock_set_quickack() for how @val is interpreted.
 */
void tcp_sock_set_quickack(struct sock *sk, int val)
{
	lock_sock(sk);
	__tcp_sock_set_quickack(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_quickack);
3283
3284	int tcp_sock_set_syncnt(struct sock sk, int* val)
3285	{
3286	if (val < `1` \|\| val > MAX_TCP_SYNCNT)
3287	return -EINVAL;
3288
3289	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
3290	return `0`;
3291	}
3292	EXPORT_SYMBOL(tcp_sock_set_syncnt);
3293
3294	int tcp_sock_set_user_timeout(struct sock sk, int* val)
3295	{
3296	/ Cap the max time in ms TCP will retry or probe the window*
3297	* before giving up and aborting (ETIMEDOUT) a connection.
3298	*/
3299	if (val < `0`)
3300	return -EINVAL;
3301
3302	WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
3303	return `0`;
3304	}
3305	EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3306
3307	int tcp_sock_set_keepidle_locked(struct sock sk, int* val)
3308	{
3309	struct tcp_sock *tp = tcp_sk(sk);
3310
3311	if (val < `1` \|\| val > MAX_TCP_KEEPIDLE)
3312	return -EINVAL;
3313
3314	/ Paired with WRITE_ONCE() in keepalive_time_when() /
3315	WRITE_ONCE(tp->keepalive_time, val * HZ);
3316	if (sock_flag(sk, flag: SOCK_KEEPOPEN) &&
3317	!((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN))) {
3318	u32 elapsed = keepalive_time_elapsed(tp);
3319
3320	if (tp->keepalive_time > elapsed)
3321	elapsed = tp->keepalive_time - elapsed;
3322	else
3323	elapsed = `0`;
3324	inet_csk_reset_keepalive_timer(sk, timeout: elapsed);
3325	}
3326
3327	return `0`;
3328	}
3329
/* Set the keepalive idle time on @sk under the socket lock. Returns the
 * result of tcp_sock_set_keepidle_locked().
 */
int tcp_sock_set_keepidle(struct sock *sk, int val)
{
	int err;

	lock_sock(sk);
	err = tcp_sock_set_keepidle_locked(sk, val);
	release_sock(sk);
	return err;
}
EXPORT_SYMBOL(tcp_sock_set_keepidle);
3340
3341	int tcp_sock_set_keepintvl(struct sock sk, int* val)
3342	{
3343	if (val < `1` \|\| val > MAX_TCP_KEEPINTVL)
3344	return -EINVAL;
3345
3346	WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
3347	return `0`;
3348	}
3349	EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3350
3351	int tcp_sock_set_keepcnt(struct sock sk, int* val)
3352	{
3353	if (val < `1` \|\| val > MAX_TCP_KEEPCNT)
3354	return -EINVAL;
3355
3356	/ Paired with READ_ONCE() in keepalive_probes() /
3357	WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
3358	return `0`;
3359	}
3360	EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3361
3362	int tcp_set_window_clamp(struct sock sk, int* val)
3363	{
3364	struct tcp_sock *tp = tcp_sk(sk);
3365
3366	if (!val) {
3367	if (sk->sk_state != TCP_CLOSE)
3368	return -EINVAL;
3369	tp->window_clamp = `0`;
3370	} else {
3371	tp->window_clamp = val < SOCK_MIN_RCVBUF / `2` ?
3372	SOCK_MIN_RCVBUF / `2` : val;
3373	tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
3374	}
3375	return `0`;
3376	}
3377
3378	/*
3379	* Socket option code for TCP.
3380	*/
3381	int do_tcp_setsockopt(struct sock sk, int* level, int optname,
3382	sockptr_t optval, unsigned int optlen)
3383	{
3384	struct tcp_sock *tp = tcp_sk(sk);
3385	struct inet_connection_sock *icsk = inet_csk(sk);
3386	struct net *net = sock_net(sk);
3387	int val;
3388	int err = `0`;
3389
3390	/ These are data/string values, all the others are ints /
3391	switch (optname) {
3392	case TCP_CONGESTION: {
3393	char name[TCP_CA_NAME_MAX];
3394
3395	if (optlen < `1`)
3396	return -EINVAL;
3397
3398	val = strncpy_from_sockptr(dst: name, src: optval,
3399	min_t(long, TCP_CA_NAME_MAX-`1`, optlen));
3400	if (val < `0`)
3401	return -EFAULT;
3402	name[val] = `0`;
3403
3404	sockopt_lock_sock(sk);
3405	err = tcp_set_congestion_control(sk, name, load: !has_current_bpf_ctx(),
3406	cap_net_admin: sockopt_ns_capable(ns: sock_net(sk)->user_ns,
3407	CAP_NET_ADMIN));
3408	sockopt_release_sock(sk);
3409	return err;
3410	}
3411	case TCP_ULP: {
3412	char name[TCP_ULP_NAME_MAX];
3413
3414	if (optlen < `1`)
3415	return -EINVAL;
3416
3417	val = strncpy_from_sockptr(dst: name, src: optval,
3418	min_t(long, TCP_ULP_NAME_MAX - `1`,
3419	optlen));
3420	if (val < `0`)
3421	return -EFAULT;
3422	name[val] = `0`;
3423
3424	sockopt_lock_sock(sk);
3425	err = tcp_set_ulp(sk, name);
3426	sockopt_release_sock(sk);
3427	return err;
3428	}
3429	case TCP_FASTOPEN_KEY: {
3430	__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3431	__u8 *backup_key = NULL;
3432
3433	/ Allow a backup key as well to facilitate key rotation*
3434	* First key is the active one.
3435	*/
3436	if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3437	optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3438	return -EINVAL;
3439
3440	if (copy_from_sockptr(dst: key, src: optval, size: optlen))
3441	return -EFAULT;
3442
3443	if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3444	backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3445
3446	return tcp_fastopen_reset_cipher(net, sk, primary_key: key, backup_key);
3447	}
3448	default:
3449	/ fallthru /
3450	break;
3451	}
3452
3453	if (optlen < sizeof(int))
3454	return -EINVAL;
3455
3456	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3457	return -EFAULT;
3458
3459	/ Handle options that can be set without locking the socket. /
3460	switch (optname) {
3461	case TCP_SYNCNT:
3462	return tcp_sock_set_syncnt(sk, val);
3463	case TCP_USER_TIMEOUT:
3464	return tcp_sock_set_user_timeout(sk, val);
3465	case TCP_KEEPINTVL:
3466	return tcp_sock_set_keepintvl(sk, val);
3467	case TCP_KEEPCNT:
3468	return tcp_sock_set_keepcnt(sk, val);
3469	case TCP_LINGER2:
3470	if (val < `0`)
3471	WRITE_ONCE(tp->linger2, -`1`);
3472	else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3473	WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
3474	else
3475	WRITE_ONCE(tp->linger2, val * HZ);
3476	return `0`;
3477	case TCP_DEFER_ACCEPT:
3478	/ Translate value in seconds to number of retransmits /
3479	WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
3480	secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3481	TCP_RTO_MAX / HZ));
3482	return `0`;
3483	}
3484
3485	sockopt_lock_sock(sk);
3486
3487	switch (optname) {
3488	case TCP_MAXSEG:
3489	/ Values greater than interface MTU won't take effect. However*
3490	* at the point when this call is done we typically don't yet
3491	* know which interface is going to be used
3492	*/
3493	if (val && (val < TCP_MIN_MSS \|\| val > MAX_TCP_WINDOW)) {
3494	err = -EINVAL;
3495	break;
3496	}
3497	tp->rx_opt.user_mss = val;
3498	break;
3499
3500	case TCP_NODELAY:
3501	__tcp_sock_set_nodelay(sk, on: val);
3502	break;
3503
3504	case TCP_THIN_LINEAR_TIMEOUTS:
3505	if (val < `0` \|\| val > `1`)
3506	err = -EINVAL;
3507	else
3508	tp->thin_lto = val;
3509	break;
3510
3511	case TCP_THIN_DUPACK:
3512	if (val < `0` \|\| val > `1`)
3513	err = -EINVAL;
3514	break;
3515
3516	case TCP_REPAIR:
3517	if (!tcp_can_repair_sock(sk))
3518	err = -EPERM;
3519	else if (val == TCP_REPAIR_ON) {
3520	tp->repair = `1`;
3521	sk->sk_reuse = SK_FORCE_REUSE;
3522	tp->repair_queue = TCP_NO_QUEUE;
3523	} else if (val == TCP_REPAIR_OFF) {
3524	tp->repair = `0`;
3525	sk->sk_reuse = SK_NO_REUSE;
3526	tcp_send_window_probe(sk);
3527	} else if (val == TCP_REPAIR_OFF_NO_WP) {
3528	tp->repair = `0`;
3529	sk->sk_reuse = SK_NO_REUSE;
3530	} else
3531	err = -EINVAL;
3532
3533	break;
3534
3535	case TCP_REPAIR_QUEUE:
3536	if (!tp->repair)
3537	err = -EPERM;
3538	else if ((unsigned int)val < TCP_QUEUES_NR)
3539	tp->repair_queue = val;
3540	else
3541	err = -EINVAL;
3542	break;
3543
3544	case TCP_QUEUE_SEQ:
3545	if (sk->sk_state != TCP_CLOSE) {
3546	err = -EPERM;
3547	} else if (tp->repair_queue == TCP_SEND_QUEUE) {
3548	if (!tcp_rtx_queue_empty(sk))
3549	err = -EPERM;
3550	else
3551	WRITE_ONCE(tp->write_seq, val);
3552	} else if (tp->repair_queue == TCP_RECV_QUEUE) {
3553	if (tp->rcv_nxt != tp->copied_seq) {
3554	err = -EPERM;
3555	} else {
3556	WRITE_ONCE(tp->rcv_nxt, val);
3557	WRITE_ONCE(tp->copied_seq, val);
3558	}
3559	} else {
3560	err = -EINVAL;
3561	}
3562	break;
3563
3564	case TCP_REPAIR_OPTIONS:
3565	if (!tp->repair)
3566	err = -EINVAL;
3567	else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
3568	err = tcp_repair_options_est(sk, optbuf: optval, len: optlen);
3569	else
3570	err = -EPERM;
3571	break;
3572
3573	case TCP_CORK:
3574	__tcp_sock_set_cork(sk, on: val);
3575	break;
3576
3577	case TCP_KEEPIDLE:
3578	err = tcp_sock_set_keepidle_locked(sk, val);
3579	break;
3580	case TCP_SAVE_SYN:
3581	/ 0: disable, 1: enable, 2: start from ether_header /
3582	if (val < `0` \|\| val > `2`)
3583	err = -EINVAL;
3584	else
3585	tp->save_syn = val;
3586	break;
3587
3588	case TCP_WINDOW_CLAMP:
3589	err = tcp_set_window_clamp(sk, val);
3590	break;
3591
3592	case TCP_QUICKACK:
3593	__tcp_sock_set_quickack(sk, val);
3594	break;
3595
3596	case TCP_AO_REPAIR:
3597	err = tcp_ao_set_repair(sk, optval, optlen);
3598	break;
3599	#ifdef CONFIG_TCP_AO
3600	case TCP_AO_ADD_KEY:
3601	case TCP_AO_DEL_KEY:
3602	case TCP_AO_INFO: {
3603	/ If this is the first TCP-AO setsockopt() on the socket,*
3604	* sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
3605	* in any state.
3606	*/
3607	if ((`1` << sk->sk_state) & (TCPF_LISTEN \| TCPF_CLOSE))
3608	goto ao_parse;
3609	if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
3610	lockdep_sock_is_held(sk)))
3611	goto ao_parse;
3612	if (tp->repair)
3613	goto ao_parse;
3614	err = -EISCONN;
3615	break;
3616	ao_parse:
3617	err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
3618	break;
3619	}
3620	#endif
3621	#ifdef CONFIG_TCP_MD5SIG
3622	case TCP_MD5SIG:
3623	case TCP_MD5SIG_EXT:
3624	err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3625	break;
3626	#endif
3627	case TCP_FASTOPEN:
3628	if (val >= `0` && ((`1` << sk->sk_state) & (TCPF_CLOSE \|
3629	TCPF_LISTEN))) {
3630	tcp_fastopen_init_key_once(net);
3631
3632	fastopen_queue_tune(sk, backlog: val);
3633	} else {
3634	err = -EINVAL;
3635	}
3636	break;
3637	case TCP_FASTOPEN_CONNECT:
3638	if (val > `1` \|\| val < `0`) {
3639	err = -EINVAL;
3640	} else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
3641	TFO_CLIENT_ENABLE) {
3642	if (sk->sk_state == TCP_CLOSE)
3643	tp->fastopen_connect = val;
3644	else
3645	err = -EINVAL;
3646	} else {
3647	err = -EOPNOTSUPP;
3648	}
3649	break;
3650	case TCP_FASTOPEN_NO_COOKIE:
3651	if (val > `1` \|\| val < `0`)
3652	err = -EINVAL;
3653	else if (!((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
3654	err = -EINVAL;
3655	else
3656	tp->fastopen_no_cookie = val;
3657	break;
3658	case TCP_TIMESTAMP:
3659	if (!tp->repair) {
3660	err = -EPERM;
3661	break;
3662	}
3663	/ val is an opaque field,*
3664	* and low order bit contains usec_ts enable bit.
3665	* Its a best effort, and we do not care if user makes an error.
3666	*/
3667	tp->tcp_usec_ts = val & `1`;
3668	WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
3669	break;
3670	case TCP_REPAIR_WINDOW:
3671	err = tcp_repair_set_window(tp, optbuf: optval, len: optlen);
3672	break;
3673	case TCP_NOTSENT_LOWAT:
3674	WRITE_ONCE(tp->notsent_lowat, val);
3675	sk->sk_write_space(sk);
3676	break;
3677	case TCP_INQ:
3678	if (val > `1` \|\| val < `0`)
3679	err = -EINVAL;
3680	else
3681	tp->recvmsg_inq = val;
3682	break;
3683	case TCP_TX_DELAY:
3684	if (val)
3685	tcp_enable_tx_delay();
3686	WRITE_ONCE(tp->tcp_tx_delay, val);
3687	break;
3688	default:
3689	err = -ENOPROTOOPT;
3690	break;
3691	}
3692
3693	sockopt_release_sock(sk);
3694	return err;
3695	}
3696
3697	int tcp_setsockopt(struct sock sk, int* level, int optname, sockptr_t optval,
3698	unsigned int optlen)
3699	{
3700	const struct inet_connection_sock *icsk = inet_csk(sk);
3701
3702	if (level != SOL_TCP)
3703	/ Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() /
3704	return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
3705	optval, optlen);
3706	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3707	}
3708	EXPORT_SYMBOL(tcp_setsockopt);
3709
3710	static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3711	struct tcp_info *info)
3712	{
3713	u64 stats[__TCP_CHRONO_MAX], total = `0`;
3714	enum tcp_chrono i;
3715
3716	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3717	stats[i] = tp->chrono_stat[i - `1`];
3718	if (i == tp->chrono_type)
3719	stats[i] += tcp_jiffies32 - tp->chrono_start;
3720	stats[i] *= USEC_PER_SEC / HZ;
3721	total += stats[i];
3722	}
3723
3724	info->tcpi_busy_time = total;
3725	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3726	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3727	}
3728
3729	/ Return information about state of tcp endpoint in API format. /
3730	void tcp_get_info(struct sock sk, struct* tcp_info *info)
3731	{
3732	const struct tcp_sock tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM /
3733	const struct inet_connection_sock *icsk = inet_csk(sk);
3734	unsigned long rate;
3735	u32 now;
3736	u64 rate64;
3737	bool slow;
3738
3739	memset(info, `0`, sizeof(*info));
3740	if (sk->sk_type != SOCK_STREAM)
3741	return;
3742
3743	info->tcpi_state = inet_sk_state_load(sk);
3744
3745	/ Report meaningful fields for all TCP states, including listeners /
3746	rate = READ_ONCE(sk->sk_pacing_rate);
3747	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
3748	info->tcpi_pacing_rate = rate64;
3749
3750	rate = READ_ONCE(sk->sk_max_pacing_rate);
3751	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
3752	info->tcpi_max_pacing_rate = rate64;
3753
3754	info->tcpi_reordering = tp->reordering;
3755	info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
3756
3757	if (info->tcpi_state == TCP_LISTEN) {
3758	/ listeners aliased fields :*
3759	* tcpi_unacked -> Number of children ready for accept()
3760	* tcpi_sacked -> max backlog
3761	*/
3762	info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
3763	info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
3764	return;
3765	}
3766
3767	slow = lock_sock_fast(sk);
3768
3769	info->tcpi_ca_state = icsk->icsk_ca_state;
3770	info->tcpi_retransmits = icsk->icsk_retransmits;
3771	info->tcpi_probes = icsk->icsk_probes_out;
3772	info->tcpi_backoff = icsk->icsk_backoff;
3773
3774	if (tp->rx_opt.tstamp_ok)
3775	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
3776	if (tcp_is_sack(tp))
3777	info->tcpi_options \|= TCPI_OPT_SACK;
3778	if (tp->rx_opt.wscale_ok) {
3779	info->tcpi_options \|= TCPI_OPT_WSCALE;
3780	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3781	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3782	}
3783
3784	if (tp->ecn_flags & TCP_ECN_OK)
3785	info->tcpi_options \|= TCPI_OPT_ECN;
3786	if (tp->ecn_flags & TCP_ECN_SEEN)
3787	info->tcpi_options \|= TCPI_OPT_ECN_SEEN;
3788	if (tp->syn_data_acked)
3789	info->tcpi_options \|= TCPI_OPT_SYN_DATA;
3790	if (tp->tcp_usec_ts)
3791	info->tcpi_options \|= TCPI_OPT_USEC_TS;
3792
3793	info->tcpi_rto = jiffies_to_usecs(j: icsk->icsk_rto);
3794	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
3795	tcp_delack_max(sk)));
3796	info->tcpi_snd_mss = tp->mss_cache;
3797	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3798
3799	info->tcpi_unacked = tp->packets_out;
3800	info->tcpi_sacked = tp->sacked_out;
3801
3802	info->tcpi_lost = tp->lost_out;
3803	info->tcpi_retrans = tp->retrans_out;
3804
3805	now = tcp_jiffies32;
3806	info->tcpi_last_data_sent = jiffies_to_msecs(j: now - tp->lsndtime);
3807	info->tcpi_last_data_recv = jiffies_to_msecs(j: now - icsk->icsk_ack.lrcvtime);
3808	info->tcpi_last_ack_recv = jiffies_to_msecs(j: now - tp->rcv_tstamp);
3809
3810	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3811	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3812	info->tcpi_rtt = tp->srtt_us >> `3`;
3813	info->tcpi_rttvar = tp->mdev_us >> `2`;
3814	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3815	info->tcpi_advmss = tp->advmss;
3816
3817	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> `3`;
3818	info->tcpi_rcv_space = tp->rcvq_space.space;
3819
3820	info->tcpi_total_retrans = tp->total_retrans;
3821
3822	info->tcpi_bytes_acked = tp->bytes_acked;
3823	info->tcpi_bytes_received = tp->bytes_received;
3824	info->tcpi_notsent_bytes = max_t(int, `0`, tp->write_seq - tp->snd_nxt);
3825	tcp_get_info_chrono_stats(tp, info);
3826
3827	info->tcpi_segs_out = tp->segs_out;
3828
3829	/ segs_in and data_segs_in can be updated from tcp_segs_in() from BH /
3830	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
3831	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
3832
3833	info->tcpi_min_rtt = tcp_min_rtt(tp);
3834	info->tcpi_data_segs_out = tp->data_segs_out;
3835
3836	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? `1` : `0`;
3837	rate64 = tcp_compute_delivery_rate(tp);
3838	if (rate64)
3839	info->tcpi_delivery_rate = rate64;
3840	info->tcpi_delivered = tp->delivered;
3841	info->tcpi_delivered_ce = tp->delivered_ce;
3842	info->tcpi_bytes_sent = tp->bytes_sent;
3843	info->tcpi_bytes_retrans = tp->bytes_retrans;
3844	info->tcpi_dsack_dups = tp->dsack_dups;
3845	info->tcpi_reord_seen = tp->reord_seen;
3846	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3847	info->tcpi_snd_wnd = tp->snd_wnd;
3848	info->tcpi_rcv_wnd = tp->rcv_wnd;
3849	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
3850	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
3851
3852	info->tcpi_total_rto = tp->total_rto;
3853	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
3854	info->tcpi_total_rto_time = tp->total_rto_time;
3855	if (tp->rto_stamp)
3856	info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
3857
3858	unlock_sock_fast(sk, slow);
3859	}
3860	EXPORT_SYMBOL_GPL(tcp_get_info);
3861
3862	static size_t tcp_opt_stats_get_size(void)
3863	{
3864	return
3865	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BUSY /
3866	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_RWND_LIMITED /
3867	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_SNDBUF_LIMITED /
3868	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_DATA_SEGS_OUT /
3869	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_TOTAL_RETRANS /
3870	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_PACING_RATE /
3871	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_DELIVERY_RATE /
3872	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SND_CWND /
3873	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REORDERING /
3874	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_MIN_RTT /
3875	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_RECUR_RETRANS /
3876	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_DELIVERY_RATE_APP_LMT /
3877	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SNDQ_SIZE /
3878	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_CA_STATE /
3879	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SND_SSTHRESH /
3880	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DELIVERED /
3881	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DELIVERED_CE /
3882	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BYTES_SENT /
3883	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BYTES_RETRANS /
3884	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DSACK_DUPS /
3885	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REORD_SEEN /
3886	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SRTT /
3887	nla_total_size(payload: sizeof(u16)) + / TCP_NLA_TIMEOUT_REHASH /
3888	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_BYTES_NOTSENT /
3889	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_EDT /
3890	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_TTL /
3891	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REHASH /
3892	`0`;
3893	}
3894
3895	/ Returns TTL or hop limit of an incoming packet from skb. /
3896	static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
3897	{
3898	if (skb->protocol == htons(ETH_P_IP))
3899	return ip_hdr(skb)->ttl;
3900	else if (skb->protocol == htons(ETH_P_IPV6))
3901	return ipv6_hdr(skb)->hop_limit;
3902	else
3903	return `0`;
3904	}
3905
3906	struct sk_buff tcp_get_timestamping_opt_stats(const* struct sock *sk,
3907	const struct sk_buff *orig_skb,
3908	const struct sk_buff *ack_skb)
3909	{
3910	const struct tcp_sock *tp = tcp_sk(sk);
3911	struct sk_buff *stats;
3912	struct tcp_info info;
3913	unsigned long rate;
3914	u64 rate64;
3915
3916	stats = alloc_skb(size: tcp_opt_stats_get_size(), GFP_ATOMIC);
3917	if (!stats)
3918	return NULL;
3919
3920	tcp_get_info_chrono_stats(tp, info: &info);
3921	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BUSY,
3922	value: info.tcpi_busy_time, padattr: TCP_NLA_PAD);
3923	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_RWND_LIMITED,
3924	value: info.tcpi_rwnd_limited, padattr: TCP_NLA_PAD);
3925	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_SNDBUF_LIMITED,
3926	value: info.tcpi_sndbuf_limited, padattr: TCP_NLA_PAD);
3927	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_DATA_SEGS_OUT,
3928	value: tp->data_segs_out, padattr: TCP_NLA_PAD);
3929	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_TOTAL_RETRANS,
3930	value: tp->total_retrans, padattr: TCP_NLA_PAD);
3931
3932	rate = READ_ONCE(sk->sk_pacing_rate);
3933	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
3934	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_PACING_RATE, value: rate64, padattr: TCP_NLA_PAD);
3935
3936	rate64 = tcp_compute_delivery_rate(tp);
3937	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_DELIVERY_RATE, value: rate64, padattr: TCP_NLA_PAD);
3938
3939	nla_put_u32(skb: stats, attrtype: TCP_NLA_SND_CWND, value: tcp_snd_cwnd(tp));
3940	nla_put_u32(skb: stats, attrtype: TCP_NLA_REORDERING, value: tp->reordering);
3941	nla_put_u32(skb: stats, attrtype: TCP_NLA_MIN_RTT, value: tcp_min_rtt(tp));
3942
3943	nla_put_u8(skb: stats, attrtype: TCP_NLA_RECUR_RETRANS, value: inet_csk(sk)->icsk_retransmits);
3944	nla_put_u8(skb: stats, attrtype: TCP_NLA_DELIVERY_RATE_APP_LMT, value: !!tp->rate_app_limited);
3945	nla_put_u32(skb: stats, attrtype: TCP_NLA_SND_SSTHRESH, value: tp->snd_ssthresh);
3946	nla_put_u32(skb: stats, attrtype: TCP_NLA_DELIVERED, value: tp->delivered);
3947	nla_put_u32(skb: stats, attrtype: TCP_NLA_DELIVERED_CE, value: tp->delivered_ce);
3948
3949	nla_put_u32(skb: stats, attrtype: TCP_NLA_SNDQ_SIZE, value: tp->write_seq - tp->snd_una);
3950	nla_put_u8(skb: stats, attrtype: TCP_NLA_CA_STATE, value: inet_csk(sk)->icsk_ca_state);
3951
3952	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BYTES_SENT, value: tp->bytes_sent,
3953	padattr: TCP_NLA_PAD);
3954	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BYTES_RETRANS, value: tp->bytes_retrans,
3955	padattr: TCP_NLA_PAD);
3956	nla_put_u32(skb: stats, attrtype: TCP_NLA_DSACK_DUPS, value: tp->dsack_dups);
3957	nla_put_u32(skb: stats, attrtype: TCP_NLA_REORD_SEEN, value: tp->reord_seen);
3958	nla_put_u32(skb: stats, attrtype: TCP_NLA_SRTT, value: tp->srtt_us >> `3`);
3959	nla_put_u16(skb: stats, attrtype: TCP_NLA_TIMEOUT_REHASH, value: tp->timeout_rehash);
3960	nla_put_u32(skb: stats, attrtype: TCP_NLA_BYTES_NOTSENT,
3961	max_t(int, `0`, tp->write_seq - tp->snd_nxt));
3962	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_EDT, value: orig_skb->skb_mstamp_ns,
3963	padattr: TCP_NLA_PAD);
3964	if (ack_skb)
3965	nla_put_u8(skb: stats, attrtype: TCP_NLA_TTL,
3966	value: tcp_skb_ttl_or_hop_limit(skb: ack_skb));
3967
3968	nla_put_u32(skb: stats, attrtype: TCP_NLA_REHASH, value: tp->plb_rehash + tp->timeout_rehash);
3969	return stats;
3970	}
3971
3972	int do_tcp_getsockopt(struct sock sk, int* level,
3973	int optname, sockptr_t optval, sockptr_t optlen)
3974	{
3975	struct inet_connection_sock *icsk = inet_csk(sk);
3976	struct tcp_sock *tp = tcp_sk(sk);
3977	struct net *net = sock_net(sk);
3978	int val, len;
3979
3980	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
3981	return -EFAULT;
3982
3983	len = min_t(unsigned int, len, sizeof(int));
3984
3985	if (len < `0`)
3986	return -EINVAL;
3987
3988	switch (optname) {
3989	case TCP_MAXSEG:
3990	val = tp->mss_cache;
3991	if (tp->rx_opt.user_mss &&
3992	((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
3993	val = tp->rx_opt.user_mss;
3994	if (tp->repair)
3995	val = tp->rx_opt.mss_clamp;
3996	break;
3997	case TCP_NODELAY:
3998	val = !!(tp->nonagle&TCP_NAGLE_OFF);
3999	break;
4000	case TCP_CORK:
4001	val = !!(tp->nonagle&TCP_NAGLE_CORK);
4002	break;
4003	case TCP_KEEPIDLE:
4004	val = keepalive_time_when(tp) / HZ;
4005	break;
4006	case TCP_KEEPINTVL:
4007	val = keepalive_intvl_when(tp) / HZ;
4008	break;
4009	case TCP_KEEPCNT:
4010	val = keepalive_probes(tp);
4011	break;
4012	case TCP_SYNCNT:
4013	val = READ_ONCE(icsk->icsk_syn_retries) ? :
4014	READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
4015	break;
4016	case TCP_LINGER2:
4017	val = READ_ONCE(tp->linger2);
4018	if (val >= `0`)
4019	val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
4020	break;
4021	case TCP_DEFER_ACCEPT:
4022	val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
4023	val = retrans_to_secs(retrans: val, TCP_TIMEOUT_INIT / HZ,
4024	TCP_RTO_MAX / HZ);
4025	break;
4026	case TCP_WINDOW_CLAMP:
4027	val = tp->window_clamp;
4028	break;
4029	case TCP_INFO: {
4030	struct tcp_info info;
4031
4032	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4033	return -EFAULT;
4034
4035	tcp_get_info(sk, &info);
4036
4037	len = min_t(unsigned int, len, sizeof(info));
4038	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4039	return -EFAULT;
4040	if (copy_to_sockptr(dst: optval, src: &info, size: len))
4041	return -EFAULT;
4042	return `0`;
4043	}
4044	case TCP_CC_INFO: {
4045	const struct tcp_congestion_ops *ca_ops;
4046	union tcp_cc_info info;
4047	size_t sz = `0`;
4048	int attr;
4049
4050	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4051	return -EFAULT;
4052
4053	ca_ops = icsk->icsk_ca_ops;
4054	if (ca_ops && ca_ops->get_info)
4055	sz = ca_ops->get_info(sk, ~`0U`, &attr, &info);
4056
4057	len = min_t(unsigned int, len, sz);
4058	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4059	return -EFAULT;
4060	if (copy_to_sockptr(dst: optval, src: &info, size: len))
4061	return -EFAULT;
4062	return `0`;
4063	}
4064	case TCP_QUICKACK:
4065	val = !inet_csk_in_pingpong_mode(sk);
4066	break;
4067
4068	case TCP_CONGESTION:
4069	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4070	return -EFAULT;
4071	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4072	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4073	return -EFAULT;
4074	if (copy_to_sockptr(dst: optval, src: icsk->icsk_ca_ops->name, size: len))
4075	return -EFAULT;
4076	return `0`;
4077
4078	case TCP_ULP:
4079	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4080	return -EFAULT;
4081	len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4082	if (!icsk->icsk_ulp_ops) {
4083	len = `0`;
4084	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4085	return -EFAULT;
4086	return `0`;
4087	}
4088	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4089	return -EFAULT;
4090	if (copy_to_sockptr(dst: optval, src: icsk->icsk_ulp_ops->name, size: len))
4091	return -EFAULT;
4092	return `0`;
4093
4094	case TCP_FASTOPEN_KEY: {
4095	u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4096	unsigned int key_len;
4097
4098	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4099	return -EFAULT;
4100
4101	key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4102	TCP_FASTOPEN_KEY_LENGTH;
4103	len = min_t(unsigned int, len, key_len);
4104	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4105	return -EFAULT;
4106	if (copy_to_sockptr(dst: optval, src: key, size: len))
4107	return -EFAULT;
4108	return `0`;
4109	}
4110	case TCP_THIN_LINEAR_TIMEOUTS:
4111	val = tp->thin_lto;
4112	break;
4113
4114	case TCP_THIN_DUPACK:
4115	val = `0`;
4116	break;
4117
4118	case TCP_REPAIR:
4119	val = tp->repair;
4120	break;
4121
4122	case TCP_REPAIR_QUEUE:
4123	if (tp->repair)
4124	val = tp->repair_queue;
4125	else
4126	return -EINVAL;
4127	break;
4128
4129	case TCP_REPAIR_WINDOW: {
4130	struct tcp_repair_window opt;
4131
4132	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4133	return -EFAULT;
4134
4135	if (len != sizeof(opt))
4136	return -EINVAL;
4137
4138	if (!tp->repair)
4139	return -EPERM;
4140
4141	opt.snd_wl1 = tp->snd_wl1;
4142	opt.snd_wnd = tp->snd_wnd;
4143	opt.max_window = tp->max_window;
4144	opt.rcv_wnd = tp->rcv_wnd;
4145	opt.rcv_wup = tp->rcv_wup;
4146
4147	if (copy_to_sockptr(dst: optval, src: &opt, size: len))
4148	return -EFAULT;
4149	return `0`;
4150	}
4151	case TCP_QUEUE_SEQ:
4152	if (tp->repair_queue == TCP_SEND_QUEUE)
4153	val = tp->write_seq;
4154	else if (tp->repair_queue == TCP_RECV_QUEUE)
4155	val = tp->rcv_nxt;
4156	else
4157	return -EINVAL;
4158	break;
4159
4160	case TCP_USER_TIMEOUT:
4161	val = READ_ONCE(icsk->icsk_user_timeout);
4162	break;
4163
4164	case TCP_FASTOPEN:
4165	val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
4166	break;
4167
4168	case TCP_FASTOPEN_CONNECT:
4169	val = tp->fastopen_connect;
4170	break;
4171
4172	case TCP_FASTOPEN_NO_COOKIE:
4173	val = tp->fastopen_no_cookie;
4174	break;
4175
4176	case TCP_TX_DELAY:
4177	val = READ_ONCE(tp->tcp_tx_delay);
4178	break;
4179
4180	case TCP_TIMESTAMP:
4181	val = tcp_clock_ts(usec_ts: tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
4182	if (tp->tcp_usec_ts)
4183	val \|= `1`;
4184	else
4185	val &= ~`1`;
4186	break;
4187	case TCP_NOTSENT_LOWAT:
4188	val = READ_ONCE(tp->notsent_lowat);
4189	break;
4190	case TCP_INQ:
4191	val = tp->recvmsg_inq;
4192	break;
4193	case TCP_SAVE_SYN:
4194	val = tp->save_syn;
4195	break;
4196	case TCP_SAVED_SYN: {
4197	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4198	return -EFAULT;
4199
4200	sockopt_lock_sock(sk);
4201	if (tp->saved_syn) {
4202	if (len < tcp_saved_syn_len(saved_syn: tp->saved_syn)) {
4203	len = tcp_saved_syn_len(saved_syn: tp->saved_syn);
4204	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int))) {
4205	sockopt_release_sock(sk);
4206	return -EFAULT;
4207	}
4208	sockopt_release_sock(sk);
4209	return -EINVAL;
4210	}
4211	len = tcp_saved_syn_len(saved_syn: tp->saved_syn);
4212	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int))) {
4213	sockopt_release_sock(sk);
4214	return -EFAULT;
4215	}
4216	if (copy_to_sockptr(dst: optval, src: tp->saved_syn->data, size: len)) {
4217	sockopt_release_sock(sk);
4218	return -EFAULT;
4219	}
4220	tcp_saved_syn_free(tp);
4221	sockopt_release_sock(sk);
4222	} else {
4223	sockopt_release_sock(sk);
4224	len = `0`;
4225	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4226	return -EFAULT;
4227	}
4228	return `0`;
4229	}
4230	#ifdef CONFIG_MMU
4231	case TCP_ZEROCOPY_RECEIVE: {
4232	struct scm_timestamping_internal tss;
4233	struct tcp_zerocopy_receive zc = {};
4234	int err;
4235
4236	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4237	return -EFAULT;
4238	if (len < `0` \|\|
4239	len < offsetofend(struct tcp_zerocopy_receive, length))
4240	return -EINVAL;
4241	if (unlikely(len > sizeof(zc))) {
4242	err = check_zeroed_sockptr(src: optval, offset: sizeof(zc),
4243	size: len - sizeof(zc));
4244	if (err < `1`)
4245	return err == `0` ? -EINVAL : err;
4246	len = sizeof(zc);
4247	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4248	return -EFAULT;
4249	}
4250	if (copy_from_sockptr(dst: &zc, src: optval, size: len))
4251	return -EFAULT;
4252	if (zc.reserved)
4253	return -EINVAL;
4254	if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4255	return -EINVAL;
4256	sockopt_lock_sock(sk);
4257	err = tcp_zerocopy_receive(sk, zc: &zc, tss: &tss);
4258	err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4259	&zc, &len, err);
4260	sockopt_release_sock(sk);
4261	if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4262	goto zerocopy_rcv_cmsg;
4263	switch (len) {
4264	case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4265	goto zerocopy_rcv_cmsg;
4266	case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4267	case offsetofend(struct tcp_zerocopy_receive, msg_control):
4268	case offsetofend(struct tcp_zerocopy_receive, flags):
4269	case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4270	case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4271	case offsetofend(struct tcp_zerocopy_receive, err):
4272	goto zerocopy_rcv_sk_err;
4273	case offsetofend(struct tcp_zerocopy_receive, inq):
4274	goto zerocopy_rcv_inq;
4275	case offsetofend(struct tcp_zerocopy_receive, length):
4276	default:
4277	goto zerocopy_rcv_out;
4278	}
4279	zerocopy_rcv_cmsg:
4280	if (zc.msg_flags & TCP_CMSG_TS)
4281	tcp_zc_finalize_rx_tstamp(sk, zc: &zc, tss: &tss);
4282	else
4283	zc.msg_flags = `0`;
4284	zerocopy_rcv_sk_err:
4285	if (!err)
4286	zc.err = sock_error(sk);
4287	zerocopy_rcv_inq:
4288	zc.inq = tcp_inq_hint(sk);
4289	zerocopy_rcv_out:
4290	if (!err && copy_to_sockptr(dst: optval, src: &zc, size: len))
4291	err = -EFAULT;
4292	return err;
4293	}
4294	#endif
4295	case TCP_AO_REPAIR:
4296	return tcp_ao_get_repair(sk, optval, optlen);
4297	case TCP_AO_GET_KEYS:
4298	case TCP_AO_INFO: {
4299	int err;
4300
4301	sockopt_lock_sock(sk);
4302	if (optname == TCP_AO_GET_KEYS)
4303	err = tcp_ao_get_mkts(sk, optval, optlen);
4304	else
4305	err = tcp_ao_get_sock_info(sk, optval, optlen);
4306	sockopt_release_sock(sk);
4307
4308	return err;
4309	}
4310	default:
4311	return -ENOPROTOOPT;
4312	}
4313
4314	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4315	return -EFAULT;
4316	if (copy_to_sockptr(dst: optval, src: &val, size: len))
4317	return -EFAULT;
4318	return `0`;
4319	}
4320
/**
 * tcp_bpf_bypass_getsockopt - tell whether a TCP getsockopt is special-cased
 * @level: socket option level
 * @optname: socket option name
 *
 * Return: true only for (SOL_TCP, TCP_ZEROCOPY_RECEIVE), false for every
 * other level/option pair.
 */
bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
	/* TCP do_tcp_getsockopt has optimized getsockopt implementation
	 * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
	 */
	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
		return true;

	return false;
}
4331	EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4332
4333	int tcp_getsockopt(struct sock sk, int* level, int optname, char __user *optval,
4334	int __user *optlen)
4335	{
4336	struct inet_connection_sock *icsk = inet_csk(sk);
4337
4338	if (level != SOL_TCP)
4339	/ Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() /
4340	return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
4341	optval, optlen);
4342	return do_tcp_getsockopt(sk, level, optname, optval: USER_SOCKPTR(p: optval),
4343	optlen: USER_SOCKPTR(p: optlen));
4344	}
4345	EXPORT_SYMBOL(tcp_getsockopt);
4346
4347	#ifdef CONFIG_TCP_MD5SIG
/* Id of the MD5 sigpool; -1 until tcp_md5_alloc_sigpool() stores a valid id. */
int tcp_md5_sigpool_id = -1;
4349	EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id);
4350
4351	int tcp_md5_alloc_sigpool(void)
4352	{
4353	size_t scratch_size;
4354	int ret;
4355
4356	scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr);
4357	ret = tcp_sigpool_alloc_ahash(alg: "md5", scratch_size);
4358	if (ret >= `0`) {
4359	/ As long as any md5 sigpool was allocated, the return*
4360	* id would stay the same. Re-write the id only for the case
4361	* when previously all MD5 keys were deleted and this call
4362	* allocates the first MD5 key, which may return a different
4363	* sigpool id than was used previously.
4364	*/
4365	WRITE_ONCE(tcp_md5_sigpool_id, ret); / Avoids the compiler potentially being smart here /
4366	return `0`;
4367	}
4368	return ret;
4369	}
4370
4371	void tcp_md5_release_sigpool(void)
4372	{
4373	tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id));
4374	}
4375
4376	void tcp_md5_add_sigpool(void)
4377	{
4378	tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id));
4379	}
4380
4381	int tcp_md5_hash_key(struct tcp_sigpool *hp,
4382	const struct tcp_md5sig_key *key)
4383	{
4384	u8 keylen = READ_ONCE(key->keylen); / paired with WRITE_ONCE() in tcp_md5_do_add /
4385	struct scatterlist sg;
4386
4387	sg_init_one(&sg, key->key, keylen);
4388	ahash_request_set_crypt(req: hp->req, src: &sg, NULL, nbytes: keylen);
4389
4390	/ We use data_race() because tcp_md5_do_add() might change*
4391	* key->key under us
4392	*/
4393	return data_race(crypto_ahash_update(hp->req));
4394	}
4395	EXPORT_SYMBOL(tcp_md5_hash_key);
4396
4397	/ Called with rcu_read_lock() /
4398	enum skb_drop_reason
4399	tcp_inbound_md5_hash(const struct sock sk, const* struct sk_buff *skb,
4400	const void saddr, const* void *daddr,
4401	int family, int l3index, const __u8 *hash_location)
4402	{
4403	/ This gets called for each TCP segment that has TCP-MD5 option.*
4404	* We have 3 drop cases:
4405	* o No MD5 hash and one expected.
4406	* o MD5 hash and we're not expecting one.
4407	* o MD5 hash and its wrong.
4408	*/
4409	const struct tcp_sock *tp = tcp_sk(sk);
4410	struct tcp_md5sig_key *key;
4411	u8 newhash[`16`];
4412	int genhash;
4413
4414	key = tcp_md5_do_lookup(sk, l3index, addr: saddr, family);
4415
4416	if (!key && hash_location) {
4417	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
4418	tcp_hash_fail("Unexpected MD5 Hash found", family, skb, "");
4419	return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
4420	}
4421
4422	/ Check the signature.*
4423	* To support dual stack listeners, we need to handle
4424	* IPv4-mapped case.
4425	*/
4426	if (family == AF_INET)
4427	genhash = tcp_v4_md5_hash_skb(md5_hash: newhash, key, NULL, skb);
4428	else
4429	genhash = tp->af_specific->calc_md5_hash(newhash, key,
4430	NULL, skb);
4431	if (genhash \|\| memcmp(p: hash_location, q: newhash, size: `16`) != `0`) {
4432	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
4433	if (family == AF_INET) {
4434	tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d",
4435	genhash ? "tcp_v4_calc_md5_hash failed"
4436	: "", l3index);
4437	} else {
4438	if (genhash) {
4439	tcp_hash_fail("MD5 Hash failed",
4440	AF_INET6, skb, "L3 index %d",
4441	l3index);
4442	} else {
4443	tcp_hash_fail("MD5 Hash mismatch",
4444	AF_INET6, skb, "L3 index %d",
4445	l3index);
4446	}
4447	}
4448	return SKB_DROP_REASON_TCP_MD5FAILURE;
4449	}
4450	return SKB_NOT_DROPPED_YET;
4451	}
4452	EXPORT_SYMBOL(tcp_inbound_md5_hash);
4453
4454	#endif
4455
4456	void tcp_done(struct sock *sk)
4457	{
4458	struct request_sock *req;
4459
4460	/ We might be called with a new socket, after*
4461	* inet_csk_prepare_forced_close() has been called
4462	* so we can not use lockdep_sock_is_held(sk)
4463	*/
4464	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, `1`);
4465
4466	if (sk->sk_state == TCP_SYN_SENT \|\| sk->sk_state == TCP_SYN_RECV)
4467	TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
4468
4469	tcp_set_state(sk, TCP_CLOSE);
4470	tcp_clear_xmit_timers(sk);
4471	if (req)
4472	reqsk_fastopen_remove(sk, req, reset: false);
4473
4474	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
4475
4476	if (!sock_flag(sk, flag: SOCK_DEAD))
4477	sk->sk_state_change(sk);
4478	else
4479	inet_csk_destroy_sock(sk);
4480	}
4481	EXPORT_SYMBOL_GPL(tcp_done);
4482
4483	int tcp_abort(struct sock sk, int* err)
4484	{
4485	int state = inet_sk_state_load(sk);
4486
4487	if (state == TCP_NEW_SYN_RECV) {
4488	struct request_sock *req = inet_reqsk(sk);
4489
4490	local_bh_disable();
4491	inet_csk_reqsk_queue_drop(sk: req->rsk_listener, req);
4492	local_bh_enable();
4493	return `0`;
4494	}
4495	if (state == TCP_TIME_WAIT) {
4496	struct inet_timewait_sock *tw = inet_twsk(sk);
4497
4498	refcount_inc(r: &tw->tw_refcnt);
4499	local_bh_disable();
4500	inet_twsk_deschedule_put(tw);
4501	local_bh_enable();
4502	return `0`;
4503	}
4504
4505	/ BPF context ensures sock locking. /
4506	if (!has_current_bpf_ctx())
4507	/ Don't race with userspace socket closes such as tcp_close. /
4508	lock_sock(sk);
4509
4510	if (sk->sk_state == TCP_LISTEN) {
4511	tcp_set_state(sk, TCP_CLOSE);
4512	inet_csk_listen_stop(sk);
4513	}
4514
4515	/ Don't race with BH socket closes such as inet_csk_listen_stop. /
4516	local_bh_disable();
4517	bh_lock_sock(sk);
4518
4519	if (!sock_flag(sk, flag: SOCK_DEAD)) {
4520	WRITE_ONCE(sk->sk_err, err);
4521	/ This barrier is coupled with smp_rmb() in tcp_poll() /
4522	smp_wmb();
4523	sk_error_report(sk);
4524	if (tcp_need_reset(state: sk->sk_state))
4525	tcp_send_active_reset(sk, GFP_ATOMIC);
4526	tcp_done(sk);
4527	}
4528
4529	bh_unlock_sock(sk);
4530	local_bh_enable();
4531	tcp_write_queue_purge(sk);
4532	if (!has_current_bpf_ctx())
4533	release_sock(sk);
4534	return `0`;
4535	}
4536	EXPORT_SYMBOL_GPL(tcp_abort);
4537
4538	extern struct tcp_congestion_ops tcp_reno;
4539
4540	static __initdata unsigned long thash_entries;
4541	static int __init set_thash_entries(char *str)
4542	{
4543	ssize_t ret;
4544
4545	if (!str)
4546	return `0`;
4547
4548	ret = kstrtoul(s: str, base: `0`, res: &thash_entries);
4549	if (ret)
4550	return `0`;
4551
4552	return `1`;
4553	}
4554	__setup("thash_entries=", set_thash_entries);
4555
4556	static void __init tcp_init_mem(void)
4557	{
4558	unsigned long limit = nr_free_buffer_pages() / `16`;
4559
4560	limit = max(limit, `128UL`);
4561	sysctl_tcp_mem[`0`] = limit / `4` * `3`; / 4.68 % /
4562	sysctl_tcp_mem[`1`] = limit; / 6.25 % /
4563	sysctl_tcp_mem[`2`] = sysctl_tcp_mem[`0`] * `2`; / 9.37 % /
4564	}
4565
4566	void __init tcp_init(void)
4567	{
4568	int max_rshare, max_wshare, cnt;
4569	unsigned long limit;
4570	unsigned int i;
4571
4572	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
4573	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
4574	sizeof_field(struct sk_buff, cb));
4575
4576	percpu_counter_init(&tcp_sockets_allocated, `0`, GFP_KERNEL);
4577
4578	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
4579	mod_timer(timer: &tcp_orphan_timer, expires: jiffies + TCP_ORPHAN_TIMER_PERIOD);
4580
4581	inet_hashinfo2_init(h: &tcp_hashinfo, name: "tcp_listen_portaddr_hash",
4582	numentries: thash_entries, scale: `21`, / one slot per 2 MB/
4583	low_limit: `0`, high_limit: `64` * `1024`);
4584	tcp_hashinfo.bind_bucket_cachep =
4585	kmem_cache_create(name: "tcp_bind_bucket",
4586	size: sizeof(struct inet_bind_bucket), align: `0`,
4587	SLAB_HWCACHE_ALIGN \| SLAB_PANIC \|
4588	SLAB_ACCOUNT,
4589	NULL);
4590	tcp_hashinfo.bind2_bucket_cachep =
4591	kmem_cache_create(name: "tcp_bind2_bucket",
4592	size: sizeof(struct inet_bind2_bucket), align: `0`,
4593	SLAB_HWCACHE_ALIGN \| SLAB_PANIC \|
4594	SLAB_ACCOUNT,
4595	NULL);
4596
4597	/ Size and allocate the main established and bind bucket*
4598	* hash tables.
4599	*
4600	* The methodology is similar to that of the buffer cache.
4601	*/
4602	tcp_hashinfo.ehash =
4603	alloc_large_system_hash(tablename: "TCP established",
4604	bucketsize: sizeof(struct inet_ehash_bucket),
4605	numentries: thash_entries,
4606	scale: `17`, / one slot per 128 KB of memory /
4607	flags: `0`,
4608	NULL,
4609	hash_mask: &tcp_hashinfo.ehash_mask,
4610	low_limit: `0`,
4611	high_limit: thash_entries ? `0` : `512` * `1024`);
4612	for (i = `0`; i <= tcp_hashinfo.ehash_mask; i++)
4613	INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
4614
4615	if (inet_ehash_locks_alloc(hashinfo: &tcp_hashinfo))
4616	panic(fmt: "TCP: failed to alloc ehash_locks");
4617	tcp_hashinfo.bhash =
4618	alloc_large_system_hash(tablename: "TCP bind",
4619	bucketsize: `2` * sizeof(struct inet_bind_hashbucket),
4620	numentries: tcp_hashinfo.ehash_mask + `1`,
4621	scale: `17`, / one slot per 128 KB of memory /
4622	flags: `0`,
4623	hash_shift: &tcp_hashinfo.bhash_size,
4624	NULL,
4625	low_limit: `0`,
4626	high_limit: `64` * `1024`);
4627	tcp_hashinfo.bhash_size = `1U` << tcp_hashinfo.bhash_size;
4628	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
4629	for (i = `0`; i < tcp_hashinfo.bhash_size; i++) {
4630	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4631	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4632	spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
4633	INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
4634	}
4635
4636	tcp_hashinfo.pernet = false;
4637
4638	cnt = tcp_hashinfo.ehash_mask + `1`;
4639	sysctl_tcp_max_orphans = cnt / `2`;
4640
4641	tcp_init_mem();
4642	/ Set per-socket limits to no more than 1/128 the pressure threshold /
4643	limit = nr_free_buffer_pages() << (PAGE_SHIFT - `7`);
4644	max_wshare = min(`4UL``1024``1024`, limit);
4645	max_rshare = min(`6UL``1024``1024`, limit);
4646
4647	init_net.ipv4.sysctl_tcp_wmem[`0`] = PAGE_SIZE;
4648	init_net.ipv4.sysctl_tcp_wmem[`1`] = `16`*`1024`;
4649	init_net.ipv4.sysctl_tcp_wmem[`2`] = max(`64`*`1024`, max_wshare);
4650
4651	init_net.ipv4.sysctl_tcp_rmem[`0`] = PAGE_SIZE;
4652	init_net.ipv4.sysctl_tcp_rmem[`1`] = `131072`;
4653	init_net.ipv4.sysctl_tcp_rmem[`2`] = max(`131072`, max_rshare);
4654
4655	pr_info("Hash tables configured (established %u bind %u)\n",
4656	tcp_hashinfo.ehash_mask + `1`, tcp_hashinfo.bhash_size);
4657
4658	tcp_v4_init();
4659	tcp_metrics_init();
4660	BUG_ON(tcp_register_congestion_control(&tcp_reno) != `0`);
4661	tcp_tasklet_init();
4662	mptcp_init();
4663	}
4664

source code of linux/net/ipv4/tcp.c