sock.c source code [linux/net/core/sock.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Generic socket support routines. Memory allocators, socket lock/release
8	* handler for protocols to use and generic option handler.
9	*
10	* Authors: Ross Biro
11	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12	* Florian La Roche, <flla@stud.uni-sb.de>
13	* Alan Cox, <A.Cox@swansea.ac.uk>
14	*
15	* Fixes:
16	* Alan Cox : Numerous verify_area() problems
17	* Alan Cox : Connecting on a connecting socket
18	* now returns an error for tcp.
19	* Alan Cox : sock->protocol is set correctly.
20	* and is not sometimes left as 0.
21	* Alan Cox : connect handles icmp errors on a
22	* connect properly. Unfortunately there
23	* is a restart syscall nasty there. I
24	* can't match BSD without hacking the C
25	* library. Ideas urgently sought!
26	* Alan Cox : Disallow bind() to addresses that are
27	* not ours - especially broadcast ones!!
28	* Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29	* Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30	* instead they leave that for the DESTROY timer.
31	* Alan Cox : Clean up error flag in accept
32	* Alan Cox : TCP ack handling is buggy, the DESTROY timer
33	* was buggy. Put a remove_sock() in the handler
34	* for memory when we hit 0. Also altered the timer
35	* code. The ACK stuff can wait and needs major
36	* TCP layer surgery.
37	* Alan Cox : Fixed TCP ack bug, removed remove sock
38	* and fixed timer/inet_bh race.
39	* Alan Cox : Added zapped flag for TCP
40	* Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41	* Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42	* Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43	* Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44	* Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45	* Rick Sladkey : Relaxed UDP rules for matching packets.
46	* C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47	* Pauline Middelink : identd support
48	* Alan Cox : Fixed connect() taking signals I think.
49	* Alan Cox : SO_LINGER supported
50	* Alan Cox : Error reporting fixes
51	* Anonymous : inet_create tidied up (sk->reuse setting)
52	* Alan Cox : inet sockets don't set sk->type!
53	* Alan Cox : Split socket option code
54	* Alan Cox : Callbacks
55	* Alan Cox : Nagle flag for Charles & Johannes stuff
56	* Alex : Removed restriction on inet fioctl
57	* Alan Cox : Splitting INET from NET core
58	* Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59	* Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60	* Alan Cox : Split IP from generic code
61	* Alan Cox : New kfree_skbmem()
62	* Alan Cox : Make SO_DEBUG superuser only.
63	* Alan Cox : Allow anyone to clear SO_DEBUG
64	* (compatibility fix)
65	* Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66	* Alan Cox : Allocator for a socket is settable.
67	* Alan Cox : SO_ERROR includes soft errors.
68	* Alan Cox : Allow NULL arguments on some SO_ opts
69	* Alan Cox : Generic socket allocation to make hooks
70	* easier (suggested by Craig Metz).
71	* Michael Pall : SO_ERROR returns positive errno again
72	* Steve Whitehouse: Added default destructor to free
73	* protocol private data.
74	* Steve Whitehouse: Added various other default routines
75	* common to several socket families.
76	* Chris Evans : Call suser() check last on F_SETOWN
77	* Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78	* Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79	* Andi Kleen : Fix write_space callback
80	* Chris Evans : Security fixes - signedness again
81	* Arnaldo C. Melo : cleanups, use skb_queue_purge
82	*
83	* To Fix:
84	*/
85
86	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88	#include <asm/unaligned.h>
89	#include <linux/capability.h>
90	#include <linux/errno.h>
91	#include <linux/errqueue.h>
92	#include <linux/types.h>
93	#include <linux/socket.h>
94	#include <linux/in.h>
95	#include <linux/kernel.h>
96	#include <linux/module.h>
97	#include <linux/proc_fs.h>
98	#include <linux/seq_file.h>
99	#include <linux/sched.h>
100	#include <linux/sched/mm.h>
101	#include <linux/timer.h>
102	#include <linux/string.h>
103	#include <linux/sockios.h>
104	#include <linux/net.h>
105	#include <linux/mm.h>
106	#include <linux/slab.h>
107	#include <linux/interrupt.h>
108	#include <linux/poll.h>
109	#include <linux/tcp.h>
110	#include <linux/init.h>
111	#include <linux/highmem.h>
112	#include <linux/user_namespace.h>
113	#include <linux/static_key.h>
114	#include <linux/memcontrol.h>
115	#include <linux/prefetch.h>
116	#include <linux/compat.h>
117	#include <linux/mroute.h>
118	#include <linux/mroute6.h>
119	#include <linux/icmpv6.h>
120
121	#include <linux/uaccess.h>
122
123	#include <linux/netdevice.h>
124	#include <net/protocol.h>
125	#include <linux/skbuff.h>
126	#include <net/net_namespace.h>
127	#include <net/request_sock.h>
128	#include <net/sock.h>
129	#include <linux/net_tstamp.h>
130	#include <net/xfrm.h>
131	#include <linux/ipsec.h>
132	#include <net/cls_cgroup.h>
133	#include <net/netprio_cgroup.h>
134	#include <linux/sock_diag.h>
135
136	#include <linux/filter.h>
137	#include <net/sock_reuseport.h>
138	#include <net/bpf_sk_storage.h>
139
140	#include <trace/events/sock.h>
141
142	#include <net/tcp.h>
143	#include <net/busy_poll.h>
144	#include <net/phonet/phonet.h>
145
146	#include <linux/ethtool.h>
147
148	#include "dev.h"
149
150	static DEFINE_MUTEX(proto_list_mutex);
151	static LIST_HEAD(proto_list);
152
153	static void sock_def_write_space_wfree(struct sock *sk);
154	static void sock_def_write_space(struct sock *sk);
155
156	/**
157	* sk_ns_capable - General socket capability test
158	* @sk: Socket to use a capability on or through
159	* @user_ns: The user namespace of the capability to use
160	* @cap: The capability to use
161	*
162	* Test to see if the opener of the socket had when the socket was
163	* created and the current process has the capability @cap in the user
164	* namespace @user_ns.
165	*/
166	bool sk_ns_capable(const struct sock *sk,
167	struct user_namespace user_ns, int* cap)
168	{
169	return file_ns_capable(file: sk->sk_socket->file, ns: user_ns, cap) &&
170	ns_capable(ns: user_ns, cap);
171	}
172	EXPORT_SYMBOL(sk_ns_capable);
173
174	/**
175	* sk_capable - Socket global capability test
176	* @sk: Socket to use a capability on or through
177	* @cap: The global capability to use
178	*
179	* Test to see if the opener of the socket had when the socket was
180	* created and the current process has the capability @cap in all user
181	* namespaces.
182	*/
183	bool sk_capable(const struct sock sk, int* cap)
184	{
185	return sk_ns_capable(sk, &init_user_ns, cap);
186	}
187	EXPORT_SYMBOL(sk_capable);
188
189	/**
190	* sk_net_capable - Network namespace socket capability test
191	* @sk: Socket to use a capability on or through
192	* @cap: The capability to use
193	*
194	* Test to see if the opener of the socket had when the socket was created
195	* and the current process has the capability @cap over the network namespace
196	* the socket is a member of.
197	*/
198	bool sk_net_capable(const struct sock sk, int* cap)
199	{
200	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
201	}
202	EXPORT_SYMBOL(sk_net_capable);
203
204	/*
205	* Each address family might have different locking rules, so we have
206	* one slock key per address family and separate keys for internal and
207	* userspace sockets.
208	*/
209	static struct lock_class_key af_family_keys[AF_MAX];
210	static struct lock_class_key af_family_kern_keys[AF_MAX];
211	static struct lock_class_key af_family_slock_keys[AF_MAX];
212	static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
213
214	/*
215	* Make lock validator output more readable. (we pre-construct these
216	* strings build-time, so that runtime initialization of socket
217	* locks is fast):
218	*/
219
220	#define _sock_locks(x) \
221	x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
222	x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
223	x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
224	x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
225	x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
226	x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
227	x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
228	x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
229	x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
230	x "27" , x "28" , x "AF_CAN" , \
231	x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
232	x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
233	x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
234	x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
235	x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
236	x "AF_MCTP" , \
237	x "AF_MAX"
238
239	static const char *const af_family_key_strings[AF_MAX+`1`] = {
240	_sock_locks("sk_lock-")
241	};
242	static const char *const af_family_slock_key_strings[AF_MAX+`1`] = {
243	_sock_locks("slock-")
244	};
245	static const char *const af_family_clock_key_strings[AF_MAX+`1`] = {
246	_sock_locks("clock-")
247	};
248
249	static const char *const af_family_kern_key_strings[AF_MAX+`1`] = {
250	_sock_locks("k-sk_lock-")
251	};
252	static const char *const af_family_kern_slock_key_strings[AF_MAX+`1`] = {
253	_sock_locks("k-slock-")
254	};
255	static const char *const af_family_kern_clock_key_strings[AF_MAX+`1`] = {
256	_sock_locks("k-clock-")
257	};
258	static const char *const af_family_rlock_key_strings[AF_MAX+`1`] = {
259	_sock_locks("rlock-")
260	};
261	static const char *const af_family_wlock_key_strings[AF_MAX+`1`] = {
262	_sock_locks("wlock-")
263	};
264	static const char *const af_family_elock_key_strings[AF_MAX+`1`] = {
265	_sock_locks("elock-")
266	};
267
268	/*
269	* sk_callback_lock and sk queues locking rules are per-address-family,
270	* so split the lock classes by using a per-AF key:
271	*/
272	static struct lock_class_key af_callback_keys[AF_MAX];
273	static struct lock_class_key af_rlock_keys[AF_MAX];
274	static struct lock_class_key af_wlock_keys[AF_MAX];
275	static struct lock_class_key af_elock_keys[AF_MAX];
276	static struct lock_class_key af_kern_callback_keys[AF_MAX];
277
278	/ Run time adjustable parameters. /
279	__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280	EXPORT_SYMBOL(sysctl_wmem_max);
281	__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282	EXPORT_SYMBOL(sysctl_rmem_max);
283	__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284	__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285
286	/ Maximal space eaten by iovec or ancillary data plus some space /
287	int sysctl_optmem_max __read_mostly = sizeof(unsigned long)(`2`UIO_MAXIOV+`512`);
288	EXPORT_SYMBOL(sysctl_optmem_max);
289
290	int sysctl_tstamp_allow_data __read_mostly = `1`;
291
292	DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
293	EXPORT_SYMBOL_GPL(memalloc_socks_key);
294
295	/**
296	* sk_set_memalloc - sets %SOCK_MEMALLOC
297	* @sk: socket to set it on
298	*
299	* Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
300	* It's the responsibility of the admin to adjust min_free_kbytes
301	* to meet the requirements
302	*/
303	void sk_set_memalloc(struct sock *sk)
304	{
305	sock_set_flag(sk, flag: SOCK_MEMALLOC);
306	sk->sk_allocation \|= __GFP_MEMALLOC;
307	static_branch_inc(&memalloc_socks_key);
308	}
309	EXPORT_SYMBOL_GPL(sk_set_memalloc);
310
311	void sk_clear_memalloc(struct sock *sk)
312	{
313	sock_reset_flag(sk, flag: SOCK_MEMALLOC);
314	sk->sk_allocation &= ~__GFP_MEMALLOC;
315	static_branch_dec(&memalloc_socks_key);
316
317	/*
318	* SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
319	* progress of swapping. SOCK_MEMALLOC may be cleared while
320	* it has rmem allocations due to the last swapfile being deactivated
321	* but there is a risk that the socket is unusable due to exceeding
322	* the rmem limits. Reclaim the reserves and obey rmem limits again.
323	*/
324	sk_mem_reclaim(sk);
325	}
326	EXPORT_SYMBOL_GPL(sk_clear_memalloc);
327
328	int __sk_backlog_rcv(struct sock sk, struct* sk_buff *skb)
329	{
330	int ret;
331	unsigned int noreclaim_flag;
332
333	/ these should have been dropped before queueing /
334	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
335
336	noreclaim_flag = memalloc_noreclaim_save();
337	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
338	tcp_v6_do_rcv,
339	tcp_v4_do_rcv,
340	sk, skb);
341	memalloc_noreclaim_restore(flags: noreclaim_flag);
342
343	return ret;
344	}
345	EXPORT_SYMBOL(__sk_backlog_rcv);
346
347	void sk_error_report(struct sock *sk)
348	{
349	sk->sk_error_report(sk);
350
351	switch (sk->sk_family) {
352	case AF_INET:
353	fallthrough;
354	case AF_INET6:
355	trace_inet_sk_error_report(sk);
356	break;
357	default:
358	break;
359	}
360	}
361	EXPORT_SYMBOL(sk_error_report);
362
363	int sock_get_timeout(long timeo, void *optval, bool old_timeval)
364	{
365	struct __kernel_sock_timeval tv;
366
367	if (timeo == MAX_SCHEDULE_TIMEOUT) {
368	tv.tv_sec = `0`;
369	tv.tv_usec = `0`;
370	} else {
371	tv.tv_sec = timeo / HZ;
372	tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
373	}
374
375	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
376	struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
377	(struct* old_timeval32 *)optval = tv32;
378	return sizeof(tv32);
379	}
380
381	if (old_timeval) {
382	struct __kernel_old_timeval old_tv;
383	old_tv.tv_sec = tv.tv_sec;
384	old_tv.tv_usec = tv.tv_usec;
385	(struct* __kernel_old_timeval *)optval = old_tv;
386	return sizeof(old_tv);
387	}
388
389	(struct* __kernel_sock_timeval *)optval = tv;
390	return sizeof(tv);
391	}
392	EXPORT_SYMBOL(sock_get_timeout);
393
394	int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
395	sockptr_t optval, int optlen, bool old_timeval)
396	{
397	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
398	struct old_timeval32 tv32;
399
400	if (optlen < sizeof(tv32))
401	return -EINVAL;
402
403	if (copy_from_sockptr(dst: &tv32, src: optval, size: sizeof(tv32)))
404	return -EFAULT;
405	tv->tv_sec = tv32.tv_sec;
406	tv->tv_usec = tv32.tv_usec;
407	} else if (old_timeval) {
408	struct __kernel_old_timeval old_tv;
409
410	if (optlen < sizeof(old_tv))
411	return -EINVAL;
412	if (copy_from_sockptr(dst: &old_tv, src: optval, size: sizeof(old_tv)))
413	return -EFAULT;
414	tv->tv_sec = old_tv.tv_sec;
415	tv->tv_usec = old_tv.tv_usec;
416	} else {
417	if (optlen < sizeof(*tv))
418	return -EINVAL;
419	if (copy_from_sockptr(dst: tv, src: optval, size: sizeof(*tv)))
420	return -EFAULT;
421	}
422
423	return `0`;
424	}
425	EXPORT_SYMBOL(sock_copy_user_timeval);
426
427	static int sock_set_timeout(long timeo_p, sockptr_t optval, int* optlen,
428	bool old_timeval)
429	{
430	struct __kernel_sock_timeval tv;
431	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
432	long val;
433
434	if (err)
435	return err;
436
437	if (tv.tv_usec < `0` \|\| tv.tv_usec >= USEC_PER_SEC)
438	return -EDOM;
439
440	if (tv.tv_sec < `0`) {
441	static int warned __read_mostly;
442
443	WRITE_ONCE(*timeo_p, `0`);
444	if (warned < `10` && net_ratelimit()) {
445	warned++;
446	pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
447	__func__, current->comm, task_pid_nr(current));
448	}
449	return `0`;
450	}
451	val = MAX_SCHEDULE_TIMEOUT;
452	if ((tv.tv_sec \|\| tv.tv_usec) &&
453	(tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - `1`)))
454	val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
455	USEC_PER_SEC / HZ);
456	WRITE_ONCE(*timeo_p, val);
457	return `0`;
458	}
459
460	static bool sock_needs_netstamp(const struct sock *sk)
461	{
462	switch (sk->sk_family) {
463	case AF_UNSPEC:
464	case AF_UNIX:
465	return false;
466	default:
467	return true;
468	}
469	}
470
471	static void sock_disable_timestamp(struct sock sk, unsigned* long flags)
472	{
473	if (sk->sk_flags & flags) {
474	sk->sk_flags &= ~flags;
475	if (sock_needs_netstamp(sk) &&
476	!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
477	net_disable_timestamp();
478	}
479	}
480
481
482	int __sock_queue_rcv_skb(struct sock sk, struct* sk_buff *skb)
483	{
484	unsigned long flags;
485	struct sk_buff_head *list = &sk->sk_receive_queue;
486
487	if (atomic_read(v: &sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
488	atomic_inc(v: &sk->sk_drops);
489	trace_sock_rcvqueue_full(sk, skb);
490	return -ENOMEM;
491	}
492
493	if (!sk_rmem_schedule(sk, skb, size: skb->truesize)) {
494	atomic_inc(v: &sk->sk_drops);
495	return -ENOBUFS;
496	}
497
498	skb->dev = NULL;
499	skb_set_owner_r(skb, sk);
500
501	/ we escape from rcu protected region, make sure we dont leak*
502	* a norefcounted dst
503	*/
504	skb_dst_force(skb);
505
506	spin_lock_irqsave(&list->lock, flags);
507	sock_skb_set_dropcount(sk, skb);
508	__skb_queue_tail(list, newsk: skb);
509	spin_unlock_irqrestore(lock: &list->lock, flags);
510
511	if (!sock_flag(sk, flag: SOCK_DEAD))
512	sk->sk_data_ready(sk);
513	return `0`;
514	}
515	EXPORT_SYMBOL(__sock_queue_rcv_skb);
516
517	int sock_queue_rcv_skb_reason(struct sock sk, struct* sk_buff *skb,
518	enum skb_drop_reason *reason)
519	{
520	enum skb_drop_reason drop_reason;
521	int err;
522
523	err = sk_filter(sk, skb);
524	if (err) {
525	drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
526	goto out;
527	}
528	err = __sock_queue_rcv_skb(sk, skb);
529	switch (err) {
530	case -ENOMEM:
531	drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
532	break;
533	case -ENOBUFS:
534	drop_reason = SKB_DROP_REASON_PROTO_MEM;
535	break;
536	default:
537	drop_reason = SKB_NOT_DROPPED_YET;
538	break;
539	}
540	out:
541	if (reason)
542	*reason = drop_reason;
543	return err;
544	}
545	EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
546
547	int __sk_receive_skb(struct sock sk, struct* sk_buff *skb,
548	const int nested, unsigned int trim_cap, bool refcounted)
549	{
550	int rc = NET_RX_SUCCESS;
551
552	if (sk_filter_trim_cap(sk, skb, cap: trim_cap))
553	goto discard_and_relse;
554
555	skb->dev = NULL;
556
557	if (sk_rcvqueues_full(sk, limit: sk->sk_rcvbuf)) {
558	atomic_inc(v: &sk->sk_drops);
559	goto discard_and_relse;
560	}
561	if (nested)
562	bh_lock_sock_nested(sk);
563	else
564	bh_lock_sock(sk);
565	if (!sock_owned_by_user(sk)) {
566	/*
567	* trylock + unlock semantics:
568	*/
569	mutex_acquire(&sk->sk_lock.dep_map, `0`, `1`, _RET_IP_);
570
571	rc = sk_backlog_rcv(sk, skb);
572
573	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
574	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
575	bh_unlock_sock(sk);
576	atomic_inc(v: &sk->sk_drops);
577	goto discard_and_relse;
578	}
579
580	bh_unlock_sock(sk);
581	out:
582	if (refcounted)
583	sock_put(sk);
584	return rc;
585	discard_and_relse:
586	kfree_skb(skb);
587	goto out;
588	}
589	EXPORT_SYMBOL(__sk_receive_skb);
590
591	INDIRECT_CALLABLE_DECLARE(struct dst_entry ip6_dst_check(struct* dst_entry *,
592	u32));
593	INDIRECT_CALLABLE_DECLARE(struct dst_entry ipv4_dst_check(struct* dst_entry *,
594	u32));
595	struct dst_entry __sk_dst_check(struct* sock *sk, u32 cookie)
596	{
597	struct dst_entry *dst = __sk_dst_get(sk);
598
599	if (dst && dst->obsolete &&
600	INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
601	dst, cookie) == NULL) {
602	sk_tx_queue_clear(sk);
603	WRITE_ONCE(sk->sk_dst_pending_confirm, `0`);
604	RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
605	dst_release(dst);
606	return NULL;
607	}
608
609	return dst;
610	}
611	EXPORT_SYMBOL(__sk_dst_check);
612
613	struct dst_entry sk_dst_check(struct* sock *sk, u32 cookie)
614	{
615	struct dst_entry *dst = sk_dst_get(sk);
616
617	if (dst && dst->obsolete &&
618	INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
619	dst, cookie) == NULL) {
620	sk_dst_reset(sk);
621	dst_release(dst);
622	return NULL;
623	}
624
625	return dst;
626	}
627	EXPORT_SYMBOL(sk_dst_check);
628
629	static int sock_bindtoindex_locked(struct sock sk, int* ifindex)
630	{
631	int ret = -ENOPROTOOPT;
632	#ifdef CONFIG_NETDEVICES
633	struct net *net = sock_net(sk);
634
635	/ Sorry... /
636	ret = -EPERM;
637	if (sk->sk_bound_dev_if && !ns_capable(ns: net->user_ns, CAP_NET_RAW))
638	goto out;
639
640	ret = -EINVAL;
641	if (ifindex < `0`)
642	goto out;
643
644	/ Paired with all READ_ONCE() done locklessly. /
645	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
646
647	if (sk->sk_prot->rehash)
648	sk->sk_prot->rehash(sk);
649	sk_dst_reset(sk);
650
651	ret = `0`;
652
653	out:
654	#endif
655
656	return ret;
657	}
658
659	int sock_bindtoindex(struct sock sk, int* ifindex, bool lock_sk)
660	{
661	int ret;
662
663	if (lock_sk)
664	lock_sock(sk);
665	ret = sock_bindtoindex_locked(sk, ifindex);
666	if (lock_sk)
667	release_sock(sk);
668
669	return ret;
670	}
671	EXPORT_SYMBOL(sock_bindtoindex);
672
673	static int sock_setbindtodevice(struct sock sk, sockptr_t optval, int* optlen)
674	{
675	int ret = -ENOPROTOOPT;
676	#ifdef CONFIG_NETDEVICES
677	struct net *net = sock_net(sk);
678	char devname[IFNAMSIZ];
679	int index;
680
681	ret = -EINVAL;
682	if (optlen < `0`)
683	goto out;
684
685	/ Bind this socket to a particular device like "eth0",*
686	* as specified in the passed interface name. If the
687	* name is "" or the option length is zero the socket
688	* is not bound.
689	*/
690	if (optlen > IFNAMSIZ - `1`)
691	optlen = IFNAMSIZ - `1`;
692	memset(devname, `0`, sizeof(devname));
693
694	ret = -EFAULT;
695	if (copy_from_sockptr(dst: devname, src: optval, size: optlen))
696	goto out;
697
698	index = `0`;
699	if (devname[`0`] != `'\0'`) {
700	struct net_device *dev;
701
702	rcu_read_lock();
703	dev = dev_get_by_name_rcu(net, name: devname);
704	if (dev)
705	index = dev->ifindex;
706	rcu_read_unlock();
707	ret = -ENODEV;
708	if (!dev)
709	goto out;
710	}
711
712	sockopt_lock_sock(sk);
713	ret = sock_bindtoindex_locked(sk, ifindex: index);
714	sockopt_release_sock(sk);
715	out:
716	#endif
717
718	return ret;
719	}
720
721	static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
722	sockptr_t optlen, int len)
723	{
724	int ret = -ENOPROTOOPT;
725	#ifdef CONFIG_NETDEVICES
726	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
727	struct net *net = sock_net(sk);
728	char devname[IFNAMSIZ];
729
730	if (bound_dev_if == `0`) {
731	len = `0`;
732	goto zero;
733	}
734
735	ret = -EINVAL;
736	if (len < IFNAMSIZ)
737	goto out;
738
739	ret = netdev_get_name(net, name: devname, ifindex: bound_dev_if);
740	if (ret)
741	goto out;
742
743	len = strlen(devname) + `1`;
744
745	ret = -EFAULT;
746	if (copy_to_sockptr(dst: optval, src: devname, size: len))
747	goto out;
748
749	zero:
750	ret = -EFAULT;
751	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
752	goto out;
753
754	ret = `0`;
755
756	out:
757	#endif
758
759	return ret;
760	}
761
762	bool sk_mc_loop(const struct sock *sk)
763	{
764	if (dev_recursion_level())
765	return false;
766	if (!sk)
767	return true;
768	/ IPV6_ADDRFORM can change sk->sk_family under us. /
769	switch (READ_ONCE(sk->sk_family)) {
770	case AF_INET:
771	return inet_test_bit(MC_LOOP, sk);
772	#if IS_ENABLED(CONFIG_IPV6)
773	case AF_INET6:
774	return inet6_test_bit(MC6_LOOP, sk);
775	#endif
776	}
777	WARN_ON_ONCE(`1`);
778	return true;
779	}
780	EXPORT_SYMBOL(sk_mc_loop);
781
782	void sock_set_reuseaddr(struct sock *sk)
783	{
784	lock_sock(sk);
785	sk->sk_reuse = SK_CAN_REUSE;
786	release_sock(sk);
787	}
788	EXPORT_SYMBOL(sock_set_reuseaddr);
789
790	void sock_set_reuseport(struct sock *sk)
791	{
792	lock_sock(sk);
793	sk->sk_reuseport = true;
794	release_sock(sk);
795	}
796	EXPORT_SYMBOL(sock_set_reuseport);
797
798	void sock_no_linger(struct sock *sk)
799	{
800	lock_sock(sk);
801	WRITE_ONCE(sk->sk_lingertime, `0`);
802	sock_set_flag(sk, flag: SOCK_LINGER);
803	release_sock(sk);
804	}
805	EXPORT_SYMBOL(sock_no_linger);
806
807	void sock_set_priority(struct sock *sk, u32 priority)
808	{
809	WRITE_ONCE(sk->sk_priority, priority);
810	}
811	EXPORT_SYMBOL(sock_set_priority);
812
813	void sock_set_sndtimeo(struct sock *sk, s64 secs)
814	{
815	lock_sock(sk);
816	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - `1`)
817	WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
818	else
819	WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
820	release_sock(sk);
821	}
822	EXPORT_SYMBOL(sock_set_sndtimeo);
823
824	static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
825	{
826	if (val) {
827	sock_valbool_flag(sk, bit: SOCK_TSTAMP_NEW, valbool: new);
828	sock_valbool_flag(sk, bit: SOCK_RCVTSTAMPNS, valbool: ns);
829	sock_set_flag(sk, flag: SOCK_RCVTSTAMP);
830	sock_enable_timestamp(sk, flag: SOCK_TIMESTAMP);
831	} else {
832	sock_reset_flag(sk, flag: SOCK_RCVTSTAMP);
833	sock_reset_flag(sk, flag: SOCK_RCVTSTAMPNS);
834	}
835	}
836
837	void sock_enable_timestamps(struct sock *sk)
838	{
839	lock_sock(sk);
840	__sock_set_timestamps(sk, val: true, new: false, ns: true);
841	release_sock(sk);
842	}
843	EXPORT_SYMBOL(sock_enable_timestamps);
844
845	void sock_set_timestamp(struct sock sk, int* optname, bool valbool)
846	{
847	switch (optname) {
848	case SO_TIMESTAMP_OLD:
849	__sock_set_timestamps(sk, val: valbool, new: false, ns: false);
850	break;
851	case SO_TIMESTAMP_NEW:
852	__sock_set_timestamps(sk, val: valbool, new: true, ns: false);
853	break;
854	case SO_TIMESTAMPNS_OLD:
855	__sock_set_timestamps(sk, val: valbool, new: false, ns: true);
856	break;
857	case SO_TIMESTAMPNS_NEW:
858	__sock_set_timestamps(sk, val: valbool, new: true, ns: true);
859	break;
860	}
861	}
862
863	static int sock_timestamping_bind_phc(struct sock sk, int* phc_index)
864	{
865	struct net *net = sock_net(sk);
866	struct net_device *dev = NULL;
867	bool match = false;
868	int *vclock_index;
869	int i, num;
870
871	if (sk->sk_bound_dev_if)
872	dev = dev_get_by_index(net, ifindex: sk->sk_bound_dev_if);
873
874	if (!dev) {
875	pr_err("%s: sock not bind to device\n", __func__);
876	return -EOPNOTSUPP;
877	}
878
879	num = ethtool_get_phc_vclocks(dev, vclock_index: &vclock_index);
880	dev_put(dev);
881
882	for (i = `0`; i < num; i++) {
883	if (*(vclock_index + i) == phc_index) {
884	match = true;
885	break;
886	}
887	}
888
889	if (num > `0`)
890	kfree(objp: vclock_index);
891
892	if (!match)
893	return -EINVAL;
894
895	WRITE_ONCE(sk->sk_bind_phc, phc_index);
896
897	return `0`;
898	}
899
900	int sock_set_timestamping(struct sock sk, int* optname,
901	struct so_timestamping timestamping)
902	{
903	int val = timestamping.flags;
904	int ret;
905
906	if (val & ~SOF_TIMESTAMPING_MASK)
907	return -EINVAL;
908
909	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
910	!(val & SOF_TIMESTAMPING_OPT_ID))
911	return -EINVAL;
912
913	if (val & SOF_TIMESTAMPING_OPT_ID &&
914	!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
915	if (sk_is_tcp(sk)) {
916	if ((`1` << sk->sk_state) &
917	(TCPF_CLOSE \| TCPF_LISTEN))
918	return -EINVAL;
919	if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
920	atomic_set(v: &sk->sk_tskey, tcp_sk(sk)->write_seq);
921	else
922	atomic_set(v: &sk->sk_tskey, tcp_sk(sk)->snd_una);
923	} else {
924	atomic_set(v: &sk->sk_tskey, i: `0`);
925	}
926	}
927
928	if (val & SOF_TIMESTAMPING_OPT_STATS &&
929	!(val & SOF_TIMESTAMPING_OPT_TSONLY))
930	return -EINVAL;
931
932	if (val & SOF_TIMESTAMPING_BIND_PHC) {
933	ret = sock_timestamping_bind_phc(sk, phc_index: timestamping.bind_phc);
934	if (ret)
935	return ret;
936	}
937
938	WRITE_ONCE(sk->sk_tsflags, val);
939	sock_valbool_flag(sk, bit: SOCK_TSTAMP_NEW, valbool: optname == SO_TIMESTAMPING_NEW);
940
941	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
942	sock_enable_timestamp(sk,
943	flag: SOCK_TIMESTAMPING_RX_SOFTWARE);
944	else
945	sock_disable_timestamp(sk,
946	flags: (`1UL` << SOCK_TIMESTAMPING_RX_SOFTWARE));
947	return `0`;
948	}
949
950	void sock_set_keepalive(struct sock *sk)
951	{
952	lock_sock(sk);
953	if (sk->sk_prot->keepalive)
954	sk->sk_prot->keepalive(sk, true);
955	sock_valbool_flag(sk, bit: SOCK_KEEPOPEN, valbool: true);
956	release_sock(sk);
957	}
958	EXPORT_SYMBOL(sock_set_keepalive);
959
960	static void __sock_set_rcvbuf(struct sock sk, int* val)
961	{
962	/ Ensure val * 2 fits into an int, to prevent max_t() from treating it*
963	* as a negative value.
964	*/
965	val = min_t(int, val, INT_MAX / `2`);
966	sk->sk_userlocks \|= SOCK_RCVBUF_LOCK;
967
968	/ We double it on the way in to account for "struct sk_buff" etc.*
969	* overhead. Applications assume that the SO_RCVBUF setting they make
970	* will allow that much actual data to be received on that socket.
971	*
972	* Applications are unaware that "struct sk_buff" and other overheads
973	* allocate from the receive buffer during socket buffer allocation.
974	*
975	* And after considering the possible alternatives, returning the value
976	* we actually used in getsockopt is the most desirable behavior.
977	*/
978	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * `2`, SOCK_MIN_RCVBUF));
979	}
980
/* Locked wrapper around __sock_set_rcvbuf(). */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
988
989	static void __sock_set_mark(struct sock *sk, u32 val)
990	{
991	if (val != sk->sk_mark) {
992	WRITE_ONCE(sk->sk_mark, val);
993	sk_dst_reset(sk);
994	}
995	}
996
997	void sock_set_mark(struct sock *sk, u32 val)
998	{
999	lock_sock(sk);
1000	__sock_set_mark(sk, val);
1001	release_sock(sk);
1002	}
1003	EXPORT_SYMBOL(sock_set_mark);
1004
1005	static void sock_release_reserved_memory(struct sock sk, int* bytes)
1006	{
1007	/ Round down bytes to multiple of pages /
1008	bytes = round_down(bytes, PAGE_SIZE);
1009
1010	WARN_ON(bytes > sk->sk_reserved_mem);
1011	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1012	sk_mem_reclaim(sk);
1013	}
1014
1015	static int sock_reserve_memory(struct sock sk, int* bytes)
1016	{
1017	long allocated;
1018	bool charged;
1019	int pages;
1020
1021	if (!mem_cgroup_sockets_enabled \|\| !sk->sk_memcg \|\| !sk_has_account(sk))
1022	return -EOPNOTSUPP;
1023
1024	if (!bytes)
1025	return `0`;
1026
1027	pages = sk_mem_pages(amt: bytes);
1028
1029	/ pre-charge to memcg /
1030	charged = mem_cgroup_charge_skmem(memcg: sk->sk_memcg, nr_pages: pages,
1031	GFP_KERNEL \| __GFP_RETRY_MAYFAIL);
1032	if (!charged)
1033	return -ENOMEM;
1034
1035	/ pre-charge to forward_alloc /
1036	sk_memory_allocated_add(sk, amt: pages);
1037	allocated = sk_memory_allocated(sk);
1038	/ If the system goes into memory pressure with this*
1039	* precharge, give up and return error.
1040	*/
1041	if (allocated > sk_prot_mem_limits(sk, index: `1`)) {
1042	sk_memory_allocated_sub(sk, amt: pages);
1043	mem_cgroup_uncharge_skmem(memcg: sk->sk_memcg, nr_pages: pages);
1044	return -ENOMEM;
1045	}
1046	sk_forward_alloc_add(sk, val: pages << PAGE_SHIFT);
1047
1048	WRITE_ONCE(sk->sk_reserved_mem,
1049	sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1050
1051	return `0`;
1052	}
1053
/*
 * Take the socket lock for a sockopt handler, unless the call comes from a
 * bpf program.
 */
void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog.  bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (has_current_bpf_ctx())
		return;

	lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1066
/*
 * Counterpart of sockopt_lock_sock(): release the socket lock unless the
 * call comes from a bpf program (has_current_bpf_ctx()).
 */
void sockopt_release_sock(struct sock *sk)
{
	if (has_current_bpf_ctx())
		return;

	release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1075
1076	bool sockopt_ns_capable(struct user_namespace ns, int* cap)
1077	{
1078	return has_current_bpf_ctx() \|\| ns_capable(ns, cap);
1079	}
1080	EXPORT_SYMBOL(sockopt_ns_capable);
1081
1082	bool sockopt_capable(int cap)
1083	{
1084	return has_current_bpf_ctx() \|\| capable(cap);
1085	}
1086	EXPORT_SYMBOL(sockopt_capable);
1087
1088	/*
1089	* This is meant for all protocols to use and covers goings on
1090	* at the socket level. Everything here is generic.
1091	*/
1092
1093	int sk_setsockopt(struct sock sk, int* level, int optname,
1094	sockptr_t optval, unsigned int optlen)
1095	{
1096	struct so_timestamping timestamping;
1097	struct socket *sock = sk->sk_socket;
1098	struct sock_txtime sk_txtime;
1099	int val;
1100	int valbool;
1101	struct linger ling;
1102	int ret = `0`;
1103
1104	/*
1105	* Options without arguments
1106	*/
1107
1108	if (optname == SO_BINDTODEVICE)
1109	return sock_setbindtodevice(sk, optval, optlen);
1110
1111	if (optlen < sizeof(int))
1112	return -EINVAL;
1113
1114	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
1115	return -EFAULT;
1116
1117	valbool = val ? `1` : `0`;
1118
1119	/ handle options which do not require locking the socket. /
1120	switch (optname) {
1121	case SO_PRIORITY:
1122	if ((val >= `0` && val <= `6`) \|\|
1123	sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) \|\|
1124	sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1125	sock_set_priority(sk, val);
1126	return `0`;
1127	}
1128	return -EPERM;
1129	case SO_PASSSEC:
1130	assign_bit(SOCK_PASSSEC, addr: &sock->flags, value: valbool);
1131	return `0`;
1132	case SO_PASSCRED:
1133	assign_bit(SOCK_PASSCRED, addr: &sock->flags, value: valbool);
1134	return `0`;
1135	case SO_PASSPIDFD:
1136	assign_bit(SOCK_PASSPIDFD, addr: &sock->flags, value: valbool);
1137	return `0`;
1138	case SO_TYPE:
1139	case SO_PROTOCOL:
1140	case SO_DOMAIN:
1141	case SO_ERROR:
1142	return -ENOPROTOOPT;
1143	#ifdef CONFIG_NET_RX_BUSY_POLL
1144	case SO_BUSY_POLL:
1145	if (val < `0`)
1146	return -EINVAL;
1147	WRITE_ONCE(sk->sk_ll_usec, val);
1148	return `0`;
1149	case SO_PREFER_BUSY_POLL:
1150	if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1151	return -EPERM;
1152	WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1153	return `0`;
1154	case SO_BUSY_POLL_BUDGET:
1155	if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1156	!sockopt_capable(CAP_NET_ADMIN))
1157	return -EPERM;
1158	if (val < `0` \|\| val > U16_MAX)
1159	return -EINVAL;
1160	WRITE_ONCE(sk->sk_busy_poll_budget, val);
1161	return `0`;
1162	#endif
1163	case SO_MAX_PACING_RATE:
1164	{
1165	unsigned long ulval = (val == ~`0U`) ? ~`0UL` : (unsigned int)val;
1166	unsigned long pacing_rate;
1167
1168	if (sizeof(ulval) != sizeof(val) &&
1169	optlen >= sizeof(ulval) &&
1170	copy_from_sockptr(dst: &ulval, src: optval, size: sizeof(ulval))) {
1171	return -EFAULT;
1172	}
1173	if (ulval != ~`0UL`)
1174	cmpxchg(&sk->sk_pacing_status,
1175	SK_PACING_NONE,
1176	SK_PACING_NEEDED);
1177	/ Pairs with READ_ONCE() from sk_getsockopt() /
1178	WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1179	pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1180	if (ulval < pacing_rate)
1181	WRITE_ONCE(sk->sk_pacing_rate, ulval);
1182	return `0`;
1183	}
1184	case SO_TXREHASH:
1185	if (val < -`1` \|\| val > `1`)
1186	return -EINVAL;
1187	if ((u8)val == SOCK_TXREHASH_DEFAULT)
1188	val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1189	/ Paired with READ_ONCE() in tcp_rtx_synack()*
1190	* and sk_getsockopt().
1191	*/
1192	WRITE_ONCE(sk->sk_txrehash, (u8)val);
1193	return `0`;
1194	}
1195
1196	sockopt_lock_sock(sk);
1197
1198	switch (optname) {
1199	case SO_DEBUG:
1200	if (val && !sockopt_capable(CAP_NET_ADMIN))
1201	ret = -EACCES;
1202	else
1203	sock_valbool_flag(sk, bit: SOCK_DBG, valbool);
1204	break;
1205	case SO_REUSEADDR:
1206	sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1207	break;
1208	case SO_REUSEPORT:
1209	sk->sk_reuseport = valbool;
1210	break;
1211	case SO_DONTROUTE:
1212	sock_valbool_flag(sk, bit: SOCK_LOCALROUTE, valbool);
1213	sk_dst_reset(sk);
1214	break;
1215	case SO_BROADCAST:
1216	sock_valbool_flag(sk, bit: SOCK_BROADCAST, valbool);
1217	break;
1218	case SO_SNDBUF:
1219	/ Don't error on this BSD doesn't and if you think*
1220	* about it this is right. Otherwise apps have to
1221	* play 'guess the biggest size' games. RCVBUF/SNDBUF
1222	* are treated in BSD as hints
1223	*/
1224	val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1225	set_sndbuf:
1226	/ Ensure val * 2 fits into an int, to prevent max_t()*
1227	* from treating it as a negative value.
1228	*/
1229	val = min_t(int, val, INT_MAX / `2`);
1230	sk->sk_userlocks \|= SOCK_SNDBUF_LOCK;
1231	WRITE_ONCE(sk->sk_sndbuf,
1232	max_t(int, val * `2`, SOCK_MIN_SNDBUF));
1233	/ Wake up sending tasks if we upped the value. /
1234	sk->sk_write_space(sk);
1235	break;
1236
1237	case SO_SNDBUFFORCE:
1238	if (!sockopt_capable(CAP_NET_ADMIN)) {
1239	ret = -EPERM;
1240	break;
1241	}
1242
1243	/ No negative values (to prevent underflow, as val will be*
1244	* multiplied by 2).
1245	*/
1246	if (val < `0`)
1247	val = `0`;
1248	goto set_sndbuf;
1249
1250	case SO_RCVBUF:
1251	/ Don't error on this BSD doesn't and if you think*
1252	* about it this is right. Otherwise apps have to
1253	* play 'guess the biggest size' games. RCVBUF/SNDBUF
1254	* are treated in BSD as hints
1255	*/
1256	__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1257	break;
1258
1259	case SO_RCVBUFFORCE:
1260	if (!sockopt_capable(CAP_NET_ADMIN)) {
1261	ret = -EPERM;
1262	break;
1263	}
1264
1265	/ No negative values (to prevent underflow, as val will be*
1266	* multiplied by 2).
1267	*/
1268	__sock_set_rcvbuf(sk, max(val, `0`));
1269	break;
1270
1271	case SO_KEEPALIVE:
1272	if (sk->sk_prot->keepalive)
1273	sk->sk_prot->keepalive(sk, valbool);
1274	sock_valbool_flag(sk, bit: SOCK_KEEPOPEN, valbool);
1275	break;
1276
1277	case SO_OOBINLINE:
1278	sock_valbool_flag(sk, bit: SOCK_URGINLINE, valbool);
1279	break;
1280
1281	case SO_NO_CHECK:
1282	sk->sk_no_check_tx = valbool;
1283	break;
1284
1285	case SO_LINGER:
1286	if (optlen < sizeof(ling)) {
1287	ret = -EINVAL; / 1003.1g /
1288	break;
1289	}
1290	if (copy_from_sockptr(dst: &ling, src: optval, size: sizeof(ling))) {
1291	ret = -EFAULT;
1292	break;
1293	}
1294	if (!ling.l_onoff) {
1295	sock_reset_flag(sk, flag: SOCK_LINGER);
1296	} else {
1297	unsigned long t_sec = ling.l_linger;
1298
1299	if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1300	WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1301	else
1302	WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1303	sock_set_flag(sk, flag: SOCK_LINGER);
1304	}
1305	break;
1306
1307	case SO_BSDCOMPAT:
1308	break;
1309
1310	case SO_TIMESTAMP_OLD:
1311	case SO_TIMESTAMP_NEW:
1312	case SO_TIMESTAMPNS_OLD:
1313	case SO_TIMESTAMPNS_NEW:
1314	sock_set_timestamp(sk, optname, valbool);
1315	break;
1316
1317	case SO_TIMESTAMPING_NEW:
1318	case SO_TIMESTAMPING_OLD:
1319	if (optlen == sizeof(timestamping)) {
1320	if (copy_from_sockptr(dst: &timestamping, src: optval,
1321	size: sizeof(timestamping))) {
1322	ret = -EFAULT;
1323	break;
1324	}
1325	} else {
1326	memset(&timestamping, `0`, sizeof(timestamping));
1327	timestamping.flags = val;
1328	}
1329	ret = sock_set_timestamping(sk, optname, timestamping);
1330	break;
1331
1332	case SO_RCVLOWAT:
1333	{
1334	int (set_rcvlowat)(struct* sock sk, int* val) = NULL;
1335
1336	if (val < `0`)
1337	val = INT_MAX;
1338	if (sock)
1339	set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1340	if (set_rcvlowat)
1341	ret = set_rcvlowat(sk, val);
1342	else
1343	WRITE_ONCE(sk->sk_rcvlowat, val ? : `1`);
1344	break;
1345	}
1346	case SO_RCVTIMEO_OLD:
1347	case SO_RCVTIMEO_NEW:
1348	ret = sock_set_timeout(timeo_p: &sk->sk_rcvtimeo, optval,
1349	optlen, old_timeval: optname == SO_RCVTIMEO_OLD);
1350	break;
1351
1352	case SO_SNDTIMEO_OLD:
1353	case SO_SNDTIMEO_NEW:
1354	ret = sock_set_timeout(timeo_p: &sk->sk_sndtimeo, optval,
1355	optlen, old_timeval: optname == SO_SNDTIMEO_OLD);
1356	break;
1357
1358	case SO_ATTACH_FILTER: {
1359	struct sock_fprog fprog;
1360
1361	ret = copy_bpf_fprog_from_user(dst: &fprog, src: optval, len: optlen);
1362	if (!ret)
1363	ret = sk_attach_filter(fprog: &fprog, sk);
1364	break;
1365	}
1366	case SO_ATTACH_BPF:
1367	ret = -EINVAL;
1368	if (optlen == sizeof(u32)) {
1369	u32 ufd;
1370
1371	ret = -EFAULT;
1372	if (copy_from_sockptr(dst: &ufd, src: optval, size: sizeof(ufd)))
1373	break;
1374
1375	ret = sk_attach_bpf(ufd, sk);
1376	}
1377	break;
1378
1379	case SO_ATTACH_REUSEPORT_CBPF: {
1380	struct sock_fprog fprog;
1381
1382	ret = copy_bpf_fprog_from_user(dst: &fprog, src: optval, len: optlen);
1383	if (!ret)
1384	ret = sk_reuseport_attach_filter(fprog: &fprog, sk);
1385	break;
1386	}
1387	case SO_ATTACH_REUSEPORT_EBPF:
1388	ret = -EINVAL;
1389	if (optlen == sizeof(u32)) {
1390	u32 ufd;
1391
1392	ret = -EFAULT;
1393	if (copy_from_sockptr(dst: &ufd, src: optval, size: sizeof(ufd)))
1394	break;
1395
1396	ret = sk_reuseport_attach_bpf(ufd, sk);
1397	}
1398	break;
1399
1400	case SO_DETACH_REUSEPORT_BPF:
1401	ret = reuseport_detach_prog(sk);
1402	break;
1403
1404	case SO_DETACH_FILTER:
1405	ret = sk_detach_filter(sk);
1406	break;
1407
1408	case SO_LOCK_FILTER:
1409	if (sock_flag(sk, flag: SOCK_FILTER_LOCKED) && !valbool)
1410	ret = -EPERM;
1411	else
1412	sock_valbool_flag(sk, bit: SOCK_FILTER_LOCKED, valbool);
1413	break;
1414
1415	case SO_MARK:
1416	if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1417	!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1418	ret = -EPERM;
1419	break;
1420	}
1421
1422	__sock_set_mark(sk, val);
1423	break;
1424	case SO_RCVMARK:
1425	sock_valbool_flag(sk, bit: SOCK_RCVMARK, valbool);
1426	break;
1427
1428	case SO_RXQ_OVFL:
1429	sock_valbool_flag(sk, bit: SOCK_RXQ_OVFL, valbool);
1430	break;
1431
1432	case SO_WIFI_STATUS:
1433	sock_valbool_flag(sk, bit: SOCK_WIFI_STATUS, valbool);
1434	break;
1435
1436	case SO_PEEK_OFF:
1437	{
1438	int (set_peek_off)(struct* sock sk, int* val);
1439
1440	set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1441	if (set_peek_off)
1442	ret = set_peek_off(sk, val);
1443	else
1444	ret = -EOPNOTSUPP;
1445	break;
1446	}
1447
1448	case SO_NOFCS:
1449	sock_valbool_flag(sk, bit: SOCK_NOFCS, valbool);
1450	break;
1451
1452	case SO_SELECT_ERR_QUEUE:
1453	sock_valbool_flag(sk, bit: SOCK_SELECT_ERR_QUEUE, valbool);
1454	break;
1455
1456
1457	case SO_INCOMING_CPU:
1458	reuseport_update_incoming_cpu(sk, val);
1459	break;
1460
1461	case SO_CNX_ADVICE:
1462	if (val == `1`)
1463	dst_negative_advice(sk);
1464	break;
1465
1466	case SO_ZEROCOPY:
1467	if (sk->sk_family == PF_INET \|\| sk->sk_family == PF_INET6) {
1468	if (!(sk_is_tcp(sk) \|\|
1469	(sk->sk_type == SOCK_DGRAM &&
1470	sk->sk_protocol == IPPROTO_UDP)))
1471	ret = -EOPNOTSUPP;
1472	} else if (sk->sk_family != PF_RDS) {
1473	ret = -EOPNOTSUPP;
1474	}
1475	if (!ret) {
1476	if (val < `0` \|\| val > `1`)
1477	ret = -EINVAL;
1478	else
1479	sock_valbool_flag(sk, bit: SOCK_ZEROCOPY, valbool);
1480	}
1481	break;
1482
1483	case SO_TXTIME:
1484	if (optlen != sizeof(struct sock_txtime)) {
1485	ret = -EINVAL;
1486	break;
1487	} else if (copy_from_sockptr(dst: &sk_txtime, src: optval,
1488	size: sizeof(struct sock_txtime))) {
1489	ret = -EFAULT;
1490	break;
1491	} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1492	ret = -EINVAL;
1493	break;
1494	}
1495	/ CLOCK_MONOTONIC is only used by sch_fq, and this packet*
1496	* scheduler has enough safe guards.
1497	*/
1498	if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1499	!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1500	ret = -EPERM;
1501	break;
1502	}
1503	sock_valbool_flag(sk, bit: SOCK_TXTIME, valbool: true);
1504	sk->sk_clockid = sk_txtime.clockid;
1505	sk->sk_txtime_deadline_mode =
1506	!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1507	sk->sk_txtime_report_errors =
1508	!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1509	break;
1510
1511	case SO_BINDTOIFINDEX:
1512	ret = sock_bindtoindex_locked(sk, ifindex: val);
1513	break;
1514
1515	case SO_BUF_LOCK:
1516	if (val & ~SOCK_BUF_LOCK_MASK) {
1517	ret = -EINVAL;
1518	break;
1519	}
1520	sk->sk_userlocks = val \| (sk->sk_userlocks &
1521	~SOCK_BUF_LOCK_MASK);
1522	break;
1523
1524	case SO_RESERVE_MEM:
1525	{
1526	int delta;
1527
1528	if (val < `0`) {
1529	ret = -EINVAL;
1530	break;
1531	}
1532
1533	delta = val - sk->sk_reserved_mem;
1534	if (delta < `0`)
1535	sock_release_reserved_memory(sk, bytes: -delta);
1536	else
1537	ret = sock_reserve_memory(sk, bytes: delta);
1538	break;
1539	}
1540
1541	default:
1542	ret = -ENOPROTOOPT;
1543	break;
1544	}
1545	sockopt_release_sock(sk);
1546	return ret;
1547	}
1548
1549	int sock_setsockopt(struct socket sock, int* level, int optname,
1550	sockptr_t optval, unsigned int optlen)
1551	{
1552	return sk_setsockopt(sk: sock->sk, level, optname,
1553	optval, optlen);
1554	}
1555	EXPORT_SYMBOL(sock_setsockopt);
1556
1557	static const struct cred sk_get_peer_cred(struct* sock *sk)
1558	{
1559	const struct cred *cred;
1560
1561	spin_lock(lock: &sk->sk_peer_lock);
1562	cred = get_cred(cred: sk->sk_peer_cred);
1563	spin_unlock(lock: &sk->sk_peer_lock);
1564
1565	return cred;
1566	}
1567
1568	static void cred_to_ucred(struct pid pid, const* struct cred *cred,
1569	struct ucred *ucred)
1570	{
1571	ucred->pid = pid_vnr(pid);
1572	ucred->uid = ucred->gid = -`1`;
1573	if (cred) {
1574	struct user_namespace *current_ns = current_user_ns();
1575
1576	ucred->uid = from_kuid_munged(to: current_ns, uid: cred->euid);
1577	ucred->gid = from_kgid_munged(to: current_ns, gid: cred->egid);
1578	}
1579	}
1580
1581	static int groups_to_user(sockptr_t dst, const struct group_info *src)
1582	{
1583	struct user_namespace *user_ns = current_user_ns();
1584	int i;
1585
1586	for (i = `0`; i < src->ngroups; i++) {
1587	gid_t gid = from_kgid_munged(to: user_ns, gid: src->gid[i]);
1588
1589	if (copy_to_sockptr_offset(dst, offset: i * sizeof(gid), src: &gid, size: sizeof(gid)))
1590	return -EFAULT;
1591	}
1592
1593	return `0`;
1594	}
1595
1596	int sk_getsockopt(struct sock sk, int* level, int optname,
1597	sockptr_t optval, sockptr_t optlen)
1598	{
1599	struct socket *sock = sk->sk_socket;
1600
1601	union {
1602	int val;
1603	u64 val64;
1604	unsigned long ulval;
1605	struct linger ling;
1606	struct old_timeval32 tm32;
1607	struct __kernel_old_timeval tm;
1608	struct __kernel_sock_timeval stm;
1609	struct sock_txtime txtime;
1610	struct so_timestamping timestamping;
1611	} v;
1612
1613	int lv = sizeof(int);
1614	int len;
1615
1616	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
1617	return -EFAULT;
1618	if (len < `0`)
1619	return -EINVAL;
1620
1621	memset(&v, `0`, sizeof(v));
1622
1623	switch (optname) {
1624	case SO_DEBUG:
1625	v.val = sock_flag(sk, flag: SOCK_DBG);
1626	break;
1627
1628	case SO_DONTROUTE:
1629	v.val = sock_flag(sk, flag: SOCK_LOCALROUTE);
1630	break;
1631
1632	case SO_BROADCAST:
1633	v.val = sock_flag(sk, flag: SOCK_BROADCAST);
1634	break;
1635
1636	case SO_SNDBUF:
1637	v.val = READ_ONCE(sk->sk_sndbuf);
1638	break;
1639
1640	case SO_RCVBUF:
1641	v.val = READ_ONCE(sk->sk_rcvbuf);
1642	break;
1643
1644	case SO_REUSEADDR:
1645	v.val = sk->sk_reuse;
1646	break;
1647
1648	case SO_REUSEPORT:
1649	v.val = sk->sk_reuseport;
1650	break;
1651
1652	case SO_KEEPALIVE:
1653	v.val = sock_flag(sk, flag: SOCK_KEEPOPEN);
1654	break;
1655
1656	case SO_TYPE:
1657	v.val = sk->sk_type;
1658	break;
1659
1660	case SO_PROTOCOL:
1661	v.val = sk->sk_protocol;
1662	break;
1663
1664	case SO_DOMAIN:
1665	v.val = sk->sk_family;
1666	break;
1667
1668	case SO_ERROR:
1669	v.val = -sock_error(sk);
1670	if (v.val == `0`)
1671	v.val = xchg(&sk->sk_err_soft, `0`);
1672	break;
1673
1674	case SO_OOBINLINE:
1675	v.val = sock_flag(sk, flag: SOCK_URGINLINE);
1676	break;
1677
1678	case SO_NO_CHECK:
1679	v.val = sk->sk_no_check_tx;
1680	break;
1681
1682	case SO_PRIORITY:
1683	v.val = READ_ONCE(sk->sk_priority);
1684	break;
1685
1686	case SO_LINGER:
1687	lv = sizeof(v.ling);
1688	v.ling.l_onoff = sock_flag(sk, flag: SOCK_LINGER);
1689	v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1690	break;
1691
1692	case SO_BSDCOMPAT:
1693	break;
1694
1695	case SO_TIMESTAMP_OLD:
1696	v.val = sock_flag(sk, flag: SOCK_RCVTSTAMP) &&
1697	!sock_flag(sk, flag: SOCK_TSTAMP_NEW) &&
1698	!sock_flag(sk, flag: SOCK_RCVTSTAMPNS);
1699	break;
1700
1701	case SO_TIMESTAMPNS_OLD:
1702	v.val = sock_flag(sk, flag: SOCK_RCVTSTAMPNS) && !sock_flag(sk, flag: SOCK_TSTAMP_NEW);
1703	break;
1704
1705	case SO_TIMESTAMP_NEW:
1706	v.val = sock_flag(sk, flag: SOCK_RCVTSTAMP) && sock_flag(sk, flag: SOCK_TSTAMP_NEW);
1707	break;
1708
1709	case SO_TIMESTAMPNS_NEW:
1710	v.val = sock_flag(sk, flag: SOCK_RCVTSTAMPNS) && sock_flag(sk, flag: SOCK_TSTAMP_NEW);
1711	break;
1712
1713	case SO_TIMESTAMPING_OLD:
1714	lv = sizeof(v.timestamping);
1715	v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1716	v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1717	break;
1718
1719	case SO_RCVTIMEO_OLD:
1720	case SO_RCVTIMEO_NEW:
1721	lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1722	SO_RCVTIMEO_OLD == optname);
1723	break;
1724
1725	case SO_SNDTIMEO_OLD:
1726	case SO_SNDTIMEO_NEW:
1727	lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1728	SO_SNDTIMEO_OLD == optname);
1729	break;
1730
1731	case SO_RCVLOWAT:
1732	v.val = READ_ONCE(sk->sk_rcvlowat);
1733	break;
1734
1735	case SO_SNDLOWAT:
1736	v.val = `1`;
1737	break;
1738
1739	case SO_PASSCRED:
1740	v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1741	break;
1742
1743	case SO_PASSPIDFD:
1744	v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1745	break;
1746
1747	case SO_PEERCRED:
1748	{
1749	struct ucred peercred;
1750	if (len > sizeof(peercred))
1751	len = sizeof(peercred);
1752
1753	spin_lock(lock: &sk->sk_peer_lock);
1754	cred_to_ucred(pid: sk->sk_peer_pid, cred: sk->sk_peer_cred, ucred: &peercred);
1755	spin_unlock(lock: &sk->sk_peer_lock);
1756
1757	if (copy_to_sockptr(dst: optval, src: &peercred, size: len))
1758	return -EFAULT;
1759	goto lenout;
1760	}
1761
1762	case SO_PEERPIDFD:
1763	{
1764	struct pid *peer_pid;
1765	struct file *pidfd_file = NULL;
1766	int pidfd;
1767
1768	if (len > sizeof(pidfd))
1769	len = sizeof(pidfd);
1770
1771	spin_lock(lock: &sk->sk_peer_lock);
1772	peer_pid = get_pid(pid: sk->sk_peer_pid);
1773	spin_unlock(lock: &sk->sk_peer_lock);
1774
1775	if (!peer_pid)
1776	return -ENODATA;
1777
1778	pidfd = pidfd_prepare(pid: peer_pid, flags: `0`, ret: &pidfd_file);
1779	put_pid(pid: peer_pid);
1780	if (pidfd < `0`)
1781	return pidfd;
1782
1783	if (copy_to_sockptr(dst: optval, src: &pidfd, size: len) \|\|
1784	copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int))) {
1785	put_unused_fd(fd: pidfd);
1786	fput(pidfd_file);
1787
1788	return -EFAULT;
1789	}
1790
1791	fd_install(fd: pidfd, file: pidfd_file);
1792	return `0`;
1793	}
1794
1795	case SO_PEERGROUPS:
1796	{
1797	const struct cred *cred;
1798	int ret, n;
1799
1800	cred = sk_get_peer_cred(sk);
1801	if (!cred)
1802	return -ENODATA;
1803
1804	n = cred->group_info->ngroups;
1805	if (len < n * sizeof(gid_t)) {
1806	len = n * sizeof(gid_t);
1807	put_cred(cred);
1808	return copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)) ? -EFAULT : -ERANGE;
1809	}
1810	len = n * sizeof(gid_t);
1811
1812	ret = groups_to_user(dst: optval, src: cred->group_info);
1813	put_cred(cred);
1814	if (ret)
1815	return ret;
1816	goto lenout;
1817	}
1818
1819	case SO_PEERNAME:
1820	{
1821	struct sockaddr_storage address;
1822
1823	lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, `2`);
1824	if (lv < `0`)
1825	return -ENOTCONN;
1826	if (lv < len)
1827	return -EINVAL;
1828	if (copy_to_sockptr(dst: optval, src: &address, size: len))
1829	return -EFAULT;
1830	goto lenout;
1831	}
1832
1833	/ Dubious BSD thing... Probably nobody even uses it, but*
1834	* the UNIX standard wants it for whatever reason... -DaveM
1835	*/
1836	case SO_ACCEPTCONN:
1837	v.val = sk->sk_state == TCP_LISTEN;
1838	break;
1839
1840	case SO_PASSSEC:
1841	v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1842	break;
1843
1844	case SO_PEERSEC:
1845	return security_socket_getpeersec_stream(sock,
1846	optval, optlen, len);
1847
1848	case SO_MARK:
1849	v.val = READ_ONCE(sk->sk_mark);
1850	break;
1851
1852	case SO_RCVMARK:
1853	v.val = sock_flag(sk, flag: SOCK_RCVMARK);
1854	break;
1855
1856	case SO_RXQ_OVFL:
1857	v.val = sock_flag(sk, flag: SOCK_RXQ_OVFL);
1858	break;
1859
1860	case SO_WIFI_STATUS:
1861	v.val = sock_flag(sk, flag: SOCK_WIFI_STATUS);
1862	break;
1863
1864	case SO_PEEK_OFF:
1865	if (!READ_ONCE(sock->ops)->set_peek_off)
1866	return -EOPNOTSUPP;
1867
1868	v.val = READ_ONCE(sk->sk_peek_off);
1869	break;
1870	case SO_NOFCS:
1871	v.val = sock_flag(sk, flag: SOCK_NOFCS);
1872	break;
1873
1874	case SO_BINDTODEVICE:
1875	return sock_getbindtodevice(sk, optval, optlen, len);
1876
1877	case SO_GET_FILTER:
1878	len = sk_get_filter(sk, optval, len);
1879	if (len < `0`)
1880	return len;
1881
1882	goto lenout;
1883
1884	case SO_LOCK_FILTER:
1885	v.val = sock_flag(sk, flag: SOCK_FILTER_LOCKED);
1886	break;
1887
1888	case SO_BPF_EXTENSIONS:
1889	v.val = bpf_tell_extensions();
1890	break;
1891
1892	case SO_SELECT_ERR_QUEUE:
1893	v.val = sock_flag(sk, flag: SOCK_SELECT_ERR_QUEUE);
1894	break;
1895
1896	#ifdef CONFIG_NET_RX_BUSY_POLL
1897	case SO_BUSY_POLL:
1898	v.val = READ_ONCE(sk->sk_ll_usec);
1899	break;
1900	case SO_PREFER_BUSY_POLL:
1901	v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1902	break;
1903	#endif
1904
1905	case SO_MAX_PACING_RATE:
1906	/ The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() /
1907	if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1908	lv = sizeof(v.ulval);
1909	v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1910	} else {
1911	/ 32bit version /
1912	v.val = min_t(unsigned long, ~`0U`,
1913	READ_ONCE(sk->sk_max_pacing_rate));
1914	}
1915	break;
1916
1917	case SO_INCOMING_CPU:
1918	v.val = READ_ONCE(sk->sk_incoming_cpu);
1919	break;
1920
1921	case SO_MEMINFO:
1922	{
1923	u32 meminfo[SK_MEMINFO_VARS];
1924
1925	sk_get_meminfo(sk, meminfo);
1926
1927	len = min_t(unsigned int, len, sizeof(meminfo));
1928	if (copy_to_sockptr(dst: optval, src: &meminfo, size: len))
1929	return -EFAULT;
1930
1931	goto lenout;
1932	}
1933
1934	#ifdef CONFIG_NET_RX_BUSY_POLL
1935	case SO_INCOMING_NAPI_ID:
1936	v.val = READ_ONCE(sk->sk_napi_id);
1937
1938	/ aggregate non-NAPI IDs down to 0 /
1939	if (v.val < MIN_NAPI_ID)
1940	v.val = `0`;
1941
1942	break;
1943	#endif
1944
1945	case SO_COOKIE:
1946	lv = sizeof(u64);
1947	if (len < lv)
1948	return -EINVAL;
1949	v.val64 = sock_gen_cookie(sk);
1950	break;
1951
1952	case SO_ZEROCOPY:
1953	v.val = sock_flag(sk, flag: SOCK_ZEROCOPY);
1954	break;
1955
1956	case SO_TXTIME:
1957	lv = sizeof(v.txtime);
1958	v.txtime.clockid = sk->sk_clockid;
1959	v.txtime.flags \|= sk->sk_txtime_deadline_mode ?
1960	SOF_TXTIME_DEADLINE_MODE : `0`;
1961	v.txtime.flags \|= sk->sk_txtime_report_errors ?
1962	SOF_TXTIME_REPORT_ERRORS : `0`;
1963	break;
1964
1965	case SO_BINDTOIFINDEX:
1966	v.val = READ_ONCE(sk->sk_bound_dev_if);
1967	break;
1968
1969	case SO_NETNS_COOKIE:
1970	lv = sizeof(u64);
1971	if (len != lv)
1972	return -EINVAL;
1973	v.val64 = sock_net(sk)->net_cookie;
1974	break;
1975
1976	case SO_BUF_LOCK:
1977	v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1978	break;
1979
1980	case SO_RESERVE_MEM:
1981	v.val = READ_ONCE(sk->sk_reserved_mem);
1982	break;
1983
1984	case SO_TXREHASH:
1985	/ Paired with WRITE_ONCE() in sk_setsockopt() /
1986	v.val = READ_ONCE(sk->sk_txrehash);
1987	break;
1988
1989	default:
1990	/ We implement the SO_SNDLOWAT etc to not be settable*
1991	* (1003.1g 7).
1992	*/
1993	return -ENOPROTOOPT;
1994	}
1995
1996	if (len > lv)
1997	len = lv;
1998	if (copy_to_sockptr(dst: optval, src: &v, size: len))
1999	return -EFAULT;
2000	lenout:
2001	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
2002	return -EFAULT;
2003	return `0`;
2004	}
2005
2006	/*
2007	* Initialize an sk_lock.
2008	*
2009	* (We also register the sk_lock with the lock validator.)
2010	*/
2011	static inline void sock_lock_init(struct sock *sk)
2012	{
2013	if (sk->sk_kern_sock)
2014	sock_lock_init_class_and_name(
2015	sk,
2016	af_family_kern_slock_key_strings[sk->sk_family],
2017	af_family_kern_slock_keys + sk->sk_family,
2018	af_family_kern_key_strings[sk->sk_family],
2019	af_family_kern_keys + sk->sk_family);
2020	else
2021	sock_lock_init_class_and_name(
2022	sk,
2023	af_family_slock_key_strings[sk->sk_family],
2024	af_family_slock_keys + sk->sk_family,
2025	af_family_key_strings[sk->sk_family],
2026	af_family_keys + sk->sk_family);
2027	}
2028
2029	/*
2030	* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2031	* even temporarily, because of RCU lookups. sk_node should also be left as is.
2032	* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2033	*/
2034	static void sock_copy(struct sock nsk, const* struct sock *osk)
2035	{
2036	const struct proto *prot = READ_ONCE(osk->sk_prot);
2037	#ifdef CONFIG_SECURITY_NETWORK
2038	void *sptr = nsk->sk_security;
2039	#endif
2040
2041	/ If we move sk_tx_queue_mapping out of the private section,*
2042	* we must check if sk_tx_queue_clear() is called after
2043	* sock_copy() in sk_clone_lock().
2044	*/
2045	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2046	offsetof(struct sock, sk_dontcopy_begin) \|\|
2047	offsetof(struct sock, sk_tx_queue_mapping) >=
2048	offsetof(struct sock, sk_dontcopy_end));
2049
2050	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2051
2052	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2053	prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2054
2055	#ifdef CONFIG_SECURITY_NETWORK
2056	nsk->sk_security = sptr;
2057	security_sk_clone(sk: osk, newsk: nsk);
2058	#endif
2059	}
2060
2061	static struct sock sk_prot_alloc(struct* proto *prot, gfp_t priority,
2062	int family)
2063	{
2064	struct sock *sk;
2065	struct kmem_cache *slab;
2066
2067	slab = prot->slab;
2068	if (slab != NULL) {
2069	sk = kmem_cache_alloc(cachep: slab, flags: priority & ~__GFP_ZERO);
2070	if (!sk)
2071	return sk;
2072	if (want_init_on_alloc(flags: priority))
2073	sk_prot_clear_nulls(sk, size: prot->obj_size);
2074	} else
2075	sk = kmalloc(size: prot->obj_size, flags: priority);
2076
2077	if (sk != NULL) {
2078	if (security_sk_alloc(sk, family, priority))
2079	goto out_free;
2080
2081	if (!try_module_get(module: prot->owner))
2082	goto out_free_sec;
2083	}
2084
2085	return sk;
2086
2087	out_free_sec:
2088	security_sk_free(sk);
2089	out_free:
2090	if (slab != NULL)
2091	kmem_cache_free(s: slab, objp: sk);
2092	else
2093	kfree(objp: sk);
2094	return NULL;
2095	}
2096
2097	static void sk_prot_free(struct proto prot, struct* sock *sk)
2098	{
2099	struct kmem_cache *slab;
2100	struct module *owner;
2101
2102	owner = prot->owner;
2103	slab = prot->slab;
2104
2105	cgroup_sk_free(skcd: &sk->sk_cgrp_data);
2106	mem_cgroup_sk_free(sk);
2107	security_sk_free(sk);
2108	if (slab != NULL)
2109	kmem_cache_free(s: slab, objp: sk);
2110	else
2111	kfree(objp: sk);
2112	module_put(module: owner);
2113	}
2114
2115	/**
2116	* sk_alloc - All socket objects are allocated here
2117	* @net: the applicable net namespace
2118	* @family: protocol family
2119	* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2120	* @prot: struct proto associated with this new sock instance
2121	* @kern: is this to be a kernel socket?
2122	*/
2123	struct sock sk_alloc(struct* net net, int* family, gfp_t priority,
2124	struct proto prot, int* kern)
2125	{
2126	struct sock *sk;
2127
2128	sk = sk_prot_alloc(prot, priority: priority \| __GFP_ZERO, family);
2129	if (sk) {
2130	sk->sk_family = family;
2131	/*
2132	* See comment in struct sock definition to understand
2133	* why we need sk_prot_creator -acme
2134	*/
2135	sk->sk_prot = sk->sk_prot_creator = prot;
2136	sk->sk_kern_sock = kern;
2137	sock_lock_init(sk);
2138	sk->sk_net_refcnt = kern ? `0` : `1`;
2139	if (likely(sk->sk_net_refcnt)) {
2140	get_net_track(net, tracker: &sk->ns_tracker, gfp: priority);
2141	sock_inuse_add(net, val: `1`);
2142	} else {
2143	__netns_tracker_alloc(net, tracker: &sk->ns_tracker,
2144	refcounted: false, gfp: priority);
2145	}
2146
2147	sock_net_set(sk, net);
2148	refcount_set(r: &sk->sk_wmem_alloc, n: `1`);
2149
2150	mem_cgroup_sk_alloc(sk);
2151	cgroup_sk_alloc(skcd: &sk->sk_cgrp_data);
2152	sock_update_classid(skcd: &sk->sk_cgrp_data);
2153	sock_update_netprioidx(skcd: &sk->sk_cgrp_data);
2154	sk_tx_queue_clear(sk);
2155	}
2156
2157	return sk;
2158	}
2159	EXPORT_SYMBOL(sk_alloc);
2160
2161	/ Sockets having SOCK_RCU_FREE will call this function after one RCU*
2162	* grace period. This is the case for UDP sockets and TCP listeners.
2163	*/
2164	static void __sk_destruct(struct rcu_head *head)
2165	{
2166	struct sock sk = container_of(head, struct* sock, sk_rcu);
2167	struct sk_filter *filter;
2168
2169	if (sk->sk_destruct)
2170	sk->sk_destruct(sk);
2171
2172	filter = rcu_dereference_check(sk->sk_filter,
2173	refcount_read(&sk->sk_wmem_alloc) == `0`);
2174	if (filter) {
2175	sk_filter_uncharge(sk, fp: filter);
2176	RCU_INIT_POINTER(sk->sk_filter, NULL);
2177	}
2178
2179	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2180
2181	#ifdef CONFIG_BPF_SYSCALL
2182	bpf_sk_storage_free(sk);
2183	#endif
2184
2185	if (atomic_read(v: &sk->sk_omem_alloc))
2186	pr_debug("%s: optmem leakage (%d bytes) detected\n",
2187	__func__, atomic_read(&sk->sk_omem_alloc));
2188
2189	if (sk->sk_frag.page) {
2190	put_page(page: sk->sk_frag.page);
2191	sk->sk_frag.page = NULL;
2192	}
2193
2194	/ We do not need to acquire sk->sk_peer_lock, we are the last user. /
2195	put_cred(cred: sk->sk_peer_cred);
2196	put_pid(pid: sk->sk_peer_pid);
2197
2198	if (likely(sk->sk_net_refcnt))
2199	put_net_track(net: sock_net(sk), tracker: &sk->ns_tracker);
2200	else
2201	__netns_tracker_free(net: sock_net(sk), tracker: &sk->ns_tracker, refcounted: false);
2202
2203	sk_prot_free(prot: sk->sk_prot_creator, sk);
2204	}
2205
2206	void sk_destruct(struct sock *sk)
2207	{
2208	bool use_call_rcu = sock_flag(sk, flag: SOCK_RCU_FREE);
2209
2210	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2211	reuseport_detach_sock(sk);
2212	use_call_rcu = true;
2213	}
2214
2215	if (use_call_rcu)
2216	call_rcu(head: &sk->sk_rcu, func: __sk_destruct);
2217	else
2218	__sk_destruct(head: &sk->sk_rcu);
2219	}
2220
2221	static void __sk_free(struct sock *sk)
2222	{
2223	if (likely(sk->sk_net_refcnt))
2224	sock_inuse_add(net: sock_net(sk), val: -`1`);
2225
2226	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2227	sock_diag_broadcast_destroy(sk);
2228	else
2229	sk_destruct(sk);
2230	}
2231
2232	void sk_free(struct sock *sk)
2233	{
2234	/*
2235	* We subtract one from sk_wmem_alloc and can know if
2236	* some packets are still in some tx queue.
2237	* If not null, sock_wfree() will call __sk_free(sk) later
2238	*/
2239	if (refcount_dec_and_test(r: &sk->sk_wmem_alloc))
2240	__sk_free(sk);
2241	}
2242	EXPORT_SYMBOL(sk_free);
2243
2244	static void sk_init_common(struct sock *sk)
2245	{
2246	skb_queue_head_init(list: &sk->sk_receive_queue);
2247	skb_queue_head_init(list: &sk->sk_write_queue);
2248	skb_queue_head_init(list: &sk->sk_error_queue);
2249
2250	rwlock_init(&sk->sk_callback_lock);
2251	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2252	af_rlock_keys + sk->sk_family,
2253	af_family_rlock_key_strings[sk->sk_family]);
2254	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2255	af_wlock_keys + sk->sk_family,
2256	af_family_wlock_key_strings[sk->sk_family]);
2257	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2258	af_elock_keys + sk->sk_family,
2259	af_family_elock_key_strings[sk->sk_family]);
2260	lockdep_set_class_and_name(&sk->sk_callback_lock,
2261	af_callback_keys + sk->sk_family,
2262	af_family_clock_key_strings[sk->sk_family]);
2263	}
2264
2265	/**
2266	* sk_clone_lock - clone a socket, and lock its clone
2267	* @sk: the socket to clone
2268	* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2269	*
2270	* Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2271	*/
2272	struct sock sk_clone_lock(const* struct sock sk, const* gfp_t priority)
2273	{
2274	struct proto *prot = READ_ONCE(sk->sk_prot);
2275	struct sk_filter *filter;
2276	bool is_charged = true;
2277	struct sock *newsk;
2278
2279	newsk = sk_prot_alloc(prot, priority, family: sk->sk_family);
2280	if (!newsk)
2281	goto out;
2282
2283	sock_copy(nsk: newsk, osk: sk);
2284
2285	newsk->sk_prot_creator = prot;
2286
2287	/ SANITY /
2288	if (likely(newsk->sk_net_refcnt)) {
2289	get_net_track(net: sock_net(sk: newsk), tracker: &newsk->ns_tracker, gfp: priority);
2290	sock_inuse_add(net: sock_net(sk: newsk), val: `1`);
2291	} else {
2292	/ Kernel sockets are not elevating the struct net refcount.*
2293	* Instead, use a tracker to more easily detect if a layer
2294	* is not properly dismantling its kernel sockets at netns
2295	* destroy time.
2296	*/
2297	__netns_tracker_alloc(net: sock_net(sk: newsk), tracker: &newsk->ns_tracker,
2298	refcounted: false, gfp: priority);
2299	}
2300	sk_node_init(node: &newsk->sk_node);
2301	sock_lock_init(sk: newsk);
2302	bh_lock_sock(newsk);
2303	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2304	newsk->sk_backlog.len = `0`;
2305
2306	atomic_set(v: &newsk->sk_rmem_alloc, i: `0`);
2307
2308	/ sk_wmem_alloc set to one (see sk_free() and sock_wfree()) /
2309	refcount_set(r: &newsk->sk_wmem_alloc, n: `1`);
2310
2311	atomic_set(v: &newsk->sk_omem_alloc, i: `0`);
2312	sk_init_common(sk: newsk);
2313
2314	newsk->sk_dst_cache = NULL;
2315	newsk->sk_dst_pending_confirm = `0`;
2316	newsk->sk_wmem_queued = `0`;
2317	newsk->sk_forward_alloc = `0`;
2318	newsk->sk_reserved_mem = `0`;
2319	atomic_set(v: &newsk->sk_drops, i: `0`);
2320	newsk->sk_send_head = NULL;
2321	newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2322	atomic_set(v: &newsk->sk_zckey, i: `0`);
2323
2324	sock_reset_flag(sk: newsk, flag: SOCK_DONE);
2325
2326	/ sk->sk_memcg will be populated at accept() time /
2327	newsk->sk_memcg = NULL;
2328
2329	cgroup_sk_clone(skcd: &newsk->sk_cgrp_data);
2330
2331	rcu_read_lock();
2332	filter = rcu_dereference(sk->sk_filter);
2333	if (filter != NULL)
2334	/ though it's an empty new sock, the charging may fail*
2335	* if sysctl_optmem_max was changed between creation of
2336	* original socket and cloning
2337	*/
2338	is_charged = sk_filter_charge(sk: newsk, fp: filter);
2339	RCU_INIT_POINTER(newsk->sk_filter, filter);
2340	rcu_read_unlock();
2341
2342	if (unlikely(!is_charged \|\| xfrm_sk_clone_policy(newsk, sk))) {
2343	/ We need to make sure that we don't uncharge the new*
2344	* socket if we couldn't charge it in the first place
2345	* as otherwise we uncharge the parent's filter.
2346	*/
2347	if (!is_charged)
2348	RCU_INIT_POINTER(newsk->sk_filter, NULL);
2349	sk_free_unlock_clone(sk: newsk);
2350	newsk = NULL;
2351	goto out;
2352	}
2353	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2354
2355	if (bpf_sk_storage_clone(sk, newsk)) {
2356	sk_free_unlock_clone(sk: newsk);
2357	newsk = NULL;
2358	goto out;
2359	}
2360
2361	/ Clear sk_user_data if parent had the pointer tagged*
2362	* as not suitable for copying when cloning.
2363	*/
2364	if (sk_user_data_is_nocopy(sk: newsk))
2365	newsk->sk_user_data = NULL;
2366
2367	newsk->sk_err = `0`;
2368	newsk->sk_err_soft = `0`;
2369	newsk->sk_priority = `0`;
2370	newsk->sk_incoming_cpu = raw_smp_processor_id();
2371
2372	/ Before updating sk_refcnt, we must commit prior changes to memory*
2373	* (Documentation/RCU/rculist_nulls.rst for details)
2374	*/
2375	smp_wmb();
2376	refcount_set(r: &newsk->sk_refcnt, n: `2`);
2377
2378	sk_set_socket(sk: newsk, NULL);
2379	sk_tx_queue_clear(sk: newsk);
2380	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2381
2382	if (newsk->sk_prot->sockets_allocated)
2383	sk_sockets_allocated_inc(sk: newsk);
2384
2385	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2386	net_enable_timestamp();
2387	out:
2388	return newsk;
2389	}
2390	EXPORT_SYMBOL_GPL(sk_clone_lock);
2391
2392	void sk_free_unlock_clone(struct sock *sk)
2393	{
2394	/ It is still raw copy of parent, so invalidate*
2395	* destructor and make plain sk_free() */
2396	sk->sk_destruct = NULL;
2397	bh_unlock_sock(sk);
2398	sk_free(sk);
2399	}
2400	EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2401
2402	static u32 sk_dst_gso_max_size(struct sock sk, struct* dst_entry *dst)
2403	{
2404	bool is_ipv6 = false;
2405	u32 max_size;
2406
2407	#if IS_ENABLED(CONFIG_IPV6)
2408	is_ipv6 = (sk->sk_family == AF_INET6 &&
2409	!ipv6_addr_v4mapped(a: &sk->sk_v6_rcv_saddr));
2410	#endif
2411	/ pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() /
2412	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2413	READ_ONCE(dst->dev->gso_ipv4_max_size);
2414	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2415	max_size = GSO_LEGACY_MAX_SIZE;
2416
2417	return max_size - (MAX_TCP_HEADER + `1`);
2418	}
2419
2420	void sk_setup_caps(struct sock sk, struct* dst_entry *dst)
2421	{
2422	u32 max_segs = `1`;
2423
2424	sk->sk_route_caps = dst->dev->features;
2425	if (sk_is_tcp(sk))
2426	sk->sk_route_caps \|= NETIF_F_GSO;
2427	if (sk->sk_route_caps & NETIF_F_GSO)
2428	sk->sk_route_caps \|= NETIF_F_GSO_SOFTWARE;
2429	if (unlikely(sk->sk_gso_disabled))
2430	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2431	if (sk_can_gso(sk)) {
2432	if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2433	sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2434	} else {
2435	sk->sk_route_caps \|= NETIF_F_SG \| NETIF_F_HW_CSUM;
2436	sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2437	/ pairs with the WRITE_ONCE() in netif_set_gso_max_segs() /
2438	max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), `1`);
2439	}
2440	}
2441	sk->sk_gso_max_segs = max_segs;
2442	sk_dst_set(sk, dst);
2443	}
2444	EXPORT_SYMBOL_GPL(sk_setup_caps);
2445
2446	/*
2447	* Simple resource managers for sockets.
2448	*/
2449
2450
2451	/*
2452	* Write buffer destructor automatically called from kfree_skb.
2453	*/
2454	void sock_wfree(struct sk_buff *skb)
2455	{
2456	struct sock *sk = skb->sk;
2457	unsigned int len = skb->truesize;
2458	bool free;
2459
2460	if (!sock_flag(sk, flag: SOCK_USE_WRITE_QUEUE)) {
2461	if (sock_flag(sk, flag: SOCK_RCU_FREE) &&
2462	sk->sk_write_space == sock_def_write_space) {
2463	rcu_read_lock();
2464	free = refcount_sub_and_test(i: len, r: &sk->sk_wmem_alloc);
2465	sock_def_write_space_wfree(sk);
2466	rcu_read_unlock();
2467	if (unlikely(free))
2468	__sk_free(sk);
2469	return;
2470	}
2471
2472	/*
2473	* Keep a reference on sk_wmem_alloc, this will be released
2474	* after sk_write_space() call
2475	*/
2476	WARN_ON(refcount_sub_and_test(len - `1`, &sk->sk_wmem_alloc));
2477	sk->sk_write_space(sk);
2478	len = `1`;
2479	}
2480	/*
2481	* if sk_wmem_alloc reaches 0, we must finish what sk_free()
2482	* could not do because of in-flight packets
2483	*/
2484	if (refcount_sub_and_test(i: len, r: &sk->sk_wmem_alloc))
2485	__sk_free(sk);
2486	}
2487	EXPORT_SYMBOL(sock_wfree);
2488
2489	/ This variant of sock_wfree() is used by TCP,*
2490	* since it sets SOCK_USE_WRITE_QUEUE.
2491	*/
2492	void __sock_wfree(struct sk_buff *skb)
2493	{
2494	struct sock *sk = skb->sk;
2495
2496	if (refcount_sub_and_test(i: skb->truesize, r: &sk->sk_wmem_alloc))
2497	__sk_free(sk);
2498	}
2499
2500	void skb_set_owner_w(struct sk_buff skb, struct* sock *sk)
2501	{
2502	skb_orphan(skb);
2503	skb->sk = sk;
2504	#ifdef CONFIG_INET
2505	if (unlikely(!sk_fullsock(sk))) {
2506	skb->destructor = sock_edemux;
2507	sock_hold(sk);
2508	return;
2509	}
2510	#endif
2511	skb->destructor = sock_wfree;
2512	skb_set_hash_from_sk(skb, sk);
2513	/*
2514	* We used to take a refcount on sk, but following operation
2515	* is enough to guarantee sk_free() wont free this sock until
2516	* all in-flight packets are completed
2517	*/
2518	refcount_add(i: skb->truesize, r: &sk->sk_wmem_alloc);
2519	}
2520	EXPORT_SYMBOL(skb_set_owner_w);
2521
2522	static bool can_skb_orphan_partial(const struct sk_buff *skb)
2523	{
2524	#ifdef CONFIG_TLS_DEVICE
2525	/ Drivers depend on in-order delivery for crypto offload,*
2526	* partial orphan breaks out-of-order-OK logic.
2527	*/
2528	if (skb->decrypted)
2529	return false;
2530	#endif
2531	return (skb->destructor == sock_wfree \|\|
2532	(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2533	}
2534
2535	/ This helper is used by netem, as it can hold packets in its*
2536	* delay queue. We want to allow the owner socket to send more
2537	* packets, as if they were already TX completed by a typical driver.
2538	* But we also want to keep skb->sk set because some packet schedulers
2539	* rely on it (sch_fq for example).
2540	*/
2541	void skb_orphan_partial(struct sk_buff *skb)
2542	{
2543	if (skb_is_tcp_pure_ack(skb))
2544	return;
2545
2546	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, sk: skb->sk))
2547	return;
2548
2549	skb_orphan(skb);
2550	}
2551	EXPORT_SYMBOL(skb_orphan_partial);
2552
2553	/*
2554	* Read buffer destructor automatically called from kfree_skb.
2555	*/
2556	void sock_rfree(struct sk_buff *skb)
2557	{
2558	struct sock *sk = skb->sk;
2559	unsigned int len = skb->truesize;
2560
2561	atomic_sub(i: len, v: &sk->sk_rmem_alloc);
2562	sk_mem_uncharge(sk, size: len);
2563	}
2564	EXPORT_SYMBOL(sock_rfree);
2565
2566	/*
2567	* Buffer destructor for skbs that are not used directly in read or write
2568	* path, e.g. for error handler skbs. Automatically called from kfree_skb.
2569	*/
2570	void sock_efree(struct sk_buff *skb)
2571	{
2572	sock_put(sk: skb->sk);
2573	}
2574	EXPORT_SYMBOL(sock_efree);
2575
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 * The reference is only dropped when sk_is_refcounted() says one is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (sk_is_refcounted(skb->sk))
		sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2587
2588	kuid_t sock_i_uid(struct sock *sk)
2589	{
2590	kuid_t uid;
2591
2592	read_lock_bh(&sk->sk_callback_lock);
2593	uid = sk->sk_socket ? SOCK_INODE(socket: sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2594	read_unlock_bh(&sk->sk_callback_lock);
2595	return uid;
2596	}
2597	EXPORT_SYMBOL(sock_i_uid);
2598
2599	unsigned long __sock_i_ino(struct sock *sk)
2600	{
2601	unsigned long ino;
2602
2603	read_lock(&sk->sk_callback_lock);
2604	ino = sk->sk_socket ? SOCK_INODE(socket: sk->sk_socket)->i_ino : `0`;
2605	read_unlock(&sk->sk_callback_lock);
2606	return ino;
2607	}
2608	EXPORT_SYMBOL(__sock_i_ino);
2609
/* Same as __sock_i_ino(), with bottom halves disabled around the call. */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
2620
2621	/*
2622	* Allocate a skb from the socket's send buffer.
2623	*/
2624	struct sk_buff sock_wmalloc(struct* sock sk, unsigned* long size, int force,
2625	gfp_t priority)
2626	{
2627	if (force \|\|
2628	refcount_read(r: &sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2629	struct sk_buff *skb = alloc_skb(size, priority);
2630
2631	if (skb) {
2632	skb_set_owner_w(skb, sk);
2633	return skb;
2634	}
2635	}
2636	return NULL;
2637	}
2638	EXPORT_SYMBOL(sock_wmalloc);
2639
2640	static void sock_ofree(struct sk_buff *skb)
2641	{
2642	struct sock *sk = skb->sk;
2643
2644	atomic_sub(i: skb->truesize, v: &sk->sk_omem_alloc);
2645	}
2646
2647	struct sk_buff sock_omalloc(struct* sock sk, unsigned* long size,
2648	gfp_t priority)
2649	{
2650	struct sk_buff *skb;
2651
2652	/ small safe race: SKB_TRUESIZE may differ from final skb->truesize /
2653	if (atomic_read(v: &sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2654	READ_ONCE(sysctl_optmem_max))
2655	return NULL;
2656
2657	skb = alloc_skb(size, priority);
2658	if (!skb)
2659	return NULL;
2660
2661	atomic_add(i: skb->truesize, v: &sk->sk_omem_alloc);
2662	skb->sk = sk;
2663	skb->destructor = sock_ofree;
2664	return skb;
2665	}
2666
2667	/*
2668	* Allocate a memory block from the socket's option memory buffer.
2669	*/
2670	void sock_kmalloc(struct* sock sk, int* size, gfp_t priority)
2671	{
2672	int optmem_max = READ_ONCE(sysctl_optmem_max);
2673
2674	if ((unsigned int)size <= optmem_max &&
2675	atomic_read(v: &sk->sk_omem_alloc) + size < optmem_max) {
2676	void *mem;
2677	/ First do the add, to avoid the race if kmalloc*
2678	* might sleep.
2679	*/
2680	atomic_add(i: size, v: &sk->sk_omem_alloc);
2681	mem = kmalloc(size, flags: priority);
2682	if (mem)
2683	return mem;
2684	atomic_sub(i: size, v: &sk->sk_omem_alloc);
2685	}
2686	return NULL;
2687	}
2688	EXPORT_SYMBOL(sock_kmalloc);
2689
2690	/ Free an option memory block. Note, we actually want the inline*
2691	* here as this allows gcc to detect the nullify and fold away the
2692	* condition entirely.
2693	*/
2694	static inline void __sock_kfree_s(struct sock sk, void* mem, int* size,
2695	const bool nullify)
2696	{
2697	if (WARN_ON_ONCE(!mem))
2698	return;
2699	if (nullify)
2700	kfree_sensitive(objp: mem);
2701	else
2702	kfree(objp: mem);
2703	atomic_sub(i: size, v: &sk->sk_omem_alloc);
2704	}
2705
2706	void sock_kfree_s(struct sock sk, void* mem, int* size)
2707	{
2708	__sock_kfree_s(sk, mem, size, nullify: false);
2709	}
2710	EXPORT_SYMBOL(sock_kfree_s);
2711
2712	void sock_kzfree_s(struct sock sk, void* mem, int* size)
2713	{
2714	__sock_kfree_s(sk, mem, size, nullify: true);
2715	}
2716	EXPORT_SYMBOL(sock_kzfree_s);
2717
2718	/ It is almost wait_for_tcp_memory minus release_sock/lock_sock.*
2719	I think, these locks should be removed for datagram sockets.
2720	*/
2721	static long sock_wait_for_wmem(struct sock sk, long* timeo)
2722	{
2723	DEFINE_WAIT(wait);
2724
2725	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2726	for (;;) {
2727	if (!timeo)
2728	break;
2729	if (signal_pending(current))
2730	break;
2731	set_bit(SOCK_NOSPACE, addr: &sk->sk_socket->flags);
2732	prepare_to_wait(wq_head: sk_sleep(sk), wq_entry: &wait, TASK_INTERRUPTIBLE);
2733	if (refcount_read(r: &sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2734	break;
2735	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2736	break;
2737	if (READ_ONCE(sk->sk_err))
2738	break;
2739	timeo = schedule_timeout(timeout: timeo);
2740	}
2741	finish_wait(wq_head: sk_sleep(sk), wq_entry: &wait);
2742	return timeo;
2743	}
2744
2745
2746	/*
2747	* Generic send/receive buffer handlers
2748	*/
2749
2750	struct sk_buff sock_alloc_send_pskb(struct* sock sk, unsigned* long header_len,
2751	unsigned long data_len, int noblock,
2752	int errcode, int* max_page_order)
2753	{
2754	struct sk_buff *skb;
2755	long timeo;
2756	int err;
2757
2758	timeo = sock_sndtimeo(sk, noblock);
2759	for (;;) {
2760	err = sock_error(sk);
2761	if (err != `0`)
2762	goto failure;
2763
2764	err = -EPIPE;
2765	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2766	goto failure;
2767
2768	if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2769	break;
2770
2771	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2772	set_bit(SOCK_NOSPACE, addr: &sk->sk_socket->flags);
2773	err = -EAGAIN;
2774	if (!timeo)
2775	goto failure;
2776	if (signal_pending(current))
2777	goto interrupted;
2778	timeo = sock_wait_for_wmem(sk, timeo);
2779	}
2780	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2781	errcode, gfp_mask: sk->sk_allocation);
2782	if (skb)
2783	skb_set_owner_w(skb, sk);
2784	return skb;
2785
2786	interrupted:
2787	err = sock_intr_errno(timeo);
2788	failure:
2789	*errcode = err;
2790	return NULL;
2791	}
2792	EXPORT_SYMBOL(sock_alloc_send_pskb);
2793
2794	int __sock_cmsg_send(struct sock sk, struct* cmsghdr *cmsg,
2795	struct sockcm_cookie *sockc)
2796	{
2797	u32 tsflags;
2798
2799	switch (cmsg->cmsg_type) {
2800	case SO_MARK:
2801	if (!ns_capable(ns: sock_net(sk)->user_ns, CAP_NET_RAW) &&
2802	!ns_capable(ns: sock_net(sk)->user_ns, CAP_NET_ADMIN))
2803	return -EPERM;
2804	if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2805	return -EINVAL;
2806	sockc->mark = (u32 )CMSG_DATA(cmsg);
2807	break;
2808	case SO_TIMESTAMPING_OLD:
2809	if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2810	return -EINVAL;
2811
2812	tsflags = (u32 )CMSG_DATA(cmsg);
2813	if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2814	return -EINVAL;
2815
2816	sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2817	sockc->tsflags \|= tsflags;
2818	break;
2819	case SCM_TXTIME:
2820	if (!sock_flag(sk, flag: SOCK_TXTIME))
2821	return -EINVAL;
2822	if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2823	return -EINVAL;
2824	sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2825	break;
2826	/ SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. /
2827	case SCM_RIGHTS:
2828	case SCM_CREDENTIALS:
2829	break;
2830	default:
2831	return -EINVAL;
2832	}
2833	return `0`;
2834	}
2835	EXPORT_SYMBOL(__sock_cmsg_send);
2836
2837	int sock_cmsg_send(struct sock sk, struct* msghdr *msg,
2838	struct sockcm_cookie *sockc)
2839	{
2840	struct cmsghdr *cmsg;
2841	int ret;
2842
2843	for_each_cmsghdr(cmsg, msg) {
2844	if (!CMSG_OK(msg, cmsg))
2845	return -EINVAL;
2846	if (cmsg->cmsg_level != SOL_SOCKET)
2847	continue;
2848	ret = __sock_cmsg_send(sk, cmsg, sockc);
2849	if (ret)
2850	return ret;
2851	}
2852	return `0`;
2853	}
2854	EXPORT_SYMBOL(sock_cmsg_send);
2855
2856	static void sk_enter_memory_pressure(struct sock *sk)
2857	{
2858	if (!sk->sk_prot->enter_memory_pressure)
2859	return;
2860
2861	sk->sk_prot->enter_memory_pressure(sk);
2862	}
2863
2864	static void sk_leave_memory_pressure(struct sock *sk)
2865	{
2866	if (sk->sk_prot->leave_memory_pressure) {
2867	INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2868	tcp_leave_memory_pressure, sk);
2869	} else {
2870	unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2871
2872	if (memory_pressure && READ_ONCE(*memory_pressure))
2873	WRITE_ONCE(*memory_pressure, `0`);
2874	}
2875	}
2876
2877	DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2878
2879	/**
2880	* skb_page_frag_refill - check that a page_frag contains enough room
2881	* @sz: minimum size of the fragment we want to get
2882	* @pfrag: pointer to page_frag
2883	* @gfp: priority for memory allocation
2884	*
2885	* Note: While this allocator tries to use high order pages, there is
2886	* no guarantee that allocations succeed. Therefore, @sz MUST be
2887	* less or equal than PAGE_SIZE.
2888	*/
2889	bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2890	{
2891	if (pfrag->page) {
2892	if (page_ref_count(page: pfrag->page) == `1`) {
2893	pfrag->offset = `0`;
2894	return true;
2895	}
2896	if (pfrag->offset + sz <= pfrag->size)
2897	return true;
2898	put_page(page: pfrag->page);
2899	}
2900
2901	pfrag->offset = `0`;
2902	if (SKB_FRAG_PAGE_ORDER &&
2903	!static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2904	/ Avoid direct reclaim but allow kswapd to wake /
2905	pfrag->page = alloc_pages(gfp: (gfp & ~__GFP_DIRECT_RECLAIM) \|
2906	__GFP_COMP \| __GFP_NOWARN \|
2907	__GFP_NORETRY,
2908	SKB_FRAG_PAGE_ORDER);
2909	if (likely(pfrag->page)) {
2910	pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2911	return true;
2912	}
2913	}
2914	pfrag->page = alloc_page(gfp);
2915	if (likely(pfrag->page)) {
2916	pfrag->size = PAGE_SIZE;
2917	return true;
2918	}
2919	return false;
2920	}
2921	EXPORT_SYMBOL(skb_page_frag_refill);
2922
2923	bool sk_page_frag_refill(struct sock sk, struct* page_frag *pfrag)
2924	{
2925	if (likely(skb_page_frag_refill(`32U`, pfrag, sk->sk_allocation)))
2926	return true;
2927
2928	sk_enter_memory_pressure(sk);
2929	sk_stream_moderate_sndbuf(sk);
2930	return false;
2931	}
2932	EXPORT_SYMBOL(sk_page_frag_refill);
2933
2934	void __lock_sock(struct sock *sk)
2935	__releases(&sk->sk_lock.slock)
2936	__acquires(&sk->sk_lock.slock)
2937	{
2938	DEFINE_WAIT(wait);
2939
2940	for (;;) {
2941	prepare_to_wait_exclusive(wq_head: &sk->sk_lock.wq, wq_entry: &wait,
2942	TASK_UNINTERRUPTIBLE);
2943	spin_unlock_bh(lock: &sk->sk_lock.slock);
2944	schedule();
2945	spin_lock_bh(lock: &sk->sk_lock.slock);
2946	if (!sock_owned_by_user(sk))
2947	break;
2948	}
2949	finish_wait(wq_head: &sk->sk_lock.wq, wq_entry: &wait);
2950	}
2951
2952	void __release_sock(struct sock *sk)
2953	__releases(&sk->sk_lock.slock)
2954	__acquires(&sk->sk_lock.slock)
2955	{
2956	struct sk_buff skb, next;
2957
2958	while ((skb = sk->sk_backlog.head) != NULL) {
2959	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2960
2961	spin_unlock_bh(lock: &sk->sk_lock.slock);
2962
2963	do {
2964	next = skb->next;
2965	prefetch(next);
2966	DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2967	skb_mark_not_on_list(skb);
2968	sk_backlog_rcv(sk, skb);
2969
2970	cond_resched();
2971
2972	skb = next;
2973	} while (skb != NULL);
2974
2975	spin_lock_bh(lock: &sk->sk_lock.slock);
2976	}
2977
2978	/*
2979	* Doing the zeroing here guarantee we can not loop forever
2980	* while a wild producer attempts to flood us.
2981	*/
2982	sk->sk_backlog.len = `0`;
2983	}
2984
2985	void __sk_flush_backlog(struct sock *sk)
2986	{
2987	spin_lock_bh(lock: &sk->sk_lock.slock);
2988	__release_sock(sk);
2989
2990	if (sk->sk_prot->release_cb)
2991	INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
2992	tcp_release_cb, sk);
2993
2994	spin_unlock_bh(lock: &sk->sk_lock.slock);
2995	}
2996	EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2997
2998	/**
2999	* sk_wait_data - wait for data to arrive at sk_receive_queue
3000	* @sk: sock to wait on
3001	* @timeo: for how long
3002	* @skb: last skb seen on sk_receive_queue
3003	*
3004	* Now socket state including sk->sk_err is changed only under lock,
3005	* hence we may omit checks after joining wait queue.
3006	* We check receive queue before schedule() only as optimization;
3007	* it is very likely that release_sock() added new data.
3008	*/
3009	int sk_wait_data(struct sock sk, long* timeo, const* struct sk_buff *skb)
3010	{
3011	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3012	int rc;
3013
3014	add_wait_queue(wq_head: sk_sleep(sk), wq_entry: &wait);
3015	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3016	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3017	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3018	remove_wait_queue(wq_head: sk_sleep(sk), wq_entry: &wait);
3019	return rc;
3020	}
3021	EXPORT_SYMBOL(sk_wait_data);
3022
3023	/**
3024	* __sk_mem_raise_allocated - increase memory_allocated
3025	* @sk: socket
3026	* @size: memory size to allocate
3027	* @amt: pages to allocate
3028	* @kind: allocation type
3029	*
3030	* Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3031	*
3032	* Unlike the globally shared limits among the sockets under same protocol,
3033	* consuming the budget of a memcg won't have direct effect on other ones.
3034	* So be optimistic about memcg's tolerance, and leave the callers to decide
3035	* whether or not to raise allocated through sk_under_memory_pressure() or
3036	* its variants.
3037	*/
3038	int __sk_mem_raise_allocated(struct sock sk, int* size, int amt, int kind)
3039	{
3040	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3041	struct proto *prot = sk->sk_prot;
3042	bool charged = false;
3043	long allocated;
3044
3045	sk_memory_allocated_add(sk, amt);
3046	allocated = sk_memory_allocated(sk);
3047
3048	if (memcg) {
3049	if (!mem_cgroup_charge_skmem(memcg, nr_pages: amt, gfp_mask: gfp_memcg_charge()))
3050	goto suppress_allocation;
3051	charged = true;
3052	}
3053
3054	/ Under limit. /
3055	if (allocated <= sk_prot_mem_limits(sk, index: `0`)) {
3056	sk_leave_memory_pressure(sk);
3057	return `1`;
3058	}
3059
3060	/ Under pressure. /
3061	if (allocated > sk_prot_mem_limits(sk, index: `1`))
3062	sk_enter_memory_pressure(sk);
3063
3064	/ Over hard limit. /
3065	if (allocated > sk_prot_mem_limits(sk, index: `2`))
3066	goto suppress_allocation;
3067
3068	/ Guarantee minimum buffer size under pressure (either global*
3069	* or memcg) to make sure features described in RFC 7323 (TCP
3070	* Extensions for High Performance) work properly.
3071	*
3072	* This rule does NOT stand when exceeds global or memcg's hard
3073	* limit, or else a DoS attack can be taken place by spawning
3074	* lots of sockets whose usage are under minimum buffer size.
3075	*/
3076	if (kind == SK_MEM_RECV) {
3077	if (atomic_read(v: &sk->sk_rmem_alloc) < sk_get_rmem0(sk, proto: prot))
3078	return `1`;
3079
3080	} else { / SK_MEM_SEND /
3081	int wmem0 = sk_get_wmem0(sk, proto: prot);
3082
3083	if (sk->sk_type == SOCK_STREAM) {
3084	if (sk->sk_wmem_queued < wmem0)
3085	return `1`;
3086	} else if (refcount_read(r: &sk->sk_wmem_alloc) < wmem0) {
3087	return `1`;
3088	}
3089	}
3090
3091	if (sk_has_memory_pressure(sk)) {
3092	u64 alloc;
3093
3094	/ The following 'average' heuristic is within the*
3095	* scope of global accounting, so it only makes
3096	* sense for global memory pressure.
3097	*/
3098	if (!sk_under_global_memory_pressure(sk))
3099	return `1`;
3100
3101	/ Try to be fair among all the sockets under global*
3102	* pressure by allowing the ones that below average
3103	* usage to raise.
3104	*/
3105	alloc = sk_sockets_allocated_read_positive(sk);
3106	if (sk_prot_mem_limits(sk, index: `2`) > alloc *
3107	sk_mem_pages(amt: sk->sk_wmem_queued +
3108	atomic_read(v: &sk->sk_rmem_alloc) +
3109	sk->sk_forward_alloc))
3110	return `1`;
3111	}
3112
3113	suppress_allocation:
3114
3115	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3116	sk_stream_moderate_sndbuf(sk);
3117
3118	/ Fail only if socket is _under_ its sndbuf.*
3119	* In this case we cannot block, so that we have to fail.
3120	*/
3121	if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3122	/ Force charge with __GFP_NOFAIL /
3123	if (memcg && !charged) {
3124	mem_cgroup_charge_skmem(memcg, nr_pages: amt,
3125	gfp_mask: gfp_memcg_charge() \| __GFP_NOFAIL);
3126	}
3127	return `1`;
3128	}
3129	}
3130
3131	if (kind == SK_MEM_SEND \|\| (kind == SK_MEM_RECV && charged))
3132	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3133
3134	sk_memory_allocated_sub(sk, amt);
3135
3136	if (charged)
3137	mem_cgroup_uncharge_skmem(memcg, nr_pages: amt);
3138
3139	return `0`;
3140	}
3141
3142	/**
3143	* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3144	* @sk: socket
3145	* @size: memory size to allocate
3146	* @kind: allocation type
3147	*
3148	* If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3149	* rmem allocation. This function assumes that protocols which have
3150	* memory_pressure use sk_wmem_queued as write buffer accounting.
3151	*/
3152	int __sk_mem_schedule(struct sock sk, int* size, int kind)
3153	{
3154	int ret, amt = sk_mem_pages(amt: size);
3155
3156	sk_forward_alloc_add(sk, val: amt << PAGE_SHIFT);
3157	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3158	if (!ret)
3159	sk_forward_alloc_add(sk, val: -(amt << PAGE_SHIFT));
3160	return ret;
3161	}
3162	EXPORT_SYMBOL(__sk_mem_schedule);
3163
3164	/**
3165	* __sk_mem_reduce_allocated - reclaim memory_allocated
3166	* @sk: socket
3167	* @amount: number of quanta
3168	*
3169	* Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3170	*/
3171	void __sk_mem_reduce_allocated(struct sock sk, int* amount)
3172	{
3173	sk_memory_allocated_sub(sk, amt: amount);
3174
3175	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3176	mem_cgroup_uncharge_skmem(memcg: sk->sk_memcg, nr_pages: amount);
3177
3178	if (sk_under_global_memory_pressure(sk) &&
3179	(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, index: `0`)))
3180	sk_leave_memory_pressure(sk);
3181	}
3182
3183	/**
3184	* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3185	* @sk: socket
3186	* @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3187	*/
3188	void __sk_mem_reclaim(struct sock sk, int* amount)
3189	{
3190	amount >>= PAGE_SHIFT;
3191	sk_forward_alloc_add(sk, val: -(amount << PAGE_SHIFT));
3192	__sk_mem_reduce_allocated(sk, amount);
3193	}
3194	EXPORT_SYMBOL(__sk_mem_reclaim);
3195
3196	int sk_set_peek_off(struct sock sk, int* val)
3197	{
3198	WRITE_ONCE(sk->sk_peek_off, val);
3199	return `0`;
3200	}
3201	EXPORT_SYMBOL_GPL(sk_set_peek_off);
3202
3203	/*
3204	* Set of default routines for initialising struct proto_ops when
3205	* the protocol does not support a particular function. In certain
3206	* cases where it makes no sense for a protocol to have a "do nothing"
3207	* function, some default processing is provided.
3208	*/
3209
3210	int sock_no_bind(struct socket sock, struct* sockaddr saddr, int* len)
3211	{
3212	return -EOPNOTSUPP;
3213	}
3214	EXPORT_SYMBOL(sock_no_bind);
3215
3216	int sock_no_connect(struct socket sock, struct* sockaddr *saddr,
3217	int len, int flags)
3218	{
3219	return -EOPNOTSUPP;
3220	}
3221	EXPORT_SYMBOL(sock_no_connect);
3222
3223	int sock_no_socketpair(struct socket sock1, struct* socket *sock2)
3224	{
3225	return -EOPNOTSUPP;
3226	}
3227	EXPORT_SYMBOL(sock_no_socketpair);
3228
3229	int sock_no_accept(struct socket sock, struct* socket newsock, int* flags,
3230	bool kern)
3231	{
3232	return -EOPNOTSUPP;
3233	}
3234	EXPORT_SYMBOL(sock_no_accept);
3235
3236	int sock_no_getname(struct socket sock, struct* sockaddr *saddr,
3237	int peer)
3238	{
3239	return -EOPNOTSUPP;
3240	}
3241	EXPORT_SYMBOL(sock_no_getname);
3242
3243	int sock_no_ioctl(struct socket sock, unsigned* int cmd, unsigned long arg)
3244	{
3245	return -EOPNOTSUPP;
3246	}
3247	EXPORT_SYMBOL(sock_no_ioctl);
3248
3249	int sock_no_listen(struct socket sock, int* backlog)
3250	{
3251	return -EOPNOTSUPP;
3252	}
3253	EXPORT_SYMBOL(sock_no_listen);
3254
3255	int sock_no_shutdown(struct socket sock, int* how)
3256	{
3257	return -EOPNOTSUPP;
3258	}
3259	EXPORT_SYMBOL(sock_no_shutdown);
3260
3261	int sock_no_sendmsg(struct socket sock, struct* msghdr *m, size_t len)
3262	{
3263	return -EOPNOTSUPP;
3264	}
3265	EXPORT_SYMBOL(sock_no_sendmsg);
3266
3267	int sock_no_sendmsg_locked(struct sock sk, struct* msghdr *m, size_t len)
3268	{
3269	return -EOPNOTSUPP;
3270	}
3271	EXPORT_SYMBOL(sock_no_sendmsg_locked);
3272
3273	int sock_no_recvmsg(struct socket sock, struct* msghdr *m, size_t len,
3274	int flags)
3275	{
3276	return -EOPNOTSUPP;
3277	}
3278	EXPORT_SYMBOL(sock_no_recvmsg);
3279
3280	int sock_no_mmap(struct file file, struct* socket sock, struct* vm_area_struct *vma)
3281	{
3282	/ Mirror missing mmap method error code /
3283	return -ENODEV;
3284	}
3285	EXPORT_SYMBOL(sock_no_mmap);
3286
3287	/*
3288	* When a file is received (via SCM_RIGHTS, etc), we must bump the
3289	* various sock-based usage counts.
3290	*/
3291	void __receive_sock(struct file *file)
3292	{
3293	struct socket *sock;
3294
3295	sock = sock_from_file(file);
3296	if (sock) {
3297	sock_update_netprioidx(skcd: &sock->sk->sk_cgrp_data);
3298	sock_update_classid(skcd: &sock->sk->sk_cgrp_data);
3299	}
3300	}
3301
3302	/*
3303	* Default Socket Callbacks
3304	*/
3305
3306	static void sock_def_wakeup(struct sock *sk)
3307	{
3308	struct socket_wq *wq;
3309
3310	rcu_read_lock();
3311	wq = rcu_dereference(sk->sk_wq);
3312	if (skwq_has_sleeper(wq))
3313	wake_up_interruptible_all(&wq->wait);
3314	rcu_read_unlock();
3315	}
3316
3317	static void sock_def_error_report(struct sock *sk)
3318	{
3319	struct socket_wq *wq;
3320
3321	rcu_read_lock();
3322	wq = rcu_dereference(sk->sk_wq);
3323	if (skwq_has_sleeper(wq))
3324	wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3325	sk_wake_async(sk, how: SOCK_WAKE_IO, POLL_ERR);
3326	rcu_read_unlock();
3327	}
3328
3329	void sock_def_readable(struct sock *sk)
3330	{
3331	struct socket_wq *wq;
3332
3333	trace_sk_data_ready(sk);
3334
3335	rcu_read_lock();
3336	wq = rcu_dereference(sk->sk_wq);
3337	if (skwq_has_sleeper(wq))
3338	wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN \| EPOLLPRI \|
3339	EPOLLRDNORM \| EPOLLRDBAND);
3340	sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_IN);
3341	rcu_read_unlock();
3342	}
3343
3344	static void sock_def_write_space(struct sock *sk)
3345	{
3346	struct socket_wq *wq;
3347
3348	rcu_read_lock();
3349
3350	/ Do not wake up a writer until he can make "significant"*
3351	* progress. --DaveM
3352	*/
3353	if (sock_writeable(sk)) {
3354	wq = rcu_dereference(sk->sk_wq);
3355	if (skwq_has_sleeper(wq))
3356	wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT \|
3357	EPOLLWRNORM \| EPOLLWRBAND);
3358
3359	/ Should agree with poll, otherwise some programs break /
3360	sk_wake_async(sk, how: SOCK_WAKE_SPACE, POLL_OUT);
3361	}
3362
3363	rcu_read_unlock();
3364	}
3365
3366	/ An optimised version of sock_def_write_space(), should only be called*
3367	* for SOCK_RCU_FREE sockets under RCU read section and after putting
3368	* ->sk_wmem_alloc.
3369	*/
3370	static void sock_def_write_space_wfree(struct sock *sk)
3371	{
3372	/ Do not wake up a writer until he can make "significant"*
3373	* progress. --DaveM
3374	*/
3375	if (sock_writeable(sk)) {
3376	struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3377
3378	/ rely on refcount_sub from sock_wfree() /
3379	smp_mb__after_atomic();
3380	if (wq && waitqueue_active(wq_head: &wq->wait))
3381	wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT \|
3382	EPOLLWRNORM \| EPOLLWRBAND);
3383
3384	/ Should agree with poll, otherwise some programs break /
3385	sk_wake_async(sk, how: SOCK_WAKE_SPACE, POLL_OUT);
3386	}
3387	}
3388
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3392
3393	void sk_send_sigurg(struct sock *sk)
3394	{
3395	if (sk->sk_socket && sk->sk_socket->file)
3396	if (send_sigurg(fown: &sk->sk_socket->file->f_owner))
3397	sk_wake_async(sk, how: SOCK_WAKE_URG, POLL_PRI);
3398	}
3399	EXPORT_SYMBOL(sk_send_sigurg);
3400
/*
 * (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns 0; sk_stop_timer()/sk_stop_timer_sync() drop it again
 * when they delete the timer.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3408
/*
 * Delete @timer; if del_timer() reports that it removed the timer, drop the
 * socket reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3415
/*
 * Same as sk_stop_timer() but uses del_timer_sync() to delete @timer.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3422
3423	void sock_init_data_uid(struct socket sock, struct* sock *sk, kuid_t uid)
3424	{
3425	sk_init_common(sk);
3426	sk->sk_send_head = NULL;
3427
3428	timer_setup(&sk->sk_timer, NULL, `0`);
3429
3430	sk->sk_allocation = GFP_KERNEL;
3431	sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3432	sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3433	sk->sk_state = TCP_CLOSE;
3434	sk->sk_use_task_frag = true;
3435	sk_set_socket(sk, sock);
3436
3437	sock_set_flag(sk, flag: SOCK_ZAPPED);
3438
3439	if (sock) {
3440	sk->sk_type = sock->type;
3441	RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3442	sock->sk = sk;
3443	} else {
3444	RCU_INIT_POINTER(sk->sk_wq, NULL);
3445	}
3446	sk->sk_uid = uid;
3447
3448	rwlock_init(&sk->sk_callback_lock);
3449	if (sk->sk_kern_sock)
3450	lockdep_set_class_and_name(
3451	&sk->sk_callback_lock,
3452	af_kern_callback_keys + sk->sk_family,
3453	af_family_kern_clock_key_strings[sk->sk_family]);
3454	else
3455	lockdep_set_class_and_name(
3456	&sk->sk_callback_lock,
3457	af_callback_keys + sk->sk_family,
3458	af_family_clock_key_strings[sk->sk_family]);
3459
3460	sk->sk_state_change = sock_def_wakeup;
3461	sk->sk_data_ready = sock_def_readable;
3462	sk->sk_write_space = sock_def_write_space;
3463	sk->sk_error_report = sock_def_error_report;
3464	sk->sk_destruct = sock_def_destruct;
3465
3466	sk->sk_frag.page = NULL;
3467	sk->sk_frag.offset = `0`;
3468	sk->sk_peek_off = -`1`;
3469
3470	sk->sk_peer_pid = NULL;
3471	sk->sk_peer_cred = NULL;
3472	spin_lock_init(&sk->sk_peer_lock);
3473
3474	sk->sk_write_pending = `0`;
3475	sk->sk_rcvlowat = `1`;
3476	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3477	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3478
3479	sk->sk_stamp = SK_DEFAULT_STAMP;
3480	#if BITS_PER_LONG==32
3481	seqlock_init(&sk->sk_stamp_seq);
3482	#endif
3483	atomic_set(v: &sk->sk_zckey, i: `0`);
3484
3485	#ifdef CONFIG_NET_RX_BUSY_POLL
3486	sk->sk_napi_id = `0`;
3487	sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3488	#endif
3489
3490	sk->sk_max_pacing_rate = ~`0UL`;
3491	sk->sk_pacing_rate = ~`0UL`;
3492	WRITE_ONCE(sk->sk_pacing_shift, `10`);
3493	sk->sk_incoming_cpu = -`1`;
3494
3495	sk_rx_queue_clear(sk);
3496	/*
3497	* Before updating sk_refcnt, we must commit prior changes to memory
3498	* (Documentation/RCU/rculist_nulls.rst for details)
3499	*/
3500	smp_wmb();
3501	refcount_set(r: &sk->sk_refcnt, n: `1`);
3502	atomic_set(v: &sk->sk_drops, i: `0`);
3503	}
3504	EXPORT_SYMBOL(sock_init_data_uid);
3505
3506	void sock_init_data(struct socket sock, struct* sock *sk)
3507	{
3508	kuid_t uid = sock ?
3509	SOCK_INODE(socket: sock)->i_uid :
3510	make_kuid(from: sock_net(sk)->user_ns, uid: `0`);
3511
3512	sock_init_data_uid(sock, sk, uid);
3513	}
3514	EXPORT_SYMBOL(sock_init_data);
3515
3516	void lock_sock_nested(struct sock sk, int* subclass)
3517	{
3518	/ The sk_lock has mutex_lock() semantics here. /
3519	mutex_acquire(&sk->sk_lock.dep_map, subclass, `0`, _RET_IP_);
3520
3521	might_sleep();
3522	spin_lock_bh(lock: &sk->sk_lock.slock);
3523	if (sock_owned_by_user_nocheck(sk))
3524	__lock_sock(sk);
3525	sk->sk_lock.owned = `1`;
3526	spin_unlock_bh(lock: &sk->sk_lock.slock);
3527	}
3528	EXPORT_SYMBOL(lock_sock_nested);
3529
3530	void release_sock(struct sock *sk)
3531	{
3532	spin_lock_bh(lock: &sk->sk_lock.slock);
3533	if (sk->sk_backlog.tail)
3534	__release_sock(sk);
3535
3536	if (sk->sk_prot->release_cb)
3537	INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3538	tcp_release_cb, sk);
3539
3540	sock_release_ownership(sk);
3541	if (waitqueue_active(wq_head: &sk->sk_lock.wq))
3542	wake_up(&sk->sk_lock.wq);
3543	spin_unlock_bh(lock: &sk->sk_lock.slock);
3544	}
3545	EXPORT_SYMBOL(release_sock);
3546
3547	bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3548	{
3549	might_sleep();
3550	spin_lock_bh(lock: &sk->sk_lock.slock);
3551
3552	if (!sock_owned_by_user_nocheck(sk)) {
3553	/*
3554	* Fast path return with bottom halves disabled and
3555	* sock::sk_lock.slock held.
3556	*
3557	* The 'mutex' is not contended and holding
3558	* sock::sk_lock.slock prevents all other lockers to
3559	* proceed so the corresponding unlock_sock_fast() can
3560	* avoid the slow path of release_sock() completely and
3561	* just release slock.
3562	*
3563	* From a semantical POV this is equivalent to 'acquiring'
3564	* the 'mutex', hence the corresponding lockdep
3565	* mutex_release() has to happen in the fast path of
3566	* unlock_sock_fast().
3567	*/
3568	return false;
3569	}
3570
3571	__lock_sock(sk);
3572	sk->sk_lock.owned = `1`;
3573	__acquire(&sk->sk_lock.slock);
3574	spin_unlock_bh(lock: &sk->sk_lock.slock);
3575	return true;
3576	}
3577	EXPORT_SYMBOL(__lock_sock_fast);
3578
3579	int sock_gettstamp(struct socket sock, void* __user *userstamp,
3580	bool timeval, bool time32)
3581	{
3582	struct sock *sk = sock->sk;
3583	struct timespec64 ts;
3584
3585	sock_enable_timestamp(sk, flag: SOCK_TIMESTAMP);
3586	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3587	if (ts.tv_sec == -`1`)
3588	return -ENOENT;
3589	if (ts.tv_sec == `0`) {
3590	ktime_t kt = ktime_get_real();
3591	sock_write_timestamp(sk, kt);
3592	ts = ktime_to_timespec64(kt);
3593	}
3594
3595	if (timeval)
3596	ts.tv_nsec /= `1000`;
3597
3598	#ifdef CONFIG_COMPAT_32BIT_TIME
3599	if (time32)
3600	return put_old_timespec32(&ts, userstamp);
3601	#endif
3602	#ifdef CONFIG_SPARC64
3603	/ beware of padding in sparc64 timeval /
3604	if (timeval && !in_compat_syscall()) {
3605	struct __kernel_old_timeval __user tv = {
3606	.tv_sec = ts.tv_sec,
3607	.tv_usec = ts.tv_nsec,
3608	};
3609	if (copy_to_user(userstamp, &tv, sizeof(tv)))
3610	return -EFAULT;
3611	return `0`;
3612	}
3613	#endif
3614	return put_timespec64(ts: &ts, uts: userstamp);
3615	}
3616	EXPORT_SYMBOL(sock_gettstamp);
3617
3618	void sock_enable_timestamp(struct sock sk, enum* sock_flags flag)
3619	{
3620	if (!sock_flag(sk, flag)) {
3621	unsigned long previous_flags = sk->sk_flags;
3622
3623	sock_set_flag(sk, flag);
3624	/*
3625	* we just set one of the two flags which require net
3626	* time stamping, but time stamping might have been on
3627	* already because of the other one
3628	*/
3629	if (sock_needs_netstamp(sk) &&
3630	!(previous_flags & SK_FLAGS_TIMESTAMP))
3631	net_enable_timestamp();
3632	}
3633	}
3634
3635	int sock_recv_errqueue(struct sock sk, struct* msghdr msg, int* len,
3636	int level, int type)
3637	{
3638	struct sock_exterr_skb *serr;
3639	struct sk_buff *skb;
3640	int copied, err;
3641
3642	err = -EAGAIN;
3643	skb = sock_dequeue_err_skb(sk);
3644	if (skb == NULL)
3645	goto out;
3646
3647	copied = skb->len;
3648	if (copied > len) {
3649	msg->msg_flags \|= MSG_TRUNC;
3650	copied = len;
3651	}
3652	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: copied);
3653	if (err)
3654	goto out_free_skb;
3655
3656	sock_recv_timestamp(msg, sk, skb);
3657
3658	serr = SKB_EXT_ERR(skb);
3659	put_cmsg(msg, level, type, len: sizeof(serr->ee), data: &serr->ee);
3660
3661	msg->msg_flags \|= MSG_ERRQUEUE;
3662	err = copied;
3663
3664	out_free_skb:
3665	kfree_skb(skb);
3666	out:
3667	return err;
3668	}
3669	EXPORT_SYMBOL(sock_recv_errqueue);
3670
3671	/*
3672	* Get a socket option on an socket.
3673	*
3674	* FIX: POSIX 1003.1g is very ambiguous here. It states that
3675	* asynchronous errors should be reported by getsockopt. We assume
3676	* this means if you specify SO_ERROR (otherwise whats the point of it).
3677	*/
3678	int sock_common_getsockopt(struct socket sock, int* level, int optname,
3679	char __user optval, int* __user *optlen)
3680	{
3681	struct sock *sk = sock->sk;
3682
3683	/ IPV6_ADDRFORM can change sk->sk_prot under us. /
3684	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3685	}
3686	EXPORT_SYMBOL(sock_common_getsockopt);
3687
3688	int sock_common_recvmsg(struct socket sock, struct* msghdr *msg, size_t size,
3689	int flags)
3690	{
3691	struct sock *sk = sock->sk;
3692	int addr_len = `0`;
3693	int err;
3694
3695	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3696	if (err >= `0`)
3697	msg->msg_namelen = addr_len;
3698	return err;
3699	}
3700	EXPORT_SYMBOL(sock_common_recvmsg);
3701
3702	/*
3703	* Set socket options on an inet socket.
3704	*/
3705	int sock_common_setsockopt(struct socket sock, int* level, int optname,
3706	sockptr_t optval, unsigned int optlen)
3707	{
3708	struct sock *sk = sock->sk;
3709
3710	/ IPV6_ADDRFORM can change sk->sk_prot under us. /
3711	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3712	}
3713	EXPORT_SYMBOL(sock_common_setsockopt);
3714
3715	void sk_common_release(struct sock *sk)
3716	{
3717	if (sk->sk_prot->destroy)
3718	sk->sk_prot->destroy(sk);
3719
3720	/*
3721	* Observation: when sk_common_release is called, processes have
3722	* no access to socket. But net still has.
3723	* Step one, detach it from networking:
3724	*
3725	* A. Remove from hash tables.
3726	*/
3727
3728	sk->sk_prot->unhash(sk);
3729
3730	/*
3731	* In this point socket cannot receive new packets, but it is possible
3732	* that some packets are in flight because some CPU runs receiver and
3733	* did hash table lookup before we unhashed socket. They will achieve
3734	* receive queue and will be purged by socket destructor.
3735	*
3736	* Also we still have packets pending on receive queue and probably,
3737	* our own packets waiting in device queues. sock_destroy will drain
3738	* receive queue, but transmitted packets will delay socket destruction
3739	* until the last reference will be released.
3740	*/
3741
3742	sock_orphan(sk);
3743
3744	xfrm_sk_free_policy(sk);
3745
3746	sock_put(sk);
3747	}
3748	EXPORT_SYMBOL(sk_common_release);
3749
3750	void sk_get_meminfo(const struct sock sk, u32 mem)
3751	{
3752	memset(mem, `0`, sizeof(mem) SK_MEMINFO_VARS);
3753
3754	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3755	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3756	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3757	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3758	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3759	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3760	mem[SK_MEMINFO_OPTMEM] = atomic_read(v: &sk->sk_omem_alloc);
3761	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3762	mem[SK_MEMINFO_DROPS] = atomic_read(v: &sk->sk_drops);
3763	}
3764
3765	#ifdef CONFIG_PROC_FS
/* Bitmap of in-use protocol indices; see assign_proto_idx()/release_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3767
3768	int sock_prot_inuse_get(struct net net, struct* proto *prot)
3769	{
3770	int cpu, idx = prot->inuse_idx;
3771	int res = `0`;
3772
3773	for_each_possible_cpu(cpu)
3774	res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3775
3776	return res >= `0` ? res : `0`;
3777	}
3778	EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3779
3780	int sock_inuse_get(struct net *net)
3781	{
3782	int cpu, res = `0`;
3783
3784	for_each_possible_cpu(cpu)
3785	res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3786
3787	return res;
3788	}
3789
3790	EXPORT_SYMBOL_GPL(sock_inuse_get);
3791
3792	static int __net_init sock_inuse_init_net(struct net *net)
3793	{
3794	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3795	if (net->core.prot_inuse == NULL)
3796	return -ENOMEM;
3797	return `0`;
3798	}
3799
3800	static void __net_exit sock_inuse_exit_net(struct net *net)
3801	{
3802	free_percpu(pdata: net->core.prot_inuse);
3803	}
3804
3805	static struct pernet_operations net_inuse_ops = {
3806	.init = sock_inuse_init_net,
3807	.exit = sock_inuse_exit_net,
3808	};
3809
3810	static __init int net_inuse_init(void)
3811	{
3812	if (register_pernet_subsys(&net_inuse_ops))
3813	panic(fmt: "Cannot initialize net inuse counters");
3814
3815	return `0`;
3816	}
3817
3818	core_initcall(net_inuse_init);
3819
3820	static int assign_proto_idx(struct proto *prot)
3821	{
3822	prot->inuse_idx = find_first_zero_bit(addr: proto_inuse_idx, PROTO_INUSE_NR);
3823
3824	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - `1`)) {
3825	pr_err("PROTO_INUSE_NR exhausted\n");
3826	return -ENOSPC;
3827	}
3828
3829	set_bit(nr: prot->inuse_idx, addr: proto_inuse_idx);
3830	return `0`;
3831	}
3832
3833	static void release_proto_idx(struct proto *prot)
3834	{
3835	if (prot->inuse_idx != PROTO_INUSE_NR - `1`)
3836	clear_bit(nr: prot->inuse_idx, addr: proto_inuse_idx);
3837	}
3838	#else
/* !CONFIG_PROC_FS stub: no index bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3843
/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3847
3848	#endif
3849
3850	static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3851	{
3852	if (!twsk_prot)
3853	return;
3854	kfree(objp: twsk_prot->twsk_slab_name);
3855	twsk_prot->twsk_slab_name = NULL;
3856	kmem_cache_destroy(s: twsk_prot->twsk_slab);
3857	twsk_prot->twsk_slab = NULL;
3858	}
3859
3860	static int tw_prot_init(const struct proto *prot)
3861	{
3862	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3863
3864	if (!twsk_prot)
3865	return `0`;
3866
3867	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, fmt: "tw_sock_%s",
3868	prot->name);
3869	if (!twsk_prot->twsk_slab_name)
3870	return -ENOMEM;
3871
3872	twsk_prot->twsk_slab =
3873	kmem_cache_create(name: twsk_prot->twsk_slab_name,
3874	size: twsk_prot->twsk_obj_size, align: `0`,
3875	SLAB_ACCOUNT \| prot->slab_flags,
3876	NULL);
3877	if (!twsk_prot->twsk_slab) {
3878	pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3879	prot->name);
3880	return -ENOMEM;
3881	}
3882
3883	return `0`;
3884	}
3885
3886	static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3887	{
3888	if (!rsk_prot)
3889	return;
3890	kfree(objp: rsk_prot->slab_name);
3891	rsk_prot->slab_name = NULL;
3892	kmem_cache_destroy(s: rsk_prot->slab);
3893	rsk_prot->slab = NULL;
3894	}
3895
3896	static int req_prot_init(const struct proto *prot)
3897	{
3898	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3899
3900	if (!rsk_prot)
3901	return `0`;
3902
3903	rsk_prot->slab_name = kasprintf(GFP_KERNEL, fmt: "request_sock_%s",
3904	prot->name);
3905	if (!rsk_prot->slab_name)
3906	return -ENOMEM;
3907
3908	rsk_prot->slab = kmem_cache_create(name: rsk_prot->slab_name,
3909	size: rsk_prot->obj_size, align: `0`,
3910	SLAB_ACCOUNT \| prot->slab_flags,
3911	NULL);
3912
3913	if (!rsk_prot->slab) {
3914	pr_crit("%s: Can't create request sock SLAB cache!\n",
3915	prot->name);
3916	return -ENOMEM;
3917	}
3918	return `0`;
3919	}
3920
3921	int proto_register(struct proto prot, int* alloc_slab)
3922	{
3923	int ret = -ENOBUFS;
3924
3925	if (prot->memory_allocated && !prot->sysctl_mem) {
3926	pr_err("%s: missing sysctl_mem\n", prot->name);
3927	return -EINVAL;
3928	}
3929	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3930	pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3931	return -EINVAL;
3932	}
3933	if (alloc_slab) {
3934	prot->slab = kmem_cache_create_usercopy(name: prot->name,
3935	size: prot->obj_size, align: `0`,
3936	SLAB_HWCACHE_ALIGN \| SLAB_ACCOUNT \|
3937	prot->slab_flags,
3938	useroffset: prot->useroffset, usersize: prot->usersize,
3939	NULL);
3940
3941	if (prot->slab == NULL) {
3942	pr_crit("%s: Can't create sock SLAB cache!\n",
3943	prot->name);
3944	goto out;
3945	}
3946
3947	if (req_prot_init(prot))
3948	goto out_free_request_sock_slab;
3949
3950	if (tw_prot_init(prot))
3951	goto out_free_timewait_sock_slab;
3952	}
3953
3954	mutex_lock(&proto_list_mutex);
3955	ret = assign_proto_idx(prot);
3956	if (ret) {
3957	mutex_unlock(lock: &proto_list_mutex);
3958	goto out_free_timewait_sock_slab;
3959	}
3960	list_add(new: &prot->node, head: &proto_list);
3961	mutex_unlock(lock: &proto_list_mutex);
3962	return ret;
3963
3964	out_free_timewait_sock_slab:
3965	if (alloc_slab)
3966	tw_prot_cleanup(twsk_prot: prot->twsk_prot);
3967	out_free_request_sock_slab:
3968	if (alloc_slab) {
3969	req_prot_cleanup(rsk_prot: prot->rsk_prot);
3970
3971	kmem_cache_destroy(s: prot->slab);
3972	prot->slab = NULL;
3973	}
3974	out:
3975	return ret;
3976	}
3977	EXPORT_SYMBOL(proto_register);
3978
3979	void proto_unregister(struct proto *prot)
3980	{
3981	mutex_lock(&proto_list_mutex);
3982	release_proto_idx(prot);
3983	list_del(entry: &prot->node);
3984	mutex_unlock(lock: &proto_list_mutex);
3985
3986	kmem_cache_destroy(s: prot->slab);
3987	prot->slab = NULL;
3988
3989	req_prot_cleanup(rsk_prot: prot->rsk_prot);
3990	tw_prot_cleanup(twsk_prot: prot->twsk_prot);
3991	}
3992	EXPORT_SYMBOL(proto_unregister);
3993
3994	int sock_load_diag_module(int family, int protocol)
3995	{
3996	if (!protocol) {
3997	if (!sock_is_registered(family))
3998	return -ENOENT;
3999
4000	return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4001	NETLINK_SOCK_DIAG, family);
4002	}
4003
4004	#ifdef CONFIG_INET
4005	if (family == AF_INET &&
4006	protocol != IPPROTO_RAW &&
4007	protocol < MAX_INET_PROTOS &&
4008	!rcu_access_pointer(inet_protos[protocol]))
4009	return -ENOENT;
4010	#endif
4011
4012	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4013	NETLINK_SOCK_DIAG, family, protocol);
4014	}
4015	EXPORT_SYMBOL(sock_load_diag_module);
4016
4017	#ifdef CONFIG_PROC_FS
4018	static void proto_seq_start(struct* seq_file seq, loff_t pos)
4019	__acquires(proto_list_mutex)
4020	{
4021	mutex_lock(&proto_list_mutex);
4022	return seq_list_start_head(head: &proto_list, pos: *pos);
4023	}
4024
4025	static void proto_seq_next(struct* seq_file seq, void* v, loff_t pos)
4026	{
4027	return seq_list_next(v, head: &proto_list, ppos: pos);
4028	}
4029
4030	static void proto_seq_stop(struct seq_file seq, void* *v)
4031	__releases(proto_list_mutex)
4032	{
4033	mutex_unlock(lock: &proto_list_mutex);
4034	}
4035
/* Map a method pointer to 'y' (non-NULL) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
4040	static long sock_prot_memory_allocated(struct proto *proto)
4041	{
4042	return proto->memory_allocated != NULL ? proto_memory_allocated(prot: proto) : -`1L`;
4043	}
4044
4045	static const char sock_prot_memory_pressure(struct* proto *proto)
4046	{
4047	return proto->memory_pressure != NULL ?
4048	proto_memory_pressure(prot: proto) ? "yes" : "no" : "NI";
4049	}
4050
4051	static void proto_seq_printf(struct seq_file seq, struct* proto *proto)
4052	{
4053
4054	seq_printf(m: seq, fmt: "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4055	"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4056	proto->name,
4057	proto->obj_size,
4058	sock_prot_inuse_get(seq_file_net(seq), proto),
4059	sock_prot_memory_allocated(proto),
4060	sock_prot_memory_pressure(proto),
4061	proto->max_header,
4062	proto->slab == NULL ? "no" : "yes",
4063	module_name(proto->owner),
4064	proto_method_implemented(method: proto->close),
4065	proto_method_implemented(method: proto->connect),
4066	proto_method_implemented(method: proto->disconnect),
4067	proto_method_implemented(method: proto->accept),
4068	proto_method_implemented(method: proto->ioctl),
4069	proto_method_implemented(method: proto->init),
4070	proto_method_implemented(method: proto->destroy),
4071	proto_method_implemented(method: proto->shutdown),
4072	proto_method_implemented(method: proto->setsockopt),
4073	proto_method_implemented(method: proto->getsockopt),
4074	proto_method_implemented(method: proto->sendmsg),
4075	proto_method_implemented(method: proto->recvmsg),
4076	proto_method_implemented(method: proto->bind),
4077	proto_method_implemented(method: proto->backlog_rcv),
4078	proto_method_implemented(method: proto->hash),
4079	proto_method_implemented(method: proto->unhash),
4080	proto_method_implemented(method: proto->get_port),
4081	proto_method_implemented(method: proto->enter_memory_pressure));
4082	}
4083
4084	static int proto_seq_show(struct seq_file seq, void* *v)
4085	{
4086	if (v == &proto_list)
4087	seq_printf(m: seq, fmt: "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4088	"protocol",
4089	"size",
4090	"sockets",
4091	"memory",
4092	"press",
4093	"maxhdr",
4094	"slab",
4095	"module",
4096	"cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4097	else
4098	proto_seq_printf(seq, list_entry(v, struct proto, node));
4099	return `0`;
4100	}
4101
4102	static const struct seq_operations proto_seq_ops = {
4103	.start = proto_seq_start,
4104	.next = proto_seq_next,
4105	.stop = proto_seq_stop,
4106	.show = proto_seq_show,
4107	};
4108
4109	static __net_init int proto_init_net(struct net *net)
4110	{
4111	if (!proc_create_net("protocols", `0444`, net->proc_net, &proto_seq_ops,
4112	sizeof(struct seq_net_private)))
4113	return -ENOMEM;
4114
4115	return `0`;
4116	}
4117
4118	static __net_exit void proto_exit_net(struct net *net)
4119	{
4120	remove_proc_entry("protocols", net->proc_net);
4121	}
4122
4123
4124	static __net_initdata struct pernet_operations proto_net_ops = {
4125	.init = proto_init_net,
4126	.exit = proto_exit_net,
4127	};
4128
4129	static int __init proto_init(void)
4130	{
4131	return register_pernet_subsys(&proto_net_ops);
4132	}
4133
4134	subsys_initcall(proto_init);
4135
4136	#endif /* PROC_FS */
4137
4138	#ifdef CONFIG_NET_RX_BUSY_POLL
4139	bool sk_busy_loop_end(void p, unsigned* long start_time)
4140	{
4141	struct sock *sk = p;
4142
4143	return !skb_queue_empty_lockless(list: &sk->sk_receive_queue) \|\|
4144	sk_busy_loop_timeout(sk, start_time);
4145	}
4146	EXPORT_SYMBOL(sk_busy_loop_end);
4147	#endif /* CONFIG_NET_RX_BUSY_POLL */
4148
4149	int sock_bind_add(struct sock sk, struct* sockaddr addr, int* addr_len)
4150	{
4151	if (!sk->sk_prot->bind_add)
4152	return -EOPNOTSUPP;
4153	return sk->sk_prot->bind_add(sk, addr, addr_len);
4154	}
4155	EXPORT_SYMBOL(sock_bind_add);
4156
4157	/ Copy 'size' bytes from userspace and return `size` back to userspace /
4158	int sock_ioctl_inout(struct sock sk, unsigned* int cmd,
4159	void __user arg, void* *karg, size_t size)
4160	{
4161	int ret;
4162
4163	if (copy_from_user(to: karg, from: arg, n: size))
4164	return -EFAULT;
4165
4166	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4167	if (ret)
4168	return ret;
4169
4170	if (copy_to_user(to: arg, from: karg, n: size))
4171	return -EFAULT;
4172
4173	return `0`;
4174	}
4175	EXPORT_SYMBOL(sock_ioctl_inout);
4176
4177	/ This is the most common ioctl prep function, where the result (4 bytes) is*
4178	* copied back to userspace if the ioctl() returns successfully. No input is
4179	* copied from userspace as input argument.
4180	*/
4181	static int sock_ioctl_out(struct sock sk, unsigned* int cmd, void __user *arg)
4182	{
4183	int ret, karg = `0`;
4184
4185	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4186	if (ret)
4187	return ret;
4188
4189	return put_user(karg, (int __user *)arg);
4190	}
4191
4192	/ A wrapper around sock ioctls, which copies the data from userspace*
4193	* (depending on the protocol/ioctl), and copies back the result to userspace.
4194	* The main motivation for this function is to pass kernel memory to the
4195	* protocol ioctl callbacks, instead of userspace memory.
4196	*/
4197	int sk_ioctl(struct sock sk, unsigned* int cmd, void __user *arg)
4198	{
4199	int rc = `1`;
4200
4201	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4202	rc = ipmr_sk_ioctl(sk, cmd, arg);
4203	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4204	rc = ip6mr_sk_ioctl(sk, cmd, arg);
4205	else if (sk_is_phonet(sk))
4206	rc = phonet_sk_ioctl(sk, cmd, arg);
4207
4208	/ If ioctl was processed, returns its value /
4209	if (rc <= `0`)
4210	return rc;
4211
4212	/ Otherwise call the default handler /
4213	return sock_ioctl_out(sk, cmd, arg);
4214	}
4215	EXPORT_SYMBOL(sk_ioctl);
4216

source code of linux/net/core/sock.c