af_unix.c source code [linux/net/unix/af_unix.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* NET4: Implementation of BSD Unix domain sockets.
4	*
5	* Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
6	*
7	* Fixes:
8	* Linus Torvalds : Assorted bug cures.
9	* Niibe Yutaka : async I/O support.
10	* Carsten Paeth : PF_UNIX check, address fixes.
11	* Alan Cox : Limit size of allocated blocks.
12	* Alan Cox : Fixed the stupid socketpair bug.
13	* Alan Cox : BSD compatibility fine tuning.
14	* Alan Cox : Fixed a bug in connect when interrupted.
15	* Alan Cox : Sorted out a proper draft version of
16	* file descriptor passing hacked up from
17	* Mike Shaver's work.
18	* Marty Leisner : Fixes to fd passing
19	* Nick Nevin : recvmsg bugfix.
20	* Alan Cox : Started proper garbage collector
21	* Heiko EiBfeldt : Missing verify_area check
22	* Alan Cox : Started POSIXisms
23	* Andreas Schwab : Replace inode by dentry for proper
24	* reference counting
25	* Kirk Petersen : Made this a module
26	* Christoph Rohland : Elegant non-blocking accept/connect algorithm.
27	* Lots of bug fixes.
28	* Alexey Kuznetosv : Repaired (I hope) bugs introduces
29	* by above two patches.
30	* Andrea Arcangeli : If possible we block in connect(2)
31	* if the max backlog of the listen socket
32	* is been reached. This won't break
33	* old apps and it will avoid huge amount
34	* of socks hashed (this for unix_gc()
35	* performances reasons).
36	* Security fix that limits the max
37	* number of socks to 2*max_files and
38	* the number of skb queueable in the
39	* dgram receiver.
40	* Artur Skawina : Hash function optimizations
41	* Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42	* Malcolm Beattie : Set peercred for socketpair
43	* Michal Ostrowski : Module initialization cleanup.
44	* Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45	* the core infrastructure is doing that
46	* for all net proto families now (2.5.69+)
47	*
48	* Known differences from reference BSD that was tested:
49	*
50	* [TO FIX]
51	* ECONNREFUSED is not returned from one end of a connected() socket to the
52	* other the moment one end closes.
53	* fstat() doesn't return st_dev=0, and give the blksize as high water mark
54	* and a fake inode identifier (nor the BSD first socket fstat twice bug).
55	* [NOT TO FIX]
56	* accept() returns a path name even if the connecting socket has closed
57	* in the meantime (BSD loses the path and gives up).
58	* accept() returns 0 length path for an unbound connector. BSD returns 16
59	* and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60	* socketpair(...SOCK_RAW..) doesn't panic the kernel.
61	* BSD af_unix apparently has connect forgetting to block properly.
62	* (need to check this with the POSIX spec in detail)
63	*
64	* Differences from 2.0.0-11-... (ANK)
65	* Bug fixes and improvements.
66	* - client shutdown killed server socket.
67	* - removed all useless cli/sti pairs.
68	*
69	* Semantic changes/extensions.
70	* - generic control message passing.
71	* - SCM_CREDENTIALS control message.
72	* - "Abstract" (not FS based) socket bindings.
73	* Abstract names are sequences of bytes (not zero terminated)
74	* started by 0, so that this name space does not intersect
75	* with BSD names.
76	*/
77
78	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79
80	#include <linux/module.h>
81	#include <linux/kernel.h>
82	#include <linux/signal.h>
83	#include <linux/sched/signal.h>
84	#include <linux/errno.h>
85	#include <linux/string.h>
86	#include <linux/stat.h>
87	#include <linux/dcache.h>
88	#include <linux/namei.h>
89	#include <linux/socket.h>
90	#include <linux/un.h>
91	#include <linux/fcntl.h>
92	#include <linux/filter.h>
93	#include <linux/termios.h>
94	#include <linux/sockios.h>
95	#include <linux/net.h>
96	#include <linux/in.h>
97	#include <linux/fs.h>
98	#include <linux/slab.h>
99	#include <linux/uaccess.h>
100	#include <linux/skbuff.h>
101	#include <linux/netdevice.h>
102	#include <net/net_namespace.h>
103	#include <net/sock.h>
104	#include <net/tcp_states.h>
105	#include <net/af_unix.h>
106	#include <linux/proc_fs.h>
107	#include <linux/seq_file.h>
108	#include <net/scm.h>
109	#include <linux/init.h>
110	#include <linux/poll.h>
111	#include <linux/rtnetlink.h>
112	#include <linux/mount.h>
113	#include <net/checksum.h>
114	#include <linux/security.h>
115	#include <linux/splice.h>
116	#include <linux/freezer.h>
117	#include <linux/file.h>
118	#include <linux/btf_ids.h>
119	#include <linux/bpf-cgroup.h>
120
121	#include "scm.h"
122
123	static atomic_long_t unix_nr_socks;
124	static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / `2`];
125	static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / `2`];
126
/* SMP locking strategy:
 *    hash table is protected with spinlock.
 *    each socket state is protected by separate spinlock.
 */
131
132	static unsigned int unix_unbound_hash(struct sock *sk)
133	{
134	unsigned long hash = (unsigned long)sk;
135
136	hash ^= hash >> `16`;
137	hash ^= hash >> `8`;
138	hash ^= sk->sk_type;
139
140	return hash & UNIX_HASH_MOD;
141	}
142
143	static unsigned int unix_bsd_hash(struct inode *i)
144	{
145	return i->i_ino & UNIX_HASH_MOD;
146	}
147
148	static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
149	int addr_len, int type)
150	{
151	__wsum csum = csum_partial(buff: sunaddr, len: addr_len, sum: `0`);
152	unsigned int hash;
153
154	hash = (__force unsigned int)csum_fold(sum: csum);
155	hash ^= hash >> `8`;
156	hash ^= type;
157
158	return UNIX_HASH_MOD + `1` + (hash & UNIX_HASH_MOD);
159	}
160
161	static void unix_table_double_lock(struct net *net,
162	unsigned int hash1, unsigned int hash2)
163	{
164	if (hash1 == hash2) {
165	spin_lock(lock: &net->unx.table.locks[hash1]);
166	return;
167	}
168
169	if (hash1 > hash2)
170	swap(hash1, hash2);
171
172	spin_lock(lock: &net->unx.table.locks[hash1]);
173	spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
174	}
175
176	static void unix_table_double_unlock(struct net *net,
177	unsigned int hash1, unsigned int hash2)
178	{
179	if (hash1 == hash2) {
180	spin_unlock(lock: &net->unx.table.locks[hash1]);
181	return;
182	}
183
184	spin_unlock(lock: &net->unx.table.locks[hash1]);
185	spin_unlock(lock: &net->unx.table.locks[hash2]);
186	}
187
188	#ifdef CONFIG_SECURITY_NETWORK
189	static void unix_get_secdata(struct scm_cookie scm, struct* sk_buff *skb)
190	{
191	UNIXCB(skb).secid = scm->secid;
192	}
193
194	static inline void unix_set_secdata(struct scm_cookie scm, struct* sk_buff *skb)
195	{
196	scm->secid = UNIXCB(skb).secid;
197	}
198
199	static inline bool unix_secdata_eq(struct scm_cookie scm, struct* sk_buff *skb)
200	{
201	return (scm->secid == UNIXCB(skb).secid);
202	}
203	#else
204	static inline void unix_get_secdata(struct scm_cookie scm, struct* sk_buff *skb)
205	{ }
206
207	static inline void unix_set_secdata(struct scm_cookie scm, struct* sk_buff *skb)
208	{ }
209
210	static inline bool unix_secdata_eq(struct scm_cookie scm, struct* sk_buff *skb)
211	{
212	return true;
213	}
214	#endif /* CONFIG_SECURITY_NETWORK */
215
/* Peer pointer of a unix socket; expands to an lvalue, so it can be assigned. */
#define unix_peer(sk) (unix_sk(sk)->peer)
217
/* Non-zero when @osk's peer is @sk. */
static inline int unix_our_peer(struct sock *sk, struct sock *osk)
{
	return unix_peer(osk) == sk;
}
222
223	static inline int unix_may_send(struct sock sk, struct* sock *osk)
224	{
225	return unix_peer(osk) == NULL \|\| unix_our_peer(sk, osk);
226	}
227
228	static inline int unix_recvq_full(const struct sock *sk)
229	{
230	return skb_queue_len(list_: &sk->sk_receive_queue) > sk->sk_max_ack_backlog;
231	}
232
233	static inline int unix_recvq_full_lockless(const struct sock *sk)
234	{
235	return skb_queue_len_lockless(list_: &sk->sk_receive_queue) >
236	READ_ONCE(sk->sk_max_ack_backlog);
237	}
238
/* Return @s's peer with an extra reference held, or NULL if @s has no peer.
 * The peer pointer is sampled under @s's state lock; the caller owns the
 * returned reference.
 */
struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);
	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);
251
252	static struct unix_address unix_create_addr(struct* sockaddr_un *sunaddr,
253	int addr_len)
254	{
255	struct unix_address *addr;
256
257	addr = kmalloc(size: sizeof(*addr) + addr_len, GFP_KERNEL);
258	if (!addr)
259	return NULL;
260
261	refcount_set(r: &addr->refcnt, n: `1`);
262	addr->len = addr_len;
263	memcpy(addr->name, sunaddr, addr_len);
264
265	return addr;
266	}
267
268	static inline void unix_release_addr(struct unix_address *addr)
269	{
270	if (refcount_dec_and_test(r: &addr->refcnt))
271	kfree(objp: addr);
272	}
273
/*
 *	Check unix socket name:
 *		- should be not zero length.
 *	        - if started by not zero, should be NULL terminated (FS object)
 *		- if started by zero, it is abstract name.
 *
 *	This helper only enforces the length bounds and the address family:
 *	@addr_len must cover at least one byte of sun_path and must not
 *	exceed sizeof(struct sockaddr_un).
 *
 *	Returns 0 if the address is acceptable, -EINVAL otherwise.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}
292
293	static int unix_mkname_bsd(struct sockaddr_un sunaddr, int* addr_len)
294	{
295	struct sockaddr_storage addr = (struct* sockaddr_storage *)sunaddr;
296	short offset = offsetof(struct sockaddr_storage, __data);
297
298	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
299
300	/ This may look like an off by one error but it is a bit more*
301	* subtle. 108 is the longest valid AF_UNIX path for a binding.
302	* sun_path[108] doesn't as such exist. However in kernel space
303	* we are guaranteed that it is a valid memory location in our
304	* kernel address buffer because syscall functions always pass
305	* a pointer of struct sockaddr_storage which has a bigger buffer
306	* than 108. Also, we must terminate sun_path for strlen() in
307	* getname_kernel().
308	*/
309	addr->__data[addr_len - offset] = `0`;
310
311	/ Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will*
312	* cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen()
313	* know the actual buffer.
314	*/
315	return strlen(addr->__data) + offset + `1`;
316	}
317
/* Unlink @sk from its hash bucket.  No locking is done here; see
 * unix_remove_socket() for the locked wrapper.
 */
static void __unix_remove_socket(struct sock *sk)
{
	sk_del_node_init(sk);
}
322
323	static void __unix_insert_socket(struct net net, struct* sock *sk)
324	{
325	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
326	sk_add_node(sk, list: &net->unx.table.buckets[sk->sk_hash]);
327	}
328
329	static void __unix_set_addr_hash(struct net net, struct* sock *sk,
330	struct unix_address addr, unsigned* int hash)
331	{
332	__unix_remove_socket(sk);
333	smp_store_release(&unix_sk(sk)->addr, addr);
334
335	sk->sk_hash = hash;
336	__unix_insert_socket(net, sk);
337	}
338
339	static void unix_remove_socket(struct net net, struct* sock *sk)
340	{
341	spin_lock(lock: &net->unx.table.locks[sk->sk_hash]);
342	__unix_remove_socket(sk);
343	spin_unlock(lock: &net->unx.table.locks[sk->sk_hash]);
344	}
345
346	static void unix_insert_unbound_socket(struct net net, struct* sock *sk)
347	{
348	spin_lock(lock: &net->unx.table.locks[sk->sk_hash]);
349	__unix_insert_socket(net, sk);
350	spin_unlock(lock: &net->unx.table.locks[sk->sk_hash]);
351	}
352
353	static void unix_insert_bsd_socket(struct sock *sk)
354	{
355	spin_lock(lock: &bsd_socket_locks[sk->sk_hash]);
356	sk_add_bind_node(sk, list: &bsd_socket_buckets[sk->sk_hash]);
357	spin_unlock(lock: &bsd_socket_locks[sk->sk_hash]);
358	}
359
360	static void unix_remove_bsd_socket(struct sock *sk)
361	{
362	if (!hlist_unhashed(h: &sk->sk_bind_node)) {
363	spin_lock(lock: &bsd_socket_locks[sk->sk_hash]);
364	__sk_del_bind_node(sk);
365	spin_unlock(lock: &bsd_socket_locks[sk->sk_hash]);
366
367	sk_node_init(node: &sk->sk_bind_node);
368	}
369	}
370
371	static struct sock __unix_find_socket_byname(struct* net *net,
372	struct sockaddr_un *sunname,
373	int len, unsigned int hash)
374	{
375	struct sock *s;
376
377	sk_for_each(s, &net->unx.table.buckets[hash]) {
378	struct unix_sock *u = unix_sk(s);
379
380	if (u->addr->len == len &&
381	!memcmp(p: u->addr->name, q: sunname, size: len))
382	return s;
383	}
384	return NULL;
385	}
386
387	static inline struct sock unix_find_socket_byname(struct* net *net,
388	struct sockaddr_un *sunname,
389	int len, unsigned int hash)
390	{
391	struct sock *s;
392
393	spin_lock(lock: &net->unx.table.locks[hash]);
394	s = __unix_find_socket_byname(net, sunname, len, hash);
395	if (s)
396	sock_hold(sk: s);
397	spin_unlock(lock: &net->unx.table.locks[hash]);
398	return s;
399	}
400
401	static struct sock unix_find_socket_byinode(struct* inode *i)
402	{
403	unsigned int hash = unix_bsd_hash(i);
404	struct sock *s;
405
406	spin_lock(lock: &bsd_socket_locks[hash]);
407	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
408	struct dentry *dentry = unix_sk(s)->path.dentry;
409
410	if (dentry && d_backing_inode(upper: dentry) == i) {
411	sock_hold(sk: s);
412	spin_unlock(lock: &bsd_socket_locks[hash]);
413	return s;
414	}
415	}
416	spin_unlock(lock: &bsd_socket_locks[hash]);
417	return NULL;
418	}
419
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hit the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
 */
444
445	static int unix_dgram_peer_wake_relay(wait_queue_entry_t q, unsigned* mode, int flags,
446	void *key)
447	{
448	struct unix_sock *u;
449	wait_queue_head_t *u_sleep;
450
451	u = container_of(q, struct unix_sock, peer_wake);
452
453	__remove_wait_queue(wq_head: &unix_sk(u->peer_wake.private)->peer_wait,
454	wq_entry: q);
455	u->peer_wake.private = NULL;
456
457	/ relaying can only happen while the wq still exists /
458	u_sleep = sk_sleep(sk: &u->sk);
459	if (u_sleep)
460	wake_up_interruptible_poll(u_sleep, key_to_poll(key));
461
462	return `0`;
463	}
464
465	static int unix_dgram_peer_wake_connect(struct sock sk, struct* sock *other)
466	{
467	struct unix_sock u, u_other;
468	int rc;
469
470	u = unix_sk(sk);
471	u_other = unix_sk(other);
472	rc = `0`;
473	spin_lock(lock: &u_other->peer_wait.lock);
474
475	if (!u->peer_wake.private) {
476	u->peer_wake.private = other;
477	__add_wait_queue(wq_head: &u_other->peer_wait, wq_entry: &u->peer_wake);
478
479	rc = `1`;
480	}
481
482	spin_unlock(lock: &u_other->peer_wait.lock);
483	return rc;
484	}
485
486	static void unix_dgram_peer_wake_disconnect(struct sock *sk,
487	struct sock *other)
488	{
489	struct unix_sock u, u_other;
490
491	u = unix_sk(sk);
492	u_other = unix_sk(other);
493	spin_lock(lock: &u_other->peer_wait.lock);
494
495	if (u->peer_wake.private == other) {
496	__remove_wait_queue(wq_head: &u_other->peer_wait, wq_entry: &u->peer_wake);
497	u->peer_wake.private = NULL;
498	}
499
500	spin_unlock(lock: &u_other->peer_wait.lock);
501	}
502
503	static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
504	struct sock *other)
505	{
506	unix_dgram_peer_wake_disconnect(sk, other);
507	wake_up_interruptible_poll(sk_sleep(sk),
508	EPOLLOUT \|
509	EPOLLWRNORM \|
510	EPOLLWRBAND);
511	}
512
513	/ preconditions:*
514	* - unix_peer(sk) == other
515	* - association is stable
516	*/
517	static int unix_dgram_peer_wake_me(struct sock sk, struct* sock *other)
518	{
519	int connected;
520
521	connected = unix_dgram_peer_wake_connect(sk, other);
522
523	/ If other is SOCK_DEAD, we want to make sure we signal*
524	* POLLOUT, such that a subsequent write() can get a
525	* -ECONNREFUSED. Otherwise, if we haven't queued any skbs
526	* to other and its full, we will hang waiting for POLLOUT.
527	*/
528	if (unix_recvq_full_lockless(sk: other) && !sock_flag(sk: other, flag: SOCK_DEAD))
529	return `1`;
530
531	if (connected)
532	unix_dgram_peer_wake_disconnect(sk, other);
533
534	return `0`;
535	}
536
537	static int unix_writable(const struct sock *sk)
538	{
539	return sk->sk_state != TCP_LISTEN &&
540	(refcount_read(r: &sk->sk_wmem_alloc) << `2`) <= sk->sk_sndbuf;
541	}
542
543	static void unix_write_space(struct sock *sk)
544	{
545	struct socket_wq *wq;
546
547	rcu_read_lock();
548	if (unix_writable(sk)) {
549	wq = rcu_dereference(sk->sk_wq);
550	if (skwq_has_sleeper(wq))
551	wake_up_interruptible_sync_poll(&wq->wait,
552	EPOLLOUT \| EPOLLWRNORM \| EPOLLWRBAND);
553	sk_wake_async(sk, how: SOCK_WAKE_SPACE, POLL_OUT);
554	}
555	rcu_read_unlock();
556	}
557
558	/ When dgram socket disconnects (or changes its peer), we clear its receive*
559	* queue of packets arrived from previous peer. First, it allows to do
560	* flow control based only on wmem_alloc; second, sk connected to peer
561	* may receive messages only from that peer. */
562	static void unix_dgram_disconnected(struct sock sk, struct* sock *other)
563	{
564	if (!skb_queue_empty(list: &sk->sk_receive_queue)) {
565	skb_queue_purge(list: &sk->sk_receive_queue);
566	wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
567
568	/ If one link of bidirectional dgram pipe is disconnected,*
569	* we signal error. Messages are lost. Do not make this,
570	* when peer was not connected to us.
571	*/
572	if (!sock_flag(sk: other, flag: SOCK_DEAD) && unix_peer(other) == sk) {
573	WRITE_ONCE(other->sk_err, ECONNRESET);
574	sk_error_report(sk: other);
575	}
576	}
577	other->sk_state = TCP_CLOSE;
578	}
579
580	static void unix_sock_destructor(struct sock *sk)
581	{
582	struct unix_sock *u = unix_sk(sk);
583
584	skb_queue_purge(list: &sk->sk_receive_queue);
585
586	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
587	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
588	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
589	if (!sock_flag(sk, flag: SOCK_DEAD)) {
590	pr_info("Attempt to release alive unix socket: %p\n", sk);
591	return;
592	}
593
594	if (u->addr)
595	unix_release_addr(addr: u->addr);
596
597	atomic_long_dec(v: &unix_nr_socks);
598	sock_prot_inuse_add(net: sock_net(sk), prot: sk->sk_prot, val: -`1`);
599	#ifdef UNIX_REFCNT_DEBUG
600	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
601	atomic_long_read(&unix_nr_socks));
602	#endif
603	}
604
605	static void unix_release_sock(struct sock sk, int* embrion)
606	{
607	struct unix_sock *u = unix_sk(sk);
608	struct sock *skpair;
609	struct sk_buff *skb;
610	struct path path;
611	int state;
612
613	unix_remove_socket(net: sock_net(sk), sk);
614	unix_remove_bsd_socket(sk);
615
616	/ Clear state /
617	unix_state_lock(sk);
618	sock_orphan(sk);
619	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
620	path = u->path;
621	u->path.dentry = NULL;
622	u->path.mnt = NULL;
623	state = sk->sk_state;
624	sk->sk_state = TCP_CLOSE;
625
626	skpair = unix_peer(sk);
627	unix_peer(sk) = NULL;
628
629	unix_state_unlock(sk);
630
631	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
632	if (u->oob_skb) {
633	kfree_skb(skb: u->oob_skb);
634	u->oob_skb = NULL;
635	}
636	#endif
637
638	wake_up_interruptible_all(&u->peer_wait);
639
640	if (skpair != NULL) {
641	if (sk->sk_type == SOCK_STREAM \|\| sk->sk_type == SOCK_SEQPACKET) {
642	unix_state_lock(skpair);
643	/ No more writes /
644	WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
645	if (!skb_queue_empty(list: &sk->sk_receive_queue) \|\| embrion)
646	WRITE_ONCE(skpair->sk_err, ECONNRESET);
647	unix_state_unlock(skpair);
648	skpair->sk_state_change(skpair);
649	sk_wake_async(sk: skpair, how: SOCK_WAKE_WAITD, POLL_HUP);
650	}
651
652	unix_dgram_peer_wake_disconnect(sk, other: skpair);
653	sock_put(sk: skpair); / It may now die /
654	}
655
656	/ Try to flush out this socket. Throw out buffers at least /
657
658	while ((skb = skb_dequeue(list: &sk->sk_receive_queue)) != NULL) {
659	if (state == TCP_LISTEN)
660	unix_release_sock(sk: skb->sk, embrion: `1`);
661	/ passed fds are erased in the kfree_skb hook /
662	UNIXCB(skb).consumed = skb->len;
663	kfree_skb(skb);
664	}
665
666	if (path.dentry)
667	path_put(&path);
668
669	sock_put(sk);
670
671	/ ---- Socket is dead now and most probably destroyed ---- /
672
673	/*
674	* Fixme: BSD difference: In BSD all sockets connected to us get
675	* ECONNRESET and we die on the spot. In Linux we behave
676	* like files and pipes do and wait for the last
677	* dereference.
678	*
679	* Can't we simply set sock->err?
680	*
681	* What the above comment does talk about? --ANK(980817)
682	*/
683
684	if (READ_ONCE(unix_tot_inflight))
685	unix_gc(); / Garbage collect fds /
686	}
687
688	static void init_peercred(struct sock *sk)
689	{
690	const struct cred *old_cred;
691	struct pid *old_pid;
692
693	spin_lock(lock: &sk->sk_peer_lock);
694	old_pid = sk->sk_peer_pid;
695	old_cred = sk->sk_peer_cred;
696	sk->sk_peer_pid = get_pid(pid: task_tgid(current));
697	sk->sk_peer_cred = get_current_cred();
698	spin_unlock(lock: &sk->sk_peer_lock);
699
700	put_pid(pid: old_pid);
701	put_cred(cred: old_cred);
702	}
703
704	static void copy_peercred(struct sock sk, struct* sock *peersk)
705	{
706	const struct cred *old_cred;
707	struct pid *old_pid;
708
709	if (sk < peersk) {
710	spin_lock(lock: &sk->sk_peer_lock);
711	spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
712	} else {
713	spin_lock(lock: &peersk->sk_peer_lock);
714	spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
715	}
716	old_pid = sk->sk_peer_pid;
717	old_cred = sk->sk_peer_cred;
718	sk->sk_peer_pid = get_pid(pid: peersk->sk_peer_pid);
719	sk->sk_peer_cred = get_cred(cred: peersk->sk_peer_cred);
720
721	spin_unlock(lock: &sk->sk_peer_lock);
722	spin_unlock(lock: &peersk->sk_peer_lock);
723
724	put_pid(pid: old_pid);
725	put_cred(cred: old_cred);
726	}
727
728	static int unix_listen(struct socket sock, int* backlog)
729	{
730	int err;
731	struct sock *sk = sock->sk;
732	struct unix_sock *u = unix_sk(sk);
733
734	err = -EOPNOTSUPP;
735	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
736	goto out; / Only stream/seqpacket sockets accept /
737	err = -EINVAL;
738	if (!u->addr)
739	goto out; / No listens on an unbound socket /
740	unix_state_lock(sk);
741	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
742	goto out_unlock;
743	if (backlog > sk->sk_max_ack_backlog)
744	wake_up_interruptible_all(&u->peer_wait);
745	sk->sk_max_ack_backlog = backlog;
746	sk->sk_state = TCP_LISTEN;
747	/ set credentials so connect can copy them /
748	init_peercred(sk);
749	err = `0`;
750
751	out_unlock:
752	unix_state_unlock(sk);
753	out:
754	return err;
755	}
756
757	static int unix_release(struct socket *);
758	static int unix_bind(struct socket , struct* sockaddr , int*);
759	static int unix_stream_connect(struct socket , struct* sockaddr *,
760	int addr_len, int flags);
761	static int unix_socketpair(struct socket , struct* socket *);
762	static int unix_accept(struct socket , struct* socket , int*, bool);
763	static int unix_getname(struct socket , struct* sockaddr , int*);
764	static __poll_t unix_poll(struct file , struct* socket , poll_table );
765	static __poll_t unix_dgram_poll(struct file , struct* socket *,
766	poll_table *);
767	static int unix_ioctl(struct socket , unsigned* int, unsigned long);
768	#ifdef CONFIG_COMPAT
769	static int unix_compat_ioctl(struct socket sock, unsigned* int cmd, unsigned long arg);
770	#endif
771	static int unix_shutdown(struct socket , int*);
772	static int unix_stream_sendmsg(struct socket , struct* msghdr *, size_t);
773	static int unix_stream_recvmsg(struct socket , struct* msghdr , size_t, int*);
774	static ssize_t unix_stream_splice_read(struct socket , loff_t ppos,
775	struct pipe_inode_info *, size_t size,
776	unsigned int flags);
777	static int unix_dgram_sendmsg(struct socket , struct* msghdr *, size_t);
778	static int unix_dgram_recvmsg(struct socket , struct* msghdr , size_t, int*);
779	static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
780	static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
781	static int unix_dgram_connect(struct socket , struct* sockaddr *,
782	int, int);
783	static int unix_seqpacket_sendmsg(struct socket , struct* msghdr *, size_t);
784	static int unix_seqpacket_recvmsg(struct socket , struct* msghdr *, size_t,
785	int);
786
787	static int unix_set_peek_off(struct sock sk, int* val)
788	{
789	struct unix_sock *u = unix_sk(sk);
790
791	if (mutex_lock_interruptible(&u->iolock))
792	return -EINTR;
793
794	WRITE_ONCE(sk->sk_peek_off, val);
795	mutex_unlock(lock: &u->iolock);
796
797	return `0`;
798	}
799
#ifdef CONFIG_PROC_FS
/* Sum scm_stat.nr_fds over the sockets that own the skbs queued on @sk's
 * receive queue.  The queue is walked under its own spinlock.  Used for
 * listening sockets by unix_show_fdinfo().
 */
static int unix_count_nr_fds(struct sock *sk)
{
	struct sk_buff *skb;
	struct unix_sock *u;
	int nr_fds = 0;

	spin_lock(&sk->sk_receive_queue.lock);
	skb = skb_peek(&sk->sk_receive_queue);
	while (skb) {
		u = unix_sk(skb->sk);
		nr_fds += atomic_read(&u->scm_stat.nr_fds);
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return nr_fds;
}

/* show_fdinfo handler: print an "scm_fds" line for @sock.
 *
 * Datagram and established sockets report their own scm_stat counter,
 * listening sockets report the sum from unix_count_nr_fds(), anything
 * else reports 0.  Nothing is printed when @sock has no sk.
 */
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
	struct sock *sk = sock->sk;
	unsigned char s_state;
	struct unix_sock *u;
	int nr_fds = 0;

	if (sk) {
		s_state = READ_ONCE(sk->sk_state);
		u = unix_sk(sk);

		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
		 * SOCK_DGRAM is ordinary. So, no lock is needed.
		 */
		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
			nr_fds = atomic_read(&u->scm_stat.nr_fds);
		else if (s_state == TCP_LISTEN)
			nr_fds = unix_count_nr_fds(sk);

		seq_printf(m, "scm_fds: %u\n", nr_fds);
	}
}
#else
#define unix_show_fdinfo NULL
#endif
845
846	static const struct proto_ops unix_stream_ops = {
847	.family = PF_UNIX,
848	.owner = THIS_MODULE,
849	.release = unix_release,
850	.bind = unix_bind,
851	.connect = unix_stream_connect,
852	.socketpair = unix_socketpair,
853	.accept = unix_accept,
854	.getname = unix_getname,
855	.poll = unix_poll,
856	.ioctl = unix_ioctl,
857	#ifdef CONFIG_COMPAT
858	.compat_ioctl = unix_compat_ioctl,
859	#endif
860	.listen = unix_listen,
861	.shutdown = unix_shutdown,
862	.sendmsg = unix_stream_sendmsg,
863	.recvmsg = unix_stream_recvmsg,
864	.read_skb = unix_stream_read_skb,
865	.mmap = sock_no_mmap,
866	.splice_read = unix_stream_splice_read,
867	.set_peek_off = unix_set_peek_off,
868	.show_fdinfo = unix_show_fdinfo,
869	};
870
871	static const struct proto_ops unix_dgram_ops = {
872	.family = PF_UNIX,
873	.owner = THIS_MODULE,
874	.release = unix_release,
875	.bind = unix_bind,
876	.connect = unix_dgram_connect,
877	.socketpair = unix_socketpair,
878	.accept = sock_no_accept,
879	.getname = unix_getname,
880	.poll = unix_dgram_poll,
881	.ioctl = unix_ioctl,
882	#ifdef CONFIG_COMPAT
883	.compat_ioctl = unix_compat_ioctl,
884	#endif
885	.listen = sock_no_listen,
886	.shutdown = unix_shutdown,
887	.sendmsg = unix_dgram_sendmsg,
888	.read_skb = unix_read_skb,
889	.recvmsg = unix_dgram_recvmsg,
890	.mmap = sock_no_mmap,
891	.set_peek_off = unix_set_peek_off,
892	.show_fdinfo = unix_show_fdinfo,
893	};
894
895	static const struct proto_ops unix_seqpacket_ops = {
896	.family = PF_UNIX,
897	.owner = THIS_MODULE,
898	.release = unix_release,
899	.bind = unix_bind,
900	.connect = unix_stream_connect,
901	.socketpair = unix_socketpair,
902	.accept = unix_accept,
903	.getname = unix_getname,
904	.poll = unix_dgram_poll,
905	.ioctl = unix_ioctl,
906	#ifdef CONFIG_COMPAT
907	.compat_ioctl = unix_compat_ioctl,
908	#endif
909	.listen = unix_listen,
910	.shutdown = unix_shutdown,
911	.sendmsg = unix_seqpacket_sendmsg,
912	.recvmsg = unix_seqpacket_recvmsg,
913	.mmap = sock_no_mmap,
914	.set_peek_off = unix_set_peek_off,
915	.show_fdinfo = unix_show_fdinfo,
916	};
917
/* Intentionally empty ->close() hook; it is invoked from unix_release(). */
static void unix_close(struct sock *sk, long timeout)
{
	/* Nothing to do here, unix socket does not need a ->close().
	 * This is merely for sockmap.
	 */
}
924
/* Intentionally empty ->unhash() hook, wired into unix_stream_proto. */
static void unix_unhash(struct sock *sk)
{
	/* Nothing to do here, unix socket does not need a ->unhash().
	 * This is merely for sockmap.
	 */
}
931
/* ->bpf_bypass_getsockopt() hook: returns true only for the
 * (SOL_SOCKET, SO_PEERPIDFD) pair and false for every other
 * level/option combination.
 */
static bool unix_bpf_bypass_getsockopt(int level, int optname)
{
	if (level == SOL_SOCKET) {
		switch (optname) {
		case SO_PEERPIDFD:
			return true;
		default:
			return false;
		}
	}

	return false;
}
945
946	struct proto unix_dgram_proto = {
947	.name = "UNIX",
948	.owner = THIS_MODULE,
949	.obj_size = sizeof(struct unix_sock),
950	.close = unix_close,
951	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
952	#ifdef CONFIG_BPF_SYSCALL
953	.psock_update_sk_prot = unix_dgram_bpf_update_proto,
954	#endif
955	};
956
957	struct proto unix_stream_proto = {
958	.name = "UNIX-STREAM",
959	.owner = THIS_MODULE,
960	.obj_size = sizeof(struct unix_sock),
961	.close = unix_close,
962	.unhash = unix_unhash,
963	.bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt,
964	#ifdef CONFIG_BPF_SYSCALL
965	.psock_update_sk_prot = unix_stream_bpf_update_proto,
966	#endif
967	};
968
969	static struct sock unix_create1(struct* net net, struct* socket sock, int* kern, int type)
970	{
971	struct unix_sock *u;
972	struct sock *sk;
973	int err;
974
975	atomic_long_inc(v: &unix_nr_socks);
976	if (atomic_long_read(v: &unix_nr_socks) > `2` * get_max_files()) {
977	err = -ENFILE;
978	goto err;
979	}
980
981	if (type == SOCK_STREAM)
982	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, prot: &unix_stream_proto, kern);
983	else /dgram and seqpacket /
984	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, prot: &unix_dgram_proto, kern);
985
986	if (!sk) {
987	err = -ENOMEM;
988	goto err;
989	}
990
991	sock_init_data(sock, sk);
992
993	sk->sk_hash = unix_unbound_hash(sk);
994	sk->sk_allocation = GFP_KERNEL_ACCOUNT;
995	sk->sk_write_space = unix_write_space;
996	sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
997	sk->sk_destruct = unix_sock_destructor;
998	u = unix_sk(sk);
999	u->path.dentry = NULL;
1000	u->path.mnt = NULL;
1001	spin_lock_init(&u->lock);
1002	atomic_long_set(v: &u->inflight, i: `0`);
1003	INIT_LIST_HEAD(list: &u->link);
1004	mutex_init(&u->iolock); / single task reading lock /
1005	mutex_init(&u->bindlock); / single task binding lock /
1006	init_waitqueue_head(&u->peer_wait);
1007	init_waitqueue_func_entry(wq_entry: &u->peer_wake, func: unix_dgram_peer_wake_relay);
1008	memset(&u->scm_stat, `0`, sizeof(struct scm_stat));
1009	unix_insert_unbound_socket(net, sk);
1010
1011	sock_prot_inuse_add(net, prot: sk->sk_prot, val: `1`);
1012
1013	return sk;
1014
1015	err:
1016	atomic_long_dec(v: &unix_nr_socks);
1017	return ERR_PTR(error: err);
1018	}
1019
1020	static int unix_create(struct net net, struct* socket sock, int* protocol,
1021	int kern)
1022	{
1023	struct sock *sk;
1024
1025	if (protocol && protocol != PF_UNIX)
1026	return -EPROTONOSUPPORT;
1027
1028	sock->state = SS_UNCONNECTED;
1029
1030	switch (sock->type) {
1031	case SOCK_STREAM:
1032	sock->ops = &unix_stream_ops;
1033	break;
1034	/*
1035	* Believe it or not BSD has AF_UNIX, SOCK_RAW though
1036	* nothing uses it.
1037	*/
1038	case SOCK_RAW:
1039	sock->type = SOCK_DGRAM;
1040	fallthrough;
1041	case SOCK_DGRAM:
1042	sock->ops = &unix_dgram_ops;
1043	break;
1044	case SOCK_SEQPACKET:
1045	sock->ops = &unix_seqpacket_ops;
1046	break;
1047	default:
1048	return -ESOCKTNOSUPPORT;
1049	}
1050
1051	sk = unix_create1(net, sock, kern, type: sock->type);
1052	if (IS_ERR(ptr: sk))
1053	return PTR_ERR(ptr: sk);
1054
1055	return `0`;
1056	}
1057
1058	static int unix_release(struct socket *sock)
1059	{
1060	struct sock *sk = sock->sk;
1061
1062	if (!sk)
1063	return `0`;
1064
1065	sk->sk_prot->close(sk, `0`);
1066	unix_release_sock(sk, embrion: `0`);
1067	sock->sk = NULL;
1068
1069	return `0`;
1070	}
1071
/*
 * Look up the socket bound to a filesystem (pathname) address.
 *
 * The path is resolved with LOOKUP_FOLLOW and the caller must have
 * MAY_WRITE permission on it.  The inode must be a socket inode that is
 * known to unix_find_socket_byinode(), and the found sock must have
 * sk_type == @type.  On success the path's atime is touched.
 *
 * Returns the sock found by unix_find_socket_byinode() (that lookup's
 * reference is dropped with sock_put() only on the type-mismatch path, so
 * a successful return passes it on to the caller), or an ERR_PTR():
 *   - the error from kern_path() / path_permission(),
 *   - -ECONNREFUSED if the inode is not a socket or no sock is bound,
 *   - -EPROTOTYPE if the socket type does not match.
 */
static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
				  int type)
{
	struct inode *inode;
	struct path path;
	struct sock *sk;
	int err;

	unix_mkname_bsd(sunaddr, addr_len);
	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
	if (err)
		goto fail;

	err = path_permission(&path, MAY_WRITE);
	if (err)
		goto path_put;

	err = -ECONNREFUSED;
	inode = d_backing_inode(path.dentry);
	if (!S_ISSOCK(inode->i_mode))
		goto path_put;

	sk = unix_find_socket_byinode(inode);
	if (!sk)
		goto path_put;

	err = -EPROTOTYPE;
	if (sk->sk_type == type)
		touch_atime(&path);
	else
		goto sock_put;

	path_put(&path);

	return sk;

sock_put:
	sock_put(sk);
path_put:
	path_put(&path);
fail:
	return ERR_PTR(err);
}
1115
/*
 * Look up a socket by name in the hash table of @net.
 *
 * The hash is computed from the address bytes, their length and @type.
 * If the sock that is found has a path dentry recorded, that path's atime
 * is touched.
 *
 * Returns the sock from unix_find_socket_byname(), or
 * ERR_PTR(-ECONNREFUSED) when nothing is bound to the name.
 */
static struct sock *unix_find_abstract(struct net *net,
				       struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
	struct dentry *dentry;
	struct sock *sk;

	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
	if (!sk)
		return ERR_PTR(-ECONNREFUSED);

	dentry = unix_sk(sk)->path.dentry;
	if (dentry)
		touch_atime(&unix_sk(sk)->path);

	return sk;
}
1134
1135	static struct sock unix_find_other(struct* net *net,
1136	struct sockaddr_un *sunaddr,
1137	int addr_len, int type)
1138	{
1139	struct sock *sk;
1140
1141	if (sunaddr->sun_path[`0`])
1142	sk = unix_find_bsd(sunaddr, addr_len, type);
1143	else
1144	sk = unix_find_abstract(net, sunaddr, addr_len, type);
1145
1146	return sk;
1147	}
1148
1149	static int unix_autobind(struct sock *sk)
1150	{
1151	unsigned int new_hash, old_hash = sk->sk_hash;
1152	struct unix_sock *u = unix_sk(sk);
1153	struct net *net = sock_net(sk);
1154	struct unix_address *addr;
1155	u32 lastnum, ordernum;
1156	int err;
1157
1158	err = mutex_lock_interruptible(&u->bindlock);
1159	if (err)
1160	return err;
1161
1162	if (u->addr)
1163	goto out;
1164
1165	err = -ENOMEM;
1166	addr = kzalloc(size: sizeof(*addr) +
1167	offsetof(struct sockaddr_un, sun_path) + `16`, GFP_KERNEL);
1168	if (!addr)
1169	goto out;
1170
1171	addr->len = offsetof(struct sockaddr_un, sun_path) + `6`;
1172	addr->name->sun_family = AF_UNIX;
1173	refcount_set(r: &addr->refcnt, n: `1`);
1174
1175	ordernum = get_random_u32();
1176	lastnum = ordernum & `0xFFFFF`;
1177	retry:
1178	ordernum = (ordernum + `1`) & `0xFFFFF`;
1179	sprintf(buf: addr->name->sun_path + `1`, fmt: "%05x", ordernum);
1180
1181	new_hash = unix_abstract_hash(sunaddr: addr->name, addr_len: addr->len, type: sk->sk_type);
1182	unix_table_double_lock(net, hash1: old_hash, hash2: new_hash);
1183
1184	if (__unix_find_socket_byname(net, sunname: addr->name, len: addr->len, hash: new_hash)) {
1185	unix_table_double_unlock(net, hash1: old_hash, hash2: new_hash);
1186
1187	/ __unix_find_socket_byname() may take long time if many names*
1188	* are already in use.
1189	*/
1190	cond_resched();
1191
1192	if (ordernum == lastnum) {
1193	/ Give up if all names seems to be in use. /
1194	err = -ENOSPC;
1195	unix_release_addr(addr);
1196	goto out;
1197	}
1198
1199	goto retry;
1200	}
1201
1202	__unix_set_addr_hash(net, sk, addr, hash: new_hash);
1203	unix_table_double_unlock(net, hash1: old_hash, hash2: new_hash);
1204	err = `0`;
1205
1206	out: mutex_unlock(lock: &u->bindlock);
1207	return err;
1208	}
1209
1210	static int unix_bind_bsd(struct sock sk, struct* sockaddr_un *sunaddr,
1211	int addr_len)
1212	{
1213	umode_t mode = S_IFSOCK \|
1214	(SOCK_INODE(socket: sk->sk_socket)->i_mode & ~current_umask());
1215	unsigned int new_hash, old_hash = sk->sk_hash;
1216	struct unix_sock *u = unix_sk(sk);
1217	struct net *net = sock_net(sk);
1218	struct mnt_idmap *idmap;
1219	struct unix_address *addr;
1220	struct dentry *dentry;
1221	struct path parent;
1222	int err;
1223
1224	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1225	addr = unix_create_addr(sunaddr, addr_len);
1226	if (!addr)
1227	return -ENOMEM;
1228
1229	/*
1230	* Get the parent directory, calculate the hash for last
1231	* component.
1232	*/
1233	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, `0`);
1234	if (IS_ERR(ptr: dentry)) {
1235	err = PTR_ERR(ptr: dentry);
1236	goto out;
1237	}
1238
1239	/*
1240	* All right, let's create it.
1241	*/
1242	idmap = mnt_idmap(mnt: parent.mnt);
1243	err = security_path_mknod(dir: &parent, dentry, mode, dev: `0`);
1244	if (!err)
1245	err = vfs_mknod(idmap, d_inode(dentry: parent.dentry), dentry, mode, `0`);
1246	if (err)
1247	goto out_path;
1248	err = mutex_lock_interruptible(&u->bindlock);
1249	if (err)
1250	goto out_unlink;
1251	if (u->addr)
1252	goto out_unlock;
1253
1254	new_hash = unix_bsd_hash(i: d_backing_inode(upper: dentry));
1255	unix_table_double_lock(net, hash1: old_hash, hash2: new_hash);
1256	u->path.mnt = mntget(mnt: parent.mnt);
1257	u->path.dentry = dget(dentry);
1258	__unix_set_addr_hash(net, sk, addr, hash: new_hash);
1259	unix_table_double_unlock(net, hash1: old_hash, hash2: new_hash);
1260	unix_insert_bsd_socket(sk);
1261	mutex_unlock(lock: &u->bindlock);
1262	done_path_create(&parent, dentry);
1263	return `0`;
1264
1265	out_unlock:
1266	mutex_unlock(lock: &u->bindlock);
1267	err = -EINVAL;
1268	out_unlink:
1269	/ failed after successful mknod? unlink what we'd created... /
1270	vfs_unlink(idmap, d_inode(dentry: parent.dentry), dentry, NULL);
1271	out_path:
1272	done_path_create(&parent, dentry);
1273	out:
1274	unix_release_addr(addr);
1275	return err == -EEXIST ? -EADDRINUSE : err;
1276	}
1277
1278	static int unix_bind_abstract(struct sock sk, struct* sockaddr_un *sunaddr,
1279	int addr_len)
1280	{
1281	unsigned int new_hash, old_hash = sk->sk_hash;
1282	struct unix_sock *u = unix_sk(sk);
1283	struct net *net = sock_net(sk);
1284	struct unix_address *addr;
1285	int err;
1286
1287	addr = unix_create_addr(sunaddr, addr_len);
1288	if (!addr)
1289	return -ENOMEM;
1290
1291	err = mutex_lock_interruptible(&u->bindlock);
1292	if (err)
1293	goto out;
1294
1295	if (u->addr) {
1296	err = -EINVAL;
1297	goto out_mutex;
1298	}
1299
1300	new_hash = unix_abstract_hash(sunaddr: addr->name, addr_len: addr->len, type: sk->sk_type);
1301	unix_table_double_lock(net, hash1: old_hash, hash2: new_hash);
1302
1303	if (__unix_find_socket_byname(net, sunname: addr->name, len: addr->len, hash: new_hash))
1304	goto out_spin;
1305
1306	__unix_set_addr_hash(net, sk, addr, hash: new_hash);
1307	unix_table_double_unlock(net, hash1: old_hash, hash2: new_hash);
1308	mutex_unlock(lock: &u->bindlock);
1309	return `0`;
1310
1311	out_spin:
1312	unix_table_double_unlock(net, hash1: old_hash, hash2: new_hash);
1313	err = -EADDRINUSE;
1314	out_mutex:
1315	mutex_unlock(lock: &u->bindlock);
1316	out:
1317	unix_release_addr(addr);
1318	return err;
1319	}
1320
1321	static int unix_bind(struct socket sock, struct* sockaddr uaddr, int* addr_len)
1322	{
1323	struct sockaddr_un sunaddr = (struct* sockaddr_un *)uaddr;
1324	struct sock *sk = sock->sk;
1325	int err;
1326
1327	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1328	sunaddr->sun_family == AF_UNIX)
1329	return unix_autobind(sk);
1330
1331	err = unix_validate_addr(sunaddr, addr_len);
1332	if (err)
1333	return err;
1334
1335	if (sunaddr->sun_path[`0`])
1336	err = unix_bind_bsd(sk, sunaddr, addr_len);
1337	else
1338	err = unix_bind_abstract(sk, sunaddr, addr_len);
1339
1340	return err;
1341	}
1342
/*
 * Take the state locks of two socks.
 *
 * When @sk2 is NULL or the same sock as @sk1, only @sk1 is locked.
 * Otherwise the sock at the lower address is locked first and the other
 * one with the nested variant, so every caller uses the same order.
 */
static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_lock(sk1);
		return;
	}
	if (sk1 < sk2) {
		unix_state_lock(sk1);
		unix_state_lock_nested(sk2);
	} else {
		unix_state_lock(sk2);
		unix_state_lock_nested(sk1);
	}
}
1357
/*
 * Counterpart of unix_state_double_lock(): unlocks only @sk1 when @sk2 is
 * NULL or the same sock, otherwise unlocks both.
 */
static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
{
	if (unlikely(sk1 == sk2) || !sk2) {
		unix_state_unlock(sk1);
		return;
	}
	unix_state_unlock(sk1);
	unix_state_unlock(sk2);
}
1367
1368	static int unix_dgram_connect(struct socket sock, struct* sockaddr *addr,
1369	int alen, int flags)
1370	{
1371	struct sockaddr_un sunaddr = (struct* sockaddr_un *)addr;
1372	struct sock *sk = sock->sk;
1373	struct sock *other;
1374	int err;
1375
1376	err = -EINVAL;
1377	if (alen < offsetofend(struct sockaddr, sa_family))
1378	goto out;
1379
1380	if (addr->sa_family != AF_UNSPEC) {
1381	err = unix_validate_addr(sunaddr, addr_len: alen);
1382	if (err)
1383	goto out;
1384
1385	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1386	if (err)
1387	goto out;
1388
1389	if ((test_bit(SOCK_PASSCRED, &sock->flags) \|\|
1390	test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1391	!unix_sk(sk)->addr) {
1392	err = unix_autobind(sk);
1393	if (err)
1394	goto out;
1395	}
1396
1397	restart:
1398	other = unix_find_other(net: sock_net(sk), sunaddr, addr_len: alen, type: sock->type);
1399	if (IS_ERR(ptr: other)) {
1400	err = PTR_ERR(ptr: other);
1401	goto out;
1402	}
1403
1404	unix_state_double_lock(sk1: sk, sk2: other);
1405
1406	/ Apparently VFS overslept socket death. Retry. /
1407	if (sock_flag(sk: other, flag: SOCK_DEAD)) {
1408	unix_state_double_unlock(sk1: sk, sk2: other);
1409	sock_put(sk: other);
1410	goto restart;
1411	}
1412
1413	err = -EPERM;
1414	if (!unix_may_send(sk, osk: other))
1415	goto out_unlock;
1416
1417	err = security_unix_may_send(sock: sk->sk_socket, other: other->sk_socket);
1418	if (err)
1419	goto out_unlock;
1420
1421	sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1422	} else {
1423	/*
1424	* 1003.1g breaking connected state with AF_UNSPEC
1425	*/
1426	other = NULL;
1427	unix_state_double_lock(sk1: sk, sk2: other);
1428	}
1429
1430	/*
1431	* If it was connected, reconnect.
1432	*/
1433	if (unix_peer(sk)) {
1434	struct sock *old_peer = unix_peer(sk);
1435
1436	unix_peer(sk) = other;
1437	if (!other)
1438	sk->sk_state = TCP_CLOSE;
1439	unix_dgram_peer_wake_disconnect_wakeup(sk, other: old_peer);
1440
1441	unix_state_double_unlock(sk1: sk, sk2: other);
1442
1443	if (other != old_peer)
1444	unix_dgram_disconnected(sk, other: old_peer);
1445	sock_put(sk: old_peer);
1446	} else {
1447	unix_peer(sk) = other;
1448	unix_state_double_unlock(sk1: sk, sk2: other);
1449	}
1450
1451	return `0`;
1452
1453	out_unlock:
1454	unix_state_double_unlock(sk1: sk, sk2: other);
1455	sock_put(sk: other);
1456	out:
1457	return err;
1458	}
1459
1460	static long unix_wait_for_peer(struct sock other, long* timeo)
1461	__releases(&unix_sk(other)->lock)
1462	{
1463	struct unix_sock *u = unix_sk(other);
1464	int sched;
1465	DEFINE_WAIT(wait);
1466
1467	prepare_to_wait_exclusive(wq_head: &u->peer_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
1468
1469	sched = !sock_flag(sk: other, flag: SOCK_DEAD) &&
1470	!(other->sk_shutdown & RCV_SHUTDOWN) &&
1471	unix_recvq_full_lockless(sk: other);
1472
1473	unix_state_unlock(other);
1474
1475	if (sched)
1476	timeo = schedule_timeout(timeout: timeo);
1477
1478	finish_wait(wq_head: &u->peer_wait, wq_entry: &wait);
1479	return timeo;
1480	}
1481
/*
 * connect() handler for connection-oriented AF_UNIX sockets.
 *
 * A new sock (@newsk) and a one-byte skb are allocated up front.  The
 * listener is then looked up and locked; once both state locks are held,
 * @newsk is wired up as the peer of @sk and the skb is queued on the
 * listener's receive queue, where unix_accept() picks it up.
 *
 * Returns 0 on success.  Errors include those from address validation,
 * the cgroup BPF hook, autobind, unix_create1() and the lookup; -ENOMEM
 * if the skb cannot be allocated; -ECONNREFUSED if the target is not
 * listening or is shut down for receive; -EAGAIN if the backlog is full
 * and the send timeout is zero; sock_intr_errno() if a signal arrives
 * while waiting; -EISCONN / -EINVAL for a bad state of @sk; or the error
 * from the security hook.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
	struct net *net = sock_net(sk);
	struct sk_buff *skb = NULL;
	long timeo;
	int err;
	int st;

	err = unix_validate_addr(sunaddr, addr_len);
	if (err)
		goto out;

	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
	if (err)
		goto out;

	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
	     test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
		err = unix_autobind(sk);
		if (err)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* First of all allocate resources.
	 * If we will make it after state is locked,
	 * we will have to recheck all again in any case.
	 */

	/* create new sock for complete connection */
	newsk = unix_create1(net, NULL, 0, sock->type);
	if (IS_ERR(newsk)) {
		err = PTR_ERR(newsk);
		newsk = NULL;
		goto out;
	}

	err = -ENOMEM;

	/* Allocate skb for sending to listening sock */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/*  Find listening sock. */
	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
	if (IS_ERR(other)) {
		err = PTR_ERR(other);
		other = NULL;
		goto out;
	}

	/* Latch state of peer */
	unix_state_lock(other);

	/* Apparently VFS overslept socket death. Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;
	if (other->sk_shutdown & RCV_SHUTDOWN)
		goto out_unlock;

	if (unix_recvq_full(other)) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Drops other's state lock before sleeping. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/* Latch our state.
	 *
	 * It is tricky place. We need to grab our state lock and cannot
	 * drop lock on peer. It is dangerous because deadlock is
	 * possible. Connect to self case and simultaneous
	 * attempt to connect are eliminated by checking socket
	 * state. other is TCP_LISTEN, if sk is TCP_LISTEN we
	 * check this before attempt to grab lock.
	 *
	 * Well, and we have to recheck the state after socket locked.
	 */
	st = sk->sk_state;

	switch (st) {
	case TCP_CLOSE:
		/* This is ok... continue with connect */
		break;
	case TCP_ESTABLISHED:
		/* Socket is already connected */
		err = -EISCONN;
		goto out_unlock;
	default:
		err = -EINVAL;
		goto out_unlock;
	}

	unix_state_lock_nested(sk);

	if (sk->sk_state != st) {
		unix_state_unlock(sk);
		unix_state_unlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sk, other, newsk);
	if (err) {
		unix_state_unlock(sk);
		goto out_unlock;
	}

	/* The way is open! Fastly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk)	= sk;
	newsk->sk_state		= TCP_ESTABLISHED;
	newsk->sk_type		= sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

	/* copy address information from listening to new sock
	 *
	 * The contents of *(otheru->addr) and otheru->path
	 * are seen fully set up here, since we have found
	 * otheru in hash under its lock.  Insertion into the
	 * hash chain we'd found it in had been done in an
	 * earlier critical area protected by the chain's lock,
	 * the same one where we'd set *(otheru->addr) contents,
	 * as well as otheru->path and otheru->addr itself.
	 *
	 * Using smp_store_release() here to set newu->addr
	 * is enough to make those stores, as well as stores
	 * to newu->path visible to anyone who gets newu->addr
	 * by smp_load_acquire().  IOW, the same warranties
	 * as for unix_sock instances bound in unix_bind() or
	 * in unix_autobind().
	 */
	if (otheru->path.dentry) {
		path_get(&otheru->path);
		newu->path = otheru->path;
	}
	refcount_inc(&otheru->addr->refcnt);
	smp_store_release(&newu->addr, otheru->addr);

	/* Set credentials */
	copy_peercred(sk, other);

	sock->state	= SS_CONNECTED;
	sk->sk_state	= TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk)	= newsk;

	unix_state_unlock(sk);

	/* take ten and send info to listening sock */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_unlock(other);
	other->sk_data_ready(other);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_unlock(other);

out:
	kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1679
1680	static int unix_socketpair(struct socket socka, struct* socket *sockb)
1681	{
1682	struct sock ska = socka->sk, skb = sockb->sk;
1683
1684	/ Join our sockets back to back /
1685	sock_hold(sk: ska);
1686	sock_hold(sk: skb);
1687	unix_peer(ska) = skb;
1688	unix_peer(skb) = ska;
1689	init_peercred(sk: ska);
1690	init_peercred(sk: skb);
1691
1692	ska->sk_state = TCP_ESTABLISHED;
1693	skb->sk_state = TCP_ESTABLISHED;
1694	socka->state = SS_CONNECTED;
1695	sockb->state = SS_CONNECTED;
1696	return `0`;
1697	}
1698
1699	static void unix_sock_inherit_flags(const struct socket *old,
1700	struct socket *new)
1701	{
1702	if (test_bit(SOCK_PASSCRED, &old->flags))
1703	set_bit(SOCK_PASSCRED, addr: &new->flags);
1704	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1705	set_bit(SOCK_PASSPIDFD, addr: &new->flags);
1706	if (test_bit(SOCK_PASSSEC, &old->flags))
1707	set_bit(SOCK_PASSSEC, addr: &new->flags);
1708	}
1709
1710	static int unix_accept(struct socket sock, struct* socket newsock, int* flags,
1711	bool kern)
1712	{
1713	struct sock *sk = sock->sk;
1714	struct sock *tsk;
1715	struct sk_buff *skb;
1716	int err;
1717
1718	err = -EOPNOTSUPP;
1719	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1720	goto out;
1721
1722	err = -EINVAL;
1723	if (sk->sk_state != TCP_LISTEN)
1724	goto out;
1725
1726	/ If socket state is TCP_LISTEN it cannot change (for now...),*
1727	* so that no locks are necessary.
1728	*/
1729
1730	skb = skb_recv_datagram(sk, flags: (flags & O_NONBLOCK) ? MSG_DONTWAIT : `0`,
1731	err: &err);
1732	if (!skb) {
1733	/ This means receive shutdown. /
1734	if (err == `0`)
1735	err = -EINVAL;
1736	goto out;
1737	}
1738
1739	tsk = skb->sk;
1740	skb_free_datagram(sk, skb);
1741	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1742
1743	/ attach accepted sock to socket /
1744	unix_state_lock(tsk);
1745	newsock->state = SS_CONNECTED;
1746	unix_sock_inherit_flags(old: sock, new: newsock);
1747	sock_graft(sk: tsk, parent: newsock);
1748	unix_state_unlock(tsk);
1749	return `0`;
1750
1751	out:
1752	return err;
1753	}
1754
1755
1756	static int unix_getname(struct socket sock, struct* sockaddr uaddr, int* peer)
1757	{
1758	struct sock *sk = sock->sk;
1759	struct unix_address *addr;
1760	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1761	int err = `0`;
1762
1763	if (peer) {
1764	sk = unix_peer_get(sk);
1765
1766	err = -ENOTCONN;
1767	if (!sk)
1768	goto out;
1769	err = `0`;
1770	} else {
1771	sock_hold(sk);
1772	}
1773
1774	addr = smp_load_acquire(&unix_sk(sk)->addr);
1775	if (!addr) {
1776	sunaddr->sun_family = AF_UNIX;
1777	sunaddr->sun_path[`0`] = `0`;
1778	err = offsetof(struct sockaddr_un, sun_path);
1779	} else {
1780	err = addr->len;
1781	memcpy(sunaddr, addr->name, addr->len);
1782
1783	if (peer)
1784	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1785	CGROUP_UNIX_GETPEERNAME);
1786	else
1787	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1788	CGROUP_UNIX_GETSOCKNAME);
1789	}
1790	sock_put(sk);
1791	out:
1792	return err;
1793	}
1794
1795	static void unix_peek_fds(struct scm_cookie scm, struct* sk_buff *skb)
1796	{
1797	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1798
1799	/*
1800	* Garbage collection of unix sockets starts by selecting a set of
1801	* candidate sockets which have reference only from being in flight
1802	* (total_refs == inflight_refs). This condition is checked once during
1803	* the candidate collection phase, and candidates are marked as such, so
1804	* that non-candidates can later be ignored. While inflight_refs is
1805	* protected by unix_gc_lock, total_refs (file count) is not, hence this
1806	* is an instantaneous decision.
1807	*
1808	* Once a candidate, however, the socket must not be reinstalled into a
1809	* file descriptor while the garbage collection is in progress.
1810	*
1811	* If the above conditions are met, then the directed graph of
1812	* candidates (*) does not change while unix_gc_lock is held.
1813	*
1814	* Any operations that changes the file count through file descriptors
1815	* (dup, close, sendmsg) does not change the graph since candidates are
1816	* not installed in fds.
1817	*
1818	* Dequeing a candidate via recvmsg would install it into an fd, but
1819	* that takes unix_gc_lock to decrement the inflight count, so it's
1820	* serialized with garbage collection.
1821	*
1822	* MSG_PEEK is special in that it does not change the inflight count,
1823	* yet does install the socket into an fd. The following lock/unlock
1824	* pair is to ensure serialization with garbage collection. It must be
1825	* done between incrementing the file count and installing the file into
1826	* an fd.
1827	*
1828	* If garbage collection starts after the barrier provided by the
1829	* lock/unlock, then it will see the elevated refcount and not mark this
1830	* as a candidate. If a garbage collection is already in progress
1831	* before the file count was incremented, then the lock/unlock pair will
1832	* ensure that garbage collection is finished before progressing to
1833	* installing the fd.
1834	*
1835	* (*) A -> B where B is on the queue of A or B is on the queue of C
1836	* which is on the queue of listening socket A.
1837	*/
1838	spin_lock(lock: &unix_gc_lock);
1839	spin_unlock(lock: &unix_gc_lock);
1840	}
1841
1842	static int unix_scm_to_skb(struct scm_cookie scm, struct* sk_buff *skb, bool send_fds)
1843	{
1844	int err = `0`;
1845
1846	UNIXCB(skb).pid = get_pid(pid: scm->pid);
1847	UNIXCB(skb).uid = scm->creds.uid;
1848	UNIXCB(skb).gid = scm->creds.gid;
1849	UNIXCB(skb).fp = NULL;
1850	unix_get_secdata(scm, skb);
1851	if (scm->fp && send_fds)
1852	err = unix_attach_fds(scm, skb);
1853
1854	skb->destructor = unix_destruct_scm;
1855	return err;
1856	}
1857
1858	static bool unix_passcred_enabled(const struct socket *sock,
1859	const struct sock *other)
1860	{
1861	return test_bit(SOCK_PASSCRED, &sock->flags) \|\|
1862	test_bit(SOCK_PASSPIDFD, &sock->flags) \|\|
1863	!other->sk_socket \|\|
1864	test_bit(SOCK_PASSCRED, &other->sk_socket->flags) \|\|
1865	test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1866	}
1867
1868	/*
1869	* Some apps rely on write() giving SCM_CREDENTIALS
1870	* We include credentials if source or destination socket
1871	* asserted SOCK_PASSCRED.
1872	*/
1873	static void maybe_add_creds(struct sk_buff skb, const* struct socket *sock,
1874	const struct sock *other)
1875	{
1876	if (UNIXCB(skb).pid)
1877	return;
1878	if (unix_passcred_enabled(sock, other)) {
1879	UNIXCB(skb).pid = get_pid(pid: task_tgid(current));
1880	current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1881	}
1882	}
1883
1884	static bool unix_skb_scm_eq(struct sk_buff *skb,
1885	struct scm_cookie *scm)
1886	{
1887	return UNIXCB(skb).pid == scm->pid &&
1888	uid_eq(UNIXCB(skb).uid, right: scm->creds.uid) &&
1889	gid_eq(UNIXCB(skb).gid, right: scm->creds.gid) &&
1890	unix_secdata_eq(scm, skb);
1891	}
1892
1893	static void scm_stat_add(struct sock sk, struct* sk_buff *skb)
1894	{
1895	struct scm_fp_list *fp = UNIXCB(skb).fp;
1896	struct unix_sock *u = unix_sk(sk);
1897
1898	if (unlikely(fp && fp->count))
1899	atomic_add(i: fp->count, v: &u->scm_stat.nr_fds);
1900	}
1901
1902	static void scm_stat_del(struct sock sk, struct* sk_buff *skb)
1903	{
1904	struct scm_fp_list *fp = UNIXCB(skb).fp;
1905	struct unix_sock *u = unix_sk(sk);
1906
1907	if (unlikely(fp && fp->count))
1908	atomic_sub(i: fp->count, v: &u->scm_stat.nr_fds);
1909	}
1910
1911	/*
1912	* Send AF_UNIX data.
1913	*/
1914
1915	static int unix_dgram_sendmsg(struct socket sock, struct* msghdr *msg,
1916	size_t len)
1917	{
1918	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1919	struct sock sk = sock->sk, other = NULL;
1920	struct unix_sock *u = unix_sk(sk);
1921	struct scm_cookie scm;
1922	struct sk_buff *skb;
1923	int data_len = `0`;
1924	int sk_locked;
1925	long timeo;
1926	int err;
1927
1928	wait_for_unix_gc();
1929	err = scm_send(sock, msg, scm: &scm, forcecreds: false);
1930	if (err < `0`)
1931	return err;
1932
1933	err = -EOPNOTSUPP;
1934	if (msg->msg_flags&MSG_OOB)
1935	goto out;
1936
1937	if (msg->msg_namelen) {
1938	err = unix_validate_addr(sunaddr, addr_len: msg->msg_namelen);
1939	if (err)
1940	goto out;
1941
1942	err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1943	msg->msg_name,
1944	&msg->msg_namelen,
1945	NULL);
1946	if (err)
1947	goto out;
1948	} else {
1949	sunaddr = NULL;
1950	err = -ENOTCONN;
1951	other = unix_peer_get(sk);
1952	if (!other)
1953	goto out;
1954	}
1955
1956	if ((test_bit(SOCK_PASSCRED, &sock->flags) \|\|
1957	test_bit(SOCK_PASSPIDFD, &sock->flags)) && !u->addr) {
1958	err = unix_autobind(sk);
1959	if (err)
1960	goto out;
1961	}
1962
1963	err = -EMSGSIZE;
1964	if (len > sk->sk_sndbuf - `32`)
1965	goto out;
1966
1967	if (len > SKB_MAX_ALLOC) {
1968	data_len = min_t(size_t,
1969	len - SKB_MAX_ALLOC,
1970	MAX_SKB_FRAGS * PAGE_SIZE);
1971	data_len = PAGE_ALIGN(data_len);
1972
1973	BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1974	}
1975
1976	skb = sock_alloc_send_pskb(sk, header_len: len - data_len, data_len,
1977	noblock: msg->msg_flags & MSG_DONTWAIT, errcode: &err,
1978	PAGE_ALLOC_COSTLY_ORDER);
1979	if (skb == NULL)
1980	goto out;
1981
1982	err = unix_scm_to_skb(scm: &scm, skb, send_fds: true);
1983	if (err < `0`)
1984	goto out_free;
1985
1986	skb_put(skb, len: len - data_len);
1987	skb->data_len = data_len;
1988	skb->len = len;
1989	err = skb_copy_datagram_from_iter(skb, offset: `0`, from: &msg->msg_iter, len);
1990	if (err)
1991	goto out_free;
1992
1993	timeo = sock_sndtimeo(sk, noblock: msg->msg_flags & MSG_DONTWAIT);
1994
1995	restart:
1996	if (!other) {
1997	err = -ECONNRESET;
1998	if (sunaddr == NULL)
1999	goto out_free;
2000
2001	other = unix_find_other(net: sock_net(sk), sunaddr, addr_len: msg->msg_namelen,
2002	type: sk->sk_type);
2003	if (IS_ERR(ptr: other)) {
2004	err = PTR_ERR(ptr: other);
2005	other = NULL;
2006	goto out_free;
2007	}
2008	}
2009
2010	if (sk_filter(sk: other, skb) < `0`) {
2011	/ Toss the packet but do not return any error to the sender /
2012	err = len;
2013	goto out_free;
2014	}
2015
2016	sk_locked = `0`;
2017	unix_state_lock(other);
2018	restart_locked:
2019	err = -EPERM;
2020	if (!unix_may_send(sk, osk: other))
2021	goto out_unlock;
2022
2023	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2024	/*
2025	* Check with 1003.1g - what should
2026	* datagram error
2027	*/
2028	unix_state_unlock(other);
2029	sock_put(sk: other);
2030
2031	if (!sk_locked)
2032	unix_state_lock(sk);
2033
2034	err = `0`;
2035	if (sk->sk_type == SOCK_SEQPACKET) {
2036	/ We are here only when racing with unix_release_sock()*
2037	* is clearing @other. Never change state to TCP_CLOSE
2038	* unlike SOCK_DGRAM wants.
2039	*/
2040	unix_state_unlock(sk);
2041	err = -EPIPE;
2042	} else if (unix_peer(sk) == other) {
2043	unix_peer(sk) = NULL;
2044	unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2045
2046	sk->sk_state = TCP_CLOSE;
2047	unix_state_unlock(sk);
2048
2049	unix_dgram_disconnected(sk, other);
2050	sock_put(sk: other);
2051	err = -ECONNREFUSED;
2052	} else {
2053	unix_state_unlock(sk);
2054	}
2055
2056	other = NULL;
2057	if (err)
2058	goto out_free;
2059	goto restart;
2060	}
2061
2062	err = -EPIPE;
2063	if (other->sk_shutdown & RCV_SHUTDOWN)
2064	goto out_unlock;
2065
2066	if (sk->sk_type != SOCK_SEQPACKET) {
2067	err = security_unix_may_send(sock: sk->sk_socket, other: other->sk_socket);
2068	if (err)
2069	goto out_unlock;
2070	}
2071
2072	/ other == sk && unix_peer(other) != sk if*
2073	* - unix_peer(sk) == NULL, destination address bound to sk
2074	* - unix_peer(sk) == sk by time of get but disconnected before lock
2075	*/
2076	if (other != sk &&
2077	unlikely(unix_peer(other) != sk &&
2078	unix_recvq_full_lockless(other))) {
2079	if (timeo) {
2080	timeo = unix_wait_for_peer(other, timeo);
2081
2082	err = sock_intr_errno(timeo);
2083	if (signal_pending(current))
2084	goto out_free;
2085
2086	goto restart;
2087	}
2088
2089	if (!sk_locked) {
2090	unix_state_unlock(other);
2091	unix_state_double_lock(sk1: sk, sk2: other);
2092	}
2093
2094	if (unix_peer(sk) != other \|\|
2095	unix_dgram_peer_wake_me(sk, other)) {
2096	err = -EAGAIN;
2097	sk_locked = `1`;
2098	goto out_unlock;
2099	}
2100
2101	if (!sk_locked) {
2102	sk_locked = `1`;
2103	goto restart_locked;
2104	}
2105	}
2106
2107	if (unlikely(sk_locked))
2108	unix_state_unlock(sk);
2109
2110	if (sock_flag(sk: other, flag: SOCK_RCVTSTAMP))
2111	__net_timestamp(skb);
2112	maybe_add_creds(skb, sock, other);
2113	scm_stat_add(sk: other, skb);
2114	skb_queue_tail(list: &other->sk_receive_queue, newsk: skb);
2115	unix_state_unlock(other);
2116	other->sk_data_ready(other);
2117	sock_put(sk: other);
2118	scm_destroy(scm: &scm);
2119	return len;
2120
2121	out_unlock:
2122	if (sk_locked)
2123	unix_state_unlock(sk);
2124	unix_state_unlock(other);
2125	out_free:
2126	kfree_skb(skb);
2127	out:
2128	if (other)
2129	sock_put(sk: other);
2130	scm_destroy(scm: &scm);
2131	return err;
2132	}
2133
/* We use paged skbs for stream sockets, and limit occupancy to 32768
 * bytes, and a minimum of a full page.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2138
2139	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2140	static int queue_oob(struct socket sock, struct* msghdr msg, struct* sock *other,
2141	struct scm_cookie *scm, bool fds_sent)
2142	{
2143	struct unix_sock *ousk = unix_sk(other);
2144	struct sk_buff *skb;
2145	int err = `0`;
2146
2147	skb = sock_alloc_send_skb(sk: sock->sk, size: `1`, noblock: msg->msg_flags & MSG_DONTWAIT, errcode: &err);
2148
2149	if (!skb)
2150	return err;
2151
2152	err = unix_scm_to_skb(scm, skb, send_fds: !fds_sent);
2153	if (err < `0`) {
2154	kfree_skb(skb);
2155	return err;
2156	}
2157	skb_put(skb, len: `1`);
2158	err = skb_copy_datagram_from_iter(skb, offset: `0`, from: &msg->msg_iter, len: `1`);
2159
2160	if (err) {
2161	kfree_skb(skb);
2162	return err;
2163	}
2164
2165	unix_state_lock(other);
2166
2167	if (sock_flag(sk: other, flag: SOCK_DEAD) \|\|
2168	(other->sk_shutdown & RCV_SHUTDOWN)) {
2169	unix_state_unlock(other);
2170	kfree_skb(skb);
2171	return -EPIPE;
2172	}
2173
2174	maybe_add_creds(skb, sock, other);
2175	skb_get(skb);
2176
2177	if (ousk->oob_skb)
2178	consume_skb(skb: ousk->oob_skb);
2179
2180	WRITE_ONCE(ousk->oob_skb, skb);
2181
2182	scm_stat_add(sk: other, skb);
2183	skb_queue_tail(list: &other->sk_receive_queue, newsk: skb);
2184	sk_send_sigurg(sk: other);
2185	unix_state_unlock(other);
2186	other->sk_data_ready(other);
2187
2188	return err;
2189	}
2190	#endif
2191
2192	static int unix_stream_sendmsg(struct socket sock, struct* msghdr *msg,
2193	size_t len)
2194	{
2195	struct sock *sk = sock->sk;
2196	struct sock *other = NULL;
2197	int err, size;
2198	struct sk_buff *skb;
2199	int sent = `0`;
2200	struct scm_cookie scm;
2201	bool fds_sent = false;
2202	int data_len;
2203
2204	wait_for_unix_gc();
2205	err = scm_send(sock, msg, scm: &scm, forcecreds: false);
2206	if (err < `0`)
2207	return err;
2208
2209	err = -EOPNOTSUPP;
2210	if (msg->msg_flags & MSG_OOB) {
2211	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2212	if (len)
2213	len--;
2214	else
2215	#endif
2216	goto out_err;
2217	}
2218
2219	if (msg->msg_namelen) {
2220	err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2221	goto out_err;
2222	} else {
2223	err = -ENOTCONN;
2224	other = unix_peer(sk);
2225	if (!other)
2226	goto out_err;
2227	}
2228
2229	if (sk->sk_shutdown & SEND_SHUTDOWN)
2230	goto pipe_err;
2231
2232	while (sent < len) {
2233	size = len - sent;
2234
2235	if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2236	skb = sock_alloc_send_pskb(sk, header_len: `0`, data_len: `0`,
2237	noblock: msg->msg_flags & MSG_DONTWAIT,
2238	errcode: &err, max_page_order: `0`);
2239	} else {
2240	/ Keep two messages in the pipe so it schedules better /
2241	size = min_t(int, size, (sk->sk_sndbuf >> `1`) - `64`);
2242
2243	/ allow fallback to order-0 allocations /
2244	size = min_t(int, size, SKB_MAX_HEAD(`0`) + UNIX_SKB_FRAGS_SZ);
2245
2246	data_len = max_t(int, `0`, size - SKB_MAX_HEAD(`0`));
2247
2248	data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2249
2250	skb = sock_alloc_send_pskb(sk, header_len: size - data_len, data_len,
2251	noblock: msg->msg_flags & MSG_DONTWAIT, errcode: &err,
2252	max_page_order: get_order(UNIX_SKB_FRAGS_SZ));
2253	}
2254	if (!skb)
2255	goto out_err;
2256
2257	/ Only send the fds in the first buffer /
2258	err = unix_scm_to_skb(scm: &scm, skb, send_fds: !fds_sent);
2259	if (err < `0`) {
2260	kfree_skb(skb);
2261	goto out_err;
2262	}
2263	fds_sent = true;
2264
2265	if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2266	err = skb_splice_from_iter(skb, iter: &msg->msg_iter, maxsize: size,
2267	gfp: sk->sk_allocation);
2268	if (err < `0`) {
2269	kfree_skb(skb);
2270	goto out_err;
2271	}
2272	size = err;
2273	refcount_add(i: size, r: &sk->sk_wmem_alloc);
2274	} else {
2275	skb_put(skb, len: size - data_len);
2276	skb->data_len = data_len;
2277	skb->len = size;
2278	err = skb_copy_datagram_from_iter(skb, offset: `0`, from: &msg->msg_iter, len: size);
2279	if (err) {
2280	kfree_skb(skb);
2281	goto out_err;
2282	}
2283	}
2284
2285	unix_state_lock(other);
2286
2287	if (sock_flag(sk: other, flag: SOCK_DEAD) \|\|
2288	(other->sk_shutdown & RCV_SHUTDOWN))
2289	goto pipe_err_free;
2290
2291	maybe_add_creds(skb, sock, other);
2292	scm_stat_add(sk: other, skb);
2293	skb_queue_tail(list: &other->sk_receive_queue, newsk: skb);
2294	unix_state_unlock(other);
2295	other->sk_data_ready(other);
2296	sent += size;
2297	}
2298
2299	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2300	if (msg->msg_flags & MSG_OOB) {
2301	err = queue_oob(sock, msg, other, scm: &scm, fds_sent);
2302	if (err)
2303	goto out_err;
2304	sent++;
2305	}
2306	#endif
2307
2308	scm_destroy(scm: &scm);
2309
2310	return sent;
2311
2312	pipe_err_free:
2313	unix_state_unlock(other);
2314	kfree_skb(skb);
2315	pipe_err:
2316	if (sent == `0` && !(msg->msg_flags&MSG_NOSIGNAL))
2317	send_sig(SIGPIPE, current, `0`);
2318	err = -EPIPE;
2319	out_err:
2320	scm_destroy(scm: &scm);
2321	return sent ? : err;
2322	}
2323
2324	static int unix_seqpacket_sendmsg(struct socket sock, struct* msghdr *msg,
2325	size_t len)
2326	{
2327	int err;
2328	struct sock *sk = sock->sk;
2329
2330	err = sock_error(sk);
2331	if (err)
2332	return err;
2333
2334	if (sk->sk_state != TCP_ESTABLISHED)
2335	return -ENOTCONN;
2336
2337	if (msg->msg_namelen)
2338	msg->msg_namelen = `0`;
2339
2340	return unix_dgram_sendmsg(sock, msg, len);
2341	}
2342
2343	static int unix_seqpacket_recvmsg(struct socket sock, struct* msghdr *msg,
2344	size_t size, int flags)
2345	{
2346	struct sock *sk = sock->sk;
2347
2348	if (sk->sk_state != TCP_ESTABLISHED)
2349	return -ENOTCONN;
2350
2351	return unix_dgram_recvmsg(sock, msg, size, flags);
2352	}
2353
2354	static void unix_copy_addr(struct msghdr msg, struct* sock *sk)
2355	{
2356	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2357
2358	if (addr) {
2359	msg->msg_namelen = addr->len;
2360	memcpy(msg->msg_name, addr->name, addr->len);
2361	}
2362	}
2363
2364	int __unix_dgram_recvmsg(struct sock sk, struct* msghdr *msg, size_t size,
2365	int flags)
2366	{
2367	struct scm_cookie scm;
2368	struct socket *sock = sk->sk_socket;
2369	struct unix_sock *u = unix_sk(sk);
2370	struct sk_buff skb, last;
2371	long timeo;
2372	int skip;
2373	int err;
2374
2375	err = -EOPNOTSUPP;
2376	if (flags&MSG_OOB)
2377	goto out;
2378
2379	timeo = sock_rcvtimeo(sk, noblock: flags & MSG_DONTWAIT);
2380
2381	do {
2382	mutex_lock(&u->iolock);
2383
2384	skip = sk_peek_offset(sk, flags);
2385	skb = __skb_try_recv_datagram(sk, queue: &sk->sk_receive_queue, flags,
2386	off: &skip, err: &err, last: &last);
2387	if (skb) {
2388	if (!(flags & MSG_PEEK))
2389	scm_stat_del(sk, skb);
2390	break;
2391	}
2392
2393	mutex_unlock(lock: &u->iolock);
2394
2395	if (err != -EAGAIN)
2396	break;
2397	} while (timeo &&
2398	!__skb_wait_for_more_packets(sk, queue: &sk->sk_receive_queue,
2399	err: &err, timeo_p: &timeo, skb: last));
2400
2401	if (!skb) { / implies iolock unlocked /
2402	unix_state_lock(sk);
2403	/ Signal EOF on disconnected non-blocking SEQPACKET socket. /
2404	if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2405	(sk->sk_shutdown & RCV_SHUTDOWN))
2406	err = `0`;
2407	unix_state_unlock(sk);
2408	goto out;
2409	}
2410
2411	if (wq_has_sleeper(wq_head: &u->peer_wait))
2412	wake_up_interruptible_sync_poll(&u->peer_wait,
2413	EPOLLOUT \| EPOLLWRNORM \|
2414	EPOLLWRBAND);
2415
2416	if (msg->msg_name) {
2417	unix_copy_addr(msg, sk: skb->sk);
2418
2419	BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2420	msg->msg_name,
2421	&msg->msg_namelen);
2422	}
2423
2424	if (size > skb->len - skip)
2425	size = skb->len - skip;
2426	else if (size < skb->len - skip)
2427	msg->msg_flags \|= MSG_TRUNC;
2428
2429	err = skb_copy_datagram_msg(from: skb, offset: skip, msg, size);
2430	if (err)
2431	goto out_free;
2432
2433	if (sock_flag(sk, flag: SOCK_RCVTSTAMP))
2434	__sock_recv_timestamp(msg, sk, skb);
2435
2436	memset(&scm, `0`, sizeof(scm));
2437
2438	scm_set_cred(scm: &scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2439	unix_set_secdata(scm: &scm, skb);
2440
2441	if (!(flags & MSG_PEEK)) {
2442	if (UNIXCB(skb).fp)
2443	unix_detach_fds(scm: &scm, skb);
2444
2445	sk_peek_offset_bwd(sk, val: skb->len);
2446	} else {
2447	/ It is questionable: on PEEK we could:*
2448	- do not return fds - good, but too simple 8)
2449	- return fds, and do not return them on read (old strategy,
2450	apparently wrong)
2451	- clone fds (I chose it for now, it is the most universal
2452	solution)
2453
2454	POSIX 1003.1g does not actually define this clearly
2455	at all. POSIX 1003.1g doesn't define a lot of things
2456	clearly however!
2457
2458	*/
2459
2460	sk_peek_offset_fwd(sk, val: size);
2461
2462	if (UNIXCB(skb).fp)
2463	unix_peek_fds(scm: &scm, skb);
2464	}
2465	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2466
2467	scm_recv_unix(sock, msg, scm: &scm, flags);
2468
2469	out_free:
2470	skb_free_datagram(sk, skb);
2471	mutex_unlock(lock: &u->iolock);
2472	out:
2473	return err;
2474	}
2475
2476	static int unix_dgram_recvmsg(struct socket sock, struct* msghdr *msg, size_t size,
2477	int flags)
2478	{
2479	struct sock *sk = sock->sk;
2480
2481	#ifdef CONFIG_BPF_SYSCALL
2482	const struct proto *prot = READ_ONCE(sk->sk_prot);
2483
2484	if (prot != &unix_dgram_proto)
2485	return prot->recvmsg(sk, msg, size, flags, NULL);
2486	#endif
2487	return __unix_dgram_recvmsg(sk, msg, size, flags);
2488	}
2489
2490	static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2491	{
2492	struct unix_sock *u = unix_sk(sk);
2493	struct sk_buff *skb;
2494	int err;
2495
2496	mutex_lock(&u->iolock);
2497	skb = skb_recv_datagram(sk, MSG_DONTWAIT, err: &err);
2498	mutex_unlock(lock: &u->iolock);
2499	if (!skb)
2500	return err;
2501
2502	return recv_actor(sk, skb);
2503	}
2504
2505	/*
2506	* Sleep until more data has arrived. But check for races..
2507	*/
2508	static long unix_stream_data_wait(struct sock sk, long* timeo,
2509	struct sk_buff last, unsigned* int last_len,
2510	bool freezable)
2511	{
2512	unsigned int state = TASK_INTERRUPTIBLE \| freezable * TASK_FREEZABLE;
2513	struct sk_buff *tail;
2514	DEFINE_WAIT(wait);
2515
2516	unix_state_lock(sk);
2517
2518	for (;;) {
2519	prepare_to_wait(wq_head: sk_sleep(sk), wq_entry: &wait, state);
2520
2521	tail = skb_peek_tail(list_: &sk->sk_receive_queue);
2522	if (tail != last \|\|
2523	(tail && tail->len != last_len) \|\|
2524	sk->sk_err \|\|
2525	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
2526	signal_pending(current) \|\|
2527	!timeo)
2528	break;
2529
2530	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2531	unix_state_unlock(sk);
2532	timeo = schedule_timeout(timeout: timeo);
2533	unix_state_lock(sk);
2534
2535	if (sock_flag(sk, flag: SOCK_DEAD))
2536	break;
2537
2538	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2539	}
2540
2541	finish_wait(wq_head: sk_sleep(sk), wq_entry: &wait);
2542	unix_state_unlock(sk);
2543	return timeo;
2544	}
2545
2546	static unsigned int unix_skb_len(const struct sk_buff *skb)
2547	{
2548	return skb->len - UNIXCB(skb).consumed;
2549	}
2550
2551	struct unix_stream_read_state {
2552	int (recv_actor)(struct* sk_buff , int, int*,
2553	struct unix_stream_read_state *);
2554	struct socket *socket;
2555	struct msghdr *msg;
2556	struct pipe_inode_info *pipe;
2557	size_t size;
2558	int flags;
2559	unsigned int splice_flags;
2560	};
2561
2562	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2563	static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2564	{
2565	struct socket *sock = state->socket;
2566	struct sock *sk = sock->sk;
2567	struct unix_sock *u = unix_sk(sk);
2568	int chunk = `1`;
2569	struct sk_buff *oob_skb;
2570
2571	mutex_lock(&u->iolock);
2572	unix_state_lock(sk);
2573
2574	if (sock_flag(sk, flag: SOCK_URGINLINE) \|\| !u->oob_skb) {
2575	unix_state_unlock(sk);
2576	mutex_unlock(lock: &u->iolock);
2577	return -EINVAL;
2578	}
2579
2580	oob_skb = u->oob_skb;
2581
2582	if (!(state->flags & MSG_PEEK))
2583	WRITE_ONCE(u->oob_skb, NULL);
2584
2585	unix_state_unlock(sk);
2586
2587	chunk = state->recv_actor(oob_skb, `0`, chunk, state);
2588
2589	if (!(state->flags & MSG_PEEK)) {
2590	UNIXCB(oob_skb).consumed += `1`;
2591	kfree_skb(skb: oob_skb);
2592	}
2593
2594	mutex_unlock(lock: &u->iolock);
2595
2596	if (chunk < `0`)
2597	return -EFAULT;
2598
2599	state->msg->msg_flags \|= MSG_OOB;
2600	return `1`;
2601	}
2602
2603	static struct sk_buff manage_oob(struct* sk_buff skb, struct* sock *sk,
2604	int flags, int copied)
2605	{
2606	struct unix_sock *u = unix_sk(sk);
2607
2608	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2609	skb_unlink(skb, list: &sk->sk_receive_queue);
2610	consume_skb(skb);
2611	skb = NULL;
2612	} else {
2613	if (skb == u->oob_skb) {
2614	if (copied) {
2615	skb = NULL;
2616	} else if (sock_flag(sk, flag: SOCK_URGINLINE)) {
2617	if (!(flags & MSG_PEEK)) {
2618	WRITE_ONCE(u->oob_skb, NULL);
2619	consume_skb(skb);
2620	}
2621	} else if (!(flags & MSG_PEEK)) {
2622	skb_unlink(skb, list: &sk->sk_receive_queue);
2623	consume_skb(skb);
2624	skb = skb_peek(list_: &sk->sk_receive_queue);
2625	}
2626	}
2627	}
2628	return skb;
2629	}
2630	#endif
2631
2632	static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2633	{
2634	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2635	return -ENOTCONN;
2636
2637	return unix_read_skb(sk, recv_actor);
2638	}
2639
2640	static int unix_stream_read_generic(struct unix_stream_read_state *state,
2641	bool freezable)
2642	{
2643	struct scm_cookie scm;
2644	struct socket *sock = state->socket;
2645	struct sock *sk = sock->sk;
2646	struct unix_sock *u = unix_sk(sk);
2647	int copied = `0`;
2648	int flags = state->flags;
2649	int noblock = flags & MSG_DONTWAIT;
2650	bool check_creds = false;
2651	int target;
2652	int err = `0`;
2653	long timeo;
2654	int skip;
2655	size_t size = state->size;
2656	unsigned int last_len;
2657
2658	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2659	err = -EINVAL;
2660	goto out;
2661	}
2662
2663	if (unlikely(flags & MSG_OOB)) {
2664	err = -EOPNOTSUPP;
2665	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2666	err = unix_stream_recv_urg(state);
2667	#endif
2668	goto out;
2669	}
2670
2671	target = sock_rcvlowat(sk, waitall: flags & MSG_WAITALL, len: size);
2672	timeo = sock_rcvtimeo(sk, noblock);
2673
2674	memset(&scm, `0`, sizeof(scm));
2675
2676	/ Lock the socket to prevent queue disordering*
2677	* while sleeps in memcpy_tomsg
2678	*/
2679	mutex_lock(&u->iolock);
2680
2681	skip = max(sk_peek_offset(sk, flags), `0`);
2682
2683	do {
2684	int chunk;
2685	bool drop_skb;
2686	struct sk_buff skb, last;
2687
2688	redo:
2689	unix_state_lock(sk);
2690	if (sock_flag(sk, flag: SOCK_DEAD)) {
2691	err = -ECONNRESET;
2692	goto unlock;
2693	}
2694	last = skb = skb_peek(list_: &sk->sk_receive_queue);
2695	last_len = last ? last->len : `0`;
2696
2697	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2698	if (skb) {
2699	skb = manage_oob(skb, sk, flags, copied);
2700	if (!skb) {
2701	unix_state_unlock(sk);
2702	if (copied)
2703	break;
2704	goto redo;
2705	}
2706	}
2707	#endif
2708	again:
2709	if (skb == NULL) {
2710	if (copied >= target)
2711	goto unlock;
2712
2713	/*
2714	* POSIX 1003.1g mandates this order.
2715	*/
2716
2717	err = sock_error(sk);
2718	if (err)
2719	goto unlock;
2720	if (sk->sk_shutdown & RCV_SHUTDOWN)
2721	goto unlock;
2722
2723	unix_state_unlock(sk);
2724	if (!timeo) {
2725	err = -EAGAIN;
2726	break;
2727	}
2728
2729	mutex_unlock(lock: &u->iolock);
2730
2731	timeo = unix_stream_data_wait(sk, timeo, last,
2732	last_len, freezable);
2733
2734	if (signal_pending(current)) {
2735	err = sock_intr_errno(timeo);
2736	scm_destroy(scm: &scm);
2737	goto out;
2738	}
2739
2740	mutex_lock(&u->iolock);
2741	goto redo;
2742	unlock:
2743	unix_state_unlock(sk);
2744	break;
2745	}
2746
2747	while (skip >= unix_skb_len(skb)) {
2748	skip -= unix_skb_len(skb);
2749	last = skb;
2750	last_len = skb->len;
2751	skb = skb_peek_next(skb, list_: &sk->sk_receive_queue);
2752	if (!skb)
2753	goto again;
2754	}
2755
2756	unix_state_unlock(sk);
2757
2758	if (check_creds) {
2759	/ Never glue messages from different writers /
2760	if (!unix_skb_scm_eq(skb, scm: &scm))
2761	break;
2762	} else if (test_bit(SOCK_PASSCRED, &sock->flags) \|\|
2763	test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2764	/ Copy credentials /
2765	scm_set_cred(scm: &scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2766	unix_set_secdata(scm: &scm, skb);
2767	check_creds = true;
2768	}
2769
2770	/ Copy address just once /
2771	if (state->msg && state->msg->msg_name) {
2772	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2773	state->msg->msg_name);
2774	unix_copy_addr(msg: state->msg, sk: skb->sk);
2775
2776	BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2777	state->msg->msg_name,
2778	&state->msg->msg_namelen);
2779
2780	sunaddr = NULL;
2781	}
2782
2783	chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2784	skb_get(skb);
2785	chunk = state->recv_actor(skb, skip, chunk, state);
2786	drop_skb = !unix_skb_len(skb);
2787	/ skb is only safe to use if !drop_skb /
2788	consume_skb(skb);
2789	if (chunk < `0`) {
2790	if (copied == `0`)
2791	copied = -EFAULT;
2792	break;
2793	}
2794	copied += chunk;
2795	size -= chunk;
2796
2797	if (drop_skb) {
2798	/ the skb was touched by a concurrent reader;*
2799	* we should not expect anything from this skb
2800	* anymore and assume it invalid - we can be
2801	* sure it was dropped from the socket queue
2802	*
2803	* let's report a short read
2804	*/
2805	err = `0`;
2806	break;
2807	}
2808
2809	/ Mark read part of skb as used /
2810	if (!(flags & MSG_PEEK)) {
2811	UNIXCB(skb).consumed += chunk;
2812
2813	sk_peek_offset_bwd(sk, val: chunk);
2814
2815	if (UNIXCB(skb).fp) {
2816	scm_stat_del(sk, skb);
2817	unix_detach_fds(scm: &scm, skb);
2818	}
2819
2820	if (unix_skb_len(skb))
2821	break;
2822
2823	skb_unlink(skb, list: &sk->sk_receive_queue);
2824	consume_skb(skb);
2825
2826	if (scm.fp)
2827	break;
2828	} else {
2829	/ It is questionable, see note in unix_dgram_recvmsg.*
2830	*/
2831	if (UNIXCB(skb).fp)
2832	unix_peek_fds(scm: &scm, skb);
2833
2834	sk_peek_offset_fwd(sk, val: chunk);
2835
2836	if (UNIXCB(skb).fp)
2837	break;
2838
2839	skip = `0`;
2840	last = skb;
2841	last_len = skb->len;
2842	unix_state_lock(sk);
2843	skb = skb_peek_next(skb, list_: &sk->sk_receive_queue);
2844	if (skb)
2845	goto again;
2846	unix_state_unlock(sk);
2847	break;
2848	}
2849	} while (size);
2850
2851	mutex_unlock(lock: &u->iolock);
2852	if (state->msg)
2853	scm_recv_unix(sock, msg: state->msg, scm: &scm, flags);
2854	else
2855	scm_destroy(scm: &scm);
2856	out:
2857	return copied ? : err;
2858	}
2859
2860	static int unix_stream_read_actor(struct sk_buff *skb,
2861	int skip, int chunk,
2862	struct unix_stream_read_state *state)
2863	{
2864	int ret;
2865
2866	ret = skb_copy_datagram_msg(from: skb, UNIXCB(skb).consumed + skip,
2867	msg: state->msg, size: chunk);
2868	return ret ?: chunk;
2869	}
2870
2871	int __unix_stream_recvmsg(struct sock sk, struct* msghdr *msg,
2872	size_t size, int flags)
2873	{
2874	struct unix_stream_read_state state = {
2875	.recv_actor = unix_stream_read_actor,
2876	.socket = sk->sk_socket,
2877	.msg = msg,
2878	.size = size,
2879	.flags = flags
2880	};
2881
2882	return unix_stream_read_generic(state: &state, freezable: true);
2883	}
2884
2885	static int unix_stream_recvmsg(struct socket sock, struct* msghdr *msg,
2886	size_t size, int flags)
2887	{
2888	struct unix_stream_read_state state = {
2889	.recv_actor = unix_stream_read_actor,
2890	.socket = sock,
2891	.msg = msg,
2892	.size = size,
2893	.flags = flags
2894	};
2895
2896	#ifdef CONFIG_BPF_SYSCALL
2897	struct sock *sk = sock->sk;
2898	const struct proto *prot = READ_ONCE(sk->sk_prot);
2899
2900	if (prot != &unix_stream_proto)
2901	return prot->recvmsg(sk, msg, size, flags, NULL);
2902	#endif
2903	return unix_stream_read_generic(state: &state, freezable: true);
2904	}
2905
2906	static int unix_stream_splice_actor(struct sk_buff *skb,
2907	int skip, int chunk,
2908	struct unix_stream_read_state *state)
2909	{
2910	return skb_splice_bits(skb, sk: state->socket->sk,
2911	UNIXCB(skb).consumed + skip,
2912	pipe: state->pipe, len: chunk, flags: state->splice_flags);
2913	}
2914
2915	static ssize_t unix_stream_splice_read(struct socket sock, loff_t ppos,
2916	struct pipe_inode_info *pipe,
2917	size_t size, unsigned int flags)
2918	{
2919	struct unix_stream_read_state state = {
2920	.recv_actor = unix_stream_splice_actor,
2921	.socket = sock,
2922	.pipe = pipe,
2923	.size = size,
2924	.splice_flags = flags,
2925	};
2926
2927	if (unlikely(*ppos))
2928	return -ESPIPE;
2929
2930	if (sock->file->f_flags & O_NONBLOCK \|\|
2931	flags & SPLICE_F_NONBLOCK)
2932	state.flags = MSG_DONTWAIT;
2933
2934	return unix_stream_read_generic(state: &state, freezable: false);
2935	}
2936
2937	static int unix_shutdown(struct socket sock, int* mode)
2938	{
2939	struct sock *sk = sock->sk;
2940	struct sock *other;
2941
2942	if (mode < SHUT_RD \|\| mode > SHUT_RDWR)
2943	return -EINVAL;
2944	/ This maps:*
2945	* SHUT_RD (0) -> RCV_SHUTDOWN (1)
2946	* SHUT_WR (1) -> SEND_SHUTDOWN (2)
2947	* SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2948	*/
2949	++mode;
2950
2951	unix_state_lock(sk);
2952	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| mode);
2953	other = unix_peer(sk);
2954	if (other)
2955	sock_hold(sk: other);
2956	unix_state_unlock(sk);
2957	sk->sk_state_change(sk);
2958
2959	if (other &&
2960	(sk->sk_type == SOCK_STREAM \|\| sk->sk_type == SOCK_SEQPACKET)) {
2961
2962	int peer_mode = `0`;
2963	const struct proto *prot = READ_ONCE(other->sk_prot);
2964
2965	if (prot->unhash)
2966	prot->unhash(other);
2967	if (mode&RCV_SHUTDOWN)
2968	peer_mode \|= SEND_SHUTDOWN;
2969	if (mode&SEND_SHUTDOWN)
2970	peer_mode \|= RCV_SHUTDOWN;
2971	unix_state_lock(other);
2972	WRITE_ONCE(other->sk_shutdown, other->sk_shutdown \| peer_mode);
2973	unix_state_unlock(other);
2974	other->sk_state_change(other);
2975	if (peer_mode == SHUTDOWN_MASK)
2976	sk_wake_async(sk: other, how: SOCK_WAKE_WAITD, POLL_HUP);
2977	else if (peer_mode & RCV_SHUTDOWN)
2978	sk_wake_async(sk: other, how: SOCK_WAKE_WAITD, POLL_IN);
2979	}
2980	if (other)
2981	sock_put(sk: other);
2982
2983	return `0`;
2984	}
2985
2986	long unix_inq_len(struct sock *sk)
2987	{
2988	struct sk_buff *skb;
2989	long amount = `0`;
2990
2991	if (sk->sk_state == TCP_LISTEN)
2992	return -EINVAL;
2993
2994	spin_lock(lock: &sk->sk_receive_queue.lock);
2995	if (sk->sk_type == SOCK_STREAM \|\|
2996	sk->sk_type == SOCK_SEQPACKET) {
2997	skb_queue_walk(&sk->sk_receive_queue, skb)
2998	amount += unix_skb_len(skb);
2999	} else {
3000	skb = skb_peek(list_: &sk->sk_receive_queue);
3001	if (skb)
3002	amount = skb->len;
3003	}
3004	spin_unlock(lock: &sk->sk_receive_queue.lock);
3005
3006	return amount;
3007	}
3008	EXPORT_SYMBOL_GPL(unix_inq_len);
3009
/* Amount of send buffer memory currently charged to @sk. */
long unix_outq_len(struct sock *sk)
{
	long queued = sk_wmem_alloc_get(sk);

	return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
3015
3016	static int unix_open_file(struct sock *sk)
3017	{
3018	struct path path;
3019	struct file *f;
3020	int fd;
3021
3022	if (!ns_capable(ns: sock_net(sk)->user_ns, CAP_NET_ADMIN))
3023	return -EPERM;
3024
3025	if (!smp_load_acquire(&unix_sk(sk)->addr))
3026	return -ENOENT;
3027
3028	path = unix_sk(sk)->path;
3029	if (!path.dentry)
3030	return -ENOENT;
3031
3032	path_get(&path);
3033
3034	fd = get_unused_fd_flags(O_CLOEXEC);
3035	if (fd < `0`)
3036	goto out;
3037
3038	f = dentry_open(path: &path, O_PATH, current_cred());
3039	if (IS_ERR(ptr: f)) {
3040	put_unused_fd(fd);
3041	fd = PTR_ERR(ptr: f);
3042	goto out;
3043	}
3044
3045	fd_install(fd, file: f);
3046	out:
3047	path_put(&path);
3048
3049	return fd;
3050	}
3051
3052	static int unix_ioctl(struct socket sock, unsigned* int cmd, unsigned long arg)
3053	{
3054	struct sock *sk = sock->sk;
3055	long amount = `0`;
3056	int err;
3057
3058	switch (cmd) {
3059	case SIOCOUTQ:
3060	amount = unix_outq_len(sk);
3061	err = put_user(amount, (int __user *)arg);
3062	break;
3063	case SIOCINQ:
3064	amount = unix_inq_len(sk);
3065	if (amount < `0`)
3066	err = amount;
3067	else
3068	err = put_user(amount, (int __user *)arg);
3069	break;
3070	case SIOCUNIXFILE:
3071	err = unix_open_file(sk);
3072	break;
3073	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3074	case SIOCATMARK:
3075	{
3076	struct sk_buff *skb;
3077	int answ = `0`;
3078
3079	skb = skb_peek(list_: &sk->sk_receive_queue);
3080	if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
3081	answ = `1`;
3082	err = put_user(answ, (int __user *)arg);
3083	}
3084	break;
3085	#endif
3086	default:
3087	err = -ENOIOCTLCMD;
3088	break;
3089	}
3090	return err;
3091	}
3092
3093	#ifdef CONFIG_COMPAT
/* Compat ioctl(): convert the compat user pointer in @arg and forward to
 * unix_ioctl().
 */
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return unix_ioctl(sock, cmd, native_arg);
}
3098	#endif
3099
3100	static __poll_t unix_poll(struct file file, struct* socket sock, poll_table wait)
3101	{
3102	struct sock *sk = sock->sk;
3103	__poll_t mask;
3104	u8 shutdown;
3105
3106	sock_poll_wait(filp: file, sock, p: wait);
3107	mask = `0`;
3108	shutdown = READ_ONCE(sk->sk_shutdown);
3109
3110	/ exceptional events? /
3111	if (READ_ONCE(sk->sk_err))
3112	mask \|= EPOLLERR;
3113	if (shutdown == SHUTDOWN_MASK)
3114	mask \|= EPOLLHUP;
3115	if (shutdown & RCV_SHUTDOWN)
3116	mask \|= EPOLLRDHUP \| EPOLLIN \| EPOLLRDNORM;
3117
3118	/ readable? /
3119	if (!skb_queue_empty_lockless(list: &sk->sk_receive_queue))
3120	mask \|= EPOLLIN \| EPOLLRDNORM;
3121	if (sk_is_readable(sk))
3122	mask \|= EPOLLIN \| EPOLLRDNORM;
3123	#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3124	if (READ_ONCE(unix_sk(sk)->oob_skb))
3125	mask \|= EPOLLPRI;
3126	#endif
3127
3128	/ Connection-based need to check for termination and startup /
3129	if ((sk->sk_type == SOCK_STREAM \|\| sk->sk_type == SOCK_SEQPACKET) &&
3130	sk->sk_state == TCP_CLOSE)
3131	mask \|= EPOLLHUP;
3132
3133	/*
3134	* we set writable also when the other side has shut down the
3135	* connection. This prevents stuck sockets.
3136	*/
3137	if (unix_writable(sk))
3138	mask \|= EPOLLOUT \| EPOLLWRNORM \| EPOLLWRBAND;
3139
3140	return mask;
3141	}
3142
3143	static __poll_t unix_dgram_poll(struct file file, struct* socket *sock,
3144	poll_table *wait)
3145	{
3146	struct sock sk = sock->sk, other;
3147	unsigned int writable;
3148	__poll_t mask;
3149	u8 shutdown;
3150
3151	sock_poll_wait(filp: file, sock, p: wait);
3152	mask = `0`;
3153	shutdown = READ_ONCE(sk->sk_shutdown);
3154
3155	/ exceptional events? /
3156	if (READ_ONCE(sk->sk_err) \|\|
3157	!skb_queue_empty_lockless(list: &sk->sk_error_queue))
3158	mask \|= EPOLLERR \|
3159	(sock_flag(sk, flag: SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : `0`);
3160
3161	if (shutdown & RCV_SHUTDOWN)
3162	mask \|= EPOLLRDHUP \| EPOLLIN \| EPOLLRDNORM;
3163	if (shutdown == SHUTDOWN_MASK)
3164	mask \|= EPOLLHUP;
3165
3166	/ readable? /
3167	if (!skb_queue_empty_lockless(list: &sk->sk_receive_queue))
3168	mask \|= EPOLLIN \| EPOLLRDNORM;
3169	if (sk_is_readable(sk))
3170	mask \|= EPOLLIN \| EPOLLRDNORM;
3171
3172	/ Connection-based need to check for termination and startup /
3173	if (sk->sk_type == SOCK_SEQPACKET) {
3174	if (sk->sk_state == TCP_CLOSE)
3175	mask \|= EPOLLHUP;
3176	/ connection hasn't started yet? /
3177	if (sk->sk_state == TCP_SYN_SENT)
3178	return mask;
3179	}
3180
3181	/ No write status requested, avoid expensive OUT tests. /
3182	if (!(poll_requested_events(p: wait) & (EPOLLWRBAND\|EPOLLWRNORM\|EPOLLOUT)))
3183	return mask;
3184
3185	writable = unix_writable(sk);
3186	if (writable) {
3187	unix_state_lock(sk);
3188
3189	other = unix_peer(sk);
3190	if (other && unix_peer(other) != sk &&
3191	unix_recvq_full_lockless(sk: other) &&
3192	unix_dgram_peer_wake_me(sk, other))
3193	writable = `0`;
3194
3195	unix_state_unlock(sk);
3196	}
3197
3198	if (writable)
3199	mask \|= EPOLLOUT \| EPOLLWRNORM \| EPOLLWRBAND;
3200	else
3201	sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3202
3203	return mask;
3204	}
3205
3206	#ifdef CONFIG_PROC_FS
3207
/* A seq_file position packs a hash bucket number into the high bits and a
 * 1-based offset within that bucket into the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

/* Bucket number stored in position @x. */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
/* Offset within the bucket stored in position @x. */
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
/* Build a position from bucket @b and offset @o. */
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3213
3214	static struct sock unix_from_bucket(struct* seq_file seq, loff_t pos)
3215	{
3216	unsigned long offset = get_offset(*pos);
3217	unsigned long bucket = get_bucket(*pos);
3218	unsigned long count = `0`;
3219	struct sock *sk;
3220
3221	for (sk = sk_head(head: &seq_file_net(seq)->unx.table.buckets[bucket]);
3222	sk; sk = sk_next(sk)) {
3223	if (++count == offset)
3224	break;
3225	}
3226
3227	return sk;
3228	}
3229
3230	static struct sock unix_get_first(struct* seq_file seq, loff_t pos)
3231	{
3232	unsigned long bucket = get_bucket(*pos);
3233	struct net *net = seq_file_net(seq);
3234	struct sock *sk;
3235
3236	while (bucket < UNIX_HASH_SIZE) {
3237	spin_lock(lock: &net->unx.table.locks[bucket]);
3238
3239	sk = unix_from_bucket(seq, pos);
3240	if (sk)
3241	return sk;
3242
3243	spin_unlock(lock: &net->unx.table.locks[bucket]);
3244
3245	*pos = set_bucket_offset(++bucket, `1`);
3246	}
3247
3248	return NULL;
3249	}
3250
3251	static struct sock unix_get_next(struct* seq_file seq, struct* sock *sk,
3252	loff_t *pos)
3253	{
3254	unsigned long bucket = get_bucket(*pos);
3255
3256	sk = sk_next(sk);
3257	if (sk)
3258	return sk;
3259
3260
3261	spin_unlock(lock: &seq_file_net(seq)->unx.table.locks[bucket]);
3262
3263	*pos = set_bucket_offset(++bucket, `1`);
3264
3265	return unix_get_first(seq, pos);
3266	}
3267
3268	static void unix_seq_start(struct* seq_file seq, loff_t pos)
3269	{
3270	if (!*pos)
3271	return SEQ_START_TOKEN;
3272
3273	return unix_get_first(seq, pos);
3274	}
3275
3276	static void unix_seq_next(struct* seq_file seq, void* v, loff_t pos)
3277	{
3278	++*pos;
3279
3280	if (v == SEQ_START_TOKEN)
3281	return unix_get_first(seq, pos);
3282
3283	return unix_get_next(seq, sk: v, pos);
3284	}
3285
3286	static void unix_seq_stop(struct seq_file seq, void* *v)
3287	{
3288	struct sock *sk = v;
3289
3290	if (sk)
3291	spin_unlock(lock: &seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3292	}
3293
/* seq_file ->show: emit the column header for SEQ_START_TOKEN, otherwise
 * one line describing the socket @v.  Always returns 0.
 *
 * NOTE(review): the spacing inside the header literal below may have been
 * collapsed by whatever produced this copy of the file - confirm it
 * against the expected column layout before relying on it.
 */
3294	static int unix_seq_show(struct seq_file seq, void* *v)
3295	{
3296
3297	if (v == SEQ_START_TOKEN)
3298	seq_puts(m: seq, s: "Num RefCount Protocol Flags Type St "
3299	"Inode Path\n");
3300	else {
3301	struct sock *s = v;
3302	struct unix_sock *u = unix_sk(s);
/* The socket state lock is held while the line is formatted. */
3303	unix_state_lock(s);
3304
/* Columns: pointer, refcount, constant 0, __SO_ACCEPTCON when listening,
 * socket type, socket state, inode number.
 */
3305	seq_printf(m: seq, fmt: "%pK: %08X %08X %08X %04X %02X %5lu",
3306	s,
3307	refcount_read(r: &s->sk_refcnt),
3308	`0`,
3309	s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : `0`,
3310	s->sk_type,
3311	s->sk_socket ?
3312	(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3313	(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3314	sock_i_ino(sk: s));
3315
3316	if (u->addr) { // under a hash table lock here
3317	int i, len;
3318	seq_putc(m: seq, c: `' '`);
3319
3320	i = `0`;
3321	len = u->addr->len -
3322	offsetof(struct sockaddr_un, sun_path);
/* A name starting with a NUL byte is printed with '@' in place of that
 * byte; otherwise one trailing byte is left out (presumably the
 * terminating NUL - confirm against how addr->len is computed).
 */
3323	if (u->addr->name->sun_path[`0`]) {
3324	len--;
3325	} else {
3326	seq_putc(m: seq, c: `'@'`);
3327	i++;
3328	}
/* Remaining NUL bytes inside the name are shown as '@' as well. */
3329	for ( ; i < len; i++)
3330	seq_putc(m: seq, c: u->addr->name->sun_path[i] ?:
3331	`'@'`);
3332	}
3333	unix_state_unlock(s);
3334	seq_putc(m: seq, c: `'\n'`);
3335	}
3336
3337	return `0`;
3338	}
3339
3340	static const struct seq_operations unix_seq_ops = {
3341	.start = unix_seq_start,
3342	.next = unix_seq_next,
3343	.stop = unix_seq_stop,
3344	.show = unix_seq_show,
3345	};
3346
3347	#ifdef CONFIG_BPF_SYSCALL
3348	struct bpf_unix_iter_state {
3349	struct seq_net_private p;
3350	unsigned int cur_sk;
3351	unsigned int end_sk;
3352	unsigned int max_sk;
3353	struct sock **batch;
3354	bool st_bucket_done;
3355	};
3356
3357	struct bpf_iter__unix {
3358	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3359	__bpf_md_ptr(struct unix_sock *, unix_sk);
3360	uid_t uid __aligned(`8`);
3361	};
3362
3363	static int unix_prog_seq_show(struct bpf_prog prog, struct* bpf_iter_meta *meta,
3364	struct unix_sock *unix_sk, uid_t uid)
3365	{
3366	struct bpf_iter__unix ctx;
3367
3368	meta->seq_num--; / skip SEQ_START_TOKEN /
3369	ctx.meta = meta;
3370	ctx.unix_sk = unix_sk;
3371	ctx.uid = uid;
3372	return bpf_iter_run_prog(prog, ctx: &ctx);
3373	}
3374
3375	static int bpf_iter_unix_hold_batch(struct seq_file seq, struct* sock *start_sk)
3376
3377	{
3378	struct bpf_unix_iter_state *iter = seq->private;
3379	unsigned int expected = `1`;
3380	struct sock *sk;
3381
3382	sock_hold(sk: start_sk);
3383	iter->batch[iter->end_sk++] = start_sk;
3384
3385	for (sk = sk_next(sk: start_sk); sk; sk = sk_next(sk)) {
3386	if (iter->end_sk < iter->max_sk) {
3387	sock_hold(sk);
3388	iter->batch[iter->end_sk++] = sk;
3389	}
3390
3391	expected++;
3392	}
3393
3394	spin_unlock(lock: &seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3395
3396	return expected;
3397	}
3398
3399	static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3400	{
3401	while (iter->cur_sk < iter->end_sk)
3402	sock_put(sk: iter->batch[iter->cur_sk++]);
3403	}
3404
3405	static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3406	unsigned int new_batch_sz)
3407	{
3408	struct sock **new_batch;
3409
3410	new_batch = kvmalloc(size: sizeof(new_batch) new_batch_sz,
3411	GFP_USER \| __GFP_NOWARN);
3412	if (!new_batch)
3413	return -ENOMEM;
3414
3415	bpf_iter_unix_put_batch(iter);
3416	kvfree(addr: iter->batch);
3417	iter->batch = new_batch;
3418	iter->max_sk = new_batch_sz;
3419
3420	return `0`;
3421	}
3422
3423	static struct sock bpf_iter_unix_batch(struct* seq_file *seq,
3424	loff_t *pos)
3425	{
3426	struct bpf_unix_iter_state *iter = seq->private;
3427	unsigned int expected;
3428	bool resized = false;
3429	struct sock *sk;
3430
3431	if (iter->st_bucket_done)
3432	pos = set_bucket_offset(get_bucket(pos) + `1`, `1`);
3433
3434	again:
3435	/ Get a new batch /
3436	iter->cur_sk = `0`;
3437	iter->end_sk = `0`;
3438
3439	sk = unix_get_first(seq, pos);
3440	if (!sk)
3441	return NULL; / Done /
3442
3443	expected = bpf_iter_unix_hold_batch(seq, start_sk: sk);
3444
3445	if (iter->end_sk == expected) {
3446	iter->st_bucket_done = true;
3447	return sk;
3448	}
3449
3450	if (!resized && !bpf_iter_unix_realloc_batch(iter, new_batch_sz: expected * `3` / `2`)) {
3451	resized = true;
3452	goto again;
3453	}
3454
3455	return sk;
3456	}
3457
3458	static void bpf_iter_unix_seq_start(struct* seq_file seq, loff_t pos)
3459	{
3460	if (!*pos)
3461	return SEQ_START_TOKEN;
3462
3463	/ bpf iter does not support lseek, so it always*
3464	* continue from where it was stop()-ped.
3465	*/
3466	return bpf_iter_unix_batch(seq, pos);
3467	}
3468
3469	static void bpf_iter_unix_seq_next(struct* seq_file seq, void* v, loff_t pos)
3470	{
3471	struct bpf_unix_iter_state *iter = seq->private;
3472	struct sock *sk;
3473
3474	/ Whenever seq_next() is called, the iter->cur_sk is*
3475	* done with seq_show(), so advance to the next sk in
3476	* the batch.
3477	*/
3478	if (iter->cur_sk < iter->end_sk)
3479	sock_put(sk: iter->batch[iter->cur_sk++]);
3480
3481	++*pos;
3482
3483	if (iter->cur_sk < iter->end_sk)
3484	sk = iter->batch[iter->cur_sk];
3485	else
3486	sk = bpf_iter_unix_batch(seq, pos);
3487
3488	return sk;
3489	}
3490
3491	static int bpf_iter_unix_seq_show(struct seq_file seq, void* *v)
3492	{
3493	struct bpf_iter_meta meta;
3494	struct bpf_prog *prog;
3495	struct sock *sk = v;
3496	uid_t uid;
3497	bool slow;
3498	int ret;
3499
3500	if (v == SEQ_START_TOKEN)
3501	return `0`;
3502
3503	slow = lock_sock_fast(sk);
3504
3505	if (unlikely(sk_unhashed(sk))) {
3506	ret = SEQ_SKIP;
3507	goto unlock;
3508	}
3509
3510	uid = from_kuid_munged(to: seq_user_ns(seq), uid: sock_i_uid(sk));
3511	meta.seq = seq;
3512	prog = bpf_iter_get_info(meta: &meta, in_stop: false);
3513	ret = unix_prog_seq_show(prog, meta: &meta, unix_sk: v, uid);
3514	unlock:
3515	unlock_sock_fast(sk, slow);
3516	return ret;
3517	}
3518
3519	static void bpf_iter_unix_seq_stop(struct seq_file seq, void* *v)
3520	{
3521	struct bpf_unix_iter_state *iter = seq->private;
3522	struct bpf_iter_meta meta;
3523	struct bpf_prog *prog;
3524
3525	if (!v) {
3526	meta.seq = seq;
3527	prog = bpf_iter_get_info(meta: &meta, in_stop: true);
3528	if (prog)
3529	(void)unix_prog_seq_show(prog, meta: &meta, unix_sk: v, uid: `0`);
3530	}
3531
3532	if (iter->cur_sk < iter->end_sk)
3533	bpf_iter_unix_put_batch(iter);
3534	}
3535
3536	static const struct seq_operations bpf_iter_unix_seq_ops = {
3537	.start = bpf_iter_unix_seq_start,
3538	.next = bpf_iter_unix_seq_next,
3539	.stop = bpf_iter_unix_seq_stop,
3540	.show = bpf_iter_unix_seq_show,
3541	};
3542	#endif
3543	#endif
3544
3545	static const struct net_proto_family unix_family_ops = {
3546	.family = PF_UNIX,
3547	.create = unix_create,
3548	.owner = THIS_MODULE,
3549	};
3550
3551
3552	static int __net_init unix_net_init(struct net *net)
3553	{
3554	int i;
3555
3556	net->unx.sysctl_max_dgram_qlen = `10`;
3557	if (unix_sysctl_register(net))
3558	goto out;
3559
3560	#ifdef CONFIG_PROC_FS
3561	if (!proc_create_net("unix", `0`, net->proc_net, &unix_seq_ops,
3562	sizeof(struct seq_net_private)))
3563	goto err_sysctl;
3564	#endif
3565
3566	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3567	size: sizeof(spinlock_t), GFP_KERNEL);
3568	if (!net->unx.table.locks)
3569	goto err_proc;
3570
3571	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3572	size: sizeof(struct hlist_head),
3573	GFP_KERNEL);
3574	if (!net->unx.table.buckets)
3575	goto free_locks;
3576
3577	for (i = `0`; i < UNIX_HASH_SIZE; i++) {
3578	spin_lock_init(&net->unx.table.locks[i]);
3579	INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3580	}
3581
3582	return `0`;
3583
3584	free_locks:
3585	kvfree(addr: net->unx.table.locks);
3586	err_proc:
3587	#ifdef CONFIG_PROC_FS
3588	remove_proc_entry("unix", net->proc_net);
3589	err_sysctl:
3590	#endif
3591	unix_sysctl_unregister(net);
3592	out:
3593	return -ENOMEM;
3594	}
3595
3596	static void __net_exit unix_net_exit(struct net *net)
3597	{
3598	kvfree(addr: net->unx.table.buckets);
3599	kvfree(addr: net->unx.table.locks);
3600	unix_sysctl_unregister(net);
3601	remove_proc_entry("unix", net->proc_net);
3602	}
3603
3604	static struct pernet_operations unix_net_ops = {
3605	.init = unix_net_init,
3606	.exit = unix_net_exit,
3607	};
3608
3609	#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3610	DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3611	struct unix_sock *unix_sk, uid_t uid)
3612
/* Initial capacity, in sockets, of the iterator batch array. */
#define INIT_BATCH_SZ 16
3614
3615	static int bpf_iter_init_unix(void priv_data, struct* bpf_iter_aux_info *aux)
3616	{
3617	struct bpf_unix_iter_state *iter = priv_data;
3618	int err;
3619
3620	err = bpf_iter_init_seq_net(priv_data, aux);
3621	if (err)
3622	return err;
3623
3624	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3625	if (err) {
3626	bpf_iter_fini_seq_net(priv_data);
3627	return err;
3628	}
3629
3630	return `0`;
3631	}
3632
3633	static void bpf_iter_fini_unix(void *priv_data)
3634	{
3635	struct bpf_unix_iter_state *iter = priv_data;
3636
3637	bpf_iter_fini_seq_net(priv_data);
3638	kvfree(addr: iter->batch);
3639	}
3640
3641	static const struct bpf_iter_seq_info unix_seq_info = {
3642	.seq_ops = &bpf_iter_unix_seq_ops,
3643	.init_seq_private = bpf_iter_init_unix,
3644	.fini_seq_private = bpf_iter_fini_unix,
3645	.seq_priv_size = sizeof(struct bpf_unix_iter_state),
3646	};
3647
3648	static const struct bpf_func_proto *
3649	bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3650	const struct bpf_prog *prog)
3651	{
3652	switch (func_id) {
3653	case BPF_FUNC_setsockopt:
3654	return &bpf_sk_setsockopt_proto;
3655	case BPF_FUNC_getsockopt:
3656	return &bpf_sk_getsockopt_proto;
3657	default:
3658	return NULL;
3659	}
3660	}
3661
3662	static struct bpf_iter_reg unix_reg_info = {
3663	.target = "unix",
3664	.ctx_arg_info_size = `1`,
3665	.ctx_arg_info = {
3666	{ offsetof(struct bpf_iter__unix, unix_sk),
3667	PTR_TO_BTF_ID_OR_NULL },
3668	},
3669	.get_func_proto = bpf_iter_unix_get_func_proto,
3670	.seq_info = &unix_seq_info,
3671	};
3672
3673	static void __init bpf_iter_register(void)
3674	{
3675	unix_reg_info.ctx_arg_info[`0`].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3676	if (bpf_iter_reg_target(reg_info: &unix_reg_info))
3677	pr_warn("Warning: could not register bpf iterator unix\n");
3678	}
3679	#endif
3680
3681	static int __init af_unix_init(void)
3682	{
3683	int i, rc = -`1`;
3684
3685	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3686
3687	for (i = `0`; i < UNIX_HASH_SIZE / `2`; i++) {
3688	spin_lock_init(&bsd_socket_locks[i]);
3689	INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3690	}
3691
3692	rc = proto_register(prot: &unix_dgram_proto, alloc_slab: `1`);
3693	if (rc != `0`) {
3694	pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3695	goto out;
3696	}
3697
3698	rc = proto_register(prot: &unix_stream_proto, alloc_slab: `1`);
3699	if (rc != `0`) {
3700	pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3701	proto_unregister(prot: &unix_dgram_proto);
3702	goto out;
3703	}
3704
3705	sock_register(fam: &unix_family_ops);
3706	register_pernet_subsys(&unix_net_ops);
3707	unix_bpf_build_proto();
3708
3709	#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3710	bpf_iter_register();
3711	#endif
3712
3713	out:
3714	return rc;
3715	}
3716
/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);
3719

source code of linux/net/unix/af_unix.c