// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
static unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

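/*
 * For illustration (not kernel code): because head and tail wrap freely,
 * occupancy and slot lookup reduce to plain unsigned arithmetic, e.g.:
 *
 *	unsigned int occupancy = head - tail;	// correct across wraparound
 *	struct pipe_buffer *buf = &pipe->bufs[tail & (pipe->ring_size - 1)];
 *
 * This only works because ring_size is a power of two, so the mask
 * (ring_size - 1) selects the low-order bits of the unmasked index.
 */
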
#define cmp_int(l, r) ((l > r) - (l < r))

#ifdef CONFIG_PROVE_LOCKING
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
			    const struct lockdep_map *b)
{
	return cmp_int((unsigned long) a, (unsigned long) b);
}
#endif

void pipe_lock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 > pipe2)
		swap(pipe1, pipe2);

	pipe_lock(pipe1);
	pipe_lock(pipe2);
}

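/*
 * For illustration (not kernel code): taking the two locks above in
 * ascending address order makes the acquisition order globally consistent,
 * so two tasks double-locking the same pair can never deadlock:
 *
 *	task A: pipe_double_lock(p1, p2);	// locks min(p1, p2) first
 *	task B: pipe_double_lock(p2, p1);	// locks the same one first
 *
 * Without the swap, A holding p1 and B holding p2 could each block forever
 * waiting for the other (an ABBA deadlock).
 */
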
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) != 1)
		return false;
	memcg_kmem_uncharge_page(page, 0);
	__SetPageLocked(page);
	return true;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns %true and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	it wishes; the typical use is insertion into a different file
 *	page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.release	= anon_pipe_buf_release,
	.try_steal	= anon_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
					    struct pipe_buffer *buf,
					    unsigned int tail)
{
	pipe_buf_release(pipe, buf);

	/*
	 * If the pipe has a watch_queue, we need additional protection
	 * by the spinlock because notifications get posted with only
	 * this spinlock, no mutex
	 */
	if (pipe_has_watch_queue(pipe)) {
		spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
		if (buf->flags & PIPE_BUF_FLAG_LOSS)
			pipe->note_loss = true;
#endif
		pipe->tail = ++tail;
		spin_unlock_irq(&pipe->rd_wait.lock);
		return tail;
	}

	/*
	 * Without a watch_queue, we can simply increment the tail
	 * without the spinlock - the mutex is enough.
	 */
	pipe->tail = ++tail;
	return tail;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	mutex_lock(&pipe->mutex);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len)
				tail = pipe_update_tail(pipe, buf, tail);
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			ret = -EAGAIN;
			break;
		}
		mutex_unlock(&pipe->mutex);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		mutex_lock(&pipe->mutex);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	mutex_unlock(&pipe->mutex);

	if (was_full)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

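/*
 * Illustrative userspace sketch (not kernel code): packet mode is enabled
 * by opening the pipe with O_DIRECT, and each write() then becomes one
 * discrete packet that a read() consumes whole:
 *
 *	int fds[2];
 *	pipe2(fds, O_DIRECT);
 *	write(fds[1], "ab", 2);			// packet 1
 *	write(fds[1], "cd", 2);			// packet 2
 *	char buf[16];
 *	read(fds[0], buf, sizeof(buf));		// returns 2 ("ab"), not 4
 */
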
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/*
	 * Reject writing to watch queue pipes before the point where we lock
	 * the pipe.
	 * Otherwise, lockdep would be unhappy if the caller already has another
	 * pipe locked.
	 * If we had to support locking a normal pipe and a notification pipe at
	 * the same time, we could set up lockdep annotations for that, but
	 * since we don't actually need that, it's simpler to just bail here.
	 */
	if (pipe_has_watch_queue(pipe))
		return -EXDEV;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	mutex_lock(&pipe->mutex);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			pipe->head = head + 1;

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		mutex_unlock(&pipe->mutex);
		if (was_empty)
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		mutex_lock(&pipe->mutex);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	mutex_unlock(&pipe->mutex);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs.
	 *
	 * Epoll nonsensically wants a wakeup whether the pipe
	 * was already empty or not.
	 */
	if (was_empty || pipe->poll_usage)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&pipe->mutex);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		mutex_unlock(&pipe->mutex);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		mutex_lock(&pipe->mutex);
		ret = watch_queue_set_size(pipe, arg);
		mutex_unlock(&pipe->mutex);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

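/*
 * Illustrative userspace sketch (not kernel code): FIONREAD as handled
 * above reports how many bytes are currently buffered and readable
 * without blocking:
 *
 *	int avail;
 *	if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes pending\n", avail);
 */
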
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/* Epoll has some historical nasty semantics, this enables them */
	WRITE_ONCE(pipe->poll_usage, true);

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	mutex_lock(&pipe->mutex);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&pipe->mutex);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	mutex_lock(&pipe->mutex);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&pipe->mutex);
	return retval;
}

unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	unsigned int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue);
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __ro_after_init;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	simple_inode_init_ts(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	/* pipe groks IOCB_NOWAIT */
	files[0]->f_mode |= FMODE_NOWAIT;
	files[1]->f_mode |= FMODE_NOWAIT;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

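/*
 * Illustrative userspace sketch (not kernel code): the classic use of the
 * fd pair created above, wiring a child's stdout to the parent:
 *
 *	int fds[2];
 *	pipe2(fds, O_CLOEXEC);
 *	if (fork() == 0) {
 *		dup2(fds[1], STDOUT_FILENO);	// child writes
 *		execlp("ls", "ls", NULL);
 *	}
 *	close(fds[1]);				// parent reads from fds[0]
 */
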
/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
	pipe_lock(pipe);
}

void pipe_wait_writable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
	pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	DEFINE_WAIT(rdwait);
	int cur = *cnt;

	while (cur == *cnt) {
		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
		pipe_unlock(pipe);
		schedule();
		finish_wait(&pipe->rd_wait, &rdwait);
		pipe_lock(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	mutex_lock(&pipe->mutex);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 * O_RDONLY
	 * POSIX.1 says that O_NONBLOCK means return with the FIFO
	 * opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 * O_WRONLY
	 * POSIX.1 says that O_NONBLOCK means return -1 with
	 * errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 * O_RDWR
	 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 * This implementation will NEVER block on a O_RDWR open, since
	 * the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	mutex_unlock(&pipe->mutex);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	mutex_unlock(&pipe->mutex);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned int size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

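/*
 * For illustration (not kernel code), with 4 KiB pages:
 *
 *	round_pipe_size(0)          -> 4096     (clamped up to PAGE_SIZE)
 *	round_pipe_size(4097)       -> 8192     (next power of two)
 *	round_pipe_size(1U << 31)   -> 1U << 31
 *	round_pipe_size((1U<<31)+1) -> 0        (too big)
 */
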
/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	spin_lock_irq(&pipe->rd_wait.lock);
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;

	n = pipe_occupancy(head, tail);
	if (nr_slots < n) {
		spin_unlock_irq(&pipe->rd_wait.lock);
		kfree(bufs);
		return -EBUSY;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	if (!pipe_has_watch_queue(pipe)) {
		pipe->max_usage = nr_slots;
		pipe->nr_accounted = nr_slots;
	}

	spin_unlock_irq(&pipe->rd_wait.lock);

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}

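/*
 * For illustration (not kernel code): resizing re-bases the ring at zero.
 * With ring_size = 4, head = 6, tail = 5 (one occupied slot at index
 * 5 & 3 = 1), growing to 8 slots copies that one buffer to bufs[0] and
 * sets tail = 0, head = 1; the occupancy (head - tail) is preserved.
 */
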
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

	if (pipe_has_watch_queue(pipe))
		return -EBUSY;

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
	    size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
	    (too_many_pipe_buffers_hard(user_bufs) ||
	     too_many_pipe_buffers_soft(user_bufs)) &&
	    pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

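/*
 * Illustrative userspace sketch (not kernel code): resizing a pipe via
 * fcntl(). Because of round_pipe_size() above, the returned size may be
 * larger than requested:
 *
 *	int newsz = fcntl(fds[1], F_SETPIPE_SZ, 1000000);
 *	// newsz == 1048576: 1000000 rounded up to the next power of two
 *	int cursz = fcntl(fds[1], F_GETPIPE_SZ);
 */
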
/*
 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
 * not enough to verify that this is a pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
	if (for_splice && pipe_has_watch_queue(pipe))
		return NULL;
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->mutex);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&pipe->mutex);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

#ifdef CONFIG_SYSCTL
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
					unsigned int *valp,
					int write, void *data)
{
	if (write) {
		unsigned int val;

		val = round_pipe_size(*lvalp);
		if (val == 0)
			return -EINVAL;

		*valp = val;
	} else {
		unsigned int val = *valp;
		*lvalp = (unsigned long) val;
	}

	return 0;
}

static int proc_dopipe_max_size(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	return do_proc_douintvec(table, write, buffer, lenp, ppos,
				 do_proc_dopipe_max_size_conv, NULL);
}

static struct ctl_table fs_pipe_sysctls[] = {
	{
		.procname	= "pipe-max-size",
		.data		= &pipe_max_size,
		.maxlen		= sizeof(pipe_max_size),
		.mode		= 0644,
		.proc_handler	= proc_dopipe_max_size,
	},
	{
		.procname	= "pipe-user-pages-hard",
		.data		= &pipe_user_pages_hard,
		.maxlen		= sizeof(pipe_user_pages_hard),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "pipe-user-pages-soft",
		.data		= &pipe_user_pages_soft,
		.maxlen		= sizeof(pipe_user_pages_soft),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
};
#endif

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
#ifdef CONFIG_SYSCTL
	register_sysctl_init("fs", fs_pipe_sysctls);
#endif
	return err;
}

fs_initcall(init_pipe_fs);