// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/eventfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". If the EFD_SEMAPHORE flag was
	 * not specified, a read(2) will return the "count" value to
	 * userspace and reset "count" to zero. The kernel-side
	 * eventfd_signal() also adds to the "count" counter and issues
	 * a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};
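
/*
 * Illustrative sketch (not part of this file): how the "count" semantics
 * documented above look from userspace. Assumes the usual eventfd(2)
 * wrapper from <sys/eventfd.h>.
 *
 *	int efd = eventfd(0, 0);		// count == 0
 *	uint64_t v = 3;
 *	write(efd, &v, sizeof(v));		// count += 3
 *	read(efd, &v, sizeof(v));		// v == 3, count reset to 0
 *
 * Had the eventfd been created with EFD_SEMAPHORE, the same read would
 * return 1 and leave count == 2.
 */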

/**
 * eventfd_signal_mask - Increment the event counter
 * @ctx: [in] Pointer to the eventfd context.
 * @mask: [in] poll mask
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach ULLONG_MAX,
 * and we signal this as an overflow condition by returning EPOLLERR
 * to poll(2).
 */
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers. If the caller uses potentially
	 * nested waitqueues with custom wakeup handlers, then it should
	 * check eventfd_signal_allowed() before calling this function. If
	 * it returns false, the eventfd_signal() call should be deferred to a
	 * safe context.
	 */
	if (WARN_ON_ONCE(current->in_eventfd))
		return;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	current->in_eventfd = 1;
	if (ctx->count < ULLONG_MAX)
		ctx->count++;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
	current->in_eventfd = 0;
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}
EXPORT_SYMBOL_GPL(eventfd_signal_mask);
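
/*
 * Usage sketch (illustrative, not part of this file): a caller that may
 * run inside a nested wakeup handler gates the signal as described above.
 * eventfd_signal() is assumed to be the inline wrapper around
 * eventfd_signal_mask() with a zero mask; the work item is hypothetical.
 *
 *	if (eventfd_signal_allowed())
 *		eventfd_signal(ctx);
 *	else
 *		schedule_work(&my_deferred_signal_work);	// hypothetical
 */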

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
	if (ctx->id >= 0)
		ida_free(&eventfd_ida, ctx->id);
	kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
	struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

	eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
	kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
	struct eventfd_ctx *ctx = file->private_data;

	wake_up_poll(&ctx->wqh, EPOLLHUP);
	eventfd_ctx_put(ctx);
	return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
	struct eventfd_ctx *ctx = file->private_data;
	__poll_t events = 0;
	u64 count;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * All writes to ctx->count occur within ctx->wqh.lock. This read
	 * can be done outside ctx->wqh.lock because we know that poll_wait
	 * takes that lock (through add_wait_queue) if our caller will sleep.
	 *
	 * The read _can_ therefore seep into add_wait_queue's critical
	 * section, but cannot move above it! add_wait_queue's spin_lock acts
	 * as an acquire barrier and ensures that the read be ordered properly
	 * against the writes. The following CAN happen and is safe:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     count = ctx->count
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        if (waitqueue_active)
	 *                                          wake_up_locked_poll
	 *                                        unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 *
	 * but the following, which would miss a wakeup, cannot happen:
	 *
	 *     poll                               write
	 *     -----------------                  ------------
	 *     count = ctx->count (INVALID!)
	 *                                        lock ctx->wqh.lock
	 *                                        ctx->count += n
	 *                                        **waitqueue_active is false**
	 *                                        **no wake_up_locked_poll!**
	 *                                        unlock ctx->wqh.lock
	 *     lock ctx->wqh.lock (in poll_wait)
	 *     __add_wait_queue
	 *     unlock ctx->wqh.lock
	 *     eventfd_poll returns 0
	 */
	count = READ_ONCE(ctx->count);

	if (count > 0)
		events |= EPOLLIN;
	if (count == ULLONG_MAX)
		events |= EPOLLERR;
	if (ULLONG_MAX - 1 > count)
		events |= EPOLLOUT;

	return events;
}
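
/*
 * Illustrative sketch (not part of this file): what the event bits above
 * mean to a poll(2) caller. The fd is readable once count > 0, writable
 * while a write of at least 1 can still complete, and EPOLLERR reports
 * counter overflow.
 *
 *	struct pollfd pfd = { .fd = efd, .events = POLLIN | POLLOUT };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			read(efd, &v, sizeof(v));
 *		if (pfd.revents & POLLOUT)
 *			write(efd, &v, sizeof(v));
 *	}
 */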

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
	lockdep_assert_held(&ctx->wqh.lock);

	*cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
	ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue entry to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->wqh.lock, flags);
	eventfd_ctx_do_read(ctx, cnt);
	__remove_wait_queue(&ctx->wqh, wait);
	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	spin_unlock_irqrestore(&ctx->wqh.lock, flags);

	return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
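
/*
 * Usage sketch (illustrative, not part of this file): a consumer that
 * attached its own wait_queue_entry_t to ctx->wqh (as KVM's irqfd does)
 * can detach and drain the counter atomically; "my_wait" and
 * "consume_events" are hypothetical.
 *
 *	__u64 cnt;
 *	if (eventfd_ctx_remove_wait_queue(ctx, &my_wait, &cnt) == 0)
 *		consume_events(cnt);
 */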

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct eventfd_ctx *ctx = file->private_data;
	__u64 ucnt = 0;

	if (iov_iter_count(to) < sizeof(ucnt))
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	if (!ctx->count) {
		if ((file->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -EAGAIN;
		}

		if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) {
			spin_unlock_irq(&ctx->wqh.lock);
			return -ERESTARTSYS;
		}
	}
	eventfd_ctx_do_read(ctx, &ucnt);
	current->in_eventfd = 1;
	if (waitqueue_active(&ctx->wqh))
		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
	current->in_eventfd = 0;
	spin_unlock_irq(&ctx->wqh.lock);
	if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
		return -EFAULT;

	return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct eventfd_ctx *ctx = file->private_data;
	ssize_t res;
	__u64 ucnt;

	if (count != sizeof(ucnt))
		return -EINVAL;
	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
		return -EFAULT;
	if (ucnt == ULLONG_MAX)
		return -EINVAL;
	spin_lock_irq(&ctx->wqh.lock);
	res = -EAGAIN;
	if (ULLONG_MAX - ctx->count > ucnt)
		res = sizeof(ucnt);
	else if (!(file->f_flags & O_NONBLOCK)) {
		res = wait_event_interruptible_locked_irq(ctx->wqh,
				ULLONG_MAX - ctx->count > ucnt);
		if (!res)
			res = sizeof(ucnt);
	}
	if (likely(res > 0)) {
		ctx->count += ucnt;
		current->in_eventfd = 1;
		if (waitqueue_active(&ctx->wqh))
			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
		current->in_eventfd = 0;
	}
	spin_unlock_irq(&ctx->wqh.lock);

	return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventfd_ctx *ctx = f->private_data;
	__u64 cnt;

	spin_lock_irq(&ctx->wqh.lock);
	cnt = ctx->count;
	spin_unlock_irq(&ctx->wqh.lock);

	seq_printf(m,
		   "eventfd-count: %16llx\n"
		   "eventfd-id: %d\n"
		   "eventfd-semaphore: %d\n",
		   cnt,
		   ctx->id,
		   !!(ctx->flags & EFD_SEMAPHORE));
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo = eventfd_show_fdinfo,
#endif
	.release = eventfd_release,
	.poll = eventfd_poll,
	.read_iter = eventfd_read,
	.write = eventfd_write,
	.llseek = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF  : Invalid @fd file descriptor.
 * -EINVAL : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADF);
	if (file->f_op != &eventfd_fops) {
		fput(file);
		return ERR_PTR(-EINVAL);
	}

	return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	struct eventfd_ctx *ctx;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);
	ctx = eventfd_ctx_fileget(f.file);
	fdput(f);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
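
/*
 * Lifecycle sketch (illustrative, not part of this file): kernel code
 * handed an eventfd file descriptor by userspace typically pins the
 * context once, signals it as needed, and drops the reference on teardown.
 *
 *	struct eventfd_ctx *ctx = eventfd_ctx_fdget(fd);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *	...
 *	eventfd_signal(ctx);	// assumed inline wrapper, see above
 *	...
 *	eventfd_ctx_put(ctx);
 */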

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL : The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
	struct eventfd_ctx *ctx;

	if (file->f_op != &eventfd_fops)
		return ERR_PTR(-EINVAL);

	ctx = file->private_data;
	kref_get(&ctx->kref);
	return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
	struct eventfd_ctx *ctx;
	struct file *file;
	int fd;

	/* Check the EFD_* constants for consistency. */
	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
	BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));

	if (flags & ~EFD_FLAGS_SET)
		return -EINVAL;

	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	kref_init(&ctx->kref);
	init_waitqueue_head(&ctx->wqh);
	ctx->count = count;
	ctx->flags = flags;
	ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);

	flags &= EFD_SHARED_FCNTL_FLAGS;
	flags |= O_RDWR;
	fd = get_unused_fd_flags(flags);
	if (fd < 0)
		goto err;

	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto err;
	}

	file->f_mode |= FMODE_NOWAIT;
	fd_install(fd, file);
	return fd;
err:
	eventfd_free_ctx(ctx);
	return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
	return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
	return do_eventfd(count, 0);
}
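
/*
 * Creation sketch (illustrative, not part of this file): the glibc
 * eventfd() wrapper reaches eventfd2(2), so both flags below go through
 * the EFD_FLAGS_SET check in do_eventfd().
 *
 *	int efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 *	if (efd < 0)
 *		perror("eventfd");
 */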