file.c source code [linux/fs/file.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/file.c
4	*
5	* Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
6	*
7	* Manage the dynamic fd arrays in the process files_struct.
8	*/
9
10	#include <linux/syscalls.h>
11	#include <linux/export.h>
12	#include <linux/fs.h>
13	#include <linux/kernel.h>
14	#include <linux/mm.h>
15	#include <linux/sched/signal.h>
16	#include <linux/slab.h>
17	#include <linux/file.h>
18	#include <linux/fdtable.h>
19	#include <linux/bitops.h>
20	#include <linux/spinlock.h>
21	#include <linux/rcupdate.h>
22	#include <linux/close_range.h>
23	#include <net/sock.h>
24
25	#include "internal.h"
26
27	unsigned int sysctl_nr_open __read_mostly = `1024`*`1024`;
28	unsigned int sysctl_nr_open_min = BITS_PER_LONG;
29	/ our min() is unusable in constant expressions ;-/ /
30	#define __const_min(x, y) ((x) < (y) ? (x) : (y))
31	unsigned int sysctl_nr_open_max =
32	__const_min(INT_MAX, ~(size_t)`0`/sizeof(void *)) & -BITS_PER_LONG;
33
34	static void __free_fdtable(struct fdtable *fdt)
35	{
36	kvfree(addr: fdt->fd);
37	kvfree(addr: fdt->open_fds);
38	kfree(objp: fdt);
39	}
40
41	static void free_fdtable_rcu(struct rcu_head *rcu)
42	{
43	__free_fdtable(container_of(rcu, struct fdtable, rcu));
44	}
45
/*
 * full_fds_bits holds one bit per long of open_fds.  BITBIT_NR() is the
 * number of longs such a second-level bitmap needs for @nr descriptors,
 * BITBIT_SIZE() is the same quantity in bytes.
 */
#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(sizeof(long) * BITBIT_NR(nr))
48
49	/*
50	* Copy 'count' fd bits from the old table to the new table and clear the extra
51	* space if any. This does not copy the file pointers. Called with the files
52	* spinlock held for write.
53	*/
54	static void copy_fd_bitmaps(struct fdtable nfdt, struct* fdtable *ofdt,
55	unsigned int count)
56	{
57	unsigned int cpy, set;
58
59	cpy = count / BITS_PER_BYTE;
60	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
61	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
62	memset((char *)nfdt->open_fds + cpy, `0`, set);
63	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
64	memset((char *)nfdt->close_on_exec + cpy, `0`, set);
65
66	cpy = BITBIT_SIZE(count);
67	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
68	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
69	memset((char *)nfdt->full_fds_bits + cpy, `0`, set);
70	}
71
72	/*
73	* Copy all file descriptors from the old table to the new, expanded table and
74	* clear the extra space. Called with the files spinlock held for write.
75	*/
76	static void copy_fdtable(struct fdtable nfdt, struct* fdtable *ofdt)
77	{
78	size_t cpy, set;
79
80	BUG_ON(nfdt->max_fds < ofdt->max_fds);
81
82	cpy = ofdt->max_fds * sizeof(struct file *);
83	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
84	memcpy(nfdt->fd, ofdt->fd, cpy);
85	memset((char *)nfdt->fd + cpy, `0`, set);
86
87	copy_fd_bitmaps(nfdt, ofdt, count: ofdt->max_fds);
88	}
89
90	/*
91	* Note how the fdtable bitmap allocations very much have to be a multiple of
92	* BITS_PER_LONG. This is not only because we walk those things in chunks of
93	* 'unsigned long' in some places, but simply because that is how the Linux
94	* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
95	* they are very much "bits in an array of unsigned long".
96	*
97	* The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
98	* by that "1024/sizeof(ptr)" before, we already know there are sufficient
99	* clear low bits. Clang seems to realize that, gcc ends up being confused.
100	*
101	* On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
102	* let's consider it documentation (and maybe a test-case for gcc to improve
103	* its code generation ;)
104	*/
105	static struct fdtable * alloc_fdtable(unsigned int nr)
106	{
107	struct fdtable *fdt;
108	void *data;
109
110	/*
111	* Figure out how many fds we actually want to support in this fdtable.
112	* Allocation steps are keyed to the size of the fdarray, since it
113	* grows far faster than any of the other dynamic data. We try to fit
114	* the fdarray into comfortable page-tuned chunks: starting at 1024B
115	* and growing in powers of two from there on.
116	*/
117	nr /= (`1024` / sizeof(struct file *));
118	nr = roundup_pow_of_two(nr + `1`);
119	nr = (`1024` / sizeof(struct* file *));
120	nr = ALIGN(nr, BITS_PER_LONG);
121	/*
122	* Note that this can drive nr below what we had passed if sysctl_nr_open
123	* had been set lower between the check in expand_files() and here. Deal
124	* with that in caller, it's cheaper that way.
125	*
126	* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
127	* bitmaps handling below becomes unpleasant, to put it mildly...
128	*/
129	if (unlikely(nr > sysctl_nr_open))
130	nr = ((sysctl_nr_open - `1`) \| (BITS_PER_LONG - `1`)) + `1`;
131
132	fdt = kmalloc(size: sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
133	if (!fdt)
134	goto out;
135	fdt->max_fds = nr;
136	data = kvmalloc_array(n: nr, size: sizeof(struct file *), GFP_KERNEL_ACCOUNT);
137	if (!data)
138	goto out_fdt;
139	fdt->fd = data;
140
141	data = kvmalloc(max_t(size_t,
142	`2` * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
143	GFP_KERNEL_ACCOUNT);
144	if (!data)
145	goto out_arr;
146	fdt->open_fds = data;
147	data += nr / BITS_PER_BYTE;
148	fdt->close_on_exec = data;
149	data += nr / BITS_PER_BYTE;
150	fdt->full_fds_bits = data;
151
152	return fdt;
153
154	out_arr:
155	kvfree(addr: fdt->fd);
156	out_fdt:
157	kfree(objp: fdt);
158	out:
159	return NULL;
160	}
161
162	/*
163	* Expand the file descriptor table.
164	* This function will allocate a new fdtable and both fd array and fdset, of
165	* the given size.
166	* Return <0 error code on error; 1 on successful completion.
167	* The files->file_lock should be held on entry, and will be held on exit.
168	*/
169	static int expand_fdtable(struct files_struct files, unsigned* int nr)
170	__releases(files->file_lock)
171	__acquires(files->file_lock)
172	{
173	struct fdtable new_fdt, cur_fdt;
174
175	spin_unlock(lock: &files->file_lock);
176	new_fdt = alloc_fdtable(nr);
177
178	/ make sure all fd_install() have seen resize_in_progress*
179	* or have finished their rcu_read_lock_sched() section.
180	*/
181	if (atomic_read(v: &files->count) > `1`)
182	synchronize_rcu();
183
184	spin_lock(lock: &files->file_lock);
185	if (!new_fdt)
186	return -ENOMEM;
187	/*
188	* extremely unlikely race - sysctl_nr_open decreased between the check in
189	* caller and alloc_fdtable(). Cheaper to catch it here...
190	*/
191	if (unlikely(new_fdt->max_fds <= nr)) {
192	__free_fdtable(fdt: new_fdt);
193	return -EMFILE;
194	}
195	cur_fdt = files_fdtable(files);
196	BUG_ON(nr < cur_fdt->max_fds);
197	copy_fdtable(nfdt: new_fdt, ofdt: cur_fdt);
198	rcu_assign_pointer(files->fdt, new_fdt);
199	if (cur_fdt != &files->fdtab)
200	call_rcu(head: &cur_fdt->rcu, func: free_fdtable_rcu);
201	/ coupled with smp_rmb() in fd_install() /
202	smp_wmb();
203	return `1`;
204	}
205
206	/*
207	* Expand files.
208	* This function will expand the file structures, if the requested size exceeds
209	* the current capacity and there is room for expansion.
210	* Return <0 error code on error; 0 when nothing done; 1 when files were
211	* expanded and execution may have blocked.
212	* The files->file_lock should be held on entry, and will be held on exit.
213	*/
214	static int expand_files(struct files_struct files, unsigned* int nr)
215	__releases(files->file_lock)
216	__acquires(files->file_lock)
217	{
218	struct fdtable *fdt;
219	int expanded = `0`;
220
221	repeat:
222	fdt = files_fdtable(files);
223
224	/ Do we need to expand? /
225	if (nr < fdt->max_fds)
226	return expanded;
227
228	/ Can we expand? /
229	if (nr >= sysctl_nr_open)
230	return -EMFILE;
231
232	if (unlikely(files->resize_in_progress)) {
233	spin_unlock(lock: &files->file_lock);
234	expanded = `1`;
235	wait_event(files->resize_wait, !files->resize_in_progress);
236	spin_lock(lock: &files->file_lock);
237	goto repeat;
238	}
239
240	/ All good, so we try /
241	files->resize_in_progress = true;
242	expanded = expand_fdtable(files, nr);
243	files->resize_in_progress = false;
244
245	wake_up_all(&files->resize_wait);
246	return expanded;
247	}
248
249	static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
250	{
251	__set_bit(fd, fdt->close_on_exec);
252	}
253
254	static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
255	{
256	if (test_bit(fd, fdt->close_on_exec))
257	__clear_bit(fd, fdt->close_on_exec);
258	}
259
260	static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
261	{
262	__set_bit(fd, fdt->open_fds);
263	fd /= BITS_PER_LONG;
264	if (!~fdt->open_fds[fd])
265	__set_bit(fd, fdt->full_fds_bits);
266	}
267
268	static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
269	{
270	__clear_bit(fd, fdt->open_fds);
271	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
272	}
273
274	static unsigned int count_open_files(struct fdtable *fdt)
275	{
276	unsigned int size = fdt->max_fds;
277	unsigned int i;
278
279	/ Find the last open fd /
280	for (i = size / BITS_PER_LONG; i > `0`; ) {
281	if (fdt->open_fds[--i])
282	break;
283	}
284	i = (i + `1`) * BITS_PER_LONG;
285	return i;
286	}
287
288	/*
289	* Note that a sane fdtable size always has to be a multiple of
290	* BITS_PER_LONG, since we have bitmaps that are sized by this.
291	*
292	* 'max_fds' will normally already be properly aligned, but it
293	* turns out that in the close_range() -> __close_range() ->
294	* unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
295	* up having a 'max_fds' value that isn't already aligned.
296	*
297	* Rather than make close_range() have to worry about this,
298	* just make that BITS_PER_LONG alignment be part of a sane
299	* fdtable size. Becuase that's really what it is.
300	*/
301	static unsigned int sane_fdtable_size(struct fdtable fdt, unsigned* int max_fds)
302	{
303	unsigned int count;
304
305	count = count_open_files(fdt);
306	if (max_fds < NR_OPEN_DEFAULT)
307	max_fds = NR_OPEN_DEFAULT;
308	return ALIGN(min(count, max_fds), BITS_PER_LONG);
309	}
310
311	/*
312	* Allocate a new files structure and copy contents from the
313	* passed in files structure.
314	* errorp will be valid only when the returned files_struct is NULL.
315	*/
316	struct files_struct dup_fd(struct* files_struct oldf, unsigned* int max_fds, int *errorp)
317	{
318	struct files_struct *newf;
319	struct file old_fds, new_fds;
320	unsigned int open_files, i;
321	struct fdtable old_fdt, new_fdt;
322
323	*errorp = -ENOMEM;
324	newf = kmem_cache_alloc(cachep: files_cachep, GFP_KERNEL);
325	if (!newf)
326	goto out;
327
328	atomic_set(v: &newf->count, i: `1`);
329
330	spin_lock_init(&newf->file_lock);
331	newf->resize_in_progress = false;
332	init_waitqueue_head(&newf->resize_wait);
333	newf->next_fd = `0`;
334	new_fdt = &newf->fdtab;
335	new_fdt->max_fds = NR_OPEN_DEFAULT;
336	new_fdt->close_on_exec = newf->close_on_exec_init;
337	new_fdt->open_fds = newf->open_fds_init;
338	new_fdt->full_fds_bits = newf->full_fds_bits_init;
339	new_fdt->fd = &newf->fd_array[`0`];
340
341	spin_lock(lock: &oldf->file_lock);
342	old_fdt = files_fdtable(oldf);
343	open_files = sane_fdtable_size(fdt: old_fdt, max_fds);
344
345	/*
346	* Check whether we need to allocate a larger fd array and fd set.
347	*/
348	while (unlikely(open_files > new_fdt->max_fds)) {
349	spin_unlock(lock: &oldf->file_lock);
350
351	if (new_fdt != &newf->fdtab)
352	__free_fdtable(fdt: new_fdt);
353
354	new_fdt = alloc_fdtable(nr: open_files - `1`);
355	if (!new_fdt) {
356	*errorp = -ENOMEM;
357	goto out_release;
358	}
359
360	/ beyond sysctl_nr_open; nothing to do /
361	if (unlikely(new_fdt->max_fds < open_files)) {
362	__free_fdtable(fdt: new_fdt);
363	*errorp = -EMFILE;
364	goto out_release;
365	}
366
367	/*
368	* Reacquire the oldf lock and a pointer to its fd table
369	* who knows it may have a new bigger fd table. We need
370	* the latest pointer.
371	*/
372	spin_lock(lock: &oldf->file_lock);
373	old_fdt = files_fdtable(oldf);
374	open_files = sane_fdtable_size(fdt: old_fdt, max_fds);
375	}
376
377	copy_fd_bitmaps(nfdt: new_fdt, ofdt: old_fdt, count: open_files);
378
379	old_fds = old_fdt->fd;
380	new_fds = new_fdt->fd;
381
382	for (i = open_files; i != `0`; i--) {
383	struct file f = old_fds++;
384	if (f) {
385	get_file(f);
386	} else {
387	/*
388	* The fd may be claimed in the fd bitmap but not yet
389	* instantiated in the files array if a sibling thread
390	* is partway through open(). So make sure that this
391	* fd is available to the new process.
392	*/
393	__clear_open_fd(fd: open_files - i, fdt: new_fdt);
394	}
395	rcu_assign_pointer(*new_fds++, f);
396	}
397	spin_unlock(lock: &oldf->file_lock);
398
399	/ clear the remainder /
400	memset(new_fds, `0`, (new_fdt->max_fds - open_files) * sizeof(struct file *));
401
402	rcu_assign_pointer(newf->fdt, new_fdt);
403
404	return newf;
405
406	out_release:
407	kmem_cache_free(s: files_cachep, objp: newf);
408	out:
409	return NULL;
410	}
411
412	static struct fdtable close_files(struct* files_struct * files)
413	{
414	/*
415	* It is safe to dereference the fd table without RCU or
416	* ->file_lock because this is the last reference to the
417	* files structure.
418	*/
419	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
420	unsigned int i, j = `0`;
421
422	for (;;) {
423	unsigned long set;
424	i = j * BITS_PER_LONG;
425	if (i >= fdt->max_fds)
426	break;
427	set = fdt->open_fds[j++];
428	while (set) {
429	if (set & `1`) {
430	struct file * file = xchg(&fdt->fd[i], NULL);
431	if (file) {
432	filp_close(file, id: files);
433	cond_resched();
434	}
435	}
436	i++;
437	set >>= `1`;
438	}
439	}
440
441	return fdt;
442	}
443
444	void put_files_struct(struct files_struct *files)
445	{
446	if (atomic_dec_and_test(v: &files->count)) {
447	struct fdtable *fdt = close_files(files);
448
449	/ free the arrays if they are not embedded /
450	if (fdt != &files->fdtab)
451	__free_fdtable(fdt);
452	kmem_cache_free(s: files_cachep, objp: files);
453	}
454	}
455
456	void exit_files(struct task_struct *tsk)
457	{
458	struct files_struct * files = tsk->files;
459
460	if (files) {
461	task_lock(p: tsk);
462	tsk->files = NULL;
463	task_unlock(p: tsk);
464	put_files_struct(files);
465	}
466	}
467
468	struct files_struct init_files = {
469	.count = ATOMIC_INIT(`1`),
470	.fdt = &init_files.fdtab,
471	.fdtab = {
472	.max_fds = NR_OPEN_DEFAULT,
473	.fd = &init_files.fd_array[`0`],
474	.close_on_exec = init_files.close_on_exec_init,
475	.open_fds = init_files.open_fds_init,
476	.full_fds_bits = init_files.full_fds_bits_init,
477	},
478	.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
479	.resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
480	};
481
482	static unsigned int find_next_fd(struct fdtable fdt, unsigned* int start)
483	{
484	unsigned int maxfd = fdt->max_fds;
485	unsigned int maxbit = maxfd / BITS_PER_LONG;
486	unsigned int bitbit = start / BITS_PER_LONG;
487
488	bitbit = find_next_zero_bit(addr: fdt->full_fds_bits, size: maxbit, offset: bitbit) * BITS_PER_LONG;
489	if (bitbit > maxfd)
490	return maxfd;
491	if (bitbit > start)
492	start = bitbit;
493	return find_next_zero_bit(addr: fdt->open_fds, size: maxfd, offset: start);
494	}
495
496	/*
497	* allocate a file descriptor, mark it busy.
498	*/
499	static int alloc_fd(unsigned start, unsigned end, unsigned flags)
500	{
501	struct files_struct *files = current->files;
502	unsigned int fd;
503	int error;
504	struct fdtable *fdt;
505
506	spin_lock(lock: &files->file_lock);
507	repeat:
508	fdt = files_fdtable(files);
509	fd = start;
510	if (fd < files->next_fd)
511	fd = files->next_fd;
512
513	if (fd < fdt->max_fds)
514	fd = find_next_fd(fdt, start: fd);
515
516	/*
517	* N.B. For clone tasks sharing a files structure, this test
518	* will limit the total number of files that can be opened.
519	*/
520	error = -EMFILE;
521	if (fd >= end)
522	goto out;
523
524	error = expand_files(files, nr: fd);
525	if (error < `0`)
526	goto out;
527
528	/*
529	* If we needed to expand the fs array we
530	* might have blocked - try again.
531	*/
532	if (error)
533	goto repeat;
534
535	if (start <= files->next_fd)
536	files->next_fd = fd + `1`;
537
538	__set_open_fd(fd, fdt);
539	if (flags & O_CLOEXEC)
540	__set_close_on_exec(fd, fdt);
541	else
542	__clear_close_on_exec(fd, fdt);
543	error = fd;
544	#if 1
545	/ Sanity check /
546	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
547	printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
548	rcu_assign_pointer(fdt->fd[fd], NULL);
549	}
550	#endif
551
552	out:
553	spin_unlock(lock: &files->file_lock);
554	return error;
555	}
556
/*
 * Allocate the lowest free descriptor below @nofile for the current task.
 * @flags is handed to alloc_fd() unchanged.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	return alloc_fd(0, nofile, flags);
}
561
562	int get_unused_fd_flags(unsigned flags)
563	{
564	return __get_unused_fd_flags(flags, nofile: rlimit(RLIMIT_NOFILE));
565	}
566	EXPORT_SYMBOL(get_unused_fd_flags);
567
568	static void __put_unused_fd(struct files_struct files, unsigned* int fd)
569	{
570	struct fdtable *fdt = files_fdtable(files);
571	__clear_open_fd(fd, fdt);
572	if (fd < files->next_fd)
573	files->next_fd = fd;
574	}
575
576	void put_unused_fd(unsigned int fd)
577	{
578	struct files_struct *files = current->files;
579	spin_lock(lock: &files->file_lock);
580	__put_unused_fd(files, fd);
581	spin_unlock(lock: &files->file_lock);
582	}
583
584	EXPORT_SYMBOL(put_unused_fd);
585
586	/*
587	* Install a file pointer in the fd array.
588	*
589	* The VFS is full of places where we drop the files lock between
590	* setting the open_fds bitmap and installing the file in the file
591	* array. At any such point, we are vulnerable to a dup2() race
592	* installing a file in the array before us. We need to detect this and
593	* fput() the struct file we are about to overwrite in this case.
594	*
595	* It should never happen - if we allow dup2() do it, _really_ bad things
596	* will follow.
597	*
598	* This consumes the "file" refcount, so callers should treat it
599	* as if they had called fput(file).
600	*/
601
602	void fd_install(unsigned int fd, struct file *file)
603	{
604	struct files_struct *files = current->files;
605	struct fdtable *fdt;
606
607	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
608	return;
609
610	rcu_read_lock_sched();
611
612	if (unlikely(files->resize_in_progress)) {
613	rcu_read_unlock_sched();
614	spin_lock(lock: &files->file_lock);
615	fdt = files_fdtable(files);
616	BUG_ON(fdt->fd[fd] != NULL);
617	rcu_assign_pointer(fdt->fd[fd], file);
618	spin_unlock(lock: &files->file_lock);
619	return;
620	}
621	/ coupled with smp_wmb() in expand_fdtable() /
622	smp_rmb();
623	fdt = rcu_dereference_sched(files->fdt);
624	BUG_ON(fdt->fd[fd] != NULL);
625	rcu_assign_pointer(fdt->fd[fd], file);
626	rcu_read_unlock_sched();
627	}
628
629	EXPORT_SYMBOL(fd_install);
630
631	/**
632	* pick_file - return file associatd with fd
633	* @files: file struct to retrieve file from
634	* @fd: file descriptor to retrieve file for
635	*
636	* Context: files_lock must be held.
637	*
638	* Returns: The file associated with @fd (NULL if @fd is not open)
639	*/
640	static struct file pick_file(struct* files_struct files, unsigned* fd)
641	{
642	struct fdtable *fdt = files_fdtable(files);
643	struct file *file;
644
645	if (fd >= fdt->max_fds)
646	return NULL;
647
648	fd = array_index_nospec(fd, fdt->max_fds);
649	file = fdt->fd[fd];
650	if (file) {
651	rcu_assign_pointer(fdt->fd[fd], NULL);
652	__put_unused_fd(files, fd);
653	}
654	return file;
655	}
656
657	int close_fd(unsigned fd)
658	{
659	struct files_struct *files = current->files;
660	struct file *file;
661
662	spin_lock(lock: &files->file_lock);
663	file = pick_file(files, fd);
664	spin_unlock(lock: &files->file_lock);
665	if (!file)
666	return -EBADF;
667
668	return filp_close(file, id: files);
669	}
670	EXPORT_SYMBOL(close_fd); / for ksys_close() /
671
672	/**
673	* last_fd - return last valid index into fd table
674	* @fdt: File descriptor table.
675	*
676	* Context: Either rcu read lock or files_lock must be held.
677	*
678	* Returns: Last valid index into fdtable.
679	*/
680	static inline unsigned last_fd(struct fdtable *fdt)
681	{
682	return fdt->max_fds - `1`;
683	}
684
685	static inline void __range_cloexec(struct files_struct *cur_fds,
686	unsigned int fd, unsigned int max_fd)
687	{
688	struct fdtable *fdt;
689
690	/ make sure we're using the correct maximum value /
691	spin_lock(lock: &cur_fds->file_lock);
692	fdt = files_fdtable(cur_fds);
693	max_fd = min(last_fd(fdt), max_fd);
694	if (fd <= max_fd)
695	bitmap_set(map: fdt->close_on_exec, start: fd, nbits: max_fd - fd + `1`);
696	spin_unlock(lock: &cur_fds->file_lock);
697	}
698
699	static inline void __range_close(struct files_struct files, unsigned* int fd,
700	unsigned int max_fd)
701	{
702	struct file *file;
703	unsigned n;
704
705	spin_lock(lock: &files->file_lock);
706	n = last_fd(files_fdtable(files));
707	max_fd = min(max_fd, n);
708
709	for (; fd <= max_fd; fd++) {
710	file = pick_file(files, fd);
711	if (file) {
712	spin_unlock(lock: &files->file_lock);
713	filp_close(file, id: files);
714	cond_resched();
715	spin_lock(lock: &files->file_lock);
716	} else if (need_resched()) {
717	spin_unlock(lock: &files->file_lock);
718	cond_resched();
719	spin_lock(lock: &files->file_lock);
720	}
721	}
722	spin_unlock(lock: &files->file_lock);
723	}
724
725	/**
726	* __close_range() - Close all file descriptors in a given range.
727	*
728	* @fd: starting file descriptor to close
729	* @max_fd: last file descriptor to close
730	* @flags: CLOSE_RANGE flags.
731	*
732	* This closes a range of file descriptors. All file descriptors
733	* from @fd up to and including @max_fd are closed.
734	*/
735	int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
736	{
737	struct task_struct *me = current;
738	struct files_struct cur_fds = me->files, fds = NULL;
739
740	if (flags & ~(CLOSE_RANGE_UNSHARE \| CLOSE_RANGE_CLOEXEC))
741	return -EINVAL;
742
743	if (fd > max_fd)
744	return -EINVAL;
745
746	if (flags & CLOSE_RANGE_UNSHARE) {
747	int ret;
748	unsigned int max_unshare_fds = NR_OPEN_MAX;
749
750	/*
751	* If the caller requested all fds to be made cloexec we always
752	* copy all of the file descriptors since they still want to
753	* use them.
754	*/
755	if (!(flags & CLOSE_RANGE_CLOEXEC)) {
756	/*
757	* If the requested range is greater than the current
758	* maximum, we're closing everything so only copy all
759	* file descriptors beneath the lowest file descriptor.
760	*/
761	rcu_read_lock();
762	if (max_fd >= last_fd(files_fdtable(cur_fds)))
763	max_unshare_fds = fd;
764	rcu_read_unlock();
765	}
766
767	ret = unshare_fd(CLONE_FILES, max_fds: max_unshare_fds, new_fdp: &fds);
768	if (ret)
769	return ret;
770
771	/*
772	* We used to share our file descriptor table, and have now
773	* created a private one, make sure we're using it below.
774	*/
775	if (fds)
776	swap(cur_fds, fds);
777	}
778
779	if (flags & CLOSE_RANGE_CLOEXEC)
780	__range_cloexec(cur_fds, fd, max_fd);
781	else
782	__range_close(files: cur_fds, fd, max_fd);
783
784	if (fds) {
785	/*
786	* We're done closing the files we were supposed to. Time to install
787	* the new file descriptor table and drop the old one.
788	*/
789	task_lock(p: me);
790	me->files = cur_fds;
791	task_unlock(p: me);
792	put_files_struct(files: fds);
793	}
794
795	return `0`;
796	}
797
798	/*
799	* See close_fd_get_file() below, this variant assumes current->files->file_lock
800	* is held.
801	*/
802	struct file __close_fd_get_file(unsigned* int fd)
803	{
804	return pick_file(current->files, fd);
805	}
806
807	/*
808	* variant of close_fd that gets a ref on the file for later fput.
809	* The caller must ensure that filp_close() called on the file.
810	*/
811	struct file close_fd_get_file(unsigned* int fd)
812	{
813	struct files_struct *files = current->files;
814	struct file *file;
815
816	spin_lock(lock: &files->file_lock);
817	file = pick_file(files, fd);
818	spin_unlock(lock: &files->file_lock);
819
820	return file;
821	}
822
823	void do_close_on_exec(struct files_struct *files)
824	{
825	unsigned i;
826	struct fdtable *fdt;
827
828	/ exec unshares first /
829	spin_lock(lock: &files->file_lock);
830	for (i = `0`; ; i++) {
831	unsigned long set;
832	unsigned fd = i * BITS_PER_LONG;
833	fdt = files_fdtable(files);
834	if (fd >= fdt->max_fds)
835	break;
836	set = fdt->close_on_exec[i];
837	if (!set)
838	continue;
839	fdt->close_on_exec[i] = `0`;
840	for ( ; set ; fd++, set >>= `1`) {
841	struct file *file;
842	if (!(set & `1`))
843	continue;
844	file = fdt->fd[fd];
845	if (!file)
846	continue;
847	rcu_assign_pointer(fdt->fd[fd], NULL);
848	__put_unused_fd(files, fd);
849	spin_unlock(lock: &files->file_lock);
850	filp_close(file, id: files);
851	cond_resched();
852	spin_lock(lock: &files->file_lock);
853	}
854
855	}
856	spin_unlock(lock: &files->file_lock);
857	}
858
859	static struct file __get_file_rcu(struct* file __rcu **f)
860	{
861	struct file __rcu *file;
862	struct file __rcu *file_reloaded;
863	struct file __rcu *file_reloaded_cmp;
864
865	file = rcu_dereference_raw(*f);
866	if (!file)
867	return NULL;
868
869	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
870	return ERR_PTR(error: -EAGAIN);
871
872	file_reloaded = rcu_dereference_raw(*f);
873
874	/*
875	* Ensure that all accesses have a dependency on the load from
876	* rcu_dereference_raw() above so we get correct ordering
877	* between reuse/allocation and the pointer check below.
878	*/
879	file_reloaded_cmp = file_reloaded;
880	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
881
882	/*
883	* atomic_long_inc_not_zero() above provided a full memory
884	* barrier when we acquired a reference.
885	*
886	* This is paired with the write barrier from assigning to the
887	* __rcu protected file pointer so that if that pointer still
888	* matches the current file, we know we have successfully
889	* acquired a reference to the right file.
890	*
891	* If the pointers don't match the file has been reallocated by
892	* SLAB_TYPESAFE_BY_RCU.
893	*/
894	if (file == file_reloaded_cmp)
895	return file_reloaded;
896
897	fput(file);
898	return ERR_PTR(error: -EAGAIN);
899	}
900
901	/**
902	* get_file_rcu - try go get a reference to a file under rcu
903	* @f: the file to get a reference on
904	*
905	* This function tries to get a reference on @f carefully verifying that
906	* @f hasn't been reused.
907	*
908	* This function should rarely have to be used and only by users who
909	* understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
910	*
911	* Return: Returns @f with the reference count increased or NULL.
912	*/
913	struct file get_file_rcu(struct* file __rcu **f)
914	{
915	for (;;) {
916	struct file __rcu *file;
917
918	file = __get_file_rcu(f);
919	if (unlikely(!file))
920	return NULL;
921
922	if (unlikely(IS_ERR(file)))
923	continue;
924
925	return file;
926	}
927	}
928	EXPORT_SYMBOL_GPL(get_file_rcu);
929
930	/**
931	* get_file_active - try go get a reference to a file
932	* @f: the file to get a reference on
933	*
934	* In contast to get_file_rcu() the pointer itself isn't part of the
935	* reference counting.
936	*
937	* This function should rarely have to be used and only by users who
938	* understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
939	*
940	* Return: Returns @f with the reference count increased or NULL.
941	*/
942	struct file get_file_active(struct* file **f)
943	{
944	struct file __rcu *file;
945
946	rcu_read_lock();
947	file = __get_file_rcu(f);
948	rcu_read_unlock();
949	if (IS_ERR(ptr: file))
950	file = NULL;
951	return file;
952	}
953	EXPORT_SYMBOL_GPL(get_file_active);
954
955	static inline struct file __fget_files_rcu(struct* files_struct *files,
956	unsigned int fd, fmode_t mask)
957	{
958	for (;;) {
959	struct file *file;
960	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
961	struct file __rcu **fdentry;
962
963	if (unlikely(fd >= fdt->max_fds))
964	return NULL;
965
966	fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
967
968	/*
969	* Ok, we have a file pointer. However, because we do
970	* this all locklessly under RCU, we may be racing with
971	* that file being closed.
972	*
973	* Such a race can take two forms:
974	*
975	* (a) the file ref already went down to zero and the
976	* file hasn't been reused yet or the file count
977	* isn't zero but the file has already been reused.
978	*/
979	file = __get_file_rcu(f: fdentry);
980	if (unlikely(!file))
981	return NULL;
982
983	if (unlikely(IS_ERR(file)))
984	continue;
985
986	/*
987	* (b) the file table entry has changed under us.
988	* Note that we don't need to re-check the 'fdt->fd'
989	* pointer having changed, because it always goes
990	* hand-in-hand with 'fdt'.
991	*
992	* If so, we need to put our ref and try again.
993	*/
994	if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
995	fput(file);
996	continue;
997	}
998
999	/*
1000	* This isn't the file we're looking for or we're not
1001	* allowed to get a reference to it.
1002	*/
1003	if (unlikely(file->f_mode & mask)) {
1004	fput(file);
1005	return NULL;
1006	}
1007
1008	/*
1009	* Ok, we have a ref to the file, and checked that it
1010	* still exists.
1011	*/
1012	return file;
1013	}
1014	}
1015
1016	static struct file __fget_files(struct* files_struct files, unsigned* int fd,
1017	fmode_t mask)
1018	{
1019	struct file *file;
1020
1021	rcu_read_lock();
1022	file = __fget_files_rcu(files, fd, mask);
1023	rcu_read_unlock();
1024
1025	return file;
1026	}
1027
1028	static inline struct file __fget(unsigned* int fd, fmode_t mask)
1029	{
1030	return __fget_files(current->files, fd, mask);
1031	}
1032
1033	struct file fget(unsigned* int fd)
1034	{
1035	return __fget(fd, FMODE_PATH);
1036	}
1037	EXPORT_SYMBOL(fget);
1038
/*
 * Like fget(), but with an empty mode mask: no file is rejected because of
 * its f_mode.  Returns NULL if @fd is not open.
 */
struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);
1044
1045	struct file fget_task(struct* task_struct task, unsigned* int fd)
1046	{
1047	struct file *file = NULL;
1048
1049	task_lock(p: task);
1050	if (task->files)
1051	file = __fget_files(files: task->files, fd, mask: `0`);
1052	task_unlock(p: task);
1053
1054	return file;
1055	}
1056
1057	struct file lookup_fdget_rcu(unsigned* int fd)
1058	{
1059	return __fget_files_rcu(current->files, fd, mask: `0`);
1060
1061	}
1062	EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
1063
1064	struct file task_lookup_fdget_rcu(struct* task_struct task, unsigned* int fd)
1065	{
1066	/ Must be called with rcu_read_lock held /
1067	struct files_struct *files;
1068	struct file *file = NULL;
1069
1070	task_lock(p: task);
1071	files = task->files;
1072	if (files)
1073	file = __fget_files_rcu(files, fd, mask: `0`);
1074	task_unlock(p: task);
1075
1076	return file;
1077	}
1078
1079	struct file task_lookup_next_fdget_rcu(struct* task_struct task, unsigned* int *ret_fd)
1080	{
1081	/ Must be called with rcu_read_lock held /
1082	struct files_struct *files;
1083	unsigned int fd = *ret_fd;
1084	struct file *file = NULL;
1085
1086	task_lock(p: task);
1087	files = task->files;
1088	if (files) {
1089	for (; fd < files_fdtable(files)->max_fds; fd++) {
1090	file = __fget_files_rcu(files, fd, mask: `0`);
1091	if (file)
1092	break;
1093	}
1094	}
1095	task_unlock(p: task);
1096	*ret_fd = fd;
1097	return file;
1098	}
1099	EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
1100
1101	/*
1102	* Lightweight file lookup - no refcnt increment if fd table isn't shared.
1103	*
1104	* You can use this instead of fget if you satisfy all of the following
1105	* conditions:
1106	* 1) You must call fput_light before exiting the syscall and returning control
1107	* to userspace (i.e. you cannot remember the returned struct file * after
1108	* returning to userspace).
1109	* 2) You must not call filp_close on the returned struct file * in between
1110	* calls to fget_light and fput_light.
1111	* 3) You must not clone the current task in between the calls to fget_light
1112	* and fput_light.
1113	*
1114	* The fput_needed flag returned by fget_light should be passed to the
1115	* corresponding fput_light.
1116	*/
1117	static unsigned long __fget_light(unsigned int fd, fmode_t mask)
1118	{
1119	struct files_struct *files = current->files;
1120	struct file *file;
1121
1122	/*
1123	* If another thread is concurrently calling close_fd() followed
1124	* by put_files_struct(), we must not observe the old table
1125	* entry combined with the new refcount - otherwise we could
1126	* return a file that is concurrently being freed.
1127	*
1128	* atomic_read_acquire() pairs with atomic_dec_and_test() in
1129	* put_files_struct().
1130	*/
1131	if (atomic_read_acquire(v: &files->count) == `1`) {
1132	file = files_lookup_fd_raw(files, fd);
1133	if (!file \|\| unlikely(file->f_mode & mask))
1134	return `0`;
1135	return (unsigned long)file;
1136	} else {
1137	file = __fget(fd, mask);
1138	if (!file)
1139	return `0`;
1140	return FDPUT_FPUT \| (unsigned long)file;
1141	}
1142	}
1143	unsigned long __fdget(unsigned int fd)
1144	{
1145	return __fget_light(fd, FMODE_PATH);
1146	}
1147	EXPORT_SYMBOL(__fdget);
1148
/*
 * __fdget_raw - lightweight lookup of @fd with no f_mode filtering.
 *
 * Same as __fdget() except that the mask handed to __fget_light() is empty.
 */
unsigned long __fdget_raw(unsigned int fd)
{
	unsigned long tagged = __fget_light(fd, 0);

	return tagged;
}
1153
1154	/*
1155	* Try to avoid f_pos locking. We only need it if the
1156	* file is marked for FMODE_ATOMIC_POS, and it can be
1157	* accessed multiple ways.
1158	*
1159	* Always do it for directories, because pidfd_getfd()
1160	* can make a file accessible even if it otherwise would
1161	* not be, and for directories this is a correctness
1162	* issue, not a "POSIX requirement".
1163	*/
1164	static inline bool file_needs_f_pos_lock(struct file *file)
1165	{
1166	return (file->f_mode & FMODE_ATOMIC_POS) &&
1167	(file_count(file) > `1` \|\| file->f_op->iterate_shared);
1168	}
1169
1170	unsigned long __fdget_pos(unsigned int fd)
1171	{
1172	unsigned long v = __fdget(fd);
1173	struct file file = (struct* file *)(v & ~`3`);
1174
1175	if (file && file_needs_f_pos_lock(file)) {
1176	v \|= FDPUT_POS_UNLOCK;
1177	mutex_lock(&file->f_pos_lock);
1178	}
1179	return v;
1180	}
1181
1182	void __f_unlock_pos(struct file *f)
1183	{
1184	mutex_unlock(lock: &f->f_pos_lock);
1185	}
1186
/*
 * Note on f_pos locking (see __fdget_pos() and file_needs_f_pos_lock()):
 * we only lock f_pos if we have threads or if the file might be shared
 * with another process. In both cases we'll have an elevated file count
 * (done either by fdget() or by fork()).
 */
1192
1193	void set_close_on_exec(unsigned int fd, int flag)
1194	{
1195	struct files_struct *files = current->files;
1196	struct fdtable *fdt;
1197	spin_lock(lock: &files->file_lock);
1198	fdt = files_fdtable(files);
1199	if (flag)
1200	__set_close_on_exec(fd, fdt);
1201	else
1202	__clear_close_on_exec(fd, fdt);
1203	spin_unlock(lock: &files->file_lock);
1204	}
1205
1206	bool get_close_on_exec(unsigned int fd)
1207	{
1208	struct files_struct *files = current->files;
1209	struct fdtable *fdt;
1210	bool res;
1211	rcu_read_lock();
1212	fdt = files_fdtable(files);
1213	res = close_on_exec(fd, fdt);
1214	rcu_read_unlock();
1215	return res;
1216	}
1217
1218	static int do_dup2(struct files_struct *files,
1219	struct file file, unsigned* fd, unsigned flags)
1220	__releases(&files->file_lock)
1221	{
1222	struct file *tofree;
1223	struct fdtable *fdt;
1224
1225	/*
1226	* We need to detect attempts to do dup2() over allocated but still
1227	* not finished descriptor. NB: OpenBSD avoids that at the price of
1228	* extra work in their equivalent of fget() - they insert struct
1229	* file immediately after grabbing descriptor, mark it larval if
1230	* more work (e.g. actual opening) is needed and make sure that
1231	* fget() treats larval files as absent. Potentially interesting,
1232	* but while extra work in fget() is trivial, locking implications
1233	* and amount of surgery on open()-related paths in VFS are not.
1234	* FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
1235	* deadlocks in rather amusing ways, AFAICS. All of that is out of
1236	* scope of POSIX or SUS, since neither considers shared descriptor
1237	* tables and this condition does not arise without those.
1238	*/
1239	fdt = files_fdtable(files);
1240	tofree = fdt->fd[fd];
1241	if (!tofree && fd_is_open(fd, fdt))
1242	goto Ebusy;
1243	get_file(f: file);
1244	rcu_assign_pointer(fdt->fd[fd], file);
1245	__set_open_fd(fd, fdt);
1246	if (flags & O_CLOEXEC)
1247	__set_close_on_exec(fd, fdt);
1248	else
1249	__clear_close_on_exec(fd, fdt);
1250	spin_unlock(lock: &files->file_lock);
1251
1252	if (tofree)
1253	filp_close(tofree, id: files);
1254
1255	return fd;
1256
1257	Ebusy:
1258	spin_unlock(lock: &files->file_lock);
1259	return -EBUSY;
1260	}
1261
1262	int replace_fd(unsigned fd, struct file file, unsigned* flags)
1263	{
1264	int err;
1265	struct files_struct *files = current->files;
1266
1267	if (!file)
1268	return close_fd(fd);
1269
1270	if (fd >= rlimit(RLIMIT_NOFILE))
1271	return -EBADF;
1272
1273	spin_lock(lock: &files->file_lock);
1274	err = expand_files(files, nr: fd);
1275	if (unlikely(err < `0`))
1276	goto out_unlock;
1277	return do_dup2(files, file, fd, flags);
1278
1279	out_unlock:
1280	spin_unlock(lock: &files->file_lock);
1281	return err;
1282	}
1283
1284	/**
1285	* __receive_fd() - Install received file into file descriptor table
1286	* @file: struct file that was received from another process
1287	* @ufd: __user pointer to write new fd number to
1288	* @o_flags: the O_* flags to apply to the new fd entry
1289	*
1290	* Installs a received file into the file descriptor table, with appropriate
1291	* checks and count updates. Optionally writes the fd number to userspace, if
1292	* @ufd is non-NULL.
1293	*
1294	* This helper handles its own reference counting of the incoming
1295	* struct file.
1296	*
1297	* Returns newly install fd or -ve on error.
1298	*/
1299	int __receive_fd(struct file file, int* __user ufd, unsigned* int o_flags)
1300	{
1301	int new_fd;
1302	int error;
1303
1304	error = security_file_receive(file);
1305	if (error)
1306	return error;
1307
1308	new_fd = get_unused_fd_flags(o_flags);
1309	if (new_fd < `0`)
1310	return new_fd;
1311
1312	if (ufd) {
1313	error = put_user(new_fd, ufd);
1314	if (error) {
1315	put_unused_fd(new_fd);
1316	return error;
1317	}
1318	}
1319
1320	fd_install(new_fd, get_file(f: file));
1321	__receive_sock(file);
1322	return new_fd;
1323	}
1324
/*
 * receive_fd_replace - install a received file at a caller-chosen descriptor.
 * @new_fd: descriptor number to install @file at
 * @file: struct file that was received
 * @o_flags: flags forwarded to replace_fd()
 *
 * Runs the security_file_receive() check, installs @file at @new_fd with
 * replace_fd() and then passes the file to __receive_sock().
 *
 * Returns @new_fd on success, or a non-zero value from
 * security_file_receive() / a negative value from replace_fd() on failure.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	/*
	 * replace_fd() hands back do_dup2()'s result, which is the descriptor
	 * number on success. Only negative values are failures; testing for
	 * "non-zero" would bail out early for every successful install at a
	 * descriptor above 0 and skip __receive_sock().
	 */
	error = replace_fd(new_fd, file, o_flags);
	if (error < 0)
		return error;

	__receive_sock(file);
	return new_fd;
}
1338
1339	int receive_fd(struct file file, unsigned* int o_flags)
1340	{
1341	return __receive_fd(file, NULL, o_flags);
1342	}
1343	EXPORT_SYMBOL_GPL(receive_fd);
1344
1345	static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
1346	{
1347	int err = -EBADF;
1348	struct file *file;
1349	struct files_struct *files = current->files;
1350
1351	if ((flags & ~O_CLOEXEC) != `0`)
1352	return -EINVAL;
1353
1354	if (unlikely(oldfd == newfd))
1355	return -EINVAL;
1356
1357	if (newfd >= rlimit(RLIMIT_NOFILE))
1358	return -EBADF;
1359
1360	spin_lock(lock: &files->file_lock);
1361	err = expand_files(files, nr: newfd);
1362	file = files_lookup_fd_locked(files, fd: oldfd);
1363	if (unlikely(!file))
1364	goto Ebadf;
1365	if (unlikely(err < `0`)) {
1366	if (err == -EMFILE)
1367	goto Ebadf;
1368	goto out_unlock;
1369	}
1370	return do_dup2(files, file, fd: newfd, flags);
1371
1372	Ebadf:
1373	err = -EBADF;
1374	out_unlock:
1375	spin_unlock(lock: &files->file_lock);
1376	return err;
1377	}
1378
1379	SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
1380	{
1381	return ksys_dup3(oldfd, newfd, flags);
1382	}
1383
1384	SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
1385	{
1386	if (unlikely(newfd == oldfd)) { / corner case /
1387	struct files_struct *files = current->files;
1388	struct file *f;
1389	int retval = oldfd;
1390
1391	rcu_read_lock();
1392	f = __fget_files_rcu(files, fd: oldfd, mask: `0`);
1393	if (!f)
1394	retval = -EBADF;
1395	rcu_read_unlock();
1396	if (f)
1397	fput(f);
1398	return retval;
1399	}
1400	return ksys_dup3(oldfd, newfd, flags: `0`);
1401	}
1402
1403	SYSCALL_DEFINE1(dup, unsigned int, fildes)
1404	{
1405	int ret = -EBADF;
1406	struct file *file = fget_raw(fildes);
1407
1408	if (file) {
1409	ret = get_unused_fd_flags(`0`);
1410	if (ret >= `0`)
1411	fd_install(ret, file);
1412	else
1413	fput(file);
1414	}
1415	return ret;
1416	}
1417
1418	int f_dupfd(unsigned int from, struct file file, unsigned* flags)
1419	{
1420	unsigned long nofile = rlimit(RLIMIT_NOFILE);
1421	int err;
1422	if (from >= nofile)
1423	return -EINVAL;
1424	err = alloc_fd(start: from, end: nofile, flags);
1425	if (err >= `0`) {
1426	get_file(f: file);
1427	fd_install(err, file);
1428	}
1429	return err;
1430	}
1431
1432	int iterate_fd(struct files_struct files, unsigned* n,
1433	int (f)(const* void , struct* file , unsigned*),
1434	const void *p)
1435	{
1436	struct fdtable *fdt;
1437	int res = `0`;
1438	if (!files)
1439	return `0`;
1440	spin_lock(lock: &files->file_lock);
1441	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
1442	struct file *file;
1443	file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
1444	if (!file)
1445	continue;
1446	res = f(p, file, n);
1447	if (res)
1448	break;
1449	}
1450	spin_unlock(lock: &files->file_lock);
1451	return res;
1452	}
1453	EXPORT_SYMBOL(iterate_fd);
1454

source code of linux/fs/file.c