1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * linux/fs/file_table.c |
4 | * |
5 | * Copyright (C) 1991, 1992 Linus Torvalds |
6 | * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) |
7 | */ |
8 | |
9 | #include <linux/string.h> |
10 | #include <linux/slab.h> |
11 | #include <linux/file.h> |
12 | #include <linux/fdtable.h> |
13 | #include <linux/init.h> |
14 | #include <linux/module.h> |
15 | #include <linux/fs.h> |
16 | #include <linux/filelock.h> |
17 | #include <linux/security.h> |
18 | #include <linux/cred.h> |
19 | #include <linux/eventpoll.h> |
20 | #include <linux/rcupdate.h> |
21 | #include <linux/mount.h> |
22 | #include <linux/capability.h> |
23 | #include <linux/cdev.h> |
24 | #include <linux/fsnotify.h> |
25 | #include <linux/sysctl.h> |
26 | #include <linux/percpu_counter.h> |
27 | #include <linux/percpu.h> |
28 | #include <linux/task_work.h> |
29 | #include <linux/ima.h> |
30 | #include <linux/swap.h> |
31 | #include <linux/kmemleak.h> |
32 | |
33 | #include <linux/atomic.h> |
34 | |
35 | #include "internal.h" |
36 | |
37 | /* sysctl tunables... */ |
38 | static struct files_stat_struct files_stat = { |
39 | .max_files = NR_FILE |
40 | }; |
41 | |
42 | /* SLAB cache for file structures */ |
43 | static struct kmem_cache *filp_cachep __ro_after_init; |
44 | |
45 | static struct percpu_counter nr_files __cacheline_aligned_in_smp; |
46 | |
47 | /* Container for backing file with optional user path */ |
48 | struct backing_file { |
49 | struct file file; |
50 | struct path user_path; |
51 | }; |
52 | |
53 | static inline struct backing_file *backing_file(struct file *f) |
54 | { |
55 | return container_of(f, struct backing_file, file); |
56 | } |
57 | |
58 | struct path *backing_file_user_path(struct file *f) |
59 | { |
60 | return &backing_file(f)->user_path; |
61 | } |
62 | EXPORT_SYMBOL_GPL(backing_file_user_path); |
63 | |
64 | static inline void file_free(struct file *f) |
65 | { |
66 | security_file_free(file: f); |
67 | if (likely(!(f->f_mode & FMODE_NOACCOUNT))) |
68 | percpu_counter_dec(fbc: &nr_files); |
69 | put_cred(cred: f->f_cred); |
70 | if (unlikely(f->f_mode & FMODE_BACKING)) { |
71 | path_put(backing_file_user_path(f)); |
72 | kfree(objp: backing_file(f)); |
73 | } else { |
74 | kmem_cache_free(s: filp_cachep, objp: f); |
75 | } |
76 | } |
77 | |
78 | void release_empty_file(struct file *f) |
79 | { |
80 | WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); |
81 | if (atomic_long_dec_and_test(v: &f->f_count)) { |
82 | security_file_free(file: f); |
83 | put_cred(cred: f->f_cred); |
84 | if (likely(!(f->f_mode & FMODE_NOACCOUNT))) |
85 | percpu_counter_dec(fbc: &nr_files); |
86 | kmem_cache_free(s: filp_cachep, objp: f); |
87 | } |
88 | } |
89 | |
90 | /* |
91 | * Return the total number of open files in the system |
92 | */ |
93 | static long get_nr_files(void) |
94 | { |
95 | return percpu_counter_read_positive(fbc: &nr_files); |
96 | } |
97 | |
98 | /* |
99 | * Return the maximum number of open files in the system |
100 | */ |
101 | unsigned long get_max_files(void) |
102 | { |
103 | return files_stat.max_files; |
104 | } |
105 | EXPORT_SYMBOL_GPL(get_max_files); |
106 | |
107 | #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) |
108 | |
109 | /* |
110 | * Handle nr_files sysctl |
111 | */ |
112 | static int proc_nr_files(struct ctl_table *table, int write, void *buffer, |
113 | size_t *lenp, loff_t *ppos) |
114 | { |
115 | files_stat.nr_files = get_nr_files(); |
116 | return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
117 | } |
118 | |
119 | static struct ctl_table fs_stat_sysctls[] = { |
120 | { |
121 | .procname = "file-nr" , |
122 | .data = &files_stat, |
123 | .maxlen = sizeof(files_stat), |
124 | .mode = 0444, |
125 | .proc_handler = proc_nr_files, |
126 | }, |
127 | { |
128 | .procname = "file-max" , |
129 | .data = &files_stat.max_files, |
130 | .maxlen = sizeof(files_stat.max_files), |
131 | .mode = 0644, |
132 | .proc_handler = proc_doulongvec_minmax, |
133 | .extra1 = SYSCTL_LONG_ZERO, |
134 | .extra2 = SYSCTL_LONG_MAX, |
135 | }, |
136 | { |
137 | .procname = "nr_open" , |
138 | .data = &sysctl_nr_open, |
139 | .maxlen = sizeof(unsigned int), |
140 | .mode = 0644, |
141 | .proc_handler = proc_dointvec_minmax, |
142 | .extra1 = &sysctl_nr_open_min, |
143 | .extra2 = &sysctl_nr_open_max, |
144 | }, |
145 | { } |
146 | }; |
147 | |
148 | static int __init init_fs_stat_sysctls(void) |
149 | { |
150 | register_sysctl_init("fs" , fs_stat_sysctls); |
151 | if (IS_ENABLED(CONFIG_BINFMT_MISC)) { |
152 | struct ctl_table_header *hdr; |
153 | hdr = register_sysctl_mount_point(path: "fs/binfmt_misc" ); |
154 | kmemleak_not_leak(ptr: hdr); |
155 | } |
156 | return 0; |
157 | } |
158 | fs_initcall(init_fs_stat_sysctls); |
159 | #endif |
160 | |
161 | static int init_file(struct file *f, int flags, const struct cred *cred) |
162 | { |
163 | int error; |
164 | |
165 | f->f_cred = get_cred(cred); |
166 | error = security_file_alloc(file: f); |
167 | if (unlikely(error)) { |
168 | put_cred(cred: f->f_cred); |
169 | return error; |
170 | } |
171 | |
172 | rwlock_init(&f->f_owner.lock); |
173 | spin_lock_init(&f->f_lock); |
174 | mutex_init(&f->f_pos_lock); |
175 | f->f_flags = flags; |
176 | f->f_mode = OPEN_FMODE(flags); |
177 | /* f->f_version: 0 */ |
178 | |
179 | /* |
180 | * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While |
181 | * fget-rcu pattern users need to be able to handle spurious |
182 | * refcount bumps we should reinitialize the reused file first. |
183 | */ |
184 | atomic_long_set(v: &f->f_count, i: 1); |
185 | return 0; |
186 | } |
187 | |
188 | /* Find an unused file structure and return a pointer to it. |
189 | * Returns an error pointer if some error happend e.g. we over file |
190 | * structures limit, run out of memory or operation is not permitted. |
191 | * |
192 | * Be very careful using this. You are responsible for |
193 | * getting write access to any mount that you might assign |
194 | * to this filp, if it is opened for write. If this is not |
195 | * done, you will imbalance int the mount's writer count |
196 | * and a warning at __fput() time. |
197 | */ |
198 | struct file *alloc_empty_file(int flags, const struct cred *cred) |
199 | { |
200 | static long old_max; |
201 | struct file *f; |
202 | int error; |
203 | |
204 | /* |
205 | * Privileged users can go above max_files |
206 | */ |
207 | if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { |
208 | /* |
209 | * percpu_counters are inaccurate. Do an expensive check before |
210 | * we go and fail. |
211 | */ |
212 | if (percpu_counter_sum_positive(fbc: &nr_files) >= files_stat.max_files) |
213 | goto over; |
214 | } |
215 | |
216 | f = kmem_cache_zalloc(k: filp_cachep, GFP_KERNEL); |
217 | if (unlikely(!f)) |
218 | return ERR_PTR(error: -ENOMEM); |
219 | |
220 | error = init_file(f, flags, cred); |
221 | if (unlikely(error)) { |
222 | kmem_cache_free(s: filp_cachep, objp: f); |
223 | return ERR_PTR(error); |
224 | } |
225 | |
226 | percpu_counter_inc(fbc: &nr_files); |
227 | |
228 | return f; |
229 | |
230 | over: |
231 | /* Ran out of filps - report that */ |
232 | if (get_nr_files() > old_max) { |
233 | pr_info("VFS: file-max limit %lu reached\n" , get_max_files()); |
234 | old_max = get_nr_files(); |
235 | } |
236 | return ERR_PTR(error: -ENFILE); |
237 | } |
238 | |
239 | /* |
240 | * Variant of alloc_empty_file() that doesn't check and modify nr_files. |
241 | * |
242 | * This is only for kernel internal use, and the allocate file must not be |
243 | * installed into file tables or such. |
244 | */ |
245 | struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) |
246 | { |
247 | struct file *f; |
248 | int error; |
249 | |
250 | f = kmem_cache_zalloc(k: filp_cachep, GFP_KERNEL); |
251 | if (unlikely(!f)) |
252 | return ERR_PTR(error: -ENOMEM); |
253 | |
254 | error = init_file(f, flags, cred); |
255 | if (unlikely(error)) { |
256 | kmem_cache_free(s: filp_cachep, objp: f); |
257 | return ERR_PTR(error); |
258 | } |
259 | |
260 | f->f_mode |= FMODE_NOACCOUNT; |
261 | |
262 | return f; |
263 | } |
264 | |
265 | /* |
266 | * Variant of alloc_empty_file() that allocates a backing_file container |
267 | * and doesn't check and modify nr_files. |
268 | * |
269 | * This is only for kernel internal use, and the allocate file must not be |
270 | * installed into file tables or such. |
271 | */ |
272 | struct file *alloc_empty_backing_file(int flags, const struct cred *cred) |
273 | { |
274 | struct backing_file *ff; |
275 | int error; |
276 | |
277 | ff = kzalloc(size: sizeof(struct backing_file), GFP_KERNEL); |
278 | if (unlikely(!ff)) |
279 | return ERR_PTR(error: -ENOMEM); |
280 | |
281 | error = init_file(f: &ff->file, flags, cred); |
282 | if (unlikely(error)) { |
283 | kfree(objp: ff); |
284 | return ERR_PTR(error); |
285 | } |
286 | |
287 | ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; |
288 | return &ff->file; |
289 | } |
290 | |
291 | /** |
292 | * alloc_file - allocate and initialize a 'struct file' |
293 | * |
294 | * @path: the (dentry, vfsmount) pair for the new file |
295 | * @flags: O_... flags with which the new file will be opened |
296 | * @fop: the 'struct file_operations' for the new file |
297 | */ |
298 | static struct file *alloc_file(const struct path *path, int flags, |
299 | const struct file_operations *fop) |
300 | { |
301 | struct file *file; |
302 | |
303 | file = alloc_empty_file(flags, current_cred()); |
304 | if (IS_ERR(ptr: file)) |
305 | return file; |
306 | |
307 | file->f_path = *path; |
308 | file->f_inode = path->dentry->d_inode; |
309 | file->f_mapping = path->dentry->d_inode->i_mapping; |
310 | file->f_wb_err = filemap_sample_wb_err(mapping: file->f_mapping); |
311 | file->f_sb_err = file_sample_sb_err(file); |
312 | if (fop->llseek) |
313 | file->f_mode |= FMODE_LSEEK; |
314 | if ((file->f_mode & FMODE_READ) && |
315 | likely(fop->read || fop->read_iter)) |
316 | file->f_mode |= FMODE_CAN_READ; |
317 | if ((file->f_mode & FMODE_WRITE) && |
318 | likely(fop->write || fop->write_iter)) |
319 | file->f_mode |= FMODE_CAN_WRITE; |
320 | file->f_iocb_flags = iocb_flags(file); |
321 | file->f_mode |= FMODE_OPENED; |
322 | file->f_op = fop; |
323 | if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) |
324 | i_readcount_inc(inode: path->dentry->d_inode); |
325 | return file; |
326 | } |
327 | |
328 | struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, |
329 | const char *name, int flags, |
330 | const struct file_operations *fops) |
331 | { |
332 | static const struct dentry_operations anon_ops = { |
333 | .d_dname = simple_dname |
334 | }; |
335 | struct qstr this = QSTR_INIT(name, strlen(name)); |
336 | struct path path; |
337 | struct file *file; |
338 | |
339 | path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this); |
340 | if (!path.dentry) |
341 | return ERR_PTR(error: -ENOMEM); |
342 | if (!mnt->mnt_sb->s_d_op) |
343 | d_set_d_op(dentry: path.dentry, op: &anon_ops); |
344 | path.mnt = mntget(mnt); |
345 | d_instantiate(path.dentry, inode); |
346 | file = alloc_file(path: &path, flags, fop: fops); |
347 | if (IS_ERR(ptr: file)) { |
348 | ihold(inode); |
349 | path_put(&path); |
350 | } |
351 | return file; |
352 | } |
353 | EXPORT_SYMBOL(alloc_file_pseudo); |
354 | |
355 | struct file *alloc_file_clone(struct file *base, int flags, |
356 | const struct file_operations *fops) |
357 | { |
358 | struct file *f = alloc_file(path: &base->f_path, flags, fop: fops); |
359 | if (!IS_ERR(ptr: f)) { |
360 | path_get(&f->f_path); |
361 | f->f_mapping = base->f_mapping; |
362 | } |
363 | return f; |
364 | } |
365 | |
366 | /* the real guts of fput() - releasing the last reference to file |
367 | */ |
368 | static void __fput(struct file *file) |
369 | { |
370 | struct dentry *dentry = file->f_path.dentry; |
371 | struct vfsmount *mnt = file->f_path.mnt; |
372 | struct inode *inode = file->f_inode; |
373 | fmode_t mode = file->f_mode; |
374 | |
375 | if (unlikely(!(file->f_mode & FMODE_OPENED))) |
376 | goto out; |
377 | |
378 | might_sleep(); |
379 | |
380 | fsnotify_close(file); |
381 | /* |
382 | * The function eventpoll_release() should be the first called |
383 | * in the file cleanup chain. |
384 | */ |
385 | eventpoll_release(file); |
386 | locks_remove_file(file); |
387 | |
388 | ima_file_free(file); |
389 | if (unlikely(file->f_flags & FASYNC)) { |
390 | if (file->f_op->fasync) |
391 | file->f_op->fasync(-1, file, 0); |
392 | } |
393 | if (file->f_op->release) |
394 | file->f_op->release(inode, file); |
395 | if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && |
396 | !(mode & FMODE_PATH))) { |
397 | cdev_put(p: inode->i_cdev); |
398 | } |
399 | fops_put(file->f_op); |
400 | put_pid(pid: file->f_owner.pid); |
401 | put_file_access(file); |
402 | dput(dentry); |
403 | if (unlikely(mode & FMODE_NEED_UNMOUNT)) |
404 | dissolve_on_fput(mnt); |
405 | mntput(mnt); |
406 | out: |
407 | file_free(f: file); |
408 | } |
409 | |
410 | static LLIST_HEAD(delayed_fput_list); |
411 | static void delayed_fput(struct work_struct *unused) |
412 | { |
413 | struct llist_node *node = llist_del_all(head: &delayed_fput_list); |
414 | struct file *f, *t; |
415 | |
416 | llist_for_each_entry_safe(f, t, node, f_llist) |
417 | __fput(file: f); |
418 | } |
419 | |
420 | static void ____fput(struct callback_head *work) |
421 | { |
422 | __fput(container_of(work, struct file, f_rcuhead)); |
423 | } |
424 | |
425 | /* |
426 | * If kernel thread really needs to have the final fput() it has done |
427 | * to complete, call this. The only user right now is the boot - we |
428 | * *do* need to make sure our writes to binaries on initramfs has |
429 | * not left us with opened struct file waiting for __fput() - execve() |
430 | * won't work without that. Please, don't add more callers without |
431 | * very good reasons; in particular, never call that with locks |
432 | * held and never call that from a thread that might need to do |
433 | * some work on any kind of umount. |
434 | */ |
435 | void flush_delayed_fput(void) |
436 | { |
437 | delayed_fput(NULL); |
438 | } |
439 | EXPORT_SYMBOL_GPL(flush_delayed_fput); |
440 | |
441 | static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); |
442 | |
443 | void fput(struct file *file) |
444 | { |
445 | if (atomic_long_dec_and_test(v: &file->f_count)) { |
446 | struct task_struct *task = current; |
447 | |
448 | if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { |
449 | init_task_work(twork: &file->f_rcuhead, func: ____fput); |
450 | if (!task_work_add(task, twork: &file->f_rcuhead, mode: TWA_RESUME)) |
451 | return; |
452 | /* |
453 | * After this task has run exit_task_work(), |
454 | * task_work_add() will fail. Fall through to delayed |
455 | * fput to avoid leaking *file. |
456 | */ |
457 | } |
458 | |
459 | if (llist_add(new: &file->f_llist, head: &delayed_fput_list)) |
460 | schedule_delayed_work(dwork: &delayed_fput_work, delay: 1); |
461 | } |
462 | } |
463 | |
464 | /* |
465 | * synchronous analog of fput(); for kernel threads that might be needed |
466 | * in some umount() (and thus can't use flush_delayed_fput() without |
467 | * risking deadlocks), need to wait for completion of __fput() and know |
468 | * for this specific struct file it won't involve anything that would |
469 | * need them. Use only if you really need it - at the very least, |
470 | * don't blindly convert fput() by kernel thread to that. |
471 | */ |
472 | void __fput_sync(struct file *file) |
473 | { |
474 | if (atomic_long_dec_and_test(v: &file->f_count)) |
475 | __fput(file); |
476 | } |
477 | |
478 | EXPORT_SYMBOL(fput); |
479 | EXPORT_SYMBOL(__fput_sync); |
480 | |
481 | void __init files_init(void) |
482 | { |
483 | filp_cachep = kmem_cache_create(name: "filp" , size: sizeof(struct file), align: 0, |
484 | SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | |
485 | SLAB_PANIC | SLAB_ACCOUNT, NULL); |
486 | percpu_counter_init(&nr_files, 0, GFP_KERNEL); |
487 | } |
488 | |
489 | /* |
490 | * One file with associated inode and dcache is very roughly 1K. Per default |
491 | * do not use more than 10% of our memory for files. |
492 | */ |
493 | void __init files_maxfiles_init(void) |
494 | { |
495 | unsigned long n; |
496 | unsigned long nr_pages = totalram_pages(); |
497 | unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; |
498 | |
499 | memreserve = min(memreserve, nr_pages - 1); |
500 | n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; |
501 | |
502 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); |
503 | } |
504 | |