open.c source code [linux/fs/open.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/open.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	#include <linux/string.h>
9	#include <linux/mm.h>
10	#include <linux/file.h>
11	#include <linux/fdtable.h>
12	#include <linux/fsnotify.h>
13	#include <linux/module.h>
14	#include <linux/tty.h>
15	#include <linux/namei.h>
16	#include <linux/backing-dev.h>
17	#include <linux/capability.h>
18	#include <linux/securebits.h>
19	#include <linux/security.h>
20	#include <linux/mount.h>
21	#include <linux/fcntl.h>
22	#include <linux/slab.h>
23	#include <linux/uaccess.h>
24	#include <linux/fs.h>
25	#include <linux/personality.h>
26	#include <linux/pagemap.h>
27	#include <linux/syscalls.h>
28	#include <linux/rcupdate.h>
29	#include <linux/audit.h>
30	#include <linux/falloc.h>
31	#include <linux/fs_struct.h>
32	#include <linux/dnotify.h>
33	#include <linux/compat.h>
34	#include <linux/mnt_idmapping.h>
35	#include <linux/filelock.h>
36
37	#include "internal.h"
38
39	int do_truncate(struct mnt_idmap idmap, struct* dentry *dentry,
40	loff_t length, unsigned int time_attrs, struct file *filp)
41	{
42	int ret;
43	struct iattr newattrs;
44
45	/ Not pretty: "inode->i_size" shouldn't really be signed. But it is. /
46	if (length < `0`)
47	return -EINVAL;
48
49	newattrs.ia_size = length;
50	newattrs.ia_valid = ATTR_SIZE \| time_attrs;
51	if (filp) {
52	newattrs.ia_file = filp;
53	newattrs.ia_valid \|= ATTR_FILE;
54	}
55
56	/ Remove suid, sgid, and file capabilities on truncate too /
57	ret = dentry_needs_remove_privs(idmap, dentry);
58	if (ret < `0`)
59	return ret;
60	if (ret)
61	newattrs.ia_valid \|= ret \| ATTR_FORCE;
62
63	inode_lock(inode: dentry->d_inode);
64	/ Note any delegations or leases have already been broken: /
65	ret = notify_change(idmap, dentry, &newattrs, NULL);
66	inode_unlock(inode: dentry->d_inode);
67	return ret;
68	}
69
70	long vfs_truncate(const struct path *path, loff_t length)
71	{
72	struct mnt_idmap *idmap;
73	struct inode *inode;
74	long error;
75
76	inode = path->dentry->d_inode;
77
78	/ For directories it's -EISDIR, for other non-regulars - -EINVAL /
79	if (S_ISDIR(inode->i_mode))
80	return -EISDIR;
81	if (!S_ISREG(inode->i_mode))
82	return -EINVAL;
83
84	error = mnt_want_write(mnt: path->mnt);
85	if (error)
86	goto out;
87
88	idmap = mnt_idmap(mnt: path->mnt);
89	error = inode_permission(idmap, inode, MAY_WRITE);
90	if (error)
91	goto mnt_drop_write_and_out;
92
93	error = -EPERM;
94	if (IS_APPEND(inode))
95	goto mnt_drop_write_and_out;
96
97	error = get_write_access(inode);
98	if (error)
99	goto mnt_drop_write_and_out;
100
101	/*
102	* Make sure that there are no leases. get_write_access() protects
103	* against the truncate racing with a lease-granting setlease().
104	*/
105	error = break_lease(inode, O_WRONLY);
106	if (error)
107	goto put_write_and_out;
108
109	error = security_path_truncate(path);
110	if (!error)
111	error = do_truncate(idmap, dentry: path->dentry, length, time_attrs: `0`, NULL);
112
113	put_write_and_out:
114	put_write_access(inode);
115	mnt_drop_write_and_out:
116	mnt_drop_write(mnt: path->mnt);
117	out:
118	return error;
119	}
120	EXPORT_SYMBOL_GPL(vfs_truncate);
121
122	long do_sys_truncate(const char __user *pathname, loff_t length)
123	{
124	unsigned int lookup_flags = LOOKUP_FOLLOW;
125	struct path path;
126	int error;
127
128	if (length < `0`) / sorry, but loff_t says... /
129	return -EINVAL;
130
131	retry:
132	error = user_path_at(AT_FDCWD, name: pathname, flags: lookup_flags, path: &path);
133	if (!error) {
134	error = vfs_truncate(&path, length);
135	path_put(&path);
136	}
137	if (retry_estale(error, flags: lookup_flags)) {
138	lookup_flags \|= LOOKUP_REVAL;
139	goto retry;
140	}
141	return error;
142	}
143
144	SYSCALL_DEFINE2(truncate, const char __user , path, long*, length)
145	{
146	return do_sys_truncate(pathname: path, length);
147	}
148
#ifdef CONFIG_COMPAT
/* Compat truncate entry point; the length arrives as a compat_off_t. */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
	return do_sys_truncate(path, length);
}
#endif
155
156	long do_ftruncate(struct file file, loff_t length, int* small)
157	{
158	struct inode *inode;
159	struct dentry *dentry;
160	int error;
161
162	/ explicitly opened as large or we are on 64-bit box /
163	if (file->f_flags & O_LARGEFILE)
164	small = `0`;
165
166	dentry = file->f_path.dentry;
167	inode = dentry->d_inode;
168	if (!S_ISREG(inode->i_mode) \|\| !(file->f_mode & FMODE_WRITE))
169	return -EINVAL;
170
171	/ Cannot ftruncate over 2^31 bytes without large file support /
172	if (small && length > MAX_NON_LFS)
173	return -EINVAL;
174
175	/ Check IS_APPEND on real upper inode /
176	if (IS_APPEND(file_inode(file)))
177	return -EPERM;
178	sb_start_write(sb: inode->i_sb);
179	error = security_file_truncate(file);
180	if (!error)
181	error = do_truncate(idmap: file_mnt_idmap(file), dentry, length,
182	ATTR_MTIME \| ATTR_CTIME, filp: file);
183	sb_end_write(sb: inode->i_sb);
184
185	return error;
186	}
187
188	long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
189	{
190	struct fd f;
191	int error;
192
193	if (length < `0`)
194	return -EINVAL;
195	f = fdget(fd);
196	if (!f.file)
197	return -EBADF;
198
199	error = do_ftruncate(file: f.file, length, small);
200
201	fdput(fd: f);
202	return error;
203	}
204
205	SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
206	{
207	return do_sys_ftruncate(fd, length, small: `1`);
208	}
209
#ifdef CONFIG_COMPAT
/* Compat ftruncate entry point; passes small=1 to do_sys_ftruncate(). */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
	return do_sys_ftruncate(fd, length, 1);
}
#endif
216
/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
/* truncate64: same as truncate but with a full loff_t length. */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
	return do_sys_truncate(path, length);
}

/* ftruncate64: loff_t length, and small=0 so MAX_NON_LFS is not enforced. */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
	return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */
229
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
/*
 * Compat truncate64: the 64-bit length arrives split into two arguments and
 * is reassembled with compat_arg_u64_glue() before being handed on.
 */
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
		       compat_arg_u64_dual(length))
{
	/* NOTE(review): assumes ksys_truncate() takes a loff_t length. */
	loff_t new_size = compat_arg_u64_glue(length);

	return ksys_truncate(pathname, new_size);
}
#endif
237
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
/*
 * Compat ftruncate64: the 64-bit length arrives split into two arguments and
 * is reassembled with compat_arg_u64_glue() before being handed on.
 */
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
		       compat_arg_u64_dual(length))
{
	/* NOTE(review): assumes ksys_ftruncate() takes a loff_t length. */
	loff_t new_size = compat_arg_u64_glue(length);

	return ksys_ftruncate(fd, new_size);
}
#endif
245
246	int vfs_fallocate(struct file file, int* mode, loff_t offset, loff_t len)
247	{
248	struct inode *inode = file_inode(f: file);
249	long ret;
250
251	if (offset < `0` \|\| len <= `0`)
252	return -EINVAL;
253
254	/ Return error if mode is not supported /
255	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
256	return -EOPNOTSUPP;
257
258	/ Punch hole and zero range are mutually exclusive /
259	if ((mode & (FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE)) ==
260	(FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE))
261	return -EOPNOTSUPP;
262
263	/ Punch hole must have keep size set /
264	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
265	!(mode & FALLOC_FL_KEEP_SIZE))
266	return -EOPNOTSUPP;
267
268	/ Collapse range should only be used exclusively. /
269	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
270	(mode & ~FALLOC_FL_COLLAPSE_RANGE))
271	return -EINVAL;
272
273	/ Insert range should only be used exclusively. /
274	if ((mode & FALLOC_FL_INSERT_RANGE) &&
275	(mode & ~FALLOC_FL_INSERT_RANGE))
276	return -EINVAL;
277
278	/ Unshare range should only be used with allocate mode. /
279	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
280	(mode & ~(FALLOC_FL_UNSHARE_RANGE \| FALLOC_FL_KEEP_SIZE)))
281	return -EINVAL;
282
283	if (!(file->f_mode & FMODE_WRITE))
284	return -EBADF;
285
286	/*
287	* We can only allow pure fallocate on append only files
288	*/
289	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
290	return -EPERM;
291
292	if (IS_IMMUTABLE(inode))
293	return -EPERM;
294
295	/*
296	* We cannot allow any fallocate operation on an active swapfile
297	*/
298	if (IS_SWAPFILE(inode))
299	return -ETXTBSY;
300
301	/*
302	* Revalidate the write permissions, in case security policy has
303	* changed since the files were opened.
304	*/
305	ret = security_file_permission(file, MAY_WRITE);
306	if (ret)
307	return ret;
308
309	ret = fsnotify_file_area_perm(file, MAY_WRITE, ppos: &offset, count: len);
310	if (ret)
311	return ret;
312
313	if (S_ISFIFO(inode->i_mode))
314	return -ESPIPE;
315
316	if (S_ISDIR(inode->i_mode))
317	return -EISDIR;
318
319	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
320	return -ENODEV;
321
322	/ Check for wrap through zero too /
323	if (((offset + len) > inode->i_sb->s_maxbytes) \|\| ((offset + len) < `0`))
324	return -EFBIG;
325
326	if (!file->f_op->fallocate)
327	return -EOPNOTSUPP;
328
329	file_start_write(file);
330	ret = file->f_op->fallocate(file, mode, offset, len);
331
332	/*
333	* Create inotify and fanotify events.
334	*
335	* To keep the logic simple always create events if fallocate succeeds.
336	* This implies that events are even created if the file size remains
337	* unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
338	*/
339	if (ret == `0`)
340	fsnotify_modify(file);
341
342	file_end_write(file);
343	return ret;
344	}
345	EXPORT_SYMBOL_GPL(vfs_fallocate);
346
347	int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
348	{
349	struct fd f = fdget(fd);
350	int error = -EBADF;
351
352	if (f.file) {
353	error = vfs_fallocate(f.file, mode, offset, len);
354	fdput(fd: f);
355	}
356	return error;
357	}
358
359	SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
360	{
361	return ksys_fallocate(fd, mode, offset, len);
362	}
363
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
/*
 * Compat fallocate: offset and len each arrive as two halves and are
 * reassembled with compat_arg_u64_glue() before calling ksys_fallocate().
 */
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
		       compat_arg_u64_dual(len))
{
	loff_t start = compat_arg_u64_glue(offset);
	loff_t nbytes = compat_arg_u64_glue(len);

	return ksys_fallocate(fd, mode, start, nbytes);
}
#endif
372
373	/*
374	* access() needs to use the real uid/gid, not the effective uid/gid.
375	* We do this by temporarily clearing all FS-related capabilities and
376	* switching the fsuid/fsgid around to the real ones.
377	*
378	* Creating new credentials is expensive, so we try to skip doing it,
379	* which we can if the result would match what we already got.
380	*/
381	static bool access_need_override_creds(int flags)
382	{
383	const struct cred *cred;
384
385	if (flags & AT_EACCESS)
386	return false;
387
388	cred = current_cred();
389	if (!uid_eq(left: cred->fsuid, right: cred->uid) \|\|
390	!gid_eq(left: cred->fsgid, right: cred->gid))
391	return true;
392
393	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
394	kuid_t root_uid = make_kuid(from: cred->user_ns, uid: `0`);
395	if (!uid_eq(left: cred->uid, right: root_uid)) {
396	if (!cap_isclear(a: cred->cap_effective))
397	return true;
398	} else {
399	if (!cap_isidentical(a: cred->cap_effective,
400	b: cred->cap_permitted))
401	return true;
402	}
403	}
404
405	return false;
406	}
407
408	static const struct cred access_override_creds(void*)
409	{
410	const struct cred *old_cred;
411	struct cred *override_cred;
412
413	override_cred = prepare_creds();
414	if (!override_cred)
415	return NULL;
416
417	/*
418	* XXX access_need_override_creds performs checks in hopes of skipping
419	* this work. Make sure it stays in sync if making any changes in this
420	* routine.
421	*/
422
423	override_cred->fsuid = override_cred->uid;
424	override_cred->fsgid = override_cred->gid;
425
426	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
427	/ Clear the capabilities if we switch to a non-root user /
428	kuid_t root_uid = make_kuid(from: override_cred->user_ns, uid: `0`);
429	if (!uid_eq(left: override_cred->uid, right: root_uid))
430	cap_clear(override_cred->cap_effective);
431	else
432	override_cred->cap_effective =
433	override_cred->cap_permitted;
434	}
435
436	/*
437	* The new set of credentials can only be used in
438	* task-synchronous circumstances, and does not need
439	* RCU freeing, unless somebody then takes a separate
440	* reference to it.
441	*
442	* NOTE! This is _only_ true because this credential
443	* is used purely for override_creds() that installs
444	* it as the subjective cred. Other threads will be
445	* accessing ->real_cred, not the subjective cred.
446	*
447	* If somebody _does_ make a copy of this (using the
448	* 'get_current_cred()' function), that will clear the
449	* non_rcu field, because now that other user may be
450	* expecting RCU freeing. But normal thread-synchronous
451	* cred accesses will keep things non-racy to avoid RCU
452	* freeing.
453	*/
454	override_cred->non_rcu = `1`;
455
456	old_cred = override_creds(override_cred);
457
458	/ override_cred() gets its own ref /
459	put_cred(cred: override_cred);
460
461	return old_cred;
462	}
463
464	static long do_faccessat(int dfd, const char __user filename, int* mode, int flags)
465	{
466	struct path path;
467	struct inode *inode;
468	int res;
469	unsigned int lookup_flags = LOOKUP_FOLLOW;
470	const struct cred *old_cred = NULL;
471
472	if (mode & ~S_IRWXO) / where's F_OK, X_OK, W_OK, R_OK? /
473	return -EINVAL;
474
475	if (flags & ~(AT_EACCESS \| AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH))
476	return -EINVAL;
477
478	if (flags & AT_SYMLINK_NOFOLLOW)
479	lookup_flags &= ~LOOKUP_FOLLOW;
480	if (flags & AT_EMPTY_PATH)
481	lookup_flags \|= LOOKUP_EMPTY;
482
483	if (access_need_override_creds(flags)) {
484	old_cred = access_override_creds();
485	if (!old_cred)
486	return -ENOMEM;
487	}
488
489	retry:
490	res = user_path_at(dfd, name: filename, flags: lookup_flags, path: &path);
491	if (res)
492	goto out;
493
494	inode = d_backing_inode(upper: path.dentry);
495
496	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
497	/*
498	* MAY_EXEC on regular files is denied if the fs is mounted
499	* with the "noexec" flag.
500	*/
501	res = -EACCES;
502	if (path_noexec(path: &path))
503	goto out_path_release;
504	}
505
506	res = inode_permission(mnt_idmap(mnt: path.mnt), inode, mode \| MAY_ACCESS);
507	/ SuS v2 requires we report a read only fs too /
508	if (res \|\| !(mode & S_IWOTH) \|\| special_file(inode->i_mode))
509	goto out_path_release;
510	/*
511	* This is a rare case where using __mnt_is_readonly()
512	* is OK without a mnt_want/drop_write() pair. Since
513	* no actual write to the fs is performed here, we do
514	* not need to telegraph to that to anyone.
515	*
516	* By doing this, we accept that this access is
517	* inherently racy and know that the fs may change
518	* state before we even see this result.
519	*/
520	if (__mnt_is_readonly(mnt: path.mnt))
521	res = -EROFS;
522
523	out_path_release:
524	path_put(&path);
525	if (retry_estale(error: res, flags: lookup_flags)) {
526	lookup_flags \|= LOOKUP_REVAL;
527	goto retry;
528	}
529	out:
530	if (old_cred)
531	revert_creds(old_cred);
532
533	return res;
534	}
535
536	SYSCALL_DEFINE3(faccessat, int, dfd, const char __user , filename, int*, mode)
537	{
538	return do_faccessat(dfd, filename, mode, flags: `0`);
539	}
540
541	SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user , filename, int*, mode,
542	int, flags)
543	{
544	return do_faccessat(dfd, filename, mode, flags);
545	}
546
547	SYSCALL_DEFINE2(access, const char __user , filename, int*, mode)
548	{
549	return do_faccessat(AT_FDCWD, filename, mode, flags: `0`);
550	}
551
552	SYSCALL_DEFINE1(chdir, const char __user *, filename)
553	{
554	struct path path;
555	int error;
556	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
557	retry:
558	error = user_path_at(AT_FDCWD, name: filename, flags: lookup_flags, path: &path);
559	if (error)
560	goto out;
561
562	error = path_permission(path: &path, MAY_EXEC \| MAY_CHDIR);
563	if (error)
564	goto dput_and_out;
565
566	set_fs_pwd(current->fs, &path);
567
568	dput_and_out:
569	path_put(&path);
570	if (retry_estale(error, flags: lookup_flags)) {
571	lookup_flags \|= LOOKUP_REVAL;
572	goto retry;
573	}
574	out:
575	return error;
576	}
577
578	SYSCALL_DEFINE1(fchdir, unsigned int, fd)
579	{
580	struct fd f = fdget_raw(fd);
581	int error;
582
583	error = -EBADF;
584	if (!f.file)
585	goto out;
586
587	error = -ENOTDIR;
588	if (!d_can_lookup(dentry: f.file->f_path.dentry))
589	goto out_putf;
590
591	error = file_permission(file: f.file, MAY_EXEC \| MAY_CHDIR);
592	if (!error)
593	set_fs_pwd(current->fs, &f.file->f_path);
594	out_putf:
595	fdput(fd: f);
596	out:
597	return error;
598	}
599
600	SYSCALL_DEFINE1(chroot, const char __user *, filename)
601	{
602	struct path path;
603	int error;
604	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
605	retry:
606	error = user_path_at(AT_FDCWD, name: filename, flags: lookup_flags, path: &path);
607	if (error)
608	goto out;
609
610	error = path_permission(path: &path, MAY_EXEC \| MAY_CHDIR);
611	if (error)
612	goto dput_and_out;
613
614	error = -EPERM;
615	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
616	goto dput_and_out;
617	error = security_path_chroot(path: &path);
618	if (error)
619	goto dput_and_out;
620
621	set_fs_root(current->fs, &path);
622	error = `0`;
623	dput_and_out:
624	path_put(&path);
625	if (retry_estale(error, flags: lookup_flags)) {
626	lookup_flags \|= LOOKUP_REVAL;
627	goto retry;
628	}
629	out:
630	return error;
631	}
632
633	int chmod_common(const struct path *path, umode_t mode)
634	{
635	struct inode *inode = path->dentry->d_inode;
636	struct inode *delegated_inode = NULL;
637	struct iattr newattrs;
638	int error;
639
640	error = mnt_want_write(mnt: path->mnt);
641	if (error)
642	return error;
643	retry_deleg:
644	inode_lock(inode);
645	error = security_path_chmod(path, mode);
646	if (error)
647	goto out_unlock;
648	newattrs.ia_mode = (mode & S_IALLUGO) \| (inode->i_mode & ~S_IALLUGO);
649	newattrs.ia_valid = ATTR_MODE \| ATTR_CTIME;
650	error = notify_change(mnt_idmap(mnt: path->mnt), path->dentry,
651	&newattrs, &delegated_inode);
652	out_unlock:
653	inode_unlock(inode);
654	if (delegated_inode) {
655	error = break_deleg_wait(delegated_inode: &delegated_inode);
656	if (!error)
657	goto retry_deleg;
658	}
659	mnt_drop_write(mnt: path->mnt);
660	return error;
661	}
662
663	int vfs_fchmod(struct file *file, umode_t mode)
664	{
665	audit_file(file);
666	return chmod_common(path: &file->f_path, mode);
667	}
668
669	SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
670	{
671	struct fd f = fdget(fd);
672	int err = -EBADF;
673
674	if (f.file) {
675	err = vfs_fchmod(file: f.file, mode);
676	fdput(fd: f);
677	}
678	return err;
679	}
680
681	static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
682	unsigned int flags)
683	{
684	struct path path;
685	int error;
686	unsigned int lookup_flags;
687
688	if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)))
689	return -EINVAL;
690
691	lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? `0` : LOOKUP_FOLLOW;
692	if (flags & AT_EMPTY_PATH)
693	lookup_flags \|= LOOKUP_EMPTY;
694
695	retry:
696	error = user_path_at(dfd, name: filename, flags: lookup_flags, path: &path);
697	if (!error) {
698	error = chmod_common(path: &path, mode);
699	path_put(&path);
700	if (retry_estale(error, flags: lookup_flags)) {
701	lookup_flags \|= LOOKUP_REVAL;
702	goto retry;
703	}
704	}
705	return error;
706	}
707
708	SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
709	umode_t, mode, unsigned int, flags)
710	{
711	return do_fchmodat(dfd, filename, mode, flags);
712	}
713
714	SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
715	umode_t, mode)
716	{
717	return do_fchmodat(dfd, filename, mode, flags: `0`);
718	}
719
720	SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
721	{
722	return do_fchmodat(AT_FDCWD, filename, mode, flags: `0`);
723	}
724
725	/*
726	* Check whether @kuid is valid and if so generate and set vfsuid_t in
727	* ia_vfsuid.
728	*
729	* Return: true if @kuid is valid, false if not.
730	*/
731	static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
732	{
733	if (!uid_valid(uid: kuid))
734	return false;
735	attr->ia_valid \|= ATTR_UID;
736	attr->ia_vfsuid = VFSUIDT_INIT(kuid);
737	return true;
738	}
739
740	/*
741	* Check whether @kgid is valid and if so generate and set vfsgid_t in
742	* ia_vfsgid.
743	*
744	* Return: true if @kgid is valid, false if not.
745	*/
746	static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
747	{
748	if (!gid_valid(gid: kgid))
749	return false;
750	attr->ia_valid \|= ATTR_GID;
751	attr->ia_vfsgid = VFSGIDT_INIT(kgid);
752	return true;
753	}
754
755	int chown_common(const struct path *path, uid_t user, gid_t group)
756	{
757	struct mnt_idmap *idmap;
758	struct user_namespace *fs_userns;
759	struct inode *inode = path->dentry->d_inode;
760	struct inode *delegated_inode = NULL;
761	int error;
762	struct iattr newattrs;
763	kuid_t uid;
764	kgid_t gid;
765
766	uid = make_kuid(current_user_ns(), uid: user);
767	gid = make_kgid(current_user_ns(), gid: group);
768
769	idmap = mnt_idmap(mnt: path->mnt);
770	fs_userns = i_user_ns(inode);
771
772	retry_deleg:
773	newattrs.ia_vfsuid = INVALID_VFSUID;
774	newattrs.ia_vfsgid = INVALID_VFSGID;
775	newattrs.ia_valid = ATTR_CTIME;
776	if ((user != (uid_t)-`1`) && !setattr_vfsuid(attr: &newattrs, kuid: uid))
777	return -EINVAL;
778	if ((group != (gid_t)-`1`) && !setattr_vfsgid(attr: &newattrs, kgid: gid))
779	return -EINVAL;
780	inode_lock(inode);
781	if (!S_ISDIR(inode->i_mode))
782	newattrs.ia_valid \|= ATTR_KILL_SUID \| ATTR_KILL_PRIV \|
783	setattr_should_drop_sgid(idmap, inode);
784	/ Continue to send actual fs values, not the mount values. /
785	error = security_path_chown(
786	path,
787	uid: from_vfsuid(idmap, fs_userns, vfsuid: newattrs.ia_vfsuid),
788	gid: from_vfsgid(idmap, fs_userns, vfsgid: newattrs.ia_vfsgid));
789	if (!error)
790	error = notify_change(idmap, path->dentry, &newattrs,
791	&delegated_inode);
792	inode_unlock(inode);
793	if (delegated_inode) {
794	error = break_deleg_wait(delegated_inode: &delegated_inode);
795	if (!error)
796	goto retry_deleg;
797	}
798	return error;
799	}
800
801	int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
802	int flag)
803	{
804	struct path path;
805	int error = -EINVAL;
806	int lookup_flags;
807
808	if ((flag & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)) != `0`)
809	goto out;
810
811	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? `0` : LOOKUP_FOLLOW;
812	if (flag & AT_EMPTY_PATH)
813	lookup_flags \|= LOOKUP_EMPTY;
814	retry:
815	error = user_path_at(dfd, name: filename, flags: lookup_flags, path: &path);
816	if (error)
817	goto out;
818	error = mnt_want_write(mnt: path.mnt);
819	if (error)
820	goto out_release;
821	error = chown_common(path: &path, user, group);
822	mnt_drop_write(mnt: path.mnt);
823	out_release:
824	path_put(&path);
825	if (retry_estale(error, flags: lookup_flags)) {
826	lookup_flags \|= LOOKUP_REVAL;
827	goto retry;
828	}
829	out:
830	return error;
831	}
832
833	SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
834	gid_t, group, int, flag)
835	{
836	return do_fchownat(dfd, filename, user, group, flag);
837	}
838
839	SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
840	{
841	return do_fchownat(AT_FDCWD, filename, user, group, flag: `0`);
842	}
843
844	SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
845	{
846	return do_fchownat(AT_FDCWD, filename, user, group,
847	AT_SYMLINK_NOFOLLOW);
848	}
849
850	int vfs_fchown(struct file *file, uid_t user, gid_t group)
851	{
852	int error;
853
854	error = mnt_want_write_file(file);
855	if (error)
856	return error;
857	audit_file(file);
858	error = chown_common(path: &file->f_path, user, group);
859	mnt_drop_write_file(file);
860	return error;
861	}
862
863	int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
864	{
865	struct fd f = fdget(fd);
866	int error = -EBADF;
867
868	if (f.file) {
869	error = vfs_fchown(file: f.file, user, group);
870	fdput(fd: f);
871	}
872	return error;
873	}
874
875	SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
876	{
877	return ksys_fchown(fd, user, group);
878	}
879
880	static inline int file_get_write_access(struct file *f)
881	{
882	int error;
883
884	error = get_write_access(inode: f->f_inode);
885	if (unlikely(error))
886	return error;
887	error = mnt_get_write_access(mnt: f->f_path.mnt);
888	if (unlikely(error))
889	goto cleanup_inode;
890	if (unlikely(f->f_mode & FMODE_BACKING)) {
891	error = mnt_get_write_access(mnt: backing_file_user_path(f)->mnt);
892	if (unlikely(error))
893	goto cleanup_mnt;
894	}
895	return `0`;
896
897	cleanup_mnt:
898	mnt_put_write_access(mnt: f->f_path.mnt);
899	cleanup_inode:
900	put_write_access(inode: f->f_inode);
901	return error;
902	}
903
904	static int do_dentry_open(struct file *f,
905	struct inode *inode,
906	int (open)(struct* inode , struct* file *))
907	{
908	static const struct file_operations empty_fops = {};
909	int error;
910
911	path_get(&f->f_path);
912	f->f_inode = inode;
913	f->f_mapping = inode->i_mapping;
914	f->f_wb_err = filemap_sample_wb_err(mapping: f->f_mapping);
915	f->f_sb_err = file_sample_sb_err(file: f);
916
917	if (unlikely(f->f_flags & O_PATH)) {
918	f->f_mode = FMODE_PATH \| FMODE_OPENED;
919	f->f_op = &empty_fops;
920	return `0`;
921	}
922
923	if ((f->f_mode & (FMODE_READ \| FMODE_WRITE)) == FMODE_READ) {
924	i_readcount_inc(inode);
925	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
926	error = file_get_write_access(f);
927	if (unlikely(error))
928	goto cleanup_file;
929	f->f_mode \|= FMODE_WRITER;
930	}
931
932	/ POSIX.1-2008/SUSv4 Section XSI 2.9.7 /
933	if (S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode))
934	f->f_mode \|= FMODE_ATOMIC_POS;
935
936	f->f_op = fops_get(inode->i_fop);
937	if (WARN_ON(!f->f_op)) {
938	error = -ENODEV;
939	goto cleanup_all;
940	}
941
942	error = security_file_open(file: f);
943	if (error)
944	goto cleanup_all;
945
946	error = break_lease(inode: file_inode(f), mode: f->f_flags);
947	if (error)
948	goto cleanup_all;
949
950	/ normally all 3 are set; ->open() can clear them if needed /
951	f->f_mode \|= FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE;
952	if (!open)
953	open = f->f_op->open;
954	if (open) {
955	error = open(inode, f);
956	if (error)
957	goto cleanup_all;
958	}
959	f->f_mode \|= FMODE_OPENED;
960	if ((f->f_mode & FMODE_READ) &&
961	likely(f->f_op->read \|\| f->f_op->read_iter))
962	f->f_mode \|= FMODE_CAN_READ;
963	if ((f->f_mode & FMODE_WRITE) &&
964	likely(f->f_op->write \|\| f->f_op->write_iter))
965	f->f_mode \|= FMODE_CAN_WRITE;
966	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
967	f->f_mode &= ~FMODE_LSEEK;
968	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
969	f->f_mode \|= FMODE_CAN_ODIRECT;
970
971	f->f_flags &= ~(O_CREAT \| O_EXCL \| O_NOCTTY \| O_TRUNC);
972	f->f_iocb_flags = iocb_flags(file: f);
973
974	file_ra_state_init(ra: &f->f_ra, mapping: f->f_mapping->host->i_mapping);
975
976	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
977	return -EINVAL;
978
979	/*
980	* XXX: Huge page cache doesn't support writing yet. Drop all page
981	* cache for this file before processing writes.
982	*/
983	if (f->f_mode & FMODE_WRITE) {
984	/*
985	* Paired with smp_mb() in collapse_file() to ensure nr_thps
986	* is up to date and the update to i_writecount by
987	* get_write_access() is visible. Ensures subsequent insertion
988	* of THPs into the page cache will fail.
989	*/
990	smp_mb();
991	if (filemap_nr_thps(mapping: inode->i_mapping)) {
992	struct address_space *mapping = inode->i_mapping;
993
994	filemap_invalidate_lock(mapping: inode->i_mapping);
995	/*
996	* unmap_mapping_range just need to be called once
997	* here, because the private pages is not need to be
998	* unmapped mapping (e.g. data segment of dynamic
999	* shared libraries here).
1000	*/
1001	unmap_mapping_range(mapping, holebegin: `0`, holelen: `0`, even_cows: `0`);
1002	truncate_inode_pages(mapping, `0`);
1003	filemap_invalidate_unlock(mapping: inode->i_mapping);
1004	}
1005	}
1006
1007	/*
1008	* Once we return a file with FMODE_OPENED, __fput() will call
1009	* fsnotify_close(), so we need fsnotify_open() here for symmetry.
1010	*/
1011	fsnotify_open(file: f);
1012	return `0`;
1013
1014	cleanup_all:
1015	if (WARN_ON_ONCE(error > `0`))
1016	error = -EINVAL;
1017	fops_put(f->f_op);
1018	put_file_access(file: f);
1019	cleanup_file:
1020	path_put(&f->f_path);
1021	f->f_path.mnt = NULL;
1022	f->f_path.dentry = NULL;
1023	f->f_inode = NULL;
1024	return error;
1025	}
1026
1027	/**
1028	* finish_open - finish opening a file
1029	* @file: file pointer
1030	* @dentry: pointer to dentry
1031	* @open: open callback
1032	*
1033	* This can be used to finish opening a file passed to i_op->atomic_open().
1034	*
1035	* If the open callback is set to NULL, then the standard f_op->open()
1036	* filesystem callback is substituted.
1037	*
1038	* NB: the dentry reference is _not_ consumed. If, for example, the dentry is
1039	* the return value of d_splice_alias(), then the caller needs to perform dput()
1040	* on it after finish_open().
1041	*
1042	* Returns zero on success or -errno if the open failed.
1043	*/
1044	int finish_open(struct file file, struct* dentry *dentry,
1045	int (open)(struct* inode , struct* file *))
1046	{
1047	BUG_ON(file->f_mode & FMODE_OPENED); / once it's opened, it's opened /
1048
1049	file->f_path.dentry = dentry;
1050	return do_dentry_open(f: file, inode: d_backing_inode(upper: dentry), open);
1051	}
1052	EXPORT_SYMBOL(finish_open);
1053
1054	/**
1055	* finish_no_open - finish ->atomic_open() without opening the file
1056	*
1057	* @file: file pointer
1058	* @dentry: dentry or NULL (as returned from ->lookup())
1059	*
1060	* This can be used to set the result of a successful lookup in ->atomic_open().
1061	*
1062	* NB: unlike finish_open() this function does consume the dentry reference and
1063	* the caller need not dput() it.
1064	*
1065	* Returns "0" which must be the return value of ->atomic_open() after having
1066	* called this function.
1067	*/
1068	int finish_no_open(struct file file, struct* dentry *dentry)
1069	{
1070	file->f_path.dentry = dentry;
1071	return `0`;
1072	}
1073	EXPORT_SYMBOL(finish_no_open);
1074
1075	char file_path(struct* file filp, char* buf, int* buflen)
1076	{
1077	return d_path(&filp->f_path, buf, buflen);
1078	}
1079	EXPORT_SYMBOL(file_path);
1080
1081	/**
1082	* vfs_open - open the file at the given path
1083	* @path: path to open
1084	* @file: newly allocated file with f_flag initialized
1085	*/
1086	int vfs_open(const struct path path, struct* file *file)
1087	{
1088	file->f_path = *path;
1089	return do_dentry_open(f: file, inode: d_backing_inode(upper: path->dentry), NULL);
1090	}
1091
1092	struct file dentry_open(const* struct path path, int* flags,
1093	const struct cred *cred)
1094	{
1095	int error;
1096	struct file *f;
1097
1098	/ We must always pass in a valid mount pointer. /
1099	BUG_ON(!path->mnt);
1100
1101	f = alloc_empty_file(flags, cred);
1102	if (!IS_ERR(ptr: f)) {
1103	error = vfs_open(path, file: f);
1104	if (error) {
1105	fput(f);
1106	f = ERR_PTR(error);
1107	}
1108	}
1109	return f;
1110	}
1111	EXPORT_SYMBOL(dentry_open);
1112
1113	/**
1114	* dentry_create - Create and open a file
1115	* @path: path to create
1116	* @flags: O_ flags
1117	* @mode: mode bits for new file
1118	* @cred: credentials to use
1119	*
1120	* Caller must hold the parent directory's lock, and have prepared
1121	* a negative dentry, placed in @path->dentry, for the new file.
1122	*
1123	* Caller sets @path->mnt to the vfsmount of the filesystem where
1124	* the new file is to be created. The parent directory and the
1125	* negative dentry must reside on the same filesystem instance.
1126	*
1127	* On success, returns a "struct file *". Otherwise a ERR_PTR
1128	* is returned.
1129	*/
1130	struct file dentry_create(const* struct path path, int* flags, umode_t mode,
1131	const struct cred *cred)
1132	{
1133	struct file *f;
1134	int error;
1135
1136	f = alloc_empty_file(flags, cred);
1137	if (IS_ERR(ptr: f))
1138	return f;
1139
1140	error = vfs_create(mnt_idmap(mnt: path->mnt),
1141	d_inode(dentry: path->dentry->d_parent),
1142	path->dentry, mode, true);
1143	if (!error)
1144	error = vfs_open(path, file: f);
1145
1146	if (unlikely(error)) {
1147	fput(f);
1148	return ERR_PTR(error);
1149	}
1150	return f;
1151	}
1152	EXPORT_SYMBOL(dentry_create);
1153
1154	/**
1155	* kernel_file_open - open a file for kernel internal use
1156	* @path: path of the file to open
1157	* @flags: open flags
1158	* @inode: the inode
1159	* @cred: credentials for open
1160	*
1161	* Open a file for use by in-kernel consumers. The file is not accounted
1162	* against nr_files and must not be installed into the file descriptor
1163	* table.
1164	*
1165	* Return: Opened file on success, an error pointer on failure.
1166	*/
1167	struct file kernel_file_open(const* struct path path, int* flags,
1168	struct inode inode, const* struct cred *cred)
1169	{
1170	struct file *f;
1171	int error;
1172
1173	f = alloc_empty_file_noaccount(flags, cred);
1174	if (IS_ERR(ptr: f))
1175	return f;
1176
1177	f->f_path = *path;
1178	error = do_dentry_open(f, inode, NULL);
1179	if (error) {
1180	fput(f);
1181	f = ERR_PTR(error);
1182	}
1183	return f;
1184	}
1185	EXPORT_SYMBOL_GPL(kernel_file_open);
1186
/* True when the flags request creation of a file (O_CREAT or __O_TMPFILE). */
#define WILL_CREATE(flags)	((flags) & (O_CREAT | __O_TMPFILE))
/* The only flags that may accompany O_PATH. */
#define O_PATH_FLAGS		(O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
1189
1190	inline struct open_how build_open_how(int flags, umode_t mode)
1191	{
1192	struct open_how how = {
1193	.flags = flags & VALID_OPEN_FLAGS,
1194	.mode = mode & S_IALLUGO,
1195	};
1196
1197	/ O_PATH beats everything else. /
1198	if (how.flags & O_PATH)
1199	how.flags &= O_PATH_FLAGS;
1200	/ Modes should only be set for create-like flags. /
1201	if (!WILL_CREATE(how.flags))
1202	how.mode = `0`;
1203	return how;
1204	}
1205
1206	inline int build_open_flags(const struct open_how how, struct* open_flags *op)
1207	{
1208	u64 flags = how->flags;
1209	u64 strip = __FMODE_NONOTIFY \| O_CLOEXEC;
1210	int lookup_flags = `0`;
1211	int acc_mode = ACC_MODE(flags);
1212
1213	BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
1214	"struct open_flags doesn't yet handle flags > 32 bits");
1215
1216	/*
1217	* Strip flags that either shouldn't be set by userspace like
1218	* FMODE_NONOTIFY or that aren't relevant in determining struct
1219	* open_flags like O_CLOEXEC.
1220	*/
1221	flags &= ~strip;
1222
1223	/*
1224	* Older syscalls implicitly clear all of the invalid flags or argument
1225	* values before calling build_open_flags(), but openat2(2) checks all
1226	* of its arguments.
1227	*/
1228	if (flags & ~VALID_OPEN_FLAGS)
1229	return -EINVAL;
1230	if (how->resolve & ~VALID_RESOLVE_FLAGS)
1231	return -EINVAL;
1232
1233	/ Scoping flags are mutually exclusive. /
1234	if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
1235	return -EINVAL;
1236
1237	/ Deal with the mode. /
1238	if (WILL_CREATE(flags)) {
1239	if (how->mode & ~S_IALLUGO)
1240	return -EINVAL;
1241	op->mode = how->mode \| S_IFREG;
1242	} else {
1243	if (how->mode != `0`)
1244	return -EINVAL;
1245	op->mode = `0`;
1246	}
1247
1248	/*
1249	* Block bugs where O_DIRECTORY \| O_CREAT created regular files.
1250	* Note, that blocking O_DIRECTORY \| O_CREAT here also protects
1251	* O_TMPFILE below which requires O_DIRECTORY being raised.
1252	*/
1253	if ((flags & (O_DIRECTORY \| O_CREAT)) == (O_DIRECTORY \| O_CREAT))
1254	return -EINVAL;
1255
1256	/ Now handle the creative implementation of O_TMPFILE. /
1257	if (flags & __O_TMPFILE) {
1258	/*
1259	* In order to ensure programs get explicit errors when trying
1260	* to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
1261	* is raised alongside __O_TMPFILE.
1262	*/
1263	if (!(flags & O_DIRECTORY))
1264	return -EINVAL;
1265	if (!(acc_mode & MAY_WRITE))
1266	return -EINVAL;
1267	}
1268	if (flags & O_PATH) {
1269	/ O_PATH only permits certain other flags to be set. /
1270	if (flags & ~O_PATH_FLAGS)
1271	return -EINVAL;
1272	acc_mode = `0`;
1273	}
1274
1275	/*
1276	* O_SYNC is implemented as __O_SYNC\|O_DSYNC. As many places only
1277	* check for O_DSYNC if the need any syncing at all we enforce it's
1278	* always set instead of having to deal with possibly weird behaviour
1279	* for malicious applications setting only __O_SYNC.
1280	*/
1281	if (flags & __O_SYNC)
1282	flags \|= O_DSYNC;
1283
1284	op->open_flag = flags;
1285
1286	/ O_TRUNC implies we need access checks for write permissions /
1287	if (flags & O_TRUNC)
1288	acc_mode \|= MAY_WRITE;
1289
1290	/ Allow the LSM permission hook to distinguish append*
1291	access from general write access. /*
1292	if (flags & O_APPEND)
1293	acc_mode \|= MAY_APPEND;
1294
1295	op->acc_mode = acc_mode;
1296
1297	op->intent = flags & O_PATH ? `0` : LOOKUP_OPEN;
1298
1299	if (flags & O_CREAT) {
1300	op->intent \|= LOOKUP_CREATE;
1301	if (flags & O_EXCL) {
1302	op->intent \|= LOOKUP_EXCL;
1303	flags \|= O_NOFOLLOW;
1304	}
1305	}
1306
1307	if (flags & O_DIRECTORY)
1308	lookup_flags \|= LOOKUP_DIRECTORY;
1309	if (!(flags & O_NOFOLLOW))
1310	lookup_flags \|= LOOKUP_FOLLOW;
1311
1312	if (how->resolve & RESOLVE_NO_XDEV)
1313	lookup_flags \|= LOOKUP_NO_XDEV;
1314	if (how->resolve & RESOLVE_NO_MAGICLINKS)
1315	lookup_flags \|= LOOKUP_NO_MAGICLINKS;
1316	if (how->resolve & RESOLVE_NO_SYMLINKS)
1317	lookup_flags \|= LOOKUP_NO_SYMLINKS;
1318	if (how->resolve & RESOLVE_BENEATH)
1319	lookup_flags \|= LOOKUP_BENEATH;
1320	if (how->resolve & RESOLVE_IN_ROOT)
1321	lookup_flags \|= LOOKUP_IN_ROOT;
1322	if (how->resolve & RESOLVE_CACHED) {
1323	/ Don't bother even trying for create/truncate/tmpfile open /
1324	if (flags & (O_TRUNC \| O_CREAT \| __O_TMPFILE))
1325	return -EAGAIN;
1326	lookup_flags \|= LOOKUP_CACHED;
1327	}
1328
1329	op->lookup_flags = lookup_flags;
1330	return `0`;
1331	}
1332
1333	/**
1334	* file_open_name - open file and return file pointer
1335	*
1336	* @name: struct filename containing path to open
1337	* @flags: open flags as per the open(2) second argument
1338	* @mode: mode for the new file if O_CREAT is set, else ignored
1339	*
1340	* This is the helper to open a file from kernelspace if you really
1341	* have to. But in generally you should not do this, so please move
1342	* along, nothing to see here..
1343	*/
1344	struct file file_open_name(struct* filename name, int* flags, umode_t mode)
1345	{
1346	struct open_flags op;
1347	struct open_how how = build_open_how(flags, mode);
1348	int err = build_open_flags(how: &how, op: &op);
1349	if (err)
1350	return ERR_PTR(error: err);
1351	return do_filp_open(AT_FDCWD, pathname: name, op: &op);
1352	}
1353
1354	/**
1355	* filp_open - open file and return file pointer
1356	*
1357	* @filename: path to open
1358	* @flags: open flags as per the open(2) second argument
1359	* @mode: mode for the new file if O_CREAT is set, else ignored
1360	*
1361	* This is the helper to open a file from kernelspace if you really
1362	* have to. But in generally you should not do this, so please move
1363	* along, nothing to see here..
1364	*/
1365	struct file filp_open(const* char filename, int* flags, umode_t mode)
1366	{
1367	struct filename *name = getname_kernel(filename);
1368	struct file *file = ERR_CAST(ptr: name);
1369
1370	if (!IS_ERR(ptr: name)) {
1371	file = file_open_name(name, flags, mode);
1372	putname(name);
1373	}
1374	return file;
1375	}
1376	EXPORT_SYMBOL(filp_open);
1377
1378	struct file file_open_root(const* struct path *root,
1379	const char filename, int* flags, umode_t mode)
1380	{
1381	struct open_flags op;
1382	struct open_how how = build_open_how(flags, mode);
1383	int err = build_open_flags(how: &how, op: &op);
1384	if (err)
1385	return ERR_PTR(error: err);
1386	return do_file_open_root(root, filename, &op);
1387	}
1388	EXPORT_SYMBOL(file_open_root);
1389
1390	static long do_sys_openat2(int dfd, const char __user *filename,
1391	struct open_how *how)
1392	{
1393	struct open_flags op;
1394	int fd = build_open_flags(how, op: &op);
1395	struct filename *tmp;
1396
1397	if (fd)
1398	return fd;
1399
1400	tmp = getname(filename);
1401	if (IS_ERR(ptr: tmp))
1402	return PTR_ERR(ptr: tmp);
1403
1404	fd = get_unused_fd_flags(flags: how->flags);
1405	if (fd >= `0`) {
1406	struct file *f = do_filp_open(dfd, pathname: tmp, op: &op);
1407	if (IS_ERR(ptr: f)) {
1408	put_unused_fd(fd);
1409	fd = PTR_ERR(ptr: f);
1410	} else {
1411	fd_install(fd, file: f);
1412	}
1413	}
1414	putname(name: tmp);
1415	return fd;
1416	}
1417
1418	long do_sys_open(int dfd, const char __user filename, int* flags, umode_t mode)
1419	{
1420	struct open_how how = build_open_how(flags, mode);
1421	return do_sys_openat2(dfd, filename, how: &how);
1422	}
1423
1424
1425	SYSCALL_DEFINE3(open, const char __user , filename, int*, flags, umode_t, mode)
1426	{
1427	if (force_o_largefile())
1428	flags \|= O_LARGEFILE;
1429	return do_sys_open(AT_FDCWD, filename, flags, mode);
1430	}
1431
1432	SYSCALL_DEFINE4(openat, int, dfd, const char __user , filename, int*, flags,
1433	umode_t, mode)
1434	{
1435	if (force_o_largefile())
1436	flags \|= O_LARGEFILE;
1437	return do_sys_open(dfd, filename, flags, mode);
1438	}
1439
1440	SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
1441	struct open_how __user *, how, size_t, usize)
1442	{
1443	int err;
1444	struct open_how tmp;
1445
1446	BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
1447	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);
1448
1449	if (unlikely(usize < OPEN_HOW_SIZE_VER0))
1450	return -EINVAL;
1451
1452	err = copy_struct_from_user(dst: &tmp, ksize: sizeof(tmp), src: how, usize);
1453	if (err)
1454	return err;
1455
1456	audit_openat2_how(how: &tmp);
1457
1458	/ O_LARGEFILE is only allowed for non-O_PATH. /
1459	if (!(tmp.flags & O_PATH) && force_o_largefile())
1460	tmp.flags \|= O_LARGEFILE;
1461
1462	return do_sys_openat2(dfd, filename, how: &tmp);
1463	}
1464
#ifdef CONFIG_COMPAT
/*
 * Exactly like sys_open(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * Exactly like sys_openat(), except that it doesn't set the
 * O_LARGEFILE flag.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
	return do_sys_open(dfd, filename, flags, mode);
}
#endif
1484
1485	#ifndef __alpha__
1486
1487	/*
1488	* For backward compatibility? Maybe this should be moved
1489	* into arch/i386 instead?
1490	*/
1491	SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1492	{
1493	int flags = O_CREAT \| O_WRONLY \| O_TRUNC;
1494
1495	if (force_o_largefile())
1496	flags \|= O_LARGEFILE;
1497	return do_sys_open(AT_FDCWD, filename: pathname, flags, mode);
1498	}
1499	#endif
1500
1501	/*
1502	* "id" is the POSIX thread ID. We use the
1503	* files pointer for this..
1504	*/
1505	static int filp_flush(struct file *filp, fl_owner_t id)
1506	{
1507	int retval = `0`;
1508
1509	if (CHECK_DATA_CORRUPTION(file_count(filp) == `0`,
1510	"VFS: Close: file count is 0 (f_op=%ps)",
1511	filp->f_op)) {
1512	return `0`;
1513	}
1514
1515	if (filp->f_op->flush)
1516	retval = filp->f_op->flush(filp, id);
1517
1518	if (likely(!(filp->f_mode & FMODE_PATH))) {
1519	dnotify_flush(filp, id);
1520	locks_remove_posix(filp, id);
1521	}
1522	return retval;
1523	}
1524
1525	int filp_close(struct file *filp, fl_owner_t id)
1526	{
1527	int retval;
1528
1529	retval = filp_flush(filp, id);
1530	fput(filp);
1531
1532	return retval;
1533	}
1534	EXPORT_SYMBOL(filp_close);
1535
1536	/*
1537	* Careful here! We test whether the file pointer is NULL before
1538	* releasing the fd. This ensures that one clone task can't release
1539	* an fd while another clone is opening it.
1540	*/
1541	SYSCALL_DEFINE1(close, unsigned int, fd)
1542	{
1543	int retval;
1544	struct file *file;
1545
1546	file = file_close_fd(fd);
1547	if (!file)
1548	return -EBADF;
1549
1550	retval = filp_flush(filp: file, current->files);
1551
1552	/*
1553	* We're returning to user space. Don't bother
1554	* with any delayed fput() cases.
1555	*/
1556	__fput_sync(file);
1557
1558	/ can't restart close syscall because file table entry was cleared /
1559	if (unlikely(retval == -ERESTARTSYS \|\|
1560	retval == -ERESTARTNOINTR \|\|
1561	retval == -ERESTARTNOHAND \|\|
1562	retval == -ERESTART_RESTARTBLOCK))
1563	retval = -EINTR;
1564
1565	return retval;
1566	}
1567
1568	/**
1569	* sys_close_range() - Close all file descriptors in a given range.
1570	*
1571	* @fd: starting file descriptor to close
1572	* @max_fd: last file descriptor to close
1573	* @flags: reserved for future extensions
1574	*
1575	* This closes a range of file descriptors. All file descriptors
1576	* from @fd up to and including @max_fd are closed.
1577	* Currently, errors to close a given file descriptor are ignored.
1578	*/
1579	SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
1580	unsigned int, flags)
1581	{
1582	return __close_range(fd, max_fd, flags);
1583	}
1584
1585	/*
1586	* This routine simulates a hangup on the tty, to arrange that users
1587	* are given clean terminals at login time.
1588	*/
1589	SYSCALL_DEFINE0(vhangup)
1590	{
1591	if (capable(CAP_SYS_TTY_CONFIG)) {
1592	tty_vhangup_self();
1593	return `0`;
1594	}
1595	return -EPERM;
1596	}
1597
1598	/*
1599	* Called when an inode is about to be open.
1600	* We use this to disallow opening large files on 32bit systems if
1601	* the caller didn't specify O_LARGEFILE. On 64bit systems we force
1602	* on this flag in sys_open.
1603	*/
1604	int generic_file_open(struct inode * inode, struct file * filp)
1605	{
1606	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1607	return -EOVERFLOW;
1608	return `0`;
1609	}
1610
1611	EXPORT_SYMBOL(generic_file_open);
1612
1613	/*
1614	* This is used by subsystems that don't want seekable
1615	* file descriptors. The function is not supposed to ever fail, the only
1616	* reason it returns an 'int' and not 'void' is so that it can be plugged
1617	* directly into file_operations structure.
1618	*/
1619	int nonseekable_open(struct inode inode, struct* file *filp)
1620	{
1621	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE);
1622	return `0`;
1623	}
1624
1625	EXPORT_SYMBOL(nonseekable_open);
1626
1627	/*
1628	* stream_open is used by subsystems that want stream-like file descriptors.
1629	* Such file descriptors are not seekable and don't have notion of position
1630	* (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
1631	* Contrary to file descriptors of other regular files, .read() and .write()
1632	* can run simultaneously.
1633	*
1634	* stream_open never fails and is marked to return int so that it could be
1635	* directly used as file_operations.open .
1636	*/
1637	int stream_open(struct inode inode, struct* file *filp)
1638	{
1639	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE \| FMODE_ATOMIC_POS);
1640	filp->f_mode \|= FMODE_STREAM;
1641	return `0`;
1642	}
1643
1644	EXPORT_SYMBOL(stream_open);
1645

/* End of linux/fs/open.c */