namei.c source code [linux/fs/namei.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* linux/fs/namei.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* Some corrections by tytso.
10	*/
11
/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */
17
18	#include <linux/init.h>
19	#include <linux/export.h>
20	#include <linux/kernel.h>
21	#include <linux/slab.h>
22	#include <linux/fs.h>
23	#include <linux/filelock.h>
24	#include <linux/namei.h>
25	#include <linux/pagemap.h>
26	#include <linux/sched/mm.h>
27	#include <linux/fsnotify.h>
28	#include <linux/personality.h>
29	#include <linux/security.h>
30	#include <linux/ima.h>
31	#include <linux/syscalls.h>
32	#include <linux/mount.h>
33	#include <linux/audit.h>
34	#include <linux/capability.h>
35	#include <linux/file.h>
36	#include <linux/fcntl.h>
37	#include <linux/device_cgroup.h>
38	#include <linux/fs_struct.h>
39	#include <linux/posix_acl.h>
40	#include <linux/hash.h>
41	#include <linux/bitops.h>
42	#include <linux/init_task.h>
43	#include <linux/uaccess.h>
44
45	#include "internal.h"
46	#include "mount.h"
47
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */
75
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
92
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */
98
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
113	/*
114	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
115	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
116	* any extra contention...
117	*/
118
/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
126
127	#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
128
129	struct filename *
130	getname_flags(const char __user filename, int* flags, int *empty)
131	{
132	struct filename *result;
133	char *kname;
134	int len;
135
136	result = audit_reusename(name: filename);
137	if (result)
138	return result;
139
140	result = __getname();
141	if (unlikely(!result))
142	return ERR_PTR(error: -ENOMEM);
143
144	/*
145	* First, try to embed the struct filename inside the names_cache
146	* allocation
147	*/
148	kname = (char *)result->iname;
149	result->name = kname;
150
151	len = strncpy_from_user(dst: kname, src: filename, EMBEDDED_NAME_MAX);
152	if (unlikely(len < `0`)) {
153	__putname(result);
154	return ERR_PTR(error: len);
155	}
156
157	/*
158	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
159	* separate struct filename so we can dedicate the entire
160	* names_cache allocation for the pathname, and re-do the copy from
161	* userland.
162	*/
163	if (unlikely(len == EMBEDDED_NAME_MAX)) {
164	const size_t size = offsetof(struct filename, iname[`1`]);
165	kname = (char *)result;
166
167	/*
168	* size is chosen that way we to guarantee that
169	* result->iname[0] is within the same object and that
170	* kname can't be equal to result->iname, no matter what.
171	*/
172	result = kzalloc(size, GFP_KERNEL);
173	if (unlikely(!result)) {
174	__putname(kname);
175	return ERR_PTR(error: -ENOMEM);
176	}
177	result->name = kname;
178	len = strncpy_from_user(dst: kname, src: filename, PATH_MAX);
179	if (unlikely(len < `0`)) {
180	__putname(kname);
181	kfree(objp: result);
182	return ERR_PTR(error: len);
183	}
184	if (unlikely(len == PATH_MAX)) {
185	__putname(kname);
186	kfree(objp: result);
187	return ERR_PTR(error: -ENAMETOOLONG);
188	}
189	}
190
191	atomic_set(v: &result->refcnt, i: `1`);
192	/ The empty path is special. /
193	if (unlikely(!len)) {
194	if (empty)
195	*empty = `1`;
196	if (!(flags & LOOKUP_EMPTY)) {
197	putname(name: result);
198	return ERR_PTR(error: -ENOENT);
199	}
200	}
201
202	result->uptr = filename;
203	result->aname = NULL;
204	audit_getname(name: result);
205	return result;
206	}
207
208	struct filename *
209	getname_uflags(const char __user filename, int* uflags)
210	{
211	int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : `0`;
212
213	return getname_flags(filename, flags, NULL);
214	}
215
216	struct filename *
217	getname(const char __user * filename)
218	{
219	return getname_flags(filename, flags: `0`, NULL);
220	}
221
222	struct filename *
223	getname_kernel(const char * filename)
224	{
225	struct filename *result;
226	int len = strlen(filename) + `1`;
227
228	result = __getname();
229	if (unlikely(!result))
230	return ERR_PTR(error: -ENOMEM);
231
232	if (len <= EMBEDDED_NAME_MAX) {
233	result->name = (char *)result->iname;
234	} else if (len <= PATH_MAX) {
235	const size_t size = offsetof(struct filename, iname[`1`]);
236	struct filename *tmp;
237
238	tmp = kmalloc(size, GFP_KERNEL);
239	if (unlikely(!tmp)) {
240	__putname(result);
241	return ERR_PTR(error: -ENOMEM);
242	}
243	tmp->name = (char *)result;
244	result = tmp;
245	} else {
246	__putname(result);
247	return ERR_PTR(error: -ENAMETOOLONG);
248	}
249	memcpy((char *)result->name, filename, len);
250	result->uptr = NULL;
251	result->aname = NULL;
252	atomic_set(v: &result->refcnt, i: `1`);
253	audit_getname(name: result);
254
255	return result;
256	}
257	EXPORT_SYMBOL(getname_kernel);
258
259	void putname(struct filename *name)
260	{
261	if (IS_ERR(ptr: name))
262	return;
263
264	if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
265	return;
266
267	if (!atomic_dec_and_test(v: &name->refcnt))
268	return;
269
270	if (name->name != name->iname) {
271	__putname(name->name);
272	kfree(objp: name);
273	} else
274	__putname(name);
275	}
276	EXPORT_SYMBOL(putname);
277
278	/**
279	* check_acl - perform ACL permission checking
280	* @idmap: idmap of the mount the inode was found from
281	* @inode: inode to check permissions on
282	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
283	*
284	* This function performs the ACL permission checking. Since this function
285	* retrieve POSIX acls it needs to know whether it is called from a blocking or
286	* non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
287	*
288	* If the inode has been found through an idmapped mount the idmap of
289	* the vfsmount must be passed through @idmap. This function will then take
290	* care to map the inode according to @idmap before checking permissions.
291	* On non-idmapped mounts or if permission checking is to be performed on the
292	* raw inode simply passs @nop_mnt_idmap.
293	*/
294	static int check_acl(struct mnt_idmap *idmap,
295	struct inode inode, int* mask)
296	{
297	#ifdef CONFIG_FS_POSIX_ACL
298	struct posix_acl *acl;
299
300	if (mask & MAY_NOT_BLOCK) {
301	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
302	if (!acl)
303	return -EAGAIN;
304	/ no ->get_inode_acl() calls in RCU mode... /
305	if (is_uncached_acl(acl))
306	return -ECHILD;
307	return posix_acl_permission(idmap, inode, acl, mask);
308	}
309
310	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
311	if (IS_ERR(ptr: acl))
312	return PTR_ERR(ptr: acl);
313	if (acl) {
314	int error = posix_acl_permission(idmap, inode, acl, mask);
315	posix_acl_release(acl);
316	return error;
317	}
318	#endif
319
320	return -EAGAIN;
321	}
322
323	/**
324	* acl_permission_check - perform basic UNIX permission checking
325	* @idmap: idmap of the mount the inode was found from
326	* @inode: inode to check permissions on
327	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
328	*
329	* This function performs the basic UNIX permission checking. Since this
330	* function may retrieve POSIX acls it needs to know whether it is called from a
331	* blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
332	*
333	* If the inode has been found through an idmapped mount the idmap of
334	* the vfsmount must be passed through @idmap. This function will then take
335	* care to map the inode according to @idmap before checking permissions.
336	* On non-idmapped mounts or if permission checking is to be performed on the
337	* raw inode simply passs @nop_mnt_idmap.
338	*/
339	static int acl_permission_check(struct mnt_idmap *idmap,
340	struct inode inode, int* mask)
341	{
342	unsigned int mode = inode->i_mode;
343	vfsuid_t vfsuid;
344
345	/ Are we the owner? If so, ACL's don't matter /
346	vfsuid = i_uid_into_vfsuid(idmap, inode);
347	if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
348	mask &= `7`;
349	mode >>= `6`;
350	return (mask & ~mode) ? -EACCES : `0`;
351	}
352
353	/ Do we have ACL's? /
354	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
355	int error = check_acl(idmap, inode, mask);
356	if (error != -EAGAIN)
357	return error;
358	}
359
360	/ Only RWX matters for group/other mode bits /
361	mask &= `7`;
362
363	/*
364	* Are the group permissions different from
365	* the other permissions in the bits we care
366	* about? Need to check group ownership if so.
367	*/
368	if (mask & (mode ^ (mode >> `3`))) {
369	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
370	if (vfsgid_in_group_p(vfsgid))
371	mode >>= `3`;
372	}
373
374	/ Bits in 'mode' clear that we require? /
375	return (mask & ~mode) ? -EACCES : `0`;
376	}
377
378	/**
379	* generic_permission - check for access rights on a Posix-like filesystem
380	* @idmap: idmap of the mount the inode was found from
381	* @inode: inode to check access rights for
382	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
383	* %MAY_NOT_BLOCK ...)
384	*
385	* Used to check for read/write/execute permissions on a file.
386	* We use "fsuid" for this, letting us set arbitrary permissions
387	* for filesystem access without changing the "normal" uids which
388	* are used for other things.
389	*
390	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
391	* request cannot be satisfied (eg. requires blocking or too much complexity).
392	* It would then be called again in ref-walk mode.
393	*
394	* If the inode has been found through an idmapped mount the idmap of
395	* the vfsmount must be passed through @idmap. This function will then take
396	* care to map the inode according to @idmap before checking permissions.
397	* On non-idmapped mounts or if permission checking is to be performed on the
398	* raw inode simply passs @nop_mnt_idmap.
399	*/
400	int generic_permission(struct mnt_idmap idmap, struct* inode *inode,
401	int mask)
402	{
403	int ret;
404
405	/*
406	* Do the basic permission checks.
407	*/
408	ret = acl_permission_check(idmap, inode, mask);
409	if (ret != -EACCES)
410	return ret;
411
412	if (S_ISDIR(inode->i_mode)) {
413	/ DACs are overridable for directories /
414	if (!(mask & MAY_WRITE))
415	if (capable_wrt_inode_uidgid(idmap, inode,
416	CAP_DAC_READ_SEARCH))
417	return `0`;
418	if (capable_wrt_inode_uidgid(idmap, inode,
419	CAP_DAC_OVERRIDE))
420	return `0`;
421	return -EACCES;
422	}
423
424	/*
425	* Searching includes executable on directories, else just read.
426	*/
427	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
428	if (mask == MAY_READ)
429	if (capable_wrt_inode_uidgid(idmap, inode,
430	CAP_DAC_READ_SEARCH))
431	return `0`;
432	/*
433	* Read/write DACs are always overridable.
434	* Executable DACs are overridable when there is
435	* at least one exec bit set.
436	*/
437	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
438	if (capable_wrt_inode_uidgid(idmap, inode,
439	CAP_DAC_OVERRIDE))
440	return `0`;
441
442	return -EACCES;
443	}
444	EXPORT_SYMBOL(generic_permission);
445
446	/**
447	* do_inode_permission - UNIX permission checking
448	* @idmap: idmap of the mount the inode was found from
449	* @inode: inode to check permissions on
450	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
451	*
452	* We _really_ want to just do "generic_permission()" without
453	* even looking at the inode->i_op values. So we keep a cache
454	* flag in inode->i_opflags, that says "this has not special
455	* permission function, use the fast case".
456	*/
457	static inline int do_inode_permission(struct mnt_idmap *idmap,
458	struct inode inode, int* mask)
459	{
460	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
461	if (likely(inode->i_op->permission))
462	return inode->i_op->permission(idmap, inode, mask);
463
464	/ This gets set once for the inode lifetime /
465	spin_lock(lock: &inode->i_lock);
466	inode->i_opflags \|= IOP_FASTPERM;
467	spin_unlock(lock: &inode->i_lock);
468	}
469	return generic_permission(idmap, inode, mask);
470	}
471
472	/**
473	* sb_permission - Check superblock-level permissions
474	* @sb: Superblock of inode to check permission on
475	* @inode: Inode to check permission on
476	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
477	*
478	* Separate out file-system wide checks from inode-specific permission checks.
479	*/
480	static int sb_permission(struct super_block sb, struct* inode inode, int* mask)
481	{
482	if (unlikely(mask & MAY_WRITE)) {
483	umode_t mode = inode->i_mode;
484
485	/ Nobody gets write access to a read-only fs. /
486	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
487	return -EROFS;
488	}
489	return `0`;
490	}
491
492	/**
493	* inode_permission - Check for access rights to a given inode
494	* @idmap: idmap of the mount the inode was found from
495	* @inode: Inode to check permission on
496	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
497	*
498	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
499	* this, letting us set arbitrary permissions for filesystem access without
500	* changing the "normal" UIDs which are used for other things.
501	*
502	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
503	*/
504	int inode_permission(struct mnt_idmap *idmap,
505	struct inode inode, int* mask)
506	{
507	int retval;
508
509	retval = sb_permission(sb: inode->i_sb, inode, mask);
510	if (retval)
511	return retval;
512
513	if (unlikely(mask & MAY_WRITE)) {
514	/*
515	* Nobody gets write access to an immutable file.
516	*/
517	if (IS_IMMUTABLE(inode))
518	return -EPERM;
519
520	/*
521	* Updating mtime will likely cause i_uid and i_gid to be
522	* written back improperly if their true value is unknown
523	* to the vfs.
524	*/
525	if (HAS_UNMAPPED_ID(idmap, inode))
526	return -EACCES;
527	}
528
529	retval = do_inode_permission(idmap, inode, mask);
530	if (retval)
531	return retval;
532
533	retval = devcgroup_inode_permission(inode, mask);
534	if (retval)
535	return retval;
536
537	return security_inode_permission(inode, mask);
538	}
539	EXPORT_SYMBOL(inode_permission);
540
541	/**
542	* path_get - get a reference to a path
543	* @path: path to get the reference to
544	*
545	* Given a path increment the reference count to the dentry and the vfsmount.
546	*/
547	void path_get(const struct path *path)
548	{
549	mntget(mnt: path->mnt);
550	dget(dentry: path->dentry);
551	}
552	EXPORT_SYMBOL(path_get);
553
554	/**
555	* path_put - put a reference to a path
556	* @path: path to put the reference to
557	*
558	* Given a path decrement the reference count to the dentry and the vfsmount.
559	*/
560	void path_put(const struct path *path)
561	{
562	dput(path->dentry);
563	mntput(mnt: path->mnt);
564	}
565	EXPORT_SYMBOL(path_put);
566
567	#define EMBEDDED_LEVELS 2
568	struct nameidata {
569	struct path path;
570	struct qstr last;
571	struct path root;
572	struct inode inode; /* path.dentry.d_inode /
573	unsigned int flags, state;
574	unsigned seq, next_seq, m_seq, r_seq;
575	int last_type;
576	unsigned depth;
577	int total_link_count;
578	struct saved {
579	struct path link;
580	struct delayed_call done;
581	const char *name;
582	unsigned seq;
583	} *stack, internal[EMBEDDED_LEVELS];
584	struct filename *name;
585	struct nameidata *saved;
586	unsigned root_seq;
587	int dfd;
588	vfsuid_t dir_vfsuid;
589	umode_t dir_mode;
590	} __randomize_layout;
591
592	#define ND_ROOT_PRESET 1
593	#define ND_ROOT_GRABBED 2
594	#define ND_JUMPED 4
595
596	static void __set_nameidata(struct nameidata p, int* dfd, struct filename *name)
597	{
598	struct nameidata *old = current->nameidata;
599	p->stack = p->internal;
600	p->depth = `0`;
601	p->dfd = dfd;
602	p->name = name;
603	p->path.mnt = NULL;
604	p->path.dentry = NULL;
605	p->total_link_count = old ? old->total_link_count : `0`;
606	p->saved = old;
607	current->nameidata = p;
608	}
609
610	static inline void set_nameidata(struct nameidata p, int* dfd, struct filename *name,
611	const struct path *root)
612	{
613	__set_nameidata(p, dfd, name);
614	p->state = `0`;
615	if (unlikely(root)) {
616	p->state = ND_ROOT_PRESET;
617	p->root = *root;
618	}
619	}
620
621	static void restore_nameidata(void)
622	{
623	struct nameidata now = current->nameidata, old = now->saved;
624
625	current->nameidata = old;
626	if (old)
627	old->total_link_count = now->total_link_count;
628	if (now->stack != now->internal)
629	kfree(objp: now->stack);
630	}
631
632	static bool nd_alloc_stack(struct nameidata *nd)
633	{
634	struct saved *p;
635
636	p= kmalloc_array(MAXSYMLINKS, size: sizeof(struct saved),
637	flags: nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
638	if (unlikely(!p))
639	return false;
640	memcpy(p, nd->internal, sizeof(nd->internal));
641	nd->stack = p;
642	return true;
643	}
644
645	/**
646	* path_connected - Verify that a dentry is below mnt.mnt_root
647	* @mnt: The mountpoint to check.
648	* @dentry: The dentry to check.
649	*
650	* Rename can sometimes move a file or directory outside of a bind
651	* mount, path_connected allows those cases to be detected.
652	*/
653	static bool path_connected(struct vfsmount mnt, struct* dentry *dentry)
654	{
655	struct super_block *sb = mnt->mnt_sb;
656
657	/ Bind mounts can have disconnected paths /
658	if (mnt->mnt_root == sb->s_root)
659	return true;
660
661	return is_subdir(dentry, mnt->mnt_root);
662	}
663
664	static void drop_links(struct nameidata *nd)
665	{
666	int i = nd->depth;
667	while (i--) {
668	struct saved *last = nd->stack + i;
669	do_delayed_call(call: &last->done);
670	clear_delayed_call(call: &last->done);
671	}
672	}
673
674	static void leave_rcu(struct nameidata *nd)
675	{
676	nd->flags &= ~LOOKUP_RCU;
677	nd->seq = nd->next_seq = `0`;
678	rcu_read_unlock();
679	}
680
681	static void terminate_walk(struct nameidata *nd)
682	{
683	drop_links(nd);
684	if (!(nd->flags & LOOKUP_RCU)) {
685	int i;
686	path_put(&nd->path);
687	for (i = `0`; i < nd->depth; i++)
688	path_put(&nd->stack[i].link);
689	if (nd->state & ND_ROOT_GRABBED) {
690	path_put(&nd->root);
691	nd->state &= ~ND_ROOT_GRABBED;
692	}
693	} else {
694	leave_rcu(nd);
695	}
696	nd->depth = `0`;
697	nd->path.mnt = NULL;
698	nd->path.dentry = NULL;
699	}
700
701	/ path_put is needed afterwards regardless of success or failure /
702	static bool __legitimize_path(struct path path, unsigned* seq, unsigned mseq)
703	{
704	int res = __legitimize_mnt(path->mnt, mseq);
705	if (unlikely(res)) {
706	if (res > `0`)
707	path->mnt = NULL;
708	path->dentry = NULL;
709	return false;
710	}
711	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
712	path->dentry = NULL;
713	return false;
714	}
715	return !read_seqcount_retry(&path->dentry->d_seq, seq);
716	}
717
718	static inline bool legitimize_path(struct nameidata *nd,
719	struct path path, unsigned* seq)
720	{
721	return __legitimize_path(path, seq, mseq: nd->m_seq);
722	}
723
724	static bool legitimize_links(struct nameidata *nd)
725	{
726	int i;
727	if (unlikely(nd->flags & LOOKUP_CACHED)) {
728	drop_links(nd);
729	nd->depth = `0`;
730	return false;
731	}
732	for (i = `0`; i < nd->depth; i++) {
733	struct saved *last = nd->stack + i;
734	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
735	drop_links(nd);
736	nd->depth = i + `1`;
737	return false;
738	}
739	}
740	return true;
741	}
742
743	static bool legitimize_root(struct nameidata *nd)
744	{
745	/ Nothing to do if nd->root is zero or is managed by the VFS user. /
746	if (!nd->root.mnt \|\| (nd->state & ND_ROOT_PRESET))
747	return true;
748	nd->state \|= ND_ROOT_GRABBED;
749	return legitimize_path(nd, path: &nd->root, seq: nd->root_seq);
750	}
751
752	/*
753	* Path walking has 2 modes, rcu-walk and ref-walk (see
754	* Documentation/filesystems/path-lookup.txt). In situations when we can't
755	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
756	* normal reference counts on dentries and vfsmounts to transition to ref-walk
757	* mode. Refcounts are grabbed at the last known good point before rcu-walk
758	* got stuck, so ref-walk may continue from there. If this is not successful
759	* (eg. a seqcount has changed), then failure is returned and it's up to caller
760	* to restart the path walk from the beginning in ref-walk mode.
761	*/
762
763	/**
764	* try_to_unlazy - try to switch to ref-walk mode.
765	* @nd: nameidata pathwalk data
766	* Returns: true on success, false on failure
767	*
768	* try_to_unlazy attempts to legitimize the current nd->path and nd->root
769	* for ref-walk mode.
770	* Must be called from rcu-walk context.
771	* Nothing should touch nameidata between try_to_unlazy() failure and
772	* terminate_walk().
773	*/
774	static bool try_to_unlazy(struct nameidata *nd)
775	{
776	struct dentry *parent = nd->path.dentry;
777
778	BUG_ON(!(nd->flags & LOOKUP_RCU));
779
780	if (unlikely(!legitimize_links(nd)))
781	goto out1;
782	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
783	goto out;
784	if (unlikely(!legitimize_root(nd)))
785	goto out;
786	leave_rcu(nd);
787	BUG_ON(nd->inode != parent->d_inode);
788	return true;
789
790	out1:
791	nd->path.mnt = NULL;
792	nd->path.dentry = NULL;
793	out:
794	leave_rcu(nd);
795	return false;
796	}
797
798	/**
799	* try_to_unlazy_next - try to switch to ref-walk mode.
800	* @nd: nameidata pathwalk data
801	* @dentry: next dentry to step into
802	* Returns: true on success, false on failure
803	*
804	* Similar to try_to_unlazy(), but here we have the next dentry already
805	* picked by rcu-walk and want to legitimize that in addition to the current
806	* nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context.
807	* Nothing should touch nameidata between try_to_unlazy_next() failure and
808	* terminate_walk().
809	*/
810	static bool try_to_unlazy_next(struct nameidata nd, struct* dentry *dentry)
811	{
812	int res;
813	BUG_ON(!(nd->flags & LOOKUP_RCU));
814
815	if (unlikely(!legitimize_links(nd)))
816	goto out2;
817	res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
818	if (unlikely(res)) {
819	if (res > `0`)
820	goto out2;
821	goto out1;
822	}
823	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
824	goto out1;
825
826	/*
827	* We need to move both the parent and the dentry from the RCU domain
828	* to be properly refcounted. And the sequence number in the dentry
829	* validates both dentry counters, since we checked the sequence
830	* number of the parent after we got the child sequence number. So we
831	* know the parent must still be valid if the child sequence number is
832	*/
833	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
834	goto out;
835	if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
836	goto out_dput;
837	/*
838	* Sequence counts matched. Now make sure that the root is
839	* still valid and get it if required.
840	*/
841	if (unlikely(!legitimize_root(nd)))
842	goto out_dput;
843	leave_rcu(nd);
844	return true;
845
846	out2:
847	nd->path.mnt = NULL;
848	out1:
849	nd->path.dentry = NULL;
850	out:
851	leave_rcu(nd);
852	return false;
853	out_dput:
854	leave_rcu(nd);
855	dput(dentry);
856	return false;
857	}
858
859	static inline int d_revalidate(struct dentry dentry, unsigned* int flags)
860	{
861	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
862	return dentry->d_op->d_revalidate(dentry, flags);
863	else
864	return `1`;
865	}
866
867	/**
868	* complete_walk - successful completion of path walk
869	* @nd: pointer nameidata
870	*
871	* If we had been in RCU mode, drop out of it and legitimize nd->path.
872	* Revalidate the final result, unless we'd already done that during
873	* the path walk or the filesystem doesn't ask for it. Return 0 on
874	* success, -error on failure. In case of failure caller does not
875	* need to drop nd->path.
876	*/
877	static int complete_walk(struct nameidata *nd)
878	{
879	struct dentry *dentry = nd->path.dentry;
880	int status;
881
882	if (nd->flags & LOOKUP_RCU) {
883	/*
884	* We don't want to zero nd->root for scoped-lookups or
885	* externally-managed nd->root.
886	*/
887	if (!(nd->state & ND_ROOT_PRESET))
888	if (!(nd->flags & LOOKUP_IS_SCOPED))
889	nd->root.mnt = NULL;
890	nd->flags &= ~LOOKUP_CACHED;
891	if (!try_to_unlazy(nd))
892	return -ECHILD;
893	}
894
895	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
896	/*
897	* While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
898	* ever step outside the root during lookup" and should already
899	* be guaranteed by the rest of namei, we want to avoid a namei
900	* BUG resulting in userspace being given a path that was not
901	* scoped within the root at some point during the lookup.
902	*
903	* So, do a final sanity-check to make sure that in the
904	* worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
905	* we won't silently return an fd completely outside of the
906	* requested root to userspace.
907	*
908	* Userspace could move the path outside the root after this
909	* check, but as discussed elsewhere this is not a concern (the
910	* resolved file was inside the root at some point).
911	*/
912	if (!path_is_under(&nd->path, &nd->root))
913	return -EXDEV;
914	}
915
916	if (likely(!(nd->state & ND_JUMPED)))
917	return `0`;
918
919	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
920	return `0`;
921
922	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
923	if (status > `0`)
924	return `0`;
925
926	if (!status)
927	status = -ESTALE;
928
929	return status;
930	}
931
932	static int set_root(struct nameidata *nd)
933	{
934	struct fs_struct *fs = current->fs;
935
936	/*
937	* Jumping to the real root in a scoped-lookup is a BUG in namei, but we
938	* still have to ensure it doesn't happen because it will cause a breakout
939	* from the dirfd.
940	*/
941	if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
942	return -ENOTRECOVERABLE;
943
944	if (nd->flags & LOOKUP_RCU) {
945	unsigned seq;
946
947	do {
948	seq = read_seqcount_begin(&fs->seq);
949	nd->root = fs->root;
950	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
951	} while (read_seqcount_retry(&fs->seq, seq));
952	} else {
953	get_fs_root(fs, root: &nd->root);
954	nd->state \|= ND_ROOT_GRABBED;
955	}
956	return `0`;
957	}
958
959	static int nd_jump_root(struct nameidata *nd)
960	{
961	if (unlikely(nd->flags & LOOKUP_BENEATH))
962	return -EXDEV;
963	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
964	/ Absolute path arguments to path_init() are allowed. /
965	if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
966	return -EXDEV;
967	}
968	if (!nd->root.mnt) {
969	int error = set_root(nd);
970	if (error)
971	return error;
972	}
973	if (nd->flags & LOOKUP_RCU) {
974	struct dentry *d;
975	nd->path = nd->root;
976	d = nd->path.dentry;
977	nd->inode = d->d_inode;
978	nd->seq = nd->root_seq;
979	if (read_seqcount_retry(&d->d_seq, nd->seq))
980	return -ECHILD;
981	} else {
982	path_put(&nd->path);
983	nd->path = nd->root;
984	path_get(&nd->path);
985	nd->inode = nd->path.dentry->d_inode;
986	}
987	nd->state \|= ND_JUMPED;
988	return `0`;
989	}
990
991	/*
992	* Helper to directly jump to a known parsed path from ->get_link,
993	* caller must have taken a reference to path beforehand.
994	*/
995	int nd_jump_link(const struct path *path)
996	{
997	int error = -ELOOP;
998	struct nameidata *nd = current->nameidata;
999
1000	if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
1001	goto err;
1002
1003	error = -EXDEV;
1004	if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
1005	if (nd->path.mnt != path->mnt)
1006	goto err;
1007	}
1008	/ Not currently safe for scoped-lookups. /
1009	if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
1010	goto err;
1011
1012	path_put(&nd->path);
1013	nd->path = *path;
1014	nd->inode = nd->path.dentry->d_inode;
1015	nd->state \|= ND_JUMPED;
1016	return `0`;
1017
1018	err:
1019	path_put(path);
1020	return error;
1021	}
1022
1023	static inline void put_link(struct nameidata *nd)
1024	{
1025	struct saved *last = nd->stack + --nd->depth;
1026	do_delayed_call(call: &last->done);
1027	if (!(nd->flags & LOOKUP_RCU))
1028	path_put(&last->link);
1029	}
1030
1031	static int sysctl_protected_symlinks __read_mostly;
1032	static int sysctl_protected_hardlinks __read_mostly;
1033	static int sysctl_protected_fifos __read_mostly;
1034	static int sysctl_protected_regular __read_mostly;
1035
#ifdef CONFIG_SYSCTL
/*
 * sysctl entries for the protection knobs above, registered under "fs".
 * protected_symlinks/protected_hardlinks accept 0..1,
 * protected_fifos/protected_regular accept 0..2.
 */
static struct ctl_table namei_sysctls[] = {
	{
		.procname	= "protected_symlinks",
		.data		= &sysctl_protected_symlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_hardlinks",
		.data		= &sysctl_protected_hardlinks,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "protected_fifos",
		.data		= &sysctl_protected_fifos,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "protected_regular",
		.data		= &sysctl_protected_regular,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{ }
};

/* Register the table above at fs_initcall time. */
static int __init init_fs_namei_sysctls(void)
{
	register_sysctl_init("fs", namei_sysctls);
	return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */
1085
1086	/**
1087	* may_follow_link - Check symlink following for unsafe situations
1088	* @nd: nameidata pathwalk data
1089	* @inode: Used for idmapping.
1090	*
1091	* In the case of the sysctl_protected_symlinks sysctl being enabled,
1092	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
1093	* in a sticky world-writable directory. This is to protect privileged
1094	* processes from failing races against path names that may change out
1095	* from under them by way of other users creating malicious symlinks.
1096	* It will permit symlinks to be followed only when outside a sticky
1097	* world-writable directory, or when the uid of the symlink and follower
1098	* match, or when the directory owner matches the symlink's owner.
1099	*
1100	* Returns 0 if following the symlink is allowed, -ve on error.
1101	*/
1102	static inline int may_follow_link(struct nameidata nd, const* struct inode *inode)
1103	{
1104	struct mnt_idmap *idmap;
1105	vfsuid_t vfsuid;
1106
1107	if (!sysctl_protected_symlinks)
1108	return `0`;
1109
1110	idmap = mnt_idmap(mnt: nd->path.mnt);
1111	vfsuid = i_uid_into_vfsuid(idmap, inode);
1112	/ Allowed if owner and follower match. /
1113	if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
1114	return `0`;
1115
1116	/ Allowed if parent directory not sticky and world-writable. /
1117	if ((nd->dir_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
1118	return `0`;
1119
1120	/ Allowed if parent directory and link owner match. /
1121	if (vfsuid_valid(uid: nd->dir_vfsuid) && vfsuid_eq(left: nd->dir_vfsuid, right: vfsuid))
1122	return `0`;
1123
1124	if (nd->flags & LOOKUP_RCU)
1125	return -ECHILD;
1126
1127	audit_inode(name: nd->name, dentry: nd->stack[`0`].link.dentry, aflags: `0`);
1128	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "follow_link");
1129	return -EACCES;
1130	}
1131
1132	/**
1133	* safe_hardlink_source - Check for safe hardlink conditions
1134	* @idmap: idmap of the mount the inode was found from
1135	* @inode: the source inode to hardlink from
1136	*
1137	* Return false if at least one of the following conditions:
1138	* - inode is not a regular file
1139	* - inode is setuid
1140	* - inode is setgid and group-exec
1141	* - access failure for read and write
1142	*
1143	* Otherwise returns true.
1144	*/
1145	static bool safe_hardlink_source(struct mnt_idmap *idmap,
1146	struct inode *inode)
1147	{
1148	umode_t mode = inode->i_mode;
1149
1150	/ Special files should not get pinned to the filesystem. /
1151	if (!S_ISREG(mode))
1152	return false;
1153
1154	/ Setuid files should not get pinned to the filesystem. /
1155	if (mode & S_ISUID)
1156	return false;
1157
1158	/ Executable setgid files should not get pinned to the filesystem. /
1159	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
1160	return false;
1161
1162	/ Hardlinking to unreadable or unwritable sources is dangerous. /
1163	if (inode_permission(idmap, inode, MAY_READ \| MAY_WRITE))
1164	return false;
1165
1166	return true;
1167	}
1168
1169	/**
1170	* may_linkat - Check permissions for creating a hardlink
1171	* @idmap: idmap of the mount the inode was found from
1172	* @link: the source to hardlink from
1173	*
1174	* Block hardlink when all of:
1175	* - sysctl_protected_hardlinks enabled
1176	* - fsuid does not match inode
1177	* - hardlink source is unsafe (see safe_hardlink_source() above)
1178	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
1179	*
1180	* If the inode has been found through an idmapped mount the idmap of
1181	* the vfsmount must be passed through @idmap. This function will then take
1182	* care to map the inode according to @idmap before checking permissions.
1183	* On non-idmapped mounts or if permission checking is to be performed on the
1184	* raw inode simply pass @nop_mnt_idmap.
1185	*
1186	* Returns 0 if successful, -ve on error.
1187	*/
1188	int may_linkat(struct mnt_idmap idmap, const* struct path *link)
1189	{
1190	struct inode *inode = link->dentry->d_inode;
1191
1192	/ Inode writeback is not safe when the uid or gid are invalid. /
1193	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
1194	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
1195	return -EOVERFLOW;
1196
1197	if (!sysctl_protected_hardlinks)
1198	return `0`;
1199
1200	/ Source inode owner (or CAP_FOWNER) can hardlink all they like,*
1201	* otherwise, it must be a safe source.
1202	*/
1203	if (safe_hardlink_source(idmap, inode) \|\|
1204	inode_owner_or_capable(idmap, inode))
1205	return `0`;
1206
1207	audit_log_path_denied(AUDIT_ANOM_LINK, operation: "linkat");
1208	return -EPERM;
1209	}
1210
1211	/**
1212	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
1213	* should be allowed, or not, on files that already
1214	* exist.
1215	* @idmap: idmap of the mount the inode was found from
1216	* @nd: nameidata pathwalk data
1217	* @inode: the inode of the file to open
1218	*
1219	* Block an O_CREAT open of a FIFO (or a regular file) when:
1220	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
1221	* - the file already exists
1222	* - we are in a sticky directory
1223	* - we don't own the file
1224	* - the owner of the directory doesn't own the file
1225	* - the directory is world writable
1226	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
1227	* the directory doesn't have to be world writable: being group writable will
1228	* be enough.
1229	*
1230	* If the inode has been found through an idmapped mount the idmap of
1231	* the vfsmount must be passed through @idmap. This function will then take
1232	* care to map the inode according to @idmap before checking permissions.
1233	* On non-idmapped mounts or if permission checking is to be performed on the
1234	* raw inode simply pass @nop_mnt_idmap.
1235	*
1236	* Returns 0 if the open is allowed, -ve on error.
1237	*/
1238	static int may_create_in_sticky(struct mnt_idmap *idmap,
1239	struct nameidata nd, struct* inode *const inode)
1240	{
1241	umode_t dir_mode = nd->dir_mode;
1242	vfsuid_t dir_vfsuid = nd->dir_vfsuid;
1243
1244	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) \|\|
1245	(!sysctl_protected_regular && S_ISREG(inode->i_mode)) \|\|
1246	likely(!(dir_mode & S_ISVTX)) \|\|
1247	vfsuid_eq(left: i_uid_into_vfsuid(idmap, inode), right: dir_vfsuid) \|\|
1248	vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), current_fsuid()))
1249	return `0`;
1250
1251	if (likely(dir_mode & `0002`) \|\|
1252	(dir_mode & `0020` &&
1253	((sysctl_protected_fifos >= `2` && S_ISFIFO(inode->i_mode)) \|\|
1254	(sysctl_protected_regular >= `2` && S_ISREG(inode->i_mode))))) {
1255	const char *operation = S_ISFIFO(inode->i_mode) ?
1256	"sticky_create_fifo" :
1257	"sticky_create_regular";
1258	audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
1259	return -EACCES;
1260	}
1261	return `0`;
1262	}
1263
1264	/*
1265	* follow_up - Find the mountpoint of path's vfsmount
1266	*
1267	* Given a path, find the mountpoint of its source file system.
1268	* Replace @path with the path of the mountpoint in the parent mount.
1269	* Up is towards /.
1270	*
1271	* Return 1 if we went up a level and 0 if we were already at the
1272	* root.
1273	*/
1274	int follow_up(struct path *path)
1275	{
1276	struct mount *mnt = real_mount(mnt: path->mnt);
1277	struct mount *parent;
1278	struct dentry *mountpoint;
1279
1280	read_seqlock_excl(sl: &mount_lock);
1281	parent = mnt->mnt_parent;
1282	if (parent == mnt) {
1283	read_sequnlock_excl(sl: &mount_lock);
1284	return `0`;
1285	}
1286	mntget(mnt: &parent->mnt);
1287	mountpoint = dget(dentry: mnt->mnt_mountpoint);
1288	read_sequnlock_excl(sl: &mount_lock);
1289	dput(path->dentry);
1290	path->dentry = mountpoint;
1291	mntput(mnt: path->mnt);
1292	path->mnt = &parent->mnt;
1293	return `1`;
1294	}
1295	EXPORT_SYMBOL(follow_up);
1296
1297	static bool choose_mountpoint_rcu(struct mount m, const* struct path *root,
1298	struct path path, unsigned* *seqp)
1299	{
1300	while (mnt_has_parent(mnt: m)) {
1301	struct dentry *mountpoint = m->mnt_mountpoint;
1302
1303	m = m->mnt_parent;
1304	if (unlikely(root->dentry == mountpoint &&
1305	root->mnt == &m->mnt))
1306	break;
1307	if (mountpoint != m->mnt.mnt_root) {
1308	path->mnt = &m->mnt;
1309	path->dentry = mountpoint;
1310	*seqp = read_seqcount_begin(&mountpoint->d_seq);
1311	return true;
1312	}
1313	}
1314	return false;
1315	}
1316
1317	static bool choose_mountpoint(struct mount m, const* struct path *root,
1318	struct path *path)
1319	{
1320	bool found;
1321
1322	rcu_read_lock();
1323	while (`1`) {
1324	unsigned seq, mseq = read_seqbegin(sl: &mount_lock);
1325
1326	found = choose_mountpoint_rcu(m, root, path, seqp: &seq);
1327	if (unlikely(!found)) {
1328	if (!read_seqretry(sl: &mount_lock, start: mseq))
1329	break;
1330	} else {
1331	if (likely(__legitimize_path(path, seq, mseq)))
1332	break;
1333	rcu_read_unlock();
1334	path_put(path);
1335	rcu_read_lock();
1336	}
1337	}
1338	rcu_read_unlock();
1339	return found;
1340	}
1341
1342	/*
1343	* Perform an automount
1344	* - return -EISDIR to tell follow_managed() to stop and return the path we
1345	* were called with.
1346	*/
1347	static int follow_automount(struct path path, int* count, unsigned* lookup_flags)
1348	{
1349	struct dentry *dentry = path->dentry;
1350
1351	/ We don't want to mount if someone's just doing a stat -*
1352	* unless they're stat'ing a directory and appended a '/' to
1353	* the name.
1354	*
1355	* We do, however, want to mount if someone wants to open or
1356	* create a file of any type under the mountpoint, wants to
1357	* traverse through the mountpoint or wants to open the
1358	* mounted directory. Also, autofs may mark negative dentries
1359	* as being automount points. These will need the attentions
1360	* of the daemon to instantiate them before they can be used.
1361	*/
1362	if (!(lookup_flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
1363	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
1364	dentry->d_inode)
1365	return -EISDIR;
1366
1367	if (count && (*count)++ >= MAXSYMLINKS)
1368	return -ELOOP;
1369
1370	return finish_automount(dentry->d_op->d_automount(path), path);
1371	}
1372
1373	/*
1374	* mount traversal - out-of-line part. One note on ->d_flags accesses -
1375	* dentries are pinned but not locked here, so negative dentry can go
1376	* positive right under us. Use of smp_load_acquire() provides a barrier
1377	* sufficient for ->d_inode and ->d_flags consistency.
1378	*/
1379	static int __traverse_mounts(struct path path, unsigned* flags, bool *jumped,
1380	int count, unsigned* lookup_flags)
1381	{
1382	struct vfsmount *mnt = path->mnt;
1383	bool need_mntput = false;
1384	int ret = `0`;
1385
1386	while (flags & DCACHE_MANAGED_DENTRY) {
1387	/ Allow the filesystem to manage the transit without i_mutex*
1388	* being held. */
1389	if (flags & DCACHE_MANAGE_TRANSIT) {
1390	ret = path->dentry->d_op->d_manage(path, false);
1391	flags = smp_load_acquire(&path->dentry->d_flags);
1392	if (ret < `0`)
1393	break;
1394	}
1395
1396	if (flags & DCACHE_MOUNTED) { // something's mounted on it..
1397	struct vfsmount *mounted = lookup_mnt(path);
1398	if (mounted) { // ... in our namespace
1399	dput(path->dentry);
1400	if (need_mntput)
1401	mntput(mnt: path->mnt);
1402	path->mnt = mounted;
1403	path->dentry = dget(dentry: mounted->mnt_root);
1404	// here we know it's positive
1405	flags = path->dentry->d_flags;
1406	need_mntput = true;
1407	continue;
1408	}
1409	}
1410
1411	if (!(flags & DCACHE_NEED_AUTOMOUNT))
1412	break;
1413
1414	// uncovered automount point
1415	ret = follow_automount(path, count, lookup_flags);
1416	flags = smp_load_acquire(&path->dentry->d_flags);
1417	if (ret < `0`)
1418	break;
1419	}
1420
1421	if (ret == -EISDIR)
1422	ret = `0`;
1423	// possible if you race with several mount --move
1424	if (need_mntput && path->mnt == mnt)
1425	mntput(mnt: path->mnt);
1426	if (!ret && unlikely(d_flags_negative(flags)))
1427	ret = -ENOENT;
1428	*jumped = need_mntput;
1429	return ret;
1430	}
1431
1432	static inline int traverse_mounts(struct path path, bool jumped,
1433	int count, unsigned* lookup_flags)
1434	{
1435	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
1436
1437	/ fastpath /
1438	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
1439	*jumped = false;
1440	if (unlikely(d_flags_negative(flags)))
1441	return -ENOENT;
1442	return `0`;
1443	}
1444	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
1445	}
1446
1447	int follow_down_one(struct path *path)
1448	{
1449	struct vfsmount *mounted;
1450
1451	mounted = lookup_mnt(path);
1452	if (mounted) {
1453	dput(path->dentry);
1454	mntput(mnt: path->mnt);
1455	path->mnt = mounted;
1456	path->dentry = dget(dentry: mounted->mnt_root);
1457	return `1`;
1458	}
1459	return `0`;
1460	}
1461	EXPORT_SYMBOL(follow_down_one);
1462
1463	/*
1464	* Follow down to the covering mount currently visible to userspace. At each
1465	* point, the filesystem owning that dentry may be queried as to whether the
1466	* caller is permitted to proceed or not.
1467	*/
1468	int follow_down(struct path path, unsigned* int flags)
1469	{
1470	struct vfsmount *mnt = path->mnt;
1471	bool jumped;
1472	int ret = traverse_mounts(path, jumped: &jumped, NULL, lookup_flags: flags);
1473
1474	if (path->mnt != mnt)
1475	mntput(mnt);
1476	return ret;
1477	}
1478	EXPORT_SYMBOL(follow_down);
1479
1480	/*
1481	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1482	* we meet a managed dentry that would need blocking.
1483	*/
1484	static bool __follow_mount_rcu(struct nameidata nd, struct* path *path)
1485	{
1486	struct dentry *dentry = path->dentry;
1487	unsigned int flags = dentry->d_flags;
1488
1489	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
1490	return true;
1491
1492	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1493	return false;
1494
1495	for (;;) {
1496	/*
1497	* Don't forget we might have a non-mountpoint managed dentry
1498	* that wants to block transit.
1499	*/
1500	if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
1501	int res = dentry->d_op->d_manage(path, true);
1502	if (res)
1503	return res == -EISDIR;
1504	flags = dentry->d_flags;
1505	}
1506
1507	if (flags & DCACHE_MOUNTED) {
1508	struct mount *mounted = __lookup_mnt(path->mnt, dentry);
1509	if (mounted) {
1510	path->mnt = &mounted->mnt;
1511	dentry = path->dentry = mounted->mnt.mnt_root;
1512	nd->state \|= ND_JUMPED;
1513	nd->next_seq = read_seqcount_begin(&dentry->d_seq);
1514	flags = dentry->d_flags;
1515	// makes sure that non-RCU pathwalk could reach
1516	// this state.
1517	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1518	return false;
1519	continue;
1520	}
1521	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1522	return false;
1523	}
1524	return !(flags & DCACHE_NEED_AUTOMOUNT);
1525	}
1526	}
1527
1528	static inline int handle_mounts(struct nameidata nd, struct* dentry *dentry,
1529	struct path *path)
1530	{
1531	bool jumped;
1532	int ret;
1533
1534	path->mnt = nd->path.mnt;
1535	path->dentry = dentry;
1536	if (nd->flags & LOOKUP_RCU) {
1537	unsigned int seq = nd->next_seq;
1538	if (likely(__follow_mount_rcu(nd, path)))
1539	return `0`;
1540	// path and nd->next_seq might've been clobbered*
1541	path->mnt = nd->path.mnt;
1542	path->dentry = dentry;
1543	nd->next_seq = seq;
1544	if (!try_to_unlazy_next(nd, dentry))
1545	return -ECHILD;
1546	}
1547	ret = traverse_mounts(path, jumped: &jumped, count: &nd->total_link_count, lookup_flags: nd->flags);
1548	if (jumped) {
1549	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1550	ret = -EXDEV;
1551	else
1552	nd->state \|= ND_JUMPED;
1553	}
1554	if (unlikely(ret)) {
1555	dput(path->dentry);
1556	if (path->mnt != nd->path.mnt)
1557	mntput(mnt: path->mnt);
1558	}
1559	return ret;
1560	}
1561
/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 *
 * If revalidation returns 0 the dentry is invalidated and released and
 * ERR_PTR(0), i.e. NULL, is returned; a negative revalidation result is
 * returned as an ERR_PTR after releasing the dentry.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry = d_lookup(dir, name);
	if (dentry) {
		int error = d_revalidate(dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error)
				d_invalidate(dentry);
			dput(dentry);
			return ERR_PTR(error);
		}
	}
	return dentry;
}
1582
1583	/*
1584	* Parent directory has inode locked exclusive. This is one
1585	* and only case when ->lookup() gets called on non in-lookup
1586	* dentries - as the matter of fact, this only gets called
1587	* when directory is guaranteed to have no in-lookup children
1588	* at all.
1589	*/
1590	struct dentry lookup_one_qstr_excl(const* struct qstr *name,
1591	struct dentry *base,
1592	unsigned int flags)
1593	{
1594	struct dentry *dentry = lookup_dcache(name, dir: base, flags);
1595	struct dentry *old;
1596	struct inode *dir = base->d_inode;
1597
1598	if (dentry)
1599	return dentry;
1600
1601	/ Don't create child dentry for a dead directory. /
1602	if (unlikely(IS_DEADDIR(dir)))
1603	return ERR_PTR(error: -ENOENT);
1604
1605	dentry = d_alloc(base, name);
1606	if (unlikely(!dentry))
1607	return ERR_PTR(error: -ENOMEM);
1608
1609	old = dir->i_op->lookup(dir, dentry, flags);
1610	if (unlikely(old)) {
1611	dput(dentry);
1612	dentry = old;
1613	}
1614	return dentry;
1615	}
1616	EXPORT_SYMBOL(lookup_one_qstr_excl);
1617
1618	static struct dentry lookup_fast(struct* nameidata *nd)
1619	{
1620	struct dentry dentry, parent = nd->path.dentry;
1621	int status = `1`;
1622
1623	/*
1624	* Rename seqlock is not required here because in the off chance
1625	* of a false negative due to a concurrent rename, the caller is
1626	* going to fall back to non-racy lookup.
1627	*/
1628	if (nd->flags & LOOKUP_RCU) {
1629	dentry = __d_lookup_rcu(parent, name: &nd->last, seq: &nd->next_seq);
1630	if (unlikely(!dentry)) {
1631	if (!try_to_unlazy(nd))
1632	return ERR_PTR(error: -ECHILD);
1633	return NULL;
1634	}
1635
1636	/*
1637	* This sequence count validates that the parent had no
1638	* changes while we did the lookup of the dentry above.
1639	*/
1640	if (read_seqcount_retry(&parent->d_seq, nd->seq))
1641	return ERR_PTR(error: -ECHILD);
1642
1643	status = d_revalidate(dentry, flags: nd->flags);
1644	if (likely(status > `0`))
1645	return dentry;
1646	if (!try_to_unlazy_next(nd, dentry))
1647	return ERR_PTR(error: -ECHILD);
1648	if (status == -ECHILD)
1649	/ we'd been told to redo it in non-rcu mode /
1650	status = d_revalidate(dentry, flags: nd->flags);
1651	} else {
1652	dentry = __d_lookup(parent, &nd->last);
1653	if (unlikely(!dentry))
1654	return NULL;
1655	status = d_revalidate(dentry, flags: nd->flags);
1656	}
1657	if (unlikely(status <= `0`)) {
1658	if (!status)
1659	d_invalidate(dentry);
1660	dput(dentry);
1661	return ERR_PTR(error: status);
1662	}
1663	return dentry;
1664	}
1665
1666	/ Fast lookup failed, do it the slow way /
1667	static struct dentry __lookup_slow(const* struct qstr *name,
1668	struct dentry *dir,
1669	unsigned int flags)
1670	{
1671	struct dentry dentry, old;
1672	struct inode *inode = dir->d_inode;
1673	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
1674
1675	/ Don't go there if it's already dead /
1676	if (unlikely(IS_DEADDIR(inode)))
1677	return ERR_PTR(error: -ENOENT);
1678	again:
1679	dentry = d_alloc_parallel(dir, name, &wq);
1680	if (IS_ERR(ptr: dentry))
1681	return dentry;
1682	if (unlikely(!d_in_lookup(dentry))) {
1683	int error = d_revalidate(dentry, flags);
1684	if (unlikely(error <= `0`)) {
1685	if (!error) {
1686	d_invalidate(dentry);
1687	dput(dentry);
1688	goto again;
1689	}
1690	dput(dentry);
1691	dentry = ERR_PTR(error);
1692	}
1693	} else {
1694	old = inode->i_op->lookup(inode, dentry, flags);
1695	d_lookup_done(dentry);
1696	if (unlikely(old)) {
1697	dput(dentry);
1698	dentry = old;
1699	}
1700	}
1701	return dentry;
1702	}
1703
1704	static struct dentry lookup_slow(const* struct qstr *name,
1705	struct dentry *dir,
1706	unsigned int flags)
1707	{
1708	struct inode *inode = dir->d_inode;
1709	struct dentry *res;
1710	inode_lock_shared(inode);
1711	res = __lookup_slow(name, dir, flags);
1712	inode_unlock_shared(inode);
1713	return res;
1714	}
1715
1716	static inline int may_lookup(struct mnt_idmap *idmap,
1717	struct nameidata *nd)
1718	{
1719	if (nd->flags & LOOKUP_RCU) {
1720	int err = inode_permission(idmap, nd->inode, MAY_EXEC\|MAY_NOT_BLOCK);
1721	if (err != -ECHILD \|\| !try_to_unlazy(nd))
1722	return err;
1723	}
1724	return inode_permission(idmap, nd->inode, MAY_EXEC);
1725	}
1726
1727	static int reserve_stack(struct nameidata nd, struct* path *link)
1728	{
1729	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
1730	return -ELOOP;
1731
1732	if (likely(nd->depth != EMBEDDED_LEVELS))
1733	return `0`;
1734	if (likely(nd->stack != nd->internal))
1735	return `0`;
1736	if (likely(nd_alloc_stack(nd)))
1737	return `0`;
1738
1739	if (nd->flags & LOOKUP_RCU) {
1740	// we need to grab link before we do unlazy. And we can't skip
1741	// unlazy even if we fail to grab the link - cleanup needs it
1742	bool grabbed_link = legitimize_path(nd, path: link, seq: nd->next_seq);
1743
1744	if (!try_to_unlazy(nd) \|\| !grabbed_link)
1745	return -ECHILD;
1746
1747	if (nd_alloc_stack(nd))
1748	return `0`;
1749	}
1750	return -ENOMEM;
1751	}
1752
/*
 * Flag bits for the "flags" argument of walk_component(), step_into()
 * and pick_link().  They are tested as a bitmask.
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
1754
1755	static const char pick_link(struct* nameidata nd, struct* path *link,
1756	struct inode inode, int* flags)
1757	{
1758	struct saved *last;
1759	const char *res;
1760	int error = reserve_stack(nd, link);
1761
1762	if (unlikely(error)) {
1763	if (!(nd->flags & LOOKUP_RCU))
1764	path_put(link);
1765	return ERR_PTR(error);
1766	}
1767	last = nd->stack + nd->depth++;
1768	last->link = *link;
1769	clear_delayed_call(call: &last->done);
1770	last->seq = nd->next_seq;
1771
1772	if (flags & WALK_TRAILING) {
1773	error = may_follow_link(nd, inode);
1774	if (unlikely(error))
1775	return ERR_PTR(error);
1776	}
1777
1778	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) \|\|
1779	unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
1780	return ERR_PTR(error: -ELOOP);
1781
1782	if (!(nd->flags & LOOKUP_RCU)) {
1783	touch_atime(&last->link);
1784	cond_resched();
1785	} else if (atime_needs_update(&last->link, inode)) {
1786	if (!try_to_unlazy(nd))
1787	return ERR_PTR(error: -ECHILD);
1788	touch_atime(&last->link);
1789	}
1790
1791	error = security_inode_follow_link(dentry: link->dentry, inode,
1792	rcu: nd->flags & LOOKUP_RCU);
1793	if (unlikely(error))
1794	return ERR_PTR(error);
1795
1796	res = READ_ONCE(inode->i_link);
1797	if (!res) {
1798	const char * (get)(struct* dentry , struct* inode *,
1799	struct delayed_call *);
1800	get = inode->i_op->get_link;
1801	if (nd->flags & LOOKUP_RCU) {
1802	res = get(NULL, inode, &last->done);
1803	if (res == ERR_PTR(error: -ECHILD) && try_to_unlazy(nd))
1804	res = get(link->dentry, inode, &last->done);
1805	} else {
1806	res = get(link->dentry, inode, &last->done);
1807	}
1808	if (!res)
1809	goto all_done;
1810	if (IS_ERR(ptr: res))
1811	return res;
1812	}
1813	if (*res == `'/'`) {
1814	error = nd_jump_root(nd);
1815	if (unlikely(error))
1816	return ERR_PTR(error);
1817	while (unlikely(*++res == `'/'`))
1818	;
1819	}
1820	if (*res)
1821	return res;
1822	all_done: // pure jump
1823	put_link(nd);
1824	return NULL;
1825	}
1826
1827	/*
1828	* Do we need to follow links? We _really_ want to be able
1829	* to do this check without having to look at inode->i_op,
1830	* so we keep a cache of "no, this doesn't need follow_link"
1831	* for the common case.
1832	*
1833	* NOTE: dentry must be what nd->next_seq had been sampled from.
1834	*/
1835	static const char step_into(struct* nameidata nd, int* flags,
1836	struct dentry *dentry)
1837	{
1838	struct path path;
1839	struct inode *inode;
1840	int err = handle_mounts(nd, dentry, path: &path);
1841
1842	if (err < `0`)
1843	return ERR_PTR(error: err);
1844	inode = path.dentry->d_inode;
1845	if (likely(!d_is_symlink(path.dentry)) \|\|
1846	((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) \|\|
1847	(flags & WALK_NOFOLLOW)) {
1848	/ not a symlink or should not follow /
1849	if (nd->flags & LOOKUP_RCU) {
1850	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1851	return ERR_PTR(error: -ECHILD);
1852	if (unlikely(!inode))
1853	return ERR_PTR(error: -ENOENT);
1854	} else {
1855	dput(nd->path.dentry);
1856	if (nd->path.mnt != path.mnt)
1857	mntput(mnt: nd->path.mnt);
1858	}
1859	nd->path = path;
1860	nd->inode = inode;
1861	nd->seq = nd->next_seq;
1862	return NULL;
1863	}
1864	if (nd->flags & LOOKUP_RCU) {
1865	/ make sure that d_is_symlink above matches inode /
1866	if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
1867	return ERR_PTR(error: -ECHILD);
1868	} else {
1869	if (path.mnt == nd->path.mnt)
1870	mntget(mnt: path.mnt);
1871	}
1872	return pick_link(nd, link: &path, inode, flags);
1873	}
1874
1875	static struct dentry follow_dotdot_rcu(struct* nameidata *nd)
1876	{
1877	struct dentry parent, old;
1878
1879	if (path_equal(path1: &nd->path, path2: &nd->root))
1880	goto in_root;
1881	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1882	struct path path;
1883	unsigned seq;
1884	if (!choose_mountpoint_rcu(m: real_mount(mnt: nd->path.mnt),
1885	root: &nd->root, path: &path, seqp: &seq))
1886	goto in_root;
1887	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1888	return ERR_PTR(error: -ECHILD);
1889	nd->path = path;
1890	nd->inode = path.dentry->d_inode;
1891	nd->seq = seq;
1892	// makes sure that non-RCU pathwalk could reach this state
1893	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1894	return ERR_PTR(error: -ECHILD);
1895	/ we know that mountpoint was pinned /
1896	}
1897	old = nd->path.dentry;
1898	parent = old->d_parent;
1899	nd->next_seq = read_seqcount_begin(&parent->d_seq);
1900	// makes sure that non-RCU pathwalk could reach this state
1901	if (read_seqcount_retry(&old->d_seq, nd->seq))
1902	return ERR_PTR(error: -ECHILD);
1903	if (unlikely(!path_connected(nd->path.mnt, parent)))
1904	return ERR_PTR(error: -ECHILD);
1905	return parent;
1906	in_root:
1907	if (read_seqretry(sl: &mount_lock, start: nd->m_seq))
1908	return ERR_PTR(error: -ECHILD);
1909	if (unlikely(nd->flags & LOOKUP_BENEATH))
1910	return ERR_PTR(error: -ECHILD);
1911	nd->next_seq = nd->seq;
1912	return nd->path.dentry;
1913	}
1914
1915	static struct dentry follow_dotdot(struct* nameidata *nd)
1916	{
1917	struct dentry *parent;
1918
1919	if (path_equal(path1: &nd->path, path2: &nd->root))
1920	goto in_root;
1921	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
1922	struct path path;
1923
1924	if (!choose_mountpoint(m: real_mount(mnt: nd->path.mnt),
1925	root: &nd->root, path: &path))
1926	goto in_root;
1927	path_put(&nd->path);
1928	nd->path = path;
1929	nd->inode = path.dentry->d_inode;
1930	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
1931	return ERR_PTR(error: -EXDEV);
1932	}
1933	/ rare case of legitimate dget_parent()... /
1934	parent = dget_parent(dentry: nd->path.dentry);
1935	if (unlikely(!path_connected(nd->path.mnt, parent))) {
1936	dput(parent);
1937	return ERR_PTR(error: -ENOENT);
1938	}
1939	return parent;
1940
1941	in_root:
1942	if (unlikely(nd->flags & LOOKUP_BENEATH))
1943	return ERR_PTR(error: -EXDEV);
1944	return dget(dentry: nd->path.dentry);
1945	}
1946
1947	static const char handle_dots(struct* nameidata nd, int* type)
1948	{
1949	if (type == LAST_DOTDOT) {
1950	const char *error = NULL;
1951	struct dentry *parent;
1952
1953	if (!nd->root.mnt) {
1954	error = ERR_PTR(error: set_root(nd));
1955	if (error)
1956	return error;
1957	}
1958	if (nd->flags & LOOKUP_RCU)
1959	parent = follow_dotdot_rcu(nd);
1960	else
1961	parent = follow_dotdot(nd);
1962	if (IS_ERR(ptr: parent))
1963	return ERR_CAST(ptr: parent);
1964	error = step_into(nd, flags: WALK_NOFOLLOW, dentry: parent);
1965	if (unlikely(error))
1966	return error;
1967
1968	if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
1969	/*
1970	* If there was a racing rename or mount along our
1971	* path, then we can't be sure that ".." hasn't jumped
1972	* above nd->root (and so userspace should retry or use
1973	* some fallback).
1974	*/
1975	smp_rmb();
1976	if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
1977	return ERR_PTR(error: -EAGAIN);
1978	if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
1979	return ERR_PTR(error: -EAGAIN);
1980	}
1981	}
1982	return NULL;
1983	}
1984
1985	static const char walk_component(struct* nameidata nd, int* flags)
1986	{
1987	struct dentry *dentry;
1988	/*
1989	* "." and ".." are special - ".." especially so because it has
1990	* to be able to know about the current root directory and
1991	* parent relationships.
1992	*/
1993	if (unlikely(nd->last_type != LAST_NORM)) {
1994	if (!(flags & WALK_MORE) && nd->depth)
1995	put_link(nd);
1996	return handle_dots(nd, type: nd->last_type);
1997	}
1998	dentry = lookup_fast(nd);
1999	if (IS_ERR(ptr: dentry))
2000	return ERR_CAST(ptr: dentry);
2001	if (unlikely(!dentry)) {
2002	dentry = lookup_slow(name: &nd->last, dir: nd->path.dentry, flags: nd->flags);
2003	if (IS_ERR(ptr: dentry))
2004	return ERR_CAST(ptr: dentry);
2005	}
2006	if (!(flags & WALK_MORE) && nd->depth)
2007	put_link(nd);
2008	return step_into(nd, flags, dentry);
2009	}
2010
2011	/*
2012	* We can do the critical dentry name comparison and hashing
2013	* operations one word at a time, but we are limited to:
2014	*
2015	* - Architectures with fast unaligned word accesses. We could
2016	* do a "get_unaligned()" if this helps and is sufficiently
2017	* fast.
2018	*
2019	* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
2020	* do not trap on the (extremely unlikely) case of a page
2021	* crossing operation.
2022	*
2023	* - Furthermore, we need an efficient 64-bit compile for the
2024	* 64-bit case in order to generate the "number of bytes in
2025	* the final mask". Again, that could be replaced with a
2026	* efficient population count instruction or similar.
2027	*/
2028	#ifdef CONFIG_DCACHE_WORD_ACCESS
2029
2030	#include <asm/word-at-a-time.h>
2031
2032	#ifdef HASH_MIX
2033
2034	/ Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> /
2035
2036	#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 *
 * HASH_MIX(x, y, a) mixes the input word @a into the two state words
 * @x and @y, updating both in place.  @x and @y are each expanded several
 * times and assigned to, so they must be plain lvalues without side
 * effects.  The value of the whole expression is the new value of @y.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)
2071
2072	/*
2073	* Fold two longs into one 32-bit hash value. This must be fast, but
2074	* latency isn't quite as critical, as there is a fair bit of additional
2075	* work done before the hash value is used.
2076	*/
2077	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
2078	{
2079	y ^= x * GOLDEN_RATIO_64;
2080	y *= GOLDEN_RATIO_64;
2081	return y >> `32`;
2082	}
2083
2084	#else /* 32-bit case */
2085
/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * 32-bit variant of HASH_MIX(x, y, a): mixes the input word @a into the
 * two state words @x and @y in place, using rotates by 7 and 20.  @x and
 * @y are expanded several times and assigned to, so they must be plain
 * lvalues without side effects.
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)
2101
/*
 * Fold the two state words @x and @y into one hash value (32-bit case):
 * hash @x with __hash_32(), xor the result into @y and hash that again.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
2107
2108	#endif
2109
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt: pointer whose value seeds the hash state (it is only cast to
 *        an integer here, never dereferenced)
 * @name: start of the string
 * @len:  number of bytes of @name to hash
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		/* Nothing left: no partial word to fold in. */
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		/* Fewer than a full word remains; handle it after the loop. */
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* Keep only the @len remaining payload bytes of the last word. */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
2136
2137	/ Return the "hash_len" (hash and length) of a null-terminated string /
2138	u64 hashlen_string(const void salt, const* char *name)
2139	{
2140	unsigned long a = `0`, x = `0`, y = (unsigned long)salt;
2141	unsigned long adata, mask, len;
2142	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2143
2144	len = `0`;
2145	goto inside;
2146
2147	do {
2148	HASH_MIX(x, y, a);
2149	len += sizeof(unsigned long);
2150	inside:
2151	a = load_unaligned_zeropad(addr: name+len);
2152	} while (!has_zero(a, bits: &adata, c: &constants));
2153
2154	adata = prep_zero_mask(a, bits: adata, c: &constants);
2155	mask = create_zero_mask(bits: adata);
2156	x ^= a & zero_bytemask(mask);
2157
2158	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2159	}
2160	EXPORT_SYMBOL(hashlen_string);
2161
2162	/*
2163	* Calculate the length and hash of the path component, and
2164	* return the "hash_len" as the result.
2165	*/
2166	static inline u64 hash_name(const void salt, const* char *name)
2167	{
2168	unsigned long a = `0`, b, x = `0`, y = (unsigned long)salt;
2169	unsigned long adata, bdata, mask, len;
2170	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
2171
2172	len = `0`;
2173	goto inside;
2174
2175	do {
2176	HASH_MIX(x, y, a);
2177	len += sizeof(unsigned long);
2178	inside:
2179	a = load_unaligned_zeropad(addr: name+len);
2180	b = a ^ REPEAT_BYTE(`'/'`);
2181	} while (!(has_zero(a, bits: &adata, c: &constants) \| has_zero(a: b, bits: &bdata, c: &constants)));
2182
2183	adata = prep_zero_mask(a, bits: adata, c: &constants);
2184	bdata = prep_zero_mask(a: b, bits: bdata, c: &constants);
2185	mask = create_zero_mask(bits: adata \| bdata);
2186	x ^= a & zero_bytemask(mask);
2187
2188	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
2189	}
2190
2191	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
2192
/*
 * Return the hash of a string of known length (byte-at-a-time version).
 *
 * @salt: seed passed to init_name_hash()
 * @name: start of the string
 * @len:  number of bytes of @name to hash
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long hash = init_name_hash(salt);
	while (len--)
		hash = partial_name_hash((unsigned char)*name++, hash);
	return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);
2202
2203	/ Return the "hash_len" (hash and length) of a null-terminated string /
2204	u64 hashlen_string(const void salt, const* char *name)
2205	{
2206	unsigned long hash = init_name_hash(salt);
2207	unsigned long len = `0`, c;
2208
2209	c = (unsigned char)*name;
2210	while (c) {
2211	len++;
2212	hash = partial_name_hash(c, hash);
2213	c = (unsigned char)name[len];
2214	}
2215	return hashlen_create(end_name_hash(hash), len);
2216	}
2217	EXPORT_SYMBOL(hashlen_string);
2218
2219	/*
2220	* We know there's a real path component here of at least
2221	* one character.
2222	*/
2223	static inline u64 hash_name(const void salt, const* char *name)
2224	{
2225	unsigned long hash = init_name_hash(salt);
2226	unsigned long len = `0`, c;
2227
2228	c = (unsigned char)*name;
2229	do {
2230	len++;
2231	hash = partial_name_hash(c, hash);
2232	c = (unsigned char)name[len];
2233	} while (c && c != `'/'`);
2234	return hashlen_create(end_name_hash(hash), len);
2235	}
2236
2237	#endif
2238
2239	/*
2240	* Name resolution.
2241	* This is the basic name resolution function, turning a pathname into
2242	* the final dentry. We expect 'base' to be positive and a directory.
2243	*
2244	* Returns 0 and nd will have valid dentry and mnt on success.
2245	* Returns error and drops reference to input namei data on failure.
2246	*/
2247	static int link_path_walk(const char name, struct* nameidata *nd)
2248	{
2249	int depth = `0`; // depth <= nd->depth
2250	int err;
2251
2252	nd->last_type = LAST_ROOT;
2253	nd->flags \|= LOOKUP_PARENT;
2254	if (IS_ERR(ptr: name))
2255	return PTR_ERR(ptr: name);
2256	while (*name==`'/'`)
2257	name++;
2258	if (!*name) {
2259	nd->dir_mode = `0`; // short-circuit the 'hardening' idiocy
2260	return `0`;
2261	}
2262
2263	/ At this point we know we have a real path component. /
2264	for(;;) {
2265	struct mnt_idmap *idmap;
2266	const char *link;
2267	u64 hash_len;
2268	int type;
2269
2270	idmap = mnt_idmap(mnt: nd->path.mnt);
2271	err = may_lookup(idmap, nd);
2272	if (err)
2273	return err;
2274
2275	hash_len = hash_name(salt: nd->path.dentry, name);
2276
2277	type = LAST_NORM;
2278	if (name[`0`] == `'.'`) switch (hashlen_len(hash_len)) {
2279	case `2`:
2280	if (name[`1`] == `'.'`) {
2281	type = LAST_DOTDOT;
2282	nd->state \|= ND_JUMPED;
2283	}
2284	break;
2285	case `1`:
2286	type = LAST_DOT;
2287	}
2288	if (likely(type == LAST_NORM)) {
2289	struct dentry *parent = nd->path.dentry;
2290	nd->state &= ~ND_JUMPED;
2291	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
2292	struct qstr this = { { .hash_len = hash_len }, .name = name };
2293	err = parent->d_op->d_hash(parent, &this);
2294	if (err < `0`)
2295	return err;
2296	hash_len = this.hash_len;
2297	name = this.name;
2298	}
2299	}
2300
2301	nd->last.hash_len = hash_len;
2302	nd->last.name = name;
2303	nd->last_type = type;
2304
2305	name += hashlen_len(hash_len);
2306	if (!*name)
2307	goto OK;
2308	/*
2309	* If it wasn't NUL, we know it was '/'. Skip that
2310	* slash, and continue until no more slashes.
2311	*/
2312	do {
2313	name++;
2314	} while (unlikely(*name == `'/'`));
2315	if (unlikely(!*name)) {
2316	OK:
2317	/ pathname or trailing symlink, done /
2318	if (!depth) {
2319	nd->dir_vfsuid = i_uid_into_vfsuid(idmap, inode: nd->inode);
2320	nd->dir_mode = nd->inode->i_mode;
2321	nd->flags &= ~LOOKUP_PARENT;
2322	return `0`;
2323	}
2324	/ last component of nested symlink /
2325	name = nd->stack[--depth].name;
2326	link = walk_component(nd, flags: `0`);
2327	} else {
2328	/ not the last component /
2329	link = walk_component(nd, flags: WALK_MORE);
2330	}
2331	if (unlikely(link)) {
2332	if (IS_ERR(ptr: link))
2333	return PTR_ERR(ptr: link);
2334	/ a symlink to follow /
2335	nd->stack[depth++].name = name;
2336	name = link;
2337	continue;
2338	}
2339	if (unlikely(!d_can_lookup(nd->path.dentry))) {
2340	if (nd->flags & LOOKUP_RCU) {
2341	if (!try_to_unlazy(nd))
2342	return -ECHILD;
2343	}
2344	return -ENOTDIR;
2345	}
2346	}
2347	}
2348
2349	/ must be paired with terminate_walk() /
2350	static const char path_init(struct* nameidata nd, unsigned* flags)
2351	{
2352	int error;
2353	const char *s = nd->name->name;
2354
2355	/ LOOKUP_CACHED requires RCU, ask caller to retry /
2356	if ((flags & (LOOKUP_RCU \| LOOKUP_CACHED)) == LOOKUP_CACHED)
2357	return ERR_PTR(error: -EAGAIN);
2358
2359	if (!*s)
2360	flags &= ~LOOKUP_RCU;
2361	if (flags & LOOKUP_RCU)
2362	rcu_read_lock();
2363	else
2364	nd->seq = nd->next_seq = `0`;
2365
2366	nd->flags = flags;
2367	nd->state \|= ND_JUMPED;
2368
2369	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
2370	nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
2371	smp_rmb();
2372
2373	if (nd->state & ND_ROOT_PRESET) {
2374	struct dentry *root = nd->root.dentry;
2375	struct inode *inode = root->d_inode;
2376	if (*s && unlikely(!d_can_lookup(root)))
2377	return ERR_PTR(error: -ENOTDIR);
2378	nd->path = nd->root;
2379	nd->inode = inode;
2380	if (flags & LOOKUP_RCU) {
2381	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2382	nd->root_seq = nd->seq;
2383	} else {
2384	path_get(&nd->path);
2385	}
2386	return s;
2387	}
2388
2389	nd->root.mnt = NULL;
2390
2391	/ Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). /
2392	if (*s == `'/'` && !(flags & LOOKUP_IN_ROOT)) {
2393	error = nd_jump_root(nd);
2394	if (unlikely(error))
2395	return ERR_PTR(error);
2396	return s;
2397	}
2398
2399	/ Relative pathname -- get the starting-point it is relative to. /
2400	if (nd->dfd == AT_FDCWD) {
2401	if (flags & LOOKUP_RCU) {
2402	struct fs_struct *fs = current->fs;
2403	unsigned seq;
2404
2405	do {
2406	seq = read_seqcount_begin(&fs->seq);
2407	nd->path = fs->pwd;
2408	nd->inode = nd->path.dentry->d_inode;
2409	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
2410	} while (read_seqcount_retry(&fs->seq, seq));
2411	} else {
2412	get_fs_pwd(current->fs, pwd: &nd->path);
2413	nd->inode = nd->path.dentry->d_inode;
2414	}
2415	} else {
2416	/ Caller must check execute permissions on the starting path component /
2417	struct fd f = fdget_raw(fd: nd->dfd);
2418	struct dentry *dentry;
2419
2420	if (!f.file)
2421	return ERR_PTR(error: -EBADF);
2422
2423	dentry = f.file->f_path.dentry;
2424
2425	if (*s && unlikely(!d_can_lookup(dentry))) {
2426	fdput(fd: f);
2427	return ERR_PTR(error: -ENOTDIR);
2428	}
2429
2430	nd->path = f.file->f_path;
2431	if (flags & LOOKUP_RCU) {
2432	nd->inode = nd->path.dentry->d_inode;
2433	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
2434	} else {
2435	path_get(&nd->path);
2436	nd->inode = nd->path.dentry->d_inode;
2437	}
2438	fdput(fd: f);
2439	}
2440
2441	/ For scoped-lookups we need to set the root to the dirfd as well. /
2442	if (flags & LOOKUP_IS_SCOPED) {
2443	nd->root = nd->path;
2444	if (flags & LOOKUP_RCU) {
2445	nd->root_seq = nd->seq;
2446	} else {
2447	path_get(&nd->root);
2448	nd->state \|= ND_ROOT_GRABBED;
2449	}
2450	}
2451	return s;
2452	}
2453
2454	static inline const char lookup_last(struct* nameidata *nd)
2455	{
2456	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
2457	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
2458
2459	return walk_component(nd, flags: WALK_TRAILING);
2460	}
2461
2462	static int handle_lookup_down(struct nameidata *nd)
2463	{
2464	if (!(nd->flags & LOOKUP_RCU))
2465	dget(dentry: nd->path.dentry);
2466	nd->next_seq = nd->seq;
2467	return PTR_ERR(ptr: step_into(nd, flags: WALK_NOFOLLOW, dentry: nd->path.dentry));
2468	}
2469
2470	/ Returns 0 and nd will be valid on success; Retuns error, otherwise. /
2471	static int path_lookupat(struct nameidata nd, unsigned* flags, struct path *path)
2472	{
2473	const char *s = path_init(nd, flags);
2474	int err;
2475
2476	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(ptr: s)) {
2477	err = handle_lookup_down(nd);
2478	if (unlikely(err < `0`))
2479	s = ERR_PTR(error: err);
2480	}
2481
2482	while (!(err = link_path_walk(name: s, nd)) &&
2483	(s = lookup_last(nd)) != NULL)
2484	;
2485	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
2486	err = handle_lookup_down(nd);
2487	nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
2488	}
2489	if (!err)
2490	err = complete_walk(nd);
2491
2492	if (!err && nd->flags & LOOKUP_DIRECTORY)
2493	if (!d_can_lookup(dentry: nd->path.dentry))
2494	err = -ENOTDIR;
2495	if (!err) {
2496	*path = nd->path;
2497	nd->path.mnt = NULL;
2498	nd->path.dentry = NULL;
2499	}
2500	terminate_walk(nd);
2501	return err;
2502	}
2503
2504	int filename_lookup(int dfd, struct filename name, unsigned* flags,
2505	struct path path, struct* path *root)
2506	{
2507	int retval;
2508	struct nameidata nd;
2509	if (IS_ERR(ptr: name))
2510	return PTR_ERR(ptr: name);
2511	set_nameidata(p: &nd, dfd, name, root);
2512	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_RCU, path);
2513	if (unlikely(retval == -ECHILD))
2514	retval = path_lookupat(nd: &nd, flags, path);
2515	if (unlikely(retval == -ESTALE))
2516	retval = path_lookupat(nd: &nd, flags: flags \| LOOKUP_REVAL, path);
2517
2518	if (likely(!retval))
2519	audit_inode(name, dentry: path->dentry,
2520	aflags: flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : `0`);
2521	restore_nameidata();
2522	return retval;
2523	}
2524
2525	/ Returns 0 and nd will be valid on success; Retuns error, otherwise. /
2526	static int path_parentat(struct nameidata nd, unsigned* flags,
2527	struct path *parent)
2528	{
2529	const char *s = path_init(nd, flags);
2530	int err = link_path_walk(name: s, nd);
2531	if (!err)
2532	err = complete_walk(nd);
2533	if (!err) {
2534	*parent = nd->path;
2535	nd->path.mnt = NULL;
2536	nd->path.dentry = NULL;
2537	}
2538	terminate_walk(nd);
2539	return err;
2540	}
2541
2542	/ Note: this does not consume "name" /
2543	static int __filename_parentat(int dfd, struct filename *name,
2544	unsigned int flags, struct path *parent,
2545	struct qstr last, int* *type,
2546	const struct path *root)
2547	{
2548	int retval;
2549	struct nameidata nd;
2550
2551	if (IS_ERR(ptr: name))
2552	return PTR_ERR(ptr: name);
2553	set_nameidata(p: &nd, dfd, name, root);
2554	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_RCU, parent);
2555	if (unlikely(retval == -ECHILD))
2556	retval = path_parentat(nd: &nd, flags, parent);
2557	if (unlikely(retval == -ESTALE))
2558	retval = path_parentat(nd: &nd, flags: flags \| LOOKUP_REVAL, parent);
2559	if (likely(!retval)) {
2560	*last = nd.last;
2561	*type = nd.last_type;
2562	audit_inode(name, dentry: parent->dentry, AUDIT_INODE_PARENT);
2563	}
2564	restore_nameidata();
2565	return retval;
2566	}
2567
2568	static int filename_parentat(int dfd, struct filename *name,
2569	unsigned int flags, struct path *parent,
2570	struct qstr last, int* *type)
2571	{
2572	return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
2573	}
2574
2575	/ does lookup, returns the object with parent locked /
2576	static struct dentry __kern_path_locked(struct* filename name, struct* path *path)
2577	{
2578	struct dentry *d;
2579	struct qstr last;
2580	int type, error;
2581
2582	error = filename_parentat(AT_FDCWD, name, flags: `0`, parent: path, last: &last, type: &type);
2583	if (error)
2584	return ERR_PTR(error);
2585	if (unlikely(type != LAST_NORM)) {
2586	path_put(path);
2587	return ERR_PTR(error: -EINVAL);
2588	}
2589	inode_lock_nested(inode: path->dentry->d_inode, subclass: I_MUTEX_PARENT);
2590	d = lookup_one_qstr_excl(&last, path->dentry, `0`);
2591	if (IS_ERR(ptr: d)) {
2592	inode_unlock(inode: path->dentry->d_inode);
2593	path_put(path);
2594	}
2595	return d;
2596	}
2597
/*
 * Wrapper around __kern_path_locked() for a kernel-space pathname
 * string: builds a struct filename with getname_kernel(), performs the
 * lookup and drops the filename again with putname().
 *
 * Returns what __kern_path_locked() returns.
 */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
	struct filename *filename = getname_kernel(name);
	struct dentry *res = __kern_path_locked(filename, path);

	putname(filename);
	return res;
}
2606
2607	int kern_path(const char name, unsigned* int flags, struct path *path)
2608	{
2609	struct filename *filename = getname_kernel(name);
2610	int ret = filename_lookup(AT_FDCWD, name: filename, flags, path, NULL);
2611
2612	putname(filename);
2613	return ret;
2614
2615	}
2616	EXPORT_SYMBOL(kern_path);
2617
2618	/**
2619	* vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
2620	* @filename: filename structure
2621	* @flags: lookup flags
2622	* @parent: pointer to struct path to fill
2623	* @last: last component
2624	* @type: type of the last component
2625	* @root: pointer to struct path of the base directory
2626	*/
2627	int vfs_path_parent_lookup(struct filename filename, unsigned* int flags,
2628	struct path parent, struct* qstr last, int* *type,
2629	const struct path *root)
2630	{
2631	return __filename_parentat(AT_FDCWD, name: filename, flags, parent, last,
2632	type, root);
2633	}
2634	EXPORT_SYMBOL(vfs_path_parent_lookup);
2635
2636	/**
2637	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
2638	* @dentry: pointer to dentry of the base directory
2639	* @mnt: pointer to vfs mount of the base directory
2640	* @name: pointer to file name
2641	* @flags: lookup flags
2642	* @path: pointer to struct path to fill
2643	*/
2644	int vfs_path_lookup(struct dentry dentry, struct* vfsmount *mnt,
2645	const char name, unsigned* int flags,
2646	struct path *path)
2647	{
2648	struct filename *filename;
2649	struct path root = {.mnt = mnt, .dentry = dentry};
2650	int ret;
2651
2652	filename = getname_kernel(name);
2653	/ the first argument of filename_lookup() is ignored with root /
2654	ret = filename_lookup(AT_FDCWD, name: filename, flags, path, root: &root);
2655	putname(filename);
2656	return ret;
2657	}
2658	EXPORT_SYMBOL(vfs_path_lookup);
2659
2660	static int lookup_one_common(struct mnt_idmap *idmap,
2661	const char name, struct* dentry base, int* len,
2662	struct qstr *this)
2663	{
2664	this->name = name;
2665	this->len = len;
2666	this->hash = full_name_hash(base, name, len);
2667	if (!len)
2668	return -EACCES;
2669
2670	if (unlikely(name[`0`] == `'.'`)) {
2671	if (len < `2` \|\| (len == `2` && name[`1`] == `'.'`))
2672	return -EACCES;
2673	}
2674
2675	while (len--) {
2676	unsigned int c = (const* unsigned char *)name++;
2677	if (c == `'/'` \|\| c == `'\0'`)
2678	return -EACCES;
2679	}
2680	/*
2681	* See if the low-level filesystem might want
2682	* to use its own hash..
2683	*/
2684	if (base->d_flags & DCACHE_OP_HASH) {
2685	int err = base->d_op->d_hash(base, this);
2686	if (err < `0`)
2687	return err;
2688	}
2689
2690	return inode_permission(idmap, base->d_inode, MAY_EXEC);
2691	}
2692
2693	/**
2694	* try_lookup_one_len - filesystem helper to lookup single pathname component
2695	* @name: pathname component to lookup
2696	* @base: base directory to lookup from
2697	* @len: maximum length @len should be interpreted to
2698	*
2699	* Look up a dentry by name in the dcache, returning NULL if it does not
2700	* currently exist. The function does not try to create a dentry.
2701	*
2702	* Note that this routine is purely a helper for filesystem usage and should
2703	* not be called by generic code.
2704	*
2705	* The caller must hold base->i_mutex.
2706	*/
2707	struct dentry try_lookup_one_len(const* char name, struct* dentry base, int* len)
2708	{
2709	struct qstr this;
2710	int err;
2711
2712	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2713
2714	err = lookup_one_common(idmap: &nop_mnt_idmap, name, base, len, this: &this);
2715	if (err)
2716	return ERR_PTR(error: err);
2717
2718	return lookup_dcache(name: &this, dir: base, flags: `0`);
2719	}
2720	EXPORT_SYMBOL(try_lookup_one_len);
2721
2722	/**
2723	* lookup_one_len - filesystem helper to lookup single pathname component
2724	* @name: pathname component to lookup
2725	* @base: base directory to lookup from
2726	* @len: maximum length @len should be interpreted to
2727	*
2728	* Note that this routine is purely a helper for filesystem usage and should
2729	* not be called by generic code.
2730	*
2731	* The caller must hold base->i_mutex.
2732	*/
2733	struct dentry lookup_one_len(const* char name, struct* dentry base, int* len)
2734	{
2735	struct dentry *dentry;
2736	struct qstr this;
2737	int err;
2738
2739	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2740
2741	err = lookup_one_common(idmap: &nop_mnt_idmap, name, base, len, this: &this);
2742	if (err)
2743	return ERR_PTR(error: err);
2744
2745	dentry = lookup_dcache(name: &this, dir: base, flags: `0`);
2746	return dentry ? dentry : __lookup_slow(name: &this, dir: base, flags: `0`);
2747	}
2748	EXPORT_SYMBOL(lookup_one_len);
2749
2750	/**
2751	* lookup_one - filesystem helper to lookup single pathname component
2752	* @idmap: idmap of the mount the lookup is performed from
2753	* @name: pathname component to lookup
2754	* @base: base directory to lookup from
2755	* @len: maximum length @len should be interpreted to
2756	*
2757	* Note that this routine is purely a helper for filesystem usage and should
2758	* not be called by generic code.
2759	*
2760	* The caller must hold base->i_mutex.
2761	*/
2762	struct dentry lookup_one(struct* mnt_idmap idmap, const* char *name,
2763	struct dentry base, int* len)
2764	{
2765	struct dentry *dentry;
2766	struct qstr this;
2767	int err;
2768
2769	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
2770
2771	err = lookup_one_common(idmap, name, base, len, this: &this);
2772	if (err)
2773	return ERR_PTR(error: err);
2774
2775	dentry = lookup_dcache(name: &this, dir: base, flags: `0`);
2776	return dentry ? dentry : __lookup_slow(name: &this, dir: base, flags: `0`);
2777	}
2778	EXPORT_SYMBOL(lookup_one);
2779
2780	/**
2781	* lookup_one_unlocked - filesystem helper to lookup single pathname component
2782	* @idmap: idmap of the mount the lookup is performed from
2783	* @name: pathname component to lookup
2784	* @base: base directory to lookup from
2785	* @len: maximum length @len should be interpreted to
2786	*
2787	* Note that this routine is purely a helper for filesystem usage and should
2788	* not be called by generic code.
2789	*
2790	* Unlike lookup_one_len, it should be called without the parent
2791	* i_mutex held, and will take the i_mutex itself if necessary.
2792	*/
2793	struct dentry lookup_one_unlocked(struct* mnt_idmap *idmap,
2794	const char name, struct* dentry *base,
2795	int len)
2796	{
2797	struct qstr this;
2798	int err;
2799	struct dentry *ret;
2800
2801	err = lookup_one_common(idmap, name, base, len, this: &this);
2802	if (err)
2803	return ERR_PTR(error: err);
2804
2805	ret = lookup_dcache(name: &this, dir: base, flags: `0`);
2806	if (!ret)
2807	ret = lookup_slow(name: &this, dir: base, flags: `0`);
2808	return ret;
2809	}
2810	EXPORT_SYMBOL(lookup_one_unlocked);
2811
2812	/**
2813	* lookup_one_positive_unlocked - filesystem helper to lookup single
2814	* pathname component
2815	* @idmap: idmap of the mount the lookup is performed from
2816	* @name: pathname component to lookup
2817	* @base: base directory to lookup from
2818	* @len: maximum length @len should be interpreted to
2819	*
2820	* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
2821	* known positive or ERR_PTR(). This is what most of the users want.
2822	*
2823	* Note that pinned negative with unlocked parent _can_ become positive at any
2824	* time, so callers of lookup_one_unlocked() need to be very careful; pinned
2825	* positives have >d_inode stable, so this one avoids such problems.
2826	*
2827	* Note that this routine is purely a helper for filesystem usage and should
2828	* not be called by generic code.
2829	*
2830	* The helper should be called without i_mutex held.
2831	*/
2832	struct dentry lookup_one_positive_unlocked(struct* mnt_idmap *idmap,
2833	const char *name,
2834	struct dentry base, int* len)
2835	{
2836	struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);
2837
2838	if (!IS_ERR(ptr: ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
2839	dput(ret);
2840	ret = ERR_PTR(error: -ENOENT);
2841	}
2842	return ret;
2843	}
2844	EXPORT_SYMBOL(lookup_one_positive_unlocked);
2845
2846	/**
2847	* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
2848	* @name: pathname component to lookup
2849	* @base: base directory to lookup from
2850	* @len: maximum length @len should be interpreted to
2851	*
2852	* Note that this routine is purely a helper for filesystem usage and should
2853	* not be called by generic code.
2854	*
2855	* Unlike lookup_one_len, it should be called without the parent
2856	* i_mutex held, and will take the i_mutex itself if necessary.
2857	*/
2858	struct dentry lookup_one_len_unlocked(const* char *name,
2859	struct dentry base, int* len)
2860	{
2861	return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
2862	}
2863	EXPORT_SYMBOL(lookup_one_len_unlocked);
2864
2865	/*
2866	* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
2867	* on negatives. Returns known positive or ERR_PTR(); that's what
2868	* most of the users want. Note that pinned negative with unlocked parent
2869	* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
2870	* need to be very careful; pinned positives have ->d_inode stable, so
2871	* this one avoids such problems.
2872	*/
2873	struct dentry lookup_positive_unlocked(const* char *name,
2874	struct dentry base, int* len)
2875	{
2876	return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
2877	}
2878	EXPORT_SYMBOL(lookup_positive_unlocked);
2879
#ifdef CONFIG_UNIX98_PTYS
/*
 * Redirect @path to the "pts" entry next to it.
 *
 * @path->dentry is first replaced by its parent.  If a "pts" child of
 * that parent is found by d_hash_and_lookup(), @path->dentry is replaced
 * by the child and follow_down() is applied to @path.
 *
 * Returns 0 on success.  Returns -ENOENT if the parent is not connected
 * to @path->mnt (@path is left untouched) or if no "pts" child is found
 * (@path then refers to the parent).
 */
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	/* From here on @path owns the reference taken on parent. */
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
#endif
2906
2907	int user_path_at_empty(int dfd, const char __user name, unsigned* flags,
2908	struct path path, int* *empty)
2909	{
2910	struct filename *filename = getname_flags(filename: name, flags, empty);
2911	int ret = filename_lookup(dfd, name: filename, flags, path, NULL);
2912
2913	putname(filename);
2914	return ret;
2915	}
2916	EXPORT_SYMBOL(user_path_at_empty);
2917
2918	int __check_sticky(struct mnt_idmap idmap, struct* inode *dir,
2919	struct inode *inode)
2920	{
2921	kuid_t fsuid = current_fsuid();
2922
2923	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode), kuid: fsuid))
2924	return `0`;
2925	if (vfsuid_eq_kuid(vfsuid: i_uid_into_vfsuid(idmap, inode: dir), kuid: fsuid))
2926	return `0`;
2927	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
2928	}
2929	EXPORT_SYMBOL(__check_sticky);
2930
2931	/*
2932	* Check whether we can remove a link victim from directory dir, check
2933	* whether the type of victim is right.
2934	* 1. We can't do it if dir is read-only (done in permission())
2935	* 2. We should have write and exec permissions on dir
2936	* 3. We can't remove anything from append-only dir
2937	* 4. We can't do anything with immutable dir (done in permission())
2938	* 5. If the sticky bit on dir is set we should either
2939	* a. be owner of dir, or
2940	* b. be owner of victim, or
2941	* c. have CAP_FOWNER capability
2942	* 6. If the victim is append-only or immutable we can't do antyhing with
2943	* links pointing to it.
2944	* 7. If the victim has an unknown uid or gid we can't change the inode.
2945	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2946	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2947	* 10. We can't remove a root or mountpoint.
2948	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
2949	* nfs_async_unlink().
2950	*/
2951	static int may_delete(struct mnt_idmap idmap, struct* inode *dir,
2952	struct dentry *victim, bool isdir)
2953	{
2954	struct inode *inode = d_backing_inode(upper: victim);
2955	int error;
2956
2957	if (d_is_negative(dentry: victim))
2958	return -ENOENT;
2959	BUG_ON(!inode);
2960
2961	BUG_ON(victim->d_parent->d_inode != dir);
2962
2963	/ Inode writeback is not safe when the uid or gid are invalid. /
2964	if (!vfsuid_valid(uid: i_uid_into_vfsuid(idmap, inode)) \|\|
2965	!vfsgid_valid(gid: i_gid_into_vfsgid(idmap, inode)))
2966	return -EOVERFLOW;
2967
2968	audit_inode_child(parent: dir, dentry: victim, AUDIT_TYPE_CHILD_DELETE);
2969
2970	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
2971	if (error)
2972	return error;
2973	if (IS_APPEND(dir))
2974	return -EPERM;
2975
2976	if (check_sticky(idmap, dir, inode) \|\| IS_APPEND(inode) \|\|
2977	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\|
2978	HAS_UNMAPPED_ID(idmap, inode))
2979	return -EPERM;
2980	if (isdir) {
2981	if (!d_is_dir(dentry: victim))
2982	return -ENOTDIR;
2983	if (IS_ROOT(victim))
2984	return -EBUSY;
2985	} else if (d_is_dir(dentry: victim))
2986	return -EISDIR;
2987	if (IS_DEADDIR(dir))
2988	return -ENOENT;
2989	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2990	return -EBUSY;
2991	return `0`;
2992	}
2993
2994	/ Check whether we can create an object with dentry child in directory*
2995	* dir.
2996	* 1. We can't do it if child already exists (open has special treatment for
2997	* this case, but since we are inlined it's OK)
2998	* 2. We can't do it if dir is read-only (done in permission())
2999	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
3000	* 4. We should have write and exec permissions on dir
3001	* 5. We can't do it if dir is immutable (done in permission())
3002	*/
3003	static inline int may_create(struct mnt_idmap *idmap,
3004	struct inode dir, struct* dentry *child)
3005	{
3006	audit_inode_child(parent: dir, dentry: child, AUDIT_TYPE_CHILD_CREATE);
3007	if (child->d_inode)
3008	return -EEXIST;
3009	if (IS_DEADDIR(dir))
3010	return -ENOENT;
3011	if (!fsuidgid_has_mapping(sb: dir->i_sb, idmap))
3012	return -EOVERFLOW;
3013
3014	return inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3015	}
3016
3017	static struct dentry lock_two_directories(struct* dentry p1, struct* dentry *p2)
3018	{
3019	struct dentry *p;
3020
3021	p = d_ancestor(p2, p1);
3022	if (p) {
3023	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3024	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_CHILD);
3025	return p;
3026	}
3027
3028	p = d_ancestor(p1, p2);
3029	if (p) {
3030	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3031	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_CHILD);
3032	return p;
3033	}
3034
3035	lock_two_inodes(inode1: p1->d_inode, inode2: p2->d_inode,
3036	subclass1: I_MUTEX_PARENT, subclass2: I_MUTEX_PARENT2);
3037	return NULL;
3038	}
3039
3040	/*
3041	* p1 and p2 should be directories on the same fs.
3042	*/
3043	struct dentry lock_rename(struct* dentry p1, struct* dentry *p2)
3044	{
3045	if (p1 == p2) {
3046	inode_lock_nested(inode: p1->d_inode, subclass: I_MUTEX_PARENT);
3047	return NULL;
3048	}
3049
3050	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
3051	return lock_two_directories(p1, p2);
3052	}
3053	EXPORT_SYMBOL(lock_rename);
3054
3055	/*
3056	* c1 and p2 should be on the same fs.
3057	*/
3058	struct dentry lock_rename_child(struct* dentry c1, struct* dentry *p2)
3059	{
3060	if (READ_ONCE(c1->d_parent) == p2) {
3061	/*
3062	* hopefully won't need to touch ->s_vfs_rename_mutex at all.
3063	*/
3064	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3065	/*
3066	* now that p2 is locked, nobody can move in or out of it,
3067	* so the test below is safe.
3068	*/
3069	if (likely(c1->d_parent == p2))
3070	return NULL;
3071
3072	/*
3073	* c1 got moved out of p2 while we'd been taking locks;
3074	* unlock and fall back to slow case.
3075	*/
3076	inode_unlock(inode: p2->d_inode);
3077	}
3078
3079	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
3080	/*
3081	* nobody can move out of any directories on this fs.
3082	*/
3083	if (likely(c1->d_parent != p2))
3084	return lock_two_directories(p1: c1->d_parent, p2);
3085
3086	/*
3087	* c1 got moved into p2 while we were taking locks;
3088	* we need p2 locked and ->s_vfs_rename_mutex unlocked,
3089	* for consistency with lock_rename().
3090	*/
3091	inode_lock_nested(inode: p2->d_inode, subclass: I_MUTEX_PARENT);
3092	mutex_unlock(lock: &c1->d_sb->s_vfs_rename_mutex);
3093	return NULL;
3094	}
3095	EXPORT_SYMBOL(lock_rename_child);
3096
3097	void unlock_rename(struct dentry p1, struct* dentry *p2)
3098	{
3099	inode_unlock(inode: p1->d_inode);
3100	if (p1 != p2) {
3101	inode_unlock(inode: p2->d_inode);
3102	mutex_unlock(lock: &p1->d_sb->s_vfs_rename_mutex);
3103	}
3104	}
3105	EXPORT_SYMBOL(unlock_rename);
3106
3107	/**
3108	* vfs_prepare_mode - prepare the mode to be used for a new inode
3109	* @idmap: idmap of the mount the inode was found from
3110	* @dir: parent directory of the new inode
3111	* @mode: mode of the new inode
3112	* @mask_perms: allowed permission by the vfs
3113	* @type: type of file to be created
3114	*
3115	* This helper consolidates and enforces vfs restrictions on the @mode of a new
3116	* object to be created.
3117	*
3118	* Umask stripping depends on whether the filesystem supports POSIX ACLs (see
3119	* the kernel documentation for mode_strip_umask()). Moving umask stripping
3120	* after setgid stripping allows the same ordering for both non-POSIX ACL and
3121	* POSIX ACL supporting filesystems.
3122	*
3123	* Note that it's currently valid for @type to be 0 if a directory is created.
3124	* Filesystems raise that flag individually and we need to check whether each
3125	* filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
3126	* non-zero type.
3127	*
3128	* Returns: mode to be passed to the filesystem
3129	*/
3130	static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
3131	const struct inode *dir, umode_t mode,
3132	umode_t mask_perms, umode_t type)
3133	{
3134	mode = mode_strip_sgid(idmap, dir, mode);
3135	mode = mode_strip_umask(dir, mode);
3136
3137	/*
3138	* Apply the vfs mandated allowed permission mask and set the type of
3139	* file to be created before we call into the filesystem.
3140	*/
3141	mode &= (mask_perms & ~S_IFMT);
3142	mode \|= (type & S_IFMT);
3143
3144	return mode;
3145	}
3146
3147	/**
3148	* vfs_create - create new file
3149	* @idmap: idmap of the mount the inode was found from
3150	* @dir: inode of @dentry
3151	* @dentry: pointer to dentry of the base directory
3152	* @mode: mode of the new file
3153	* @want_excl: whether the file must not yet exist
3154	*
3155	* Create a new file.
3156	*
3157	* If the inode has been found through an idmapped mount the idmap of
3158	* the vfsmount must be passed through @idmap. This function will then take
3159	* care to map the inode according to @idmap before checking permissions.
3160	* On non-idmapped mounts or if permission checking is to be performed on the
3161	* raw inode simply passs @nop_mnt_idmap.
3162	*/
3163	int vfs_create(struct mnt_idmap idmap, struct* inode *dir,
3164	struct dentry *dentry, umode_t mode, bool want_excl)
3165	{
3166	int error;
3167
3168	error = may_create(idmap, dir, child: dentry);
3169	if (error)
3170	return error;
3171
3172	if (!dir->i_op->create)
3173	return -EACCES; / shouldn't it be ENOSYS? /
3174
3175	mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);
3176	error = security_inode_create(dir, dentry, mode);
3177	if (error)
3178	return error;
3179	error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
3180	if (!error)
3181	fsnotify_create(dir, dentry);
3182	return error;
3183	}
3184	EXPORT_SYMBOL(vfs_create);
3185
3186	int vfs_mkobj(struct dentry *dentry, umode_t mode,
3187	int (f)(struct* dentry , umode_t, void* *),
3188	void *arg)
3189	{
3190	struct inode *dir = dentry->d_parent->d_inode;
3191	int error = may_create(idmap: &nop_mnt_idmap, dir, child: dentry);
3192	if (error)
3193	return error;
3194
3195	mode &= S_IALLUGO;
3196	mode \|= S_IFREG;
3197	error = security_inode_create(dir, dentry, mode);
3198	if (error)
3199	return error;
3200	error = f(dentry, mode, arg);
3201	if (!error)
3202	fsnotify_create(dir, dentry);
3203	return error;
3204	}
3205	EXPORT_SYMBOL(vfs_mkobj);
3206
3207	bool may_open_dev(const struct path *path)
3208	{
3209	return !(path->mnt->mnt_flags & MNT_NODEV) &&
3210	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
3211	}
3212
3213	static int may_open(struct mnt_idmap idmap, const* struct path *path,
3214	int acc_mode, int flag)
3215	{
3216	struct dentry *dentry = path->dentry;
3217	struct inode *inode = dentry->d_inode;
3218	int error;
3219
3220	if (!inode)
3221	return -ENOENT;
3222
3223	switch (inode->i_mode & S_IFMT) {
3224	case S_IFLNK:
3225	return -ELOOP;
3226	case S_IFDIR:
3227	if (acc_mode & MAY_WRITE)
3228	return -EISDIR;
3229	if (acc_mode & MAY_EXEC)
3230	return -EACCES;
3231	break;
3232	case S_IFBLK:
3233	case S_IFCHR:
3234	if (!may_open_dev(path))
3235	return -EACCES;
3236	fallthrough;
3237	case S_IFIFO:
3238	case S_IFSOCK:
3239	if (acc_mode & MAY_EXEC)
3240	return -EACCES;
3241	flag &= ~O_TRUNC;
3242	break;
3243	case S_IFREG:
3244	if ((acc_mode & MAY_EXEC) && path_noexec(path))
3245	return -EACCES;
3246	break;
3247	}
3248
3249	error = inode_permission(idmap, inode, MAY_OPEN \| acc_mode);
3250	if (error)
3251	return error;
3252
3253	/*
3254	* An append-only file must be opened in append mode for writing.
3255	*/
3256	if (IS_APPEND(inode)) {
3257	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
3258	return -EPERM;
3259	if (flag & O_TRUNC)
3260	return -EPERM;
3261	}
3262
3263	/ O_NOATIME can only be set by the owner or superuser /
3264	if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
3265	return -EPERM;
3266
3267	return `0`;
3268	}
3269
3270	static int handle_truncate(struct mnt_idmap idmap, struct* file *filp)
3271	{
3272	const struct path *path = &filp->f_path;
3273	struct inode *inode = path->dentry->d_inode;
3274	int error = get_write_access(inode);
3275	if (error)
3276	return error;
3277
3278	error = security_file_truncate(file: filp);
3279	if (!error) {
3280	error = do_truncate(idmap, path->dentry, start: `0`,
3281	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
3282	filp);
3283	}
3284	put_write_access(inode);
3285	return error;
3286	}
3287
/*
 * Convert open(2) flags to the form handed to ->atomic_open(): when both
 * access-mode bits are set (access mode 3) the value is decremented, which
 * turns the access mode into 2 and leaves every other bit untouched.  All
 * other flag values are returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
	if ((flag & O_ACCMODE) == 3)
		flag--;
	return flag;
}
3294
3295	static int may_o_create(struct mnt_idmap *idmap,
3296	const struct path dir, struct* dentry *dentry,
3297	umode_t mode)
3298	{
3299	int error = security_path_mknod(dir, dentry, mode, dev: `0`);
3300	if (error)
3301	return error;
3302
3303	if (!fsuidgid_has_mapping(sb: dir->dentry->d_sb, idmap))
3304	return -EOVERFLOW;
3305
3306	error = inode_permission(idmap, dir->dentry->d_inode,
3307	MAY_WRITE \| MAY_EXEC);
3308	if (error)
3309	return error;
3310
3311	return security_inode_create(dir: dir->dentry->d_inode, dentry, mode);
3312	}
3313
3314	/*
3315	* Attempt to atomically look up, create and open a file from a negative
3316	* dentry.
3317	*
3318	* Returns 0 if successful. The file will have been created and attached to
3319	* @file by the filesystem calling finish_open().
3320	*
3321	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
3322	* be set. The caller will need to perform the open themselves. @path will
3323	* have been updated to point to the new dentry. This may be negative.
3324	*
3325	* Returns an error code otherwise.
3326	*/
3327	static struct dentry atomic_open(struct* nameidata nd, struct* dentry *dentry,
3328	struct file *file,
3329	int open_flag, umode_t mode)
3330	{
3331	struct dentry *const DENTRY_NOT_SET = (void *) -`1UL`;
3332	struct inode *dir = nd->path.dentry->d_inode;
3333	int error;
3334
3335	if (nd->flags & LOOKUP_DIRECTORY)
3336	open_flag \|= O_DIRECTORY;
3337
3338	file->f_path.dentry = DENTRY_NOT_SET;
3339	file->f_path.mnt = nd->path.mnt;
3340	error = dir->i_op->atomic_open(dir, dentry, file,
3341	open_to_namei_flags(flag: open_flag), mode);
3342	d_lookup_done(dentry);
3343	if (!error) {
3344	if (file->f_mode & FMODE_OPENED) {
3345	if (unlikely(dentry != file->f_path.dentry)) {
3346	dput(dentry);
3347	dentry = dget(dentry: file->f_path.dentry);
3348	}
3349	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
3350	error = -EIO;
3351	} else {
3352	if (file->f_path.dentry) {
3353	dput(dentry);
3354	dentry = file->f_path.dentry;
3355	}
3356	if (unlikely(d_is_negative(dentry)))
3357	error = -ENOENT;
3358	}
3359	}
3360	if (error) {
3361	dput(dentry);
3362	dentry = ERR_PTR(error);
3363	}
3364	return dentry;
3365	}
3366
3367	/*
3368	* Look up and maybe create and open the last component.
3369	*
3370	* Must be called with parent locked (exclusive in O_CREAT case).
3371	*
3372	* Returns 0 on success, that is, if
3373	* the file was successfully atomically created (if necessary) and opened, or
3374	* the file was not completely opened at this time, though lookups and
3375	* creations were performed.
3376	* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
3377	* In the latter case dentry returned in @path might be negative if O_CREAT
3378	* hadn't been specified.
3379	*
3380	* An error code is returned on failure.
3381	*/
3382	static struct dentry lookup_open(struct* nameidata nd, struct* file *file,
3383	const struct open_flags *op,
3384	bool got_write)
3385	{
3386	struct mnt_idmap *idmap;
3387	struct dentry *dir = nd->path.dentry;
3388	struct inode *dir_inode = dir->d_inode;
3389	int open_flag = op->open_flag;
3390	struct dentry *dentry;
3391	int error, create_error = `0`;
3392	umode_t mode = op->mode;
3393	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
3394
3395	if (unlikely(IS_DEADDIR(dir_inode)))
3396	return ERR_PTR(error: -ENOENT);
3397
3398	file->f_mode &= ~FMODE_CREATED;
3399	dentry = d_lookup(dir, &nd->last);
3400	for (;;) {
3401	if (!dentry) {
3402	dentry = d_alloc_parallel(dir, &nd->last, &wq);
3403	if (IS_ERR(ptr: dentry))
3404	return dentry;
3405	}
3406	if (d_in_lookup(dentry))
3407	break;
3408
3409	error = d_revalidate(dentry, flags: nd->flags);
3410	if (likely(error > `0`))
3411	break;
3412	if (error)
3413	goto out_dput;
3414	d_invalidate(dentry);
3415	dput(dentry);
3416	dentry = NULL;
3417	}
3418	if (dentry->d_inode) {
3419	/ Cached positive dentry: will open in f_op->open /
3420	return dentry;
3421	}
3422
3423	/*
3424	* Checking write permission is tricky, bacuse we don't know if we are
3425	* going to actually need it: O_CREAT opens should work as long as the
3426	* file exists. But checking existence breaks atomicity. The trick is
3427	* to check access and if not granted clear O_CREAT from the flags.
3428	*
3429	* Another problem is returing the "right" error value (e.g. for an
3430	* O_EXCL open we want to return EEXIST not EROFS).
3431	*/
3432	if (unlikely(!got_write))
3433	open_flag &= ~O_TRUNC;
3434	idmap = mnt_idmap(mnt: nd->path.mnt);
3435	if (open_flag & O_CREAT) {
3436	if (open_flag & O_EXCL)
3437	open_flag &= ~O_TRUNC;
3438	mode = vfs_prepare_mode(idmap, dir: dir->d_inode, mode, mask_perms: mode, type: mode);
3439	if (likely(got_write))
3440	create_error = may_o_create(idmap, dir: &nd->path,
3441	dentry, mode);
3442	else
3443	create_error = -EROFS;
3444	}
3445	if (create_error)
3446	open_flag &= ~O_CREAT;
3447	if (dir_inode->i_op->atomic_open) {
3448	dentry = atomic_open(nd, dentry, file, open_flag, mode);
3449	if (unlikely(create_error) && dentry == ERR_PTR(error: -ENOENT))
3450	dentry = ERR_PTR(error: create_error);
3451	return dentry;
3452	}
3453
3454	if (d_in_lookup(dentry)) {
3455	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
3456	nd->flags);
3457	d_lookup_done(dentry);
3458	if (unlikely(res)) {
3459	if (IS_ERR(ptr: res)) {
3460	error = PTR_ERR(ptr: res);
3461	goto out_dput;
3462	}
3463	dput(dentry);
3464	dentry = res;
3465	}
3466	}
3467
3468	/ Negative dentry, just create the file /
3469	if (!dentry->d_inode && (open_flag & O_CREAT)) {
3470	file->f_mode \|= FMODE_CREATED;
3471	audit_inode_child(parent: dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
3472	if (!dir_inode->i_op->create) {
3473	error = -EACCES;
3474	goto out_dput;
3475	}
3476
3477	error = dir_inode->i_op->create(idmap, dir_inode, dentry,
3478	mode, open_flag & O_EXCL);
3479	if (error)
3480	goto out_dput;
3481	}
3482	if (unlikely(create_error) && !dentry->d_inode) {
3483	error = create_error;
3484	goto out_dput;
3485	}
3486	return dentry;
3487
3488	out_dput:
3489	dput(dentry);
3490	return ERR_PTR(error);
3491	}
3492
3493	static const char open_last_lookups(struct* nameidata *nd,
3494	struct file file, const* struct open_flags *op)
3495	{
3496	struct dentry *dir = nd->path.dentry;
3497	int open_flag = op->open_flag;
3498	bool got_write = false;
3499	struct dentry *dentry;
3500	const char *res;
3501
3502	nd->flags \|= op->intent;
3503
3504	if (nd->last_type != LAST_NORM) {
3505	if (nd->depth)
3506	put_link(nd);
3507	return handle_dots(nd, type: nd->last_type);
3508	}
3509
3510	if (!(open_flag & O_CREAT)) {
3511	if (nd->last.name[nd->last.len])
3512	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
3513	/ we _can_ be in RCU mode here /
3514	dentry = lookup_fast(nd);
3515	if (IS_ERR(ptr: dentry))
3516	return ERR_CAST(ptr: dentry);
3517	if (likely(dentry))
3518	goto finish_lookup;
3519
3520	if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
3521	return ERR_PTR(error: -ECHILD);
3522	} else {
3523	/ create side of things /
3524	if (nd->flags & LOOKUP_RCU) {
3525	if (!try_to_unlazy(nd))
3526	return ERR_PTR(error: -ECHILD);
3527	}
3528	audit_inode(name: nd->name, dentry: dir, AUDIT_INODE_PARENT);
3529	/ trailing slashes? /
3530	if (unlikely(nd->last.name[nd->last.len]))
3531	return ERR_PTR(error: -EISDIR);
3532	}
3533
3534	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
3535	got_write = !mnt_want_write(mnt: nd->path.mnt);
3536	/*
3537	* do _not_ fail yet - we might not need that or fail with
3538	* a different error; let lookup_open() decide; we'll be
3539	* dropping this one anyway.
3540	*/
3541	}
3542	if (open_flag & O_CREAT)
3543	inode_lock(inode: dir->d_inode);
3544	else
3545	inode_lock_shared(inode: dir->d_inode);
3546	dentry = lookup_open(nd, file, op, got_write);
3547	if (!IS_ERR(ptr: dentry) && (file->f_mode & FMODE_CREATED))
3548	fsnotify_create(dir: dir->d_inode, dentry);
3549	if (open_flag & O_CREAT)
3550	inode_unlock(inode: dir->d_inode);
3551	else
3552	inode_unlock_shared(inode: dir->d_inode);
3553
3554	if (got_write)
3555	mnt_drop_write(mnt: nd->path.mnt);
3556
3557	if (IS_ERR(ptr: dentry))
3558	return ERR_CAST(ptr: dentry);
3559
3560	if (file->f_mode & (FMODE_OPENED \| FMODE_CREATED)) {
3561	dput(nd->path.dentry);
3562	nd->path.dentry = dentry;
3563	return NULL;
3564	}
3565
3566	finish_lookup:
3567	if (nd->depth)
3568	put_link(nd);
3569	res = step_into(nd, flags: WALK_TRAILING, dentry);
3570	if (unlikely(res))
3571	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
3572	return res;
3573	}
3574
3575	/*
3576	* Handle the last step of open()
3577	*/
3578	static int do_open(struct nameidata *nd,
3579	struct file file, const* struct open_flags *op)
3580	{
3581	struct mnt_idmap *idmap;
3582	int open_flag = op->open_flag;
3583	bool do_truncate;
3584	int acc_mode;
3585	int error;
3586
3587	if (!(file->f_mode & (FMODE_OPENED \| FMODE_CREATED))) {
3588	error = complete_walk(nd);
3589	if (error)
3590	return error;
3591	}
3592	if (!(file->f_mode & FMODE_CREATED))
3593	audit_inode(name: nd->name, dentry: nd->path.dentry, aflags: `0`);
3594	idmap = mnt_idmap(mnt: nd->path.mnt);
3595	if (open_flag & O_CREAT) {
3596	if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
3597	return -EEXIST;
3598	if (d_is_dir(dentry: nd->path.dentry))
3599	return -EISDIR;
3600	error = may_create_in_sticky(idmap, nd,
3601	inode: d_backing_inode(upper: nd->path.dentry));
3602	if (unlikely(error))
3603	return error;
3604	}
3605	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(dentry: nd->path.dentry))
3606	return -ENOTDIR;
3607
3608	do_truncate = false;
3609	acc_mode = op->acc_mode;
3610	if (file->f_mode & FMODE_CREATED) {
3611	/ Don't check for write permission, don't truncate /
3612	open_flag &= ~O_TRUNC;
3613	acc_mode = `0`;
3614	} else if (d_is_reg(dentry: nd->path.dentry) && open_flag & O_TRUNC) {
3615	error = mnt_want_write(mnt: nd->path.mnt);
3616	if (error)
3617	return error;
3618	do_truncate = true;
3619	}
3620	error = may_open(idmap, path: &nd->path, acc_mode, flag: open_flag);
3621	if (!error && !(file->f_mode & FMODE_OPENED))
3622	error = vfs_open(&nd->path, file);
3623	if (!error)
3624	error = ima_file_check(file, mask: op->acc_mode);
3625	if (!error && do_truncate)
3626	error = handle_truncate(idmap, filp: file);
3627	if (unlikely(error > `0`)) {
3628	WARN_ON(`1`);
3629	error = -EINVAL;
3630	}
3631	if (do_truncate)
3632	mnt_drop_write(mnt: nd->path.mnt);
3633	return error;
3634	}
3635
3636	/**
3637	* vfs_tmpfile - create tmpfile
3638	* @idmap: idmap of the mount the inode was found from
3639	* @parentpath: pointer to the path of the base directory
3640	* @file: file descriptor of the new tmpfile
3641	* @mode: mode of the new tmpfile
3642	*
3643	* Create a temporary file.
3644	*
3645	* If the inode has been found through an idmapped mount the idmap of
3646	* the vfsmount must be passed through @idmap. This function will then take
3647	* care to map the inode according to @idmap before checking permissions.
3648	* On non-idmapped mounts or if permission checking is to be performed on the
3649	* raw inode simply passs @nop_mnt_idmap.
3650	*/
3651	static int vfs_tmpfile(struct mnt_idmap *idmap,
3652	const struct path *parentpath,
3653	struct file *file, umode_t mode)
3654	{
3655	struct dentry *child;
3656	struct inode *dir = d_inode(dentry: parentpath->dentry);
3657	struct inode *inode;
3658	int error;
3659	int open_flag = file->f_flags;
3660
3661	/ we want directory to be writable /
3662	error = inode_permission(idmap, dir, MAY_WRITE \| MAY_EXEC);
3663	if (error)
3664	return error;
3665	if (!dir->i_op->tmpfile)
3666	return -EOPNOTSUPP;
3667	child = d_alloc(parentpath->dentry, &slash_name);
3668	if (unlikely(!child))
3669	return -ENOMEM;
3670	file->f_path.mnt = parentpath->mnt;
3671	file->f_path.dentry = child;
3672	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
3673	error = dir->i_op->tmpfile(idmap, dir, file, mode);
3674	dput(child);
3675	if (error)
3676	return error;
3677	/ Don't check for other permissions, the inode was just created /
3678	error = may_open(idmap, path: &file->f_path, acc_mode: `0`, flag: file->f_flags);
3679	if (error)
3680	return error;
3681	inode = file_inode(f: file);
3682	if (!(open_flag & O_EXCL)) {
3683	spin_lock(lock: &inode->i_lock);
3684	inode->i_state \|= I_LINKABLE;
3685	spin_unlock(lock: &inode->i_lock);
3686	}
3687	ima_post_create_tmpfile(idmap, inode);
3688	return `0`;
3689	}
3690
3691	/**
3692	* kernel_tmpfile_open - open a tmpfile for kernel internal use
3693	* @idmap: idmap of the mount the inode was found from
3694	* @parentpath: path of the base directory
3695	* @mode: mode of the new tmpfile
3696	* @open_flag: flags
3697	* @cred: credentials for open
3698	*
3699	* Create and open a temporary file. The file is not accounted in nr_files,
3700	* hence this is only for kernel internal use, and must not be installed into
3701	* file tables or such.
3702	*/
3703	struct file kernel_tmpfile_open(struct* mnt_idmap *idmap,
3704	const struct path *parentpath,
3705	umode_t mode, int open_flag,
3706	const struct cred *cred)
3707	{
3708	struct file *file;
3709	int error;
3710
3711	file = alloc_empty_file_noaccount(flags: open_flag, cred);
3712	if (IS_ERR(ptr: file))
3713	return file;
3714
3715	error = vfs_tmpfile(idmap, parentpath, file, mode);
3716	if (error) {
3717	fput(file);
3718	file = ERR_PTR(error);
3719	}
3720	return file;
3721	}
3722	EXPORT_SYMBOL(kernel_tmpfile_open);
3723
3724	static int do_tmpfile(struct nameidata nd, unsigned* flags,
3725	const struct open_flags *op,
3726	struct file *file)
3727	{
3728	struct path path;
3729	int error = path_lookupat(nd, flags: flags \| LOOKUP_DIRECTORY, path: &path);
3730
3731	if (unlikely(error))
3732	return error;
3733	error = mnt_want_write(mnt: path.mnt);
3734	if (unlikely(error))
3735	goto out;
3736	error = vfs_tmpfile(idmap: mnt_idmap(mnt: path.mnt), parentpath: &path, file, mode: op->mode);
3737	if (error)
3738	goto out2;
3739	audit_inode(name: nd->name, dentry: file->f_path.dentry, aflags: `0`);
3740	out2:
3741	mnt_drop_write(mnt: path.mnt);
3742	out:
3743	path_put(&path);
3744	return error;
3745	}
3746
3747	static int do_o_path(struct nameidata nd, unsigned* flags, struct file *file)
3748	{
3749	struct path path;
3750	int error = path_lookupat(nd, flags, path: &path);
3751	if (!error) {
3752	audit_inode(name: nd->name, dentry: path.dentry, aflags: `0`);
3753	error = vfs_open(&path, file);
3754	path_put(&path);
3755	}
3756	return error;
3757	}
3758
3759	static struct file path_openat(struct* nameidata *nd,
3760	const struct open_flags op, unsigned* flags)
3761	{
3762	struct file *file;
3763	int error;
3764
3765	file = alloc_empty_file(flags: op->open_flag, current_cred());
3766	if (IS_ERR(ptr: file))
3767	return file;
3768
3769	if (unlikely(file->f_flags & __O_TMPFILE)) {
3770	error = do_tmpfile(nd, flags, op, file);
3771	} else if (unlikely(file->f_flags & O_PATH)) {
3772	error = do_o_path(nd, flags, file);
3773	} else {
3774	const char *s = path_init(nd, flags);
3775	while (!(error = link_path_walk(name: s, nd)) &&
3776	(s = open_last_lookups(nd, file, op)) != NULL)
3777	;
3778	if (!error)
3779	error = do_open(nd, file, op);
3780	terminate_walk(nd);
3781	}
3782	if (likely(!error)) {
3783	if (likely(file->f_mode & FMODE_OPENED))
3784	return file;
3785	WARN_ON(`1`);
3786	error = -EINVAL;
3787	}
3788	if (unlikely(file->f_mode & FMODE_OPENED))
3789	fput(file);
3790	else
3791	release_empty_file(f: file);
3792	if (error == -EOPENSTALE) {
3793	if (flags & LOOKUP_RCU)
3794	error = -ECHILD;
3795	else
3796	error = -ESTALE;
3797	}
3798	return ERR_PTR(error);
3799	}
3800
3801	struct file do_filp_open(int* dfd, struct filename *pathname,
3802	const struct open_flags *op)
3803	{
3804	struct nameidata nd;
3805	int flags = op->lookup_flags;
3806	struct file *filp;
3807
3808	set_nameidata(p: &nd, dfd, name: pathname, NULL);
3809	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
3810	if (unlikely(filp == ERR_PTR(-ECHILD)))
3811	filp = path_openat(nd: &nd, op, flags);
3812	if (unlikely(filp == ERR_PTR(-ESTALE)))
3813	filp = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
3814	restore_nameidata();
3815	return filp;
3816	}
3817
3818	struct file do_file_open_root(const* struct path *root,
3819	const char name, const* struct open_flags *op)
3820	{
3821	struct nameidata nd;
3822	struct file *file;
3823	struct filename *filename;
3824	int flags = op->lookup_flags;
3825
3826	if (d_is_symlink(dentry: root->dentry) && op->intent & LOOKUP_OPEN)
3827	return ERR_PTR(error: -ELOOP);
3828
3829	filename = getname_kernel(name);
3830	if (IS_ERR(ptr: filename))
3831	return ERR_CAST(ptr: filename);
3832
3833	set_nameidata(p: &nd, dfd: -`1`, name: filename, root);
3834	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_RCU);
3835	if (unlikely(file == ERR_PTR(-ECHILD)))
3836	file = path_openat(nd: &nd, op, flags);
3837	if (unlikely(file == ERR_PTR(-ESTALE)))
3838	file = path_openat(nd: &nd, op, flags: flags \| LOOKUP_REVAL);
3839	restore_nameidata();
3840	putname(filename);
3841	return file;
3842	}
3843
3844	static struct dentry filename_create(int* dfd, struct filename *name,
3845	struct path path, unsigned* int lookup_flags)
3846	{
3847	struct dentry *dentry = ERR_PTR(error: -EEXIST);
3848	struct qstr last;
3849	bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
3850	unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
3851	unsigned int create_flags = LOOKUP_CREATE \| LOOKUP_EXCL;
3852	int type;
3853	int err2;
3854	int error;
3855
3856	error = filename_parentat(dfd, name, flags: reval_flag, parent: path, last: &last, type: &type);
3857	if (error)
3858	return ERR_PTR(error);
3859
3860	/*
3861	* Yucky last component or no last component at all?
3862	* (foo/., foo/.., /////)
3863	*/
3864	if (unlikely(type != LAST_NORM))
3865	goto out;
3866
3867	/ don't fail immediately if it's r/o, at least try to report other errors /
3868	err2 = mnt_want_write(mnt: path->mnt);
3869	/*
3870	* Do the final lookup. Suppress 'create' if there is a trailing
3871	* '/', and a directory wasn't requested.
3872	*/
3873	if (last.name[last.len] && !want_dir)
3874	create_flags = `0`;
3875	inode_lock_nested(inode: path->dentry->d_inode, subclass: I_MUTEX_PARENT);
3876	dentry = lookup_one_qstr_excl(&last, path->dentry,
3877	reval_flag \| create_flags);
3878	if (IS_ERR(ptr: dentry))
3879	goto unlock;
3880
3881	error = -EEXIST;
3882	if (d_is_positive(dentry))
3883	goto fail;
3884
3885	/*
3886	* Special case - lookup gave negative, but... we had foo/bar/
3887	* From the vfs_mknod() POV we just have a negative dentry -
3888	* all is fine. Let's be bastards - you had / on the end, you've
3889	* been asking for (non-existent) directory. -ENOENT for you.
3890	*/
3891	if (unlikely(!create_flags)) {
3892	error = -ENOENT;
3893	goto fail;
3894	}
3895	if (unlikely(err2)) {
3896	error = err2;
3897	goto fail;
3898	}
3899	return dentry;
3900	fail:
3901	dput(dentry);
3902	dentry = ERR_PTR(error);
3903	unlock:
3904	inode_unlock(inode: path->dentry->d_inode);
3905	if (!err2)
3906	mnt_drop_write(mnt: path->mnt);
3907	out:
3908	path_put(path);
3909	return dentry;
3910	}
3911
/*
 * filename_create() for a kernel-space @pathname: wraps the string in a
 * struct filename, performs the lookup and releases the filename again.
 * Returns whatever filename_create() returned.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
				struct path *path, unsigned int lookup_flags)
{
	struct filename *filename = getname_kernel(pathname);
	struct dentry *res = filename_create(dfd, filename, path, lookup_flags);

	putname(filename);
	return res;
}
EXPORT_SYMBOL(kern_path_create);
3922
3923	void done_path_create(struct path path, struct* dentry *dentry)
3924	{
3925	dput(dentry);
3926	inode_unlock(inode: path->dentry->d_inode);
3927	mnt_drop_write(mnt: path->mnt);
3928	path_put(path);
3929	}
3930	EXPORT_SYMBOL(done_path_create);
3931
3932	inline struct dentry user_path_create(int* dfd, const char __user *pathname,
3933	struct path path, unsigned* int lookup_flags)
3934	{
3935	struct filename *filename = getname(filename: pathname);
3936	struct dentry *res = filename_create(dfd, name: filename, path, lookup_flags);
3937
3938	putname(filename);
3939	return res;
3940	}
3941	EXPORT_SYMBOL(user_path_create);
3942
3943	/**
3944	* vfs_mknod - create device node or file
3945	* @idmap: idmap of the mount the inode was found from
3946	* @dir: inode of @dentry
3947	* @dentry: pointer to dentry of the base directory
3948	* @mode: mode of the new device node or file
3949	* @dev: device number of device to create
3950	*
3951	* Create a device node or file.
3952	*
3953	* If the inode has been found through an idmapped mount the idmap of
3954	* the vfsmount must be passed through @idmap. This function will then take
3955	* care to map the inode according to @idmap before checking permissions.
3956	* On non-idmapped mounts or if permission checking is to be performed on the
3957	* raw inode simply passs @nop_mnt_idmap.
3958	*/
3959	int vfs_mknod(struct mnt_idmap idmap, struct* inode *dir,
3960	struct dentry *dentry, umode_t mode, dev_t dev)
3961	{
3962	bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
3963	int error = may_create(idmap, dir, child: dentry);
3964
3965	if (error)
3966	return error;
3967
3968	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !is_whiteout &&
3969	!capable(CAP_MKNOD))
3970	return -EPERM;
3971
3972	if (!dir->i_op->mknod)
3973	return -EPERM;
3974
3975	mode = vfs_prepare_mode(idmap, dir, mode, mask_perms: mode, type: mode);
3976	error = devcgroup_inode_mknod(mode, dev);
3977	if (error)
3978	return error;
3979
3980	error = security_inode_mknod(dir, dentry, mode, dev);
3981	if (error)
3982	return error;
3983
3984	error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
3985	if (!error)
3986	fsnotify_create(dir, dentry);
3987	return error;
3988	}
3989	EXPORT_SYMBOL(vfs_mknod);
3990
3991	static int may_mknod(umode_t mode)
3992	{
3993	switch (mode & S_IFMT) {
3994	case S_IFREG:
3995	case S_IFCHR:
3996	case S_IFBLK:
3997	case S_IFIFO:
3998	case S_IFSOCK:
3999	case `0`: / zero mode translates to S_IFREG /
4000	return `0`;
4001	case S_IFDIR:
4002	return -EPERM;
4003	default:
4004	return -EINVAL;
4005	}
4006	}
4007
4008	static int do_mknodat(int dfd, struct filename *name, umode_t mode,
4009	unsigned int dev)
4010	{
4011	struct mnt_idmap *idmap;
4012	struct dentry *dentry;
4013	struct path path;
4014	int error;
4015	unsigned int lookup_flags = `0`;
4016
4017	error = may_mknod(mode);
4018	if (error)
4019	goto out1;
4020	retry:
4021	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4022	error = PTR_ERR(ptr: dentry);
4023	if (IS_ERR(ptr: dentry))
4024	goto out1;
4025
4026	error = security_path_mknod(dir: &path, dentry,
4027	mode: mode_strip_umask(dir: path.dentry->d_inode, mode), dev);
4028	if (error)
4029	goto out2;
4030
4031	idmap = mnt_idmap(mnt: path.mnt);
4032	switch (mode & S_IFMT) {
4033	case `0`: case S_IFREG:
4034	error = vfs_create(idmap, path.dentry->d_inode,
4035	dentry, mode, true);
4036	if (!error)
4037	ima_post_path_mknod(idmap, dentry);
4038	break;
4039	case S_IFCHR: case S_IFBLK:
4040	error = vfs_mknod(idmap, path.dentry->d_inode,
4041	dentry, mode, new_decode_dev(dev));
4042	break;
4043	case S_IFIFO: case S_IFSOCK:
4044	error = vfs_mknod(idmap, path.dentry->d_inode,
4045	dentry, mode, `0`);
4046	break;
4047	}
4048	out2:
4049	done_path_create(&path, dentry);
4050	if (retry_estale(error, flags: lookup_flags)) {
4051	lookup_flags \|= LOOKUP_REVAL;
4052	goto retry;
4053	}
4054	out1:
4055	putname(name);
4056	return error;
4057	}
4058
4059	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
4060	unsigned int, dev)
4061	{
4062	return do_mknodat(dfd, name: getname(filename), mode, dev);
4063	}
4064
4065	SYSCALL_DEFINE3(mknod, const char __user , filename, umode_t, mode, unsigned*, dev)
4066	{
4067	return do_mknodat(AT_FDCWD, name: getname(filename), mode, dev);
4068	}
4069
4070	/**
4071	* vfs_mkdir - create directory
4072	* @idmap: idmap of the mount the inode was found from
4073	* @dir: inode of @dentry
4074	* @dentry: pointer to dentry of the base directory
4075	* @mode: mode of the new directory
4076	*
4077	* Create a directory.
4078	*
4079	* If the inode has been found through an idmapped mount the idmap of
4080	* the vfsmount must be passed through @idmap. This function will then take
4081	* care to map the inode according to @idmap before checking permissions.
4082	* On non-idmapped mounts or if permission checking is to be performed on the
4083	* raw inode simply passs @nop_mnt_idmap.
4084	*/
4085	int vfs_mkdir(struct mnt_idmap idmap, struct* inode *dir,
4086	struct dentry *dentry, umode_t mode)
4087	{
4088	int error;
4089	unsigned max_links = dir->i_sb->s_max_links;
4090
4091	error = may_create(idmap, dir, child: dentry);
4092	if (error)
4093	return error;
4094
4095	if (!dir->i_op->mkdir)
4096	return -EPERM;
4097
4098	mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO \| S_ISVTX, type: `0`);
4099	error = security_inode_mkdir(dir, dentry, mode);
4100	if (error)
4101	return error;
4102
4103	if (max_links && dir->i_nlink >= max_links)
4104	return -EMLINK;
4105
4106	error = dir->i_op->mkdir(idmap, dir, dentry, mode);
4107	if (!error)
4108	fsnotify_mkdir(dir, dentry);
4109	return error;
4110	}
4111	EXPORT_SYMBOL(vfs_mkdir);
4112
4113	int do_mkdirat(int dfd, struct filename *name, umode_t mode)
4114	{
4115	struct dentry *dentry;
4116	struct path path;
4117	int error;
4118	unsigned int lookup_flags = LOOKUP_DIRECTORY;
4119
4120	retry:
4121	dentry = filename_create(dfd, name, path: &path, lookup_flags);
4122	error = PTR_ERR(ptr: dentry);
4123	if (IS_ERR(ptr: dentry))
4124	goto out_putname;
4125
4126	error = security_path_mkdir(dir: &path, dentry,
4127	mode: mode_strip_umask(dir: path.dentry->d_inode, mode));
4128	if (!error) {
4129	error = vfs_mkdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4130	dentry, mode);
4131	}
4132	done_path_create(&path, dentry);
4133	if (retry_estale(error, flags: lookup_flags)) {
4134	lookup_flags \|= LOOKUP_REVAL;
4135	goto retry;
4136	}
4137	out_putname:
4138	putname(name);
4139	return error;
4140	}
4141
4142	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
4143	{
4144	return do_mkdirat(dfd, name: getname(filename: pathname), mode);
4145	}
4146
4147	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
4148	{
4149	return do_mkdirat(AT_FDCWD, name: getname(filename: pathname), mode);
4150	}
4151
4152	/**
4153	* vfs_rmdir - remove directory
4154	* @idmap: idmap of the mount the inode was found from
4155	* @dir: inode of @dentry
4156	* @dentry: pointer to dentry of the base directory
4157	*
4158	* Remove a directory.
4159	*
4160	* If the inode has been found through an idmapped mount the idmap of
4161	* the vfsmount must be passed through @idmap. This function will then take
4162	* care to map the inode according to @idmap before checking permissions.
4163	* On non-idmapped mounts or if permission checking is to be performed on the
4164	* raw inode simply passs @nop_mnt_idmap.
4165	*/
4166	int vfs_rmdir(struct mnt_idmap idmap, struct* inode *dir,
4167	struct dentry *dentry)
4168	{
4169	int error = may_delete(idmap, dir, victim: dentry, isdir: `1`);
4170
4171	if (error)
4172	return error;
4173
4174	if (!dir->i_op->rmdir)
4175	return -EPERM;
4176
4177	dget(dentry);
4178	inode_lock(inode: dentry->d_inode);
4179
4180	error = -EBUSY;
4181	if (is_local_mountpoint(dentry) \|\|
4182	(dentry->d_inode->i_flags & S_KERNEL_FILE))
4183	goto out;
4184
4185	error = security_inode_rmdir(dir, dentry);
4186	if (error)
4187	goto out;
4188
4189	error = dir->i_op->rmdir(dir, dentry);
4190	if (error)
4191	goto out;
4192
4193	shrink_dcache_parent(dentry);
4194	dentry->d_inode->i_flags \|= S_DEAD;
4195	dont_mount(dentry);
4196	detach_mounts(dentry);
4197
4198	out:
4199	inode_unlock(inode: dentry->d_inode);
4200	dput(dentry);
4201	if (!error)
4202	d_delete_notify(dir, dentry);
4203	return error;
4204	}
4205	EXPORT_SYMBOL(vfs_rmdir);
4206
4207	int do_rmdir(int dfd, struct filename *name)
4208	{
4209	int error;
4210	struct dentry *dentry;
4211	struct path path;
4212	struct qstr last;
4213	int type;
4214	unsigned int lookup_flags = `0`;
4215	retry:
4216	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4217	if (error)
4218	goto exit1;
4219
4220	switch (type) {
4221	case LAST_DOTDOT:
4222	error = -ENOTEMPTY;
4223	goto exit2;
4224	case LAST_DOT:
4225	error = -EINVAL;
4226	goto exit2;
4227	case LAST_ROOT:
4228	error = -EBUSY;
4229	goto exit2;
4230	}
4231
4232	error = mnt_want_write(mnt: path.mnt);
4233	if (error)
4234	goto exit2;
4235
4236	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4237	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4238	error = PTR_ERR(ptr: dentry);
4239	if (IS_ERR(ptr: dentry))
4240	goto exit3;
4241	if (!dentry->d_inode) {
4242	error = -ENOENT;
4243	goto exit4;
4244	}
4245	error = security_path_rmdir(dir: &path, dentry);
4246	if (error)
4247	goto exit4;
4248	error = vfs_rmdir(mnt_idmap(mnt: path.mnt), path.dentry->d_inode, dentry);
4249	exit4:
4250	dput(dentry);
4251	exit3:
4252	inode_unlock(inode: path.dentry->d_inode);
4253	mnt_drop_write(mnt: path.mnt);
4254	exit2:
4255	path_put(&path);
4256	if (retry_estale(error, flags: lookup_flags)) {
4257	lookup_flags \|= LOOKUP_REVAL;
4258	goto retry;
4259	}
4260	exit1:
4261	putname(name);
4262	return error;
4263	}
4264
4265	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
4266	{
4267	return do_rmdir(AT_FDCWD, name: getname(filename: pathname));
4268	}
4269
4270	/**
4271	* vfs_unlink - unlink a filesystem object
4272	* @idmap: idmap of the mount the inode was found from
4273	* @dir: parent directory
4274	* @dentry: victim
4275	* @delegated_inode: returns victim inode, if the inode is delegated.
4276	*
4277	* The caller must hold dir->i_mutex.
4278	*
4279	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
4280	* return a reference to the inode in delegated_inode. The caller
4281	* should then break the delegation on that inode and retry. Because
4282	* breaking a delegation may take a long time, the caller should drop
4283	* dir->i_mutex before doing so.
4284	*
4285	* Alternatively, a caller may pass NULL for delegated_inode. This may
4286	* be appropriate for callers that expect the underlying filesystem not
4287	* to be NFS exported.
4288	*
4289	* If the inode has been found through an idmapped mount the idmap of
4290	* the vfsmount must be passed through @idmap. This function will then take
4291	* care to map the inode according to @idmap before checking permissions.
4292	* On non-idmapped mounts or if permission checking is to be performed on the
4293	* raw inode simply passs @nop_mnt_idmap.
4294	*/
4295	int vfs_unlink(struct mnt_idmap idmap, struct* inode *dir,
4296	struct dentry dentry, struct* inode **delegated_inode)
4297	{
4298	struct inode *target = dentry->d_inode;
4299	int error = may_delete(idmap, dir, victim: dentry, isdir: `0`);
4300
4301	if (error)
4302	return error;
4303
4304	if (!dir->i_op->unlink)
4305	return -EPERM;
4306
4307	inode_lock(inode: target);
4308	if (IS_SWAPFILE(target))
4309	error = -EPERM;
4310	else if (is_local_mountpoint(dentry))
4311	error = -EBUSY;
4312	else {
4313	error = security_inode_unlink(dir, dentry);
4314	if (!error) {
4315	error = try_break_deleg(inode: target, delegated_inode);
4316	if (error)
4317	goto out;
4318	error = dir->i_op->unlink(dir, dentry);
4319	if (!error) {
4320	dont_mount(dentry);
4321	detach_mounts(dentry);
4322	}
4323	}
4324	}
4325	out:
4326	inode_unlock(inode: target);
4327
4328	/ We don't d_delete() NFS sillyrenamed files--they still exist. /
4329	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
4330	fsnotify_unlink(dir, dentry);
4331	} else if (!error) {
4332	fsnotify_link_count(inode: target);
4333	d_delete_notify(dir, dentry);
4334	}
4335
4336	return error;
4337	}
4338	EXPORT_SYMBOL(vfs_unlink);
4339
4340	/*
4341	* Make sure that the actual truncation of the file will occur outside its
4342	* directory's i_mutex. Truncate can take a long time if there is a lot of
4343	* writeout happening, and we don't want to prevent access to the directory
4344	* while waiting on the I/O.
4345	*/
4346	int do_unlinkat(int dfd, struct filename *name)
4347	{
4348	int error;
4349	struct dentry *dentry;
4350	struct path path;
4351	struct qstr last;
4352	int type;
4353	struct inode *inode = NULL;
4354	struct inode *delegated_inode = NULL;
4355	unsigned int lookup_flags = `0`;
4356	retry:
4357	error = filename_parentat(dfd, name, flags: lookup_flags, parent: &path, last: &last, type: &type);
4358	if (error)
4359	goto exit1;
4360
4361	error = -EISDIR;
4362	if (type != LAST_NORM)
4363	goto exit2;
4364
4365	error = mnt_want_write(mnt: path.mnt);
4366	if (error)
4367	goto exit2;
4368	retry_deleg:
4369	inode_lock_nested(inode: path.dentry->d_inode, subclass: I_MUTEX_PARENT);
4370	dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
4371	error = PTR_ERR(ptr: dentry);
4372	if (!IS_ERR(ptr: dentry)) {
4373
4374	/ Why not before? Because we want correct error value /
4375	if (last.name[last.len] \|\| d_is_negative(dentry))
4376	goto slashes;
4377	inode = dentry->d_inode;
4378	ihold(inode);
4379	error = security_path_unlink(dir: &path, dentry);
4380	if (error)
4381	goto exit3;
4382	error = vfs_unlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4383	dentry, &delegated_inode);
4384	exit3:
4385	dput(dentry);
4386	}
4387	inode_unlock(inode: path.dentry->d_inode);
4388	if (inode)
4389	iput(inode); / truncate the inode here /
4390	inode = NULL;
4391	if (delegated_inode) {
4392	error = break_deleg_wait(delegated_inode: &delegated_inode);
4393	if (!error)
4394	goto retry_deleg;
4395	}
4396	mnt_drop_write(mnt: path.mnt);
4397	exit2:
4398	path_put(&path);
4399	if (retry_estale(error, flags: lookup_flags)) {
4400	lookup_flags \|= LOOKUP_REVAL;
4401	inode = NULL;
4402	goto retry;
4403	}
4404	exit1:
4405	putname(name);
4406	return error;
4407
4408	slashes:
4409	if (d_is_negative(dentry))
4410	error = -ENOENT;
4411	else if (d_is_dir(dentry))
4412	error = -EISDIR;
4413	else
4414	error = -ENOTDIR;
4415	goto exit3;
4416	}
4417
4418	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user , pathname, int*, flag)
4419	{
4420	if ((flag & ~AT_REMOVEDIR) != `0`)
4421	return -EINVAL;
4422
4423	if (flag & AT_REMOVEDIR)
4424	return do_rmdir(dfd, name: getname(filename: pathname));
4425	return do_unlinkat(dfd, name: getname(filename: pathname));
4426	}
4427
4428	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
4429	{
4430	return do_unlinkat(AT_FDCWD, name: getname(filename: pathname));
4431	}
4432
4433	/**
4434	* vfs_symlink - create symlink
4435	* @idmap: idmap of the mount the inode was found from
4436	* @dir: inode of @dentry
4437	* @dentry: pointer to dentry of the base directory
4438	* @oldname: name of the file to link to
4439	*
4440	* Create a symlink.
4441	*
4442	* If the inode has been found through an idmapped mount the idmap of
4443	* the vfsmount must be passed through @idmap. This function will then take
4444	* care to map the inode according to @idmap before checking permissions.
4445	* On non-idmapped mounts or if permission checking is to be performed on the
4446	* raw inode simply passs @nop_mnt_idmap.
4447	*/
4448	int vfs_symlink(struct mnt_idmap idmap, struct* inode *dir,
4449	struct dentry dentry, const* char *oldname)
4450	{
4451	int error;
4452
4453	error = may_create(idmap, dir, child: dentry);
4454	if (error)
4455	return error;
4456
4457	if (!dir->i_op->symlink)
4458	return -EPERM;
4459
4460	error = security_inode_symlink(dir, dentry, old_name: oldname);
4461	if (error)
4462	return error;
4463
4464	error = dir->i_op->symlink(idmap, dir, dentry, oldname);
4465	if (!error)
4466	fsnotify_create(dir, dentry);
4467	return error;
4468	}
4469	EXPORT_SYMBOL(vfs_symlink);
4470
4471	int do_symlinkat(struct filename from, int* newdfd, struct filename *to)
4472	{
4473	int error;
4474	struct dentry *dentry;
4475	struct path path;
4476	unsigned int lookup_flags = `0`;
4477
4478	if (IS_ERR(ptr: from)) {
4479	error = PTR_ERR(ptr: from);
4480	goto out_putnames;
4481	}
4482	retry:
4483	dentry = filename_create(dfd: newdfd, name: to, path: &path, lookup_flags);
4484	error = PTR_ERR(ptr: dentry);
4485	if (IS_ERR(ptr: dentry))
4486	goto out_putnames;
4487
4488	error = security_path_symlink(dir: &path, dentry, old_name: from->name);
4489	if (!error)
4490	error = vfs_symlink(mnt_idmap(mnt: path.mnt), path.dentry->d_inode,
4491	dentry, from->name);
4492	done_path_create(&path, dentry);
4493	if (retry_estale(error, flags: lookup_flags)) {
4494	lookup_flags \|= LOOKUP_REVAL;
4495	goto retry;
4496	}
4497	out_putnames:
4498	putname(to);
4499	putname(from);
4500	return error;
4501	}
4502
4503	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
4504	int, newdfd, const char __user *, newname)
4505	{
4506	return do_symlinkat(from: getname(filename: oldname), newdfd, to: getname(filename: newname));
4507	}
4508
4509	SYSCALL_DEFINE2(symlink, const char __user , oldname, const* char __user *, newname)
4510	{
4511	return do_symlinkat(from: getname(filename: oldname), AT_FDCWD, to: getname(filename: newname));
4512	}
4513
4514	/**
4515	* vfs_link - create a new link
4516	* @old_dentry: object to be linked
4517	* @idmap: idmap of the mount
4518	* @dir: new parent
4519	* @new_dentry: where to create the new link
4520	* @delegated_inode: returns inode needing a delegation break
4521	*
4522	* The caller must hold dir->i_mutex
4523	*
4524	* If vfs_link discovers a delegation on the to-be-linked file in need
4525	* of breaking, it will return -EWOULDBLOCK and return a reference to the
4526	* inode in delegated_inode. The caller should then break the delegation
4527	* and retry. Because breaking a delegation may take a long time, the
4528	* caller should drop the i_mutex before doing so.
4529	*
4530	* Alternatively, a caller may pass NULL for delegated_inode. This may
4531	* be appropriate for callers that expect the underlying filesystem not
4532	* to be NFS exported.
4533	*
4534	* If the inode has been found through an idmapped mount the idmap of
4535	* the vfsmount must be passed through @idmap. This function will then take
4536	* care to map the inode according to @idmap before checking permissions.
4537	* On non-idmapped mounts or if permission checking is to be performed on the
4538	* raw inode simply passs @nop_mnt_idmap.
4539	*/
4540	int vfs_link(struct dentry old_dentry, struct* mnt_idmap *idmap,
4541	struct inode dir, struct* dentry *new_dentry,
4542	struct inode **delegated_inode)
4543	{
4544	struct inode *inode = old_dentry->d_inode;
4545	unsigned max_links = dir->i_sb->s_max_links;
4546	int error;
4547
4548	if (!inode)
4549	return -ENOENT;
4550
4551	error = may_create(idmap, dir, child: new_dentry);
4552	if (error)
4553	return error;
4554
4555	if (dir->i_sb != inode->i_sb)
4556	return -EXDEV;
4557
4558	/*
4559	* A link to an append-only or immutable file cannot be created.
4560	*/
4561	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
4562	return -EPERM;
4563	/*
4564	* Updating the link count will likely cause i_uid and i_gid to
4565	* be writen back improperly if their true value is unknown to
4566	* the vfs.
4567	*/
4568	if (HAS_UNMAPPED_ID(idmap, inode))
4569	return -EPERM;
4570	if (!dir->i_op->link)
4571	return -EPERM;
4572	if (S_ISDIR(inode->i_mode))
4573	return -EPERM;
4574
4575	error = security_inode_link(old_dentry, dir, new_dentry);
4576	if (error)
4577	return error;
4578
4579	inode_lock(inode);
4580	/ Make sure we don't allow creating hardlink to an unlinked file /
4581	if (inode->i_nlink == `0` && !(inode->i_state & I_LINKABLE))
4582	error = -ENOENT;
4583	else if (max_links && inode->i_nlink >= max_links)
4584	error = -EMLINK;
4585	else {
4586	error = try_break_deleg(inode, delegated_inode);
4587	if (!error)
4588	error = dir->i_op->link(old_dentry, dir, new_dentry);
4589	}
4590
4591	if (!error && (inode->i_state & I_LINKABLE)) {
4592	spin_lock(lock: &inode->i_lock);
4593	inode->i_state &= ~I_LINKABLE;
4594	spin_unlock(lock: &inode->i_lock);
4595	}
4596	inode_unlock(inode);
4597	if (!error)
4598	fsnotify_link(dir, inode, new_dentry);
4599	return error;
4600	}
4601	EXPORT_SYMBOL(vfs_link);
4602
4603	/*
4604	* Hardlinks are often used in delicate situations. We avoid
4605	* security-related surprises by not following symlinks on the
4606	* newname. --KAB
4607	*
4608	* We don't follow them on the oldname either to be compatible
4609	* with linux 2.0, and to avoid hard-linking to directories
4610	* and other special files. --ADM
4611	*/
4612	int do_linkat(int olddfd, struct filename old, int* newdfd,
4613	struct filename new, int* flags)
4614	{
4615	struct mnt_idmap *idmap;
4616	struct dentry *new_dentry;
4617	struct path old_path, new_path;
4618	struct inode *delegated_inode = NULL;
4619	int how = `0`;
4620	int error;
4621
4622	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != `0`) {
4623	error = -EINVAL;
4624	goto out_putnames;
4625	}
4626	/*
4627	* To use null names we require CAP_DAC_READ_SEARCH
4628	* This ensures that not everyone will be able to create
4629	* handlink using the passed filedescriptor.
4630	*/
4631	if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
4632	error = -ENOENT;
4633	goto out_putnames;
4634	}
4635
4636	if (flags & AT_SYMLINK_FOLLOW)
4637	how \|= LOOKUP_FOLLOW;
4638	retry:
4639	error = filename_lookup(dfd: olddfd, name: old, flags: how, path: &old_path, NULL);
4640	if (error)
4641	goto out_putnames;
4642
4643	new_dentry = filename_create(dfd: newdfd, name: new, path: &new_path,
4644	lookup_flags: (how & LOOKUP_REVAL));
4645	error = PTR_ERR(ptr: new_dentry);
4646	if (IS_ERR(ptr: new_dentry))
4647	goto out_putpath;
4648
4649	error = -EXDEV;
4650	if (old_path.mnt != new_path.mnt)
4651	goto out_dput;
4652	idmap = mnt_idmap(mnt: new_path.mnt);
4653	error = may_linkat(idmap, link: &old_path);
4654	if (unlikely(error))
4655	goto out_dput;
4656	error = security_path_link(old_dentry: old_path.dentry, new_dir: &new_path, new_dentry);
4657	if (error)
4658	goto out_dput;
4659	error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
4660	new_dentry, &delegated_inode);
4661	out_dput:
4662	done_path_create(&new_path, new_dentry);
4663	if (delegated_inode) {
4664	error = break_deleg_wait(delegated_inode: &delegated_inode);
4665	if (!error) {
4666	path_put(&old_path);
4667	goto retry;
4668	}
4669	}
4670	if (retry_estale(error, flags: how)) {
4671	path_put(&old_path);
4672	how \|= LOOKUP_REVAL;
4673	goto retry;
4674	}
4675	out_putpath:
4676	path_put(&old_path);
4677	out_putnames:
4678	putname(old);
4679	putname(new);
4680
4681	return error;
4682	}
4683
4684	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
4685	int, newdfd, const char __user , newname, int*, flags)
4686	{
4687	return do_linkat(olddfd, old: getname_uflags(filename: oldname, uflags: flags),
4688	newdfd, new: getname(filename: newname), flags);
4689	}
4690
4691	SYSCALL_DEFINE2(link, const char __user , oldname, const* char __user *, newname)
4692	{
4693	return do_linkat(AT_FDCWD, old: getname(filename: oldname), AT_FDCWD, new: getname(filename: newname), flags: `0`);
4694	}
4695
4696	/**
4697	* vfs_rename - rename a filesystem object
4698	* @rd: pointer to &struct renamedata info
4699	*
4700	* The caller must hold multiple mutexes--see lock_rename()).
4701	*
4702	* If vfs_rename discovers a delegation in need of breaking at either
4703	* the source or destination, it will return -EWOULDBLOCK and return a
4704	* reference to the inode in delegated_inode. The caller should then
4705	* break the delegation and retry. Because breaking a delegation may
4706	* take a long time, the caller should drop all locks before doing
4707	* so.
4708	*
4709	* Alternatively, a caller may pass NULL for delegated_inode. This may
4710	* be appropriate for callers that expect the underlying filesystem not
4711	* to be NFS exported.
4712	*
4713	* The worst of all namespace operations - renaming directory. "Perverted"
4714	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4715	* Problems:
4716	*
4717	* a) we can get into loop creation.
4718	* b) race potential - two innocent renames can create a loop together.
4719	* That's where 4.4 screws up. Current fix: serialization on
4720	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
4721	* story.
4722	* c) we have to lock _four_ objects - parents and victim (if it exists),
4723	* and source.
4724	* And that - after we got ->i_mutex on parents (until then we don't know
4725	* whether the target exists). Solution: try to be smart with locking
4726	* order for inodes. We rely on the fact that tree topology may change
4727	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
4728	* move will be locked. Thus we can rank directories by the tree
4729	* (ancestors first) and rank all non-directories after them.
4730	* That works since everybody except rename does "lock parent, lookup,
4731	* lock child" and rename is under ->s_vfs_rename_mutex.
4732	* HOWEVER, it relies on the assumption that any object with ->lookup()
4733	* has no more than 1 dentry. If "hybrid" objects will ever appear,
4734	* we'd better make sure that there's no link(2) for them.
4735	* d) conversion from fhandle to dentry may come in the wrong moment - when
4736	* we are removing the target. Solution: we will have to grab ->i_mutex
4737	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
4738	* ->i_mutex on parents, which works but leads to some truly excessive
4739	* locking].
4740	*/
4741	int vfs_rename(struct renamedata *rd)
4742	{
4743	int error;
4744	struct inode old_dir = rd->old_dir, new_dir = rd->new_dir;
4745	struct dentry *old_dentry = rd->old_dentry;
4746	struct dentry *new_dentry = rd->new_dentry;
4747	struct inode **delegated_inode = rd->delegated_inode;
4748	unsigned int flags = rd->flags;
4749	bool is_dir = d_is_dir(dentry: old_dentry);
4750	struct inode *source = old_dentry->d_inode;
4751	struct inode *target = new_dentry->d_inode;
4752	bool new_is_dir = false;
4753	unsigned max_links = new_dir->i_sb->s_max_links;
4754	struct name_snapshot old_name;
4755
4756	if (source == target)
4757	return `0`;
4758
4759	error = may_delete(idmap: rd->old_mnt_idmap, dir: old_dir, victim: old_dentry, isdir: is_dir);
4760	if (error)
4761	return error;
4762
4763	if (!target) {
4764	error = may_create(idmap: rd->new_mnt_idmap, dir: new_dir, child: new_dentry);
4765	} else {
4766	new_is_dir = d_is_dir(dentry: new_dentry);
4767
4768	if (!(flags & RENAME_EXCHANGE))
4769	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
4770	victim: new_dentry, isdir: is_dir);
4771	else
4772	error = may_delete(idmap: rd->new_mnt_idmap, dir: new_dir,
4773	victim: new_dentry, isdir: new_is_dir);
4774	}
4775	if (error)
4776	return error;
4777
4778	if (!old_dir->i_op->rename)
4779	return -EPERM;
4780
4781	/*
4782	* If we are going to change the parent - check write permissions,
4783	* we'll need to flip '..'.
4784	*/
4785	if (new_dir != old_dir) {
4786	if (is_dir) {
4787	error = inode_permission(rd->old_mnt_idmap, source,
4788	MAY_WRITE);
4789	if (error)
4790	return error;
4791	}
4792	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
4793	error = inode_permission(rd->new_mnt_idmap, target,
4794	MAY_WRITE);
4795	if (error)
4796	return error;
4797	}
4798	}
4799
4800	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
4801	flags);
4802	if (error)
4803	return error;
4804
4805	take_dentry_name_snapshot(&old_name, old_dentry);
4806	dget(dentry: new_dentry);
4807	/*
4808	* Lock all moved children. Moved directories may need to change parent
4809	* pointer so they need the lock to prevent against concurrent
4810	* directory changes moving parent pointer. For regular files we've
4811	* historically always done this. The lockdep locking subclasses are
4812	* somewhat arbitrary but RENAME_EXCHANGE in particular can swap
4813	* regular files and directories so it's difficult to tell which
4814	* subclasses to use.
4815	*/
4816	lock_two_inodes(inode1: source, inode2: target, subclass1: I_MUTEX_NORMAL, subclass2: I_MUTEX_NONDIR2);
4817
4818	error = -EPERM;
4819	if (IS_SWAPFILE(source) \|\| (target && IS_SWAPFILE(target)))
4820	goto out;
4821
4822	error = -EBUSY;
4823	if (is_local_mountpoint(dentry: old_dentry) \|\| is_local_mountpoint(dentry: new_dentry))
4824	goto out;
4825
4826	if (max_links && new_dir != old_dir) {
4827	error = -EMLINK;
4828	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
4829	goto out;
4830	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
4831	old_dir->i_nlink >= max_links)
4832	goto out;
4833	}
4834	if (!is_dir) {
4835	error = try_break_deleg(inode: source, delegated_inode);
4836	if (error)
4837	goto out;
4838	}
4839	if (target && !new_is_dir) {
4840	error = try_break_deleg(inode: target, delegated_inode);
4841	if (error)
4842	goto out;
4843	}
4844	error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
4845	new_dir, new_dentry, flags);
4846	if (error)
4847	goto out;
4848
4849	if (!(flags & RENAME_EXCHANGE) && target) {
4850	if (is_dir) {
4851	shrink_dcache_parent(new_dentry);
4852	target->i_flags \|= S_DEAD;
4853	}
4854	dont_mount(dentry: new_dentry);
4855	detach_mounts(dentry: new_dentry);
4856	}
4857	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4858	if (!(flags & RENAME_EXCHANGE))
4859	d_move(old_dentry, new_dentry);
4860	else
4861	d_exchange(old_dentry, new_dentry);
4862	}
4863	out:
4864	inode_unlock(inode: source);
4865	if (target)
4866	inode_unlock(inode: target);
4867	dput(new_dentry);
4868	if (!error) {
4869	fsnotify_move(old_dir, new_dir, old_name: &old_name.name, isdir: is_dir,
4870	target: !(flags & RENAME_EXCHANGE) ? target : NULL, moved: old_dentry);
4871	if (flags & RENAME_EXCHANGE) {
4872	fsnotify_move(old_dir: new_dir, new_dir: old_dir, old_name: &old_dentry->d_name,
4873	isdir: new_is_dir, NULL, moved: new_dentry);
4874	}
4875	}
4876	release_dentry_name_snapshot(&old_name);
4877
4878	return error;
4879	}
4880	EXPORT_SYMBOL(vfs_rename);
4881
4882	int do_renameat2(int olddfd, struct filename from, int* newdfd,
4883	struct filename to, unsigned* int flags)
4884	{
4885	struct renamedata rd;
4886	struct dentry old_dentry, new_dentry;
4887	struct dentry *trap;
4888	struct path old_path, new_path;
4889	struct qstr old_last, new_last;
4890	int old_type, new_type;
4891	struct inode *delegated_inode = NULL;
4892	unsigned int lookup_flags = `0`, target_flags = LOOKUP_RENAME_TARGET;
4893	bool should_retry = false;
4894	int error = -EINVAL;
4895
4896	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
4897	goto put_names;
4898
4899	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
4900	(flags & RENAME_EXCHANGE))
4901	goto put_names;
4902
4903	if (flags & RENAME_EXCHANGE)
4904	target_flags = `0`;
4905
4906	retry:
4907	error = filename_parentat(dfd: olddfd, name: from, flags: lookup_flags, parent: &old_path,
4908	last: &old_last, type: &old_type);
4909	if (error)
4910	goto put_names;
4911
4912	error = filename_parentat(dfd: newdfd, name: to, flags: lookup_flags, parent: &new_path, last: &new_last,
4913	type: &new_type);
4914	if (error)
4915	goto exit1;
4916
4917	error = -EXDEV;
4918	if (old_path.mnt != new_path.mnt)
4919	goto exit2;
4920
4921	error = -EBUSY;
4922	if (old_type != LAST_NORM)
4923	goto exit2;
4924
4925	if (flags & RENAME_NOREPLACE)
4926	error = -EEXIST;
4927	if (new_type != LAST_NORM)
4928	goto exit2;
4929
4930	error = mnt_want_write(mnt: old_path.mnt);
4931	if (error)
4932	goto exit2;
4933
4934	retry_deleg:
4935	trap = lock_rename(new_path.dentry, old_path.dentry);
4936
4937	old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
4938	lookup_flags);
4939	error = PTR_ERR(ptr: old_dentry);
4940	if (IS_ERR(ptr: old_dentry))
4941	goto exit3;
4942	/ source must exist /
4943	error = -ENOENT;
4944	if (d_is_negative(dentry: old_dentry))
4945	goto exit4;
4946	new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
4947	lookup_flags \| target_flags);
4948	error = PTR_ERR(ptr: new_dentry);
4949	if (IS_ERR(ptr: new_dentry))
4950	goto exit4;
4951	error = -EEXIST;
4952	if ((flags & RENAME_NOREPLACE) && d_is_positive(dentry: new_dentry))
4953	goto exit5;
4954	if (flags & RENAME_EXCHANGE) {
4955	error = -ENOENT;
4956	if (d_is_negative(dentry: new_dentry))
4957	goto exit5;
4958
4959	if (!d_is_dir(dentry: new_dentry)) {
4960	error = -ENOTDIR;
4961	if (new_last.name[new_last.len])
4962	goto exit5;
4963	}
4964	}
4965	/ unless the source is a directory trailing slashes give -ENOTDIR /
4966	if (!d_is_dir(dentry: old_dentry)) {
4967	error = -ENOTDIR;
4968	if (old_last.name[old_last.len])
4969	goto exit5;
4970	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
4971	goto exit5;
4972	}
4973	/ source should not be ancestor of target /
4974	error = -EINVAL;
4975	if (old_dentry == trap)
4976	goto exit5;
4977	/ target should not be an ancestor of source /
4978	if (!(flags & RENAME_EXCHANGE))
4979	error = -ENOTEMPTY;
4980	if (new_dentry == trap)
4981	goto exit5;
4982
4983	error = security_path_rename(old_dir: &old_path, old_dentry,
4984	new_dir: &new_path, new_dentry, flags);
4985	if (error)
4986	goto exit5;
4987
4988	rd.old_dir = old_path.dentry->d_inode;
4989	rd.old_dentry = old_dentry;
4990	rd.old_mnt_idmap = mnt_idmap(mnt: old_path.mnt);
4991	rd.new_dir = new_path.dentry->d_inode;
4992	rd.new_dentry = new_dentry;
4993	rd.new_mnt_idmap = mnt_idmap(mnt: new_path.mnt);
4994	rd.delegated_inode = &delegated_inode;
4995	rd.flags = flags;
4996	error = vfs_rename(&rd);
4997	exit5:
4998	dput(new_dentry);
4999	exit4:
5000	dput(old_dentry);
5001	exit3:
5002	unlock_rename(new_path.dentry, old_path.dentry);
5003	if (delegated_inode) {
5004	error = break_deleg_wait(delegated_inode: &delegated_inode);
5005	if (!error)
5006	goto retry_deleg;
5007	}
5008	mnt_drop_write(mnt: old_path.mnt);
5009	exit2:
5010	if (retry_estale(error, flags: lookup_flags))
5011	should_retry = true;
5012	path_put(&new_path);
5013	exit1:
5014	path_put(&old_path);
5015	if (should_retry) {
5016	should_retry = false;
5017	lookup_flags \|= LOOKUP_REVAL;
5018	goto retry;
5019	}
5020	put_names:
5021	putname(from);
5022	putname(to);
5023	return error;
5024	}
5025
5026	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
5027	int, newdfd, const char __user , newname, unsigned* int, flags)
5028	{
5029	return do_renameat2(olddfd, from: getname(filename: oldname), newdfd, to: getname(filename: newname),
5030	flags);
5031	}
5032
5033	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
5034	int, newdfd, const char __user *, newname)
5035	{
5036	return do_renameat2(olddfd, from: getname(filename: oldname), newdfd, to: getname(filename: newname),
5037	flags: `0`);
5038	}
5039
5040	SYSCALL_DEFINE2(rename, const char __user , oldname, const* char __user *, newname)
5041	{
5042	return do_renameat2(AT_FDCWD, from: getname(filename: oldname), AT_FDCWD,
5043	to: getname(filename: newname), flags: `0`);
5044	}
5045
5046	int readlink_copy(char __user buffer, int* buflen, const char *link)
5047	{
5048	int len = PTR_ERR(ptr: link);
5049	if (IS_ERR(ptr: link))
5050	goto out;
5051
5052	len = strlen(link);
5053	if (len > (unsigned) buflen)
5054	len = buflen;
5055	if (copy_to_user(to: buffer, from: link, n: len))
5056	len = -EFAULT;
5057	out:
5058	return len;
5059	}
5060
5061	/**
5062	* vfs_readlink - copy symlink body into userspace buffer
5063	* @dentry: dentry on which to get symbolic link
5064	* @buffer: user memory pointer
5065	* @buflen: size of buffer
5066	*
5067	* Does not touch atime. That's up to the caller if necessary
5068	*
5069	* Does not call security hook.
5070	*/
5071	int vfs_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5072	{
5073	struct inode *inode = d_inode(dentry);
5074	DEFINE_DELAYED_CALL(done);
5075	const char *link;
5076	int res;
5077
5078	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
5079	if (unlikely(inode->i_op->readlink))
5080	return inode->i_op->readlink(dentry, buffer, buflen);
5081
5082	if (!d_is_symlink(dentry))
5083	return -EINVAL;
5084
5085	spin_lock(lock: &inode->i_lock);
5086	inode->i_opflags \|= IOP_DEFAULT_READLINK;
5087	spin_unlock(lock: &inode->i_lock);
5088	}
5089
5090	link = READ_ONCE(inode->i_link);
5091	if (!link) {
5092	link = inode->i_op->get_link(dentry, inode, &done);
5093	if (IS_ERR(ptr: link))
5094	return PTR_ERR(ptr: link);
5095	}
5096	res = readlink_copy(buffer, buflen, link);
5097	do_delayed_call(call: &done);
5098	return res;
5099	}
5100	EXPORT_SYMBOL(vfs_readlink);
5101
5102	/**
5103	* vfs_get_link - get symlink body
5104	* @dentry: dentry on which to get symbolic link
5105	* @done: caller needs to free returned data with this
5106	*
5107	* Calls security hook and i_op->get_link() on the supplied inode.
5108	*
5109	* It does not touch atime. That's up to the caller if necessary.
5110	*
5111	* Does not work on "special" symlinks like /proc/$$/fd/N
5112	*/
5113	const char vfs_get_link(struct* dentry dentry, struct* delayed_call *done)
5114	{
5115	const char *res = ERR_PTR(error: -EINVAL);
5116	struct inode *inode = d_inode(dentry);
5117
5118	if (d_is_symlink(dentry)) {
5119	res = ERR_PTR(error: security_inode_readlink(dentry));
5120	if (!res)
5121	res = inode->i_op->get_link(dentry, inode, done);
5122	}
5123	return res;
5124	}
5125	EXPORT_SYMBOL(vfs_get_link);
5126
5127	/ get the link contents into pagecache /
5128	const char page_get_link(struct* dentry dentry, struct* inode *inode,
5129	struct delayed_call *callback)
5130	{
5131	char *kaddr;
5132	struct page *page;
5133	struct address_space *mapping = inode->i_mapping;
5134
5135	if (!dentry) {
5136	page = find_get_page(mapping, offset: `0`);
5137	if (!page)
5138	return ERR_PTR(error: -ECHILD);
5139	if (!PageUptodate(page)) {
5140	put_page(page);
5141	return ERR_PTR(error: -ECHILD);
5142	}
5143	} else {
5144	page = read_mapping_page(mapping, index: `0`, NULL);
5145	if (IS_ERR(ptr: page))
5146	return (char*)page;
5147	}
5148	set_delayed_call(call: callback, fn: page_put_link, arg: page);
5149	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
5150	kaddr = page_address(page);
5151	nd_terminate_link(name: kaddr, len: inode->i_size, PAGE_SIZE - `1`);
5152	return kaddr;
5153	}
5154
5155	EXPORT_SYMBOL(page_get_link);
5156
/*
 * Delayed-call callback registered by page_get_link(): drops the reference
 * on the page passed as @arg.
 */
void page_put_link(void *arg)
{
	put_page(arg);
}
EXPORT_SYMBOL(page_put_link);
5162
5163	int page_readlink(struct dentry dentry, char* __user buffer, int* buflen)
5164	{
5165	DEFINE_DELAYED_CALL(done);
5166	int res = readlink_copy(buffer, buflen,
5167	link: page_get_link(dentry, d_inode(dentry),
5168	&done));
5169	do_delayed_call(call: &done);
5170	return res;
5171	}
5172	EXPORT_SYMBOL(page_readlink);
5173
5174	int page_symlink(struct inode inode, const* char symname, int* len)
5175	{
5176	struct address_space *mapping = inode->i_mapping;
5177	const struct address_space_operations *aops = mapping->a_ops;
5178	bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
5179	struct page *page;
5180	void *fsdata = NULL;
5181	int err;
5182	unsigned int flags;
5183
5184	retry:
5185	if (nofs)
5186	flags = memalloc_nofs_save();
5187	err = aops->write_begin(NULL, mapping, `0`, len-`1`, &page, &fsdata);
5188	if (nofs)
5189	memalloc_nofs_restore(flags);
5190	if (err)
5191	goto fail;
5192
5193	memcpy(page_address(page), symname, len-`1`);
5194
5195	err = aops->write_end(NULL, mapping, `0`, len-`1`, len-`1`,
5196	page, fsdata);
5197	if (err < `0`)
5198	goto fail;
5199	if (err < len-`1`)
5200	goto retry;
5201
5202	mark_inode_dirty(inode);
5203	return `0`;
5204	fail:
5205	return err;
5206	}
5207	EXPORT_SYMBOL(page_symlink);
5208
5209	const struct inode_operations page_symlink_inode_operations = {
5210	.get_link = page_get_link,
5211	};
5212	EXPORT_SYMBOL(page_symlink_inode_operations);
5213

source code of linux/fs/namei.c